--- a/matcher.scala Wed Nov 14 08:46:00 2012 +0000
+++ b/matcher.scala Mon Nov 19 14:18:42 2012 +0000
@@ -11,15 +11,6 @@
case class NOT(r: Rexp) extends Rexp
-// some convenience for typing in regular expressions
-def charlist2rexp(s : List[Char]) : Rexp = s match {
- case Nil => EMPTY
- case c::Nil => CHAR(c)
- case c::s => SEQ(CHAR(c), charlist2rexp(s))
-}
-implicit def string2rexp(s : String) : Rexp = charlist2rexp(s.toList)
-
-
// nullable function: tests whether the regular
// expression can recognise the empty string
def nullable (r: Rexp) : Boolean = r match {
@@ -59,42 +50,74 @@
// regular expression for specifying
// ranges of characters
-def RANGE(s : List[Char]) : Rexp = s match {
+def Range(s : List[Char]) : Rexp = s match {
case Nil => NULL
case c::Nil => CHAR(c)
- case c::s => ALT(CHAR(c), RANGE(s))
+ case c::s => ALT(CHAR(c), Range(s))
}
+def RANGE(s: String) = Range(s.toList)
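+// e.g. RANGE("abc") stands for ALT(CHAR('a'), ALT(CHAR('b'), CHAR('c')))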
+
// one or more
def PLUS(r: Rexp) = SEQ(r, STAR(r))
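+// e.g. PLUS(CHAR('a')) matches "a", "aa", "aaa", ... but not ""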
+// many alternatives
+def Alts(rs: List[Rexp]) : Rexp = rs match {
+ case Nil => NULL
+ case r::Nil => r
+ case r::rs => ALT(r, Alts(rs))
+}
+def ALTS(rs: Rexp*) = Alts(rs.toList)
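+// e.g. ALTS("while", "if", "then") stands for ALT("while", ALT("if", "then"))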
+
+// sequence of several regular expressions
+def Seqs(rs: List[Rexp]) : Rexp = rs match {
+ case Nil => EMPTY // the empty sequence matches the empty string
+ case r::Nil => r
+ case r::rs => SEQ(r, Seqs(rs))
+}
+def SEQS(rs: Rexp*) = Seqs(rs.toList)
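+// e.g. SEQS(r1, r2, r3) stands for SEQ(r1, SEQ(r2, r3)),
+// for any regular expressions r1, r2 and r3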
+
+// some convenience for typing in regular expressions
+def charlist2rexp(s : List[Char]) : Rexp = s match {
+ case Nil => EMPTY
+ case c::Nil => CHAR(c)
+ case c::s => SEQ(CHAR(c), charlist2rexp(s))
+}
+implicit def string2rexp(s : String) : Rexp = charlist2rexp(s.toList)
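+// e.g. the string "ab" is implicitly converted to SEQ(CHAR('a'), CHAR('b'))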
+
type Rule[T] = (Rexp, List[Char] => T)
-def error (s: String) = throw new IllegalArgumentException ("Cannot tokenize: " + s)
+// a tokenizer consists of a list of rules and a list of
+// token kinds to be filtered out of the result (e.g. whitespace)
+case class Tokenizer[T](rules: List[Rule[T]], excl: List[T] = Nil) {
-def munch[T](r: Rexp, action: List[Char] => T, s: List[Char], t: List[Char]) : Option[(List[Char], T)] =
- s match {
- case Nil if (nullable(r)) => Some(Nil, action(t))
- case Nil => None
- case c::s if (no_more(der (c, r)) && nullable(r)) => Some(c::s, action(t))
- case c::s if (no_more(der (c, r))) => None
- case c::s => munch(der (c, r), action, s, t ::: List(c))
+ // tries to munch from s as much as r can match, accumulating the
+ // munched characters in t; returns the rest of the input together
+ // with the token built by action, or None if r cannot match at all
+ def munch(r: Rexp, action: List[Char] => T, s: List[Char], t: List[Char]) : Option[(List[Char], T)] =
+ s match {
+ case Nil if (nullable(r)) => Some((Nil, action(t)))
+ case Nil => None
+ case c::s if (no_more(der(c, r)) && nullable(r)) => Some((c::s, action(t)))
+ case c::s if (no_more(der(c, r))) => None
+ case c::s => munch(der(c, r), action, s, t ::: List(c))
+ }
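+ // e.g. for any action act, munch("if", act, "if2".toList, Nil)
+ // returns Some((List('2'), act(List('i', 'f'))))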
+
+ // tries all rules on s; if none munches, the untokenizable input is
+ // returned via Right, otherwise the munch with the shortest rest,
+ // that is the longest match, is returned via Left
+ def one_token(s: List[Char]) : Either[(List[Char], T), String] = {
+ val somes = rules.flatMap { r => munch(r._1, r._2, s, Nil) }
+ if (somes.isEmpty) Right(s.mkString)
+ else Left(somes.sortBy(_._1.length).head)
}
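+ // e.g. given a keyword rule for "if" and a rule for identifiers, the
+ // input "iffy" is munched as a single identifier token: the longest
+ // match wins, and sortBy is stable, so ties go to the rule listed first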
-def one_token[T](rs: List[Rule[T]], s: List[Char]) : (List[Char], T) = {
- val somes = rs.map { (r) => munch(r._1, r._2, s, Nil) } .flatten
- if (somes == Nil) error(s.mkString) else (somes sortBy (_._1.length) head)
+ // tokenizes the complete input, one token at a time
+ def tokenize(cs: List[Char]) : List[T] = cs match {
+ case Nil => Nil
+ case _ => one_token(cs) match {
+ case Left((rest, token)) => token :: tokenize(rest)
+ case Right(s) => { println("Cannot tokenize: \"" + s + "\""); Nil }
+ }
+ }
+
+ // tokenizes a string and filters out the excluded tokens
+ def fromString(s: String) : List[T] =
+ tokenize(s.toList).filterNot(excl.contains(_))
+
+ def fromFile(name: String) : List[T] =
+ fromString(io.Source.fromFile(name).mkString)
+
}
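+// a usage sketch (the Token type and the rules below are made-up
+// examples, not part of this file): a tokenizer for numbers and
+// whitespace that filters the whitespace tokens out
+//
+// abstract class Token
+// case object T_WHITESPACE extends Token
+// case class T_NUM(s: String) extends Token
+//
+// val lexer = Tokenizer[Token](List(
+// (PLUS(RANGE("0123456789")), (cs: List[Char]) => T_NUM(cs.mkString)),
+// (PLUS(RANGE(" \n")), (cs: List[Char]) => T_WHITESPACE)),
+// excl = List(T_WHITESPACE))
+//
+// lexer.fromString("1 23") // => List(T_NUM("1"), T_NUM("23"))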
-def tokenize[T](rs: List[Rule[T]], s: List[Char]) : List[T] = s match {
- case Nil => Nil
- case _ => one_token(rs, s) match {
- case (rest, token) => token :: tokenize(rs, rest)
- }
-}
-
-
-
-
-