matcher.scala
changeset 64:2d625418c011
parent 62:5988e44ea048
child 75:898c25a4e399
--- a/matcher.scala	Wed Nov 14 08:46:00 2012 +0000
+++ b/matcher.scala	Mon Nov 19 14:18:42 2012 +0000
@@ -11,15 +11,6 @@
 case class NOT(r: Rexp) extends Rexp
 
 
-// some convenience for typing in regular expressions
-def charlist2rexp(s : List[Char]) : Rexp = s match {
-  case Nil => EMPTY
-  case c::Nil => CHAR(c)
-  case c::s => SEQ(CHAR(c), charlist2rexp(s))
-}
-implicit def string2rexp(s : String) : Rexp = charlist2rexp(s.toList)
-
-
 // nullable function: tests whether the regular 
 // expression can recognise the empty string
 def nullable (r: Rexp) : Boolean = r match {
@@ -59,42 +50,74 @@
 
 // regular expression for specifying 
 // ranges of characters
-def RANGE(s : List[Char]) : Rexp = s match {
+def Range(s : List[Char]) : Rexp = s match {
   case Nil => NULL
   case c::Nil => CHAR(c)
-  case c::s => ALT(CHAR(c), RANGE(s))
+  case c::s => ALT(CHAR(c), Range(s))
 }
+def RANGE(s: String) = Range(s.toList)
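+// e.g. RANGE("abc") == ALT(CHAR('a'), ALT(CHAR('b'), CHAR('c')))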
+
 
 // one or more
 def PLUS(r: Rexp) = SEQ(r, STAR(r))
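+// e.g. PLUS(CHAR('a')) matches "a", "aa", ..., but not the empty string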
 
+// many alternatives
+def Alts(rs: List[Rexp]) : Rexp = rs match {
+  case Nil => NULL
+  case r::Nil => r
+  case r::rs => ALT(r, Alts(rs))
+}
+def ALTS(rs: Rexp*) = Alts(rs.toList)
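+// e.g. ALTS(r1, r2, r3) == ALT(r1, ALT(r2, r3))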
+
+// many sequences
+def Seqs(rs: List[Rexp]) : Rexp = rs match {
+  case Nil => NULL
+  case r::Nil => r
+  case r::rs => SEQ(r, Seqs(rs))
+}
+def SEQS(rs: Rexp*) = Seqs(rs.toList)
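+// e.g. SEQS(r1, r2, r3) == SEQ(r1, SEQ(r2, r3))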
+
+// some convenience for typing in regular expressions
+def charlist2rexp(s : List[Char]) : Rexp = s match {
+  case Nil => EMPTY
+  case c::Nil => CHAR(c)
+  case c::s => SEQ(CHAR(c), charlist2rexp(s))
+}
+implicit def string2rexp(s : String) : Rexp = charlist2rexp(s.toList)
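+// e.g. the string "ab" stands for SEQ(CHAR('a'), CHAR('b'))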
+
 
 type Rule[T] = (Rexp, List[Char] => T)
 
-def error (s: String) = throw new IllegalArgumentException ("Cannot tokenize: " + s)
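+// a tokenizer consists of a list of rules (a regular expression paired
+// with an action building a token) and a list of tokens to filter out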
+case class Tokenizer[T](rules: List[Rule[T]], excl: List[T] = Nil) {
 
-def munch[T](r: Rexp, action: List[Char] => T, s: List[Char], t: List[Char]) : Option[(List[Char], T)] = 
-  s match {
-    case Nil if (nullable(r)) => Some(Nil, action(t))
-    case Nil => None
-    case c::s if (no_more(der (c, r)) && nullable(r)) => Some(c::s, action(t))
-    case c::s if (no_more(der (c, r))) => None
-    case c::s => munch(der (c, r), action, s, t ::: List(c))
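+  // munch: greedily takes derivatives of r while a match can still be
+  // extended; once no_more holds and the eaten prefix t is accepted
+  // (nullable), returns the unconsumed rest paired with action(t)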
+  def munch(r: Rexp, action: List[Char] => T, s: List[Char], t: List[Char]) : Option[(List[Char], T)] = 
+    s match {
+      case Nil if (nullable(r)) => Some(Nil, action(t))
+      case Nil => None
+      case c::s if (no_more(der (c, r)) && nullable(r)) => Some(c::s, action(t))
+      case c::s if (no_more(der (c, r))) => None
+      case c::s => munch(der (c, r), action, s, t ::: List(c))
+    }
+
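+  // tries all rules and keeps the munch that consumed the most input
+  // (shortest rest); Right signals that no rule could make progress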
+  def one_token(s: List[Char]) : Either[(List[Char], T), String] = {
+    val somes = rules.map { (r) => munch(r._1, r._2, s, Nil) }.flatten
+    if (somes == Nil) Right(s.mkString) 
+    else Left(somes sortBy (_._1.length) head)
   }
 
-def one_token[T](rs: List[Rule[T]], s: List[Char]) : (List[Char], T) = {
- val somes = rs.map { (r) => munch(r._1, r._2, s, Nil) } .flatten
- if (somes == Nil) error(s.mkString) else (somes sortBy (_._1.length) head)
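+  // splits the input into tokens until it is exhausted or untokenizable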
+  def tokenize(cs: List[Char]) : List[T] = cs match {
+    case Nil => Nil
+    case _ => one_token(cs) match {
+      case Left((rest, token)) => token :: tokenize(rest)
+      case Right(s) => { println("Cannot tokenize: \"" + s + "\""); Nil } 
+    }
+  }
+
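+  // tokenizes a complete string, dropping the tokens listed in excl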
+  def fromString(s: String) : List[T] = 
+    tokenize(s.toList).filterNot(excl.contains(_))
+
+  def fromFile(name: String) : List[T] = 
+    fromString(io.Source.fromFile(name).mkString)
+
 }
 
-def tokenize[T](rs: List[Rule[T]], s: List[Char]) : List[T] = s match {
-  case Nil => Nil
-  case _ => one_token(rs, s) match {
-    case (rest, token) => token :: tokenize(rs, rest) 
-  }
-}
-
-
-
-
-
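
A usage sketch for the new Tokenizer class (the Token datatype and the
lexing rules below are hypothetical; they only illustrate how the class
is meant to be called):

  abstract class Token
  case object T_WHITESPACE extends Token
  case class T_NUM(s: String) extends Token

  // a rule pairs a regular expression with an action building a token
  val lexing_rules: List[Rule[Token]] =
    List((PLUS(RANGE("0123456789")), (s: List[Char]) => T_NUM(s.mkString)),
         (PLUS(" "), (s: List[Char]) => T_WHITESPACE))

  // whitespace tokens are produced but filtered out via excl
  val Tok = Tokenizer(lexing_rules, List(T_WHITESPACE))
  Tok.fromString("12 34")   // => List(T_NUM("12"), T_NUM("34"))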