diff -r dff4b062a8a9 -r 2d625418c011 matcher.scala
--- a/matcher.scala	Wed Nov 14 08:46:00 2012 +0000
+++ b/matcher.scala	Mon Nov 19 14:18:42 2012 +0000
@@ -11,15 +11,6 @@
 case class NOT(r: Rexp) extends Rexp
 
-// some convenience for typing in regular expressions
-def charlist2rexp(s : List[Char]) : Rexp = s match {
-  case Nil => EMPTY
-  case c::Nil => CHAR(c)
-  case c::s => SEQ(CHAR(c), charlist2rexp(s))
-}
-implicit def string2rexp(s : String) : Rexp = charlist2rexp(s.toList)
-
-
 
 // nullable function: tests whether the regular
 // expression can recognise the empty string
 def nullable (r: Rexp) : Boolean = r match {
@@ -59,42 +50,74 @@
 
 // regular expression for specifying
 // ranges of characters
-def RANGE(s : List[Char]) : Rexp = s match {
+def Range(s : List[Char]) : Rexp = s match {
   case Nil => NULL
   case c::Nil => CHAR(c)
-  case c::s => ALT(CHAR(c), RANGE(s))
+  case c::s => ALT(CHAR(c), Range(s))
 }
+def RANGE(s: String) = Range(s.toList)
+
 // one or more
 def PLUS(r: Rexp) = SEQ(r, STAR(r))
 
+// many alternatives
+def Alts(rs: List[Rexp]) : Rexp = rs match {
+  case Nil => NULL
+  case r::Nil => r
+  case r::rs => ALT(r, Alts(rs))
+}
+def ALTS(rs: Rexp*) = Alts(rs.toList)
+
+// repetitions
+def Seqs(rs: List[Rexp]) : Rexp = rs match {
+  case Nil => NULL
+  case r::Nil => r
+  case r::rs => SEQ(r, Seqs(rs))
+}
+def SEQS(rs: Rexp*) = Seqs(rs.toList)
+
+// some convenience for typing in regular expressions
+def charlist2rexp(s : List[Char]) : Rexp = s match {
+  case Nil => EMPTY
+  case c::Nil => CHAR(c)
+  case c::s => SEQ(CHAR(c), charlist2rexp(s))
+}
+implicit def string2rexp(s : String) : Rexp = charlist2rexp(s.toList)
+
 type Rule[T] = (Rexp, List[Char] => T)
 
-def error (s: String) = throw new IllegalArgumentException ("Cannot tokenize: " + s)
+case class Tokenizer[T](rules: List[Rule[T]], excl: List[T] = Nil) {
 
-def munch[T](r: Rexp, action: List[Char] => T, s: List[Char], t: List[Char]) : Option[(List[Char], T)] =
-  s match {
-    case Nil if (nullable(r)) => Some(Nil, action(t))
-    case Nil => None
-    case c::s if (no_more(der (c, r)) && nullable(r)) => Some(c::s, action(t))
-    case c::s if (no_more(der (c, r))) => None
-    case c::s => munch(der (c, r), action, s, t ::: List(c))
+  def munch(r: Rexp, action: List[Char] => T, s: List[Char], t: List[Char]) : Option[(List[Char], T)] =
+    s match {
+      case Nil if (nullable(r)) => Some(Nil, action(t))
+      case Nil => None
+      case c::s if (no_more(der (c, r)) && nullable(r)) => Some(c::s, action(t))
+      case c::s if (no_more(der (c, r))) => None
+      case c::s => munch(der (c, r), action, s, t ::: List(c))
+    }
+
+  def one_token(s: List[Char]) : Either[(List[Char], T), String] = {
+    val somes = rules.map { (r) => munch(r._1, r._2, s, Nil) }.flatten
+    if (somes == Nil) Right(s.mkString)
+    else Left(somes sortBy (_._1.length) head)
   }
-
-def one_token[T](rs: List[Rule[T]], s: List[Char]) : (List[Char], T) = {
-  val somes = rs.map { (r) => munch(r._1, r._2, s, Nil) }.flatten
-  if (somes == Nil) error(s.mkString) else (somes sortBy (_._1.length) head)
+  def tokenize(cs: List[Char]) : List[T] = cs match {
+    case Nil => Nil
+    case _ => one_token(cs) match {
+      case Left((rest, token)) => token :: tokenize(rest)
+      case Right(s) => { println("Cannot tokenize: \"" + s + "\""); Nil }
+    }
+  }
+
+  def fromString(s: String) : List[T] =
+    tokenize(s.toList).filterNot(excl.contains(_))
+
+  def fromFile(name: String) : List[T] =
+    fromString(io.Source.fromFile(name).mkString)
+
 }
-
-def tokenize[T](rs: List[Rule[T]], s: List[Char]) : List[T] = s match {
-  case Nil => Nil
-  case _ => one_token(rs, s) match {
-    case (rest, token) => token :: tokenize(rs, rest)
-  }
-}
-
-
-
-
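
Usage notes (not part of the changeset). The sketches below assume the definitions introduced by this patch are in scope; all other names are invented for illustration. First, the new string-based RANGE and the ALTS/SEQS helpers compose as follows:

// hypothetical: a regular expression for simple identifiers
val LETTER = RANGE("abcdefghijklmnopqrstuvwxyz")
val DIGIT  = RANGE("0123456789")

// a letter followed by any number of letters, digits or underscores;
// the string "_" is lifted to CHAR('_') by the implicit string2rexp
val IDENT = SEQS(LETTER, STAR(ALTS(LETTER, DIGIT, "_")))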
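Second, a minimal sketch of the new Tokenizer interface, with a made-up Token datatype and rules. Note that one_token keeps the munch leaving the shortest rest, so the rule consuming the longest prefix wins; since sortBy is stable, earlier rules take priority on ties. Tokens listed in excl (here whitespace) are still produced but filtered out by fromString:

abstract class Token
case object T_WHITESPACE extends Token
case class T_NUM(s: String) extends Token
case class T_ID(s: String) extends Token

val rules: List[Rule[Token]] =
  List((PLUS(" "), (cs) => T_WHITESPACE),
       (PLUS(DIGIT), (cs) => T_NUM(cs.mkString)),
       (SEQ(LETTER, STAR(ALT(LETTER, DIGIT))), (cs) => T_ID(cs.mkString)))

val lexer = Tokenizer(rules, excl = List(T_WHITESPACE))

// expected: List(T_ID("foo"), T_NUM("42"))
println(lexer.fromString("foo 42"))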