|      1 package object matcher { |         | 
|      2  |         | 
|      3 // regular expressions  |         | 
|      4 // including constructors for NOT and ALLC |         | 
|      5 sealed abstract class Rexp |         | 
|      6  |         | 
|      7 case object NULL extends Rexp |         | 
|      8 case object EMPTY extends Rexp |         | 
|      9 case object ALLC extends Rexp            // recognises any character |         | 
|     10 case class CHAR(c: Char) extends Rexp |         | 
|     11 case class ALT(r1: Rexp, r2: Rexp) extends Rexp |         | 
|     12 case class SEQ(r1: Rexp, r2: Rexp) extends Rexp |         | 
|     13 case class STAR(r: Rexp) extends Rexp |         | 
|     14 case class NOT(r: Rexp) extends Rexp     // negation of a regular expression |         | 
|     15  |         | 
|     16  |         | 
|     17 // nullable function: tests whether the regular  |         | 
|     18 // expression can recognise the empty string |         | 
|     19 def nullable (r: Rexp) : Boolean = r match { |         | 
|     20   case NULL => false |         | 
|     21   case EMPTY => true |         | 
|     22   case ALLC => false |         | 
|     23   case CHAR(_) => false |         | 
|     24   case ALT(r1, r2) => nullable(r1) || nullable(r2) |         | 
|     25   case SEQ(r1, r2) => nullable(r1) && nullable(r2) |         | 
|     26   case STAR(_) => true |         | 
|     27   case NOT(r) => !(nullable(r)) |         | 
|     28 } |         | 
|     29  |         | 
|     30 // tests whether a regular expression  |         | 
|     31 // cannot recognise more |         | 
|     32 def no_more (r: Rexp) : Boolean = r match { |         | 
|     33   case NULL => true |         | 
|     34   case EMPTY => false |         | 
|     35   case ALLC => false |         | 
|     36   case CHAR(_) => false |         | 
|     37   case ALT(r1, r2) => no_more(r1) && no_more(r2) |         | 
|     38   case SEQ(r1, r2) => if (nullable(r1)) (no_more(r1) && no_more(r2)) else no_more(r1) |         | 
|     39   case STAR(_) => false |         | 
|     40   case NOT(r) => !(no_more(r)) |         | 
|     41 } |         | 
|     42  |         | 
|     43  |         | 
|     44 // derivative of a regular expression w.r.t. a character |         | 
|     45 def der (c: Char, r: Rexp) : Rexp = r match { |         | 
|     46   case NULL => NULL |         | 
|     47   case EMPTY => NULL   |         | 
|     48   case ALLC => EMPTY |         | 
|     49   case CHAR(d) => if (c == d) EMPTY else NULL |         | 
|     50   case ALT(r1, r2) => ALT(der(c, r1), der(c, r2)) |         | 
|     51   case SEQ(r1, r2) =>  |         | 
|     52     if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2)) |         | 
|     53     else SEQ(der(c, r1), r2) |         | 
|     54   case STAR(r) => SEQ(der(c, r), STAR(r)) |         | 
|     55   case NOT(r) => NOT(der (c, r)) |         | 
|     56 } |         | 
|     57  |         | 
|     58 // main class for the tokenizer |         | 
|     59 case class Tokenizer[T](rules: List[(Rexp, List[Char] => T)], excl: List[T] = Nil) { |         | 
|     60  |         | 
|     61 def munch(r: Rexp, action: List[Char] => T, s: List[Char], t: List[Char]) : Option[(List[Char], T)] =  |         | 
|     62   s match { |         | 
|     63     case Nil if (nullable(r)) => Some(Nil, action(t)) |         | 
|     64     case Nil => None |         | 
|     65     case c::s if (no_more(der (c, r)) && nullable(r)) => Some(c::s, action(t)) |         | 
|     66     case c::s if (no_more(der (c, r))) => None |         | 
|     67     case c::s => munch(der (c, r), action, s, t ::: List(c)) |         | 
|     68   } |         | 
|     69  |         | 
|     70 def one_token(s: List[Char]) : Either[(List[Char], T), String] = { |         | 
|     71   val somes = rules.map { (r) => munch(r._1, r._2, s, Nil) }.flatten |         | 
|     72   if (somes == Nil) Right(s.mkString)  |         | 
|     73   else Left(somes sortBy (_._1.length) head) |         | 
|     74 } |         | 
|     75  |         | 
|     76 def tokenize(cs: List[Char]) : List[T] = cs match { |         | 
|     77   case Nil => Nil |         | 
|     78   case _ => one_token(cs) match { |         | 
|     79     case Left((rest, token)) => token :: tokenize(rest) |         | 
|     80     case Right(s) => { println("Cannot tokenize: \"" + s + "\""); Nil }  |         | 
|     81   } |         | 
|     82 } |         | 
|     83  |         | 
|     84 def fromString(s: String) : List[T] =  |         | 
|     85   tokenize(s.toList).filterNot(excl.contains(_)) |         | 
|     86  |         | 
|     87 def fromFile(name: String) : List[T] =  |         | 
|     88   fromString(io.Source.fromFile(name).mkString) |         | 
|     89  |         | 
|     90 } |         | 
|     91  |         | 
|     92  |         | 
|     93 // regular expression for specifying  |         | 
|     94 // ranges of characters |         | 
|     95 def Range(s : List[Char]) : Rexp = s match { |         | 
|     96   case Nil => NULL |         | 
|     97   case c::Nil => CHAR(c) |         | 
|     98   case c::s => ALT(CHAR(c), Range(s)) |         | 
|     99 } |         | 
|    100 def RANGE(s: String) = Range(s.toList) |         | 
|    101  |         | 
|    102  |         | 
|    103 // one or more |         | 
|    104 def PLUS(r: Rexp) = SEQ(r, STAR(r)) |         | 
|    105  |         | 
|    106 // many alternatives |         | 
|    107 def Alts(rs: List[Rexp]) : Rexp = rs match { |         | 
|    108   case Nil => NULL |         | 
|    109   case r::Nil => r |         | 
|    110   case r::rs => ALT(r, Alts(rs)) |         | 
|    111 } |         | 
|    112 def ALTS(rs: Rexp*) = Alts(rs.toList) |         | 
|    113  |         | 
|    114 // repetitions |         | 
|    115 def Seqs(rs: List[Rexp]) : Rexp = rs match { |         | 
|    116   case Nil => NULL |         | 
|    117   case r::Nil => r |         | 
|    118   case r::rs => SEQ(r, Seqs(rs)) |         | 
|    119 } |         | 
|    120 def SEQS(rs: Rexp*) = Seqs(rs.toList) |         | 
|    121  |         | 
|    122 // some convenience for typing in regular expressions |         | 
|    123 def charlist2rexp(s : List[Char]) : Rexp = s match { |         | 
|    124   case Nil => EMPTY |         | 
|    125   case c::Nil => CHAR(c) |         | 
|    126   case c::s => SEQ(CHAR(c), charlist2rexp(s)) |         | 
|    127 } |         | 
|    128 implicit def string2rexp(s : String) : Rexp = charlist2rexp(s.toList) |         | 
|    129  |         | 
|    130 } |         |