matcher.scala
changeset 85:1a4065f965fb
parent 75:898c25a4e399
comparison of revisions 84:719fd738d2a0 and 85:1a4065f965fb
       
@@ -1,7 +1,9 @@
 package object matcher {
-// regular expressions including NOT
+
+// regular expressions
+// including constructors for NOT and ALLC
 abstract class Rexp
 
 case object NULL extends Rexp
 case object EMPTY extends Rexp
 case object ALLC extends Rexp            // recognises any character
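For context: the unchanged lines the diff skips here (old lines 8-50) hold the remaining constructors and the nullable and der functions that the clauses below refer to. A minimal sketch of that elided context, assuming the usual derivative-based definitions; the constructor names are the ones visible elsewhere in this diff, everything else is an assumption:

// sketch of the elided context, not the verbatim file contents
case class CHAR(c: Char) extends Rexp
case class ALT(r1: Rexp, r2: Rexp) extends Rexp
case class SEQ(r1: Rexp, r2: Rexp) extends Rexp
case class STAR(r: Rexp) extends Rexp
case class NOT(r: Rexp) extends Rexp

// nullable(r): does r accept the empty string?
def nullable(r: Rexp): Boolean = r match {
  case NULL        => false
  case EMPTY       => true
  case ALLC        => false   // any single character, but not the empty string
  case CHAR(_)     => false
  case ALT(r1, r2) => nullable(r1) || nullable(r2)
  case SEQ(r1, r2) => nullable(r1) && nullable(r2)
  case STAR(_)     => true
  case NOT(r)      => !nullable(r)
}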
@@ -51,10 +53,45 @@
     else SEQ(der(c, r1), r2)
   case STAR(r) => SEQ(der(c, r), STAR(r))
   case NOT(r) => NOT(der (c, r))
 }
 
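der(c, r) computes the Brzozowski derivative: the regular expression for what r can still match after consuming the character c. Matching a whole string then amounts to folding der over the input and testing nullability of what remains. A hypothetical helper along these lines; the file's own top-level matcher, if it defines one, may differ:

// match by iterated derivatives; a sketch, not necessarily this file's API
def matcher(r: Rexp, s: String): Boolean =
  nullable(s.toList.foldLeft(r)((r1, c) => der(c, r1)))

// e.g. der('a', SEQ(CHAR('a'), CHAR('b'))) = SEQ(EMPTY, CHAR('b')),
// and after also consuming 'b' the result is nullable, so "ab" matches;
// the NOT clause simply negates: matcher(NOT(CHAR('a')), "a") == false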
       
+// main class for the tokenizer
+case class Tokenizer[T](rules: List[(Rexp, List[Char] => T)], excl: List[T] = Nil) {
+
+def munch(r: Rexp, action: List[Char] => T, s: List[Char], t: List[Char]) : Option[(List[Char], T)] =
+  s match {
+    case Nil if (nullable(r)) => Some(Nil, action(t))
+    case Nil => None
+    case c::s if (no_more(der (c, r)) && nullable(r)) => Some(c::s, action(t))
+    case c::s if (no_more(der (c, r))) => None
+    case c::s => munch(der (c, r), action, s, t ::: List(c))
+  }
+
+def one_token(s: List[Char]) : Either[(List[Char], T), String] = {
+  val somes = rules.map { (r) => munch(r._1, r._2, s, Nil) }.flatten
+  if (somes == Nil) Right(s.mkString)
+  else Left(somes sortBy (_._1.length) head)
+}
+
+def tokenize(cs: List[Char]) : List[T] = cs match {
+  case Nil => Nil
+  case _ => one_token(cs) match {
+    case Left((rest, token)) => token :: tokenize(rest)
+    case Right(s) => { println("Cannot tokenize: \"" + s + "\""); Nil }
+  }
+}
+
+def fromString(s: String) : List[T] =
+  tokenize(s.toList).filterNot(excl.contains(_))
+
+def fromFile(name: String) : List[T] =
+  fromString(io.Source.fromFile(name).mkString)
+
+}
+
+
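The inserted Tokenizer runs munch once per rule and keeps the result that leaves the least input behind, with excl listing token values to drop after tokenization. A hypothetical usage sketch; the token type and rules below are illustrative, not from this repository:

// illustrative tokens and rules, assuming the definitions in this file
abstract class Token
case object T_WHITESPACE extends Token
case class T_NUM(s: String) extends Token

val digit  = Range("0123456789".toList)
val number = SEQ(digit, STAR(digit))
val space: Rexp = " "          // via the implicit string2rexp defined below

val lexer = Tokenizer(List(
  (number, (cs: List[Char]) => T_NUM(cs.mkString)),
  (space,  (cs: List[Char]) => T_WHITESPACE)
), excl = List(T_WHITESPACE))

// whitespace is tokenized but filtered out by fromString:
// lexer.fromString("42 7") == List(T_NUM("42"), T_NUM("7"))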
 // regular expression for specifying
 // ranges of characters
 def Range(s : List[Char]) : Rexp = s match {
   case Nil => NULL
   case c::Nil => CHAR(c)
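Range turns a list of characters into a regular expression recognising any one of them; its recursive case falls in the elided lines, presumably folding the characters into nested alternatives:

// presumed shape of the elided recursive case (an assumption, not shown in this diff):
//   case c::s => ALT(CHAR(c), Range(s))
// so Range("abc".toList) unfolds to ALT(CHAR('a'), ALT(CHAR('b'), CHAR('c')))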
@@ -88,41 +125,6 @@
   case c::Nil => CHAR(c)
   case c::s => SEQ(CHAR(c), charlist2rexp(s))
 }
 implicit def string2rexp(s : String) : Rexp = charlist2rexp(s.toList)
 
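charlist2rexp expands a string into a nested SEQ of its characters, and the implicit conversion lets string literals stand for regular expressions wherever a Rexp is expected; for example:

// "if" elaborates to SEQ(CHAR('i'), CHAR('f')) via charlist2rexp
val keyword: Rexp = "if"
val cond: Rexp = ALT("iff", "if")   // both operands converted implicitly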
-
-type Rule[T] = (Rexp, List[Char] => T)
-
-case class Tokenizer[T](rules: List[Rule[T]], excl: List[T] = Nil) {
-
-  def munch(r: Rexp, action: List[Char] => T, s: List[Char], t: List[Char]) : Option[(List[Char], T)] =
-    s match {
-      case Nil if (nullable(r)) => Some(Nil, action(t))
-      case Nil => None
-      case c::s if (no_more(der (c, r)) && nullable(r)) => Some(c::s, action(t))
-      case c::s if (no_more(der (c, r))) => None
-      case c::s => munch(der (c, r), action, s, t ::: List(c))
-    }
-
-  def one_token(s: List[Char]) : Either[(List[Char], T), String] = {
-    val somes = rules.map { (r) => munch(r._1, r._2, s, Nil) }.flatten
-    if (somes == Nil) Right(s.mkString)
-    else Left(somes sortBy (_._1.length) head)
-  }
-
-  def tokenize(cs: List[Char]) : List[T] = cs match {
-    case Nil => Nil
-    case _ => one_token(cs) match {
-      case Left((rest, token)) => token :: tokenize(rest)
-      case Right(s) => { println("Cannot tokenize: \"" + s + "\""); Nil }
-    }
-  }
-
-  def fromString(s: String) : List[T] =
-    tokenize(s.toList).filterNot(excl.contains(_))
-
-  def fromFile(name: String) : List[T] =
-    fromString(io.Source.fromFile(name).mkString)
-
-}
 }
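Note on the matching semantics: one_token sorts the per-rule munch results by the length of the leftover input and takes the head, so the longest match wins; and because Scala's sortBy is stable, a tie between rules matching the same length goes to the rule listed first. A hypothetical keyword-versus-identifier illustration:

// illustrative rules, not from this repository
abstract class Tok
case object T_IF extends Tok
case class T_ID(s: String) extends Tok

val letter = Range("abcdefghijklmnopqrstuvwxyz".toList)
val ident  = SEQ(letter, STAR(letter))

val lexer2 = Tokenizer(List(
  (string2rexp("if"), (cs: List[Char]) => T_IF),          // listed first: wins ties
  (ident,             (cs: List[Char]) => T_ID(cs.mkString))
))

// lexer2.fromString("if")   == List(T_IF)           -- tie goes to the first rule
// lexer2.fromString("iffy") == List(T_ID("iffy"))   -- longest match beats the keyword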