matcher.scala
changeset 64 2d625418c011
parent 62 5988e44ea048
child 75 898c25a4e399
equal deleted inserted replaced
63:dff4b062a8a9 64:2d625418c011
     7 case class CHAR(c: Char) extends Rexp
     7 case class CHAR(c: Char) extends Rexp
     8 case class ALT(r1: Rexp, r2: Rexp) extends Rexp
     8 case class ALT(r1: Rexp, r2: Rexp) extends Rexp
     9 case class SEQ(r1: Rexp, r2: Rexp) extends Rexp
     9 case class SEQ(r1: Rexp, r2: Rexp) extends Rexp
    10 case class STAR(r: Rexp) extends Rexp
    10 case class STAR(r: Rexp) extends Rexp
    11 case class NOT(r: Rexp) extends Rexp
    11 case class NOT(r: Rexp) extends Rexp
    12 
       
    13 
       
    14 // some convenience for typing in regular expressions
       
    15 def charlist2rexp(s : List[Char]) : Rexp = s match {
       
    16   case Nil => EMPTY
       
    17   case c::Nil => CHAR(c)
       
    18   case c::s => SEQ(CHAR(c), charlist2rexp(s))
       
    19 }
       
    20 implicit def string2rexp(s : String) : Rexp = charlist2rexp(s.toList)
       
    21 
    12 
    22 
    13 
    23 // nullable function: tests whether the regular 
    14 // nullable function: tests whether the regular 
    24 // expression can recognise the empty string
    15 // expression can recognise the empty string
    25 def nullable (r: Rexp) : Boolean = r match {
    16 def nullable (r: Rexp) : Boolean = r match {
    57   case NOT(r) => NOT(der (c, r))
    48   case NOT(r) => NOT(der (c, r))
    58 }
    49 }
    59 
    50 
    60 // regular expression for specifying 
    51 // regular expression for specifying 
    61 // ranges of characters
    52 // ranges of characters
    62 def RANGE(s : List[Char]) : Rexp = s match {
    53 def Range(s : List[Char]) : Rexp = s match {
    63   case Nil => NULL
    54   case Nil => NULL
    64   case c::Nil => CHAR(c)
    55   case c::Nil => CHAR(c)
    65   case c::s => ALT(CHAR(c), RANGE(s))
    56   case c::s => ALT(CHAR(c), Range(s))
    66 }
    57 }
       
    58 def RANGE(s: String) = Range(s.toList)
       
    59 
    67 
    60 
    68 // one or more
    61 // one or more
    69 def PLUS(r: Rexp) = SEQ(r, STAR(r))
    62 def PLUS(r: Rexp) = SEQ(r, STAR(r))
    70 
    63 
       
    64 // many alternatives
       
    65 def Alts(rs: List[Rexp]) : Rexp = rs match {
       
    66   case Nil => NULL
       
    67   case r::Nil => r
       
    68   case r::rs => ALT(r, Alts(rs))
       
    69 }
       
    70 def ALTS(rs: Rexp*) = Alts(rs.toList)
       
    71 
       
    72 // repetitions
       
    73 def Seqs(rs: List[Rexp]) : Rexp = rs match {
       
    74   case Nil => NULL
       
    75   case r::Nil => r
       
    76   case r::rs => SEQ(r, Seqs(rs))
       
    77 }
       
    78 def SEQS(rs: Rexp*) = Seqs(rs.toList)
       
    79 
       
    80 // some convenience for typing in regular expressions
       
    81 def charlist2rexp(s : List[Char]) : Rexp = s match {
       
    82   case Nil => EMPTY
       
    83   case c::Nil => CHAR(c)
       
    84   case c::s => SEQ(CHAR(c), charlist2rexp(s))
       
    85 }
       
    86 implicit def string2rexp(s : String) : Rexp = charlist2rexp(s.toList)
       
    87 
    71 
    88 
    72 type Rule[T] = (Rexp, List[Char] => T)
    89 type Rule[T] = (Rexp, List[Char] => T)
    73 
    90 
    74 def error (s: String) = throw new IllegalArgumentException ("Cannot tokenize: " + s)
    91 case class Tokenizer[T](rules: List[Rule[T]], excl: List[T] = Nil) {
    75 
    92 
    76 def munch[T](r: Rexp, action: List[Char] => T, s: List[Char], t: List[Char]) : Option[(List[Char], T)] = 
    93   def munch(r: Rexp, action: List[Char] => T, s: List[Char], t: List[Char]) : Option[(List[Char], T)] = 
    77   s match {
    94     s match {
    78     case Nil if (nullable(r)) => Some(Nil, action(t))
    95       case Nil if (nullable(r)) => Some(Nil, action(t))
    79     case Nil => None
    96       case Nil => None
    80     case c::s if (no_more(der (c, r)) && nullable(r)) => Some(c::s, action(t))
    97       case c::s if (no_more(der (c, r)) && nullable(r)) => Some(c::s, action(t))
    81     case c::s if (no_more(der (c, r))) => None
    98       case c::s if (no_more(der (c, r))) => None
    82     case c::s => munch(der (c, r), action, s, t ::: List(c))
    99       case c::s => munch(der (c, r), action, s, t ::: List(c))
       
   100     }
       
   101 
       
   102   def one_token(s: List[Char]) : Either[(List[Char], T), String] = {
       
   103     val somes = rules.map { (r) => munch(r._1, r._2, s, Nil) }.flatten
       
   104     if (somes == Nil) Right(s.mkString) 
       
   105     else Left(somes sortBy (_._1.length) head)
    83   }
   106   }
    84 
   107 
    85 def one_token[T](rs: List[Rule[T]], s: List[Char]) : (List[Char], T) = {
   108   def tokenize(cs: List[Char]) : List[T] = cs match {
    86  val somes = rs.map { (r) => munch(r._1, r._2, s, Nil) } .flatten
   109     case Nil => Nil
    87  if (somes == Nil) error(s.mkString) else (somes sortBy (_._1.length) head)
   110     case _ => one_token(cs) match {
       
   111       case Left((rest, token)) => token :: tokenize(rest)
       
   112       case Right(s) => { println("Cannot tokenize: \"" + s + "\""); Nil } 
       
   113     }
       
   114   }
       
   115 
       
   116   def fromString(s: String) : List[T] = 
       
   117     tokenize(s.toList).filterNot(excl.contains(_))
       
   118 
       
   119   def fromFile(name: String) : List[T] = 
       
   120     fromString(io.Source.fromFile(name).mkString)
       
   121 
    88 }
   122 }
    89 
   123 
    90 def tokenize[T](rs: List[Rule[T]], s: List[Char]) : List[T] = s match {
       
    91   case Nil => Nil
       
    92   case _ => one_token(rs, s) match {
       
    93     case (rest, token) => token :: tokenize(rs, rest) 
       
    94   }
       
    95 }
       
    96 
       
    97 
       
    98 
       
    99 
       
   100