| 29 |      1 | 
 | 
|  |      2 | // regular expressions including NOT
 | 
| 26 |      3 | abstract class Rexp
 | 
|  |      4 | 
 | 
|  |      5 | case object NULL extends Rexp
 | 
|  |      6 | case object EMPTY extends Rexp
 | 
|  |      7 | case class CHAR(c: Char) extends Rexp
 | 
|  |      8 | case class ALT(r1: Rexp, r2: Rexp) extends Rexp
 | 
|  |      9 | case class SEQ(r1: Rexp, r2: Rexp) extends Rexp
 | 
|  |     10 | case class STAR(r: Rexp) extends Rexp
 | 
|  |     11 | case class NOT(r: Rexp) extends Rexp
 | 
|  |     12 | 
 | 
|  |     13 | 
 | 
|  |     14 | // some convenience for typing in regular expressions
 | 
|  |     15 | def charlist2rexp(s : List[Char]) : Rexp = s match {
 | 
|  |     16 |   case Nil => EMPTY
 | 
|  |     17 |   case c::Nil => CHAR(c)
 | 
|  |     18 |   case c::s => SEQ(CHAR(c), charlist2rexp(s))
 | 
|  |     19 | }
 | 
|  |     20 | implicit def string2rexp(s : String) : Rexp = charlist2rexp(s.toList)
 | 
|  |     21 | 
 | 
|  |     22 | 
 | 
|  |     23 | // nullable function: tests whether the regular 
 | 
|  |     24 | // expression can recognise the empty string
 | 
|  |     25 | def nullable (r: Rexp) : Boolean = r match {
 | 
|  |     26 |   case NULL => false
 | 
|  |     27 |   case EMPTY => true
 | 
|  |     28 |   case CHAR(_) => false
 | 
|  |     29 |   case ALT(r1, r2) => nullable(r1) || nullable(r2)
 | 
|  |     30 |   case SEQ(r1, r2) => nullable(r1) && nullable(r2)
 | 
|  |     31 |   case STAR(_) => true
 | 
|  |     32 |   case NOT(r) => !(nullable(r))
 | 
|  |     33 | }
 | 
|  |     34 | 
 | 
|  |     35 | // tests whether a regular expression 
 | 
| 29 |     36 | // cannot recognise more
 | 
|  |     37 | def no_more (r: Rexp) : Boolean = r match {
 | 
| 26 |     38 |   case NULL => true
 | 
|  |     39 |   case EMPTY => false
 | 
|  |     40 |   case CHAR(_) => false
 | 
| 29 |     41 |   case ALT(r1, r2) => no_more(r1) && no_more(r2)
 | 
|  |     42 |   case SEQ(r1, r2) => if (nullable(r1)) (no_more(r1) && no_more(r2)) else no_more(r1)
 | 
| 26 |     43 |   case STAR(_) => false
 | 
| 29 |     44 |   case NOT(r) => !(no_more(r))
 | 
| 26 |     45 | }
 | 
|  |     46 | 
 | 
|  |     47 | 
 | 
|  |     48 | // derivative of a regular expression w.r.t. a character
 | 
|  |     49 | def der (c: Char, r: Rexp) : Rexp = r match {
 | 
|  |     50 |   case NULL => NULL
 | 
|  |     51 |   case EMPTY => NULL
 | 
|  |     52 |   case CHAR(d) => if (c == d) EMPTY else NULL
 | 
|  |     53 |   case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))
 | 
|  |     54 |   case SEQ(r1, r2) => 
 | 
|  |     55 |     if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
 | 
|  |     56 |     else SEQ(der(c, r1), r2)
 | 
|  |     57 |   case STAR(r) => SEQ(der(c, r), STAR(r))
 | 
|  |     58 |   case NOT(r) => NOT(der (c, r))
 | 
|  |     59 | }
 | 
|  |     60 | 
 | 
|  |     61 | 
 | 
|  |     62 | // regular expression for specifying 
 | 
|  |     63 | // ranges of characters
 | 
|  |     64 | def RANGE(s : List[Char]) : Rexp = s match {
 | 
|  |     65 |   case Nil => NULL
 | 
|  |     66 |   case c::Nil => CHAR(c)
 | 
|  |     67 |   case c::s => ALT(CHAR(c), RANGE(s))
 | 
|  |     68 | }
 | 
|  |     69 | 
 | 
|  |     70 | //one or more
 | 
|  |     71 | def PLUS(r: Rexp) = SEQ(r, STAR(r))
 | 
|  |     72 | 
 | 
|  |     73 | 
 | 
|  |     74 | //some regular expressions
 | 
|  |     75 | val LOWERCASE = RANGE("abcdefghijklmnopqrstuvwxyz".toList)
 | 
|  |     76 | val UPPERCASE = RANGE("ABCDEFGHIJKLMNOPQRSTUVWXYZ".toList)
 | 
|  |     77 | val LETTER = ALT(LOWERCASE, UPPERCASE)
 | 
| 29 |     78 | val DIGIT = RANGE("0123456789".toList)
 | 
|  |     79 | val NONZERODIGIT = RANGE("123456789".toList)
 | 
| 26 |     80 | 
 | 
| 29 |     81 | val IDENT = SEQ(LETTER, STAR(ALT(LETTER,DIGIT)))
 | 
|  |     82 | val NUMBER = ALT(SEQ(NONZERODIGIT, STAR(DIGIT)), "0")
 | 
| 26 |     83 | val WHITESPACE = RANGE(" \n".toList)
 | 
| 28 |     84 | val WHITESPACES = PLUS(WHITESPACE)
 | 
| 26 |     85 | 
 | 
| 29 |     86 | val ALL = ALT(ALT(LETTER, DIGIT), WHITESPACE)
 | 
| 26 |     87 | val COMMENT = SEQ(SEQ("/*", NOT(SEQ(SEQ(STAR(ALL), "*/"), STAR(ALL)))), "*/")
 | 
|  |     88 | 
 | 
|  |     89 | 
 | 
|  |     90 | // an example list of regular expressions
 | 
| 29 |     91 | val regs: List[Rexp]=  List("if", "then", "else", "+", IDENT, NUMBER, WHITESPACES, COMMENT) 
 | 
|  |     92 | 
 | 
| 26 |     93 | 
 | 
|  |     94 | def error (s: String) = throw new IllegalArgumentException ("Could not lex " + s)
 | 
|  |     95 | 
 | 
| 29 |     96 | def munch(r: Rexp, s: List[Char], t: List[Char]) : Option[(List[Char], List[Char])] = 
 | 
|  |     97 |   s match {
 | 
|  |     98 |     case Nil if (nullable(r)) => Some(Nil, t)
 | 
|  |     99 |     case Nil => None
 | 
|  |    100 |     case c::s if (no_more(der (c, r)) && nullable(r)) => Some(c::s, t)
 | 
|  |    101 |     case c::s if (no_more(der (c, r))) => None
 | 
|  |    102 |     case c::s => munch(der (c, r), s, t ::: List(c))
 | 
|  |    103 |   }
 | 
| 26 |    104 | 
 | 
| 29 |    105 | def one_string (regs: List[Rexp], s: List[Char]) : (List[Char], List[Char]) = {
 | 
| 26 |    106 |  val somes = regs.map { munch(_, s, Nil) } .flatten
 | 
|  |    107 |  if (somes == Nil) error(s.mkString) else (somes sortBy (_._1.length) head)
 | 
|  |    108 | }
 | 
|  |    109 | 
 | 
| 29 |    110 | def tokenize (regs: List[Rexp], s: List[Char]) : List[String] = s match {
 | 
| 26 |    111 |   case Nil => Nil
 | 
| 29 |    112 |   case _ => one_string(regs, s) match {
 | 
|  |    113 |     case (rest, s) => s.mkString :: tokenize(regs, rest) 
 | 
| 26 |    114 |   }
 | 
|  |    115 | }
 | 
|  |    116 | 
 | 
| 29 |    117 | //examples
 | 
|  |    118 | println(tokenize(regs, "if true then then 42 else +".toList))
 | 
|  |    119 | println(tokenize(regs, "if+true+then+then+42+else +".toList))
 | 
|  |    120 | println(tokenize(regs, "ifff if     34 34".toList))
 | 
|  |    121 | println(tokenize(regs, "/*ifff if */ hhjj /*34 */".toList))
 | 
|  |    122 | println(tokenize(regs, "/* if true then */ then 42 else +".toList))
 | 
|  |    123 | //println(tokenize(regs, "ifff $ if 34".toList)) // causes an error because of the symbol $
 |