|      1 // A simple lexer inspired by work of Sulzmann & Lu |         | 
|      2 //================================================== |         | 
|      3  |         | 
|      4  |         | 
|      5 import scala.language.implicitConversions     |         | 
|      6 import scala.language.reflectiveCalls |         | 
|      7  |         | 
|      8 // regular expressions including records |         | 
|      9 abstract class Rexp  |         | 
|     10 case object ZERO extends Rexp |         | 
|     11 case object ONE extends Rexp |         | 
|     12 case class CHAR(c: Char) extends Rexp |         | 
|     13 case class ALT(r1: Rexp, r2: Rexp) extends Rexp  |         | 
|     14 case class SEQ(r1: Rexp, r2: Rexp) extends Rexp  |         | 
|     15 case class STAR(r: Rexp) extends Rexp  |         | 
|     16 case class RECD(x: String, r: Rexp) extends Rexp |         | 
|     17    |         | 
|     18 // values   |         | 
|     19 abstract class Val |         | 
|     20 case object Empty extends Val |         | 
|     21 case class Chr(c: Char) extends Val |         | 
|     22 case class Sequ(v1: Val, v2: Val) extends Val |         | 
|     23 case class Left(v: Val) extends Val |         | 
|     24 case class Right(v: Val) extends Val |         | 
|     25 case class Stars(vs: List[Val]) extends Val |         | 
|     26 case class Rec(x: String, v: Val) extends Val |         | 
|     27     |         | 
|     28 // some convenience for typing in regular expressions |         | 
|     29 def charlist2rexp(s : List[Char]): Rexp = s match { |         | 
|     30   case Nil => ONE |         | 
|     31   case c::Nil => CHAR(c) |         | 
|     32   case c::s => SEQ(CHAR(c), charlist2rexp(s)) |         | 
|     33 } |         | 
|     34 implicit def string2rexp(s : String) : Rexp =  |         | 
|     35   charlist2rexp(s.toList) |         | 
|     36  |         | 
|     37 implicit def RexpOps(r: Rexp) = new { |         | 
|     38   def | (s: Rexp) = ALT(r, s) |         | 
|     39   def % = STAR(r) |         | 
|     40   def ~ (s: Rexp) = SEQ(r, s) |         | 
|     41 } |         | 
|     42  |         | 
|     43 implicit def stringOps(s: String) = new { |         | 
|     44   def | (r: Rexp) = ALT(s, r) |         | 
|     45   def | (r: String) = ALT(s, r) |         | 
|     46   def % = STAR(s) |         | 
|     47   def ~ (r: Rexp) = SEQ(s, r) |         | 
|     48   def ~ (r: String) = SEQ(s, r) |         | 
|     49   def $ (r: Rexp) = RECD(s, r) |         | 
|     50 } |         | 
|     51  |         | 
|     52 def nullable(r: Rexp) : Boolean = r match { |         | 
|     53   case ZERO => false |         | 
|     54   case ONE => true |         | 
|     55   case CHAR(_) => false |         | 
|     56   case ALT(r1, r2) => nullable(r1) || nullable(r2) |         | 
|     57   case SEQ(r1, r2) => nullable(r1) && nullable(r2) |         | 
|     58   case STAR(_) => true |         | 
|     59   case RECD(_, r1) => nullable(r1) |         | 
|     60 } |         | 
|     61  |         | 
|     62 def der(c: Char, r: Rexp) : Rexp = r match { |         | 
|     63   case ZERO => ZERO |         | 
|     64   case ONE => ZERO |         | 
|     65   case CHAR(d) => if (c == d) ONE else ZERO |         | 
|     66   case ALT(r1, r2) => ALT(der(c, r1), der(c, r2)) |         | 
|     67   case SEQ(r1, r2) =>  |         | 
|     68     if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2)) |         | 
|     69     else SEQ(der(c, r1), r2) |         | 
|     70   case STAR(r) => SEQ(der(c, r), STAR(r)) |         | 
|     71   case RECD(_, r1) => der(c, r1) |         | 
|     72 } |         | 
|     73  |         | 
|     74  |         | 
|     75 // extracts a string from value |         | 
|     76 def flatten(v: Val) : String = v match { |         | 
|     77   case Empty => "" |         | 
|     78   case Chr(c) => c.toString |         | 
|     79   case Left(v) => flatten(v) |         | 
|     80   case Right(v) => flatten(v) |         | 
|     81   case Sequ(v1, v2) => flatten(v1) + flatten(v2) |         | 
|     82   case Stars(vs) => vs.map(flatten).mkString |         | 
|     83   case Rec(_, v) => flatten(v) |         | 
|     84 } |         | 
|     85  |         | 
|     86  |         | 
|     87 // extracts an environment from a value; |         | 
|     88 // used for tokenise a string |         | 
|     89 def env(v: Val) : List[(String, String)] = v match { |         | 
|     90   case Empty => Nil |         | 
|     91   case Chr(c) => Nil |         | 
|     92   case Left(v) => env(v) |         | 
|     93   case Right(v) => env(v) |         | 
|     94   case Sequ(v1, v2) => env(v1) ::: env(v2) |         | 
|     95   case Stars(vs) => vs.flatMap(env) |         | 
|     96   case Rec(x, v) => (x, flatten(v))::env(v) |         | 
|     97 } |         | 
|     98  |         | 
|     99 // The Injection Part of the lexer |         | 
|    100  |         | 
|    101 def mkeps(r: Rexp) : Val = r match { |         | 
|    102   case ONE => Empty |         | 
|    103   case ALT(r1, r2) =>  |         | 
|    104     if (nullable(r1)) Left(mkeps(r1)) else Right(mkeps(r2)) |         | 
|    105   case SEQ(r1, r2) => Sequ(mkeps(r1), mkeps(r2)) |         | 
|    106   case STAR(r) => Stars(Nil) |         | 
|    107   case RECD(x, r) => Rec(x, mkeps(r)) |         | 
|    108 } |         | 
|    109  |         | 
|    110 def inj(r: Rexp, c: Char, v: Val) : Val = (r, v) match { |         | 
|    111   case (STAR(r), Sequ(v1, Stars(vs))) => Stars(inj(r, c, v1)::vs) |         | 
|    112   case (SEQ(r1, r2), Sequ(v1, v2)) => Sequ(inj(r1, c, v1), v2) |         | 
|    113   case (SEQ(r1, r2), Left(Sequ(v1, v2))) => Sequ(inj(r1, c, v1), v2) |         | 
|    114   case (SEQ(r1, r2), Right(v2)) => Sequ(mkeps(r1), inj(r2, c, v2)) |         | 
|    115   case (ALT(r1, r2), Left(v1)) => Left(inj(r1, c, v1)) |         | 
|    116   case (ALT(r1, r2), Right(v2)) => Right(inj(r2, c, v2)) |         | 
|    117   case (CHAR(d), Empty) => Chr(c)  |         | 
|    118   case (RECD(x, r1), _) => Rec(x, inj(r1, c, v)) |         | 
|    119 } |         | 
|    120  |         | 
|    121 // some "rectification" functions for simplification |         | 
|    122 def F_ID(v: Val): Val = v |         | 
|    123 def F_RIGHT(f: Val => Val) = (v:Val) => Right(f(v)) |         | 
|    124 def F_LEFT(f: Val => Val) = (v:Val) => Left(f(v)) |         | 
|    125 def F_ALT(f1: Val => Val, f2: Val => Val) = (v:Val) => v match { |         | 
|    126   case Right(v) => Right(f2(v)) |         | 
|    127   case Left(v) => Left(f1(v)) |         | 
|    128 } |         | 
|    129 def F_SEQ(f1: Val => Val, f2: Val => Val) = (v:Val) => v match { |         | 
|    130   case Sequ(v1, v2) => Sequ(f1(v1), f2(v2)) |         | 
|    131 } |         | 
|    132 def F_SEQ_Empty1(f1: Val => Val, f2: Val => Val) =  |         | 
|    133   (v:Val) => Sequ(f1(Empty), f2(v)) |         | 
|    134 def F_SEQ_Empty2(f1: Val => Val, f2: Val => Val) =  |         | 
|    135   (v:Val) => Sequ(f1(v), f2(Empty)) |         | 
|    136 def F_RECD(f: Val => Val) = (v:Val) => v match { |         | 
|    137   case Rec(x, v) => Rec(x, f(v)) |         | 
|    138 } |         | 
|    139 def F_ERROR(v: Val): Val = throw new Exception("error") |         | 
|    140  |         | 
|    141 def simp(r: Rexp): (Rexp, Val => Val) = r match { |         | 
|    142   case ALT(r1, r2) => { |         | 
|    143     val (r1s, f1s) = simp(r1) |         | 
|    144     val (r2s, f2s) = simp(r2) |         | 
|    145     (r1s, r2s) match { |         | 
|    146       case (ZERO, _) => (r2s, F_RIGHT(f2s)) |         | 
|    147       case (_, ZERO) => (r1s, F_LEFT(f1s)) |         | 
|    148       case _ => if (r1s == r2s) (r1s, F_LEFT(f1s)) |         | 
|    149                 else (ALT (r1s, r2s), F_ALT(f1s, f2s))  |         | 
|    150     } |         | 
|    151   } |         | 
|    152   case SEQ(r1, r2) => { |         | 
|    153     val (r1s, f1s) = simp(r1) |         | 
|    154     val (r2s, f2s) = simp(r2) |         | 
|    155     (r1s, r2s) match { |         | 
|    156       case (ZERO, _) => (ZERO, F_ERROR) |         | 
|    157       case (_, ZERO) => (ZERO, F_ERROR) |         | 
|    158       case (ONE, _) => (r2s, F_SEQ_Empty1(f1s, f2s)) |         | 
|    159       case (_, ONE) => (r1s, F_SEQ_Empty2(f1s, f2s)) |         | 
|    160       case _ => (SEQ(r1s,r2s), F_SEQ(f1s, f2s)) |         | 
|    161     } |         | 
|    162   } |         | 
|    163   case RECD(x, r1) => { |         | 
|    164     val (r1s, f1s) = simp(r1) |         | 
|    165     (RECD(x, r1s), F_RECD(f1s)) |         | 
|    166   } |         | 
|    167   case r => (r, F_ID) |         | 
|    168 } |         | 
|    169  |         | 
|    170 // lexing functions including simplification |         | 
|    171 def lex_simp(r: Rexp, s: List[Char]) : Val = s match { |         | 
|    172   case Nil => if (nullable(r)) mkeps(r) else  |         | 
|    173     { throw new Exception("lexing error") }  |         | 
|    174   case c::cs => { |         | 
|    175     val (r_simp, f_simp) = simp(der(c, r)) |         | 
|    176     inj(r, c, f_simp(lex_simp(r_simp, cs))) |         | 
|    177   } |         | 
|    178 } |         | 
|    179  |         | 
|    180 def lexing_simp(r: Rexp, s: String) =  |         | 
|    181   env(lex_simp(r, s.toList)) |         | 
|    182  |         | 
|    183  |         | 
|    184 // The Lexing Rules for the Fun Language |         | 
|    185  |         | 
|    186 def PLUS(r: Rexp) = r ~ r.% |         | 
|    187  |         | 
|    188 def Range(s : List[Char]) : Rexp = s match { |         | 
|    189   case Nil => ZERO |         | 
|    190   case c::Nil => CHAR(c) |         | 
|    191   case c::s => ALT(CHAR(c), Range(s)) |         | 
|    192 } |         | 
|    193 def RANGE(s: String) = Range(s.toList) |         | 
|    194  |         | 
|    195 val SYM = RANGE("ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz_") |         | 
|    196 val DIGIT = RANGE("0123456789") |         | 
|    197 val ID = SYM ~ (SYM | DIGIT).%  |         | 
|    198 val NUM = PLUS(DIGIT) |         | 
|    199 val KEYWORD : Rexp = "skip" | "while" | "do" | "if" | "then" | "else" | "read" | "write"  |         | 
|    200 val SEMI: Rexp = ";" |         | 
|    201 val OP: Rexp = ":=" | "=" | "-" | "+" | "*" | "!=" | "<" | ">" |         | 
|    202 val WHITESPACE = PLUS(" " | "\n" | "\t") |         | 
|    203 val RPAREN: Rexp = "{" |         | 
|    204 val LPAREN: Rexp = "}" |         | 
|    205 val STRING: Rexp = "\"" ~ SYM.% ~ "\"" |         | 
|    206  |         | 
|    207  |         | 
|    208 val WHILE_REGS = (("k" $ KEYWORD) |  |         | 
|    209                   ("i" $ ID) |  |         | 
|    210                   ("o" $ OP) |  |         | 
|    211                   ("n" $ NUM) |  |         | 
|    212                   ("s" $ SEMI) |  |         | 
|    213                   ("str" $ STRING) | |         | 
|    214                   ("p" $ (LPAREN | RPAREN)) |  |         | 
|    215                   ("w" $ WHITESPACE)).% |         | 
|    216  |         | 
|    217  |         | 
|    218 // Two Simple While Tests |         | 
|    219 //======================== |         | 
|    220  |         | 
|    221 println("test: read n") |         | 
|    222  |         | 
|    223 val prog0 = """read n""" |         | 
|    224 println(lexing_simp(WHILE_REGS, prog0)) |         | 
|    225  |         | 
|    226 println("test: read  n; write n ") |         | 
|    227  |         | 
|    228 val prog1 = """read  n; write n""" |         | 
|    229 println(lexing_simp(WHILE_REGS, prog1)) |         | 
|    230  |         | 
|    231  |         | 
|    232 // Bigger Tests |         | 
|    233 //============== |         | 
|    234  |         | 
|    235 // escapes strings and prints them out as "", "\n" and so on |         | 
|    236 def esc(raw: String): String = { |         | 
|    237   import scala.reflect.runtime.universe._ |         | 
|    238   Literal(Constant(raw)).toString |         | 
|    239 } |         | 
|    240  |         | 
|    241 def escape(tks: List[(String, String)]) = |         | 
|    242   tks.map{ case (s1, s2) => (s1, esc(s2))} |         | 
|    243  |         | 
|    244 val prog2 = """ |         | 
|    245 write "Fib"; |         | 
|    246 read n; |         | 
|    247 minus1 := 0; |         | 
|    248 minus2 := 1; |         | 
|    249 while n > 0 do { |         | 
|    250   temp := minus2; |         | 
|    251   minus2 := minus1 + minus2; |         | 
|    252   minus1 := temp; |         | 
|    253   n := n - 1 |         | 
|    254 }; |         | 
|    255 write "Result"; |         | 
|    256 write minus2 |         | 
|    257 """ |         | 
|    258  |         | 
|    259 println("Tokens for Fib") |         | 
|    260 println(escape(lexing_simp(WHILE_REGS, prog2)).mkString("\n")) |         | 
|    261  |         | 
|    262  |         | 
|    263  |         | 
|    264 val prog3 = """ |         | 
|    265 start := 1000; |         | 
|    266 x := start; |         | 
|    267 y := start; |         | 
|    268 z := start; |         | 
|    269 while 0 < x do { |         | 
|    270  while 0 < y do { |         | 
|    271   while 0 < z do { |         | 
|    272     z := z - 1 |         | 
|    273   }; |         | 
|    274   z := start; |         | 
|    275   y := y - 1 |         | 
|    276  };      |         | 
|    277  y := start; |         | 
|    278  x := x - 1 |         | 
|    279 } |         | 
|    280 """ |         | 
|    281  |         | 
|    282 println("Tokens for Loops") |         | 
|    283 println(escape(lexing_simp(WHILE_REGS, prog3)).mkString("\n")) |         | 
|    284  |         | 
|    285  |         | 
|    286  |         | 
|    287  |         | 
|    288  |         | 
|    289  |         | 
|    290  |         | 
|    291 // The tokens for the WHILE language |         | 
|    292 import java.io._ |         | 
|    293  |         | 
|    294 abstract class Token extends Serializable  |         | 
|    295 case object T_SEMI extends Token |         | 
|    296 case object T_LPAREN extends Token |         | 
|    297 case object T_RPAREN extends Token |         | 
|    298 case class T_ID(s: String) extends Token |         | 
|    299 case class T_OP(s: String) extends Token |         | 
|    300 case class T_NUM(n: Int) extends Token |         | 
|    301 case class T_KWD(s: String) extends Token |         | 
|    302 case class T_STR(s: String) extends Token |         | 
|    303  |         | 
|    304 val token : PartialFunction[(String, String), Token] = { |         | 
|    305   case ("s", _) => T_SEMI |         | 
|    306   case ("p", "{") => T_LPAREN |         | 
|    307   case ("p", "}") => T_RPAREN |         | 
|    308   case ("i", s) => T_ID(s) |         | 
|    309   case ("o", s) => T_OP(s) |         | 
|    310   case ("n", s) => T_NUM(s.toInt) |         | 
|    311   case ("k", s) => T_KWD(s) |         | 
|    312   case ("str", s) => T_STR(s) |         | 
|    313 } |         | 
|    314  |         | 
|    315 def tokenise(s: String) : List[Token] =  |         | 
|    316   lexing_simp(WHILE_REGS, s).collect(token) |         | 
|    317  |         | 
|    318 tokenise(prog2) |         | 
|    319 tokenise(prog3) |         | 
|    320  |         | 
|    321  |         | 
|    322 object Lexer extends App { |         | 
|    323  |         | 
|    324   val fname = "/tmp/nflx" |         | 
|    325  |         | 
|    326   def serialise(tks: List[Token]) = { |         | 
|    327     val out = new ObjectOutputStream(new FileOutputStream(fname)) |         | 
|    328     out.writeObject(tks) |         | 
|    329     out.close |         | 
|    330   } |         | 
|    331  |         | 
|    332   def deserialise() : List[Token] = { |         | 
|    333     val in = new ObjectInputStream(new FileInputStream(fname)) |         | 
|    334     val tks = in.readObject.asInstanceOf[List[Token]] |         | 
|    335     in.close |         | 
|    336     tks |         | 
|    337   } |         | 
|    338  |         | 
|    339   serialise(tokenise(prog2)) |         | 
|    340   println(deserialise()) |         | 
|    341  |         | 
|    342 } |         |