progs/lexer/token2.scala
changeset 729 b147a10be8dd
parent 728 c669b39debe3
child 730 18fee9d3b6a8
equal deleted inserted replaced
728:c669b39debe3 729:b147a10be8dd
     1 import scala.language.implicitConversions    
       
     2 import scala.language.reflectiveCalls
       
     3 import scala.annotation.tailrec   
       
     4 
       
     5 abstract class Rexp 
       
     6 case object NULL extends Rexp
       
     7 case object EMPTY extends Rexp
       
     8 case class CHAR(c: Char) extends Rexp
       
     9 case class ALT(r1: Rexp, r2: Rexp) extends Rexp 
       
    10 case class SEQ(r1: Rexp, r2: Rexp) extends Rexp 
       
    11 case class STAR(r: Rexp) extends Rexp 
       
    12 case class RECD(x: String, r: Rexp) extends Rexp
       
    13 case class CRANGE(cs: String) extends Rexp
       
    14 case class PLUS(r: Rexp) extends Rexp
       
    15 case class OPT(r: Rexp) extends Rexp
       
    16 case class NTIMES(r: Rexp, n: Int) extends Rexp
       
    17 
       
    18 abstract class Val
       
    19 case object Empty extends Val
       
    20 case class Chr(c: Char) extends Val
       
    21 case class Seq(v1: Val, v2: Val) extends Val
       
    22 case class Left(v: Val) extends Val
       
    23 case class Right(v: Val) extends Val
       
    24 case class Stars(vs: List[Val]) extends Val
       
    25 case class Rec(x: String, v: Val) extends Val
       
    26    
       
    27 // some convenience for typing in regular expressions
       
    28 def charlist2rexp(s : List[Char]): Rexp = s match {
       
    29   case Nil => EMPTY
       
    30   case c::Nil => CHAR(c)
       
    31   case c::s => SEQ(CHAR(c), charlist2rexp(s))
       
    32 }
       
    33 implicit def string2rexp(s : String) : Rexp = charlist2rexp(s.toList)
       
    34 
       
    35 implicit def RexpOps(r: Rexp) = new {
       
    36   def | (s: Rexp) = ALT(r, s)
       
    37   def % = STAR(r)
       
    38   def ~ (s: Rexp) = SEQ(r, s)
       
    39 }
       
    40 
       
    41 implicit def stringOps(s: String) = new {
       
    42   def | (r: Rexp) = ALT(s, r)
       
    43   def | (r: String) = ALT(s, r)
       
    44   def % = STAR(s)
       
    45   def ~ (r: Rexp) = SEQ(s, r)
       
    46   def ~ (r: String) = SEQ(s, r)
       
    47   def $ (r: Rexp) = RECD(s, r)
       
    48 }
       
    49 
       
    50 // nullable function: tests whether the regular 
       
    51 // expression can recognise the empty string
       
    52 def nullable (r: Rexp) : Boolean = r match {
       
    53   case NULL => false
       
    54   case EMPTY => true
       
    55   case CHAR(_) => false
       
    56   case ALT(r1, r2) => nullable(r1) || nullable(r2)
       
    57   case SEQ(r1, r2) => nullable(r1) && nullable(r2)
       
    58   case STAR(_) => true
       
    59   case RECD(_, r) => nullable(r)
       
    60   case CRANGE(_) => false
       
    61   case PLUS(r) => nullable(r)
       
    62   case OPT(_) => true
       
    63   case NTIMES(r, n) => if (n == 0) true else nullable(r)
       
    64 }
       
    65 
       
    66 // derivative of a regular expression w.r.t. a character
       
    67 def der (c: Char, r: Rexp) : Rexp = r match {
       
    68   case NULL => NULL
       
    69   case EMPTY => NULL
       
    70   case CHAR(d) => if (c == d) EMPTY else NULL
       
    71   case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))
       
    72   case SEQ(r1, r2) => 
       
    73     if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
       
    74     else SEQ(der(c, r1), r2)
       
    75   case STAR(r) => SEQ(der(c, r), STAR(r))
       
    76   case RECD(_, r1) => der(c, r1)
       
    77   case CRANGE(cs) => if (cs.contains(c)) EMPTY else NULL
       
    78   case PLUS(r) => SEQ(der(c, r), STAR(r))
       
    79   case OPT(r) => ALT(der(c, r), NULL)
       
    80   case NTIMES(r, n) => if (n == 0) NULL else der(c, SEQ(r, NTIMES(r, n - 1)))
       
    81 }
       
    82 
       
    83 // derivative w.r.t. a string (iterates der)
       
    84 def ders (s: List[Char], r: Rexp) : Rexp = s match {
       
    85   case Nil => r
       
    86   case c::s => ders(s, der(c, r))
       
    87 }
       
    88 
       
    89 // extracts a string from value
       
    90 def flatten(v: Val) : String = v match {
       
    91   case Empty => ""
       
    92   case Chr(c) => c.toString
       
    93   case Left(v) => flatten(v)
       
    94   case Right(v) => flatten(v)
       
    95   case Seq(v1, v2) => flatten(v1) + flatten(v2)
       
    96   case Stars(vs) => vs.map(flatten).mkString
       
    97   case Rec(_, v) => flatten(v)
       
    98 }
       
    99 
       
   100 // extracts an environment from a value
       
   101 def env(v: Val) : List[(String, String)] = v match {
       
   102   case Empty => Nil
       
   103   case Chr(c) => Nil
       
   104   case Left(v) => env(v)
       
   105   case Right(v) => env(v)
       
   106   case Seq(v1, v2) => env(v1) ::: env(v2)
       
   107   case Stars(vs) => vs.flatMap(env)
       
   108   case Rec(x, v) => (x, flatten(v))::env(v)
       
   109 }
       
   110 
       
   111 // injection part
       
   112 def mkeps(r: Rexp) : Val = r match {
       
   113   case EMPTY => Empty
       
   114   case ALT(r1, r2) => 
       
   115     if (nullable(r1)) Left(mkeps(r1)) else Right(mkeps(r2))
       
   116   case SEQ(r1, r2) => Seq(mkeps(r1), mkeps(r2))
       
   117   case STAR(r) => Stars(Nil)
       
   118   case RECD(x, r) => Rec(x, mkeps(r))
       
   119   case PLUS(r) => Stars(List(mkeps(r)))
       
   120   case OPT(_) => Right(Empty)
       
   121   case NTIMES(r, n) => if (n == 0) Stars(Nil) else Stars(Nil.padTo(n, mkeps(r)))
       
   122   case _ => { println ("r : " + r.toString); 
       
   123               throw new Exception("mkeps error")}
       
   124 }
       
   125 
       
   126 
       
   127 def inj(r: Rexp, c: Char, v: Val) : Val = (r, v) match {
       
   128   case (STAR(r), Seq(v1, Stars(vs))) => Stars(inj(r, c, v1)::vs)
       
   129   case (SEQ(r1, r2), Seq(v1, v2)) => Seq(inj(r1, c, v1), v2)
       
   130   case (SEQ(r1, r2), Left(Seq(v1, v2))) => Seq(inj(r1, c, v1), v2)
       
   131   case (SEQ(r1, r2), Right(v2)) => Seq(mkeps(r1), inj(r2, c, v2))
       
   132   case (ALT(r1, r2), Left(v1)) => Left(inj(r1, c, v1))
       
   133   case (ALT(r1, r2), Right(v2)) => Right(inj(r2, c, v2))
       
   134   case (CHAR(_), Empty) => Chr(c) 
       
   135   case (CRANGE(_), Empty) => Chr(c) 
       
   136   case (RECD(x, r1), _) => Rec(x, inj(r1, c, v))
       
   137   case (PLUS(r), Seq(v1, Stars(vs))) => Stars(inj(r, c, v1)::vs)
       
   138   case (OPT(r), Left(v1)) => Left(inj(r, c, v1))
       
   139   case (NTIMES(r, n), Seq(v1, Stars(vs))) => Stars(inj(r, c, v1)::vs)
       
   140   case (NTIMES(r, n), Left(Seq(v1, Stars(vs)))) => Stars(inj(r, c, v1)::vs)
       
   141   case (NTIMES(r, n), Right(Stars(v::vs))) => Stars(mkeps(r)::inj(r, c, v)::vs)
       
   142   case _ => { println ("r : " + r.toString + "  v: " + v.toString); 
       
   143               throw new Exception("inj error")}
       
   144 }
       
   145 
       
   146 // main lexing function (produces a value)
       
   147 def lex(r: Rexp, s: List[Char]) : Val = s match {
       
   148   case Nil => if (nullable(r)) mkeps(r) else throw new Exception("Not matched")
       
   149   case c::cs => inj(r, c, lex(der(c, r), cs))
       
   150 }
       
   151 
       
   152 def lexing(r: Rexp, s: String) : Val = lex(r, s.toList)
       
   153 
       
   154 lexing(("ab" | "ab") ~ ("b" | EMPTY), "ab")
       
   155 
       
   156 lexing(OPT("ab"), "ab")
       
   157 
       
   158 lexing(NTIMES("1", 3), "111")
       
   159 lexing(NTIMES("1" | EMPTY, 3), "11")
       
   160 
       
   161 // some "rectification" functions for simplification
       
   162 def F_ID(v: Val): Val = v
       
   163 def F_RIGHT(f: Val => Val) = (v:Val) => Right(f(v))
       
   164 def F_LEFT(f: Val => Val) = (v:Val) => Left(f(v))
       
   165 def F_ALT(f1: Val => Val, f2: Val => Val) = (v:Val) => v match {
       
   166   case Right(v) => Right(f2(v))
       
   167   case Left(v) => Left(f1(v))
       
   168 }
       
   169 def F_SEQ(f1: Val => Val, f2: Val => Val) = (v:Val) => v match {
       
   170   case Seq(v1, v2) => Seq(f1(v1), f2(v2))
       
   171 }
       
   172 def F_SEQ_Empty1(f1: Val => Val, f2: Val => Val) = 
       
   173   (v:Val) => Seq(f1(Empty), f2(v))
       
   174 def F_SEQ_Empty2(f1: Val => Val, f2: Val => Val) = 
       
   175   (v:Val) => Seq(f1(v), f2(Empty))
       
   176 def F_RECD(f: Val => Val) = (v:Val) => v match {
       
   177   case Rec(x, v) => Rec(x, f(v))
       
   178 }
       
   179 def F_ERROR(v: Val): Val = throw new Exception("error")
       
   180 
       
   181 // simplification of regular expressions returning also an
       
   182 // rectification function; no simplification under STAR 
       
   183 def simp(r: Rexp): (Rexp, Val => Val) = r match {
       
   184   case ALT(r1, r2) => {
       
   185     val (r1s, f1s) = simp(r1)
       
   186     val (r2s, f2s) = simp(r2)
       
   187     (r1s, r2s) match {
       
   188       case (NULL, _) => (r2s, F_RIGHT(f2s))
       
   189       case (_, NULL) => (r1s, F_LEFT(f1s))
       
   190       case _ => if (r1s == r2s) (r1s, F_LEFT(f1s))
       
   191                 else (ALT (r1s, r2s), F_ALT(f1s, f2s)) 
       
   192     }
       
   193   }
       
   194   case SEQ(r1, r2) => {
       
   195     val (r1s, f1s) = simp(r1)
       
   196     val (r2s, f2s) = simp(r2)
       
   197     (r1s, r2s) match {
       
   198       case (NULL, _) => (NULL, F_ERROR)
       
   199       case (_, NULL) => (NULL, F_ERROR)
       
   200       case (EMPTY, _) => (r2s, F_SEQ_Empty1(f1s, f2s))
       
   201       case (_, EMPTY) => (r1s, F_SEQ_Empty2(f1s, f2s))
       
   202       case _ => (SEQ(r1s,r2s), F_SEQ(f1s, f2s))
       
   203     }
       
   204   }
       
   205   case RECD(x, r1) => {
       
   206     val (r1s, f1s) = simp(r1)
       
   207     (RECD(x, r1s), F_RECD(f1s))
       
   208   }
       
   209   case r => (r, F_ID)
       
   210 }
       
   211 
       
   212 def lex_simp(r: Rexp, s: List[Char]) : Val = s match {
       
   213   case Nil => if (nullable(r)) mkeps(r) else throw new Exception("Not matched")
       
   214   case c::cs => {
       
   215     val (r_simp, f_simp) = simp(der(c, r))
       
   216     inj(r, c, f_simp(lex_simp(r_simp, cs)))
       
   217   }
       
   218 }
       
   219 
       
   220 def lexing_simp(r: Rexp, s: String) : Val = lex_simp(r, s.toList)
       
   221 
       
   222 lexing_simp(("a" | "ab") ~ ("b" | ""), "ab")
       
   223 
       
   224 lexing_simp(OPT("ab"), "ab")
       
   225 
       
   226 // Lexing Rules for a Small While Language
       
   227 
       
   228 val SYM = CRANGE("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
       
   229 val DIGIT = CRANGE("0123456789")
       
   230 val ID = SYM ~ (SYM | DIGIT).% 
       
   231 val NUM = PLUS(DIGIT)
       
   232 val KEYWORD : Rexp = "skip" | "while" | "do" | "if" | "then" | "else" | "read" | "write" | "true" | "false"
       
   233 val SEMI: Rexp = ";"
       
   234 val OP: Rexp = ":=" | "==" | "-" | "+" | "*" | "!=" | "<" | ">" | "<=" | ">=" | "%" | "/"
       
   235 val WHITESPACE = PLUS(" " | "\n" | "\t")
       
   236 val RPAREN: Rexp = ")"
       
   237 val LPAREN: Rexp = "("
       
   238 val BEGIN: Rexp = "{"
       
   239 val END: Rexp = "}"
       
   240 val STRING: Rexp = "\"" ~ SYM.% ~ "\""
       
   241 
       
   242 
       
   243 val WHILE_REGS = (("k" $ KEYWORD) | 
       
   244                   ("i" $ ID) | 
       
   245                   ("o" $ OP) | 
       
   246                   ("n" $ NUM) | 
       
   247                   ("s" $ SEMI) | 
       
   248                   ("str" $ STRING) |
       
   249                   ("p" $ (LPAREN | RPAREN)) | 
       
   250                   ("b" $ (BEGIN | END)) | 
       
   251                   ("w" $ WHITESPACE)).%
       
   252 
       
   253 // filters out all white spaces
       
   254 def tokenise(r: Rexp, s: String) = 
       
   255   env(lexing_simp(r, s)).filterNot { (s) => s._1 == "w"}
       
   256 
       
   257 //   Testing
       
   258 //============
       
   259 
       
   260 def time[T](code: => T) = {
       
   261   val start = System.nanoTime()
       
   262   val result = code
       
   263   val end = System.nanoTime()
       
   264   println((end - start)/1.0e9)
       
   265   result
       
   266 }
       
   267 
       
   268 println(lexing_simp(OPT("1"), "1"))
       
   269 println(lexing_simp(OPT("1"), ""))
       
   270 println(ders("111".toList, NTIMES("1",3)))
       
   271 println(lexing_simp(NTIMES("1",3), "111"))
       
   272 
       
   273 val r1 = ("a" | "ab") ~ ("bcd" | "c")
       
   274 println(lexing(r1, "abcd"))
       
   275 
       
   276 val r2 = ("" | "a") ~ ("ab" | "b")
       
   277 println(lexing(r2, "ab"))
       
   278 
       
   279 
       
   280 // Two Simple While Tests
       
   281 //========================
       
   282 println("prog0 test")
       
   283 
       
   284 val prog0 = """read n"""
       
   285 println(env(lexing_simp(WHILE_REGS, prog0)))
       
   286 
       
   287 println("prog1 test")
       
   288 
       
   289 val prog1 = """read  n; write (n)"""
       
   290 println(env(lexing_simp(WHILE_REGS, prog1)))
       
   291 
       
   292 
       
   293 // Big Test
       
   294 //==========
       
   295 
       
   296 def escape(raw: String): String = {
       
   297   import scala.reflect.runtime.universe._
       
   298   Literal(Constant(raw)).toString
       
   299 }
       
   300 
       
   301 val prog2 = """
       
   302 write "Fib";
       
   303 read n;
       
   304 minus1 := 0;
       
   305 minus2 := 1;
       
   306 while n > 0 do {
       
   307   temp := minus2;
       
   308   minus2 := minus1 + minus2;
       
   309   minus1 := temp;
       
   310   n := n - 1
       
   311 };
       
   312 write "Result";
       
   313 write minus2
       
   314 """
       
   315 
       
   316 val prog3 = """
       
   317 start := 1000;
       
   318 x := start;
       
   319 y := start;
       
   320 z := start;
       
   321 while 0 < x do {
       
   322  while 0 < y do {
       
   323   while 0 < z do {
       
   324     z := z - 1
       
   325   };
       
   326   z := start;
       
   327   y := y - 1
       
   328  };     
       
   329  y := start;
       
   330  x := x - 1
       
   331 }
       
   332 """
       
   333 
       
   334 
       
   335 println("Tokens")
       
   336 println(tokenise(WHILE_REGS, prog2).mkString("\n"))
       
   337 
       
   338 val fib_tokens = tokenise(WHILE_REGS, prog2)
       
   339 fib_tokens.map{case (s1, s2) => (escape(s1), escape(s2))}.mkString(",\n")
       
   340 
       
   341 
       
   342 val test_tokens = tokenise(WHILE_REGS, prog3)
       
   343 test_tokens.map{case (s1, s2) => (escape(s1), escape(s2))}.mkString(",\n")
       
   344 
       
   345 
       
   346 /*
       
   347 for (i <- 1 to 120 by 10) {
       
   348   print(i.toString + ":  ")
       
   349   time(lexing_simp(WHILE_REGS, prog2 * i))
       
   350 }
       
   351 */
       
   352 
       
   353 val toks_fib = 
       
   354   List(("k","write"),
       
   355        ("str","Fib"),
       
   356        ("s",";"),
       
   357        ("k","read"),
       
   358        ("i","n"),
       
   359        ("s",";"),
       
   360        ("i","minus1"),
       
   361        ("o",":="),
       
   362        ("n","0"),
       
   363        ("s",";"),
       
   364        ("i","minus2"),
       
   365        ("o",":="),
       
   366        ("n","1"),
       
   367        ("s",";"),
       
   368        ("k","while"),
       
   369        ("i","n"),
       
   370        ("o",">"),
       
   371        ("n","0"),
       
   372        ("k","do"),
       
   373        ("b","{"),
       
   374        ("i","temp"),
       
   375        ("o",":="),
       
   376        ("i","minus2"),
       
   377        ("s",";"),
       
   378        ("i","minus2"),
       
   379        ("o",":="),
       
   380        ("i","minus1"),
       
   381        ("o","+"),
       
   382        ("i","minus2"),
       
   383        ("s",";"),
       
   384        ("i","minus1"),
       
   385        ("o",":="),
       
   386        ("i","temp"),
       
   387        ("s",";"),
       
   388        ("i","n"),
       
   389        ("o",":="),
       
   390        ("i","n"),
       
   391        ("o","-"),
       
   392        ("n","1"),
       
   393        ("b","}"),
       
   394        ("s",";"),
       
   395        ("k","write"),
       
   396        ("str","Result"),
       
   397        ("s",";"),
       
   398        ("k","write"),
       
   399        ("i","minus2"))
       
   400 
       
   401 val toks_test =
       
   402   List(("i","start"),
       
   403        ("o",":="),
       
   404        ("n","1000"),
       
   405        ("s",";"),
       
   406        ("i","x"),
       
   407        ("o",":="),
       
   408        ("i","start"),
       
   409        ("s",";"),
       
   410        ("i","y"),
       
   411        ("o",":="),
       
   412        ("i","start"),
       
   413        ("s",";"),
       
   414        ("i","z"),
       
   415        ("o",":="),
       
   416        ("i","start"),
       
   417        ("s",";"),
       
   418        ("k","while"),
       
   419        ("n","0"),
       
   420        ("o","<"),
       
   421        ("i","x"),
       
   422        ("k","do"),
       
   423        ("b","{"),
       
   424        ("k","while"),
       
   425        ("n","0"),
       
   426        ("o","<"),
       
   427        ("i","y"),
       
   428        ("k","do"),
       
   429        ("b","{"),
       
   430        ("k","while"),
       
   431        ("n","0"),
       
   432        ("o","<"),
       
   433        ("i","z"),
       
   434        ("k","do"),
       
   435        ("b","{"),
       
   436        ("i","z"),
       
   437        ("o",":="),
       
   438        ("i","z"),
       
   439        ("o","-"),
       
   440        ("n","1"),
       
   441        ("b","}"),
       
   442        ("s",";"),
       
   443        ("i","z"),
       
   444        ("o",":="),
       
   445        ("i","start"),
       
   446        ("s",";"),
       
   447        ("i","y"),
       
   448        ("o",":="),
       
   449        ("i","y"),
       
   450        ("o","-"),
       
   451        ("n","1"),
       
   452        ("b","}"),
       
   453        ("s",";"),
       
   454        ("i","y"),
       
   455        ("o",":="),
       
   456        ("i","start"),
       
   457        ("s",";"),
       
   458        ("i","x"),
       
   459        ("o",":="),
       
   460        ("i","x"),
       
   461        ("o","-"),
       
   462        ("n","1"),
       
   463        ("b","}"))