thys2/blexer2.sc
changeset 431 47c75ba5ad28
child 432 994403dbbed5
equal deleted inserted replaced
430:579caa608a15 431:47c75ba5ad28
       
     1 // A simple lexer inspired by work of Sulzmann & Lu
       
     2 //==================================================
       
     3 //
       
     4 // Call the test cases with 
       
     5 //
       
     6 //   amm lexer.sc small
       
     7 //   amm lexer.sc fib
       
     8 //   amm lexer.sc loops
       
     9 //   amm lexer.sc email
       
    10 //
       
    11 //   amm lexer.sc all
       
    12 
       
    13 
       
    14 // regular expressions including records
       
    15 abstract class Rexp 
       
    16 case object ZERO extends Rexp
       
    17 case object ONE extends Rexp
       
    18 case object ANYCHAR extends Rexp
       
    19 case class CHAR(c: Char) extends Rexp
       
    20 case class ALTS(r1: Rexp, r2: Rexp) extends Rexp 
       
    21 case class SEQ(r1: Rexp, r2: Rexp) extends Rexp 
       
    22 case class STAR(r: Rexp) extends Rexp 
       
    23 case class RECD(x: String, r: Rexp) extends Rexp  
       
    24 case class NTIMES(n: Int, r: Rexp) extends Rexp
       
    25 case class OPTIONAL(r: Rexp) extends Rexp
       
    26 case class NOT(r: Rexp) extends Rexp
       
    27                 // records for extracting strings or tokens
       
    28   
       
    29 // values  
       
    30 abstract class Val
       
    31 case object Empty extends Val
       
    32 case class Chr(c: Char) extends Val
       
    33 case class Sequ(v1: Val, v2: Val) extends Val
       
    34 case class Left(v: Val) extends Val
       
    35 case class Right(v: Val) extends Val
       
    36 case class Stars(vs: List[Val]) extends Val
       
    37 case class Rec(x: String, v: Val) extends Val
       
    38 case class Ntime(vs: List[Val]) extends Val
       
    39 case class Optionall(v: Val) extends Val
       
    40 case class Nots(s: String) extends Val
       
    41 
       
    42 
       
    43 
       
    44 abstract class Bit
       
    45 case object Z extends Bit
       
    46 case object S extends Bit
       
    47 
       
    48 
       
    49 type Bits = List[Bit]
       
    50 
       
    51 abstract class ARexp 
       
    52 case object AZERO extends ARexp
       
    53 case class AONE(bs: Bits) extends ARexp
       
    54 case class ACHAR(bs: Bits, c: Char) extends ARexp
       
    55 case class AALTS(bs: Bits, rs: List[ARexp]) extends ARexp 
       
    56 case class ASEQ(bs: Bits, r1: ARexp, r2: ARexp) extends ARexp 
       
    57 case class ASTAR(bs: Bits, r: ARexp) extends ARexp 
       
    58 case class ANOT(bs: Bits, r: ARexp) extends ARexp
       
    59 case class AANYCHAR(bs: Bits) extends ARexp
       
    60 
       
    61 
       
    62    
       
    63 // some convenience for typing in regular expressions
       
    64 
       
    65 def charlist2rexp(s : List[Char]): Rexp = s match {
       
    66   case Nil => ONE
       
    67   case c::Nil => CHAR(c)
       
    68   case c::s => SEQ(CHAR(c), charlist2rexp(s))
       
    69 }
       
    70 implicit def string2rexp(s : String) : Rexp = 
       
    71   charlist2rexp(s.toList)
       
    72 
       
    73 implicit def RexpOps(r: Rexp) = new {
       
    74   def | (s: Rexp) = ALTS(r, s)
       
    75   def % = STAR(r)
       
    76   def ~ (s: Rexp) = SEQ(r, s)
       
    77 }
       
    78 
       
    79 implicit def stringOps(s: String) = new {
       
    80   def | (r: Rexp) = ALTS(s, r)
       
    81   def | (r: String) = ALTS(s, r)
       
    82   def % = STAR(s)
       
    83   def ~ (r: Rexp) = SEQ(s, r)
       
    84   def ~ (r: String) = SEQ(s, r)
       
    85   def $ (r: Rexp) = RECD(s, r)
       
    86 }
       
    87 
       
    88 def nullable(r: Rexp) : Boolean = r match {
       
    89   case ZERO => false
       
    90   case ONE => true
       
    91   case CHAR(_) => false
       
    92   case ANYCHAR => false
       
    93   case ALTS(r1, r2) => nullable(r1) || nullable(r2)
       
    94   case SEQ(r1, r2) => nullable(r1) && nullable(r2)
       
    95   case STAR(_) => true
       
    96   case RECD(_, r1) => nullable(r1)
       
    97   case NTIMES(n, r) => if (n == 0) true else nullable(r)
       
    98   case OPTIONAL(r) => true
       
    99   case NOT(r) => !nullable(r)
       
   100 }
       
   101 
       
   102 def der(c: Char, r: Rexp) : Rexp = r match {
       
   103   case ZERO => ZERO
       
   104   case ONE => ZERO
       
   105   case CHAR(d) => if (c == d) ONE else ZERO
       
   106   case ANYCHAR => ONE 
       
   107   case ALTS(r1, r2) => ALTS(der(c, r1), der(c, r2))
       
   108   case SEQ(r1, r2) => 
       
   109     if (nullable(r1)) ALTS(SEQ(der(c, r1), r2), der(c, r2))
       
   110     else SEQ(der(c, r1), r2)
       
   111   case STAR(r) => SEQ(der(c, r), STAR(r))
       
   112   case RECD(_, r1) => der(c, r1)
       
   113   case NTIMES(n, r) => if(n > 0) SEQ(der(c, r), NTIMES(n - 1, r)) else ZERO
       
   114   case OPTIONAL(r) => der(c, r)
       
   115   case NOT(r) =>  NOT(der(c, r))
       
   116 }
       
   117 
       
   118 
       
   119 // extracts a string from a value
       
   120 def flatten(v: Val) : String = v match {
       
   121   case Empty => ""
       
   122   case Chr(c) => c.toString
       
   123   case Left(v) => flatten(v)
       
   124   case Right(v) => flatten(v)
       
   125   case Sequ(v1, v2) => flatten(v1) ++ flatten(v2)
       
   126   case Stars(vs) => vs.map(flatten).mkString
       
   127   case Ntime(vs) => vs.map(flatten).mkString
       
   128   case Optionall(v) => flatten(v)
       
   129   case Rec(_, v) => flatten(v)
       
   130 }
       
   131 
       
   132 
       
   133 // extracts an environment from a value;
       
   134 // used for tokenising a string
       
   135 def env(v: Val) : List[(String, String)] = v match {
       
   136   case Empty => Nil
       
   137   case Chr(c) => Nil
       
   138   case Left(v) => env(v)
       
   139   case Right(v) => env(v)
       
   140   case Sequ(v1, v2) => env(v1) ::: env(v2)
       
   141   case Stars(vs) => vs.flatMap(env)
       
   142   case Ntime(vs) => vs.flatMap(env)
       
   143   case Rec(x, v) => (x, flatten(v))::env(v)
       
   144   case Optionall(v) => env(v)
       
   145   case Nots(s) => ("Negative", s) :: Nil
       
   146 }
       
   147 
       
   148 
       
   149 // The injection and mkeps part of the lexer
       
   150 //===========================================
       
   151 
       
   152 def mkeps(r: Rexp) : Val = r match {
       
   153   case ONE => Empty
       
   154   case ALTS(r1, r2) => 
       
   155     if (nullable(r1)) Left(mkeps(r1)) else Right(mkeps(r2))
       
   156   case SEQ(r1, r2) => Sequ(mkeps(r1), mkeps(r2))
       
   157   case STAR(r) => Stars(Nil)
       
   158   case RECD(x, r) => Rec(x, mkeps(r))
       
   159   case NTIMES(n, r) => Ntime(List.fill(n)(mkeps(r)))
       
   160   case OPTIONAL(r) => Optionall(Empty)
       
   161   case NOT(rInner) => if(nullable(rInner)) throw new Exception("error")  
       
   162                          else Nots("")//Nots(s.reverse.toString)
       
   163 //   case NOT(ZERO) => Empty
       
   164 //   case NOT(CHAR(c)) => Empty
       
   165 //   case NOT(SEQ(r1, r2)) => Sequ(mkeps(NOT(r1)), mkeps(NOT(r2)))
       
   166 //   case NOT(ALTS(r1, r2)) => if(!nullable(r1)) Left(mkeps(NOT(r1))) else Right(mkeps(NOT(r2)))
       
   167 //   case NOT(STAR(r)) => Stars(Nil) 
       
   168 
       
   169 }
       
   170 
       
   171 def inj(r: Rexp, c: Char, v: Val) : Val = (r, v) match {
       
   172   case (STAR(r), Sequ(v1, Stars(vs))) => Stars(inj(r, c, v1)::vs)
       
   173   case (SEQ(r1, r2), Sequ(v1, v2)) => Sequ(inj(r1, c, v1), v2)
       
   174   case (SEQ(r1, r2), Left(Sequ(v1, v2))) => Sequ(inj(r1, c, v1), v2)
       
   175   case (SEQ(r1, r2), Right(v2)) => Sequ(mkeps(r1), inj(r2, c, v2))
       
   176   case (ALTS(r1, r2), Left(v1)) => Left(inj(r1, c, v1))
       
   177   case (ALTS(r1, r2), Right(v2)) => Right(inj(r2, c, v2))
       
   178   case (CHAR(d), Empty) => Chr(c) 
       
   179   case (RECD(x, r1), _) => Rec(x, inj(r1, c, v))
       
   180   case (NTIMES(n, r), Sequ(v1, Ntime(vs))) => Ntime(inj(r, c, v1)::vs)
       
   181   case (OPTIONAL(r), v) => Optionall(inj(r, c, v))
       
   182   case (NOT(r), Nots(s)) => Nots(c.toString ++ s)
       
   183   case (ANYCHAR, Empty) => Chr(c)
       
   184 }
       
   185 
       
   186 // some "rectification" functions for simplification
       
   187 def F_ID(v: Val): Val = v
       
   188 def F_RIGHT(f: Val => Val) = (v:Val) => Right(f(v))
       
   189 def F_LEFT(f: Val => Val) = (v:Val) => Left(f(v))
       
   190 def F_ALT(f1: Val => Val, f2: Val => Val) = (v:Val) => v match {
       
   191   case Right(v) => Right(f2(v))
       
   192   case Left(v) => Left(f1(v))
       
   193 }
       
   194 def F_SEQ(f1: Val => Val, f2: Val => Val) = (v:Val) => v match {
       
   195   case Sequ(v1, v2) => Sequ(f1(v1), f2(v2))
       
   196 }
       
   197 def F_SEQ_Empty1(f1: Val => Val, f2: Val => Val) = 
       
   198   (v:Val) => Sequ(f1(Empty), f2(v))
       
   199 def F_SEQ_Empty2(f1: Val => Val, f2: Val => Val) = 
       
   200   (v:Val) => Sequ(f1(v), f2(Empty))
       
   201 
       
   202 def F_ERROR(v: Val): Val = throw new Exception("error")
       
   203 
       
   204 // simplification
       
   205 def simp(r: Rexp): (Rexp, Val => Val) = r match {
       
   206   case ALTS(r1, r2) => {
       
   207     val (r1s, f1s) = simp(r1)
       
   208     val (r2s, f2s) = simp(r2)
       
   209     (r1s, r2s) match {
       
   210       case (ZERO, _) => (r2s, F_RIGHT(f2s))
       
   211       case (_, ZERO) => (r1s, F_LEFT(f1s))
       
   212       case _ => if (r1s == r2s) (r1s, F_LEFT(f1s))
       
   213                 else (ALTS (r1s, r2s), F_ALT(f1s, f2s)) 
       
   214     }
       
   215   }
       
   216   case SEQ(r1, r2) => {
       
   217     val (r1s, f1s) = simp(r1)
       
   218     val (r2s, f2s) = simp(r2)
       
   219     (r1s, r2s) match {
       
   220       case (ZERO, _) => (ZERO, F_ERROR)
       
   221       case (_, ZERO) => (ZERO, F_ERROR)
       
   222       case (ONE, _) => (r2s, F_SEQ_Empty1(f1s, f2s))
       
   223       case (_, ONE) => (r1s, F_SEQ_Empty2(f1s, f2s))
       
   224       case _ => (SEQ(r1s,r2s), F_SEQ(f1s, f2s))
       
   225     }
       
   226   }
       
   227   case r => (r, F_ID)
       
   228 }
       
   229 
       
   230 // lexing functions including simplification
       
   231 def lex_simp(r: Rexp, s: List[Char]) : Val = s match {
       
   232   case Nil => if (nullable(r)) mkeps(r) else 
       
   233     { throw new Exception(s"lexing error $r not nullable") } 
       
   234   case c::cs => {
       
   235     val (r_simp, f_simp) = simp(der(c, r))
       
   236     inj(r, c, f_simp(lex_simp(r_simp, cs)))
       
   237   }
       
   238 }
       
   239 
       
   240 def lexing_simp(r: Rexp, s: String) = 
       
   241   env(lex_simp(r, s.toList))
       
   242 
       
   243 // The Lexing Rules for the WHILE Language
       
   244 
       
   245 def PLUS(r: Rexp) = r ~ r.%
       
   246 
       
   247 def Range(s : List[Char]) : Rexp = s match {
       
   248   case Nil => ZERO
       
   249   case c::Nil => CHAR(c)
       
   250   case c::s => ALTS(CHAR(c), Range(s))
       
   251 }
       
   252 def RANGE(s: String) = Range(s.toList)
       
   253 
       
   254 val SYM = RANGE("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_")
       
   255 val DIGIT = RANGE("0123456789")
       
   256 val ID = SYM ~ (SYM | DIGIT).% 
       
   257 val NUM = PLUS(DIGIT)
       
   258 val KEYWORD : Rexp = "skip" | "while" | "do" | "if" | "then" | "else" | "read" | "write" 
       
   259 val SEMI: Rexp = ";"
       
   260 val OP: Rexp = ":=" | "=" | "-" | "+" | "*" | "!=" | "<" | ">"
       
   261 val WHITESPACE = PLUS(" " | "\n" | "\t" | "\r")
       
   262 val RPAREN: Rexp = "{"
       
   263 val LPAREN: Rexp = "}"
       
   264 val STRING: Rexp = "\"" ~ SYM.% ~ "\""
       
   265 
       
   266 
       
   267 //ab \ a --> 1b
       
   268 //
       
   269 val WHILE_REGS = (("k" $ KEYWORD) | 
       
   270                   ("i" $ ID) | 
       
   271                   ("o" $ OP) | 
       
   272                   ("n" $ NUM) | 
       
   273                   ("s" $ SEMI) | 
       
   274                   ("str" $ STRING) |
       
   275                   ("p" $ (LPAREN | RPAREN)) | 
       
   276                   ("w" $ WHITESPACE)).%
       
   277 
       
   278 val NREGS = NTIMES(5, OPTIONAL(SYM))
       
   279 val NREGS1 = ("test" $ NREGS)
       
   280 // Two Simple While Tests
       
   281 //========================
       
   282 val NOTREG = "hehe" ~ NOT((ANYCHAR.%) ~ "haha" ~ (ANYCHAR.%))
       
   283 
       
   284 
       
   285   // bnullable function: tests whether the aregular 
       
   286   // expression can recognise the empty string
       
   287 def bnullable (r: ARexp) : Boolean = r match {
       
   288     case AZERO => false
       
   289     case AONE(_) => true
       
   290     case ACHAR(_,_) => false
       
   291     case AALTS(_, rs) => rs.exists(bnullable)
       
   292     case ASEQ(_, r1, r2) => bnullable(r1) && bnullable(r2)
       
   293     case ASTAR(_, _) => true
       
   294     case ANOT(_, rn) => !bnullable(rn)
       
   295   }
       
   296 
       
   297 def mkepsBC(r: ARexp) : Bits = r match {
       
   298     case AONE(bs) => bs
       
   299     case AALTS(bs, rs) => {
       
   300       val n = rs.indexWhere(bnullable)
       
   301       bs ++ mkepsBC(rs(n))
       
   302     }
       
   303     case ASEQ(bs, r1, r2) => bs ++ mkepsBC(r1) ++ mkepsBC(r2)
       
   304     case ASTAR(bs, r) => bs ++ List(Z)
       
   305     case ANOT(bs, rn) => bs
       
   306   }
       
   307 
       
   308 
       
   309 def bder(c: Char, r: ARexp) : ARexp = r match {
       
   310     case AZERO => AZERO
       
   311     case AONE(_) => AZERO
       
   312     case ACHAR(bs, f) => if (c == f) AONE(bs) else AZERO
       
   313     case AALTS(bs, rs) => AALTS(bs, rs.map(bder(c, _)))
       
   314     case ASEQ(bs, r1, r2) => 
       
   315       if (bnullable(r1)) AALTS(bs, ASEQ(Nil, bder(c, r1), r2) :: fuse(mkepsBC(r1), bder(c, r2)) :: Nil )
       
   316       else ASEQ(bs, bder(c, r1), r2)
       
   317     case ASTAR(bs, r) => ASEQ(bs, fuse(List(S), bder(c, r)), ASTAR(Nil, r))
       
   318     case ANOT(bs, rn) => ANOT(bs, bder(c, rn))
       
   319     case AANYCHAR(bs) => AONE(bs)
       
   320   } 
       
   321 
       
   322 def fuse(bs: Bits, r: ARexp) : ARexp = r match {
       
   323     case AZERO => AZERO
       
   324     case AONE(cs) => AONE(bs ++ cs)
       
   325     case ACHAR(cs, f) => ACHAR(bs ++ cs, f)
       
   326     case AALTS(cs, rs) => AALTS(bs ++ cs, rs)
       
   327     case ASEQ(cs, r1, r2) => ASEQ(bs ++ cs, r1, r2)
       
   328     case ASTAR(cs, r) => ASTAR(bs ++ cs, r)
       
   329     case ANOT(cs, r) => ANOT(bs ++ cs, r)
       
   330   }
       
   331 
       
   332 
       
   333 def internalise(r: Rexp) : ARexp = r match {
       
   334     case ZERO => AZERO
       
   335     case ONE => AONE(Nil)
       
   336     case CHAR(c) => ACHAR(Nil, c)
       
   337     //case PRED(f) => APRED(Nil, f)
       
   338     case ALTS(r1, r2) => 
       
   339       AALTS(Nil, List(fuse(List(Z), internalise(r1)), fuse(List(S), internalise(r2))))
       
   340     // case ALTS(r1::rs) => {
       
   341     //   val AALTS(Nil, rs2) = internalise(ALTS(rs))
       
   342     //   AALTS(Nil, fuse(List(Z), internalise(r1)) :: rs2.map(fuse(List(S), _)))
       
   343     // }
       
   344     case SEQ(r1, r2) => ASEQ(Nil, internalise(r1), internalise(r2))
       
   345     case STAR(r) => ASTAR(Nil, internalise(r))
       
   346     case RECD(x, r) => internalise(r)
       
   347     case NOT(r) => ANOT(Nil, internalise(r))
       
   348     case ANYCHAR => AANYCHAR(Nil)
       
   349   }
       
   350 
       
   351 
       
   352 def bsimp(r: ARexp): ARexp = 
       
   353   {
       
   354     r match {
       
   355       case ASEQ(bs1, r1, r2) => (bsimp(r1), bsimp(r2)) match {
       
   356           case (AZERO, _) => AZERO
       
   357           case (_, AZERO) => AZERO
       
   358           case (AONE(bs2), r2s) => fuse(bs1 ++ bs2, r2s)
       
   359           case (r1s, r2s) => ASEQ(bs1, r1s, r2s)
       
   360       }
       
   361       case AALTS(bs1, rs) => {
       
   362             val rs_simp = rs.map(bsimp(_))
       
   363             val flat_res = flats(rs_simp)
       
   364             val dist_res = distinctBy(flat_res, erase)//strongDB(flat_res)//distinctBy(flat_res, erase)
       
   365             dist_res match {
       
   366               case Nil => AZERO
       
   367               case s :: Nil => fuse(bs1, s)
       
   368               case rs => AALTS(bs1, rs)  
       
   369             }
       
   370           
       
   371       }
       
   372       case r => r
       
   373     }
       
   374   }
       
   375   def strongBsimp(r: ARexp): ARexp =
       
   376   {
       
   377     r match {
       
   378       case ASEQ(bs1, r1, r2) => (strongBsimp(r1), strongBsimp(r2)) match {
       
   379           case (AZERO, _) => AZERO
       
   380           case (_, AZERO) => AZERO
       
   381           case (AONE(bs2), r2s) => fuse(bs1 ++ bs2, r2s)
       
   382           case (r1s, r2s) => ASEQ(bs1, r1s, r2s)
       
   383       }
       
   384       case AALTS(bs1, rs) => {
       
   385             val rs_simp = rs.map(strongBsimp(_))
       
   386             val flat_res = flats(rs_simp)
       
   387             val dist_res = distinctBy4(flat_res)//distinctBy(flat_res, erase)
       
   388             dist_res match {
       
   389               case Nil => AZERO
       
   390               case s :: Nil => fuse(bs1, s)
       
   391               case rs => AALTS(bs1, rs)  
       
   392             }
       
   393           
       
   394       }
       
   395       case r => r
       
   396     }
       
   397   }
       
   398 
       
   399   def bders (s: List[Char], r: ARexp) : ARexp = s match {
       
   400     case Nil => r
       
   401     case c::s => bders(s, bder(c, r))
       
   402   }
       
   403 
       
   404   def flats(rs: List[ARexp]): List[ARexp] = rs match {
       
   405       case Nil => Nil
       
   406       case AZERO :: rs1 => flats(rs1)
       
   407       case AALTS(bs, rs1) :: rs2 => rs1.map(fuse(bs, _)) ::: flats(rs2)
       
   408       case r1 :: rs2 => r1 :: flats(rs2)
       
   409     }
       
   410 
       
   411   def distinctBy[B, C](xs: List[B], f: B => C, acc: List[C] = Nil): List[B] = xs match {
       
   412     case Nil => Nil
       
   413     case (x::xs) => {
       
   414       val res = f(x)
       
   415       if (acc.contains(res)) distinctBy(xs, f, acc)  
       
   416       else x::distinctBy(xs, f, res::acc)
       
   417     }
       
   418   } 
       
   419 
       
   420   def projectFirstChild(rs: List[ARexp]) : List[ARexp] = rs match {
       
   421     case (ASEQ(bs, r1, r2p)::rs1) => r1::projectFirstChild(rs1)
       
   422     case Nil => Nil
       
   423   }
       
   424 
       
   425 
       
   426   def distinctBy2(xs: List[ARexp], acc: List[Rexp] = Nil): List[ARexp] = xs match {
       
   427     case Nil => Nil
       
   428     case (x::xs) => {
       
   429       val res = erase(x)
       
   430       if(acc.contains(res))
       
   431         distinctBy2(xs, acc)
       
   432       else
       
   433         x match {
       
   434           case ASEQ(bs0, AALTS(bs1, rs), r2) => 
       
   435             val newTerms =  distinctBy2(rs.map(r1 => ASEQ(Nil, r1, r2)), acc)
       
   436             val rsPrime = projectFirstChild(newTerms)
       
   437             newTerms match {
       
   438               case Nil => distinctBy2(xs, acc)
       
   439               case t::Nil => ASEQ(bs0, fuse(bs1, rsPrime.head), r2)::distinctBy2(xs, erase(t)::acc)
       
   440               case ts => ASEQ(bs0, AALTS(bs1, rsPrime), r2)::distinctBy2(xs, newTerms.map(erase(_)):::acc)
       
   441             }
       
   442           case x => x::distinctBy2(xs, res::acc)
       
   443         }
       
   444     }
       
   445   }
       
   446 
       
   447   def deepFlts(r: ARexp): List[ARexp] = r match{
       
   448 
       
   449     case ASEQ(bs, r1, r2) => deepFlts(r1).map(r1p => ASEQ(bs, r1p, r2))
       
   450     case ASTAR(bs, r0) => List(r)
       
   451     case ACHAR(_, _) => List(r)
       
   452     case AALTS(bs, rs) => rs.flatMap(deepFlts(_))//throw new Error("doubly nested alts in bsimp r")
       
   453   }
       
   454 
       
   455 
       
   456   def distinctBy3(xs: List[ARexp], acc: List[Rexp] = Nil): List[ARexp] = xs match {
       
   457     case Nil => Nil
       
   458     case (x::xs) => {
       
   459       val res = erase(x)
       
   460       if(acc.contains(res))
       
   461         distinctBy3(xs, acc)
       
   462       else
       
   463         x match {
       
   464           case ASEQ(bs0, AALTS(bs1, rs), r2) => 
       
   465             val newTerms =  distinctBy3(rs.flatMap(deepFlts(_)).map(r1 => ASEQ(Nil, r1, r2)), acc)//deepFlts(rs.flatMap(r1 => ASEQ(Nil, r1, r2)), acc)
       
   466             val rsPrime = projectFirstChild(newTerms)
       
   467             newTerms match {
       
   468               case Nil => distinctBy3(xs, acc)
       
   469               case t::Nil => ASEQ(bs0, fuse(bs1, rsPrime.head), r2)::distinctBy3(xs, erase(t)::acc)
       
   470               case ts => ASEQ(bs0, AALTS(bs1, rsPrime), r2)::distinctBy3(xs, newTerms.map(erase(_)):::acc)
       
   471             }
       
   472           case x => x::distinctBy3(xs, res::acc)
       
   473         }
       
   474     }
       
   475   }
       
   476 
       
   477   def pruneRexp(r: ARexp, allowableTerms: List[Rexp]) : ARexp = {
       
   478     r match {
       
   479       case ASEQ(bs, r1, r2) => 
       
   480         val termsTruncated = allowableTerms.collect(rt => rt match {
       
   481           case SEQ(r1p, r2p) if(r2p == erase(r2)) => r1p
       
   482         })
       
   483         val pruned : ARexp = pruneRexp(r1, termsTruncated)
       
   484         pruned match {
       
   485           case AZERO => AZERO
       
   486           case AONE(bs1) => fuse(bs ++ bs1, r2)
       
   487           case pruned1 => ASEQ(bs, pruned1, r2)
       
   488         }
       
   489 
       
   490       case AALTS(bs, rs) => 
       
   491         //allowableTerms.foreach(a => println(shortRexpOutput(a)))        
       
   492         val rsp = rs.map(r => pruneRexp(r, allowableTerms)).filter(r => r != AZERO)
       
   493         rsp match {
       
   494           case Nil => AZERO
       
   495           case r1::Nil => fuse(bs, r1)
       
   496           case rs1 => AALTS(bs, rs1)
       
   497         }
       
   498       case r => 
       
   499         if(allowableTerms.contains(erase(r))) r else AZERO //assert(r != AZERO)
       
   500     }
       
   501   }
       
   502 
       
   503   def oneSimp(r: Rexp) : Rexp = r match {
       
   504     case SEQ(ONE, r) => r
       
   505     case SEQ(r1, r2) => SEQ(oneSimp(r1), r2)
       
   506     case r => r//assert r != 0 
       
   507      
       
   508   }
       
   509 
       
   510 
       
   511   def distinctBy4(xs: List[ARexp], acc: List[Rexp] = Nil) : List[ARexp] = xs match {
       
   512     case Nil => Nil
       
   513     case x :: xs =>
       
   514       val erased = erase(x)
       
   515       if(acc.contains(erased))
       
   516         distinctBy4(xs, acc)
       
   517       else{
       
   518         val addToAcc =  breakIntoTerms(erased).filter(r => !acc.contains(oneSimp(r)))
       
   519         val xp = pruneRexp(x, addToAcc)
       
   520         xp match {
       
   521           case AZERO => distinctBy4(xs, addToAcc.map(oneSimp(_)) ::: acc)
       
   522           case xPrime => xPrime :: distinctBy4(xs, addToAcc.map(oneSimp(_)) ::: acc)
       
   523         }
       
   524         
       
   525       }
       
   526   }
       
   527 
       
   528 
       
   529   def breakIntoTerms(r: Rexp) : List[Rexp] = r match {
       
   530     case SEQ(r1, r2)  => breakIntoTerms(r1).map(r11 => SEQ(r11, r2))
       
   531       // breakIntoTerms(r1).map(r11 => r11 match {
       
   532       //   case ONE => r2
       
   533       //   case r => SEQ(r11, r2)
       
   534       // }
       
   535       //)
       
   536     case ALTS(r1, r2) => breakIntoTerms(r1) ::: breakIntoTerms(r2)
       
   537     case _ => r::Nil
       
   538   } 
       
   539 
       
   540   def prettyRexp(r: Rexp) : String = r match {
       
   541       case STAR(r0) => s"${prettyRexp(r0)}*"
       
   542       case SEQ(CHAR(c), r2) => c.toString ++ prettyRexp(r2)
       
   543       case SEQ(r1, r2) => s"${prettyRexp(r1)}~${prettyRexp(r2)}"
       
   544       case CHAR(c) => c.toString
       
   545       case ANYCHAR => "."
       
   546     //   case NOT(r0) => s
       
   547   }
       
   548 
       
   549   def decode_aux(r: Rexp, bs: Bits) : (Val, Bits) = (r, bs) match {
       
   550     case (ONE, bs) => (Empty, bs)
       
   551     case (CHAR(f), bs) => (Chr(f), bs)
       
   552     case (ALTS(r1, r2), Z::bs1) => {
       
   553         val (v, bs2) = decode_aux(r1, bs1)
       
   554         (Left(v), bs2)
       
   555     }
       
   556     case (ALTS(r1, r2), S::bs1) => {
       
   557         val (v, bs2) = decode_aux(r2, bs1)
       
   558         (Right(v), bs2)
       
   559     }
       
   560     case (SEQ(r1, r2), bs) => {
       
   561       val (v1, bs1) = decode_aux(r1, bs)
       
   562       val (v2, bs2) = decode_aux(r2, bs1)
       
   563       (Sequ(v1, v2), bs2)
       
   564     }
       
   565     case (STAR(r1), S::bs) => {
       
   566       val (v, bs1) = decode_aux(r1, bs)
       
   567       //println(v)
       
   568       val (Stars(vs), bs2) = decode_aux(STAR(r1), bs1)
       
   569       (Stars(v::vs), bs2)
       
   570     }
       
   571     case (STAR(_), Z::bs) => (Stars(Nil), bs)
       
   572     case (RECD(x, r1), bs) => {
       
   573       val (v, bs1) = decode_aux(r1, bs)
       
   574       (Rec(x, v), bs1)
       
   575     }
       
   576     case (NOT(r), bs) => (Nots(prettyRexp(r)), bs)
       
   577   }
       
   578 
       
   579   def decode(r: Rexp, bs: Bits) = decode_aux(r, bs) match {
       
   580     case (v, Nil) => v
       
   581     case _ => throw new Exception("Not decodable")
       
   582   }
       
   583 
       
   584 
       
   585 
       
   586 def blexSimp(r: Rexp, s: String) : List[Bit] = {
       
   587     blex_simp(internalise(r), s.toList)
       
   588 }
       
   589 
       
   590 def blexing_simp(r: Rexp, s: String) : Val = {
       
   591     val bit_code = blex_simp(internalise(r), s.toList)
       
   592     decode(r, bit_code)
       
   593   }
       
   594 
       
   595   def strong_blexing_simp(r: Rexp, s: String) : Val = {
       
   596     decode(r, strong_blex_simp(internalise(r), s.toList))
       
   597   }
       
   598 
       
   599   def strong_blex_simp(r: ARexp, s: List[Char]) :Bits = s match {
       
   600     case Nil => {
       
   601       if (bnullable(r)) {
       
   602         //println(asize(r))
       
   603         val bits = mkepsBC(r)
       
   604         bits
       
   605       }
       
   606       else 
       
   607         throw new Exception("Not matched")
       
   608     }
       
   609     case c::cs => {
       
   610       val der_res = bder(c,r)
       
   611       val simp_res = strongBsimp(der_res)  
       
   612       strong_blex_simp(simp_res, cs)      
       
   613     }
       
   614   }
       
   615 
       
   616 
       
   617 
       
   618   def bders_simp(s: List[Char], r: ARexp) : ARexp = s match {
       
   619     case Nil => r
       
   620     case c::s => bders_simp(s, bsimp(bder(c, r)))
       
   621   }
       
   622   
       
   623   def bdersSimp(s: String, r: Rexp) : ARexp = bders_simp(s.toList, internalise(r))
       
   624 
       
   625   def bders_simpS(s: List[Char], r: ARexp) : ARexp = s match {
       
   626     case Nil => r 
       
   627     case c::s => bders_simpS(s, strongBsimp(bder(c, r)))
       
   628   }
       
   629 
       
   630   def bdersSimpS(s: String, r: Rexp) : ARexp = bders_simpS(s.toList, internalise(r))
       
   631 
       
   632   def erase(r:ARexp): Rexp = r match{
       
   633     case AZERO => ZERO
       
   634     case AONE(_) => ONE
       
   635     case ACHAR(bs, c) => CHAR(c)
       
   636     case AALTS(bs, Nil) => ZERO
       
   637     case AALTS(bs, a::Nil) => erase(a)
       
   638     case AALTS(bs, a::as) => ALTS(erase(a), erase(AALTS(bs, as)))
       
   639     case ASEQ(bs, r1, r2) => SEQ (erase(r1), erase(r2))
       
   640     case ASTAR(cs, r)=> STAR(erase(r))
       
   641     case ANOT(bs, r) => NOT(erase(r))
       
   642     case AANYCHAR(bs) => ANYCHAR
       
   643   }
       
   644 
       
   645 
       
   646 def allCharSeq(r: Rexp) : Boolean = r match {
       
   647   case CHAR(c) => true
       
   648   case SEQ(r1, r2) => allCharSeq(r1) && allCharSeq(r2)
       
   649   case _ => false
       
   650 }
       
   651 
       
   652 def flattenSeq(r: Rexp) : String = r match {
       
   653   case CHAR(c) => c.toString
       
   654   case SEQ(r1, r2) => flattenSeq(r1) ++ flattenSeq(r2)
       
   655   case _ => throw new Error("flatten unflattenable rexp")
       
   656 } 
       
   657 
       
   658 def shortRexpOutput(r: Rexp) : String = r match {
       
   659     case CHAR(c) => c.toString
       
   660     case ONE => "1"
       
   661     case ZERO => "0"
       
   662     case SEQ(r1, r2) if(allCharSeq(r)) => flattenSeq(r)//"[" ++ shortRexpOutput(r1) ++ "~" ++ shortRexpOutput(r2) ++ "]"
       
   663     case SEQ(r1, r2) => "[" ++ shortRexpOutput(r1) ++ "~" ++ shortRexpOutput(r2) ++ "]"
       
   664     case ALTS(r1, r2) => "(" ++ shortRexpOutput(r1) ++ "+" ++ shortRexpOutput(r2) ++ ")"
       
   665     case STAR(STAR(r)) => "(..)*"// ++ shortRexpOutput(r) ++ "]*"
       
   666     case STAR(r) => "STAR(" ++ shortRexpOutput(r) ++ ")"
       
   667     //case RTOP => "RTOP"
       
   668   }
       
   669 
       
   670 
       
   671 def blex_simp(r: ARexp, s: List[Char]) : Bits = s match {
       
   672     case Nil => {
       
   673       if (bnullable(r)) {
       
   674         //println(asize(r))
       
   675         val bits = mkepsBC(r)
       
   676         bits
       
   677       }
       
   678       else 
       
   679         throw new Exception("Not matched")
       
   680     }
       
   681     case c::cs => {
       
   682       val der_res = bder(c,r)
       
   683       val simp_res = bsimp(der_res)  
       
   684       blex_simp(simp_res, cs)      
       
   685     }
       
   686   }
       
   687   def size(r: Rexp) : Int = r match {
       
   688     case ZERO => 1
       
   689     case ONE => 1
       
   690     case CHAR(_) => 1
       
   691     case ANYCHAR => 1
       
   692     case NOT(r0) => 1 + size(r0)
       
   693     case SEQ(r1, r2) => 1 + size(r1) + size(r2)
       
   694     case ALTS(r1, r2) => 1 + List(r1, r2).map(size).sum
       
   695     case STAR(r) => 1 + size(r)
       
   696   }
       
   697 
       
   698   def asize(a: ARexp) = size(erase(a))
       
   699 
       
   700 //pder related
       
   701 type Mon = (Char, Rexp)
       
   702 type Lin = Set[Mon]
       
   703 
       
   704 def dot_prod(rs: Set[Rexp], r: Rexp): Set[Rexp] = r match {
       
   705     case ZERO => Set()
       
   706     case ONE => rs
       
   707     case r => rs.map((re) => if (re == ONE) r else SEQ(re, r)  )   
       
   708   }
       
   709   def cir_prod(l: Lin, t: Rexp): Lin = t match {//remember this Lin is different from the Lin in Antimirov's paper. Here it does not mean the set of all subsets of linear forms that does not contain zero, but rather the type a set of linear forms
       
   710     case ZERO => Set()
       
   711     case ONE => l
       
   712     case t => l.map( m => m._2 match {case ZERO => m case ONE => (m._1, t) case p => (m._1, SEQ(p, t)) }  )
       
   713   }
       
   714   def lf(r: Rexp): Lin = r match {
       
   715     case ZERO => Set()
       
   716     case ONE => Set()
       
   717     case CHAR(f) => {
       
   718       //val Some(c) = alphabet.find(f) 
       
   719       Set((f, ONE))
       
   720     }
       
   721     case ALTS(r1, r2) => {
       
   722       lf(r1 ) ++ lf(r2)
       
   723     }
       
   724     case STAR(r1) => cir_prod(lf(r1), STAR(r1)) //may try infix notation later......
       
   725     case SEQ(r1, r2) =>{
       
   726       if (nullable(r1))
       
   727         cir_prod(lf(r1), r2) ++ lf(r2)
       
   728       else
       
   729         cir_prod(lf(r1), r2) 
       
   730     }
       
   731   }
       
   732   def lfs(r: Set[Rexp]): Lin = {
       
   733     r.foldLeft(Set[Mon]())((acc, r) => acc ++ lf(r))
       
   734   }
       
   735 
       
   736   def pder(x: Char, t: Rexp): Set[Rexp] = {
       
   737     val lft = lf(t)
       
   738     (lft.filter(mon => if(mon._1 == x) true else false)).map(mon => mon._2)
       
   739   }
       
   740   def pders_single(s: List[Char], t: Rexp) : Set[Rexp] = s match {
       
   741     case x::xs => pders(xs, pder(x, t))
       
   742     case Nil => Set(t)
       
   743   }
       
   744   def pders(s: List[Char], ts: Set[Rexp]) : Set[Rexp] = s match {
       
   745     case x::xs => pders(xs, ts.foldLeft(Set[Rexp]())((acc, t) => acc ++ pder(x, t)))
       
   746     case Nil => ts 
       
   747   }
       
   748   def pderss(ss: List[List[Char]], t: Rexp): Set[Rexp] = ss.foldLeft( Set[Rexp]() )( (acc, s) => pders_single(s, t) ++ acc )
       
   749   def pdera(t: Rexp): Set[Rexp] = lf(t).map(mon => mon._2)
       
   750   //all implementation of partial derivatives that involve set union are potentially buggy
       
   751   //because they don't include the original regular term before they are pdered.
       
   752   //now only pderas is fixed.
       
   753   def pderas(t: Set[Rexp], d: Int): Set[Rexp] = if(d > 0) pderas(lfs(t).map(mon => mon._2), d - 1) ++ t else lfs(t).map(mon => mon._2) ++ t//repeated application of pderas over the newest set of pders.
       
   754   def pderUNIV(r: Rexp) : Set[Rexp] = pderas(Set(r), awidth(r) + 1)
       
   755   def awidth(r: Rexp) : Int = r match {
       
   756     case CHAR(c) => 1
       
   757     case SEQ(r1, r2) => awidth(r1) + awidth(r2)
       
   758     case ALTS(r1, r2) => awidth(r1) + awidth(r2)
       
   759     case ONE => 0
       
   760     case ZERO => 0
       
   761     case STAR(r) => awidth(r)
       
   762   }
       
   763   //def sigma_lf(l: Set[Mon]) : Rexp = ALTS(l.map(mon => SEQ(CHAR(mon._1),mon._2)).toList)
       
   764   //def sigma(rs: Set[Rexp]) : Rexp = ALTS(rs.toList)
       
   765   def o(r: Rexp) = if (nullable(r)) ONE else ZERO
       
   766   //def nlf(t: Rexp) : Rexp = ALTS(List( o(t), sigma_lf(lf(t)) ))
       
   767   def pdp(x: Char, r: Rexp) : Set[Rexp] = r match {
       
   768     case ZERO => Set[Rexp]()
       
   769     case ONE => Set[Rexp]()
       
   770     case CHAR(f) => if(x == f) Set(ONE) else Set[Rexp]()
       
   771     case ALTS(r1, r2) => pdp(x, r1) ++ pdp(x, r2)
       
   772     case STAR(r1) => pdp(x, r).map(a => SEQ(a, STAR(r1)))
       
   773     case SEQ(a0, b) => if(nullable(a0)) pdp(x, a0).map(a => SEQ(a, b)) ++ pdp(x, b) else pdp(x, a0).map(a => SEQ(a, b))
       
   774   }
       
   775   def pdps(s: List[Char], ts: Set[Rexp]): Set[Rexp] = s match {
       
   776     case x::xs => pdps(xs, ts.foldLeft(Set[Rexp]())((acc, t) => acc ++ pder(x, t)))
       
   777     case Nil => ts   
       
   778   }
       
   779   def pdpss(ss: List[List[Char]], t: Rexp): Set[Rexp] = ss.foldLeft( Set[Rexp]())((acc, s) => pdps(s, Set(t)) ++ acc)
       
   780 
       
   781 
       
   782 
       
   783 def starPrint(r: Rexp) : Unit = r match {
       
   784         
       
   785           case SEQ(head, rstar) =>
       
   786             println(shortRexpOutput(head) ++ "~STARREG")
       
   787           case STAR(rstar) =>
       
   788             println("STARREG")
       
   789           case ALTS(r1, r2) =>  
       
   790             println("(")
       
   791             starPrint(r1)
       
   792             println("+")
       
   793             starPrint(r2)
       
   794             println(")")
       
   795           case ZERO => println("0")
       
   796         
       
   797       }
       
   798 
       
   799 // @arg(doc = "small tests")
       
   800 val STARREG = //((( (STAR("aa")) | STAR("aaa") ).%).%)
       
   801 (((STAR("a") | ( STAR("aaa")) | STAR("aaaaa"| ( STAR("aaaaaaa")) | STAR("aaaaaaaaaaa"))).%).%).%
       
   802 
       
   803 // @main
       
   804 def small() = {
       
   805 
       
   806 
       
   807 //   println(lexing_simp(NOTREG, prog0))
       
   808 //   val v = lex_simp(NOTREG, prog0.toList)
       
   809 //   println(v)
       
   810 
       
   811 //   val d =  (lex_simp(NOTREG, prog0.toList))
       
   812 //   println(d)
       
   813   val pderSTAR = pderUNIV(STARREG)
       
   814 
       
   815   val refSize = pderSTAR.map(size(_)).sum
       
   816   println("different partial derivative terms:")
       
   817   pderSTAR.foreach(r => r match {
       
   818       
       
   819         case SEQ(head, rstar) =>
       
   820           println(shortRexpOutput(head) ++ "~STARREG")
       
   821         case STAR(rstar) =>
       
   822           println("STARREG")
       
   823       
       
   824     }
       
   825     )
       
   826   println("the total number of terms is")
       
   827   //println(refSize)
       
   828   println(pderSTAR.size)
       
   829 
       
   830   // val A : Rexp= ("c" | (ONE | "b") ~ "d") ~((ONE).%)
       
   831   // val B : Rexp = ((ONE).%)
       
   832   // val C : Rexp = ("d") ~ ((ONE).%)
       
   833   // val PRUNE_REG : Rexp = (A | B | C)
       
   834   // val APRUNE_REG = internalise(PRUNE_REG)
       
   835   // val program_solution = pruneRexp(APRUNE_REG, breakIntoTerms(PRUNE_REG))
       
   836   // println("program executes and gives: as disired!")
       
   837   // println(shortRexpOutput(erase(program_solution)))
       
   838   
       
   839   for(i <- List( 4, 8, 9, 17, 29) ){// 100, 400, 800, 840, 841, 900
       
   840     val prog0 = "a" * i
       
   841     //println(s"test: $prog0")
       
   842     println(s"testing with $i a's" )
       
   843     val bd = bdersSimp(prog0, STARREG)//DB
       
   844     val sbd = bdersSimpS(prog0, STARREG)//strongDB
       
   845     starPrint(erase(sbd))
       
   846     val subTerms = breakIntoTerms(erase(sbd))
       
   847     val subTermsLarge = breakIntoTerms(erase(bd))
       
   848     
       
   849     println(s"subterms of regex with strongDB: ${subTerms.length}, standard DB: ${subTermsLarge.length}")
       
   850 
       
   851 
       
   852 
       
   853     println("the number of distinct subterms for bsimp with strongDB and standardDB")
       
   854     println(subTerms.distinct.size)
       
   855     println(subTermsLarge.distinct.size)
       
   856     println("which coincides with the number of PDER terms")
       
   857 
       
   858 
       
   859     // println(shortRexpOutput(erase(sbd)))
       
   860     // println(shortRexpOutput(erase(bd)))
       
   861     
       
   862     println("pdersize, original, strongSimp, bsimp")
       
   863     println(refSize, size(STARREG),  asize(sbd), asize(bd))
       
   864 
       
   865     val vres = strong_blexing_simp( STARREG, prog0)
       
   866     println(vres)
       
   867   }
       
   868 //   println(vs.length)
       
   869 //   println(vs)
       
   870    
       
   871 
       
   872   // val prog1 = """read  n; write n"""  
       
   873   // println(s"test: $prog1")
       
   874   // println(lexing_simp(WHILE_REGS, prog1))
       
   875 }
       
   876 
       
   877 small()