|         |      1 // Thompson Construction | 
|         |      2 // (needs  :load dfa.scala | 
|         |      3 //         :load nfa.scala | 
|         |      4 //         :load enfa.scala) | 
|         |      5  | 
|         |      6  | 
|         |      7 // states for Thompson construction | 
|         |      8 case class TState(i: Int) extends State | 
|         |      9  | 
|         |     10 object TState { | 
|         |     11   var counter = 0 | 
|         |     12    | 
|         |     13   def apply() : TState = { | 
|         |     14     counter += 1; | 
|         |     15     new TState(counter - 1) | 
|         |     16   } | 
|         |     17 } | 
|         |     18  | 
|         |     19  | 
|         |     20 // some types abbreviations | 
|         |     21 type NFAt = NFA[TState, Char] | 
|         |     22 type NFAtrans = (TState, Char) :=> Set[TState] | 
|         |     23 type eNFAtrans = (TState, Option[Char]) :=> Set[TState] | 
|         |     24  | 
|         |     25  | 
|         |     26 // for composing an eNFA transition with a NFA transition | 
|         |     27 implicit class RichPF(val f: eNFAtrans) extends AnyVal { | 
|         |     28   def +++(g: NFAtrans) : eNFAtrans =  | 
|         |     29   { case (q, None) =>  applyOrElse(f, (q, None))  | 
|         |     30     case (q, Some(c)) => applyOrElse(f, (q, Some(c))) | applyOrElse(g, (q, c))  } | 
|         |     31 } | 
|         |     32  | 
|         |     33  | 
|         |     34 // NFA that does not accept any string | 
|         |     35 def NFA_ZERO(): NFAt = { | 
|         |     36   val Q = TState() | 
|         |     37   NFA(Set(Q), { case _ => Set() }, Set()) | 
|         |     38 } | 
|         |     39  | 
|         |     40 // NFA that accepts the empty string | 
|         |     41 def NFA_ONE() : NFAt = { | 
|         |     42   val Q = TState() | 
|         |     43   NFA(Set(Q), { case _ => Set() }, Set(Q)) | 
|         |     44 } | 
|         |     45  | 
|         |     46 // NFA that accepts the string "c" | 
|         |     47 def NFA_CHAR(c: Char) : NFAt = { | 
|         |     48   val Q1 = TState() | 
|         |     49   val Q2 = TState() | 
|         |     50   NFA(Set(Q1), { case (Q1, d) if (c == d) => Set(Q2) }, Set(Q2)) | 
|         |     51 } | 
|         |     52  | 
|         |     53 // sequence of two NFAs | 
|         |     54 def NFA_SEQ(enfa1: NFAt, enfa2: NFAt) : NFAt = { | 
|         |     55   val new_delta : eNFAtrans =  | 
|         |     56     { case (q, None) if enfa1.fins(q) => enfa2.starts } | 
|         |     57    | 
|         |     58   eNFA(enfa1.starts, new_delta +++ enfa1.delta +++ enfa2.delta,  | 
|         |     59        enfa2.fins) | 
|         |     60 } | 
|         |     61  | 
|         |     62 // alternative of two NFAs | 
|         |     63 def NFA_ALT(enfa1: NFAt, enfa2: NFAt) : NFAt = { | 
|         |     64   val new_delta : NFAtrans = {  | 
|         |     65     case (q, c) =>  applyOrElse(enfa1.delta, (q, c)) |  | 
|         |     66                     applyOrElse(enfa2.delta, (q, c)) } | 
|         |     67   val new_fins = (q: TState) => enfa1.fins(q) || enfa2.fins(q) | 
|         |     68  | 
|         |     69   NFA(enfa1.starts | enfa2.starts, new_delta, new_fins) | 
|         |     70 } | 
|         |     71  | 
|         |     72 // star of a NFA | 
|         |     73 def NFA_STAR(enfa: NFAt) : NFAt = { | 
|         |     74   val Q = TState() | 
|         |     75   val new_delta : eNFAtrans =  | 
|         |     76     { case (Q, None) => enfa.starts | 
|         |     77       case (q, None) if enfa.fins(q) => Set(Q) } | 
|         |     78  | 
|         |     79   eNFA(Set(Q), new_delta +++ enfa.delta, Set(Q)) | 
|         |     80 } | 
|         |     81  | 
|         |     82  | 
|         |     83  | 
|         |     84 // regular expressions | 
|         |     85 abstract class Rexp | 
|         |     86 case object ZERO extends Rexp                    // matches nothing | 
|         |     87 case object ONE extends Rexp                     // matches the empty string | 
|         |     88 case class CHAR(c: Char) extends Rexp            // matches a character c | 
|         |     89 case class ALT(r1: Rexp, r2: Rexp) extends Rexp  // alternative | 
|         |     90 case class SEQ(r1: Rexp, r2: Rexp) extends Rexp  // sequence | 
|         |     91 case class STAR(r: Rexp) extends Rexp            // star | 
|         |     92  | 
|         |     93  | 
|         |     94  | 
|         |     95  | 
|         |     96 // thompson construction  | 
|         |     97 def thompson (r: Rexp) : NFAt = r match { | 
|         |     98   case ZERO => NFA_ZERO() | 
|         |     99   case ONE => NFA_ONE() | 
|         |    100   case CHAR(c) => NFA_CHAR(c)   | 
|         |    101   case ALT(r1, r2) => NFA_ALT(thompson(r1), thompson(r2)) | 
|         |    102   case SEQ(r1, r2) => NFA_SEQ(thompson(r1), thompson(r2)) | 
|         |    103   case STAR(r1) => NFA_STAR(thompson(r1)) | 
|         |    104 } | 
|         |    105  | 
|         |    106 //optional regular expression (one or zero times) | 
|         |    107 def OPT(r: Rexp) = ALT(r, ONE) | 
|         |    108  | 
|         |    109 //n-times regular expression (explicitly expanded) | 
|         |    110 def NTIMES(r: Rexp, n: Int) : Rexp = n match { | 
|         |    111   case 0 => ONE | 
|         |    112   case 1 => r | 
|         |    113   case n => SEQ(r, NTIMES(r, n - 1)) | 
|         |    114 } | 
|         |    115  | 
|         |    116  | 
|         |    117 def tmatches(r: Rexp, s: String) : Boolean = | 
|         |    118   thompson(r).accepts(s.toList) | 
|         |    119  | 
|         |    120 def tmatches2(r: Rexp, s: String) : Boolean = | 
|         |    121   thompson(r).accepts2(s.toList) | 
|         |    122  | 
|         |    123 // dfa via subset construction | 
|         |    124 def tmatches_dfa(r: Rexp, s: String) : Boolean = | 
|         |    125   subset(thompson(r)).accepts(s.toList) | 
|         |    126  | 
|         |    127 // Test Cases | 
|         |    128  | 
|         |    129  | 
|         |    130 // the evil regular expression  a?{n} a{n} | 
|         |    131 def EVIL1(n: Int) : Rexp = SEQ(NTIMES(OPT(CHAR('a')), n), NTIMES(CHAR('a'), n)) | 
|         |    132  | 
|         |    133 // the evil regular expression (a*)*b | 
|         |    134 val EVIL2 : Rexp = SEQ(STAR(STAR(CHAR('a'))), CHAR('b')) | 
|         |    135  | 
|         |    136 //for measuring time | 
|         |    137 def time_needed[T](i: Int, code: => T) = { | 
|         |    138   val start = System.nanoTime() | 
|         |    139   for (j <- 1 to i) code | 
|         |    140   val end = System.nanoTime() | 
|         |    141   (end - start)/(i * 1.0e9) | 
|         |    142 } | 
|         |    143  | 
|         |    144 // the size of the NFA can be large,  | 
|         |    145 // thus slowing down the breadth-first search | 
|         |    146  | 
|         |    147 for (i <- 1 to 13) { | 
|         |    148   println(i + ": " + "%.5f".format(time_needed(2, tmatches(EVIL1(i), "a" * i)))) | 
|         |    149 } | 
|         |    150  | 
|         |    151 for (i <- 1 to 100 by 5) { | 
|         |    152   println(i + " " + "%.5f".format(time_needed(2, tmatches(EVIL2, "a" * i)))) | 
|         |    153 } | 
|         |    154  | 
|         |    155  | 
|         |    156 // the backtracking needed in depth-first search  | 
|         |    157 // can be painfully slow | 
|         |    158  | 
|         |    159 for (i <- 1 to 8) { | 
|         |    160   println(i + " " + "%.5f".format(time_needed(2, tmatches2(EVIL2, "a" * i)))) | 
|         |    161 } | 
|         |    162  | 
|         |    163  | 
|         |    164  | 
|         |    165 // while my thompson-enfa-subset-partial-function-chain | 
|         |    166 // is probably not the most effcient way to obtain a fast DFA  | 
|         |    167 // (the below should be much faster with a more direct construction), | 
|         |    168 // in general the DFAs can be slow because of the state explosion | 
|         |    169 // in the subset construction | 
|         |    170  | 
|         |    171 for (i <- 1 to 13) { | 
|         |    172   println(i + ": " + "%.5f".format(time_needed(2, tmatches_dfa(EVIL1(i), "a" * i)))) | 
|         |    173 } | 
|         |    174  | 
|         |    175 for (i <- 1 to 100 by 5) { | 
|         |    176   println(i + " " + "%.5f".format(time_needed(2, tmatches_dfa(EVIL2, "a" * i)))) | 
|         |    177 } |