| 487 |      1 | // Thompson Construction
 | 
| 733 |      2 | //=======================
 | 
|  |      3 | 
 | 
|  |      4 | import $file.dfa, dfa._ 
 | 
|  |      5 | import $file.nfa, nfa._
 | 
|  |      6 | import $file.enfa, enfa._
 | 
| 487 |      7 | 
 | 
|  |      8 | 
 | 
|  |      9 | // states for Thompson construction
 | 
|  |     10 | case class TState(i: Int) extends State
 | 
|  |     11 | 
 | 
|  |     12 | object TState {
 | 
|  |     13 |   var counter = 0
 | 
|  |     14 |   
 | 
|  |     15 |   def apply() : TState = {
 | 
|  |     16 |     counter += 1;
 | 
| 733 |     17 |     new TState(counter)
 | 
| 487 |     18 |   }
 | 
|  |     19 | }
 | 
|  |     20 | 
 | 
|  |     21 | 
 | 
|  |     22 | // some types abbreviations
 | 
|  |     23 | type NFAt = NFA[TState, Char]
 | 
|  |     24 | type NFAtrans = (TState, Char) :=> Set[TState]
 | 
|  |     25 | type eNFAtrans = (TState, Option[Char]) :=> Set[TState]
 | 
|  |     26 | 
 | 
| 733 |     27 |  
 | 
| 487 |     28 | // NFA that does not accept any string
 | 
|  |     29 | def NFA_ZERO(): NFAt = {
 | 
| 486 |     30 |   val Q = TState()
 | 
|  |     31 |   NFA(Set(Q), { case _ => Set() }, Set())
 | 
|  |     32 | }
 | 
|  |     33 | 
 | 
| 487 |     34 | // NFA that accepts the empty string
 | 
|  |     35 | def NFA_ONE() : NFAt = {
 | 
| 486 |     36 |   val Q = TState()
 | 
|  |     37 |   NFA(Set(Q), { case _ => Set() }, Set(Q))
 | 
|  |     38 | }
 | 
|  |     39 | 
 | 
| 487 |     40 | // NFA that accepts the string "c"
 | 
|  |     41 | def NFA_CHAR(c: Char) : NFAt = {
 | 
| 486 |     42 |   val Q1 = TState()
 | 
|  |     43 |   val Q2 = TState()
 | 
|  |     44 |   NFA(Set(Q1), { case (Q1, d) if (c == d) => Set(Q2) }, Set(Q2))
 | 
|  |     45 | }
 | 
|  |     46 | 
 | 
| 753 |     47 | 
 | 
|  |     48 | // for composing an eNFA transition with an NFA transition
 | 
|  |     49 | // | is for set union
 | 
| 932 |     50 | extension (f: eNFAtrans) {
 | 
| 753 |     51 |   def +++(g: NFAtrans) : eNFAtrans = 
 | 
|  |     52 |   { case (q, None) =>  applyOrElse(f, (q, None)) 
 | 
|  |     53 |     case (q, Some(c)) => applyOrElse(f, (q, Some(c))) | applyOrElse(g, (q, c))  }
 | 
|  |     54 | }
 | 
|  |     55 | 
 | 
|  |     56 | 
 | 
| 487 |     57 | // sequence of two NFAs
 | 
| 779 |     58 | def NFA_SEQ(nfa1: NFAt, nfa2: NFAt) : NFAt = {
 | 
| 487 |     59 |   val new_delta : eNFAtrans = 
 | 
| 779 |     60 |     { case (q, None) if nfa1.fins(q) => nfa2.starts }
 | 
| 487 |     61 |   
 | 
| 779 |     62 |   eNFA(nfa1.starts, 
 | 
|  |     63 |        new_delta +++ nfa1.delta +++ nfa2.delta, 
 | 
|  |     64 |        nfa2.fins)
 | 
| 487 |     65 | }
 | 
|  |     66 | 
 | 
|  |     67 | // alternative of two NFAs
 | 
| 779 |     68 | def NFA_ALT(nfa1: NFAt, nfa2: NFAt) : NFAt = {
 | 
| 489 |     69 |   val new_delta : NFAtrans = { 
 | 
| 779 |     70 |     case (q, c) =>  applyOrElse(nfa1.delta, (q, c)) | 
 | 
|  |     71 |                     applyOrElse(nfa2.delta, (q, c)) }
 | 
|  |     72 |   val new_fins = (q: TState) => nfa1.fins(q) || nfa2.fins(q)
 | 
| 486 |     73 | 
 | 
| 779 |     74 |   NFA(nfa1.starts | nfa2.starts, new_delta, new_fins)
 | 
| 486 |     75 | }
 | 
|  |     76 | 
 | 
| 487 |     77 | // star of a NFA
 | 
| 779 |     78 | def NFA_STAR(nfa: NFAt) : NFAt = {
 | 
| 486 |     79 |   val Q = TState()
 | 
|  |     80 |   val new_delta : eNFAtrans = 
 | 
| 779 |     81 |     { case (Q, None) => nfa.starts
 | 
|  |     82 |       case (q, None) if nfa.fins(q) => Set(Q) }
 | 
| 486 |     83 | 
 | 
| 779 |     84 |   eNFA(Set(Q), new_delta +++ nfa.delta, Set(Q))
 | 
| 487 |     85 | }
 | 
|  |     86 | 
 | 
|  |     87 | 
 | 
| 733 |     88 | // We are now ready to translate regular expressions
 | 
|  |     89 | // into DFAs (via eNFAs and NFAs, and the subset construction)
 | 
| 487 |     90 | 
 | 
|  |     91 | // regular expressions
 | 
|  |     92 | abstract class Rexp
 | 
|  |     93 | case object ZERO extends Rexp                    // matches nothing
 | 
|  |     94 | case object ONE extends Rexp                     // matches the empty string
 | 
|  |     95 | case class CHAR(c: Char) extends Rexp            // matches a character c
 | 
|  |     96 | case class ALT(r1: Rexp, r2: Rexp) extends Rexp  // alternative
 | 
|  |     97 | case class SEQ(r1: Rexp, r2: Rexp) extends Rexp  // sequence
 | 
|  |     98 | case class STAR(r: Rexp) extends Rexp            // star
 | 
|  |     99 | 
 | 
|  |    100 | // thompson construction 
 | 
|  |    101 | def thompson (r: Rexp) : NFAt = r match {
 | 
|  |    102 |   case ZERO => NFA_ZERO()
 | 
|  |    103 |   case ONE => NFA_ONE()
 | 
|  |    104 |   case CHAR(c) => NFA_CHAR(c)  
 | 
|  |    105 |   case ALT(r1, r2) => NFA_ALT(thompson(r1), thompson(r2))
 | 
|  |    106 |   case SEQ(r1, r2) => NFA_SEQ(thompson(r1), thompson(r2))
 | 
|  |    107 |   case STAR(r1) => NFA_STAR(thompson(r1))
 | 
| 486 |    108 | }
 | 
| 487 |    109 | 
 | 
|  |    110 | //optional regular expression (one or zero times)
 | 
|  |    111 | def OPT(r: Rexp) = ALT(r, ONE)
 | 
|  |    112 | 
 | 
|  |    113 | //n-times regular expression (explicitly expanded)
 | 
|  |    114 | def NTIMES(r: Rexp, n: Int) : Rexp = n match {
 | 
|  |    115 |   case 0 => ONE
 | 
|  |    116 |   case 1 => r
 | 
|  |    117 |   case n => SEQ(r, NTIMES(r, n - 1))
 | 
|  |    118 | }
 | 
|  |    119 | 
 | 
|  |    120 | 
 | 
| 733 |    121 | def tmatches_nfa(r: Rexp, s: String) : Boolean =
 | 
| 488 |    122 |   thompson(r).accepts(s.toList)
 | 
|  |    123 | 
 | 
| 733 |    124 | def tmatches_nfa2(r: Rexp, s: String) : Boolean =
 | 
| 488 |    125 |   thompson(r).accepts2(s.toList)
 | 
|  |    126 | 
 | 
| 733 |    127 | // dfas via subset construction
 | 
| 491 |    128 | def tmatches_dfa(r: Rexp, s: String) : Boolean =
 | 
|  |    129 |   subset(thompson(r)).accepts(s.toList)
 | 
| 488 |    130 | 
 | 
| 487 |    131 | // Test Cases
 | 
| 733 |    132 | //============
 | 
| 488 |    133 | 
 | 
| 487 |    134 | // the evil regular expression  a?{n} a{n}
 | 
| 489 |    135 | def EVIL1(n: Int) : Rexp = SEQ(NTIMES(OPT(CHAR('a')), n), NTIMES(CHAR('a'), n))
 | 
| 487 |    136 | 
 | 
|  |    137 | // the evil regular expression (a*)*b
 | 
| 489 |    138 | val EVIL2 : Rexp = SEQ(STAR(STAR(CHAR('a'))), CHAR('b'))
 | 
| 487 |    139 | 
 | 
|  |    140 | //for measuring time
 | 
|  |    141 | def time_needed[T](i: Int, code: => T) = {
 | 
|  |    142 |   val start = System.nanoTime()
 | 
|  |    143 |   for (j <- 1 to i) code
 | 
|  |    144 |   val end = System.nanoTime()
 | 
|  |    145 |   (end - start)/(i * 1.0e9)
 | 
| 488 |    146 | }
 | 
| 487 |    147 | 
 | 
|  |    148 | // the size of the NFA can be large, 
 | 
|  |    149 | // thus slowing down the breadth-first search
 | 
| 779 |    150 | println("Breadth-first search EVIL1 / EVIL2")
 | 
| 487 |    151 | 
 | 
| 489 |    152 | for (i <- 1 to 13) {
 | 
| 967 |    153 |   println(s"$i: ${"%.5f".format(time_needed(2, tmatches_nfa(EVIL1(i), "a" * i)))}")
 | 
| 487 |    154 | }
 | 
|  |    155 | 
 | 
| 489 |    156 | for (i <- 1 to 100 by 5) {
 | 
| 967 |    157 |   println(s"$i: ${"%.5f".format(time_needed(2, tmatches_nfa(EVIL2, "a" * i)))}")
 | 
| 487 |    158 | }
 | 
|  |    159 | 
 | 
|  |    160 | 
 | 
| 733 |    161 | // the backtracking that is needed in depth-first 
 | 
|  |    162 | // search can be painfully slow
 | 
| 779 |    163 | println("Depth-first search EVIL1 / EVIL2")
 | 
| 487 |    164 | 
 | 
| 779 |    165 | for (i <- 1 to 9) {
 | 
| 967 |    166 |   println(s"$i: ${"%.5f".format(time_needed(2, tmatches_nfa2(EVIL1(i), "a" * i)))}")
 | 
| 779 |    167 | }
 | 
|  |    168 | 
 | 
|  |    169 | for (i <- 1 to 7) {
 | 
| 967 |    170 |   println(s"$i: ${"%.5f".format(time_needed(2, tmatches_nfa2(EVIL2, "a" * i)))}")
 | 
| 487 |    171 | }
 | 
| 491 |    172 | 
 | 
|  |    173 | 
 | 
|  |    174 | 
 | 
| 586 |    175 | // while my thompson->enfa->subset->partial-function-chain
 | 
| 491 |    176 | // is probably not the most effcient way to obtain a fast DFA 
 | 
| 521 |    177 | // (the test below should be much faster with a more direct 
 | 
|  |    178 | // construction), in general the DFAs can be slow because of 
 | 
|  |    179 | // the state explosion in the subset construction
 | 
| 491 |    180 | 
 | 
| 742 |    181 | for (i <- 1 to 7) {
 | 
| 967 |    182 |   println(s"$i: ${"%.5f".format(time_needed(2, tmatches_dfa(EVIL1(i), "a" * i)))}")
 | 
| 491 |    183 | }
 | 
|  |    184 | 
 | 
|  |    185 | for (i <- 1 to 100 by 5) {
 | 
| 967 |    186 |   println(s"$i: ${"%.5f".format(time_needed(2, tmatches_dfa(EVIL2, "a" * i)))}")
 | 
| 491 |    187 | }
 | 
| 784 |    188 | 
 | 
|  |    189 | 
 | 
|  |    190 | 
 | 
|  |    191 | 
 | 
|  |    192 | 
 |