| 487 |      1 | // Thompson Construction
 | 
| 733 |      2 | //=======================
 | 
|  |      3 | 
 | 
|  |      4 | import $file.dfa, dfa._ 
 | 
|  |      5 | import $file.nfa, nfa._
 | 
|  |      6 | import $file.enfa, enfa._
 | 
| 487 |      7 | 
 | 
|  |      8 | 
 | 
|  |      9 | // states for Thompson construction
 | 
|  |     10 | case class TState(i: Int) extends State
 | 
|  |     11 | 
 | 
|  |     12 | object TState {
 | 
|  |     13 |   var counter = 0
 | 
|  |     14 |   
 | 
|  |     15 |   def apply() : TState = {
 | 
|  |     16 |     counter += 1;
 | 
| 733 |     17 |     new TState(counter)
 | 
| 487 |     18 |   }
 | 
|  |     19 | }
 | 
|  |     20 | 
 | 
|  |     21 | 
 | 
|  |     22 | // some types abbreviations
 | 
|  |     23 | type NFAt = NFA[TState, Char]
 | 
|  |     24 | type NFAtrans = (TState, Char) :=> Set[TState]
 | 
|  |     25 | type eNFAtrans = (TState, Option[Char]) :=> Set[TState]
 | 
|  |     26 | 
 | 
|  |     27 | 
 | 
| 733 |     28 | // for composing an eNFA transition with an NFA transition
 | 
|  |     29 | // | is for set union
 | 
|  |     30 | implicit def nfaOps(f: eNFAtrans) = new {
 | 
| 487 |     31 |   def +++(g: NFAtrans) : eNFAtrans = 
 | 
|  |     32 |   { case (q, None) =>  applyOrElse(f, (q, None)) 
 | 
|  |     33 |     case (q, Some(c)) => applyOrElse(f, (q, Some(c))) | applyOrElse(g, (q, c))  }
 | 
|  |     34 | }
 | 
|  |     35 | 
 | 
| 733 |     36 |  
 | 
| 487 |     37 | // NFA that does not accept any string
 | 
|  |     38 | def NFA_ZERO(): NFAt = {
 | 
| 486 |     39 |   val Q = TState()
 | 
|  |     40 |   NFA(Set(Q), { case _ => Set() }, Set())
 | 
|  |     41 | }
 | 
|  |     42 | 
 | 
| 487 |     43 | // NFA that accepts the empty string
 | 
|  |     44 | def NFA_ONE() : NFAt = {
 | 
| 486 |     45 |   val Q = TState()
 | 
|  |     46 |   NFA(Set(Q), { case _ => Set() }, Set(Q))
 | 
|  |     47 | }
 | 
|  |     48 | 
 | 
| 487 |     49 | // NFA that accepts the string "c"
 | 
|  |     50 | def NFA_CHAR(c: Char) : NFAt = {
 | 
| 486 |     51 |   val Q1 = TState()
 | 
|  |     52 |   val Q2 = TState()
 | 
|  |     53 |   NFA(Set(Q1), { case (Q1, d) if (c == d) => Set(Q2) }, Set(Q2))
 | 
|  |     54 | }
 | 
|  |     55 | 
 | 
| 487 |     56 | // sequence of two NFAs
 | 
|  |     57 | def NFA_SEQ(enfa1: NFAt, enfa2: NFAt) : NFAt = {
 | 
|  |     58 |   val new_delta : eNFAtrans = 
 | 
|  |     59 |     { case (q, None) if enfa1.fins(q) => enfa2.starts }
 | 
|  |     60 |   
 | 
| 733 |     61 |   eNFA(enfa1.starts, 
 | 
|  |     62 |        new_delta +++ enfa1.delta +++ enfa2.delta, 
 | 
| 487 |     63 |        enfa2.fins)
 | 
|  |     64 | }
 | 
|  |     65 | 
 | 
|  |     66 | // alternative of two NFAs
 | 
|  |     67 | def NFA_ALT(enfa1: NFAt, enfa2: NFAt) : NFAt = {
 | 
| 489 |     68 |   val new_delta : NFAtrans = { 
 | 
|  |     69 |     case (q, c) =>  applyOrElse(enfa1.delta, (q, c)) | 
 | 
|  |     70 |                     applyOrElse(enfa2.delta, (q, c)) }
 | 
| 486 |     71 |   val new_fins = (q: TState) => enfa1.fins(q) || enfa2.fins(q)
 | 
|  |     72 | 
 | 
| 489 |     73 |   NFA(enfa1.starts | enfa2.starts, new_delta, new_fins)
 | 
| 486 |     74 | }
 | 
|  |     75 | 
 | 
| 487 |     76 | // star of a NFA
 | 
|  |     77 | def NFA_STAR(enfa: NFAt) : NFAt = {
 | 
| 486 |     78 |   val Q = TState()
 | 
|  |     79 |   val new_delta : eNFAtrans = 
 | 
|  |     80 |     { case (Q, None) => enfa.starts
 | 
|  |     81 |       case (q, None) if enfa.fins(q) => Set(Q) }
 | 
|  |     82 | 
 | 
| 487 |     83 |   eNFA(Set(Q), new_delta +++ enfa.delta, Set(Q))
 | 
|  |     84 | }
 | 
|  |     85 | 
 | 
|  |     86 | 
 | 
| 733 |     87 | // We are now ready to translate regular expressions
 | 
|  |     88 | // into DFAs (via eNFAs and NFAs, and the subset construction)
 | 
| 487 |     89 | 
 | 
|  |     90 | // regular expressions
 | 
|  |     91 | abstract class Rexp
 | 
|  |     92 | case object ZERO extends Rexp                    // matches nothing
 | 
|  |     93 | case object ONE extends Rexp                     // matches the empty string
 | 
|  |     94 | case class CHAR(c: Char) extends Rexp            // matches a character c
 | 
|  |     95 | case class ALT(r1: Rexp, r2: Rexp) extends Rexp  // alternative
 | 
|  |     96 | case class SEQ(r1: Rexp, r2: Rexp) extends Rexp  // sequence
 | 
|  |     97 | case class STAR(r: Rexp) extends Rexp            // star
 | 
|  |     98 | 
 | 
|  |     99 | // thompson construction 
 | 
|  |    100 | def thompson (r: Rexp) : NFAt = r match {
 | 
|  |    101 |   case ZERO => NFA_ZERO()
 | 
|  |    102 |   case ONE => NFA_ONE()
 | 
|  |    103 |   case CHAR(c) => NFA_CHAR(c)  
 | 
|  |    104 |   case ALT(r1, r2) => NFA_ALT(thompson(r1), thompson(r2))
 | 
|  |    105 |   case SEQ(r1, r2) => NFA_SEQ(thompson(r1), thompson(r2))
 | 
|  |    106 |   case STAR(r1) => NFA_STAR(thompson(r1))
 | 
| 486 |    107 | }
 | 
| 487 |    108 | 
 | 
|  |    109 | //optional regular expression (one or zero times)
 | 
|  |    110 | def OPT(r: Rexp) = ALT(r, ONE)
 | 
|  |    111 | 
 | 
|  |    112 | //n-times regular expression (explicitly expanded)
 | 
|  |    113 | def NTIMES(r: Rexp, n: Int) : Rexp = n match {
 | 
|  |    114 |   case 0 => ONE
 | 
|  |    115 |   case 1 => r
 | 
|  |    116 |   case n => SEQ(r, NTIMES(r, n - 1))
 | 
|  |    117 | }
 | 
|  |    118 | 
 | 
|  |    119 | 
 | 
| 733 |    120 | def tmatches_nfa(r: Rexp, s: String) : Boolean =
 | 
| 488 |    121 |   thompson(r).accepts(s.toList)
 | 
|  |    122 | 
 | 
| 733 |    123 | def tmatches_nfa2(r: Rexp, s: String) : Boolean =
 | 
| 488 |    124 |   thompson(r).accepts2(s.toList)
 | 
|  |    125 | 
 | 
| 733 |    126 | // dfas via subset construction
 | 
| 491 |    127 | def tmatches_dfa(r: Rexp, s: String) : Boolean =
 | 
|  |    128 |   subset(thompson(r)).accepts(s.toList)
 | 
| 488 |    129 | 
 | 
| 487 |    130 | // Test Cases
 | 
| 733 |    131 | //============
 | 
| 488 |    132 | 
 | 
| 487 |    133 | // the evil regular expression  a?{n} a{n}
 | 
| 489 |    134 | def EVIL1(n: Int) : Rexp = SEQ(NTIMES(OPT(CHAR('a')), n), NTIMES(CHAR('a'), n))
 | 
| 487 |    135 | 
 | 
|  |    136 | // the evil regular expression (a*)*b
 | 
| 489 |    137 | val EVIL2 : Rexp = SEQ(STAR(STAR(CHAR('a'))), CHAR('b'))
 | 
| 487 |    138 | 
 | 
|  |    139 | //for measuring time
 | 
|  |    140 | def time_needed[T](i: Int, code: => T) = {
 | 
|  |    141 |   val start = System.nanoTime()
 | 
|  |    142 |   for (j <- 1 to i) code
 | 
|  |    143 |   val end = System.nanoTime()
 | 
|  |    144 |   (end - start)/(i * 1.0e9)
 | 
| 488 |    145 | }
 | 
| 487 |    146 | 
 | 
|  |    147 | // the size of the NFA can be large, 
 | 
|  |    148 | // thus slowing down the breadth-first search
 | 
|  |    149 | 
 | 
| 489 |    150 | for (i <- 1 to 13) {
 | 
| 733 |    151 |   println(i + ": " + "%.5f".format(time_needed(2, tmatches_nfa(EVIL1(i), "a" * i))))
 | 
| 487 |    152 | }
 | 
|  |    153 | 
 | 
| 489 |    154 | for (i <- 1 to 100 by 5) {
 | 
| 733 |    155 |   println(i + " " + "%.5f".format(time_needed(2, tmatches_nfa(EVIL2, "a" * i))))
 | 
| 487 |    156 | }
 | 
|  |    157 | 
 | 
|  |    158 | 
 | 
| 733 |    159 | // the backtracking that is needed in depth-first 
 | 
|  |    160 | // search can be painfully slow
 | 
| 487 |    161 | 
 | 
|  |    162 | for (i <- 1 to 8) {
 | 
| 733 |    163 |   println(i + " " + "%.5f".format(time_needed(2, tmatches_nfa2(EVIL2, "a" * i))))
 | 
| 487 |    164 | }
 | 
| 491 |    165 | 
 | 
|  |    166 | 
 | 
|  |    167 | 
 | 
| 586 |    168 | // while my thompson->enfa->subset->partial-function-chain
 | 
| 491 |    169 | // is probably not the most effcient way to obtain a fast DFA 
 | 
| 521 |    170 | // (the test below should be much faster with a more direct 
 | 
|  |    171 | // construction), in general the DFAs can be slow because of 
 | 
|  |    172 | // the state explosion in the subset construction
 | 
| 491 |    173 | 
 | 
|  |    174 | for (i <- 1 to 13) {
 | 
|  |    175 |   println(i + ": " + "%.5f".format(time_needed(2, tmatches_dfa(EVIL1(i), "a" * i))))
 | 
|  |    176 | }
 | 
|  |    177 | 
 | 
|  |    178 | for (i <- 1 to 100 by 5) {
 | 
|  |    179 |   println(i + " " + "%.5f".format(time_needed(2, tmatches_dfa(EVIL2, "a" * i))))
 | 
|  |    180 | }
 |