| 487 |      1 | // Thompson Construction
 | 
|  |      2 | // (needs  :load nfa.scala
 | 
|  |      3 | //         :load enfa.scala)
 | 
|  |      4 | 
 | 
|  |      5 | 
 | 
|  |      6 | // states for Thompson construction
 | 
|  |      7 | case class TState(i: Int) extends State
 | 
|  |      8 | 
 | 
|  |      9 | object TState {
 | 
|  |     10 |   var counter = 0
 | 
|  |     11 |   
 | 
|  |     12 |   def apply() : TState = {
 | 
|  |     13 |     counter += 1;
 | 
|  |     14 |     new TState(counter - 1)
 | 
|  |     15 |   }
 | 
|  |     16 | }
 | 
|  |     17 | 
 | 
|  |     18 | 
 | 
|  |     19 | // some types abbreviations
 | 
|  |     20 | type NFAt = NFA[TState, Char]
 | 
|  |     21 | type NFAtrans = (TState, Char) :=> Set[TState]
 | 
|  |     22 | type eNFAtrans = (TState, Option[Char]) :=> Set[TState]
 | 
|  |     23 | 
 | 
|  |     24 | 
 | 
|  |     25 | // for composing an eNFA transition with a NFA transition
 | 
|  |     26 | implicit class RichPF(val f: eNFAtrans) extends AnyVal {
 | 
|  |     27 |   def +++(g: NFAtrans) : eNFAtrans = 
 | 
|  |     28 |   { case (q, None) =>  applyOrElse(f, (q, None)) 
 | 
|  |     29 |     case (q, Some(c)) => applyOrElse(f, (q, Some(c))) | applyOrElse(g, (q, c))  }
 | 
|  |     30 | }
 | 
|  |     31 | 
 | 
|  |     32 | 
 | 
|  |     33 | // NFA that does not accept any string
 | 
|  |     34 | def NFA_ZERO(): NFAt = {
 | 
| 486 |     35 |   val Q = TState()
 | 
|  |     36 |   NFA(Set(Q), { case _ => Set() }, Set())
 | 
|  |     37 | }
 | 
|  |     38 | 
 | 
| 487 |     39 | // NFA that accepts the empty string
 | 
|  |     40 | def NFA_ONE() : NFAt = {
 | 
| 486 |     41 |   val Q = TState()
 | 
|  |     42 |   NFA(Set(Q), { case _ => Set() }, Set(Q))
 | 
|  |     43 | }
 | 
|  |     44 | 
 | 
| 487 |     45 | // NFA that accepts the string "c"
 | 
|  |     46 | def NFA_CHAR(c: Char) : NFAt = {
 | 
| 486 |     47 |   val Q1 = TState()
 | 
|  |     48 |   val Q2 = TState()
 | 
|  |     49 |   NFA(Set(Q1), { case (Q1, d) if (c == d) => Set(Q2) }, Set(Q2))
 | 
|  |     50 | }
 | 
|  |     51 | 
 | 
| 487 |     52 | // sequence of two NFAs
 | 
|  |     53 | def NFA_SEQ(enfa1: NFAt, enfa2: NFAt) : NFAt = {
 | 
|  |     54 |   val new_delta : eNFAtrans = 
 | 
|  |     55 |     { case (q, None) if enfa1.fins(q) => enfa2.starts }
 | 
|  |     56 |   
 | 
|  |     57 |   eNFA(enfa1.starts, new_delta +++ enfa1.delta +++ enfa2.delta, 
 | 
|  |     58 |        enfa2.fins)
 | 
|  |     59 | }
 | 
|  |     60 | 
 | 
|  |     61 | // alternative of two NFAs
 | 
|  |     62 | def NFA_ALT(enfa1: NFAt, enfa2: NFAt) : NFAt = {
 | 
| 486 |     63 |   val Q = TState()
 | 
|  |     64 |   val new_delta : eNFAtrans = 
 | 
|  |     65 |     { case (Q, None) => enfa1.starts | enfa2.starts }
 | 
|  |     66 |   val new_fins = (q: TState) => enfa1.fins(q) || enfa2.fins(q)
 | 
|  |     67 | 
 | 
| 487 |     68 |   eNFA(Set(Q), new_delta +++ enfa1.delta +++ enfa2.delta, new_fins)
 | 
| 486 |     69 | }
 | 
|  |     70 | 
 | 
| 487 |     71 | // star of a NFA
 | 
|  |     72 | def NFA_STAR(enfa: NFAt) : NFAt = {
 | 
| 486 |     73 |   val Q = TState()
 | 
|  |     74 |   val new_delta : eNFAtrans = 
 | 
|  |     75 |     { case (Q, None) => enfa.starts
 | 
|  |     76 |       case (q, None) if enfa.fins(q) => Set(Q) }
 | 
|  |     77 | 
 | 
| 487 |     78 |   eNFA(Set(Q), new_delta +++ enfa.delta, Set(Q))
 | 
|  |     79 | }
 | 
|  |     80 | 
 | 
|  |     81 | 
 | 
|  |     82 | 
 | 
|  |     83 | // regular expressions
 | 
|  |     84 | abstract class Rexp
 | 
|  |     85 | case object ZERO extends Rexp                    // matches nothing
 | 
|  |     86 | case object ONE extends Rexp                     // matches the empty string
 | 
|  |     87 | case class CHAR(c: Char) extends Rexp            // matches a character c
 | 
|  |     88 | case class ALT(r1: Rexp, r2: Rexp) extends Rexp  // alternative
 | 
|  |     89 | case class SEQ(r1: Rexp, r2: Rexp) extends Rexp  // sequence
 | 
|  |     90 | case class STAR(r: Rexp) extends Rexp            // star
 | 
|  |     91 | 
 | 
|  |     92 | 
 | 
|  |     93 | 
 | 
|  |     94 | 
 | 
|  |     95 | // thompson construction 
 | 
|  |     96 | def thompson (r: Rexp) : NFAt = r match {
 | 
|  |     97 |   case ZERO => NFA_ZERO()
 | 
|  |     98 |   case ONE => NFA_ONE()
 | 
|  |     99 |   case CHAR(c) => NFA_CHAR(c)  
 | 
|  |    100 |   case ALT(r1, r2) => NFA_ALT(thompson(r1), thompson(r2))
 | 
|  |    101 |   case SEQ(r1, r2) => NFA_SEQ(thompson(r1), thompson(r2))
 | 
|  |    102 |   case STAR(r1) => NFA_STAR(thompson(r1))
 | 
| 486 |    103 | }
 | 
| 487 |    104 | 
 | 
|  |    105 | 
 | 
|  |    106 | def tmatches(r: Rexp, s: String) : Boolean =
 | 
|  |    107 |   thompson(r).accepts(s.toList)
 | 
|  |    108 | 
 | 
|  |    109 | def tmatches2(r: Rexp, s: String) : Boolean =
 | 
|  |    110 |   thompson(r).accepts2(s.toList)
 | 
|  |    111 | 
 | 
|  |    112 | 
 | 
|  |    113 | //optional regular expression (one or zero times)
 | 
|  |    114 | def OPT(r: Rexp) = ALT(r, ONE)
 | 
|  |    115 | 
 | 
|  |    116 | //n-times regular expression (explicitly expanded)
 | 
|  |    117 | def NTIMES(r: Rexp, n: Int) : Rexp = n match {
 | 
|  |    118 |   case 0 => ONE
 | 
|  |    119 |   case 1 => r
 | 
|  |    120 |   case n => SEQ(r, NTIMES(r, n - 1))
 | 
|  |    121 | }
 | 
|  |    122 | 
 | 
|  |    123 | 
 | 
|  |    124 | // Test Cases
 | 
|  |    125 | 
 | 
|  |    126 | // the evil regular expression  a?{n} a{n}
 | 
|  |    127 | def EVIL1(n: Int) = SEQ(NTIMES(OPT(CHAR('a')), n), NTIMES(CHAR('a'), n))
 | 
|  |    128 | 
 | 
|  |    129 | // the evil regular expression (a*)*b
 | 
|  |    130 | val EVIL2 = SEQ(STAR(STAR(CHAR('a'))), CHAR('b'))
 | 
|  |    131 | 
 | 
|  |    132 | //for measuring time
 | 
|  |    133 | def time_needed[T](i: Int, code: => T) = {
 | 
|  |    134 |   val start = System.nanoTime()
 | 
|  |    135 |   for (j <- 1 to i) code
 | 
|  |    136 |   val end = System.nanoTime()
 | 
|  |    137 |   (end - start)/(i * 1.0e9)
 | 
|  |    138 | 
 | 
|  |    139 | 
 | 
|  |    140 | // the size of the NFA can be large, 
 | 
|  |    141 | // thus slowing down the breadth-first search
 | 
|  |    142 | 
 | 
|  |    143 | for (i <- 1 to 10) {
 | 
|  |    144 |   println(i + ": " + "%.5f".format(time_needed(2, tmatches(EVIL1(i), "a" * i))))
 | 
|  |    145 | }
 | 
|  |    146 | 
 | 
|  |    147 | for (i <- 1 to 10) {
 | 
|  |    148 |   println(i + " " + "%.5f".format(time_needed(2, tmatches(EVIL2, "a" * i))))
 | 
|  |    149 | }
 | 
|  |    150 | 
 | 
|  |    151 | 
 | 
|  |    152 | // the backtracking needed in depth-first search 
 | 
|  |    153 | // can be painfully slow
 | 
|  |    154 | 
 | 
|  |    155 | for (i <- 1 to 8) {
 | 
|  |    156 |   println(i + " " + "%.5f".format(time_needed(2, tmatches2(EVIL2, "a" * i))))
 | 
|  |    157 | }
 |