|      1 // NFAs and Thompson's construction  |         | 
|      2  |         | 
|      3 // helper function for recording time |         | 
|      4 def time_needed[T](i: Int, code: => T) = { |         | 
|      5   val start = System.nanoTime() |         | 
|      6   for (j <- 1 to i) code |         | 
|      7   val end = System.nanoTime() |         | 
|      8   (end - start)/(i * 1.0e9) |         | 
|      9 } |         | 
|     10  |         | 
|     11  |         | 
|     12 // state nodes |         | 
|     13 abstract class State |         | 
|     14 type States = Set[State] |         | 
|     15  |         | 
|     16 case class IntState(i: Int) extends State |         | 
|     17  |         | 
|     18 object NewState { |         | 
|     19   var counter = 0 |         | 
|     20    |         | 
|     21   def apply() : IntState = { |         | 
|     22     counter += 1; |         | 
|     23     new IntState(counter - 1) |         | 
|     24   } |         | 
|     25 } |         | 
|     26  |         | 
|     27  |         | 
|     28 case class NFA(states: States,  |         | 
|     29                start: State,  |         | 
|     30                delta: (State, Char) => States,  |         | 
|     31                eps: State => States, |         | 
|     32                fins: States) { |         | 
|     33    |         | 
|     34   def epsclosure(qs: States) : States = { |         | 
|     35     val ps = qs ++ qs.flatMap(eps(_)) |         | 
|     36     if (qs == ps) ps else epsclosure(ps) |         | 
|     37   } |         | 
|     38  |         | 
|     39   def deltas(qs: States, s: List[Char]) : States = s match { |         | 
|     40     case Nil => epsclosure(qs) |         | 
|     41     case c::cs => deltas(epsclosure(epsclosure(qs).flatMap (delta (_, c))), cs) |         | 
|     42   } |         | 
|     43  |         | 
|     44   def accepts(s: String) : Boolean =  |         | 
|     45     deltas(Set(start), s.toList) exists (fins contains (_)) |         | 
|     46 } |         | 
|     47  |         | 
|     48 // A small example NFA from the lectures  |         | 
|     49 val Q0 = NewState() |         | 
|     50 val Q1 = NewState() |         | 
|     51 val Q2 = NewState() |         | 
|     52  |         | 
|     53 val delta : (State, Char) => States = { |         | 
|     54   case (Q0, 'a') => Set(Q0) |         | 
|     55   case (Q1, 'a') => Set(Q1) |         | 
|     56   case (Q2, 'b') => Set(Q2) |         | 
|     57   case (_, _) => Set () |         | 
|     58 } |         | 
|     59  |         | 
|     60 val eps : State => States = { |         | 
|     61   case Q0 => Set(Q1, Q2) |         | 
|     62   case _ => Set() |         | 
|     63 } |         | 
|     64  |         | 
|     65 val NFA1 = NFA(Set(Q0, Q1, Q2), Q0, delta, eps, Set(Q2)) |         | 
|     66  |         | 
|     67 NFA1.accepts("aa") |         | 
|     68 NFA1.accepts("aaaaa") |         | 
|     69 NFA1.accepts("aaaaabbb") |         | 
|     70 NFA1.accepts("aaaaabbbaaa") |         | 
|     71 NFA1.accepts("ac") |         | 
|     72  |         | 
|     73  |         | 
|     74 // explicit construction of some NFAs; used in |         | 
|     75 // Thompson's construction |         | 
|     76  |         | 
|     77 // NFA that does not accept any string |         | 
|     78 def NFA_NULL() : NFA = { |         | 
|     79   val Q = NewState() |         | 
|     80   NFA(Set(Q), Q, { case (_, _) => Set() }, { case _ => Set() }, Set()) |         | 
|     81 } |         | 
|     82  |         | 
|     83 // NFA that accepts the empty string |         | 
|     84 def NFA_EMPTY() : NFA = { |         | 
|     85   val Q = NewState() |         | 
|     86   NFA(Set(Q), Q, { case (_, _) => Set() }, { case _ => Set() }, Set(Q)) |         | 
|     87 } |         | 
|     88  |         | 
|     89 // NFA that accepts the string "c" |         | 
|     90 def NFA_CHAR(c: Char) : NFA = { |         | 
|     91   val Q1 = NewState() |         | 
|     92   val Q2 = NewState() |         | 
|     93   NFA(Set(Q1, Q2),  |         | 
|     94       Q1,  |         | 
|     95       { case (Q1, d) if (c == d) => Set(Q2) |         | 
|     96         case (_, _) => Set() }, |         | 
|     97       { case _ => Set() }, |         | 
|     98       Set(Q2)) |         | 
|     99 } |         | 
|    100  |         | 
|    101 // alternative of two NFAs |         | 
|    102 def NFA_ALT(nfa1: NFA, nfa2: NFA) : NFA = { |         | 
|    103   val Q = NewState() |         | 
|    104   NFA(Set(Q) ++ nfa1.states ++ nfa2.states, |         | 
|    105       Q, |         | 
|    106       { case (q, c) if (nfa1.states contains q) => nfa1.delta(q, c) |         | 
|    107         case (q, c) if (nfa2.states contains q) => nfa2.delta(q, c) |         | 
|    108         case (_, _) => Set() }, |         | 
|    109       { case Q => Set(nfa1.start, nfa2.start) |         | 
|    110         case q if (nfa1.states contains q) => nfa1.eps(q) |         | 
|    111         case q if (nfa2.states contains q) => nfa2.eps(q) |         | 
|    112         case _ => Set() }, |         | 
|    113       nfa1.fins ++ nfa2.fins) |         | 
|    114 } |         | 
|    115  |         | 
|    116 // sequence of two NFAs |         | 
|    117 def NFA_SEQ(nfa1: NFA, nfa2: NFA) : NFA = { |         | 
|    118   NFA(nfa1.states ++ nfa2.states, |         | 
|    119       nfa1.start, |         | 
|    120       { case (q, c) if (nfa1.states contains q) => nfa1.delta(q, c) |         | 
|    121         case (q, c) if (nfa2.states contains q) => nfa2.delta(q, c) |         | 
|    122         case (_, _) => Set() }, |         | 
|    123       { case q if (nfa1.fins contains q) => nfa1.eps(q) ++ Set(nfa2.start) |         | 
|    124         case q if (nfa1.states contains q) => nfa1.eps(q) |         | 
|    125         case q if (nfa2.states contains q) => nfa2.eps(q) |         | 
|    126         case _ => Set() }, |         | 
|    127       nfa2.fins) |         | 
|    128 } |         | 
|    129  |         | 
|    130 // star of an NFA |         | 
|    131 def NFA_STAR(nfa: NFA) : NFA = { |         | 
|    132   val Q = NewState() |         | 
|    133   NFA(Set(Q) ++ nfa.states,  |         | 
|    134       Q, |         | 
|    135       nfa.delta, |         | 
|    136       { case Q => Set(nfa.start) |         | 
|    137         case q if (nfa.fins contains q) => nfa.eps(q) ++ Set(Q) |         | 
|    138         case q if (nfa.states contains q) => nfa.eps(q) |         | 
|    139         case _ => Set() }, |         | 
|    140       Set(Q)) |         | 
|    141 } |         | 
|    142  |         | 
|    143  |         | 
|    144 // regular expressions used for Thompson's construction |         | 
|    145 abstract class Rexp |         | 
|    146  |         | 
|    147 case object NULL extends Rexp |         | 
|    148 case object EMPTY extends Rexp |         | 
|    149 case class CHAR(c: Char) extends Rexp  |         | 
|    150 case class ALT(r1: Rexp, r2: Rexp) extends Rexp |         | 
|    151 case class SEQ(r1: Rexp, r2: Rexp) extends Rexp  |         | 
|    152 case class STAR(r: Rexp) extends Rexp |         | 
|    153  |         | 
|    154 // some convenience for typing in regular expressions |         | 
|    155 def charlist2rexp(s : List[Char]) : Rexp = s match { |         | 
|    156   case Nil => EMPTY |         | 
|    157   case c::Nil => CHAR(c) |         | 
|    158   case c::s => SEQ(CHAR(c), charlist2rexp(s)) |         | 
|    159 } |         | 
|    160 implicit def string2rexp(s : String) : Rexp = charlist2rexp(s.toList) |         | 
|    161  |         | 
|    162  |         | 
|    163  |         | 
|    164 def thompson (r: Rexp) : NFA = r match { |         | 
|    165   case NULL => NFA_NULL() |         | 
|    166   case EMPTY => NFA_EMPTY() |         | 
|    167   case CHAR(c) => NFA_CHAR(c)   |         | 
|    168   case ALT(r1, r2) => NFA_ALT(thompson(r1), thompson(r2)) |         | 
|    169   case SEQ(r1, r2) => NFA_SEQ(thompson(r1), thompson(r2)) |         | 
|    170   case STAR(r1) => NFA_STAR(thompson(r1)) |         | 
|    171 } |         | 
|    172  |         | 
|    173 // some examples for Thompson's |         | 
|    174 val A = thompson(CHAR('a')) |         | 
|    175  |         | 
|    176 println(A.accepts("a"))   // true  |         | 
|    177 println(A.accepts("c"))   // false  |         | 
|    178 println(A.accepts("aa"))  // false |         | 
|    179  |         | 
|    180 val B = thompson(ALT("ab","ac")) |         | 
|    181   |         | 
|    182 println(B.accepts("ab"))   // true  |         | 
|    183 println(B.accepts("ac"))   // true  |         | 
|    184 println(B.accepts("bb"))   // false  |         | 
|    185 println(B.accepts("aa"))   // false  |         | 
|    186  |         | 
|    187 val C = thompson(STAR("ab")) |         | 
|    188  |         | 
|    189 println(C.accepts(""))       // true |         | 
|    190 println(C.accepts("a"))      // false  |         | 
|    191 println(C.accepts("ababab")) // true |         | 
|    192 println(C.accepts("ab"))     // true |         | 
|    193 println(C.accepts("ac"))     // false  |         | 
|    194 println(C.accepts("bb"))     // false  |         | 
|    195 println(C.accepts("aa"))     // false  |         | 
|    196  |         | 
|    197 // regular expression matcher using Thompson's |         | 
|    198 def matcher(r: Rexp, s: String) : Boolean = thompson(r).accepts(s) |         | 
|    199  |         | 
|    200  |         | 
|    201 //optional |         | 
|    202 def OPT(r: Rexp) = ALT(r, EMPTY) |         | 
|    203  |         | 
|    204 //n-times |         | 
|    205 def NTIMES(r: Rexp, n: Int) : Rexp = n match { |         | 
|    206   case 0 => EMPTY |         | 
|    207   case 1 => r |         | 
|    208   case n => SEQ(r, NTIMES(r, n - 1)) |         | 
|    209 } |         | 
|    210  |         | 
|    211 // evil regular exproession |         | 
|    212 def EVIL(n: Int) = SEQ(NTIMES(OPT("a"), n), NTIMES("a", n)) |         | 
|    213  |         | 
|    214 // test harness for the matcher |         | 
|    215 for (i <- 0 to 100 by 5) { |         | 
|    216   println(i + ": " + "%.5f".format(time_needed(1, matcher(EVIL(i), "a" * i)))) |         | 
|    217 } |         | 
|    218  |         | 
|    219  |         | 
|    220 // regular expression matching via search and backtracking |         | 
|    221 def accepts2(nfa: NFA, s: String) : Boolean = { |         | 
|    222  |         | 
|    223   def search(q: State, s: List[Char]) : Boolean = s match { |         | 
|    224     case Nil => nfa.fins contains (q) |         | 
|    225     case c::cs =>  |         | 
|    226       (nfa.delta(q, c) exists (search(_, cs))) ||  |         | 
|    227       (nfa.eps(q) exists (search(_, c::cs))) |         | 
|    228   } |         | 
|    229  |         | 
|    230   search(nfa.start, s.toList) |         | 
|    231 } |         | 
|    232  |         | 
|    233 def matcher2(r: Rexp, s: String) : Boolean = accepts2(thompson(r), s) |         | 
|    234  |         | 
|    235 // test harness for the backtracking matcher |         | 
|    236 for (i <- 0 to 20 by 1) { |         | 
|    237   println(i + ": " + "%.5f".format(time_needed(1, matcher2(EVIL(i), "a" * i)))) |         | 
|    238 } |         | 
|    239  |         | 
|    240  |         | 
|    241  |         | 
|    242  |         |