1 // NFAs and Thompson's construction   | 
     1 // NFAs in Scala based on sets of partial functions  | 
     2   | 
     2   | 
     3 // helper function for recording time  | 
     3 // type abbreviation for partial functions  | 
     4 def time_needed[T](i: Int, code: => T) = { | 
     4 type :=>[A, B] = PartialFunction[A, B]  | 
     5   val start = System.nanoTime()  | 
     5   | 
     6   for (j <- 1 to i) code  | 
     6 case class NFA[A, C](starts: Set[A],            // starting states  | 
     7   val end = System.nanoTime()  | 
     7                      delta: Set[(A, C) :=> A],  // transitions  | 
     8   (end - start)/(i * 1.0e9)  | 
     8                      fins:  A => Boolean) {     // final states  | 
         | 
     9   | 
         | 
    10   // given a state and a character, what is the set of next states?  | 
         | 
    11   // if there is none => empty set  | 
         | 
    12   def next(q: A, c: C) : Set[A] =   | 
         | 
    13     delta.flatMap(_.lift.apply(q, c))  | 
         | 
    14   | 
         | 
    15   def nexts(qs: Set[A], c: C) : Set[A] =  | 
         | 
    16     qs.flatMap(next(_, c))  | 
         | 
    17   | 
         | 
    18   def deltas(qs: Set[A], s: List[C]) : Set[A] = s match { | 
         | 
    19     case Nil => qs  | 
         | 
    20     case c::cs => deltas(nexts(qs, c), cs)  | 
         | 
    21   }  | 
         | 
    22   | 
         | 
    23   def accepts(s: List[C]) : Boolean =   | 
         | 
    24     deltas(starts, s).exists(fins)  | 
     9 }  | 
    25 }  | 
    10   | 
    26   | 
    11   | 
         | 
    12 // state nodes  | 
         | 
    13 abstract class State  | 
         | 
    14 type States = Set[State]  | 
         | 
    15   | 
         | 
    16 case class IntState(i: Int) extends State  | 
         | 
    17   | 
         | 
    18 object NewState { | 
         | 
    19   var counter = 0  | 
         | 
    20     | 
         | 
    21   def apply() : IntState = { | 
         | 
    22     counter += 1;  | 
         | 
    23     new IntState(counter - 1)  | 
         | 
    24   }  | 
         | 
    25 }  | 
         | 
    26   | 
         | 
    27   | 
         | 
    28 case class NFA(states: States,   | 
         | 
    29                start: State,   | 
         | 
    30                delta: (State, Char) => States,   | 
         | 
    31                eps: State => States,  | 
         | 
    32                fins: States) { | 
         | 
    33     | 
         | 
    34   def epsclosure(qs: States) : States = { | 
         | 
    35     val ps = qs ++ qs.flatMap(eps(_))  | 
         | 
    36     if (qs == ps) ps else epsclosure(ps)  | 
         | 
    37   }  | 
         | 
    38   | 
         | 
    39   def deltas(qs: States, s: List[Char]) : States = s match { | 
         | 
    40     case Nil => epsclosure(qs)  | 
         | 
    41     case c::cs => deltas(epsclosure(epsclosure(qs).flatMap (delta (_, c))), cs)  | 
         | 
    42   }  | 
         | 
    43   | 
         | 
    44   def accepts(s: String) : Boolean =   | 
         | 
    45     deltas(Set(start), s.toList) exists (fins contains (_))  | 
         | 
    46 }  | 
         | 
    47   | 
         | 
    48 // A small example NFA from the lectures   | 
         | 
    49 val Q0 = NewState()  | 
         | 
    50 val Q1 = NewState()  | 
         | 
    51 val Q2 = NewState()  | 
         | 
    52   | 
         | 
    53 val delta : (State, Char) => States = { | 
         | 
    54   case (Q0, 'a') => Set(Q0)  | 
         | 
    55   case (Q1, 'a') => Set(Q1)  | 
         | 
    56   case (Q2, 'b') => Set(Q2)  | 
         | 
    57   case (_, _) => Set ()  | 
         | 
    58 }  | 
         | 
    59   | 
         | 
    60 val eps : State => States = { | 
         | 
    61   case Q0 => Set(Q1, Q2)  | 
         | 
    62   case _ => Set()  | 
         | 
    63 }  | 
         | 
    64   | 
         | 
    65 val NFA1 = NFA(Set(Q0, Q1, Q2), Q0, delta, eps, Set(Q2))  | 
         | 
    66   | 
         | 
    67 NFA1.accepts("aa") | 
         | 
    68 NFA1.accepts("aaaaa") | 
         | 
    69 NFA1.accepts("aaaaabbb") | 
         | 
    70 NFA1.accepts("aaaaabbbaaa") | 
         | 
    71 NFA1.accepts("ac") | 
         | 
    72   | 
         | 
    73   | 
         | 
    74 // explicit construction of some NFAs; used in  | 
         | 
    75 // Thompson's construction  | 
         | 
    76   | 
         | 
    77 // NFA that does not accept any string  | 
         | 
    78 def NFA_NULL() : NFA = { | 
         | 
    79   val Q = NewState()  | 
         | 
    80   NFA(Set(Q), Q, { case (_, _) => Set() }, { case _ => Set() }, Set()) | 
         | 
    81 }  | 
         | 
    82   | 
         | 
    83 // NFA that accepts the empty string  | 
         | 
    84 def NFA_EMPTY() : NFA = { | 
         | 
    85   val Q = NewState()  | 
         | 
    86   NFA(Set(Q), Q, { case (_, _) => Set() }, { case _ => Set() }, Set(Q)) | 
         | 
    87 }  | 
         | 
    88   | 
         | 
    89 // NFA that accepts the string "c"  | 
         | 
    90 def NFA_CHAR(c: Char) : NFA = { | 
         | 
    91   val Q1 = NewState()  | 
         | 
    92   val Q2 = NewState()  | 
         | 
    93   NFA(Set(Q1, Q2),   | 
         | 
    94       Q1,   | 
         | 
    95       { case (Q1, d) if (c == d) => Set(Q2) | 
         | 
    96         case (_, _) => Set() },  | 
         | 
    97       { case _ => Set() }, | 
         | 
    98       Set(Q2))  | 
         | 
    99 }  | 
         | 
   100   | 
         | 
   101 // alternative of two NFAs  | 
         | 
   102 def NFA_ALT(nfa1: NFA, nfa2: NFA) : NFA = { | 
         | 
   103   val Q = NewState()  | 
         | 
   104   NFA(Set(Q) ++ nfa1.states ++ nfa2.states,  | 
         | 
   105       Q,  | 
         | 
   106       { case (q, c) if (nfa1.states contains q) => nfa1.delta(q, c) | 
         | 
   107         case (q, c) if (nfa2.states contains q) => nfa2.delta(q, c)  | 
         | 
   108         case (_, _) => Set() },  | 
         | 
   109       { case Q => Set(nfa1.start, nfa2.start) | 
         | 
   110         case q if (nfa1.states contains q) => nfa1.eps(q)  | 
         | 
   111         case q if (nfa2.states contains q) => nfa2.eps(q)  | 
         | 
   112         case _ => Set() },  | 
         | 
   113       nfa1.fins ++ nfa2.fins)  | 
         | 
   114 }  | 
         | 
   115   | 
         | 
   116 // sequence of two NFAs  | 
         | 
   117 def NFA_SEQ(nfa1: NFA, nfa2: NFA) : NFA = { | 
         | 
   118   NFA(nfa1.states ++ nfa2.states,  | 
         | 
   119       nfa1.start,  | 
         | 
   120       { case (q, c) if (nfa1.states contains q) => nfa1.delta(q, c) | 
         | 
   121         case (q, c) if (nfa2.states contains q) => nfa2.delta(q, c)  | 
         | 
   122         case (_, _) => Set() },  | 
         | 
   123       { case q if (nfa1.fins contains q) => nfa1.eps(q) ++ Set(nfa2.start) | 
         | 
   124         case q if (nfa1.states contains q) => nfa1.eps(q)  | 
         | 
   125         case q if (nfa2.states contains q) => nfa2.eps(q)  | 
         | 
   126         case _ => Set() },  | 
         | 
   127       nfa2.fins)  | 
         | 
   128 }  | 
         | 
   129   | 
         | 
   130 // star of an NFA  | 
         | 
   131 def NFA_STAR(nfa: NFA) : NFA = { | 
         | 
   132   val Q = NewState()  | 
         | 
   133   NFA(Set(Q) ++ nfa.states,   | 
         | 
   134       Q,  | 
         | 
   135       nfa.delta,  | 
         | 
   136       { case Q => Set(nfa.start) | 
         | 
   137         case q if (nfa.fins contains q) => nfa.eps(q) ++ Set(Q)  | 
         | 
   138         case q if (nfa.states contains q) => nfa.eps(q)  | 
         | 
   139         case _ => Set() },  | 
         | 
   140       Set(Q))  | 
         | 
   141 }  | 
         | 
   142   | 
         | 
   143   | 
         | 
   144 // regular expressions used for Thompson's construction  | 
         | 
   145 abstract class Rexp  | 
         | 
   146   | 
         | 
   147 case object NULL extends Rexp  | 
         | 
   148 case object EMPTY extends Rexp  | 
         | 
   149 case class CHAR(c: Char) extends Rexp   | 
         | 
   150 case class ALT(r1: Rexp, r2: Rexp) extends Rexp  | 
         | 
   151 case class SEQ(r1: Rexp, r2: Rexp) extends Rexp   | 
         | 
   152 case class STAR(r: Rexp) extends Rexp  | 
         | 
   153   | 
         | 
   154 // some convenience for typing in regular expressions  | 
         | 
   155 def charlist2rexp(s : List[Char]) : Rexp = s match { | 
         | 
   156   case Nil => EMPTY  | 
         | 
   157   case c::Nil => CHAR(c)  | 
         | 
   158   case c::s => SEQ(CHAR(c), charlist2rexp(s))  | 
         | 
   159 }  | 
         | 
   160 implicit def string2rexp(s : String) : Rexp = charlist2rexp(s.toList)  | 
         | 
   161   | 
         | 
   162   | 
         | 
   163   | 
         | 
   164 def thompson (r: Rexp) : NFA = r match { | 
         | 
   165   case NULL => NFA_NULL()  | 
         | 
   166   case EMPTY => NFA_EMPTY()  | 
         | 
   167   case CHAR(c) => NFA_CHAR(c)    | 
         | 
   168   case ALT(r1, r2) => NFA_ALT(thompson(r1), thompson(r2))  | 
         | 
   169   case SEQ(r1, r2) => NFA_SEQ(thompson(r1), thompson(r2))  | 
         | 
   170   case STAR(r1) => NFA_STAR(thompson(r1))  | 
         | 
   171 }  | 
         | 
   172   | 
         | 
   173 // some examples for Thompson's  | 
         | 
   174 val A = thompson(CHAR('a')) | 
         | 
   175   | 
         | 
   176 println(A.accepts("a")) | 
         | 
   177 println(A.accepts("c")) | 
         | 
   178 println(A.accepts("aa")) | 
         | 
   179   | 
         | 
   180 val B = thompson(ALT("ab","ac")) | 
         | 
   181   | 
         | 
   182 println(B.accepts("ab")) | 
         | 
   183 println(B.accepts("ac")) | 
         | 
   184 println(B.accepts("bb")) | 
         | 
   185 println(B.accepts("aa")) | 
         | 
   186   | 
         | 
   187 val C = thompson(STAR("ab")) | 
         | 
   188   | 
         | 
   189 println(C.accepts("")) | 
         | 
   190 println(C.accepts("a")) | 
         | 
   191 println(C.accepts("ababab")) | 
         | 
   192 println(C.accepts("ab")) | 
         | 
   193 println(C.accepts("ac")) | 
         | 
   194 println(C.accepts("bb")) | 
         | 
   195 println(C.accepts("aa")) | 
         | 
   196   | 
         | 
   197 // regular expression matcher using Thompson's  | 
         | 
   198 def matcher(r: Rexp, s: String) : Boolean = thompson(r).accepts(s)  | 
         | 
   199   | 
         | 
   200   | 
         | 
   201 //optional  | 
         | 
   202 def OPT(r: Rexp) = ALT(r, EMPTY)  | 
         | 
   203   | 
         | 
   204 //n-times  | 
         | 
   205 def NTIMES(r: Rexp, n: Int) : Rexp = n match { | 
         | 
   206   case 0 => EMPTY  | 
         | 
   207   case 1 => r  | 
         | 
   208   case n => SEQ(r, NTIMES(r, n - 1))  | 
         | 
   209 }  | 
         | 
   210   | 
         | 
   211 // evil regular exproession  | 
         | 
   212 def EVIL(n: Int) = SEQ(NTIMES(OPT("a"), n), NTIMES("a", n)) | 
         | 
   213   | 
         | 
   214 // test harness for the matcher  | 
         | 
   215 for (i <- 0 to 100 by 5) { | 
         | 
   216   println(i + ": " + "%.5f".format(time_needed(1, matcher(EVIL(i), "a" * i))))  | 
         | 
   217 }  | 
         | 
   218   | 
         | 
   219   | 
         | 
   220 // regular expression matching via search and backtracking  | 
         | 
   221 def accepts2(nfa: NFA, s: String) : Boolean = { | 
         | 
   222   | 
         | 
   223   def search(q: State, s: List[Char]) : Boolean = s match { | 
         | 
   224     case Nil => nfa.fins contains (q)  | 
         | 
   225     case c::cs =>   | 
         | 
   226       (nfa.delta(q, c) exists (search(_, cs))) ||   | 
         | 
   227       (nfa.eps(q) exists (search(_, c::cs)))  | 
         | 
   228   }  | 
         | 
   229   | 
         | 
   230   search(nfa.start, s.toList)  | 
         | 
   231 }  | 
         | 
   232   | 
         | 
   233 def matcher2(r: Rexp, s: String) : Boolean = accepts2(thompson(r), s)  | 
         | 
   234   | 
         | 
   235 // test harness for the backtracking matcher  | 
         | 
   236 for (i <- 0 to 20 by 1) { | 
         | 
   237   println(i + ": " + "%.5f".format(time_needed(1, matcher2(EVIL(i), "a" * i))))  | 
         | 
   238 }  | 
         | 
   239   | 
         | 
   240   | 
         | 
   241   | 
         | 
   242   | 
         |