| 
     1 // Thompson Construction  | 
         | 
     2 // (needs  :load dfa.scala  | 
         | 
     3 //         :load nfa.scala  | 
         | 
     4 //         :load enfa.scala)  | 
         | 
     5   | 
         | 
     6   | 
         | 
     7 // states for Thompson construction  | 
         | 
     8 case class TState(i: Int) extends State  | 
         | 
     9   | 
         | 
    10 object TState { | 
         | 
    11   var counter = 0  | 
         | 
    12     | 
         | 
    13   def apply() : TState = { | 
         | 
    14     counter += 1;  | 
         | 
    15     new TState(counter - 1)  | 
         | 
    16   }  | 
         | 
    17 }  | 
         | 
    18   | 
         | 
    19   | 
         | 
    20 // some types abbreviations  | 
         | 
    21 type NFAt = NFA[TState, Char]  | 
         | 
    22 type NFAtrans = (TState, Char) :=> Set[TState]  | 
         | 
    23 type eNFAtrans = (TState, Option[Char]) :=> Set[TState]  | 
         | 
    24   | 
         | 
    25   | 
         | 
    26 // for composing an eNFA transition with a NFA transition  | 
         | 
    27 implicit class RichPF(val f: eNFAtrans) extends AnyVal { | 
         | 
    28   def +++(g: NFAtrans) : eNFAtrans =   | 
         | 
    29   { case (q, None) =>  applyOrElse(f, (q, None))  | 
         | 
    30     case (q, Some(c)) => applyOrElse(f, (q, Some(c))) | applyOrElse(g, (q, c))  }  | 
         | 
    31 }  | 
         | 
    32   | 
         | 
    33   | 
         | 
    34 // NFA that does not accept any string  | 
         | 
    35 def NFA_ZERO(): NFAt = { | 
         | 
    36   val Q = TState()  | 
         | 
    37   NFA(Set(Q), { case _ => Set() }, Set()) | 
         | 
    38 }  | 
         | 
    39   | 
         | 
    40 // NFA that accepts the empty string  | 
         | 
    41 def NFA_ONE() : NFAt = { | 
         | 
    42   val Q = TState()  | 
         | 
    43   NFA(Set(Q), { case _ => Set() }, Set(Q)) | 
         | 
    44 }  | 
         | 
    45   | 
         | 
    46 // NFA that accepts the string "c"  | 
         | 
    47 def NFA_CHAR(c: Char) : NFAt = { | 
         | 
    48   val Q1 = TState()  | 
         | 
    49   val Q2 = TState()  | 
         | 
    50   NFA(Set(Q1), { case (Q1, d) if (c == d) => Set(Q2) }, Set(Q2)) | 
         | 
    51 }  | 
         | 
    52   | 
         | 
    53 // sequence of two NFAs  | 
         | 
    54 def NFA_SEQ(enfa1: NFAt, enfa2: NFAt) : NFAt = { | 
         | 
    55   val new_delta : eNFAtrans =   | 
         | 
    56     { case (q, None) if enfa1.fins(q) => enfa2.starts } | 
         | 
    57     | 
         | 
    58   eNFA(enfa1.starts, new_delta +++ enfa1.delta +++ enfa2.delta,   | 
         | 
    59        enfa2.fins)  | 
         | 
    60 }  | 
         | 
    61   | 
         | 
    62 // alternative of two NFAs  | 
         | 
    63 def NFA_ALT(enfa1: NFAt, enfa2: NFAt) : NFAt = { | 
         | 
    64   val new_delta : NFAtrans = {  | 
         | 
    65     case (q, c) =>  applyOrElse(enfa1.delta, (q, c)) |   | 
         | 
    66                     applyOrElse(enfa2.delta, (q, c)) }  | 
         | 
    67   val new_fins = (q: TState) => enfa1.fins(q) || enfa2.fins(q)  | 
         | 
    68   | 
         | 
    69   NFA(enfa1.starts | enfa2.starts, new_delta, new_fins)  | 
         | 
    70 }  | 
         | 
    71   | 
         | 
    72 // star of a NFA  | 
         | 
    73 def NFA_STAR(enfa: NFAt) : NFAt = { | 
         | 
    74   val Q = TState()  | 
         | 
    75   val new_delta : eNFAtrans =   | 
         | 
    76     { case (Q, None) => enfa.starts | 
         | 
    77       case (q, None) if enfa.fins(q) => Set(Q) }  | 
         | 
    78   | 
         | 
    79   eNFA(Set(Q), new_delta +++ enfa.delta, Set(Q))  | 
         | 
    80 }  | 
         | 
    81   | 
         | 
    82   | 
         | 
    83   | 
         | 
    84 // regular expressions  | 
         | 
    85 abstract class Rexp  | 
         | 
    86 case object ZERO extends Rexp                    // matches nothing  | 
         | 
    87 case object ONE extends Rexp                     // matches the empty string  | 
         | 
    88 case class CHAR(c: Char) extends Rexp            // matches a character c  | 
         | 
    89 case class ALT(r1: Rexp, r2: Rexp) extends Rexp  // alternative  | 
         | 
    90 case class SEQ(r1: Rexp, r2: Rexp) extends Rexp  // sequence  | 
         | 
    91 case class STAR(r: Rexp) extends Rexp            // star  | 
         | 
    92   | 
         | 
    93   | 
         | 
    94   | 
         | 
    95   | 
         | 
    96 // thompson construction   | 
         | 
    97 def thompson (r: Rexp) : NFAt = r match { | 
         | 
    98   case ZERO => NFA_ZERO()  | 
         | 
    99   case ONE => NFA_ONE()  | 
         | 
   100   case CHAR(c) => NFA_CHAR(c)    | 
         | 
   101   case ALT(r1, r2) => NFA_ALT(thompson(r1), thompson(r2))  | 
         | 
   102   case SEQ(r1, r2) => NFA_SEQ(thompson(r1), thompson(r2))  | 
         | 
   103   case STAR(r1) => NFA_STAR(thompson(r1))  | 
         | 
   104 }  | 
         | 
   105   | 
         | 
   106 //optional regular expression (one or zero times)  | 
         | 
   107 def OPT(r: Rexp) = ALT(r, ONE)  | 
         | 
   108   | 
         | 
   109 //n-times regular expression (explicitly expanded)  | 
         | 
   110 def NTIMES(r: Rexp, n: Int) : Rexp = n match { | 
         | 
   111   case 0 => ONE  | 
         | 
   112   case 1 => r  | 
         | 
   113   case n => SEQ(r, NTIMES(r, n - 1))  | 
         | 
   114 }  | 
         | 
   115   | 
         | 
   116   | 
         | 
   117 def tmatches(r: Rexp, s: String) : Boolean =  | 
         | 
   118   thompson(r).accepts(s.toList)  | 
         | 
   119   | 
         | 
   120 def tmatches2(r: Rexp, s: String) : Boolean =  | 
         | 
   121   thompson(r).accepts2(s.toList)  | 
         | 
   122   | 
         | 
   123 // dfa via subset construction  | 
         | 
   124 def tmatches_dfa(r: Rexp, s: String) : Boolean =  | 
         | 
   125   subset(thompson(r)).accepts(s.toList)  | 
         | 
   126   | 
         | 
   127 // Test Cases  | 
         | 
   128   | 
         | 
   129   | 
         | 
   130 // the evil regular expression  a?{n} a{n} | 
         | 
   131 def EVIL1(n: Int) : Rexp = SEQ(NTIMES(OPT(CHAR('a')), n), NTIMES(CHAR('a'), n)) | 
         | 
   132   | 
         | 
   133 // the evil regular expression (a*)*b  | 
         | 
   134 val EVIL2 : Rexp = SEQ(STAR(STAR(CHAR('a'))), CHAR('b')) | 
         | 
   135   | 
         | 
   136 //for measuring time  | 
         | 
   137 def time_needed[T](i: Int, code: => T) = { | 
         | 
   138   val start = System.nanoTime()  | 
         | 
   139   for (j <- 1 to i) code  | 
         | 
   140   val end = System.nanoTime()  | 
         | 
   141   (end - start)/(i * 1.0e9)  | 
         | 
   142 }  | 
         | 
   143   | 
         | 
   144 // the size of the NFA can be large,   | 
         | 
   145 // thus slowing down the breadth-first search  | 
         | 
   146   | 
         | 
   147 for (i <- 1 to 13) { | 
         | 
   148   println(i + ": " + "%.5f".format(time_needed(2, tmatches(EVIL1(i), "a" * i))))  | 
         | 
   149 }  | 
         | 
   150   | 
         | 
   151 for (i <- 1 to 100 by 5) { | 
         | 
   152   println(i + " " + "%.5f".format(time_needed(2, tmatches(EVIL2, "a" * i))))  | 
         | 
   153 }  | 
         | 
   154   | 
         | 
   155   | 
         | 
   156 // the backtracking needed in depth-first search   | 
         | 
   157 // can be painfully slow  | 
         | 
   158   | 
         | 
   159 for (i <- 1 to 8) { | 
         | 
   160   println(i + " " + "%.5f".format(time_needed(2, tmatches2(EVIL2, "a" * i))))  | 
         | 
   161 }  | 
         | 
   162   | 
         | 
   163   | 
         | 
   164   | 
         | 
   165 // while my thompson-enfa-subset-partial-function-chain  | 
         | 
   166 // is probably not the most effcient way to obtain a fast DFA   | 
         | 
   167 // (the below should be much faster with a more direct construction),  | 
         | 
   168 // in general the DFAs can be slow because of the state explosion  | 
         | 
   169 // in the subset construction  | 
         | 
   170   | 
         | 
   171 for (i <- 1 to 13) { | 
         | 
   172   println(i + ": " + "%.5f".format(time_needed(2, tmatches_dfa(EVIL1(i), "a" * i))))  | 
         | 
   173 }  | 
         | 
   174   | 
         | 
   175 for (i <- 1 to 100 by 5) { | 
         | 
   176   println(i + " " + "%.5f".format(time_needed(2, tmatches_dfa(EVIL2, "a" * i))))  | 
         | 
   177 }  |