progs/thompson.scala
changeset 733 022e2cb1668d
parent 732 c7bdd7eac4cb
child 734 5d860ff01938
equal deleted inserted replaced
732:c7bdd7eac4cb 733:022e2cb1668d
     1 // Thompson Construction
       
     2 // (needs  :load dfa.scala
       
     3 //         :load nfa.scala
       
     4 //         :load enfa.scala)
       
     5 
       
     6 
       
     7 // states for Thompson construction
       
     8 case class TState(i: Int) extends State
       
     9 
       
    10 object TState {
       
    11   var counter = 0
       
    12   
       
    13   def apply() : TState = {
       
    14     counter += 1;
       
    15     new TState(counter - 1)
       
    16   }
       
    17 }
       
    18 
       
    19 
       
    20 // some types abbreviations
       
    21 type NFAt = NFA[TState, Char]
       
    22 type NFAtrans = (TState, Char) :=> Set[TState]
       
    23 type eNFAtrans = (TState, Option[Char]) :=> Set[TState]
       
    24 
       
    25 
       
    26 // for composing an eNFA transition with a NFA transition
       
    27 implicit class RichPF(val f: eNFAtrans) extends AnyVal {
       
    28   def +++(g: NFAtrans) : eNFAtrans = 
       
    29   { case (q, None) =>  applyOrElse(f, (q, None)) 
       
    30     case (q, Some(c)) => applyOrElse(f, (q, Some(c))) | applyOrElse(g, (q, c))  }
       
    31 }
       
    32 
       
    33 
       
    34 // NFA that does not accept any string
       
    35 def NFA_ZERO(): NFAt = {
       
    36   val Q = TState()
       
    37   NFA(Set(Q), { case _ => Set() }, Set())
       
    38 }
       
    39 
       
    40 // NFA that accepts the empty string
       
    41 def NFA_ONE() : NFAt = {
       
    42   val Q = TState()
       
    43   NFA(Set(Q), { case _ => Set() }, Set(Q))
       
    44 }
       
    45 
       
    46 // NFA that accepts the string "c"
       
    47 def NFA_CHAR(c: Char) : NFAt = {
       
    48   val Q1 = TState()
       
    49   val Q2 = TState()
       
    50   NFA(Set(Q1), { case (Q1, d) if (c == d) => Set(Q2) }, Set(Q2))
       
    51 }
       
    52 
       
    53 // sequence of two NFAs
       
    54 def NFA_SEQ(enfa1: NFAt, enfa2: NFAt) : NFAt = {
       
    55   val new_delta : eNFAtrans = 
       
    56     { case (q, None) if enfa1.fins(q) => enfa2.starts }
       
    57   
       
    58   eNFA(enfa1.starts, new_delta +++ enfa1.delta +++ enfa2.delta, 
       
    59        enfa2.fins)
       
    60 }
       
    61 
       
    62 // alternative of two NFAs
       
    63 def NFA_ALT(enfa1: NFAt, enfa2: NFAt) : NFAt = {
       
    64   val new_delta : NFAtrans = { 
       
    65     case (q, c) =>  applyOrElse(enfa1.delta, (q, c)) | 
       
    66                     applyOrElse(enfa2.delta, (q, c)) }
       
    67   val new_fins = (q: TState) => enfa1.fins(q) || enfa2.fins(q)
       
    68 
       
    69   NFA(enfa1.starts | enfa2.starts, new_delta, new_fins)
       
    70 }
       
    71 
       
    72 // star of a NFA
       
    73 def NFA_STAR(enfa: NFAt) : NFAt = {
       
    74   val Q = TState()
       
    75   val new_delta : eNFAtrans = 
       
    76     { case (Q, None) => enfa.starts
       
    77       case (q, None) if enfa.fins(q) => Set(Q) }
       
    78 
       
    79   eNFA(Set(Q), new_delta +++ enfa.delta, Set(Q))
       
    80 }
       
    81 
       
    82 
       
    83 
       
    84 // regular expressions
       
    85 abstract class Rexp
       
    86 case object ZERO extends Rexp                    // matches nothing
       
    87 case object ONE extends Rexp                     // matches the empty string
       
    88 case class CHAR(c: Char) extends Rexp            // matches a character c
       
    89 case class ALT(r1: Rexp, r2: Rexp) extends Rexp  // alternative
       
    90 case class SEQ(r1: Rexp, r2: Rexp) extends Rexp  // sequence
       
    91 case class STAR(r: Rexp) extends Rexp            // star
       
    92 
       
    93 
       
    94 
       
    95 
       
    96 // thompson construction 
       
    97 def thompson (r: Rexp) : NFAt = r match {
       
    98   case ZERO => NFA_ZERO()
       
    99   case ONE => NFA_ONE()
       
   100   case CHAR(c) => NFA_CHAR(c)  
       
   101   case ALT(r1, r2) => NFA_ALT(thompson(r1), thompson(r2))
       
   102   case SEQ(r1, r2) => NFA_SEQ(thompson(r1), thompson(r2))
       
   103   case STAR(r1) => NFA_STAR(thompson(r1))
       
   104 }
       
   105 
       
   106 //optional regular expression (one or zero times)
       
   107 def OPT(r: Rexp) = ALT(r, ONE)
       
   108 
       
   109 //n-times regular expression (explicitly expanded)
       
   110 def NTIMES(r: Rexp, n: Int) : Rexp = n match {
       
   111   case 0 => ONE
       
   112   case 1 => r
       
   113   case n => SEQ(r, NTIMES(r, n - 1))
       
   114 }
       
   115 
       
   116 
       
   117 def tmatches(r: Rexp, s: String) : Boolean =
       
   118   thompson(r).accepts(s.toList)
       
   119 
       
   120 def tmatches2(r: Rexp, s: String) : Boolean =
       
   121   thompson(r).accepts2(s.toList)
       
   122 
       
   123 // dfa via subset construction
       
   124 def tmatches_dfa(r: Rexp, s: String) : Boolean =
       
   125   subset(thompson(r)).accepts(s.toList)
       
   126 
       
   127 // Test Cases
       
   128 
       
   129 
       
   130 // the evil regular expression  a?{n} a{n}
       
   131 def EVIL1(n: Int) : Rexp = SEQ(NTIMES(OPT(CHAR('a')), n), NTIMES(CHAR('a'), n))
       
   132 
       
   133 // the evil regular expression (a*)*b
       
   134 val EVIL2 : Rexp = SEQ(STAR(STAR(CHAR('a'))), CHAR('b'))
       
   135 
       
   136 //for measuring time
       
   137 def time_needed[T](i: Int, code: => T) = {
       
   138   val start = System.nanoTime()
       
   139   for (j <- 1 to i) code
       
   140   val end = System.nanoTime()
       
   141   (end - start)/(i * 1.0e9)
       
   142 }
       
   143 
       
   144 // the size of the NFA can be large, 
       
   145 // thus slowing down the breadth-first search
       
   146 
       
   147 for (i <- 1 to 13) {
       
   148   println(i + ": " + "%.5f".format(time_needed(2, tmatches(EVIL1(i), "a" * i))))
       
   149 }
       
   150 
       
   151 for (i <- 1 to 100 by 5) {
       
   152   println(i + " " + "%.5f".format(time_needed(2, tmatches(EVIL2, "a" * i))))
       
   153 }
       
   154 
       
   155 
       
   156 // the backtracking needed in depth-first search 
       
   157 // can be painfully slow
       
   158 
       
   159 for (i <- 1 to 8) {
       
   160   println(i + " " + "%.5f".format(time_needed(2, tmatches2(EVIL2, "a" * i))))
       
   161 }
       
   162 
       
   163 
       
   164 
       
   165 // while my thompson->enfa->subset->partial-function-chain
       
   166 // is probably not the most effcient way to obtain a fast DFA 
       
   167 // (the test below should be much faster with a more direct 
       
   168 // construction), in general the DFAs can be slow because of 
       
   169 // the state explosion in the subset construction
       
   170 
       
   171 for (i <- 1 to 13) {
       
   172   println(i + ": " + "%.5f".format(time_needed(2, tmatches_dfa(EVIL1(i), "a" * i))))
       
   173 }
       
   174 
       
   175 for (i <- 1 to 100 by 5) {
       
   176   println(i + " " + "%.5f".format(time_needed(2, tmatches_dfa(EVIL2, "a" * i))))
       
   177 }