progs/automata/thompson.sc
changeset 733 022e2cb1668d
parent 586 451a95e1bc25
child 742 b5b5583a3a08
equal deleted inserted replaced
732:c7bdd7eac4cb 733:022e2cb1668d
       
     1 // Thompson Construction
       
     2 //=======================
       
     3 
       
     4 import $file.dfa, dfa._ 
       
     5 import $file.nfa, nfa._
       
     6 import $file.enfa, enfa._
       
     7 
       
     8 
       
     9 // states for Thompson construction
       
    10 case class TState(i: Int) extends State
       
    11 
       
    12 object TState {
       
    13   var counter = 0
       
    14   
       
    15   def apply() : TState = {
       
    16     counter += 1;
       
    17     new TState(counter)
       
    18   }
       
    19 }
       
    20 
       
    21 
       
    22 // some types abbreviations
       
    23 type NFAt = NFA[TState, Char]
       
    24 type NFAtrans = (TState, Char) :=> Set[TState]
       
    25 type eNFAtrans = (TState, Option[Char]) :=> Set[TState]
       
    26 
       
    27 
       
    28 // for composing an eNFA transition with an NFA transition
       
    29 // | is for set union
       
    30 implicit def nfaOps(f: eNFAtrans) = new {
       
    31   def +++(g: NFAtrans) : eNFAtrans = 
       
    32   { case (q, None) =>  applyOrElse(f, (q, None)) 
       
    33     case (q, Some(c)) => applyOrElse(f, (q, Some(c))) | applyOrElse(g, (q, c))  }
       
    34 }
       
    35 
       
    36  
       
    37 // NFA that does not accept any string
       
    38 def NFA_ZERO(): NFAt = {
       
    39   val Q = TState()
       
    40   NFA(Set(Q), { case _ => Set() }, Set())
       
    41 }
       
    42 
       
    43 // NFA that accepts the empty string
       
    44 def NFA_ONE() : NFAt = {
       
    45   val Q = TState()
       
    46   NFA(Set(Q), { case _ => Set() }, Set(Q))
       
    47 }
       
    48 
       
    49 // NFA that accepts the string "c"
       
    50 def NFA_CHAR(c: Char) : NFAt = {
       
    51   val Q1 = TState()
       
    52   val Q2 = TState()
       
    53   NFA(Set(Q1), { case (Q1, d) if (c == d) => Set(Q2) }, Set(Q2))
       
    54 }
       
    55 
       
    56 // sequence of two NFAs
       
    57 def NFA_SEQ(enfa1: NFAt, enfa2: NFAt) : NFAt = {
       
    58   val new_delta : eNFAtrans = 
       
    59     { case (q, None) if enfa1.fins(q) => enfa2.starts }
       
    60   
       
    61   eNFA(enfa1.starts, 
       
    62        new_delta +++ enfa1.delta +++ enfa2.delta, 
       
    63        enfa2.fins)
       
    64 }
       
    65 
       
    66 // alternative of two NFAs
       
    67 def NFA_ALT(enfa1: NFAt, enfa2: NFAt) : NFAt = {
       
    68   val new_delta : NFAtrans = { 
       
    69     case (q, c) =>  applyOrElse(enfa1.delta, (q, c)) | 
       
    70                     applyOrElse(enfa2.delta, (q, c)) }
       
    71   val new_fins = (q: TState) => enfa1.fins(q) || enfa2.fins(q)
       
    72 
       
    73   NFA(enfa1.starts | enfa2.starts, new_delta, new_fins)
       
    74 }
       
    75 
       
    76 // star of a NFA
       
    77 def NFA_STAR(enfa: NFAt) : NFAt = {
       
    78   val Q = TState()
       
    79   val new_delta : eNFAtrans = 
       
    80     { case (Q, None) => enfa.starts
       
    81       case (q, None) if enfa.fins(q) => Set(Q) }
       
    82 
       
    83   eNFA(Set(Q), new_delta +++ enfa.delta, Set(Q))
       
    84 }
       
    85 
       
    86 
       
    87 // We are now ready to translate regular expressions
       
    88 // into DFAs (via eNFAs and NFAs, and the subset construction)
       
    89 
       
    90 // regular expressions
       
    91 abstract class Rexp
       
    92 case object ZERO extends Rexp                    // matches nothing
       
    93 case object ONE extends Rexp                     // matches the empty string
       
    94 case class CHAR(c: Char) extends Rexp            // matches a character c
       
    95 case class ALT(r1: Rexp, r2: Rexp) extends Rexp  // alternative
       
    96 case class SEQ(r1: Rexp, r2: Rexp) extends Rexp  // sequence
       
    97 case class STAR(r: Rexp) extends Rexp            // star
       
    98 
       
    99 // thompson construction 
       
   100 def thompson (r: Rexp) : NFAt = r match {
       
   101   case ZERO => NFA_ZERO()
       
   102   case ONE => NFA_ONE()
       
   103   case CHAR(c) => NFA_CHAR(c)  
       
   104   case ALT(r1, r2) => NFA_ALT(thompson(r1), thompson(r2))
       
   105   case SEQ(r1, r2) => NFA_SEQ(thompson(r1), thompson(r2))
       
   106   case STAR(r1) => NFA_STAR(thompson(r1))
       
   107 }
       
   108 
       
   109 //optional regular expression (one or zero times)
       
   110 def OPT(r: Rexp) = ALT(r, ONE)
       
   111 
       
   112 //n-times regular expression (explicitly expanded)
       
   113 def NTIMES(r: Rexp, n: Int) : Rexp = n match {
       
   114   case 0 => ONE
       
   115   case 1 => r
       
   116   case n => SEQ(r, NTIMES(r, n - 1))
       
   117 }
       
   118 
       
   119 
       
   120 def tmatches_nfa(r: Rexp, s: String) : Boolean =
       
   121   thompson(r).accepts(s.toList)
       
   122 
       
   123 def tmatches_nfa2(r: Rexp, s: String) : Boolean =
       
   124   thompson(r).accepts2(s.toList)
       
   125 
       
   126 // dfas via subset construction
       
   127 def tmatches_dfa(r: Rexp, s: String) : Boolean =
       
   128   subset(thompson(r)).accepts(s.toList)
       
   129 
       
   130 // Test Cases
       
   131 //============
       
   132 
       
   133 // the evil regular expression  a?{n} a{n}
       
   134 def EVIL1(n: Int) : Rexp = SEQ(NTIMES(OPT(CHAR('a')), n), NTIMES(CHAR('a'), n))
       
   135 
       
   136 // the evil regular expression (a*)*b
       
   137 val EVIL2 : Rexp = SEQ(STAR(STAR(CHAR('a'))), CHAR('b'))
       
   138 
       
   139 //for measuring time
       
   140 def time_needed[T](i: Int, code: => T) = {
       
   141   val start = System.nanoTime()
       
   142   for (j <- 1 to i) code
       
   143   val end = System.nanoTime()
       
   144   (end - start)/(i * 1.0e9)
       
   145 }
       
   146 
       
   147 // the size of the NFA can be large, 
       
   148 // thus slowing down the breadth-first search
       
   149 
       
   150 for (i <- 1 to 13) {
       
   151   println(i + ": " + "%.5f".format(time_needed(2, tmatches_nfa(EVIL1(i), "a" * i))))
       
   152 }
       
   153 
       
   154 for (i <- 1 to 100 by 5) {
       
   155   println(i + " " + "%.5f".format(time_needed(2, tmatches_nfa(EVIL2, "a" * i))))
       
   156 }
       
   157 
       
   158 
       
   159 // the backtracking that is needed in depth-first 
       
   160 // search can be painfully slow
       
   161 
       
   162 for (i <- 1 to 8) {
       
   163   println(i + " " + "%.5f".format(time_needed(2, tmatches_nfa2(EVIL2, "a" * i))))
       
   164 }
       
   165 
       
   166 
       
   167 
       
   168 // while my thompson->enfa->subset->partial-function-chain
       
   169 // is probably not the most effcient way to obtain a fast DFA 
       
   170 // (the test below should be much faster with a more direct 
       
   171 // construction), in general the DFAs can be slow because of 
       
   172 // the state explosion in the subset construction
       
   173 
       
   174 for (i <- 1 to 13) {
       
   175   println(i + ": " + "%.5f".format(time_needed(2, tmatches_dfa(EVIL1(i), "a" * i))))
       
   176 }
       
   177 
       
   178 for (i <- 1 to 100 by 5) {
       
   179   println(i + " " + "%.5f".format(time_needed(2, tmatches_dfa(EVIL2, "a" * i))))
       
   180 }