|
1 // Thompson Construction |
|
2 //======================= |
|
3 |
|
4 import $file.dfa, dfa._ |
|
5 import $file.nfa, nfa._ |
|
6 import $file.enfa, enfa._ |
|
7 |
|
8 |
|
9 // states for Thompson construction |
|
10 case class TState(i: Int) extends State |
|
11 |
|
12 object TState { |
|
13 var counter = 0 |
|
14 |
|
15 def apply() : TState = { |
|
16 counter += 1; |
|
17 new TState(counter) |
|
18 } |
|
19 } |
|
20 |
|
21 |
|
22 // some types abbreviations |
|
23 type NFAt = NFA[TState, Char] |
|
24 type NFAtrans = (TState, Char) :=> Set[TState] |
|
25 type eNFAtrans = (TState, Option[Char]) :=> Set[TState] |
|
26 |
|
27 |
|
28 // for composing an eNFA transition with an NFA transition |
|
29 // | is for set union |
|
30 implicit def nfaOps(f: eNFAtrans) = new { |
|
31 def +++(g: NFAtrans) : eNFAtrans = |
|
32 { case (q, None) => applyOrElse(f, (q, None)) |
|
33 case (q, Some(c)) => applyOrElse(f, (q, Some(c))) | applyOrElse(g, (q, c)) } |
|
34 } |
|
35 |
|
36 |
|
37 // NFA that does not accept any string |
|
38 def NFA_ZERO(): NFAt = { |
|
39 val Q = TState() |
|
40 NFA(Set(Q), { case _ => Set() }, Set()) |
|
41 } |
|
42 |
|
43 // NFA that accepts the empty string |
|
44 def NFA_ONE() : NFAt = { |
|
45 val Q = TState() |
|
46 NFA(Set(Q), { case _ => Set() }, Set(Q)) |
|
47 } |
|
48 |
|
49 // NFA that accepts the string "c" |
|
50 def NFA_CHAR(c: Char) : NFAt = { |
|
51 val Q1 = TState() |
|
52 val Q2 = TState() |
|
53 NFA(Set(Q1), { case (Q1, d) if (c == d) => Set(Q2) }, Set(Q2)) |
|
54 } |
|
55 |
|
56 // sequence of two NFAs |
|
57 def NFA_SEQ(enfa1: NFAt, enfa2: NFAt) : NFAt = { |
|
58 val new_delta : eNFAtrans = |
|
59 { case (q, None) if enfa1.fins(q) => enfa2.starts } |
|
60 |
|
61 eNFA(enfa1.starts, |
|
62 new_delta +++ enfa1.delta +++ enfa2.delta, |
|
63 enfa2.fins) |
|
64 } |
|
65 |
|
66 // alternative of two NFAs |
|
67 def NFA_ALT(enfa1: NFAt, enfa2: NFAt) : NFAt = { |
|
68 val new_delta : NFAtrans = { |
|
69 case (q, c) => applyOrElse(enfa1.delta, (q, c)) | |
|
70 applyOrElse(enfa2.delta, (q, c)) } |
|
71 val new_fins = (q: TState) => enfa1.fins(q) || enfa2.fins(q) |
|
72 |
|
73 NFA(enfa1.starts | enfa2.starts, new_delta, new_fins) |
|
74 } |
|
75 |
|
76 // star of a NFA |
|
77 def NFA_STAR(enfa: NFAt) : NFAt = { |
|
78 val Q = TState() |
|
79 val new_delta : eNFAtrans = |
|
80 { case (Q, None) => enfa.starts |
|
81 case (q, None) if enfa.fins(q) => Set(Q) } |
|
82 |
|
83 eNFA(Set(Q), new_delta +++ enfa.delta, Set(Q)) |
|
84 } |
|
85 |
|
86 |
|
87 // We are now ready to translate regular expressions |
|
88 // into DFAs (via eNFAs and NFAs, and the subset construction) |
|
89 |
|
90 // regular expressions |
|
91 abstract class Rexp |
|
92 case object ZERO extends Rexp // matches nothing |
|
93 case object ONE extends Rexp // matches the empty string |
|
94 case class CHAR(c: Char) extends Rexp // matches a character c |
|
95 case class ALT(r1: Rexp, r2: Rexp) extends Rexp // alternative |
|
96 case class SEQ(r1: Rexp, r2: Rexp) extends Rexp // sequence |
|
97 case class STAR(r: Rexp) extends Rexp // star |
|
98 |
|
99 // thompson construction |
|
100 def thompson (r: Rexp) : NFAt = r match { |
|
101 case ZERO => NFA_ZERO() |
|
102 case ONE => NFA_ONE() |
|
103 case CHAR(c) => NFA_CHAR(c) |
|
104 case ALT(r1, r2) => NFA_ALT(thompson(r1), thompson(r2)) |
|
105 case SEQ(r1, r2) => NFA_SEQ(thompson(r1), thompson(r2)) |
|
106 case STAR(r1) => NFA_STAR(thompson(r1)) |
|
107 } |
|
108 |
|
109 //optional regular expression (one or zero times) |
|
110 def OPT(r: Rexp) = ALT(r, ONE) |
|
111 |
|
112 //n-times regular expression (explicitly expanded) |
|
113 def NTIMES(r: Rexp, n: Int) : Rexp = n match { |
|
114 case 0 => ONE |
|
115 case 1 => r |
|
116 case n => SEQ(r, NTIMES(r, n - 1)) |
|
117 } |
|
118 |
|
119 |
|
120 def tmatches_nfa(r: Rexp, s: String) : Boolean = |
|
121 thompson(r).accepts(s.toList) |
|
122 |
|
123 def tmatches_nfa2(r: Rexp, s: String) : Boolean = |
|
124 thompson(r).accepts2(s.toList) |
|
125 |
|
126 // dfas via subset construction |
|
127 def tmatches_dfa(r: Rexp, s: String) : Boolean = |
|
128 subset(thompson(r)).accepts(s.toList) |
|
129 |
|
130 // Test Cases |
|
131 //============ |
|
132 |
|
133 // the evil regular expression a?{n} a{n} |
|
134 def EVIL1(n: Int) : Rexp = SEQ(NTIMES(OPT(CHAR('a')), n), NTIMES(CHAR('a'), n)) |
|
135 |
|
136 // the evil regular expression (a*)*b |
|
137 val EVIL2 : Rexp = SEQ(STAR(STAR(CHAR('a'))), CHAR('b')) |
|
138 |
|
139 //for measuring time |
|
140 def time_needed[T](i: Int, code: => T) = { |
|
141 val start = System.nanoTime() |
|
142 for (j <- 1 to i) code |
|
143 val end = System.nanoTime() |
|
144 (end - start)/(i * 1.0e9) |
|
145 } |
|
146 |
|
147 // the size of the NFA can be large, |
|
148 // thus slowing down the breadth-first search |
|
149 |
|
150 for (i <- 1 to 13) { |
|
151 println(i + ": " + "%.5f".format(time_needed(2, tmatches_nfa(EVIL1(i), "a" * i)))) |
|
152 } |
|
153 |
|
154 for (i <- 1 to 100 by 5) { |
|
155 println(i + " " + "%.5f".format(time_needed(2, tmatches_nfa(EVIL2, "a" * i)))) |
|
156 } |
|
157 |
|
158 |
|
159 // the backtracking that is needed in depth-first |
|
160 // search can be painfully slow |
|
161 |
|
162 for (i <- 1 to 8) { |
|
163 println(i + " " + "%.5f".format(time_needed(2, tmatches_nfa2(EVIL2, "a" * i)))) |
|
164 } |
|
165 |
|
166 |
|
167 |
|
168 // while my thompson->enfa->subset->partial-function-chain |
|
169 // is probably not the most effcient way to obtain a fast DFA |
|
170 // (the test below should be much faster with a more direct |
|
171 // construction), in general the DFAs can be slow because of |
|
172 // the state explosion in the subset construction |
|
173 |
|
174 for (i <- 1 to 13) { |
|
175 println(i + ": " + "%.5f".format(time_needed(2, tmatches_dfa(EVIL1(i), "a" * i)))) |
|
176 } |
|
177 |
|
178 for (i <- 1 to 100 by 5) { |
|
179 println(i + " " + "%.5f".format(time_needed(2, tmatches_dfa(EVIL2, "a" * i)))) |
|
180 } |