1 // Thompson Construction |
|
2 // (needs :load dfa.scala |
|
3 // :load nfa.scala |
|
4 // :load enfa.scala) |
|
5 |
|
6 |
|
7 // states for Thompson construction |
|
8 case class TState(i: Int) extends State |
|
9 |
|
10 object TState { |
|
11 var counter = 0 |
|
12 |
|
13 def apply() : TState = { |
|
14 counter += 1; |
|
15 new TState(counter - 1) |
|
16 } |
|
17 } |
|
18 |
|
19 |
|
20 // some types abbreviations |
|
21 type NFAt = NFA[TState, Char] |
|
22 type NFAtrans = (TState, Char) :=> Set[TState] |
|
23 type eNFAtrans = (TState, Option[Char]) :=> Set[TState] |
|
24 |
|
25 |
|
26 // for composing an eNFA transition with a NFA transition |
|
27 implicit class RichPF(val f: eNFAtrans) extends AnyVal { |
|
28 def +++(g: NFAtrans) : eNFAtrans = |
|
29 { case (q, None) => applyOrElse(f, (q, None)) |
|
30 case (q, Some(c)) => applyOrElse(f, (q, Some(c))) | applyOrElse(g, (q, c)) } |
|
31 } |
|
32 |
|
33 |
|
34 // NFA that does not accept any string |
|
35 def NFA_ZERO(): NFAt = { |
|
36 val Q = TState() |
|
37 NFA(Set(Q), { case _ => Set() }, Set()) |
|
38 } |
|
39 |
|
40 // NFA that accepts the empty string |
|
41 def NFA_ONE() : NFAt = { |
|
42 val Q = TState() |
|
43 NFA(Set(Q), { case _ => Set() }, Set(Q)) |
|
44 } |
|
45 |
|
46 // NFA that accepts the string "c" |
|
47 def NFA_CHAR(c: Char) : NFAt = { |
|
48 val Q1 = TState() |
|
49 val Q2 = TState() |
|
50 NFA(Set(Q1), { case (Q1, d) if (c == d) => Set(Q2) }, Set(Q2)) |
|
51 } |
|
52 |
|
53 // sequence of two NFAs |
|
54 def NFA_SEQ(enfa1: NFAt, enfa2: NFAt) : NFAt = { |
|
55 val new_delta : eNFAtrans = |
|
56 { case (q, None) if enfa1.fins(q) => enfa2.starts } |
|
57 |
|
58 eNFA(enfa1.starts, new_delta +++ enfa1.delta +++ enfa2.delta, |
|
59 enfa2.fins) |
|
60 } |
|
61 |
|
62 // alternative of two NFAs |
|
63 def NFA_ALT(enfa1: NFAt, enfa2: NFAt) : NFAt = { |
|
64 val new_delta : NFAtrans = { |
|
65 case (q, c) => applyOrElse(enfa1.delta, (q, c)) | |
|
66 applyOrElse(enfa2.delta, (q, c)) } |
|
67 val new_fins = (q: TState) => enfa1.fins(q) || enfa2.fins(q) |
|
68 |
|
69 NFA(enfa1.starts | enfa2.starts, new_delta, new_fins) |
|
70 } |
|
71 |
|
72 // star of a NFA |
|
73 def NFA_STAR(enfa: NFAt) : NFAt = { |
|
74 val Q = TState() |
|
75 val new_delta : eNFAtrans = |
|
76 { case (Q, None) => enfa.starts |
|
77 case (q, None) if enfa.fins(q) => Set(Q) } |
|
78 |
|
79 eNFA(Set(Q), new_delta +++ enfa.delta, Set(Q)) |
|
80 } |
|
81 |
|
82 |
|
83 |
|
84 // regular expressions |
|
85 abstract class Rexp |
|
86 case object ZERO extends Rexp // matches nothing |
|
87 case object ONE extends Rexp // matches the empty string |
|
88 case class CHAR(c: Char) extends Rexp // matches a character c |
|
89 case class ALT(r1: Rexp, r2: Rexp) extends Rexp // alternative |
|
90 case class SEQ(r1: Rexp, r2: Rexp) extends Rexp // sequence |
|
91 case class STAR(r: Rexp) extends Rexp // star |
|
92 |
|
93 |
|
94 |
|
95 |
|
96 // thompson construction |
|
97 def thompson (r: Rexp) : NFAt = r match { |
|
98 case ZERO => NFA_ZERO() |
|
99 case ONE => NFA_ONE() |
|
100 case CHAR(c) => NFA_CHAR(c) |
|
101 case ALT(r1, r2) => NFA_ALT(thompson(r1), thompson(r2)) |
|
102 case SEQ(r1, r2) => NFA_SEQ(thompson(r1), thompson(r2)) |
|
103 case STAR(r1) => NFA_STAR(thompson(r1)) |
|
104 } |
|
105 |
|
106 //optional regular expression (one or zero times) |
|
107 def OPT(r: Rexp) = ALT(r, ONE) |
|
108 |
|
109 //n-times regular expression (explicitly expanded) |
|
110 def NTIMES(r: Rexp, n: Int) : Rexp = n match { |
|
111 case 0 => ONE |
|
112 case 1 => r |
|
113 case n => SEQ(r, NTIMES(r, n - 1)) |
|
114 } |
|
115 |
|
116 |
|
117 def tmatches(r: Rexp, s: String) : Boolean = |
|
118 thompson(r).accepts(s.toList) |
|
119 |
|
120 def tmatches2(r: Rexp, s: String) : Boolean = |
|
121 thompson(r).accepts2(s.toList) |
|
122 |
|
123 // dfa via subset construction |
|
124 def tmatches_dfa(r: Rexp, s: String) : Boolean = |
|
125 subset(thompson(r)).accepts(s.toList) |
|
126 |
|
127 // Test Cases |
|
128 |
|
129 |
|
130 // the evil regular expression a?{n} a{n} |
|
131 def EVIL1(n: Int) : Rexp = SEQ(NTIMES(OPT(CHAR('a')), n), NTIMES(CHAR('a'), n)) |
|
132 |
|
133 // the evil regular expression (a*)*b |
|
134 val EVIL2 : Rexp = SEQ(STAR(STAR(CHAR('a'))), CHAR('b')) |
|
135 |
|
136 //for measuring time |
|
137 def time_needed[T](i: Int, code: => T) = { |
|
138 val start = System.nanoTime() |
|
139 for (j <- 1 to i) code |
|
140 val end = System.nanoTime() |
|
141 (end - start)/(i * 1.0e9) |
|
142 } |
|
143 |
|
144 // the size of the NFA can be large, |
|
145 // thus slowing down the breadth-first search |
|
146 |
|
147 for (i <- 1 to 13) { |
|
148 println(i + ": " + "%.5f".format(time_needed(2, tmatches(EVIL1(i), "a" * i)))) |
|
149 } |
|
150 |
|
151 for (i <- 1 to 100 by 5) { |
|
152 println(i + " " + "%.5f".format(time_needed(2, tmatches(EVIL2, "a" * i)))) |
|
153 } |
|
154 |
|
155 |
|
156 // the backtracking needed in depth-first search |
|
157 // can be painfully slow |
|
158 |
|
159 for (i <- 1 to 8) { |
|
160 println(i + " " + "%.5f".format(time_needed(2, tmatches2(EVIL2, "a" * i)))) |
|
161 } |
|
162 |
|
163 |
|
164 |
|
165 // while my thompson->enfa->subset->partial-function-chain |
|
166 // is probably not the most effcient way to obtain a fast DFA |
|
167 // (the test below should be much faster with a more direct |
|
168 // construction), in general the DFAs can be slow because of |
|
169 // the state explosion in the subset construction |
|
170 |
|
171 for (i <- 1 to 13) { |
|
172 println(i + ": " + "%.5f".format(time_needed(2, tmatches_dfa(EVIL1(i), "a" * i)))) |
|
173 } |
|
174 |
|
175 for (i <- 1 to 100 by 5) { |
|
176 println(i + " " + "%.5f".format(time_needed(2, tmatches_dfa(EVIL2, "a" * i)))) |
|
177 } |
|