17 def ALT(r1: Rexp, r2: Rexp) = ALTs(List(r1, r2)) |
17 def ALT(r1: Rexp, r2: Rexp) = ALTs(List(r1, r2)) |
18 def SEQ(r1: Rexp, r2: Rexp) = SEQs(List(r1, r2)) |
18 def SEQ(r1: Rexp, r2: Rexp) = SEQs(List(r1, r2)) |
19 |
19 |
20 |
20 |
21 // some convenience for typing regular expressions |
21 // some convenience for typing regular expressions |
22 import scala.language.implicitConversions |
|
23 import scala.language.reflectiveCalls |
|
24 |
22 |
25 def charlist2rexp(s: List[Char]): Rexp = s match { |
23 def charlist2rexp(s: List[Char]): Rexp = s match { |
26 case Nil => ONE |
24 case Nil => ONE |
27 case c::Nil => CHAR(c) |
25 case c::Nil => CHAR(c) |
28 case c::s => SEQ(CHAR(c), charlist2rexp(s)) |
26 case c::s => SEQ(CHAR(c), charlist2rexp(s)) |
29 } |
27 } |
30 implicit def string2rexp(s: String): Rexp = charlist2rexp(s.toList) |
28 |
31 |
29 import scala.language.implicitConversions |
32 implicit def RexpOps (r: Rexp) = new { |
30 |
|
31 given Conversion[String, Rexp] = (s => charlist2rexp(s.toList)) |
|
32 |
|
33 extension (r: Rexp) { |
33 def | (s: Rexp) = ALT(r, s) |
34 def | (s: Rexp) = ALT(r, s) |
34 def % = STAR(r) |
35 def % = STAR(r) |
35 def ~ (s: Rexp) = SEQ(r, s) |
36 def ~ (s: Rexp) = SEQ(r, s) |
36 } |
37 } |
37 |
38 |
38 implicit def stringOps (s: String) = new { |
39 // some examples for the conversion and extension: |
39 def | (r: Rexp) = ALT(s, r) |
40 |
40 def | (r: String) = ALT(s, r) |
|
41 def % = STAR(s) |
|
42 def ~ (r: Rexp) = SEQ(s, r) |
|
43 def ~ (r: String) = SEQ(s, r) |
|
44 } |
|
45 |
|
46 // examples for the implicits: |
|
47 // ALT(CHAR('a'), CHAR('b')) |
|
48 // val areg : Rexp = "a" | "b" |
41 // val areg : Rexp = "a" | "b" |
49 |
42 // => ALTs(List(CHAR('a'), CHAR('b'))) |
50 // SEQ(CHAR('a'), CHAR('b')) |
43 // |
51 // val sreg : Rexp = "a" ~ "b" |
44 // val sreg : Rexp = "a" ~ "b" |
52 |
45 // => SEQs(List(CHAR('a'), CHAR('b'))) |
|
46 // |
|
47 // val star_reg : Rexp = ("a" ~ "b").% |
|
48 // => STAR(SEQs(List(CHAR('a'), CHAR('b')))) |
53 |
49 |
54 // ADD YOUR CODE BELOW |
50 // ADD YOUR CODE BELOW |
55 //====================== |
51 //====================== |
56 |
52 |
57 // (1) |
53 // (1) |
58 def nullable (r: Rexp) : Boolean = r match { |
54 def nullable (r: Rexp) : Boolean = r match { |
59 case ZERO => false |
55 case ZERO => false |
60 case ONE => true |
56 case ONE => true |
61 case CHAR(_) => false |
57 case CHAR(_) => false |
62 case ALTs(rs) => (for(reg <- rs) yield nullable(reg)).exists(_ == true) |
58 case ALTs(rs) => rs.exists(nullable) |
63 case SEQs(rs) => (for(reg <- rs) yield nullable(reg)).forall(_ == true) |
59 case SEQs(rs) => rs.forall(nullable) |
64 case STAR(_) => true |
60 case STAR(_) => true |
65 } |
61 } |
66 |
|
67 /* |
|
68 nullable(ZERO) == false |
|
69 nullable(ONE) == true |
|
70 nullable(CHAR('a')) == false |
|
71 nullable(ZERO | ONE) == true |
|
72 nullable(ZERO | CHAR('a')) == false |
|
73 nullable(ONE ~ ONE) == true |
|
74 nullable(ONE ~ CHAR('a')) == false |
|
75 nullable(STAR(ZERO)) == true |
|
76 nullable(ALTs(List(ONE, CHAR('a'), ZERO))) == true |
|
77 nullable(SEQs(List(ONE, ALTs(List(ONE, CHAR('a'), ZERO)), STAR(ZERO)))) == true |
|
78 */ |
|
79 |
62 |
80 // (2) |
63 // (2) |
81 def der (c: Char, r: Rexp) : Rexp = r match { |
64 def der(c: Char, r: Rexp) : Rexp = r match { |
82 case ZERO => ZERO |
65 case ZERO => ZERO |
83 case ONE => ZERO |
66 case ONE => ZERO |
84 case CHAR(d) => if(c == d) ONE else ZERO |
67 case CHAR(d) => if (c == d) ONE else ZERO |
85 case ALTs(rs) => ALTs(for(reg <- rs) yield der(c, reg)) |
68 case ALTs(rs) => ALTs(rs.map(der(c, _))) |
86 case SEQs(Nil) => ZERO |
69 case SEQs(Nil) => ZERO |
87 case SEQs(r :: rs) => if(nullable(r)) ALT(SEQs(der(c, r) :: rs), der(c, SEQs(rs))) else SEQs(der(c, r) :: rs) |
70 case SEQs(r1::rs) => |
88 case STAR(r) => SEQ(der(c,r), STAR(r)) |
71 if (nullable(r1)) ALT(SEQs(der(c, r1)::rs), der(c, SEQs(rs))) |
89 } |
72 else SEQs(der(c, r1) :: rs) |
90 |
73 case STAR(r1) => SEQ(der(c, r1), STAR(r1)) |
91 /* |
74 } |
92 der('a', ZERO | ONE) == (ZERO | ZERO) |
75 |
93 der('a', (CHAR('a') | ONE) ~ CHAR('a')) == ALT((ONE | ZERO) ~ CHAR('a'), SEQs(List(ONE))) |
|
94 der('a', (CHAR('a') | CHAR('a')) ~ CHAR('a')) == (ONE | ONE) ~ CHAR('a') |
|
95 der('a', STAR(CHAR('a'))) == (ONE ~ STAR(CHAR('a'))) |
|
96 der('b', STAR(CHAR('a'))) == (ZERO ~ STAR(CHAR('a'))) |
|
97 */ |
|
98 |
76 |
99 // (3) |
77 // (3) |
100 def denest(rs: List[Rexp]) : List[Rexp] = rs match { |
78 def denest(rs: List[Rexp]) : List[Rexp] = rs match { |
101 case Nil => Nil |
79 case Nil => Nil |
102 case ZERO :: rest => denest(rest) |
80 case ZERO::tl => denest(tl) |
103 case ALTs(rgs) :: rest => rgs ::: denest(rest) |
81 case ALTs(rs1)::rs2 => rs1 ::: denest(rs2) |
104 case r :: rest => r :: denest(rest) |
82 case r::rs => r :: denest(rs) |
105 } |
83 } |
106 |
|
107 /* |
|
108 denest(List(ONE, ZERO, ALTs(List(ONE, CHAR('a'))))) == List(ONE, ONE, CHAR('a')) |
|
109 denest(List(ONE ~ ONE, ZERO, ZERO | ONE)) == List(ONE ~ ONE, ZERO, ONE) |
|
110 */ |
|
111 |
84 |
112 // (4) |
85 // (4) |
113 def flts(rs: List[Rexp], acc: List[Rexp] = Nil) : List[Rexp] = rs match { |
86 def flts(rs: List[Rexp], acc: List[Rexp] = Nil) : List[Rexp] = rs match { |
114 case Nil => acc |
87 case Nil => acc |
115 case ZERO :: rest => List(ZERO) |
88 case ZERO::rs => ZERO::Nil |
116 case ONE :: rest => flts(rest, acc) |
89 case ONE::rs => flts(rs, acc) |
117 case SEQs(rgs) :: rest => flts(rest, acc ::: rgs) |
90 case SEQs(rs1)::rs => flts(rs, acc ::: rs1) |
118 case r :: rest => flts(rest, acc ::: List(r)) |
91 case r::rs => flts(rs, acc :+ r) |
119 } |
92 } |
120 |
|
121 /* |
|
122 flts(List(CHAR('a'), ZERO, ONE), Nil) == List(ZERO) |
|
123 flts(List(CHAR('a'), ONE, ONE, CHAR('b')), Nil) == List(CHAR('a'), CHAR('b')) |
|
124 flts(List(ONE ~ CHAR('a'), CHAR('b') ~ ONE), Nil) == List(ONE, CHAR('a'), CHAR('b'), ONE) |
|
125 */ |
|
126 |
93 |
127 // (5) |
94 // (5) |
128 def ALTs_smart(rs: List[Rexp]) : Rexp = rs match { |
95 def ALTs_smart(rs: List[Rexp]) : Rexp = rs match { |
129 case Nil => ZERO |
96 case Nil => ZERO |
130 case List(r) => r |
97 case r::Nil => r |
131 case _ => ALTs(rs) |
98 case rs => ALTs(rs) |
132 } |
99 } |
133 |
100 |
134 def SEQs_smart(rs: List[Rexp]) : Rexp = rs match { |
101 def SEQs_smart(rs: List[Rexp]) : Rexp = rs match { |
135 case Nil => ONE |
102 case Nil => ONE |
136 case List(r) => r |
103 case ZERO::Nil => ZERO |
137 case _ => SEQs(rs) |
104 case r::Nil => r |
138 } |
105 case rs => SEQs(rs) |
139 |
106 } |
|
107 |
|
108 // (6) |
|
109 |
|
110 def simp(r: Rexp) : Rexp = r match { |
|
111 case ALTs(rs) => |
|
112 ALTs_smart(denest(rs.map(simp)).distinct) |
|
113 case SEQs(rs) => |
|
114 SEQs_smart(flts(rs.map(simp))) |
|
115 case r => r |
|
116 } |
|
117 |
|
118 //println("Simp tests") |
|
119 //println(simp(ALT(ONE | CHAR('a'), CHAR('a') | ONE))) |
|
120 //println(simp(((CHAR('a') | ZERO) ~ ONE) | |
|
121 // (((ONE | CHAR('b')) | CHAR('c')) ~ (CHAR('d') ~ ZERO)))) |
|
122 |
|
123 // (7) |
|
124 |
|
125 def ders (s: List[Char], r: Rexp) : Rexp = s match { |
|
126 case Nil => r |
|
127 case c::s => ders(s, simp(der(c, r))) |
|
128 } |
|
129 |
|
130 // main matcher function |
|
131 def matcher(r: Rexp, s: String) = nullable(ders(s.toList, r)) |
|
132 |
|
133 // (8) |
|
134 |
|
135 def size(r: Rexp): Int = r match { |
|
136 case ZERO => 1 |
|
137 case ONE => 1 |
|
138 case CHAR(_) => 1 |
|
139 case ALTs(rs) => 1 + rs.map(size).sum |
|
140 case SEQs(rs) => 1 + rs.map(size).sum |
|
141 case STAR(r1) => 1 + size(r1) |
|
142 } |
|
143 |
|
144 |
|
145 |
|
146 // some testing data |
140 /* |
147 /* |
141 SEQs_smart(Nil) == ONE |
148 println(matcher(("a" ~ "b") ~ "c", "abc")) // => true |
142 SEQs_smart(List(ZERO)) == ZERO |
149 println(matcher(("a" ~ "b") ~ "c", "ab")) // => false |
143 SEQs_smart(List(CHAR('a'))) == CHAR('a') |
|
144 SEQs_smart(List(ONE ~ ONE)) == ONE ~ ONE |
|
145 SEQs_smart(List(ONE, ONE)) == SEQs(List(ONE, ONE)) |
|
146 ALTs_smart(Nil) == ZERO |
|
147 ALTs_smart(List(ONE ~ ONE)) == ONE ~ ONE |
|
148 ALTs_smart(List(ZERO, ZERO)) == ALTs(List(ZERO, ZERO)) |
|
149 */ |
|
150 |
|
151 // (6) |
|
152 def simp(r: Rexp) : Rexp = r match { |
|
153 case ALTs(rs) => ALTs_smart(denest(for(reg <- rs) yield simp(reg)).distinct) |
|
154 case SEQs(rs) => SEQs_smart(flts(for(reg <- rs) yield simp(reg))) |
|
155 case _ => r |
|
156 } |
|
157 |
|
158 /* |
|
159 simp(ZERO | ONE) == ONE |
|
160 simp(STAR(ZERO | ONE)) == STAR(ZERO | ONE) |
|
161 simp(ONE ~ (ONE ~ (ONE ~ CHAR('a')))) == CHAR('a') |
|
162 simp(((ONE ~ ONE) ~ ONE) ~ CHAR('a')) == CHAR('a') |
|
163 simp(((ONE | ONE) ~ ONE) ~ CHAR('a')) == CHAR('a') |
|
164 simp(ONE ~ (ONE ~ (ONE ~ ZERO))) == ZERO |
|
165 simp(ALT(ONE ~ (ONE ~ (ONE ~ ZERO)), CHAR('a'))) == CHAR('a') |
|
166 simp(CHAR('a') | CHAR('a')) == CHAR('a') |
|
167 simp(CHAR('a') ~ CHAR('a')) == CHAR('a') ~ CHAR('a') |
|
168 simp(ONE | CHAR('a')) == (ONE | CHAR('a')) |
|
169 simp(ALT((CHAR('a') | ZERO) ~ ONE,((ONE | CHAR('b')) | CHAR('c')) ~ (CHAR('d') ~ ZERO))) == CHAR('a') |
|
170 simp((ZERO | ((ZERO | ZERO) | (ZERO | ZERO))) ~ ((ONE | ZERO) | ONE ) ~ (CHAR('a'))) == ZERO |
|
171 simp(ALT(ONE | ONE, ONE | ONE)) == ONE |
|
172 simp(ALT(ZERO | CHAR('a'), CHAR('a') | ZERO)) == CHAR('a') |
|
173 simp(ALT(ONE | CHAR('a'), CHAR('a') | ONE)) == ALT(ONE, CHAR('a')) |
|
174 simp(ALTs(Nil)) == ZERO |
|
175 simp(SEQs(List(CHAR('a')))) == CHAR('a') |
|
176 */ |
|
177 |
|
178 // (7) |
|
179 def ders (s: List[Char], r: Rexp) : Rexp = s match { |
|
180 case Nil => r |
|
181 case c :: cs => ders(cs, simp(der(c, r))) |
|
182 } |
|
183 def matcher(r: Rexp, s: String): Boolean = { |
|
184 val derivatives = ders(s.toList, r) |
|
185 nullable(derivatives) |
|
186 } |
|
187 |
|
188 /* |
|
189 val EVIL = SEQ(STAR(STAR(CHAR('a'))), CHAR('b')) |
|
190 ders("aaaaa".toList, EVIL) == SEQs(List(STAR(CHAR('a')), STAR(STAR(CHAR('a'))), CHAR('b'))) |
|
191 ders(List('b'), EVIL) == ONE |
|
192 ders("bb".toList, EVIL) == ZERO |
|
193 matcher(EVIL, "a" * 5 ++ "b") == true |
|
194 matcher(EVIL, "b") == true |
|
195 matcher(EVIL, "bb") == false |
|
196 matcher("abc", "abc") == true |
|
197 matcher(("ab" | "a") ~ (ONE | "bc"), "abc") == true |
|
198 matcher(ONE, "") == true |
|
199 matcher(ZERO, "") == false |
|
200 matcher(ONE | CHAR('a'), "") == true |
|
201 matcher(ONE | CHAR('a'), "a") == true |
|
202 */ |
|
203 |
|
204 // (8) |
|
205 def size(r: Rexp): Int = r match { |
|
206 case ZERO => 1 |
|
207 case ONE => 1 |
|
208 case CHAR(_) => 1 |
|
209 case ALTs(rs) => 1 + (for(reg <- rs) yield size(reg)).sum |
|
210 case SEQs(rs) => 1 + (for(reg <- rs) yield size(reg)).sum |
|
211 case STAR(r) => 1 + size(r) |
|
212 } |
|
213 |
|
214 /* |
|
215 val EVIL = SEQ(STAR(STAR(CHAR('a'))), CHAR('b')) |
|
216 size(der('a', der('a', EVIL))) == 36 |
|
217 size(der('a', der('a', der('a', EVIL)))) == 83 |
|
218 size(ders("aaaaaa".toList, EVIL)) == 7 |
|
219 size(ders(("a" * 50).toList, EVIL)) == 7 |
|
220 */ |
|
221 |
|
222 |
|
223 // Some testing data |
|
224 //=================== |
|
225 /* |
|
226 |
|
227 simp(ALT(ONE | CHAR('a'), CHAR('a') | ONE)) // => ALTs(List(ONE, CHAR(a))) |
|
228 simp(((CHAR('a') | ZERO) ~ ONE) | (((ONE | CHAR('b')) | CHAR('c')) ~ (CHAR('d') ~ ZERO))) // => CHAR(a) |
|
229 |
|
230 matcher(("a" ~ "b") ~ "c", "ab") // => false |
|
231 matcher(("a" ~ "b") ~ "c", "abc") // => true |
|
232 |
|
233 |
150 |
234 // the supposedly 'evil' regular expression (a*)* b |
151 // the supposedly 'evil' regular expression (a*)* b |
235 val EVIL = SEQ(STAR(STAR(CHAR('a'))), CHAR('b')) |
152 val EVIL = SEQ(STAR(STAR(CHAR('a'))), CHAR('b')) |
236 |
153 |
237 matcher(EVIL, "a" * 1000) // => false |
154 println(matcher(EVIL, "a" * 1000 ++ "b")) // => true |
238 matcher(EVIL, "a" * 1000 ++ "b") // => true |
155 println(matcher(EVIL, "a" * 1000)) // => false |
239 |
|
240 |
156 |
241 // size without simplifications |
157 // size without simplifications |
242 size(der('a', der('a', EVIL))) // => 36 |
158 println(size(der('a', der('a', EVIL)))) // => 36 |
243 size(der('a', der('a', der('a', EVIL)))) // => 83 |
159 println(size(der('a', der('a', der('a', EVIL))))) // => 83 |
244 |
160 |
245 // size with simplification |
161 // size with simplification |
246 size(simp(der('a', der('a', EVIL)))) // => 7 |
162 println(simp(der('a', der('a', EVIL)))) |
247 size(simp(der('a', der('a', der('a', EVIL))))) // => 7 |
163 println(simp(der('a', der('a', der('a', EVIL))))) |
|
164 |
|
165 println(size(simp(der('a', der('a', EVIL))))) // => 7 |
|
166 println(size(simp(der('a', der('a', der('a', EVIL)))))) // => 7 |
248 |
167 |
249 // Python needs around 30 seconds for matching 28 a's with EVIL. |
168 // Python needs around 30 seconds for matching 28 a's with EVIL. |
250 // Java 9 and later increase this to an "astonishing" 40000 a's in |
169 // Java 9 and later increase this to an "astonishing" 40000 a's in |
251 // 30 seconds. |
170 // around 30 seconds. |
252 // |
171 // |
253 // Lets see how long it really takes to match strings with |
172 // Lets see how long it takes to match strings with |
254 // 5 Million a's...it should be in the range of a few |
173 // 5 Million a's...it should be in the range of a |
255 // of seconds. |
174 // few seconds. |
256 |
175 |
257 def time_needed[T](i: Int, code: => T) = { |
176 def time_needed[T](i: Int, code: => T) = { |
258 val start = System.nanoTime() |
177 val start = System.nanoTime() |
259 for (j <- 1 to i) code |
178 for (j <- 1 to i) code |
260 val end = System.nanoTime() |
179 val end = System.nanoTime() |