|
1 // A simple lexer inspired by work of Sulzmann & Lu |
|
2 //================================================== |
|
3 // |
|
4 // Call the test cases with |
|
5 // |
|
6 // amm lexer.sc small |
|
7 // amm lexer.sc fib |
|
8 // amm lexer.sc loops |
|
9 // amm lexer.sc email |
|
10 // |
|
11 // amm lexer.sc all |
|
12 |
|
13 |
|
14 // regular expressions including records |
|
15 abstract class Rexp |
|
16 case object ZERO extends Rexp |
|
17 case object ONE extends Rexp |
|
18 case object ANYCHAR extends Rexp |
|
19 case class CHAR(c: Char) extends Rexp |
|
20 case class ALTS(r1: Rexp, r2: Rexp) extends Rexp |
|
21 case class SEQ(r1: Rexp, r2: Rexp) extends Rexp |
|
22 case class STAR(r: Rexp) extends Rexp |
|
23 case class RECD(x: String, r: Rexp) extends Rexp |
|
24 case class NTIMES(n: Int, r: Rexp) extends Rexp |
|
25 case class OPTIONAL(r: Rexp) extends Rexp |
|
26 case class NOT(r: Rexp) extends Rexp |
|
27 // records for extracting strings or tokens |
|
28 |
|
29 // values |
|
30 abstract class Val |
|
31 case object Empty extends Val |
|
32 case class Chr(c: Char) extends Val |
|
33 case class Sequ(v1: Val, v2: Val) extends Val |
|
34 case class Left(v: Val) extends Val |
|
35 case class Right(v: Val) extends Val |
|
36 case class Stars(vs: List[Val]) extends Val |
|
37 case class Rec(x: String, v: Val) extends Val |
|
38 case class Ntime(vs: List[Val]) extends Val |
|
39 case class Optionall(v: Val) extends Val |
|
40 case class Nots(s: String) extends Val |
|
41 |
|
42 |
|
43 |
|
44 abstract class Bit |
|
45 case object Z extends Bit |
|
46 case object S extends Bit |
|
47 |
|
48 |
|
49 type Bits = List[Bit] |
|
50 |
|
51 abstract class ARexp |
|
52 case object AZERO extends ARexp |
|
53 case class AONE(bs: Bits) extends ARexp |
|
54 case class ACHAR(bs: Bits, c: Char) extends ARexp |
|
55 case class AALTS(bs: Bits, rs: List[ARexp]) extends ARexp |
|
56 case class ASEQ(bs: Bits, r1: ARexp, r2: ARexp) extends ARexp |
|
57 case class ASTAR(bs: Bits, r: ARexp) extends ARexp |
|
58 case class ANOT(bs: Bits, r: ARexp) extends ARexp |
|
59 case class AANYCHAR(bs: Bits) extends ARexp |
|
60 |
|
61 |
|
62 |
|
63 // some convenience for typing in regular expressions |
|
64 |
|
65 def charlist2rexp(s : List[Char]): Rexp = s match { |
|
66 case Nil => ONE |
|
67 case c::Nil => CHAR(c) |
|
68 case c::s => SEQ(CHAR(c), charlist2rexp(s)) |
|
69 } |
|
70 implicit def string2rexp(s : String) : Rexp = |
|
71 charlist2rexp(s.toList) |
|
72 |
|
73 implicit def RexpOps(r: Rexp) = new { |
|
74 def | (s: Rexp) = ALTS(r, s) |
|
75 def % = STAR(r) |
|
76 def ~ (s: Rexp) = SEQ(r, s) |
|
77 } |
|
78 |
|
79 implicit def stringOps(s: String) = new { |
|
80 def | (r: Rexp) = ALTS(s, r) |
|
81 def | (r: String) = ALTS(s, r) |
|
82 def % = STAR(s) |
|
83 def ~ (r: Rexp) = SEQ(s, r) |
|
84 def ~ (r: String) = SEQ(s, r) |
|
85 def $ (r: Rexp) = RECD(s, r) |
|
86 } |
|
87 |
|
88 def nullable(r: Rexp) : Boolean = r match { |
|
89 case ZERO => false |
|
90 case ONE => true |
|
91 case CHAR(_) => false |
|
92 case ANYCHAR => false |
|
93 case ALTS(r1, r2) => nullable(r1) || nullable(r2) |
|
94 case SEQ(r1, r2) => nullable(r1) && nullable(r2) |
|
95 case STAR(_) => true |
|
96 case RECD(_, r1) => nullable(r1) |
|
97 case NTIMES(n, r) => if (n == 0) true else nullable(r) |
|
98 case OPTIONAL(r) => true |
|
99 case NOT(r) => !nullable(r) |
|
100 } |
|
101 |
|
102 def der(c: Char, r: Rexp) : Rexp = r match { |
|
103 case ZERO => ZERO |
|
104 case ONE => ZERO |
|
105 case CHAR(d) => if (c == d) ONE else ZERO |
|
106 case ANYCHAR => ONE |
|
107 case ALTS(r1, r2) => ALTS(der(c, r1), der(c, r2)) |
|
108 case SEQ(r1, r2) => |
|
109 if (nullable(r1)) ALTS(SEQ(der(c, r1), r2), der(c, r2)) |
|
110 else SEQ(der(c, r1), r2) |
|
111 case STAR(r) => SEQ(der(c, r), STAR(r)) |
|
112 case RECD(_, r1) => der(c, r1) |
|
113 case NTIMES(n, r) => if(n > 0) SEQ(der(c, r), NTIMES(n - 1, r)) else ZERO |
|
114 case OPTIONAL(r) => der(c, r) |
|
115 case NOT(r) => NOT(der(c, r)) |
|
116 } |
|
117 |
|
118 |
|
119 // extracts a string from a value |
|
120 def flatten(v: Val) : String = v match { |
|
121 case Empty => "" |
|
122 case Chr(c) => c.toString |
|
123 case Left(v) => flatten(v) |
|
124 case Right(v) => flatten(v) |
|
125 case Sequ(v1, v2) => flatten(v1) ++ flatten(v2) |
|
126 case Stars(vs) => vs.map(flatten).mkString |
|
127 case Ntime(vs) => vs.map(flatten).mkString |
|
128 case Optionall(v) => flatten(v) |
|
129 case Rec(_, v) => flatten(v) |
|
130 } |
|
131 |
|
132 |
|
133 // extracts an environment from a value; |
|
134 // used for tokenising a string |
|
135 def env(v: Val) : List[(String, String)] = v match { |
|
136 case Empty => Nil |
|
137 case Chr(c) => Nil |
|
138 case Left(v) => env(v) |
|
139 case Right(v) => env(v) |
|
140 case Sequ(v1, v2) => env(v1) ::: env(v2) |
|
141 case Stars(vs) => vs.flatMap(env) |
|
142 case Ntime(vs) => vs.flatMap(env) |
|
143 case Rec(x, v) => (x, flatten(v))::env(v) |
|
144 case Optionall(v) => env(v) |
|
145 case Nots(s) => ("Negative", s) :: Nil |
|
146 } |
|
147 |
|
148 |
|
149 // The injection and mkeps part of the lexer |
|
150 //=========================================== |
|
151 |
|
152 def mkeps(r: Rexp) : Val = r match { |
|
153 case ONE => Empty |
|
154 case ALTS(r1, r2) => |
|
155 if (nullable(r1)) Left(mkeps(r1)) else Right(mkeps(r2)) |
|
156 case SEQ(r1, r2) => Sequ(mkeps(r1), mkeps(r2)) |
|
157 case STAR(r) => Stars(Nil) |
|
158 case RECD(x, r) => Rec(x, mkeps(r)) |
|
159 case NTIMES(n, r) => Ntime(List.fill(n)(mkeps(r))) |
|
160 case OPTIONAL(r) => Optionall(Empty) |
|
161 case NOT(rInner) => if(nullable(rInner)) throw new Exception("error") |
|
162 else Nots("")//Nots(s.reverse.toString) |
|
163 // case NOT(ZERO) => Empty |
|
164 // case NOT(CHAR(c)) => Empty |
|
165 // case NOT(SEQ(r1, r2)) => Sequ(mkeps(NOT(r1)), mkeps(NOT(r2))) |
|
166 // case NOT(ALTS(r1, r2)) => if(!nullable(r1)) Left(mkeps(NOT(r1))) else Right(mkeps(NOT(r2))) |
|
167 // case NOT(STAR(r)) => Stars(Nil) |
|
168 |
|
169 } |
|
170 |
|
171 def inj(r: Rexp, c: Char, v: Val) : Val = (r, v) match { |
|
172 case (STAR(r), Sequ(v1, Stars(vs))) => Stars(inj(r, c, v1)::vs) |
|
173 case (SEQ(r1, r2), Sequ(v1, v2)) => Sequ(inj(r1, c, v1), v2) |
|
174 case (SEQ(r1, r2), Left(Sequ(v1, v2))) => Sequ(inj(r1, c, v1), v2) |
|
175 case (SEQ(r1, r2), Right(v2)) => Sequ(mkeps(r1), inj(r2, c, v2)) |
|
176 case (ALTS(r1, r2), Left(v1)) => Left(inj(r1, c, v1)) |
|
177 case (ALTS(r1, r2), Right(v2)) => Right(inj(r2, c, v2)) |
|
178 case (CHAR(d), Empty) => Chr(c) |
|
179 case (RECD(x, r1), _) => Rec(x, inj(r1, c, v)) |
|
180 case (NTIMES(n, r), Sequ(v1, Ntime(vs))) => Ntime(inj(r, c, v1)::vs) |
|
181 case (OPTIONAL(r), v) => Optionall(inj(r, c, v)) |
|
182 case (NOT(r), Nots(s)) => Nots(c.toString ++ s) |
|
183 case (ANYCHAR, Empty) => Chr(c) |
|
184 } |
|
185 |
|
186 // some "rectification" functions for simplification |
|
187 def F_ID(v: Val): Val = v |
|
188 def F_RIGHT(f: Val => Val) = (v:Val) => Right(f(v)) |
|
189 def F_LEFT(f: Val => Val) = (v:Val) => Left(f(v)) |
|
190 def F_ALT(f1: Val => Val, f2: Val => Val) = (v:Val) => v match { |
|
191 case Right(v) => Right(f2(v)) |
|
192 case Left(v) => Left(f1(v)) |
|
193 } |
|
194 def F_SEQ(f1: Val => Val, f2: Val => Val) = (v:Val) => v match { |
|
195 case Sequ(v1, v2) => Sequ(f1(v1), f2(v2)) |
|
196 } |
|
197 def F_SEQ_Empty1(f1: Val => Val, f2: Val => Val) = |
|
198 (v:Val) => Sequ(f1(Empty), f2(v)) |
|
199 def F_SEQ_Empty2(f1: Val => Val, f2: Val => Val) = |
|
200 (v:Val) => Sequ(f1(v), f2(Empty)) |
|
201 |
|
202 def F_ERROR(v: Val): Val = throw new Exception("error") |
|
203 |
|
204 // simplification |
|
205 def simp(r: Rexp): (Rexp, Val => Val) = r match { |
|
206 case ALTS(r1, r2) => { |
|
207 val (r1s, f1s) = simp(r1) |
|
208 val (r2s, f2s) = simp(r2) |
|
209 (r1s, r2s) match { |
|
210 case (ZERO, _) => (r2s, F_RIGHT(f2s)) |
|
211 case (_, ZERO) => (r1s, F_LEFT(f1s)) |
|
212 case _ => if (r1s == r2s) (r1s, F_LEFT(f1s)) |
|
213 else (ALTS (r1s, r2s), F_ALT(f1s, f2s)) |
|
214 } |
|
215 } |
|
216 case SEQ(r1, r2) => { |
|
217 val (r1s, f1s) = simp(r1) |
|
218 val (r2s, f2s) = simp(r2) |
|
219 (r1s, r2s) match { |
|
220 case (ZERO, _) => (ZERO, F_ERROR) |
|
221 case (_, ZERO) => (ZERO, F_ERROR) |
|
222 case (ONE, _) => (r2s, F_SEQ_Empty1(f1s, f2s)) |
|
223 case (_, ONE) => (r1s, F_SEQ_Empty2(f1s, f2s)) |
|
224 case _ => (SEQ(r1s,r2s), F_SEQ(f1s, f2s)) |
|
225 } |
|
226 } |
|
227 case r => (r, F_ID) |
|
228 } |
|
229 |
|
230 // lexing functions including simplification |
|
231 def lex_simp(r: Rexp, s: List[Char]) : Val = s match { |
|
232 case Nil => if (nullable(r)) mkeps(r) else |
|
233 { throw new Exception(s"lexing error $r not nullable") } |
|
234 case c::cs => { |
|
235 val (r_simp, f_simp) = simp(der(c, r)) |
|
236 inj(r, c, f_simp(lex_simp(r_simp, cs))) |
|
237 } |
|
238 } |
|
239 |
|
240 def lexing_simp(r: Rexp, s: String) = |
|
241 env(lex_simp(r, s.toList)) |
|
242 |
|
243 // The Lexing Rules for the WHILE Language |
|
244 |
|
245 def PLUS(r: Rexp) = r ~ r.% |
|
246 |
|
247 def Range(s : List[Char]) : Rexp = s match { |
|
248 case Nil => ZERO |
|
249 case c::Nil => CHAR(c) |
|
250 case c::s => ALTS(CHAR(c), Range(s)) |
|
251 } |
|
252 def RANGE(s: String) = Range(s.toList) |
|
253 |
|
254 val SYM = RANGE("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_") |
|
255 val DIGIT = RANGE("0123456789") |
|
256 val ID = SYM ~ (SYM | DIGIT).% |
|
257 val NUM = PLUS(DIGIT) |
|
258 val KEYWORD : Rexp = "skip" | "while" | "do" | "if" | "then" | "else" | "read" | "write" |
|
259 val SEMI: Rexp = ";" |
|
260 val OP: Rexp = ":=" | "=" | "-" | "+" | "*" | "!=" | "<" | ">" |
|
261 val WHITESPACE = PLUS(" " | "\n" | "\t" | "\r") |
|
262 val RPAREN: Rexp = "{" |
|
263 val LPAREN: Rexp = "}" |
|
264 val STRING: Rexp = "\"" ~ SYM.% ~ "\"" |
|
265 |
|
266 |
|
267 //ab \ a --> 1b |
|
268 // |
|
269 val WHILE_REGS = (("k" $ KEYWORD) | |
|
270 ("i" $ ID) | |
|
271 ("o" $ OP) | |
|
272 ("n" $ NUM) | |
|
273 ("s" $ SEMI) | |
|
274 ("str" $ STRING) | |
|
275 ("p" $ (LPAREN | RPAREN)) | |
|
276 ("w" $ WHITESPACE)).% |
|
277 |
|
278 val NREGS = NTIMES(5, OPTIONAL(SYM)) |
|
279 val NREGS1 = ("test" $ NREGS) |
|
280 // Two Simple While Tests |
|
281 //======================== |
|
282 val NOTREG = "hehe" ~ NOT((ANYCHAR.%) ~ "haha" ~ (ANYCHAR.%)) |
|
283 |
|
284 |
|
285 // bnullable function: tests whether the aregular |
|
286 // expression can recognise the empty string |
|
287 def bnullable (r: ARexp) : Boolean = r match { |
|
288 case AZERO => false |
|
289 case AONE(_) => true |
|
290 case ACHAR(_,_) => false |
|
291 case AALTS(_, rs) => rs.exists(bnullable) |
|
292 case ASEQ(_, r1, r2) => bnullable(r1) && bnullable(r2) |
|
293 case ASTAR(_, _) => true |
|
294 case ANOT(_, rn) => !bnullable(rn) |
|
295 } |
|
296 |
|
297 def mkepsBC(r: ARexp) : Bits = r match { |
|
298 case AONE(bs) => bs |
|
299 case AALTS(bs, rs) => { |
|
300 val n = rs.indexWhere(bnullable) |
|
301 bs ++ mkepsBC(rs(n)) |
|
302 } |
|
303 case ASEQ(bs, r1, r2) => bs ++ mkepsBC(r1) ++ mkepsBC(r2) |
|
304 case ASTAR(bs, r) => bs ++ List(Z) |
|
305 case ANOT(bs, rn) => bs |
|
306 } |
|
307 |
|
308 |
|
309 def bder(c: Char, r: ARexp) : ARexp = r match { |
|
310 case AZERO => AZERO |
|
311 case AONE(_) => AZERO |
|
312 case ACHAR(bs, f) => if (c == f) AONE(bs) else AZERO |
|
313 case AALTS(bs, rs) => AALTS(bs, rs.map(bder(c, _))) |
|
314 case ASEQ(bs, r1, r2) => |
|
315 if (bnullable(r1)) AALTS(bs, ASEQ(Nil, bder(c, r1), r2) :: fuse(mkepsBC(r1), bder(c, r2)) :: Nil ) |
|
316 else ASEQ(bs, bder(c, r1), r2) |
|
317 case ASTAR(bs, r) => ASEQ(bs, fuse(List(S), bder(c, r)), ASTAR(Nil, r)) |
|
318 case ANOT(bs, rn) => ANOT(bs, bder(c, rn)) |
|
319 case AANYCHAR(bs) => AONE(bs) |
|
320 } |
|
321 |
|
322 def fuse(bs: Bits, r: ARexp) : ARexp = r match { |
|
323 case AZERO => AZERO |
|
324 case AONE(cs) => AONE(bs ++ cs) |
|
325 case ACHAR(cs, f) => ACHAR(bs ++ cs, f) |
|
326 case AALTS(cs, rs) => AALTS(bs ++ cs, rs) |
|
327 case ASEQ(cs, r1, r2) => ASEQ(bs ++ cs, r1, r2) |
|
328 case ASTAR(cs, r) => ASTAR(bs ++ cs, r) |
|
329 case ANOT(cs, r) => ANOT(bs ++ cs, r) |
|
330 } |
|
331 |
|
332 |
|
333 def internalise(r: Rexp) : ARexp = r match { |
|
334 case ZERO => AZERO |
|
335 case ONE => AONE(Nil) |
|
336 case CHAR(c) => ACHAR(Nil, c) |
|
337 //case PRED(f) => APRED(Nil, f) |
|
338 case ALTS(r1, r2) => |
|
339 AALTS(Nil, List(fuse(List(Z), internalise(r1)), fuse(List(S), internalise(r2)))) |
|
340 // case ALTS(r1::rs) => { |
|
341 // val AALTS(Nil, rs2) = internalise(ALTS(rs)) |
|
342 // AALTS(Nil, fuse(List(Z), internalise(r1)) :: rs2.map(fuse(List(S), _))) |
|
343 // } |
|
344 case SEQ(r1, r2) => ASEQ(Nil, internalise(r1), internalise(r2)) |
|
345 case STAR(r) => ASTAR(Nil, internalise(r)) |
|
346 case RECD(x, r) => internalise(r) |
|
347 case NOT(r) => ANOT(Nil, internalise(r)) |
|
348 case ANYCHAR => AANYCHAR(Nil) |
|
349 } |
|
350 |
|
351 |
|
352 def bsimp(r: ARexp): ARexp = |
|
353 { |
|
354 r match { |
|
355 case ASEQ(bs1, r1, r2) => (bsimp(r1), bsimp(r2)) match { |
|
356 case (AZERO, _) => AZERO |
|
357 case (_, AZERO) => AZERO |
|
358 case (AONE(bs2), r2s) => fuse(bs1 ++ bs2, r2s) |
|
359 case (r1s, r2s) => ASEQ(bs1, r1s, r2s) |
|
360 } |
|
361 case AALTS(bs1, rs) => { |
|
362 val rs_simp = rs.map(bsimp(_)) |
|
363 val flat_res = flats(rs_simp) |
|
364 val dist_res = distinctBy(flat_res, erase)//strongDB(flat_res)//distinctBy(flat_res, erase) |
|
365 dist_res match { |
|
366 case Nil => AZERO |
|
367 case s :: Nil => fuse(bs1, s) |
|
368 case rs => AALTS(bs1, rs) |
|
369 } |
|
370 |
|
371 } |
|
372 case r => r |
|
373 } |
|
374 } |
|
375 def strongBsimp(r: ARexp): ARexp = |
|
376 { |
|
377 r match { |
|
378 case ASEQ(bs1, r1, r2) => (strongBsimp(r1), strongBsimp(r2)) match { |
|
379 case (AZERO, _) => AZERO |
|
380 case (_, AZERO) => AZERO |
|
381 case (AONE(bs2), r2s) => fuse(bs1 ++ bs2, r2s) |
|
382 case (r1s, r2s) => ASEQ(bs1, r1s, r2s) |
|
383 } |
|
384 case AALTS(bs1, rs) => { |
|
385 val rs_simp = rs.map(strongBsimp(_)) |
|
386 val flat_res = flats(rs_simp) |
|
387 val dist_res = distinctBy4(flat_res)//distinctBy(flat_res, erase) |
|
388 dist_res match { |
|
389 case Nil => AZERO |
|
390 case s :: Nil => fuse(bs1, s) |
|
391 case rs => AALTS(bs1, rs) |
|
392 } |
|
393 |
|
394 } |
|
395 case r => r |
|
396 } |
|
397 } |
|
398 |
|
399 def bders (s: List[Char], r: ARexp) : ARexp = s match { |
|
400 case Nil => r |
|
401 case c::s => bders(s, bder(c, r)) |
|
402 } |
|
403 |
|
404 def flats(rs: List[ARexp]): List[ARexp] = rs match { |
|
405 case Nil => Nil |
|
406 case AZERO :: rs1 => flats(rs1) |
|
407 case AALTS(bs, rs1) :: rs2 => rs1.map(fuse(bs, _)) ::: flats(rs2) |
|
408 case r1 :: rs2 => r1 :: flats(rs2) |
|
409 } |
|
410 |
|
411 def distinctBy[B, C](xs: List[B], f: B => C, acc: List[C] = Nil): List[B] = xs match { |
|
412 case Nil => Nil |
|
413 case (x::xs) => { |
|
414 val res = f(x) |
|
415 if (acc.contains(res)) distinctBy(xs, f, acc) |
|
416 else x::distinctBy(xs, f, res::acc) |
|
417 } |
|
418 } |
|
419 |
|
420 def projectFirstChild(rs: List[ARexp]) : List[ARexp] = rs match { |
|
421 case (ASEQ(bs, r1, r2p)::rs1) => r1::projectFirstChild(rs1) |
|
422 case Nil => Nil |
|
423 } |
|
424 |
|
425 |
|
426 def distinctBy2(xs: List[ARexp], acc: List[Rexp] = Nil): List[ARexp] = xs match { |
|
427 case Nil => Nil |
|
428 case (x::xs) => { |
|
429 val res = erase(x) |
|
430 if(acc.contains(res)) |
|
431 distinctBy2(xs, acc) |
|
432 else |
|
433 x match { |
|
434 case ASEQ(bs0, AALTS(bs1, rs), r2) => |
|
435 val newTerms = distinctBy2(rs.map(r1 => ASEQ(Nil, r1, r2)), acc) |
|
436 val rsPrime = projectFirstChild(newTerms) |
|
437 newTerms match { |
|
438 case Nil => distinctBy2(xs, acc) |
|
439 case t::Nil => ASEQ(bs0, fuse(bs1, rsPrime.head), r2)::distinctBy2(xs, erase(t)::acc) |
|
440 case ts => ASEQ(bs0, AALTS(bs1, rsPrime), r2)::distinctBy2(xs, newTerms.map(erase(_)):::acc) |
|
441 } |
|
442 case x => x::distinctBy2(xs, res::acc) |
|
443 } |
|
444 } |
|
445 } |
|
446 |
|
447 def deepFlts(r: ARexp): List[ARexp] = r match{ |
|
448 |
|
449 case ASEQ(bs, r1, r2) => deepFlts(r1).map(r1p => ASEQ(bs, r1p, r2)) |
|
450 case ASTAR(bs, r0) => List(r) |
|
451 case ACHAR(_, _) => List(r) |
|
452 case AALTS(bs, rs) => rs.flatMap(deepFlts(_))//throw new Error("doubly nested alts in bsimp r") |
|
453 } |
|
454 |
|
455 |
|
456 def distinctBy3(xs: List[ARexp], acc: List[Rexp] = Nil): List[ARexp] = xs match { |
|
457 case Nil => Nil |
|
458 case (x::xs) => { |
|
459 val res = erase(x) |
|
460 if(acc.contains(res)) |
|
461 distinctBy3(xs, acc) |
|
462 else |
|
463 x match { |
|
464 case ASEQ(bs0, AALTS(bs1, rs), r2) => |
|
465 val newTerms = distinctBy3(rs.flatMap(deepFlts(_)).map(r1 => ASEQ(Nil, r1, r2)), acc)//deepFlts(rs.flatMap(r1 => ASEQ(Nil, r1, r2)), acc) |
|
466 val rsPrime = projectFirstChild(newTerms) |
|
467 newTerms match { |
|
468 case Nil => distinctBy3(xs, acc) |
|
469 case t::Nil => ASEQ(bs0, fuse(bs1, rsPrime.head), r2)::distinctBy3(xs, erase(t)::acc) |
|
470 case ts => ASEQ(bs0, AALTS(bs1, rsPrime), r2)::distinctBy3(xs, newTerms.map(erase(_)):::acc) |
|
471 } |
|
472 case x => x::distinctBy3(xs, res::acc) |
|
473 } |
|
474 } |
|
475 } |
|
476 |
|
477 def pruneRexp(r: ARexp, allowableTerms: List[Rexp]) : ARexp = { |
|
478 r match { |
|
479 case ASEQ(bs, r1, r2) => |
|
480 val termsTruncated = allowableTerms.collect(rt => rt match { |
|
481 case SEQ(r1p, r2p) if(r2p == erase(r2)) => r1p |
|
482 }) |
|
483 val pruned : ARexp = pruneRexp(r1, termsTruncated) |
|
484 pruned match { |
|
485 case AZERO => AZERO |
|
486 case AONE(bs1) => fuse(bs ++ bs1, r2) |
|
487 case pruned1 => ASEQ(bs, pruned1, r2) |
|
488 } |
|
489 |
|
490 case AALTS(bs, rs) => |
|
491 //allowableTerms.foreach(a => println(shortRexpOutput(a))) |
|
492 val rsp = rs.map(r => pruneRexp(r, allowableTerms)).filter(r => r != AZERO) |
|
493 rsp match { |
|
494 case Nil => AZERO |
|
495 case r1::Nil => fuse(bs, r1) |
|
496 case rs1 => AALTS(bs, rs1) |
|
497 } |
|
498 case r => |
|
499 if(allowableTerms.contains(erase(r))) r else AZERO //assert(r != AZERO) |
|
500 } |
|
501 } |
|
502 |
|
503 def oneSimp(r: Rexp) : Rexp = r match { |
|
504 case SEQ(ONE, r) => r |
|
505 case SEQ(r1, r2) => SEQ(oneSimp(r1), r2) |
|
506 case r => r//assert r != 0 |
|
507 |
|
508 } |
|
509 |
|
510 |
|
511 def distinctBy4(xs: List[ARexp], acc: List[Rexp] = Nil) : List[ARexp] = xs match { |
|
512 case Nil => Nil |
|
513 case x :: xs => |
|
514 val erased = erase(x) |
|
515 if(acc.contains(erased)) |
|
516 distinctBy4(xs, acc) |
|
517 else{ |
|
518 val addToAcc = breakIntoTerms(erased).filter(r => !acc.contains(oneSimp(r))) |
|
519 val xp = pruneRexp(x, addToAcc) |
|
520 xp match { |
|
521 case AZERO => distinctBy4(xs, addToAcc.map(oneSimp(_)) ::: acc) |
|
522 case xPrime => xPrime :: distinctBy4(xs, addToAcc.map(oneSimp(_)) ::: acc) |
|
523 } |
|
524 |
|
525 } |
|
526 } |
|
527 |
|
528 |
|
529 def breakIntoTerms(r: Rexp) : List[Rexp] = r match { |
|
530 case SEQ(r1, r2) => breakIntoTerms(r1).map(r11 => SEQ(r11, r2)) |
|
531 // breakIntoTerms(r1).map(r11 => r11 match { |
|
532 // case ONE => r2 |
|
533 // case r => SEQ(r11, r2) |
|
534 // } |
|
535 //) |
|
536 case ALTS(r1, r2) => breakIntoTerms(r1) ::: breakIntoTerms(r2) |
|
537 case _ => r::Nil |
|
538 } |
|
539 |
|
540 def prettyRexp(r: Rexp) : String = r match { |
|
541 case STAR(r0) => s"${prettyRexp(r0)}*" |
|
542 case SEQ(CHAR(c), r2) => c.toString ++ prettyRexp(r2) |
|
543 case SEQ(r1, r2) => s"${prettyRexp(r1)}~${prettyRexp(r2)}" |
|
544 case CHAR(c) => c.toString |
|
545 case ANYCHAR => "." |
|
546 // case NOT(r0) => s |
|
547 } |
|
548 |
|
549 def decode_aux(r: Rexp, bs: Bits) : (Val, Bits) = (r, bs) match { |
|
550 case (ONE, bs) => (Empty, bs) |
|
551 case (CHAR(f), bs) => (Chr(f), bs) |
|
552 case (ALTS(r1, r2), Z::bs1) => { |
|
553 val (v, bs2) = decode_aux(r1, bs1) |
|
554 (Left(v), bs2) |
|
555 } |
|
556 case (ALTS(r1, r2), S::bs1) => { |
|
557 val (v, bs2) = decode_aux(r2, bs1) |
|
558 (Right(v), bs2) |
|
559 } |
|
560 case (SEQ(r1, r2), bs) => { |
|
561 val (v1, bs1) = decode_aux(r1, bs) |
|
562 val (v2, bs2) = decode_aux(r2, bs1) |
|
563 (Sequ(v1, v2), bs2) |
|
564 } |
|
565 case (STAR(r1), S::bs) => { |
|
566 val (v, bs1) = decode_aux(r1, bs) |
|
567 //println(v) |
|
568 val (Stars(vs), bs2) = decode_aux(STAR(r1), bs1) |
|
569 (Stars(v::vs), bs2) |
|
570 } |
|
571 case (STAR(_), Z::bs) => (Stars(Nil), bs) |
|
572 case (RECD(x, r1), bs) => { |
|
573 val (v, bs1) = decode_aux(r1, bs) |
|
574 (Rec(x, v), bs1) |
|
575 } |
|
576 case (NOT(r), bs) => (Nots(prettyRexp(r)), bs) |
|
577 } |
|
578 |
|
579 def decode(r: Rexp, bs: Bits) = decode_aux(r, bs) match { |
|
580 case (v, Nil) => v |
|
581 case _ => throw new Exception("Not decodable") |
|
582 } |
|
583 |
|
584 |
|
585 |
|
586 def blexSimp(r: Rexp, s: String) : List[Bit] = { |
|
587 blex_simp(internalise(r), s.toList) |
|
588 } |
|
589 |
|
590 def blexing_simp(r: Rexp, s: String) : Val = { |
|
591 val bit_code = blex_simp(internalise(r), s.toList) |
|
592 decode(r, bit_code) |
|
593 } |
|
594 |
|
595 def strong_blexing_simp(r: Rexp, s: String) : Val = { |
|
596 decode(r, strong_blex_simp(internalise(r), s.toList)) |
|
597 } |
|
598 |
|
599 def strong_blex_simp(r: ARexp, s: List[Char]) :Bits = s match { |
|
600 case Nil => { |
|
601 if (bnullable(r)) { |
|
602 //println(asize(r)) |
|
603 val bits = mkepsBC(r) |
|
604 bits |
|
605 } |
|
606 else |
|
607 throw new Exception("Not matched") |
|
608 } |
|
609 case c::cs => { |
|
610 val der_res = bder(c,r) |
|
611 val simp_res = strongBsimp(der_res) |
|
612 strong_blex_simp(simp_res, cs) |
|
613 } |
|
614 } |
|
615 |
|
616 |
|
617 |
|
618 def bders_simp(s: List[Char], r: ARexp) : ARexp = s match { |
|
619 case Nil => r |
|
620 case c::s => bders_simp(s, bsimp(bder(c, r))) |
|
621 } |
|
622 |
|
623 def bdersSimp(s: String, r: Rexp) : ARexp = bders_simp(s.toList, internalise(r)) |
|
624 |
|
625 def bders_simpS(s: List[Char], r: ARexp) : ARexp = s match { |
|
626 case Nil => r |
|
627 case c::s => bders_simpS(s, strongBsimp(bder(c, r))) |
|
628 } |
|
629 |
|
630 def bdersSimpS(s: String, r: Rexp) : ARexp = bders_simpS(s.toList, internalise(r)) |
|
631 |
|
632 def erase(r:ARexp): Rexp = r match{ |
|
633 case AZERO => ZERO |
|
634 case AONE(_) => ONE |
|
635 case ACHAR(bs, c) => CHAR(c) |
|
636 case AALTS(bs, Nil) => ZERO |
|
637 case AALTS(bs, a::Nil) => erase(a) |
|
638 case AALTS(bs, a::as) => ALTS(erase(a), erase(AALTS(bs, as))) |
|
639 case ASEQ(bs, r1, r2) => SEQ (erase(r1), erase(r2)) |
|
640 case ASTAR(cs, r)=> STAR(erase(r)) |
|
641 case ANOT(bs, r) => NOT(erase(r)) |
|
642 case AANYCHAR(bs) => ANYCHAR |
|
643 } |
|
644 |
|
645 |
|
646 def allCharSeq(r: Rexp) : Boolean = r match { |
|
647 case CHAR(c) => true |
|
648 case SEQ(r1, r2) => allCharSeq(r1) && allCharSeq(r2) |
|
649 case _ => false |
|
650 } |
|
651 |
|
652 def flattenSeq(r: Rexp) : String = r match { |
|
653 case CHAR(c) => c.toString |
|
654 case SEQ(r1, r2) => flattenSeq(r1) ++ flattenSeq(r2) |
|
655 case _ => throw new Error("flatten unflattenable rexp") |
|
656 } |
|
657 |
|
658 def shortRexpOutput(r: Rexp) : String = r match { |
|
659 case CHAR(c) => c.toString |
|
660 case ONE => "1" |
|
661 case ZERO => "0" |
|
662 case SEQ(r1, r2) if(allCharSeq(r)) => flattenSeq(r)//"[" ++ shortRexpOutput(r1) ++ "~" ++ shortRexpOutput(r2) ++ "]" |
|
663 case SEQ(r1, r2) => "[" ++ shortRexpOutput(r1) ++ "~" ++ shortRexpOutput(r2) ++ "]" |
|
664 case ALTS(r1, r2) => "(" ++ shortRexpOutput(r1) ++ "+" ++ shortRexpOutput(r2) ++ ")" |
|
665 case STAR(STAR(r)) => "(..)*"// ++ shortRexpOutput(r) ++ "]*" |
|
666 case STAR(r) => "STAR(" ++ shortRexpOutput(r) ++ ")" |
|
667 //case RTOP => "RTOP" |
|
668 } |
|
669 |
|
670 |
|
671 def blex_simp(r: ARexp, s: List[Char]) : Bits = s match { |
|
672 case Nil => { |
|
673 if (bnullable(r)) { |
|
674 //println(asize(r)) |
|
675 val bits = mkepsBC(r) |
|
676 bits |
|
677 } |
|
678 else |
|
679 throw new Exception("Not matched") |
|
680 } |
|
681 case c::cs => { |
|
682 val der_res = bder(c,r) |
|
683 val simp_res = bsimp(der_res) |
|
684 blex_simp(simp_res, cs) |
|
685 } |
|
686 } |
|
687 def size(r: Rexp) : Int = r match { |
|
688 case ZERO => 1 |
|
689 case ONE => 1 |
|
690 case CHAR(_) => 1 |
|
691 case ANYCHAR => 1 |
|
692 case NOT(r0) => 1 + size(r0) |
|
693 case SEQ(r1, r2) => 1 + size(r1) + size(r2) |
|
694 case ALTS(r1, r2) => 1 + List(r1, r2).map(size).sum |
|
695 case STAR(r) => 1 + size(r) |
|
696 } |
|
697 |
|
698 def asize(a: ARexp) = size(erase(a)) |
|
699 |
|
700 //pder related |
|
701 type Mon = (Char, Rexp) |
|
702 type Lin = Set[Mon] |
|
703 |
|
704 def dot_prod(rs: Set[Rexp], r: Rexp): Set[Rexp] = r match { |
|
705 case ZERO => Set() |
|
706 case ONE => rs |
|
707 case r => rs.map((re) => if (re == ONE) r else SEQ(re, r) ) |
|
708 } |
|
709 def cir_prod(l: Lin, t: Rexp): Lin = t match {//remember this Lin is different from the Lin in Antimirov's paper. Here it does not mean the set of all subsets of linear forms that does not contain zero, but rather the type a set of linear forms |
|
710 case ZERO => Set() |
|
711 case ONE => l |
|
712 case t => l.map( m => m._2 match {case ZERO => m case ONE => (m._1, t) case p => (m._1, SEQ(p, t)) } ) |
|
713 } |
|
714 def lf(r: Rexp): Lin = r match { |
|
715 case ZERO => Set() |
|
716 case ONE => Set() |
|
717 case CHAR(f) => { |
|
718 //val Some(c) = alphabet.find(f) |
|
719 Set((f, ONE)) |
|
720 } |
|
721 case ALTS(r1, r2) => { |
|
722 lf(r1 ) ++ lf(r2) |
|
723 } |
|
724 case STAR(r1) => cir_prod(lf(r1), STAR(r1)) //may try infix notation later...... |
|
725 case SEQ(r1, r2) =>{ |
|
726 if (nullable(r1)) |
|
727 cir_prod(lf(r1), r2) ++ lf(r2) |
|
728 else |
|
729 cir_prod(lf(r1), r2) |
|
730 } |
|
731 } |
|
732 def lfs(r: Set[Rexp]): Lin = { |
|
733 r.foldLeft(Set[Mon]())((acc, r) => acc ++ lf(r)) |
|
734 } |
|
735 |
|
736 def pder(x: Char, t: Rexp): Set[Rexp] = { |
|
737 val lft = lf(t) |
|
738 (lft.filter(mon => if(mon._1 == x) true else false)).map(mon => mon._2) |
|
739 } |
|
740 def pders_single(s: List[Char], t: Rexp) : Set[Rexp] = s match { |
|
741 case x::xs => pders(xs, pder(x, t)) |
|
742 case Nil => Set(t) |
|
743 } |
|
744 def pders(s: List[Char], ts: Set[Rexp]) : Set[Rexp] = s match { |
|
745 case x::xs => pders(xs, ts.foldLeft(Set[Rexp]())((acc, t) => acc ++ pder(x, t))) |
|
746 case Nil => ts |
|
747 } |
|
748 def pderss(ss: List[List[Char]], t: Rexp): Set[Rexp] = ss.foldLeft( Set[Rexp]() )( (acc, s) => pders_single(s, t) ++ acc ) |
|
749 def pdera(t: Rexp): Set[Rexp] = lf(t).map(mon => mon._2) |
|
750 //all implementation of partial derivatives that involve set union are potentially buggy |
|
751 //because they don't include the original regular term before they are pdered. |
|
752 //now only pderas is fixed. |
|
753 def pderas(t: Set[Rexp], d: Int): Set[Rexp] = if(d > 0) pderas(lfs(t).map(mon => mon._2), d - 1) ++ t else lfs(t).map(mon => mon._2) ++ t//repeated application of pderas over the newest set of pders. |
|
754 def pderUNIV(r: Rexp) : Set[Rexp] = pderas(Set(r), awidth(r) + 1) |
|
755 def awidth(r: Rexp) : Int = r match { |
|
756 case CHAR(c) => 1 |
|
757 case SEQ(r1, r2) => awidth(r1) + awidth(r2) |
|
758 case ALTS(r1, r2) => awidth(r1) + awidth(r2) |
|
759 case ONE => 0 |
|
760 case ZERO => 0 |
|
761 case STAR(r) => awidth(r) |
|
762 } |
|
763 //def sigma_lf(l: Set[Mon]) : Rexp = ALTS(l.map(mon => SEQ(CHAR(mon._1),mon._2)).toList) |
|
764 //def sigma(rs: Set[Rexp]) : Rexp = ALTS(rs.toList) |
|
765 def o(r: Rexp) = if (nullable(r)) ONE else ZERO |
|
766 //def nlf(t: Rexp) : Rexp = ALTS(List( o(t), sigma_lf(lf(t)) )) |
|
767 def pdp(x: Char, r: Rexp) : Set[Rexp] = r match { |
|
768 case ZERO => Set[Rexp]() |
|
769 case ONE => Set[Rexp]() |
|
770 case CHAR(f) => if(x == f) Set(ONE) else Set[Rexp]() |
|
771 case ALTS(r1, r2) => pdp(x, r1) ++ pdp(x, r2) |
|
772 case STAR(r1) => pdp(x, r).map(a => SEQ(a, STAR(r1))) |
|
773 case SEQ(a0, b) => if(nullable(a0)) pdp(x, a0).map(a => SEQ(a, b)) ++ pdp(x, b) else pdp(x, a0).map(a => SEQ(a, b)) |
|
774 } |
|
775 def pdps(s: List[Char], ts: Set[Rexp]): Set[Rexp] = s match { |
|
776 case x::xs => pdps(xs, ts.foldLeft(Set[Rexp]())((acc, t) => acc ++ pder(x, t))) |
|
777 case Nil => ts |
|
778 } |
|
779 def pdpss(ss: List[List[Char]], t: Rexp): Set[Rexp] = ss.foldLeft( Set[Rexp]())((acc, s) => pdps(s, Set(t)) ++ acc) |
|
780 |
|
781 |
|
782 |
|
783 def starPrint(r: Rexp) : Unit = r match { |
|
784 |
|
785 case SEQ(head, rstar) => |
|
786 println(shortRexpOutput(head) ++ "~STARREG") |
|
787 case STAR(rstar) => |
|
788 println("STARREG") |
|
789 case ALTS(r1, r2) => |
|
790 println("(") |
|
791 starPrint(r1) |
|
792 println("+") |
|
793 starPrint(r2) |
|
794 println(")") |
|
795 case ZERO => println("0") |
|
796 |
|
797 } |
|
798 |
|
799 // @arg(doc = "small tests") |
|
800 val STARREG = //((( (STAR("aa")) | STAR("aaa") ).%).%) |
|
801 (((STAR("a") | ( STAR("aaa")) | STAR("aaaaa"| ( STAR("aaaaaaa")) | STAR("aaaaaaaaaaa"))).%).%).% |
|
802 |
|
803 // @main |
|
804 def small() = { |
|
805 |
|
806 |
|
807 // println(lexing_simp(NOTREG, prog0)) |
|
808 // val v = lex_simp(NOTREG, prog0.toList) |
|
809 // println(v) |
|
810 |
|
811 // val d = (lex_simp(NOTREG, prog0.toList)) |
|
812 // println(d) |
|
813 val pderSTAR = pderUNIV(STARREG) |
|
814 |
|
815 val refSize = pderSTAR.map(size(_)).sum |
|
816 println("different partial derivative terms:") |
|
817 pderSTAR.foreach(r => r match { |
|
818 |
|
819 case SEQ(head, rstar) => |
|
820 println(shortRexpOutput(head) ++ "~STARREG") |
|
821 case STAR(rstar) => |
|
822 println("STARREG") |
|
823 |
|
824 } |
|
825 ) |
|
826 println("the total number of terms is") |
|
827 //println(refSize) |
|
828 println(pderSTAR.size) |
|
829 |
|
830 // val A : Rexp= ("c" | (ONE | "b") ~ "d") ~((ONE).%) |
|
831 // val B : Rexp = ((ONE).%) |
|
832 // val C : Rexp = ("d") ~ ((ONE).%) |
|
833 // val PRUNE_REG : Rexp = (A | B | C) |
|
834 // val APRUNE_REG = internalise(PRUNE_REG) |
|
835 // val program_solution = pruneRexp(APRUNE_REG, breakIntoTerms(PRUNE_REG)) |
|
836 // println("program executes and gives: as disired!") |
|
837 // println(shortRexpOutput(erase(program_solution))) |
|
838 |
|
839 for(i <- List( 4, 8, 9, 17, 29) ){// 100, 400, 800, 840, 841, 900 |
|
840 val prog0 = "a" * i |
|
841 //println(s"test: $prog0") |
|
842 println(s"testing with $i a's" ) |
|
843 val bd = bdersSimp(prog0, STARREG)//DB |
|
844 val sbd = bdersSimpS(prog0, STARREG)//strongDB |
|
845 starPrint(erase(sbd)) |
|
846 val subTerms = breakIntoTerms(erase(sbd)) |
|
847 val subTermsLarge = breakIntoTerms(erase(bd)) |
|
848 |
|
849 println(s"subterms of regex with strongDB: ${subTerms.length}, standard DB: ${subTermsLarge.length}") |
|
850 |
|
851 |
|
852 |
|
853 println("the number of distinct subterms for bsimp with strongDB and standardDB") |
|
854 println(subTerms.distinct.size) |
|
855 println(subTermsLarge.distinct.size) |
|
856 println("which coincides with the number of PDER terms") |
|
857 |
|
858 |
|
859 // println(shortRexpOutput(erase(sbd))) |
|
860 // println(shortRexpOutput(erase(bd))) |
|
861 |
|
862 println("pdersize, original, strongSimp, bsimp") |
|
863 println(refSize, size(STARREG), asize(sbd), asize(bd)) |
|
864 |
|
865 val vres = strong_blexing_simp( STARREG, prog0) |
|
866 println(vres) |
|
867 } |
|
868 // println(vs.length) |
|
869 // println(vs) |
|
870 |
|
871 |
|
872 // val prog1 = """read n; write n""" |
|
873 // println(s"test: $prog1") |
|
874 // println(lexing_simp(WHILE_REGS, prog1)) |
|
875 } |
|
876 |
|
877 small() |