1 // Core Part about Regular Expression Matching |
1 // Main Part 3 about Regular Expression Matching |
2 //============================================= |
2 //============================================= |
3 |
3 |
4 object CW8c { |
4 object M3 { |
5 |
5 |
6 // Regular Expressions |
6 // Regular Expressions |
7 abstract class Rexp |
7 abstract class Rexp |
8 case object ZERO extends Rexp |
8 case object ZERO extends Rexp |
9 case object ONE extends Rexp |
9 case object ONE extends Rexp |
10 case class CHAR(c: Char) extends Rexp |
10 case class CHAR(c: Char) extends Rexp |
11 case class ALT(r1: Rexp, r2: Rexp) extends Rexp |
11 case class ALTs(rs: List[Rexp]) extends Rexp // alternatives |
12 case class SEQ(r1: Rexp, r2: Rexp) extends Rexp |
12 case class SEQ(r1: Rexp, r2: Rexp) extends Rexp // sequence |
13 case class STAR(r: Rexp) extends Rexp |
13 case class STAR(r: Rexp) extends Rexp // star |
|
14 |
|
15 |
|
16 //the usual binary choice can be defined in terms of ALTs |
|
17 def ALT(r1: Rexp, r2: Rexp) = ALTs(List(r1, r2)) |
14 |
18 |
15 // some convenience for typing in regular expressions |
19 // some convenience for typing in regular expressions |
16 |
|
17 import scala.language.implicitConversions |
20 import scala.language.implicitConversions |
18 import scala.language.reflectiveCalls |
21 import scala.language.reflectiveCalls |
19 |
|
20 |
22 |
21 def charlist2rexp(s: List[Char]): Rexp = s match { |
23 def charlist2rexp(s: List[Char]): Rexp = s match { |
22 case Nil => ONE |
24 case Nil => ONE |
23 case c::Nil => CHAR(c) |
25 case c::Nil => CHAR(c) |
24 case c::s => SEQ(CHAR(c), charlist2rexp(s)) |
26 case c::s => SEQ(CHAR(c), charlist2rexp(s)) |
61 |
63 |
62 def der (c: Char, r: Rexp) : Rexp = r match { |
64 def der (c: Char, r: Rexp) : Rexp = r match { |
63 case ZERO => ZERO |
65 case ZERO => ZERO |
64 case ONE => ZERO |
66 case ONE => ZERO |
65 case CHAR(d) => if (c == d) ONE else ZERO |
67 case CHAR(d) => if (c == d) ONE else ZERO |
66 case ALT(r1, r2) => ALT(der(c, r1), der(c, r2)) |
68 case ALTs(rs) => ALTs(rs.map(der(c, _))) |
67 case SEQ(r1, r2) => |
69 case SEQ(r1, r2) => |
68 if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2)) |
70 if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2)) |
69 else SEQ(der(c, r1), r2) |
71 else SEQ(der(c, r1), r2) |
70 case STAR(r1) => SEQ(der(c, r1), STAR(r1)) |
72 case STAR(r1) => SEQ(der(c, r1), STAR(r1)) |
71 } |
73 } |
72 |
74 |
73 // (3) Complete the simp function according to |
75 |
|
76 // (3) Implement the flatten function flts. It |
|
77 // deletes 0s from a list of regular expressions |
|
78 // and also 'spills out', or flattens, nested |
|
79 // ALTernativeS. |
|
80 |
|
81 def flts(rs: List[Rexp]) : List[Rexp] = rs match { |
|
82 case Nil => Nil |
|
83 case ZERO::tl => flts(tl) |
|
84 case ALTs(rs1)::rs2 => rs1 ::: flts(rs2) |
|
85 case r::rs => r :: flts(rs) |
|
86 } |
|
87 |
|
88 // (4) Complete the simp function according to |
74 // the specification given in the coursework; this |
89 // the specification given in the coursework; this |
75 // function simplifies a regular expression from |
90 // function simplifies a regular expression from |
76 // the inside out, like you would simplify arithmetic |
91 // the inside out, like you would simplify arithmetic |
77 // expressions; however it does not simplify inside |
92 // expressions; however it does not simplify inside |
78 // STAR-regular expressions. |
93 // STAR-regular expressions. |
79 |
94 |
|
95 |
80 def simp(r: Rexp) : Rexp = r match { |
96 def simp(r: Rexp) : Rexp = r match { |
81 case ALT(r1, r2) => (simp(r1), simp(r2)) match { |
97 case ALTs(rs) => (flts(rs.map(simp)).distinct) match { |
82 case (ZERO, r2s) => r2s |
98 case Nil => ZERO |
83 case (r1s, ZERO) => r1s |
99 case r::Nil => r |
84 case (r1s, r2s) => if (r1s == r2s) r1s else ALT (r1s, r2s) |
100 case rs => ALTs(rs) |
85 } |
101 } |
86 case SEQ(r1, r2) => (simp(r1), simp(r2)) match { |
102 case SEQ(r1, r2) => (simp(r1), simp(r2)) match { |
87 case (ZERO, _) => ZERO |
103 case (ZERO, _) => ZERO |
88 case (_, ZERO) => ZERO |
104 case (_, ZERO) => ZERO |
89 case (ONE, r2s) => r2s |
105 case (ONE, r2s) => r2s |
91 case (r1s, r2s) => SEQ(r1s, r2s) |
107 case (r1s, r2s) => SEQ(r1s, r2s) |
92 } |
108 } |
93 case r => r |
109 case r => r |
94 } |
110 } |
95 |
111 |
|
112 simp(ALT(ONE | CHAR('a'), CHAR('a') | ONE)) |
96 |
113 |
97 // (4) Complete the two functions below; the first |
114 // (5) Complete the two functions below; the first |
98 // calculates the derivative w.r.t. a string; the second |
115 // calculates the derivative w.r.t. a string; the second |
99 // is the regular expression matcher taking a regular |
116 // is the regular expression matcher taking a regular |
100 // expression and a string and checks whether the |
117 // expression and a string and checks whether the |
101 // string matches the regular expression. |
118 // string matches the regular expression. |
102 |
119 |
106 } |
123 } |
107 |
124 |
108 // main matcher function |
125 // main matcher function |
109 def matcher(r: Rexp, s: String) = nullable(ders(s.toList, r)) |
126 def matcher(r: Rexp, s: String) = nullable(ders(s.toList, r)) |
110 |
127 |
111 // (5) Complete the size function for regular |
128 // (6) Complete the size function for regular |
112 // expressions according to the specification |
129 // expressions according to the specification |
113 // given in the coursework. |
130 // given in the coursework. |
114 |
131 |
115 |
132 |
116 def size(r: Rexp): Int = r match { |
133 def size(r: Rexp): Int = r match { |
117 case ZERO => 1 |
134 case ZERO => 1 |
118 case ONE => 1 |
135 case ONE => 1 |
119 case CHAR(_) => 1 |
136 case CHAR(_) => 1 |
120 case ALT(r1, r2) => 1 + size(r1) + size (r2) |
137 case ALTs(rs) => 1 + rs.map(size).sum |
121 case SEQ(r1, r2) => 1 + size(r1) + size (r2) |
138 case SEQ(r1, r2) => 1 + size(r1) + size (r2) |
122 case STAR(r1) => 1 + size(r1) |
139 case STAR(r1) => 1 + size(r1) |
123 } |
140 } |
124 |
141 |
125 |
142 |
128 |
145 |
129 //matcher(("a" ~ "b") ~ "c", "abc") // => true |
146 //matcher(("a" ~ "b") ~ "c", "abc") // => true |
130 //matcher(("a" ~ "b") ~ "c", "ab") // => false |
147 //matcher(("a" ~ "b") ~ "c", "ab") // => false |
131 |
148 |
132 // the supposedly 'evil' regular expression (a*)* b |
149 // the supposedly 'evil' regular expression (a*)* b |
133 val EVIL = SEQ(STAR(STAR(CHAR('a'))), CHAR('b')) |
150 // val EVIL = SEQ(STAR(STAR(CHAR('a'))), CHAR('b')) |
134 |
151 |
135 //matcher(EVIL, "a" * 1000 ++ "b") // => true |
152 //println(matcher(EVIL, "a" * 1000 ++ "b")) // => true |
136 //matcher(EVIL, "a" * 1000) // => false |
153 //println(matcher(EVIL, "a" * 1000)) // => false |
137 |
154 |
138 // size without simplifications |
155 // size without simplifications |
139 //size(der('a', der('a', EVIL))) // => 28 |
156 //println(size(der('a', der('a', EVIL)))) // => 28 |
140 //size(der('a', der('a', der('a', EVIL)))) // => 58 |
157 //println(size(der('a', der('a', der('a', EVIL))))) // => 58 |
141 |
158 |
142 // size with simplification |
159 // size with simplification |
143 //size(simp(der('a', der('a', EVIL)))) // => 8 |
160 //println(simp(der('a', der('a', EVIL)))) |
144 //size(simp(der('a', der('a', der('a', EVIL))))) // => 8 |
161 //println(simp(der('a', der('a', der('a', EVIL))))) |
|
162 |
|
163 //println(size(simp(der('a', der('a', EVIL))))) // => 8 |
|
164 //println(size(simp(der('a', der('a', der('a', EVIL)))))) // => 8 |
145 |
165 |
146 // Python needs around 30 seconds for matching 28 a's with EVIL. |
166 // Python needs around 30 seconds for matching 28 a's with EVIL. |
147 // Java 9 and later increase this to an "astonishing" 40000 a's in |
167 // Java 9 and later increase this to an "astonishing" 40000 a's in |
148 // around 30 seconds. |
168 // around 30 seconds. |
149 // |
169 // |
153 |
173 |
154 def time_needed[T](i: Int, code: => T) = { |
174 def time_needed[T](i: Int, code: => T) = { |
155 val start = System.nanoTime() |
175 val start = System.nanoTime() |
156 for (j <- 1 to i) code |
176 for (j <- 1 to i) code |
157 val end = System.nanoTime() |
177 val end = System.nanoTime() |
158 (end - start)/(i * 1.0e9) |
178 "%.5f".format((end - start)/(i * 1.0e9)) |
159 } |
179 } |
160 |
180 |
161 //for (i <- 0 to 5000000 by 500000) { |
181 //for (i <- 0 to 5000000 by 500000) { |
162 // println(i + " " + "%.5f".format(time_needed(2, matcher(EVIL, "a" * i))) + " secs.") |
182 // println(s"$i ${time_needed(2, matcher(EVIL, "a" * i))} secs.") |
163 //} |
183 //} |
164 |
184 |
165 // another "power" test case |
185 // another "power" test case |
166 //simp(Iterator.iterate(ONE:Rexp)(r => SEQ(r, ONE | ONE)).drop(100).next) == ONE |
186 //simp(Iterator.iterate(ONE:Rexp)(r => SEQ(r, ONE | ONE)).drop(100).next) == ONE |
167 |
187 |
168 // the Iterator produces the rexp |
188 // the Iterator produces the rexp |
169 // |
189 // |
170 // SEQ(SEQ(SEQ(..., ONE | ONE) , ONE | ONE), ONE | ONE) |
190 // SEQ(SEQ(SEQ(..., ONE | ONE) , ONE | ONE), ONE | ONE) |
171 // |
191 // |
172 // where SEQ is nested 100 times. |
192 // where SEQ is nested 50 times. |
173 |
193 |
174 |
194 |
175 |
195 |
176 } |
196 } |