|
1 // CW 1 |
1 import scala.language.implicitConversions |
2 import scala.language.implicitConversions |
2 import scala.language.reflectiveCalls |
3 import scala.language.reflectiveCalls |
3 |
4 |
4 abstract class Rexp { |
5 abstract class Rexp |
5 def simp : Rexp = this |
6 case object ZERO extends Rexp // matches nothing |
6 } |
7 case object ONE extends Rexp // matches the empty string |
7 |
8 case class CHAR(c: Char) extends Rexp // matches a character c |
8 case object NULL extends Rexp |
9 case class ALT(r1: Rexp, r2: Rexp) extends Rexp // alternative |
9 case object EMPTY extends Rexp |
10 case class SEQ(r1: Rexp, r2: Rexp) extends Rexp // sequence |
10 case class CHAR(c: Char) extends Rexp |
11 case class STAR(r: Rexp) extends Rexp // star |
11 case class ALT(r1: Rexp, r2: Rexp) extends Rexp { |
12 case class RANGE(cs: Set[Char]) extends Rexp // set of characters |
12 override def toString = r1.toString + " | " + r2.toString |
13 case class PLUS(r: Rexp) extends Rexp // plus |
13 override def simp = (r1.simp, r2.simp) match { |
14 case class OPT(r: Rexp) extends Rexp // optional |
14 case (NULL, r) => r |
15 case class NTIMES(r: Rexp, n: Int) extends Rexp // n-times |
15 case (r, NULL) => r |
16 case class UPNTIMES(r: Rexp, n: Int) extends Rexp // up n-times |
16 case (r, EMPTY) => if (nullable(r)) r else ALT(r, EMPTY) |
17 case class FROMNTIMES(r: Rexp, n: Int) extends Rexp // from n-times |
17 case (EMPTY, r) => if (nullable(r)) r else ALT(r, EMPTY) |
18 case class NMTIMES(r: Rexp, n: Int, m: Int) extends Rexp // between nm-times |
18 case (r1, r2) => if (r1 == r2) r1 else ALT(r1, r2) |
19 case class NOT(r: Rexp) extends Rexp // not |
19 } |
20 case class CFUN(f: Char => Boolean) extends Rexp // subsuming CHAR and RANGE |
20 } |
21 |
21 case class RANGE(cs: List[Char]) extends Rexp { |
|
22 //override def toString = cs.mkString("[",",","]") |
|
23 override def toString = "[" + cs.head + " .. " + cs.last + "]" |
|
24 } |
|
25 object RANGE { |
|
26 def apply(s: String) : RANGE = RANGE(s.toList) |
|
27 } |
|
28 case class SEQ(r1: Rexp, r2: Rexp) extends Rexp { |
|
29 override def toString = r1.toString + " ~ " + r2.toString |
|
30 override def simp = (r1.simp, r2.simp) match { |
|
31 case (NULL, _) => NULL |
|
32 case (_, NULL) => NULL |
|
33 case (EMPTY, r) => r |
|
34 case (r, EMPTY) => r |
|
35 case (r1, r2) => SEQ(r1, r2) |
|
36 } |
|
37 } |
|
38 case class PLUS(r: Rexp) extends Rexp { |
|
39 override def simp = r.simp match { |
|
40 case NULL => EMPTY |
|
41 case EMPTY => EMPTY |
|
42 case r => PLUS(r) |
|
43 } |
|
44 } |
|
45 case class STAR(r: Rexp) extends Rexp { |
|
46 override def simp = r.simp match { |
|
47 case NULL => EMPTY |
|
48 case EMPTY => EMPTY |
|
49 case r => STAR(r) |
|
50 } |
|
51 } |
|
52 case class NTIMES(r: Rexp, n: Int) extends Rexp { |
|
53 override def simp = if (n == 0) EMPTY else |
|
54 r.simp match { |
|
55 case NULL => NULL |
|
56 case EMPTY => EMPTY |
|
57 case r => NTIMES(r, n) |
|
58 } |
|
59 } |
|
60 case class NUPTOM(r: Rexp, n: Int, m: Int) extends Rexp { |
|
61 override def simp = if (m == 0) EMPTY else |
|
62 r.simp match { |
|
63 case NULL => NULL |
|
64 case EMPTY => EMPTY |
|
65 case r => NUPTOM(r, n, m) |
|
66 } |
|
67 } |
|
68 def NMTIMES(r: Rexp, n: Int, m: Int) = { |
|
69 if(m < n) throw new IllegalArgumentException("the number m cannot be smaller than n.") |
|
70 else NUPTOM(r, n, m - n) |
|
71 } |
|
72 |
|
73 |
|
74 case class NOT(r: Rexp) extends Rexp { |
|
75 override def simp = NOT(r.simp) |
|
76 } |
|
77 case class OPT(r: Rexp) extends Rexp { |
|
78 override def simp = OPT(r.simp) |
|
79 } |
|
80 |
22 |
81 // nullable function: tests whether the regular |
23 // nullable function: tests whether the regular |
82 // expression can recognise the empty string |
24 // expression can recognise the empty string |
83 def nullable (r: Rexp) : Boolean = r match { |
25 def nullable (r: Rexp) : Boolean = r match { |
84 case NULL => false |
26 case ZERO => false |
85 case EMPTY => true |
27 case ONE => true |
86 case CHAR(_) => false |
28 case CHAR(_) => false |
87 case ALT(r1, r2) => nullable(r1) || nullable(r2) |
29 case ALT(r1, r2) => nullable(r1) || nullable(r2) |
88 case SEQ(r1, r2) => nullable(r1) && nullable(r2) |
30 case SEQ(r1, r2) => nullable(r1) && nullable(r2) |
89 case STAR(_) => true |
31 case STAR(_) => true |
|
32 case RANGE(_) => false |
90 case PLUS(r) => nullable(r) |
33 case PLUS(r) => nullable(r) |
91 case NTIMES(r, i) => if (i == 0) true else nullable(r) |
34 case OPT(_) => true |
92 case NUPTOM(r, i, j) => if (i == 0) true else nullable(r) |
35 case NTIMES(r, n) => if (n == 0) true else nullable(r) |
93 case RANGE(_) => false |
36 case UPNTIMES(_, _) => true |
|
37 case FROMNTIMES(r, n) => if (n == 0) true else nullable(r) |
|
38 case NMTIMES(r, n, m) => if (n == 0) true else nullable(r) |
94 case NOT(r) => !(nullable(r)) |
39 case NOT(r) => !(nullable(r)) |
95 case OPT(_) => true |
40 case CFUN(_) => false |
96 } |
|
97 |
|
98 def zeroable (r: Rexp) : Boolean = r match { |
|
99 case NULL => true |
|
100 case EMPTY => false |
|
101 case CHAR(_) => false |
|
102 case ALT(r1, r2) => zeroable(r1) && zeroable(r2) |
|
103 case SEQ(r1, r2) => zeroable(r1) || zeroable(r2) |
|
104 case STAR(_) => false |
|
105 case PLUS(r) => zeroable(r) |
|
106 case NTIMES(r, i) => if (i == 0) false else zeroable(r) |
|
107 case NUPTOM(r, i, j) => if (i == 0) false else zeroable(r) |
|
108 case RANGE(_) => false |
|
109 case NOT(r) => !(zeroable(r)) |
|
110 case OPT(_) => false |
|
111 } |
41 } |
112 |
42 |
113 // derivative of a regular expression w.r.t. a character |
43 // derivative of a regular expression w.r.t. a character |
114 def der (c: Char, r: Rexp) : Rexp = r match { |
44 def der (c: Char, r: Rexp) : Rexp = r match { |
115 case NULL => NULL |
45 case ZERO => ZERO |
116 case EMPTY => NULL |
46 case ONE => ZERO |
117 case CHAR(d) => if (c == d) EMPTY else NULL |
47 case CHAR(d) => if (c == d) ONE else ZERO |
118 case ALT(r1, r2) => ALT(der(c, r1), der(c, r2)) |
48 case ALT(r1, r2) => ALT(der(c, r1), der(c, r2)) |
119 case SEQ(r1, r2) => |
49 case SEQ(r1, r2) => |
120 if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2)) |
50 if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2)) |
121 else SEQ(der(c, r1), r2) |
51 else SEQ(der(c, r1), r2) |
122 case STAR(r) => SEQ(der(c, r), STAR(r)) |
52 case STAR(r) => SEQ(der(c, r), STAR(r)) |
|
53 case RANGE(cs) => if (cs contains c) ONE else ZERO |
123 case PLUS(r) => SEQ(der(c, r), STAR(r)) |
54 case PLUS(r) => SEQ(der(c, r), STAR(r)) |
124 case NTIMES(r, i) => |
55 case OPT(r) => der(c, r) |
125 if (i == 0) NULL else der(c, SEQ(r, NTIMES(r, i - 1))) |
56 case NTIMES(r, n) => |
126 case NUPTOM(r, i, j) => |
57 if (n == 0) ZERO else SEQ(der(c, r), NTIMES(r, n - 1)) |
127 if (i == 0 && j == 0) NULL else |
58 case UPNTIMES(r, n) => |
128 if (i == 0) ALT(der(c, NTIMES(r, j)), der(c, NUPTOM(r, 0, j - 1))) |
59 if (n == 0) ZERO else SEQ(der(c, r), UPNTIMES(r, n - 1)) |
129 else der(c, SEQ(r, NUPTOM(r, i - 1, j))) |
60 case FROMNTIMES(r, n) => |
130 case RANGE(cs) => if (cs contains c) EMPTY else NULL |
61 if (n == 0) ZERO else SEQ(der(c, r), FROMNTIMES(r, n - 1)) |
|
62 case NMTIMES(r, n, m) => |
|
63 if (m < n) ZERO else |
|
64 if (n == 0 && m == 0) ZERO else |
|
65 if (n == 0) SEQ(der(c, r), UPNTIMES(r, m - 1)) |
|
66 else SEQ(der(c, r), NMTIMES(r, n - 1, m - 1)) |
131 case NOT(r) => NOT(der (c, r)) |
67 case NOT(r) => NOT(der (c, r)) |
132 case OPT(r) => der(c, r) |
68 case CFUN(f) => if (f(c)) ONE else ZERO |
133 } |
69 } |
|
70 |
|
71 def simp(r: Rexp) : Rexp = r match { |
|
72 case ALT(r1, r2) => (simp(r1), simp(r2)) match { |
|
73 case (ZERO, r2s) => r2s |
|
74 case (r1s, ZERO) => r1s |
|
75 case (r1s, r2s) => if (r1s == r2s) r1s else ALT (r1s, r2s) |
|
76 } |
|
77 case SEQ(r1, r2) => (simp(r1), simp(r2)) match { |
|
78 case (ZERO, _) => ZERO |
|
79 case (_, ZERO) => ZERO |
|
80 case (ONE, r2s) => r2s |
|
81 case (r1s, ONE) => r1s |
|
82 case (r1s, r2s) => SEQ(r1s, r2s) |
|
83 } |
|
84 case r => r |
|
85 } |
|
86 |
134 |
87 |
135 // derivative w.r.t. a string (iterates der) |
88 // derivative w.r.t. a string (iterates der) |
136 def ders (s: List[Char], r: Rexp) : Rexp = s match { |
89 def ders (s: List[Char], r: Rexp) : Rexp = s match { |
137 case Nil => r |
90 case Nil => r |
138 case c::s => ders(s, der(c, r)) |
91 case c::s => ders(s, der(c, r)) |
139 } |
92 } |
140 |
93 |
141 def ders_simp (s: List[Char], r: Rexp) : Rexp = s match { |
94 def ders_simp (s: List[Char], r: Rexp) : Rexp = s match { |
142 case Nil => r |
95 case Nil => r |
143 case c::s => ders_simp(s, der(c, r).simp) |
96 case c::s => ders_simp(s, simp(der(c, r))) |
144 } |
97 } |
145 |
98 |
146 // main matcher function |
99 // main matcher function including simplification |
147 def matcher(r: Rexp, s: String) : Boolean = nullable(ders_simp(s.toList, r)) |
100 def matcher(r: Rexp, s: String) : Boolean = |
|
101 nullable(ders_simp(s.toList, r)) |
|
102 |
|
103 |
148 |
104 |
149 // some convenience for typing in regular expressions |
105 // some convenience for typing in regular expressions |
150 def charlist2rexp(s : List[Char]) : Rexp = s match { |
106 def charlist2rexp(s : List[Char]) : Rexp = s match { |
151 case Nil => EMPTY |
107 case Nil => ONE |
152 case c::Nil => CHAR(c) |
108 case c::Nil => CHAR(c) |
153 case c::s => SEQ(CHAR(c), charlist2rexp(s)) |
109 case c::s => SEQ(CHAR(c), charlist2rexp(s)) |
154 } |
110 } |
155 implicit def string2rexp(s : String) : Rexp = charlist2rexp(s.toList) |
111 implicit def string2rexp(s : String) : Rexp = charlist2rexp(s.toList) |
156 |
112 |
166 def % = STAR(s) |
122 def % = STAR(s) |
167 def ~ (r: Rexp) = SEQ(s, r) |
123 def ~ (r: Rexp) = SEQ(s, r) |
168 def ~ (r: String) = SEQ(s, r) |
124 def ~ (r: String) = SEQ(s, r) |
169 } |
125 } |
170 |
126 |
|
127 val r1 = NTIMES("a", 3) |
|
128 val r2 = NTIMES(OPT("a"), 3) |
|
129 val r3 = UPNTIMES("a", 3) |
|
130 val r4 = UPNTIMES(OPT("a"), 3) |
|
131 val r5 = NMTIMES("a", 3, 5) |
|
132 val r6 = NMTIMES(OPT("a"), 3, 5) |
|
133 |
|
134 val rs = List(r1, r2, r3, r4, r5, r6) |
|
135 |
|
136 rs.map(matcher(_, "")) |
|
137 rs.map(matcher(_, "a")) |
|
138 rs.map(matcher(_, "aa")) |
|
139 rs.map(matcher(_, "aaa")) |
|
140 rs.map(matcher(_, "aaaa")) |
|
141 rs.map(matcher(_, "aaaaa")) |
|
142 rs.map(matcher(_, "aaaaaa")) |
|
143 |
171 println("EMAIL:") |
144 println("EMAIL:") |
172 val LOWERCASE = "abcdefghijklmnopqrstuvwxyz" |
145 val LOWERCASE = ('a' to 'z').toSet |
173 val DIGITS = "0123456789" |
146 val DIGITS = ('0' to '9').toSet |
174 val EMAIL = PLUS(RANGE(LOWERCASE + DIGITS + "_" + "." + "-")) ~ "@" ~ |
147 val EMAIL = { PLUS(CFUN(LOWERCASE | DIGITS | ("_.-").toSet)) ~ "@" ~ |
175 PLUS(RANGE(LOWERCASE + DIGITS + "." + "-")) ~ "." ~ |
148 PLUS(CFUN(LOWERCASE | DIGITS | (".-").toSet)) ~ "." ~ |
176 NMTIMES(RANGE(LOWERCASE + "."), 2, 6) |
149 NMTIMES(CFUN(LOWERCASE | Set('.')), 2, 6) } |
177 |
150 |
178 val my_email = "christian.urban@kcl.ac.uk" |
151 val my_email = "christian.urban@kcl.ac.uk" |
179 |
152 |
180 println(EMAIL); |
153 println(EMAIL); |
181 println(matcher(EMAIL, my_email)) |
154 println(matcher(EMAIL, my_email)) |
182 println(ders_simp(my_email.toList, EMAIL)) |
155 println(ders_simp(my_email.toList, EMAIL)) |
183 |
156 |
184 println("COMMENTS:") |
157 println("COMMENTS:") |
185 val ALL = RANGE(LOWERCASE + " ") |
158 val ALL = CFUN((c:Char) => true) |
186 val COMMENT = "/*" ~ (NOT(ALL.% ~ "*/" ~ ALL.%)) ~ "*/" |
159 val COMMENT = "/*" ~ (NOT(ALL.% ~ "*/" ~ ALL.%)) ~ "*/" |
187 |
160 |
188 println(matcher(COMMENT, "/**/")) |
161 println(matcher(COMMENT, "/**/")) |
189 println(matcher(COMMENT, "/*foobar_comment*/")) |
162 println(matcher(COMMENT, "/*foobar_comment*/")) |
190 println(matcher(COMMENT, "/*test*/test*/")) |
163 println(matcher(COMMENT, "/*test*/test*/")) |