257
|
1 |
% !TEX program = xelatex
|
6
|
2 |
\documentclass{article}
|
426
|
3 |
\usepackage{../styles/style}
|
|
4 |
\usepackage{../styles/langs}
|
218
|
5 |
\usepackage{disclaimer}
|
|
6 |
\usepackage{tikz}
|
|
7 |
\usepackage{pgf}
|
|
8 |
\usepackage{pgfplots}
|
|
9 |
\usepackage{stackengine}
|
|
10 |
%% \usepackage{accents}
|
|
11 |
\newcommand\barbelow[1]{\stackunder[1.2pt]{#1}{\raisebox{-4mm}{\boldmath$\uparrow$}}}
|
|
12 |
|
|
13 |
\begin{filecontents}{re-python2.data}
|
|
14 |
1 0.033
|
|
15 |
5 0.036
|
|
16 |
10 0.034
|
|
17 |
15 0.036
|
|
18 |
18 0.059
|
|
19 |
19 0.084
|
|
20 |
20 0.141
|
|
21 |
21 0.248
|
|
22 |
22 0.485
|
|
23 |
23 0.878
|
|
24 |
24 1.71
|
|
25 |
25 3.40
|
|
26 |
26 7.08
|
|
27 |
27 14.12
|
|
28 |
28 26.69
|
|
29 |
\end{filecontents}
|
|
30 |
|
|
31 |
\begin{filecontents}{re-java.data}
|
|
32 |
5 0.00298
|
|
33 |
10 0.00418
|
|
34 |
15 0.00996
|
|
35 |
16 0.01710
|
|
36 |
17 0.03492
|
|
37 |
18 0.03303
|
|
38 |
19 0.05084
|
|
39 |
20 0.10177
|
|
40 |
21 0.19960
|
|
41 |
22 0.41159
|
|
42 |
23 0.82234
|
|
43 |
24 1.70251
|
|
44 |
25 3.36112
|
|
45 |
26 6.63998
|
|
46 |
27 13.35120
|
|
47 |
28 29.81185
|
|
48 |
\end{filecontents}
|
|
49 |
|
221
|
50 |
\begin{filecontents}{re-js.data}
|
|
51 |
5 0.061
|
|
52 |
10 0.061
|
|
53 |
15 0.061
|
|
54 |
20 0.070
|
|
55 |
23 0.131
|
|
56 |
25 0.308
|
|
57 |
26 0.564
|
|
58 |
28 1.994
|
|
59 |
30 7.648
|
|
60 |
31 15.881
|
|
61 |
32 32.190
|
|
62 |
\end{filecontents}
|
|
63 |
|
218
|
64 |
\begin{filecontents}{re-java9.data}
|
|
65 |
1000 0.01410
|
|
66 |
2000 0.04882
|
|
67 |
3000 0.10609
|
|
68 |
4000 0.17456
|
|
69 |
5000 0.27530
|
|
70 |
6000 0.41116
|
|
71 |
7000 0.53741
|
|
72 |
8000 0.70261
|
|
73 |
9000 0.93981
|
|
74 |
10000 0.97419
|
|
75 |
11000 1.28697
|
|
76 |
12000 1.51387
|
|
77 |
14000 2.07079
|
|
78 |
16000 2.69846
|
|
79 |
20000 4.41823
|
|
80 |
24000 6.46077
|
|
81 |
26000 7.64373
|
|
82 |
30000 9.99446
|
|
83 |
34000 12.966885
|
|
84 |
38000 16.281621
|
|
85 |
42000 19.180228
|
|
86 |
46000 21.984721
|
|
87 |
50000 26.950203
|
|
88 |
60000 43.0327746
|
|
89 |
\end{filecontents}
|
|
90 |
|
351
|
91 |
\begin{filecontents}{re-swift.data}
|
|
92 |
5 0.001
|
|
93 |
10 0.001
|
|
94 |
15 0.009
|
|
95 |
20 0.178
|
|
96 |
23 1.399
|
|
97 |
24 2.893
|
|
98 |
25 5.671
|
|
99 |
26 11.357
|
|
100 |
27 22.430
|
|
101 |
\end{filecontents}
|
|
102 |
|
|
103 |
\begin{filecontents}{re-dart.data}
|
|
104 |
20 0.042
|
|
105 |
21 0.084
|
|
106 |
22 0.190
|
|
107 |
23 0.340
|
|
108 |
24 0.678
|
|
109 |
25 1.369
|
|
110 |
26 2.700
|
|
111 |
27 5.462
|
|
112 |
28 10.908
|
|
113 |
29 21.725
|
|
114 |
30 43.492
|
|
115 |
\end{filecontents}
|
6
|
116 |
|
|
117 |
\begin{document}
|
|
118 |
|
218
|
119 |
% BF IDE
|
|
120 |
% https://www.microsoft.com/en-us/p/brainf-ck/9nblgggzhvq5
|
|
121 |
|
428
|
122 |
\section*{Main Part 3 (Scala, 7 Marks)}
|
6
|
123 |
|
425
|
124 |
\mbox{}\hfill\textit{``Java is the most distressing thing to happen to computing since MS-DOS.''}\smallskip\\
|
|
125 |
\mbox{}\hfill\textit{ --- Alan Kay, the inventor of object-oriented programming}\bigskip\medskip
|
275
|
126 |
|
|
127 |
\noindent
|
351
|
128 |
This part is about a regular expression matcher described by
|
415
|
129 |
Brzozowski in 1964. The
|
351
|
130 |
background is that ``out-of-the-box'' regular expression matching in
|
|
131 |
mainstream languages like Java, JavaScript and Python can sometimes be
|
|
132 |
excruciatingly slow. You are supposed to implement a regular
|
|
133 |
expression matcher that is much, much faster. \bigskip
|
218
|
134 |
|
351
|
135 |
\IMPORTANTNONE{}
|
62
|
136 |
|
|
137 |
\noindent
|
218
|
138 |
Also note that the running time of each part will be restricted to a
|
257
|
139 |
maximum of 30 seconds on my laptop.
|
218
|
140 |
|
|
141 |
\DISCLAIMER{}
|
86
|
142 |
|
221
|
143 |
\subsection*{Reference Implementation}
|
|
144 |
|
351
|
145 |
This Scala assignment comes with a reference implementation in form of
|
|
146 |
a \texttt{jar}-file. This allows you to run any test cases on your own
|
475
|
147 |
computer. For example you can call \texttt{scala-cli} on the command
|
|
148 |
line with the option \texttt{--extra-jars re.jar} and then query any function
|
|
149 |
from the \texttt{re.scala} template file. As usual you have to prefix
|
|
150 |
the calls with \texttt{M3} or import this object. Since some tasks
|
351
|
151 |
are time sensitive, you can check the reference implementation as
|
|
152 |
follows: if you want to know, for example, how long it takes to match
|
|
153 |
strings of $a$'s using the regular expression $(a^*)^*\cdot b$ you can
|
|
154 |
query as follows:
|
221
|
155 |
|
|
156 |
|
245
|
157 |
\begin{lstlisting}[xleftmargin=1mm,numbers=none,basicstyle=\ttfamily\small]
|
475
|
158 |
$ scala-cli --extra-jars re.jar
|
396
|
159 |
scala> import M3._
|
221
|
160 |
scala> for (i <- 0 to 5000000 by 500000) {
|
481
|
161 |
println(s"$i: ${time_needed(2, matcher(EVIL, "a" * i))}")
|
|
162 |
}
|
292
|
163 |
0: 0.00002 secs.
|
|
164 |
500000: 0.10608 secs.
|
|
165 |
1000000: 0.22286 secs.
|
|
166 |
1500000: 0.35982 secs.
|
|
167 |
2000000: 0.45828 secs.
|
|
168 |
2500000: 0.59558 secs.
|
|
169 |
3000000: 0.73191 secs.
|
|
170 |
3500000: 0.83499 secs.
|
|
171 |
4000000: 0.99149 secs.
|
|
172 |
4500000: 1.15395 secs.
|
|
173 |
5000000: 1.29659 secs.
|
221
|
174 |
\end{lstlisting}%$
|
|
175 |
|
481
|
176 |
\noindent
|
|
177 |
For this you need to copy the \texttt{time\_needed} function and the \texttt{EVIL} regular
|
|
178 |
expression from the comments given in \texttt{re.scala}.
|
424
|
179 |
|
351
|
180 |
\subsection*{Preliminaries}
|
218
|
181 |
|
|
182 |
The task is to implement a regular expression matcher that is based on
|
|
183 |
derivatives of regular expressions. Most of the functions are defined by
|
|
184 |
recursion over regular expressions and can be elegantly implemented
|
|
185 |
using Scala's pattern-matching. The implementation should deal with the
|
|
186 |
following regular expressions, which have been predefined in the file
|
|
187 |
\texttt{re.scala}:
|
6
|
188 |
|
218
|
189 |
\begin{center}
|
|
190 |
\begin{tabular}{lcll}
|
|
191 |
$r$ & $::=$ & $\ZERO$ & cannot match anything\\
|
|
192 |
& $|$ & $\ONE$ & can only match the empty string\\
|
|
193 |
& $|$ & $c$ & can match a single character (in this case $c$)\\
|
|
194 |
& $|$ & $r_1 + r_2$ & can match a string either with $r_1$ or with $r_2$\\
|
|
195 |
& $|$ & $r_1\cdot r_2$ & can match the first part of a string with $r_1$ and\\
|
|
196 |
& & & then the second part with $r_2$\\
|
221
|
197 |
& $|$ & $r^*$ & can match a string with zero or more copies of $r$\\
|
218
|
198 |
\end{tabular}
|
|
199 |
\end{center}
|
6
|
200 |
|
218
|
201 |
\noindent
|
221
|
202 |
Why? Regular expressions are
|
|
203 |
one of the simplest ways to match patterns in text, and
|
218
|
204 |
are endlessly useful for searching, editing and analysing data in all
|
|
205 |
sorts of places (for example analysing network traffic in order to
|
|
206 |
detect security breaches). However, you need to be fast, otherwise you
|
|
207 |
will stumble over problems such as recently reported at
|
|
208 |
|
|
209 |
{\small
|
|
210 |
\begin{itemize}
|
289
|
211 |
\item[$\bullet$] \url{https://blog.cloudflare.com/details-of-the-cloudflare-outage-on-july-2-2019}
|
428
|
212 |
\item[$\bullet$] \texttt{\href{https://web.archive.org/web/20160801163029/https://www.stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}{https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}}
|
218
|
213 |
\item[$\bullet$] \url{https://vimeo.com/112065252}
|
289
|
214 |
\item[$\bullet$] \url{https://davidvgalbraith.com/how-i-fixed-atom}
|
218
|
215 |
\end{itemize}}
|
|
216 |
|
221
|
217 |
% Knowing how to match regular expressions and strings will let you
|
|
218 |
% solve a lot of problems that vex other humans.
|
|
219 |
|
|
220 |
|
218
|
221 |
\subsubsection*{Tasks (file re.scala)}
|
6
|
222 |
|
218
|
223 |
The file \texttt{re.scala} has already a definition for regular
|
428
|
224 |
expressions and also defines some handy shorthand notation for regular
|
|
225 |
expressions. The notation in this coursework description matches up
|
|
226 |
with the code as follows:
|
218
|
227 |
|
|
228 |
\begin{center}
|
|
229 |
\begin{tabular}{rcl@{\hspace{10mm}}l}
|
|
230 |
& & code: & shorthand:\smallskip \\
|
|
231 |
$\ZERO$ & $\mapsto$ & \texttt{ZERO}\\
|
|
232 |
$\ONE$ & $\mapsto$ & \texttt{ONE}\\
|
|
233 |
$c$ & $\mapsto$ & \texttt{CHAR(c)}\\
|
396
|
234 |
$\sum rs$ & $\mapsto$ & \texttt{ALTs(rs)}\\
|
218
|
235 |
$r_1 + r_2$ & $\mapsto$ & \texttt{ALT(r1, r2)} & \texttt{r1 | r2}\\
|
428
|
236 |
$\prod rs$ & $\mapsto$ & \texttt{SEQs(rs)}\\
|
218
|
237 |
$r_1 \cdot r_2$ & $\mapsto$ & \texttt{SEQ(r1, r2)} & \texttt{r1 $\sim$ r2}\\
|
|
238 |
$r^*$ & $\mapsto$ & \texttt{STAR(r)} & \texttt{r.\%}
|
|
239 |
\end{tabular}
|
|
240 |
\end{center}
|
|
241 |
|
396
|
242 |
\noindent
|
444
|
243 |
The alternative regular expression comes in two versions: one is
|
396
|
244 |
binary (+ / \texttt{ALT}) and the other is n-ary ($\sum$ /
|
|
245 |
\texttt{ALTs}). The latter takes a list of regular expressions as
|
424
|
246 |
argument. In what follows we shall use $rs$ to stand for lists of
|
|
247 |
regular expressions. When the list is empty, we shall write $\sum\,[]$;
|
|
248 |
if it is non-empty, we sometimes write $\sum\,[r_1,..., r_n]$.
|
|
249 |
The binary alternative can be seen as an abbreviation,
|
396
|
250 |
that is $r_1 + r_2 \dn \sum\,[r_1, r_2]$. As a result we can ignore the
|
428
|
251 |
binary version and only implement the n-ary alternative. Similarly
|
|
252 |
the sequence regular expression is only implemented with lists and the
|
|
253 |
binary version can be obtained by defining $r_1 \cdot r_2 \dn \prod\,[r_1, r_2]$.
|
105
Christian Urban <christian dot urban at kcl dot ac dot uk>
diff
changeset
|
254 |
|
Christian Urban <christian dot urban at kcl dot ac dot uk>
diff
changeset
|
255 |
\begin{itemize}
|
351
|
256 |
\item[(1)] Implement a function, called \textit{nullable}, by
|
218
|
257 |
recursion over regular expressions. This function tests whether a
|
|
258 |
regular expression can match the empty string. This means given a
|
428
|
259 |
regular expression, it either returns true or false. The function
|
218
|
260 |
\textit{nullable}
|
|
261 |
is defined as follows:
|
|
262 |
|
|
263 |
\begin{center}
|
|
264 |
\begin{tabular}{lcl}
|
|
265 |
$\textit{nullable}(\ZERO)$ & $\dn$ & $\textit{false}$\\
|
|
266 |
$\textit{nullable}(\ONE)$ & $\dn$ & $\textit{true}$\\
|
|
267 |
$\textit{nullable}(c)$ & $\dn$ & $\textit{false}$\\
|
396
|
268 |
$\textit{nullable}(\sum rs)$ & $\dn$ & $\exists r \in rs.\;\textit{nullable}(r)$\\
|
428
|
269 |
$\textit{nullable}(\prod rs)$ & $\dn$ & $\forall r\in rs.\;\textit{nullable}(r)$\\
|
218
|
270 |
$\textit{nullable}(r^*)$ & $\dn$ & $\textit{true}$\\
|
|
271 |
\end{tabular}
|
396
|
272 |
\end{center}~\hfill[0.5 Marks]
|
105
Christian Urban <christian dot urban at kcl dot ac dot uk>
diff
changeset
|
273 |
|
351
|
274 |
\item[(2)] Implement a function, called \textit{der}, by recursion over
|
218
|
275 |
regular expressions. It takes a character and a regular expression
|
424
|
276 |
as arguments and calculates the \emph{derivative} of a regular expression according
|
218
|
277 |
to the rules:
|
105
Christian Urban <christian dot urban at kcl dot ac dot uk>
diff
changeset
|
278 |
|
218
|
279 |
\begin{center}
|
|
280 |
\begin{tabular}{lcl}
|
|
281 |
$\textit{der}\;c\;(\ZERO)$ & $\dn$ & $\ZERO$\\
|
|
282 |
$\textit{der}\;c\;(\ONE)$ & $\dn$ & $\ZERO$\\
|
|
283 |
$\textit{der}\;c\;(d)$ & $\dn$ & $\textit{if}\; c = d\;\textit{then} \;\ONE \; \textit{else} \;\ZERO$\\
|
396
|
284 |
$\textit{der}\;c\;(\sum\;[r_1,..,r_n])$ & $\dn$ & $\sum\;[\textit{der}\;c\;r_1,..,\textit{der}\;c\;r_n]$\\
|
428
|
285 |
$\textit{der}\;c\;(\prod\;[])$ & $\dn$ & $\ZERO$\\
|
|
286 |
$\textit{der}\;c\;(\prod\;r\!::\!rs)$ & $\dn$ & $\textit{if}\;\textit{nullable}(r)$\\
|
|
287 |
& & $\textit{then}\;(\prod\;(\textit{der}\;c\;r)\!::\!rs) + (\textit{der}\;c\;(\prod rs))$\\
|
|
288 |
& & $\textit{else}\;(\prod\;(\textit{der}\;c\;r)\!::\! rs)$\\
|
218
|
289 |
$\textit{der}\;c\;(r^*)$ & $\dn$ & $(\textit{der}\;c\;r)\cdot (r^*)$\\
|
|
290 |
\end{tabular}
|
|
291 |
\end{center}
|
|
292 |
\mbox{}\hfill\mbox{[1 Mark]}
|
|
293 |
|
396
|
294 |
\item[(3)] We next want to simplify regular expressions: essentially
|
|
295 |
we want to remove $\ZERO$ in regular expressions like $r + \ZERO$
|
|
296 |
and $\ZERO + r$. However, our n-ary alternative takes a list of
|
428
|
297 |
regular expressions as argument, and we therefore need a more general
|
|
298 |
``denesting'' function, which deletes $\ZERO$s and ``spills out'' nested
|
|
299 |
$\sum$s. This function, called \texttt{denest}, should analyse a
|
|
300 |
list of regular expressions, say $rs$, as follows:
|
396
|
301 |
|
|
302 |
\begin{center}
|
|
303 |
\begin{tabular}{lllll}
|
|
304 |
1) &$rs = []$ & $\dn$ & $[]$ & (empty list)\\
|
428
|
305 |
2) &$rs = \ZERO :: rest$ & $\dn$ & $\texttt{denest}\;rest$ & (throw away $\ZERO$)\\
|
|
306 |
3) &$rs = (\sum rs) :: rest$ & $\dn$ & $rs ::: \texttt{denest}\;rest$ & (spill out $\sum$)\\
|
|
307 |
4) &$rs = r :: rest$ & $\dn$ & $r :: \texttt{denest}\;rest$ & (otherwise)\\
|
396
|
308 |
\end{tabular}
|
|
309 |
\end{center}
|
|
310 |
|
424
|
311 |
The first clause states that empty lists cannot be further
|
428
|
312 |
denested. The second removes the first $\ZERO$ from the list and recurses.
|
396
|
313 |
The third is when the first regular expression is an \texttt{ALTs}, then
|
|
314 |
the content of this alternative should be spilled out and appended
|
428
|
315 |
with the denested rest of the list. The last case is for all other
|
396
|
316 |
cases where the head of the list is not $\ZERO$ and not an \texttt{ALTs},
|
428
|
317 |
then we just keep the head of the list and denest the rest.\\
|
396
|
318 |
\mbox{}\hfill\mbox{[1 Mark]}
|
|
319 |
|
428
|
320 |
\item[(4)] Implement the function \texttt{flts} which flattens our
|
|
321 |
n-ary sequence regular expression $\prod$. Like \texttt{denest}, this
|
|
322 |
function is intended to delete $\ONE$s and spill out nested $\prod$s.
|
|
323 |
Unfortunately, there is a special case to do with $\ZERO$: If this function encounters a $\ZERO$, then
|
|
324 |
the whole ``product'' should be $\ZERO$. The problem is that the $\ZERO$ can be anywhere
|
|
325 |
inside the list. The easiest way to implement this function is therefore by using an
|
|
326 |
accumulator, which when called is set to \texttt{Nil}. This means \textit{flts} takes
|
|
327 |
two arguments (which are both lists of regular expressions)
|
|
328 |
|
|
329 |
\[
|
|
330 |
\texttt{flts}\;rs\;acc
|
|
331 |
\]
|
|
332 |
|
|
333 |
This function analyses the list $rs$ as follows
|
|
334 |
|
|
335 |
\begin{center}
|
|
336 |
\begin{tabular}{@{}lllll@{}}
|
|
337 |
1) &$rs = []$ & $\dn$ & $acc$ & (empty list)\\
|
|
338 |
2) &$rs = \ZERO :: rest$ & $\dn$ & $[\ZERO]$ & (special case for $\ZERO$)\\
|
|
339 |
3) &$rs = \ONE :: rest$ & $\dn$ & $\texttt{flts}\,rest\,acc$ & (throw away $\ONE$)\\
|
|
340 |
4) &$rs = (\prod rs) :: rest$ & $\dn$ & $\texttt{flts}\;rest\,(acc ::: rs)$ & (spill out $\prod$)\\
|
|
341 |
5) &$rs = r :: rest$ & $\dn$ & $\texttt{flts}\;rest\,(acc ::: [r])$& (otherwise)\\
|
|
342 |
\end{tabular}
|
|
343 |
\end{center}
|
|
344 |
|
|
345 |
In the first case we just return whatever has accumulated in
|
|
346 |
$acc$. In the fourth case we spill out the $rs$ by appending the
|
|
347 |
$rs$ to the end of the accumulator. Similarly in the last case we
|
|
348 |
append the single regular expression $r$ to the end of the
|
475
|
349 |
accumulator. I let you think why you have to add it to the end. \mbox{}\hfill\mbox{[1 Mark]}
|
428
|
350 |
|
|
351 |
\item[(5)] Before we can simplify regular expressions, we need what is often called
|
|
352 |
\emph{smart constructors} for $\sum$ and $\prod$. While the ``normal'' constructors
|
|
353 |
\texttt{ALTs} and \texttt{SEQs} give us alternatives and sequences, respectively, \emph{smart}
|
|
354 |
constructors might return something different depending on what list of regular expressions
|
|
355 |
they are given as argument.
|
|
356 |
|
|
357 |
\begin{center}
|
|
358 |
\begin{tabular}{@{}c@{\hspace{9mm}}c@{}}
|
|
359 |
\begin{tabular}{l@{\hspace{2mm}}l@{\hspace{1mm}}ll}
|
|
360 |
& $\sum^{smart}$\smallskip\\
|
|
361 |
1) & $rs = []$ & $\dn$ & $\ZERO$\\
|
|
362 |
2) & $rs = [r]$ & $\dn$ & $r$\\ \\
|
|
363 |
3) & otherwise & $\dn$ & $\sum\,rs$
|
|
364 |
\end{tabular} &
|
|
365 |
\begin{tabular}{l@{\hspace{2mm}}l@{\hspace{1mm}}ll}
|
|
366 |
& $\prod^{smart}$\smallskip\\
|
|
367 |
1) & $rs = []$ & $\dn$ & $\ONE$\\
|
|
368 |
2a) & $rs = [\ZERO]$ & $\dn$ & $\ZERO$\\
|
|
369 |
2b) & $rs = [r]$ & $\dn$ & $r$\\
|
|
370 |
3) & otherwise & $\dn$ & $\prod\,rs$
|
|
371 |
\end{tabular}
|
|
372 |
\end{tabular}
|
|
373 |
\end{center}
|
|
374 |
\mbox{}\hfill\mbox{[0.5 Marks]}
|
|
375 |
|
|
376 |
\item[(6)] Implement the function \textit{simp}, which recursively
|
224
|
377 |
traverses a regular expression, and on the way up simplifies every
|
|
378 |
regular expression on the left (see below) to the regular expression
|
|
379 |
on the right, except it does not simplify inside ${}^*$-regular
|
428
|
380 |
expressions and also does not simplify $\ZERO$, $\ONE$ and characters.
|
105
Christian Urban <christian dot urban at kcl dot ac dot uk>
diff
changeset
|
381 |
|
Christian Urban <christian dot urban at kcl dot ac dot uk>
diff
changeset
|
382 |
\begin{center}
|
428
|
383 |
\begin{tabular}{@{}l@{\hspace{3mm}}c@{\hspace{3mm}}ll@{}}
|
|
384 |
LHS: & & RHS:\smallskip\\
|
|
385 |
$\sum\;[r_1,..,r_n]$ & $\mapsto$ & $\sum^{smart}\;(\texttt{(denest + distinct)}[simp(r_1),..,simp(r_n)])$\\
|
|
386 |
$\prod\;[r_1,..,r_n]$ & $\mapsto$ & $\prod^{smart}\;(\texttt{(flts)}[simp(r_1),..,simp(r_n)])$\\
|
|
387 |
$r$ & $\mapsto$ & $r$ \quad (all other cases)
|
218
|
388 |
\end{tabular}
|
396
|
389 |
\end{center}
|
|
390 |
|
428
|
391 |
The first case is as follows: first apply $simp$ to all regular
|
|
392 |
expressions $r_1,.. ,r_n$; then denest the resulting list using
|
|
393 |
\texttt{denest}; after that remove all duplicates in this list (this can be
|
418
|
394 |
done in Scala using the function
|
428
|
395 |
\texttt{\_.distinct}). Finally, you end up with a list of (simplified)
|
|
396 |
regular expressions; apply the smart constructor $\sum^{smart}$ to this list.
|
|
397 |
Similarly in the $\prod$ case: simplify first all regular
|
|
398 |
expressions $r_1,.. ,r_n$; then flatten the resulting list using \texttt{flts} and apply the
|
|
399 |
smart constructor $\prod^{smart}$ to the result. In all other cases, just return the
|
|
400 |
input $r$ as is.
|
418
|
401 |
|
105
Christian Urban <christian dot urban at kcl dot ac dot uk>
diff
changeset
|
402 |
|
218
|
403 |
For example the regular expression
|
|
404 |
\[(r_1 + \ZERO) \cdot \ONE + ((\ONE + r_2) + r_3) \cdot (r_4 \cdot \ZERO)\]
|
|
405 |
|
428
|
406 |
simplifies to just $r_1$. \mbox{}\hfill\mbox{[1 Mark]}
|
218
|
407 |
|
428
|
408 |
\item[(7)] Implement two functions: The first, called \textit{ders},
|
218
|
409 |
takes a list of characters and a regular expression as arguments, and
|
|
410 |
builds the derivative w.r.t.~the list as follows:
|
|
411 |
|
|
412 |
\begin{center}
|
|
413 |
\begin{tabular}{lcl}
|
|
414 |
$\textit{ders}\;(Nil)\;r$ & $\dn$ & $r$\\
|
|
415 |
$\textit{ders}\;(c::cs)\;r$ & $\dn$ &
|
428
|
416 |
$\textit{ders}\;cs\;(\textit{simp}\,(\textit{der}\;c\;r))$\\
|
218
|
417 |
\end{tabular}
|
|
418 |
\end{center}
|
|
419 |
|
|
420 |
Note that this function is different from \textit{der}, which only
|
|
421 |
takes a single character.
|
|
422 |
|
|
423 |
The second function, called \textit{matcher}, takes a string and a
|
|
424 |
regular expression as arguments. It builds first the derivatives
|
|
425 |
according to \textit{ders} and after that tests whether the resulting
|
|
426 |
derivative regular expression can match the empty string (using
|
|
427 |
\textit{nullable}). For example the \textit{matcher} will produce
|
|
428 |
true for the regular expression $(a\cdot b)\cdot c$ and the string
|
428
|
429 |
$abc$, but false if you give it the string $ab$. \hfill[0.5 Mark]
|
218
|
430 |
|
428
|
431 |
\item[(8)] Implement a function, called \textit{size}, by recursion
|
218
|
432 |
over regular expressions. If a regular expression is seen as a tree,
|
|
433 |
then \textit{size} should return the number of nodes in such a
|
|
434 |
tree. Therefore this function is defined as follows:
|
|
435 |
|
|
436 |
\begin{center}
|
|
437 |
\begin{tabular}{lcl}
|
|
438 |
$\textit{size}(\ZERO)$ & $\dn$ & $1$\\
|
|
439 |
$\textit{size}(\ONE)$ & $\dn$ & $1$\\
|
|
440 |
$\textit{size}(c)$ & $\dn$ & $1$\\
|
396
|
441 |
$\textit{size}(\sum\,[r_1,..,r_n]$ & $\dn$ & $1 + \textit{size}(r_1) + ... + \textit{size}(r_n)$\\
|
428
|
442 |
$\textit{size}(\prod\,[r_1,..,r_n]$ & $\dn$ & $1 + \textit{size}(r_1) + ... + \textit{size}(r_n)$\\
|
218
|
443 |
$\textit{size}(r^*)$ & $\dn$ & $1 + \textit{size}(r)$\\
|
|
444 |
\end{tabular}
|
|
445 |
\end{center}
|
|
446 |
|
224
|
447 |
You can use \textit{size} in order to test how much the ``evil'' regular
|
218
|
448 |
expression $(a^*)^* \cdot b$ grows when taking successive derivatives
|
|
449 |
according the letter $a$ without simplification and then compare it to
|
|
450 |
taking the derivative, but simplify the result. The sizes
|
396
|
451 |
are given in \texttt{re.scala}. \hfill[0.5 Marks]
|
221
|
452 |
|
428
|
453 |
\item[(9)] You do not have to implement anything specific under this
|
221
|
454 |
task. The purpose here is that you will be marked for some ``power''
|
|
455 |
test cases. For example can your matcher decide within 30 seconds
|
|
456 |
whether the regular expression $(a^*)^*\cdot b$ matches strings of the
|
|
457 |
form $aaa\ldots{}aaaa$, for say 1 Million $a$'s. And does simplification
|
|
458 |
simplify the regular expression
|
|
459 |
|
|
460 |
\[
|
|
461 |
\texttt{SEQ(SEQ(SEQ(..., ONE | ONE) , ONE | ONE), ONE | ONE)}
|
|
462 |
\]
|
|
463 |
|
|
464 |
\noindent correctly to just \texttt{ONE}, where \texttt{SEQ} is nested
|
245
|
465 |
50 or more times?\\
|
396
|
466 |
\mbox{}\hfill[1 Mark]
|
105
Christian Urban <christian dot urban at kcl dot ac dot uk>
diff
changeset
|
467 |
\end{itemize}
|
Christian Urban <christian dot urban at kcl dot ac dot uk>
diff
changeset
|
468 |
|
218
|
469 |
\subsection*{Background}
|
|
470 |
|
396
|
471 |
Although easily implementable in Scala (ok maybe the \texttt{simp} functions and
|
428
|
472 |
the constructors \texttt{ALTs}/\texttt{SEQs} needs a bit more thinking), the idea behind the
|
396
|
473 |
derivative function might not so easy to be seen. To understand its
|
|
474 |
purpose better, assume a regular expression $r$ can match strings of
|
|
475 |
the form $c\!::\!cs$ (that means strings which start with a character
|
|
476 |
$c$ and have some rest, or tail, $cs$). If you take the derivative of
|
|
477 |
$r$ with respect to the character $c$, then you obtain a regular
|
|
478 |
expression that can match all the strings $cs$. In other words, the
|
|
479 |
regular expression $\textit{der}\;c\;r$ can match the same strings
|
|
480 |
$c\!::\!cs$ that can be matched by $r$, except that the $c$ is chopped
|
|
481 |
off.
|
218
|
482 |
|
|
483 |
Assume now $r$ can match the string $abc$. If you take the derivative
|
|
484 |
according to $a$ then you obtain a regular expression that can match
|
|
485 |
$bc$ (it is $abc$ where the $a$ has been chopped off). If you now
|
|
486 |
build the derivative $\textit{der}\;b\;(\textit{der}\;a\;r)$ you
|
|
487 |
obtain a regular expression that can match the string $c$ (it is $bc$
|
|
488 |
where $b$ is chopped off). If you finally build the derivative of this
|
|
489 |
according $c$, that is
|
|
490 |
$\textit{der}\;c\;(\textit{der}\;b\;(\textit{der}\;a\;r))$, you obtain
|
|
491 |
a regular expression that can match the empty string. You can test
|
|
492 |
whether this is indeed the case using the function nullable, which is
|
428
|
493 |
what the matcher you have implemented is doing.
|
218
|
494 |
|
|
495 |
The purpose of the $\textit{simp}$ function is to keep the regular
|
|
496 |
expressions small. Normally the derivative function makes the regular
|
428
|
497 |
expression bigger (see the \texttt{SEQs} case and the example in Task (2)) and the
|
218
|
498 |
algorithm would be slower and slower over time. The $\textit{simp}$
|
|
499 |
function counters this increase in size and the result is that the
|
|
500 |
algorithm is fast throughout. By the way, this algorithm is by Janusz
|
|
501 |
Brzozowski who came up with the idea of derivatives in 1964 in his PhD
|
|
502 |
thesis.
|
|
503 |
|
|
504 |
\begin{center}\small
|
|
505 |
\url{https://en.wikipedia.org/wiki/Janusz_Brzozowski_(computer_scientist)}
|
|
506 |
\end{center}
|
|
507 |
|
105
Christian Urban <christian dot urban at kcl dot ac dot uk>
diff
changeset
|
508 |
|
218
|
509 |
If you want to see how badly the regular expression matchers do in
|
428
|
510 |
Java\footnote{Version 8 and below; Version 9 and above does not seem
|
|
511 |
to be as catastrophic, but still much worse than the regular
|
|
512 |
expression matcher based on derivatives. BTW, Scala uses the regular
|
|
513 |
expression matcher provided by the Java libraries. So is just as bad.}, JavaScript,
|
|
514 |
Python Swift and Dart with the ``evil'' regular expression
|
|
515 |
$(a^*)^*\cdot b$, then have a look at the graphs below (you can try it
|
|
516 |
out for yourself: have a look at the files
|
351
|
517 |
\texttt{catastrophic9.java}, \texttt{catastrophic.js},
|
428
|
518 |
\texttt{catastrophic.py} etc on KEATS). Compare this with the matcher
|
|
519 |
you have implemented. How long can a string of $a$'s be in your
|
|
520 |
matcher and still stay within the 30 seconds time limit? It should be
|
|
521 |
mu(uu)$^*$ch better than your off-the-shelf matcher in your
|
475
|
522 |
bog-standard programming language.
|
78
|
523 |
|
218
|
524 |
\begin{center}
|
|
525 |
\begin{tabular}{@{}cc@{}}
|
|
526 |
\multicolumn{2}{c}{Graph: $(a^*)^*\cdot b$ and strings
|
424
|
527 |
$\underbrace{a\ldots a}_{n}$}\medskip\\
|
218
|
528 |
|
|
529 |
\begin{tikzpicture}
|
|
530 |
\begin{axis}[
|
|
531 |
xlabel={$n$},
|
|
532 |
x label style={at={(1.05,0.0)}},
|
|
533 |
ylabel={time in secs},
|
|
534 |
y label style={at={(0.06,0.5)}},
|
|
535 |
enlargelimits=false,
|
|
536 |
xtick={0,5,...,30},
|
|
537 |
xmax=33,
|
|
538 |
ymax=45,
|
|
539 |
ytick={0,5,...,40},
|
|
540 |
scaled ticks=false,
|
|
541 |
axis lines=left,
|
|
542 |
width=6cm,
|
424
|
543 |
height=5.5cm,
|
351
|
544 |
legend entries={Python, Java 8, JavaScript, Swift, Dart},
|
222
|
545 |
legend pos=north west,
|
|
546 |
legend cell align=left]
|
218
|
547 |
\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
|
|
548 |
\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
|
221
|
549 |
\addplot[red,mark=*, mark options={fill=white}] table {re-js.data};
|
351
|
550 |
\addplot[magenta,mark=*, mark options={fill=white}] table {re-swift.data};
|
|
551 |
\addplot[brown,mark=*, mark options={fill=white}] table {re-dart.data};
|
218
|
552 |
\end{axis}
|
|
553 |
\end{tikzpicture}
|
|
554 |
&
|
|
555 |
\begin{tikzpicture}
|
|
556 |
\begin{axis}[
|
|
557 |
xlabel={$n$},
|
|
558 |
x label style={at={(1.05,0.0)}},
|
|
559 |
ylabel={time in secs},
|
|
560 |
y label style={at={(0.06,0.5)}},
|
|
561 |
%enlargelimits=false,
|
|
562 |
%xtick={0,5000,...,30000},
|
|
563 |
xmax=65000,
|
|
564 |
ymax=45,
|
|
565 |
ytick={0,5,...,40},
|
|
566 |
scaled ticks=false,
|
|
567 |
axis lines=left,
|
|
568 |
width=6cm,
|
424
|
569 |
height=5.5cm,
|
218
|
570 |
legend entries={Java 9},
|
|
571 |
legend pos=north west]
|
|
572 |
\addplot[cyan,mark=*, mark options={fill=white}] table {re-java9.data};
|
|
573 |
\end{axis}
|
|
574 |
\end{tikzpicture}
|
|
575 |
\end{tabular}
|
|
576 |
\end{center}
|
|
577 |
\newpage
|
|
578 |
|
|
579 |
|
|
580 |
|
|
581 |
|
6
|
582 |
|
|
583 |
\end{document}
|
|
584 |
|
68
|
585 |
|
428
|
586 |
|
|
587 |
For example given the regular expression $r = (a \cdot b) \cdot c$, the derivatives
|
|
588 |
w.r.t.~the characters $a$, $b$ and $c$ are
|
|
589 |
|
|
590 |
\begin{center}
|
|
591 |
\begin{tabular}{lcll}
|
|
592 |
$\textit{der}\;a\;r$ & $=$ & $(\ONE \cdot b)\cdot c$ & \quad($= r'$)\\
|
|
593 |
$\textit{der}\;b\;r$ & $=$ & $(\ZERO \cdot b)\cdot c$\\
|
|
594 |
$\textit{der}\;c\;r$ & $=$ & $(\ZERO \cdot b)\cdot c$
|
|
595 |
\end{tabular}
|
|
596 |
\end{center}
|
|
597 |
|
|
598 |
Let $r'$ stand for the first derivative, then taking the derivatives of $r'$
|
|
599 |
w.r.t.~the characters $a$, $b$ and $c$ gives
|
|
600 |
|
|
601 |
\begin{center}
|
|
602 |
\begin{tabular}{lcll}
|
|
603 |
$\textit{der}\;a\;r'$ & $=$ & $((\ZERO \cdot b) + \ZERO)\cdot c$ \\
|
|
604 |
$\textit{der}\;b\;r'$ & $=$ & $((\ZERO \cdot b) + \ONE)\cdot c$ & \quad($= r''$)\\
|
|
605 |
$\textit{der}\;c\;r'$ & $=$ & $((\ZERO \cdot b) + \ZERO)\cdot c$
|
|
606 |
\end{tabular}
|
|
607 |
\end{center}
|
|
608 |
|
|
609 |
One more example: Let $r''$ stand for the second derivative above,
|
|
610 |
then taking the derivatives of $r''$ w.r.t.~the characters $a$, $b$
|
|
611 |
and $c$ gives
|
|
612 |
|
|
613 |
\begin{center}
|
|
614 |
\begin{tabular}{lcll}
|
|
615 |
$\textit{der}\;a\;r''$ & $=$ & $((\ZERO \cdot b) + \ZERO) \cdot c + \ZERO$ \\
|
|
616 |
$\textit{der}\;b\;r''$ & $=$ & $((\ZERO \cdot b) + \ZERO) \cdot c + \ZERO$\\
|
|
617 |
$\textit{der}\;c\;r''$ & $=$ & $((\ZERO \cdot b) + \ZERO) \cdot c + \ONE$ &
|
|
618 |
(is $\textit{nullable}$)
|
|
619 |
\end{tabular}
|
|
620 |
\end{center}
|
|
621 |
|
|
622 |
Note, the last derivative can match the empty string, that is it is \textit{nullable}.
|
|
623 |
|
|
624 |
|
|
625 |
|
6
|
626 |
%%% Local Variables:
|
|
627 |
%%% mode: latex
|
|
628 |
%%% TeX-master: t
|
|
629 |
%%% End:
|