cws/main_cw03.tex
author Christian Urban <christian.urban@kcl.ac.uk>
Sat, 28 Nov 2020 15:37:58 +0000
changeset 377 e104e2da83eb
parent 356 d1046d9d3213
child 390 175a950470a9
permissions -rw-r--r--
updated

% !TEX program = xelatex
\documentclass{article}
\usepackage{../style}
\usepackage{../langs}
\usepackage{disclaimer}
\usepackage{tikz}
\usepackage{pgf}
\usepackage{pgfplots}
\usepackage{stackengine}
%% \usepackage{accents}
\newcommand\barbelow[1]{\stackunder[1.2pt]{#1}{\raisebox{-4mm}{\boldmath$\uparrow$}}}

\begin{filecontents}{re-python2.data}
1 0.033
5 0.036
10 0.034
15 0.036
18 0.059
19 0.084 
20 0.141
21 0.248
22 0.485
23 0.878
24 1.71
25 3.40
26 7.08
27 14.12
28 26.69
\end{filecontents}

\begin{filecontents}{re-java.data}
5  0.00298
10  0.00418
15  0.00996
16  0.01710
17  0.03492
18  0.03303
19  0.05084
20  0.10177
21  0.19960
22  0.41159
23  0.82234
24  1.70251
25  3.36112
26  6.63998
27  13.35120
28  29.81185
\end{filecontents}

\begin{filecontents}{re-js.data}
5   0.061
10  0.061
15  0.061
20  0.070
23  0.131
25  0.308
26  0.564
28  1.994
30  7.648
31  15.881 
32  32.190
\end{filecontents}

\begin{filecontents}{re-java9.data}
1000  0.01410
2000  0.04882
3000  0.10609
4000  0.17456
5000  0.27530
6000  0.41116
7000  0.53741
8000  0.70261
9000  0.93981
10000 0.97419
11000 1.28697
12000 1.51387
14000 2.07079
16000 2.69846
20000 4.41823
24000 6.46077
26000 7.64373
30000 9.99446
34000 12.966885
38000 16.281621
42000 19.180228
46000 21.984721
50000 26.950203
60000 43.0327746
\end{filecontents}

\begin{filecontents}{re-swift.data}
5   0.001
10  0.001
15  0.009
20  0.178
23  1.399
24  2.893
25  5.671
26  11.357
27  22.430
\end{filecontents}

\begin{filecontents}{re-dart.data}
20 0.042
21 0.084
22 0.190
23 0.340
24 0.678
25 1.369
26 2.700
27 5.462
28 10.908
29 21.725
30 43.492
\end{filecontents}

\begin{document}

% BF IDE
% https://www.microsoft.com/en-us/p/brainf-ck/9nblgggzhvq5
  
\section*{Main Part 3 (Scala, 7 Marks)}

%\mbox{}\hfill\textit{``[Google’s MapReduce] abstraction is inspired by the}\\
%\mbox{}\hfill\textit{map and reduce primitives present in Lisp and many}\\
%\mbox{}\hfill\textit{other functional language.''}\smallskip\\
%\mbox{}\hfill\textit{ --- Dean and Ghemawat, who designed this concept at Google}
%\bigskip\medskip

\noindent
This part is about a regular expression matcher described by
Brzozowski in 1964. This part is due on \cwEIGHTa{} at 5pm.  The
background is that ``out-of-the-box'' regular expression matching in
mainstream languages like Java, JavaScript and Python can sometimes be
excruciatingly slow.  You are supposed to implement a regular
expression matcher that is much, much faster. \bigskip

\IMPORTANTNONE{}

\noindent
Also note that the running time of each part will be restricted to a
maximum of 30 seconds on my laptop.  

\DISCLAIMER{}

\subsection*{Reference Implementation}

This Scala assignment comes with a reference implementation in form of
a \texttt{jar}-file. This allows you to run any test cases on your own
computer. For example you can call Scala on the command line with the
option \texttt{-cp re.jar} and then query any function from the
\texttt{re.scala} template file. As usual you have to prefix the calls
with \texttt{CW8c} or import this object.  Since some tasks
are time sensitive, you can check the reference implementation as
follows: if you want to know, for example, how long it takes to match
strings of $a$'s using the regular expression $(a^*)^*\cdot b$ you can
query as follows:


\begin{lstlisting}[xleftmargin=1mm,numbers=none,basicstyle=\ttfamily\small]
$ scala -cp re.jar
scala> import CW8c._  
scala> for (i <- 0 to 5000000 by 500000) {
  | println(f"$i: ${time_needed(2, matcher(EVIL, "a" * i))}%.5f secs.")
  | }
0: 0.00002 secs.
500000: 0.10608 secs.
1000000: 0.22286 secs.
1500000: 0.35982 secs.
2000000: 0.45828 secs.
2500000: 0.59558 secs.
3000000: 0.73191 secs.
3500000: 0.83499 secs.
4000000: 0.99149 secs.
4500000: 1.15395 secs.
5000000: 1.29659 secs.
\end{lstlisting}%$

\subsection*{Preliminaries}

The task is to implement a regular expression matcher that is based on
derivatives of regular expressions. Most of the functions are defined by
recursion over regular expressions and can be elegantly implemented
using Scala's pattern-matching. The implementation should deal with the
following regular expressions, which have been predefined in the file
\texttt{re.scala}:

\begin{center}
\begin{tabular}{lcll}
  $r$ & $::=$ & $\ZERO$     & cannot match anything\\
      &   $|$ & $\ONE$      & can only match the empty string\\
      &   $|$ & $c$         & can match a single character (in this case $c$)\\
      &   $|$ & $r_1 + r_2$ & can match a string either with $r_1$ or with $r_2$\\
  &   $|$ & $r_1\cdot r_2$ & can match the first part of a string with $r_1$ and\\
          &  & & then the second part with $r_2$\\
      &   $|$ & $r^*$       & can match a string with zero or more copies of $r$\\
\end{tabular}
\end{center}

\noindent 
Why? Regular expressions are
one of the simplest ways to match patterns in text, and
are endlessly useful for searching, editing and analysing data in all
sorts of places (for example analysing network traffic in order to
detect security breaches). However, you need to be fast, otherwise you
will stumble over problems such as recently reported at

{\small
\begin{itemize}
\item[$\bullet$] \url{https://blog.cloudflare.com/details-of-the-cloudflare-outage-on-july-2-2019}  
\item[$\bullet$] \url{https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}
\item[$\bullet$] \url{https://vimeo.com/112065252}
\item[$\bullet$] \url{https://davidvgalbraith.com/how-i-fixed-atom}  
\end{itemize}}

% Knowing how to match regular expressions and strings will let you
% solve a lot of problems that vex other humans.


\subsubsection*{Tasks (file re.scala)}

The file \texttt{re.scala} has already a definition for regular
expressions and also defines some handy shorthand notation for
regular expressions. The notation in this document matches up
with the code in the file as follows:

\begin{center}
  \begin{tabular}{rcl@{\hspace{10mm}}l}
    & & code: & shorthand:\smallskip \\ 
  $\ZERO$ & $\mapsto$ & \texttt{ZERO}\\
  $\ONE$  & $\mapsto$ & \texttt{ONE}\\
  $c$     & $\mapsto$ & \texttt{CHAR(c)}\\
  $r_1 + r_2$ & $\mapsto$ & \texttt{ALT(r1, r2)} & \texttt{r1 | r2}\\
  $r_1 \cdot r_2$ & $\mapsto$ & \texttt{SEQ(r1, r2)} & \texttt{r1 $\sim$ r2}\\
  $r^*$ & $\mapsto$ &  \texttt{STAR(r)} & \texttt{r.\%}
\end{tabular}    
\end{center}  


\begin{itemize}
\item[(1)] Implement a function, called \textit{nullable}, by
  recursion over regular expressions. This function tests whether a
  regular expression can match the empty string. This means given a
  regular expression it either returns true or false. The function
  \textit{nullable}
  is defined as follows:

\begin{center}
\begin{tabular}{lcl}
$\textit{nullable}(\ZERO)$ & $\dn$ & $\textit{false}$\\
$\textit{nullable}(\ONE)$  & $\dn$ & $\textit{true}$\\
$\textit{nullable}(c)$     & $\dn$ & $\textit{false}$\\
$\textit{nullable}(r_1 + r_2)$ & $\dn$ & $\textit{nullable}(r_1) \vee \textit{nullable}(r_2)$\\
$\textit{nullable}(r_1 \cdot r_2)$ & $\dn$ & $\textit{nullable}(r_1) \wedge \textit{nullable}(r_2)$\\
$\textit{nullable}(r^*)$ & $\dn$ & $\textit{true}$\\
\end{tabular}
\end{center}~\hfill[1 Mark]

\item[(2)] Implement a function, called \textit{der}, by recursion over
  regular expressions. It takes a character and a regular expression
  as arguments and calculates the derivative of a regular expression according
  to the rules:

\begin{center}
\begin{tabular}{lcl}
$\textit{der}\;c\;(\ZERO)$ & $\dn$ & $\ZERO$\\
$\textit{der}\;c\;(\ONE)$  & $\dn$ & $\ZERO$\\
$\textit{der}\;c\;(d)$     & $\dn$ & $\textit{if}\; c = d\;\textit{then} \;\ONE \; \textit{else} \;\ZERO$\\
$\textit{der}\;c\;(r_1 + r_2)$ & $\dn$ & $(\textit{der}\;c\;r_1) + (\textit{der}\;c\;r_2)$\\
$\textit{der}\;c\;(r_1 \cdot r_2)$ & $\dn$ & $\textit{if}\;\textit{nullable}(r_1)$\\
      & & $\textit{then}\;((\textit{der}\;c\;r_1)\cdot r_2) + (\textit{der}\;c\;r_2)$\\
      & & $\textit{else}\;(\textit{der}\;c\;r_1)\cdot r_2$\\
$\textit{der}\;c\;(r^*)$ & $\dn$ & $(\textit{der}\;c\;r)\cdot (r^*)$\\
\end{tabular}
\end{center}

For example given the regular expression $r = (a \cdot b) \cdot c$, the derivatives
w.r.t.~the characters $a$, $b$ and $c$ are

\begin{center}
  \begin{tabular}{lcll}
    $\textit{der}\;a\;r$ & $=$ & $(\ONE \cdot b)\cdot c$ & \quad($= r'$)\\
    $\textit{der}\;b\;r$ & $=$ & $(\ZERO \cdot b)\cdot c$\\
    $\textit{der}\;c\;r$ & $=$ & $(\ZERO \cdot b)\cdot c$
  \end{tabular}
\end{center}

Let $r'$ stand for the first derivative, then taking the derivatives of $r'$
w.r.t.~the characters $a$, $b$ and $c$ gives

\begin{center}
  \begin{tabular}{lcll}
    $\textit{der}\;a\;r'$ & $=$ & $((\ZERO \cdot b) + \ZERO)\cdot c$ \\
    $\textit{der}\;b\;r'$ & $=$ & $((\ZERO \cdot b) + \ONE)\cdot c$ & \quad($= r''$)\\
    $\textit{der}\;c\;r'$ & $=$ & $((\ZERO \cdot b) + \ZERO)\cdot c$
  \end{tabular}
\end{center}

One more example: Let $r''$ stand for the second derivative above,
then taking the derivatives of $r''$ w.r.t.~the characters $a$, $b$
and $c$ gives

\begin{center}
  \begin{tabular}{lcll}
    $\textit{der}\;a\;r''$ & $=$ & $((\ZERO \cdot b) + \ZERO) \cdot c + \ZERO$ \\
    $\textit{der}\;b\;r''$ & $=$ & $((\ZERO \cdot b) + \ZERO) \cdot c + \ZERO$\\
    $\textit{der}\;c\;r''$ & $=$ & $((\ZERO \cdot b) + \ZERO) \cdot c + \ONE$ &
    (is $\textit{nullable}$)                      
  \end{tabular}
\end{center}

Note, the last derivative can match the empty string, that is it is \textit{nullable}.\\
\mbox{}\hfill\mbox{[1 Mark]}

\item[(3)] Implement the function \textit{simp}, which recursively
  traverses a regular expression, and on the way up simplifies every
  regular expression on the left (see below) to the regular expression
  on the right, except it does not simplify inside ${}^*$-regular
  expressions.

  \begin{center}
\begin{tabular}{l@{\hspace{4mm}}c@{\hspace{4mm}}ll}
$r \cdot \ZERO$ & $\mapsto$ & $\ZERO$\\ 
$\ZERO \cdot r$ & $\mapsto$ & $\ZERO$\\ 
$r \cdot \ONE$ & $\mapsto$ & $r$\\ 
$\ONE \cdot r$ & $\mapsto$ & $r$\\ 
$r + \ZERO$ & $\mapsto$ & $r$\\ 
$\ZERO + r$ & $\mapsto$ & $r$\\ 
$r + r$ & $\mapsto$ & $r$\\ 
\end{tabular}
  \end{center}

  For example the regular expression
  \[(r_1 + \ZERO) \cdot \ONE + ((\ONE + r_2) + r_3) \cdot (r_4 \cdot \ZERO)\]

  simplifies to just $r_1$. \textbf{Hint:} Regular expressions can be
  seen as trees and there are several methods for traversing
  trees. One of them corresponds to the inside-out traversal, which is also
  sometimes called post-order tra\-versal: you traverse inside the
  tree and on the way up you apply simplification rules.
  \textbf{Another Hint:}
  Remember numerical expressions from school times---there you had expressions
  like $u + \ldots + (1 \cdot x) - \ldots (z + (y \cdot 0)) \ldots$
  and simplification rules that looked very similar to rules
  above. You would simplify such numerical expressions by replacing
  for example the $y \cdot 0$ by $0$, or $1\cdot x$ by $x$, and then
  look whether more rules are applicable. If you organise the
  simplification in an inside-out fashion, it is always clear which
  simplification should be applied next.\hfill[1 Mark]

\item[(4)] Implement two functions: The first, called \textit{ders},
  takes a list of characters and a regular expression as arguments, and
  builds the derivative w.r.t.~the list as follows:

\begin{center}
\begin{tabular}{lcl}
$\textit{ders}\;(Nil)\;r$ & $\dn$ & $r$\\
  $\textit{ders}\;(c::cs)\;r$  & $\dn$ &
    $\textit{ders}\;cs\;(\textit{simp}(\textit{der}\;c\;r))$\\
\end{tabular}
\end{center}

Note that this function is different from \textit{der}, which only
takes a single character.

The second function, called \textit{matcher}, takes a string and a
regular expression as arguments. It builds first the derivatives
according to \textit{ders} and after that tests whether the resulting
derivative regular expression can match the empty string (using
\textit{nullable}).  For example the \textit{matcher} will produce
true for the regular expression $(a\cdot b)\cdot c$ and the string
$abc$, but false if you give it the string $ab$. \hfill[1 Mark]

\item[(5)] Implement a function, called \textit{size}, by recursion
  over regular expressions. If a regular expression is seen as a tree,
  then \textit{size} should return the number of nodes in such a
  tree. Therefore this function is defined as follows:

\begin{center}
\begin{tabular}{lcl}
$\textit{size}(\ZERO)$ & $\dn$ & $1$\\
$\textit{size}(\ONE)$  & $\dn$ & $1$\\
$\textit{size}(c)$     & $\dn$ & $1$\\
$\textit{size}(r_1 + r_2)$ & $\dn$ & $1 + \textit{size}(r_1) + \textit{size}(r_2)$\\
$\textit{size}(r_1 \cdot r_2)$ & $\dn$ & $1 + \textit{size}(r_1) + \textit{size}(r_2)$\\
$\textit{size}(r^*)$ & $\dn$ & $1 + \textit{size}(r)$\\
\end{tabular}
\end{center}

You can use \textit{size} in order to test how much the ``evil'' regular
expression $(a^*)^* \cdot b$ grows when taking successive derivatives
according the letter $a$ without simplification and then compare it to
taking the derivative, but simplify the result.  The sizes
are given in \texttt{re.scala}. \hfill[1 Mark]

\item[(6)] You do not have to implement anything specific under this
  task.  The purpose here is that you will be marked for some ``power''
  test cases. For example can your matcher decide within 30 seconds
  whether the regular expression $(a^*)^*\cdot b$ matches strings of the
  form $aaa\ldots{}aaaa$, for say 1 Million $a$'s. And does simplification
  simplify the regular expression

  \[
  \texttt{SEQ(SEQ(SEQ(..., ONE | ONE) , ONE | ONE), ONE | ONE)}
  \]  

  \noindent correctly to just \texttt{ONE}, where \texttt{SEQ} is nested
  50 or more times?\\
  \mbox{}\hfill[2 Mark]
\end{itemize}

\subsection*{Background}

Although easily implementable in Scala, the idea behind the derivative
function might not so easy to be seen. To understand its purpose
better, assume a regular expression $r$ can match strings of the form
$c\!::\!cs$ (that means strings which start with a character $c$ and have
some rest, or tail, $cs$). If you take the derivative of $r$ with
respect to the character $c$, then you obtain a regular expression
that can match all the strings $cs$.  In other words, the regular
expression $\textit{der}\;c\;r$ can match the same strings $c\!::\!cs$
that can be matched by $r$, except that the $c$ is chopped off.

Assume now $r$ can match the string $abc$. If you take the derivative
according to $a$ then you obtain a regular expression that can match
$bc$ (it is $abc$ where the $a$ has been chopped off). If you now
build the derivative $\textit{der}\;b\;(\textit{der}\;a\;r)$ you
obtain a regular expression that can match the string $c$ (it is $bc$
where $b$ is chopped off). If you finally build the derivative of this
according $c$, that is
$\textit{der}\;c\;(\textit{der}\;b\;(\textit{der}\;a\;r))$, you obtain
a regular expression that can match the empty string. You can test
whether this is indeed the case using the function nullable, which is
what your matcher is doing.

The purpose of the $\textit{simp}$ function is to keep the regular
expressions small. Normally the derivative function makes the regular
expression bigger (see the SEQ case and the example in (2)) and the
algorithm would be slower and slower over time. The $\textit{simp}$
function counters this increase in size and the result is that the
algorithm is fast throughout.  By the way, this algorithm is by Janusz
Brzozowski who came up with the idea of derivatives in 1964 in his PhD
thesis.

\begin{center}\small
\url{https://en.wikipedia.org/wiki/Janusz_Brzozowski_(computer_scientist)}
\end{center}


If you want to see how badly the regular expression matchers do in
Java\footnote{Version 8 and below; Version 9 and above does not seem to be as
  catastrophic, but still much worse than the regular expression
  matcher based on derivatives.}, JavaScript and Python with the
`evil' regular expression $(a^*)^*\cdot b$, then have a look at the
graphs below (you can try it out for yourself: have a look at the files
\texttt{catastrophic9.java}, \texttt{catastrophic.js},
\texttt{catastrophic.py} etc on KEATS). Compare this with the matcher you
have implemented. How long can the string of $a$'s be in your matcher
and still stay within the 30 seconds time limit?

\begin{center}
\begin{tabular}{@{}cc@{}}
\multicolumn{2}{c}{Graph: $(a^*)^*\cdot b$ and strings 
           $\underbrace{a\ldots a}_{n}$}\bigskip\\
  
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,0.0)}},
    ylabel={time in secs},
    y label style={at={(0.06,0.5)}},
    enlargelimits=false,
    xtick={0,5,...,30},
    xmax=33,
    ymax=45,
    ytick={0,5,...,40},
    scaled ticks=false,
    axis lines=left,
    width=6cm,
    height=5.5cm, 
    legend entries={Python, Java 8, JavaScript, Swift, Dart},  
    legend pos=north west,
    legend cell align=left]
\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
\addplot[red,mark=*, mark options={fill=white}] table {re-js.data};
\addplot[magenta,mark=*, mark options={fill=white}] table {re-swift.data};
\addplot[brown,mark=*, mark options={fill=white}] table {re-dart.data};
\end{axis}
\end{tikzpicture}
  & 
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,0.0)}},
    ylabel={time in secs},
    y label style={at={(0.06,0.5)}},
    %enlargelimits=false,
    %xtick={0,5000,...,30000},
    xmax=65000,
    ymax=45,
    ytick={0,5,...,40},
    scaled ticks=false,
    axis lines=left,
    width=6cm,
    height=5.5cm, 
    legend entries={Java 9},  
    legend pos=north west]
\addplot[cyan,mark=*, mark options={fill=white}] table {re-java9.data};
\end{axis}
\end{tikzpicture}
\end{tabular}  
\end{center}
\newpage





\end{document}


%%% Local Variables: 
%%% mode: latex
%%% TeX-master: t
%%% End: