cws/main_cw03.tex
author Christian Urban <christian.urban@kcl.ac.uk>
Tue, 07 Dec 2021 23:17:51 +0000
changeset 418 fa7f7144f2bb
parent 415 fced9a61c881
child 424 daf561a83ba6
permissions -rw-r--r--
updated

% !TEX program = xelatex
\documentclass{article}
\usepackage{../style}
\usepackage{../langs}
\usepackage{disclaimer}
\usepackage{tikz}
\usepackage{pgf}
\usepackage{pgfplots}
\usepackage{stackengine}
%% \usepackage{accents}
\newcommand\barbelow[1]{\stackunder[1.2pt]{#1}{\raisebox{-4mm}{\boldmath$\uparrow$}}}

\begin{filecontents}{re-python2.data}
1 0.033
5 0.036
10 0.034
15 0.036
18 0.059
19 0.084 
20 0.141
21 0.248
22 0.485
23 0.878
24 1.71
25 3.40
26 7.08
27 14.12
28 26.69
\end{filecontents}

\begin{filecontents}{re-java.data}
5  0.00298
10  0.00418
15  0.00996
16  0.01710
17  0.03492
18  0.03303
19  0.05084
20  0.10177
21  0.19960
22  0.41159
23  0.82234
24  1.70251
25  3.36112
26  6.63998
27  13.35120
28  29.81185
\end{filecontents}

\begin{filecontents}{re-js.data}
5   0.061
10  0.061
15  0.061
20  0.070
23  0.131
25  0.308
26  0.564
28  1.994
30  7.648
31  15.881 
32  32.190
\end{filecontents}

\begin{filecontents}{re-java9.data}
1000  0.01410
2000  0.04882
3000  0.10609
4000  0.17456
5000  0.27530
6000  0.41116
7000  0.53741
8000  0.70261
9000  0.93981
10000 0.97419
11000 1.28697
12000 1.51387
14000 2.07079
16000 2.69846
20000 4.41823
24000 6.46077
26000 7.64373
30000 9.99446
34000 12.966885
38000 16.281621
42000 19.180228
46000 21.984721
50000 26.950203
60000 43.0327746
\end{filecontents}

\begin{filecontents}{re-swift.data}
5   0.001
10  0.001
15  0.009
20  0.178
23  1.399
24  2.893
25  5.671
26  11.357
27  22.430
\end{filecontents}

\begin{filecontents}{re-dart.data}
20 0.042
21 0.084
22 0.190
23 0.340
24 0.678
25 1.369
26 2.700
27 5.462
28 10.908
29 21.725
30 43.492
\end{filecontents}

\begin{document}

% BF IDE
% https://www.microsoft.com/en-us/p/brainf-ck/9nblgggzhvq5
  
\section*{Main Part 3 (Scala, 6 Marks)}

%\mbox{}\hfill\textit{``[Google’s MapReduce] abstraction is inspired by the}\\
%\mbox{}\hfill\textit{map and reduce primitives present in Lisp and many}\\
%\mbox{}\hfill\textit{other functional language.''}\smallskip\\
%\mbox{}\hfill\textit{ --- Dean and Ghemawat, who designed this concept at Google}
%\bigskip\medskip

\noindent
This part is about a regular expression matcher described by
Brzozowski in 1964.  The
background is that ``out-of-the-box'' regular expression matching in
mainstream languages like Java, JavaScript and Python can sometimes be
excruciatingly slow.  You are supposed to implement a regular
expression matcher that is much, much faster. \bigskip

\IMPORTANTNONE{}

\noindent
Also note that the running time of each part will be restricted to a
maximum of 30 seconds on my laptop.  

\DISCLAIMER{}

\subsection*{Reference Implementation}

This Scala assignment comes with a reference implementation in form of
a \texttt{jar}-file. This allows you to run any test cases on your own
computer. For example you can call Scala on the command line with the
option \texttt{-cp re.jar} and then query any function from the
\texttt{re.scala} template file. As usual you have to prefix the calls
with \texttt{M3} or import this object.  Since some tasks
are time sensitive, you can check the reference implementation as
follows: if you want to know, for example, how long it takes to match
strings of $a$'s using the regular expression $(a^*)^*\cdot b$ you can
query as follows:


\begin{lstlisting}[xleftmargin=1mm,numbers=none,basicstyle=\ttfamily\small]
$ scala -cp re.jar
scala> import M3._  
scala> for (i <- 0 to 5000000 by 500000) {
  | println(f"$i: ${time_needed(2, matcher(EVIL, "a" * i))}%.5f secs.")
  | }
0: 0.00002 secs.
500000: 0.10608 secs.
1000000: 0.22286 secs.
1500000: 0.35982 secs.
2000000: 0.45828 secs.
2500000: 0.59558 secs.
3000000: 0.73191 secs.
3500000: 0.83499 secs.
4000000: 0.99149 secs.
4500000: 1.15395 secs.
5000000: 1.29659 secs.
\end{lstlisting}%$

\subsection*{Preliminaries}

The task is to implement a regular expression matcher that is based on
derivatives of regular expressions. Most of the functions are defined by
recursion over regular expressions and can be elegantly implemented
using Scala's pattern-matching. The implementation should deal with the
following regular expressions, which have been predefined in the file
\texttt{re.scala}:

\begin{center}
\begin{tabular}{lcll}
  $r$ & $::=$ & $\ZERO$     & cannot match anything\\
      &   $|$ & $\ONE$      & can only match the empty string\\
      &   $|$ & $c$         & can match a single character (in this case $c$)\\
      &   $|$ & $r_1 + r_2$ & can match a string either with $r_1$ or with $r_2$\\
  &   $|$ & $r_1\cdot r_2$ & can match the first part of a string with $r_1$ and\\
          &  & & then the second part with $r_2$\\
      &   $|$ & $r^*$       & can match a string with zero or more copies of $r$\\
\end{tabular}
\end{center}

\noindent 
Why? Regular expressions are
one of the simplest ways to match patterns in text, and
are endlessly useful for searching, editing and analysing data in all
sorts of places (for example analysing network traffic in order to
detect security breaches). However, you need to be fast, otherwise you
will stumble over problems such as recently reported at

{\small
\begin{itemize}
\item[$\bullet$] \url{https://blog.cloudflare.com/details-of-the-cloudflare-outage-on-july-2-2019}  
\item[$\bullet$] \url{https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}
\item[$\bullet$] \url{https://vimeo.com/112065252}
\item[$\bullet$] \url{https://davidvgalbraith.com/how-i-fixed-atom}  
\end{itemize}}

% Knowing how to match regular expressions and strings will let you
% solve a lot of problems that vex other humans.


\subsubsection*{Tasks (file re.scala)}

The file \texttt{re.scala} has already a definition for regular
expressions and also defines some handy shorthand notation for
regular expressions. The notation in this document matches up
with the code in the file as follows:

\begin{center}
  \begin{tabular}{rcl@{\hspace{10mm}}l}
    & & code: & shorthand:\smallskip \\ 
  $\ZERO$ & $\mapsto$ & \texttt{ZERO}\\
  $\ONE$  & $\mapsto$ & \texttt{ONE}\\
  $c$     & $\mapsto$ & \texttt{CHAR(c)}\\
  $\sum rs$ & $\mapsto$ & \texttt{ALTs(rs)}\\  
  $r_1 + r_2$ & $\mapsto$ & \texttt{ALT(r1, r2)} & \texttt{r1 | r2}\\
  $r_1 \cdot r_2$ & $\mapsto$ & \texttt{SEQ(r1, r2)} & \texttt{r1 $\sim$ r2}\\
  $r^*$ & $\mapsto$ &  \texttt{STAR(r)} & \texttt{r.\%}
\end{tabular}    
\end{center}  

\noindent
The alternative regular expressions comes in two versions: one is
binary (+ / \texttt{ALT}) and the other is n-ary ($\sum$ /
\texttt{ALTs}). The latter takes a list of regular expressions as
arguments.  In what follows we shall use $rs$ to stand for lists of
regular expressions.  The binary alternative can be seen as an abbreviation,
that is $r_1 + r_2 \dn \sum\,[r_1, r_2]$. As a result we can ignore the
binary version and only implement the n-ary alternative.

\begin{itemize}
\item[(1)] Implement a function, called \textit{nullable}, by
  recursion over regular expressions. This function tests whether a
  regular expression can match the empty string. This means given a
  regular expression it either returns true or false. The function
  \textit{nullable}
  is defined as follows:

\begin{center}
\begin{tabular}{lcl}
$\textit{nullable}(\ZERO)$ & $\dn$ & $\textit{false}$\\
$\textit{nullable}(\ONE)$  & $\dn$ & $\textit{true}$\\
$\textit{nullable}(c)$     & $\dn$ & $\textit{false}$\\
$\textit{nullable}(\sum rs)$ & $\dn$ & $\exists r \in rs.\;\textit{nullable}(r)$\\
$\textit{nullable}(r_1 \cdot r_2)$ & $\dn$ & $\textit{nullable}(r_1) \wedge \textit{nullable}(r_2)$\\
$\textit{nullable}(r^*)$ & $\dn$ & $\textit{true}$\\
\end{tabular}
\end{center}~\hfill[0.5 Marks]

\item[(2)] Implement a function, called \textit{der}, by recursion over
  regular expressions. It takes a character and a regular expression
  as arguments and calculates the derivative of a regular expression according
  to the rules:

\begin{center}
\begin{tabular}{lcl}
$\textit{der}\;c\;(\ZERO)$ & $\dn$ & $\ZERO$\\
$\textit{der}\;c\;(\ONE)$  & $\dn$ & $\ZERO$\\
$\textit{der}\;c\;(d)$     & $\dn$ & $\textit{if}\; c = d\;\textit{then} \;\ONE \; \textit{else} \;\ZERO$\\
$\textit{der}\;c\;(\sum\;[r_1,..,r_n])$ & $\dn$ & $\sum\;[\textit{der}\;c\;r_1,..,\textit{der}\;c\;r_n]$\\
$\textit{der}\;c\;(r_1 \cdot r_2)$ & $\dn$ & $\textit{if}\;\textit{nullable}(r_1)$\\
      & & $\textit{then}\;((\textit{der}\;c\;r_1)\cdot r_2) + (\textit{der}\;c\;r_2)$\\
      & & $\textit{else}\;(\textit{der}\;c\;r_1)\cdot r_2$\\
$\textit{der}\;c\;(r^*)$ & $\dn$ & $(\textit{der}\;c\;r)\cdot (r^*)$\\
\end{tabular}
\end{center}

For example given the regular expression $r = (a \cdot b) \cdot c$, the derivatives
w.r.t.~the characters $a$, $b$ and $c$ are

\begin{center}
  \begin{tabular}{lcll}
    $\textit{der}\;a\;r$ & $=$ & $(\ONE \cdot b)\cdot c$ & \quad($= r'$)\\
    $\textit{der}\;b\;r$ & $=$ & $(\ZERO \cdot b)\cdot c$\\
    $\textit{der}\;c\;r$ & $=$ & $(\ZERO \cdot b)\cdot c$
  \end{tabular}
\end{center}

Let $r'$ stand for the first derivative, then taking the derivatives of $r'$
w.r.t.~the characters $a$, $b$ and $c$ gives

\begin{center}
  \begin{tabular}{lcll}
    $\textit{der}\;a\;r'$ & $=$ & $((\ZERO \cdot b) + \ZERO)\cdot c$ \\
    $\textit{der}\;b\;r'$ & $=$ & $((\ZERO \cdot b) + \ONE)\cdot c$ & \quad($= r''$)\\
    $\textit{der}\;c\;r'$ & $=$ & $((\ZERO \cdot b) + \ZERO)\cdot c$
  \end{tabular}
\end{center}

One more example: Let $r''$ stand for the second derivative above,
then taking the derivatives of $r''$ w.r.t.~the characters $a$, $b$
and $c$ gives

\begin{center}
  \begin{tabular}{lcll}
    $\textit{der}\;a\;r''$ & $=$ & $((\ZERO \cdot b) + \ZERO) \cdot c + \ZERO$ \\
    $\textit{der}\;b\;r''$ & $=$ & $((\ZERO \cdot b) + \ZERO) \cdot c + \ZERO$\\
    $\textit{der}\;c\;r''$ & $=$ & $((\ZERO \cdot b) + \ZERO) \cdot c + \ONE$ &
    (is $\textit{nullable}$)                      
  \end{tabular}
\end{center}

Note, the last derivative can match the empty string, that is it is \textit{nullable}.\\
\mbox{}\hfill\mbox{[1 Mark]}

\item[(3)] We next want to simplify regular expressions: essentially
  we want to remove $\ZERO$ in regular expressions like $r + \ZERO$
  and $\ZERO + r$.  However, our n-ary alternative takes a list of
  regular expressions as argument, we therefore need a more general
  ``flatten'' function, called \texttt{flts}. This function should
  analyse a list of regular expressions, say $rs$, as follows:

  \begin{center}
    \begin{tabular}{lllll}
      1) &$rs = []$                & $\dn$ & $[]$ & (empty list)\\
      2) &$rs = \ZERO :: rest$     & $\dn$ & $\textrm{flatten}\;rest$\\
      3) &$rs = (\sum rs_1) :: rest$ & $\dn$ & $rs_1 ::: \textrm{flatten}\;rest$\\
      4) &$rs = r :: rest$         & $\dn$ & $r :: \textrm{flatten}\;rest$ & (otherwise)\\
    \end{tabular}  
  \end{center}  

  The first clause just states that empty lists cannot be further
  flattened. The second removes all $\ZERO$s from the list.
  The third is when the first regular expression is an \texttt{ALTs}, then
  the content of this alternative should be spilled out and appended
  with the flattened rest of the list. The last case is for all other
  cases where the head of the list is not $\ZERO$ and not an \texttt{ALTs},
  then we just keep the head of the list and flatten the rest.
  \mbox{}\hfill\mbox{[1 Mark]}

\item[(4)] Implement the function \textit{simp}, which recursively
  traverses a regular expression, and on the way up simplifies every
  regular expression on the left (see below) to the regular expression
  on the right, except it does not simplify inside ${}^*$-regular
  expressions.

  \begin{center}
\begin{tabular}{l@{\hspace{4mm}}c@{\hspace{4mm}}ll}
$r \cdot \ZERO$ & $\mapsto$ & $\ZERO$\\ 
$\ZERO \cdot r$ & $\mapsto$ & $\ZERO$\\ 
$r \cdot \ONE$ & $\mapsto$ & $r$\\ 
$\ONE \cdot r$ & $\mapsto$ & $r$\\ 
$\sum\;[r_1,..,r_n]$ & $\mapsto$ & $\sum\;(\texttt{(flts + distinct)}[simp(r_1),..,simp(r_n)])$\\  
\end{tabular}
\end{center}

The last case is as follows: first apply $simp$ to all regular
expressions $r_1,.. ,r_n$; then flatten the resulting list using
\texttt{flts}; finally remove all duplicates in this list (this can be
done in Scala using the function
\texttt{\_.distinct}). \textcolor{red}{When you perform these
  operations, you end up with three cases, namely where the list is
  empty, contains a single element and ``otherwise''. These cases
  should be processed as follows}
\begin{center}
\textcolor{red}{\begin{tabular}{l@{\hspace{4mm}}c@{\hspace{4mm}}ll}
$\sum\;[]$ & $\mapsto$ & $\ZERO$\\ 
$\sum\;[r]$ & $\mapsto$ & $r$\\ 
$\sum\;rs$ & $\mapsto$ & $\sum\;rs$ & ``otherwise''\\ 
\end{tabular}}
\end{center}

  

  For example the regular expression
  \[(r_1 + \ZERO) \cdot \ONE + ((\ONE + r_2) + r_3) \cdot (r_4 \cdot \ZERO)\]

  simplifies to just $r_1$. \textbf{Hint:} Regular expressions can be
  seen as trees and there are several methods for traversing
  trees. One of them corresponds to the inside-out traversal, which is
  also sometimes called post-order tra\-versal: you traverse inside
  the tree and on the way up you apply simplification rules.
  \textbf{Another Hint:} Remember numerical expressions from school
  times---there you had expressions like
  $u + \ldots + (1 \cdot x) - \ldots (z + (y \cdot 0)) \ldots$ and
  simplification rules that looked very similar to rules above. You
  would simplify such numerical expressions by replacing for example
  the $y \cdot 0$ by $0$, or $1\cdot x$ by $x$, and then look whether
  more rules are applicable. If you regard regular expressions as
  trees and if you organise the simplification in an inside-out
  fashion, it is always clear which simplification should be applied
  next.\mbox{}\hfill\mbox{[1 Mark]}

\item[(5)] Implement two functions: The first, called \textit{ders},
  takes a list of characters and a regular expression as arguments, and
  builds the derivative w.r.t.~the list as follows:

\begin{center}
\begin{tabular}{lcl}
$\textit{ders}\;(Nil)\;r$ & $\dn$ & $r$\\
  $\textit{ders}\;(c::cs)\;r$  & $\dn$ &
    $\textit{ders}\;cs\;(\textit{simp}(\textit{der}\;c\;r))$\\
\end{tabular}
\end{center}

Note that this function is different from \textit{der}, which only
takes a single character.

The second function, called \textit{matcher}, takes a string and a
regular expression as arguments. It builds first the derivatives
according to \textit{ders} and after that tests whether the resulting
derivative regular expression can match the empty string (using
\textit{nullable}).  For example the \textit{matcher} will produce
true for the regular expression $(a\cdot b)\cdot c$ and the string
$abc$, but false if you give it the string $ab$. \hfill[1 Mark]

\item[(6)] Implement a function, called \textit{size}, by recursion
  over regular expressions. If a regular expression is seen as a tree,
  then \textit{size} should return the number of nodes in such a
  tree. Therefore this function is defined as follows:

\begin{center}
\begin{tabular}{lcl}
$\textit{size}(\ZERO)$ & $\dn$ & $1$\\
$\textit{size}(\ONE)$  & $\dn$ & $1$\\
$\textit{size}(c)$     & $\dn$ & $1$\\
$\textit{size}(\sum\,[r_1,..,r_n]$ & $\dn$ & $1 + \textit{size}(r_1) + ... + \textit{size}(r_n)$\\
$\textit{size}(r_1 \cdot r_2)$ & $\dn$ & $1 + \textit{size}(r_1) + \textit{size}(r_2)$\\
$\textit{size}(r^*)$ & $\dn$ & $1 + \textit{size}(r)$\\
\end{tabular}
\end{center}

You can use \textit{size} in order to test how much the ``evil'' regular
expression $(a^*)^* \cdot b$ grows when taking successive derivatives
according the letter $a$ without simplification and then compare it to
taking the derivative, but simplify the result.  The sizes
are given in \texttt{re.scala}. \hfill[0.5 Marks]

\item[(7)] You do not have to implement anything specific under this
  task.  The purpose here is that you will be marked for some ``power''
  test cases. For example can your matcher decide within 30 seconds
  whether the regular expression $(a^*)^*\cdot b$ matches strings of the
  form $aaa\ldots{}aaaa$, for say 1 Million $a$'s. And does simplification
  simplify the regular expression

  \[
  \texttt{SEQ(SEQ(SEQ(..., ONE | ONE) , ONE | ONE), ONE | ONE)}
  \]  

  \noindent correctly to just \texttt{ONE}, where \texttt{SEQ} is nested
  50 or more times?\\
  \mbox{}\hfill[1 Mark]
\end{itemize}

\subsection*{Background}

Although easily implementable in Scala (ok maybe the \texttt{simp} functions and
\texttt{ALTs} needs a bit more thinking), the idea behind the
derivative function might not so easy to be seen. To understand its
purpose better, assume a regular expression $r$ can match strings of
the form $c\!::\!cs$ (that means strings which start with a character
$c$ and have some rest, or tail, $cs$). If you take the derivative of
$r$ with respect to the character $c$, then you obtain a regular
expression that can match all the strings $cs$.  In other words, the
regular expression $\textit{der}\;c\;r$ can match the same strings
$c\!::\!cs$ that can be matched by $r$, except that the $c$ is chopped
off.

Assume now $r$ can match the string $abc$. If you take the derivative
according to $a$ then you obtain a regular expression that can match
$bc$ (it is $abc$ where the $a$ has been chopped off). If you now
build the derivative $\textit{der}\;b\;(\textit{der}\;a\;r)$ you
obtain a regular expression that can match the string $c$ (it is $bc$
where $b$ is chopped off). If you finally build the derivative of this
according $c$, that is
$\textit{der}\;c\;(\textit{der}\;b\;(\textit{der}\;a\;r))$, you obtain
a regular expression that can match the empty string. You can test
whether this is indeed the case using the function nullable, which is
what your matcher is doing.

The purpose of the $\textit{simp}$ function is to keep the regular
expressions small. Normally the derivative function makes the regular
expression bigger (see the SEQ case and the example in (2)) and the
algorithm would be slower and slower over time. The $\textit{simp}$
function counters this increase in size and the result is that the
algorithm is fast throughout.  By the way, this algorithm is by Janusz
Brzozowski who came up with the idea of derivatives in 1964 in his PhD
thesis.

\begin{center}\small
\url{https://en.wikipedia.org/wiki/Janusz_Brzozowski_(computer_scientist)}
\end{center}


If you want to see how badly the regular expression matchers do in
Java\footnote{Version 8 and below; Version 9 and above does not seem to be as
  catastrophic, but still much worse than the regular expression
  matcher based on derivatives.}, JavaScript and Python with the
``evil'' regular expression $(a^*)^*\cdot b$, then have a look at the
graphs below (you can try it out for yourself: have a look at the files
\texttt{catastrophic9.java}, \texttt{catastrophic.js},
\texttt{catastrophic.py} etc on KEATS). Compare this with the matcher you
have implemented. How long can a string of $a$'s be in your matcher
and still stay within the 30 seconds time limit? It should be muuuch better
than your off-the-shelf matcher in your bog-standard language.

\begin{center}
\begin{tabular}{@{}cc@{}}
\multicolumn{2}{c}{Graph: $(a^*)^*\cdot b$ and strings 
           $\underbrace{a\ldots a}_{n}$}\bigskip\\
  
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,0.0)}},
    ylabel={time in secs},
    y label style={at={(0.06,0.5)}},
    enlargelimits=false,
    xtick={0,5,...,30},
    xmax=33,
    ymax=45,
    ytick={0,5,...,40},
    scaled ticks=false,
    axis lines=left,
    width=6cm,
    height=5.0cm, 
    legend entries={Python, Java 8, JavaScript, Swift, Dart},  
    legend pos=north west,
    legend cell align=left]
\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
\addplot[red,mark=*, mark options={fill=white}] table {re-js.data};
\addplot[magenta,mark=*, mark options={fill=white}] table {re-swift.data};
\addplot[brown,mark=*, mark options={fill=white}] table {re-dart.data};
\end{axis}
\end{tikzpicture}
  & 
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,0.0)}},
    ylabel={time in secs},
    y label style={at={(0.06,0.5)}},
    %enlargelimits=false,
    %xtick={0,5000,...,30000},
    xmax=65000,
    ymax=45,
    ytick={0,5,...,40},
    scaled ticks=false,
    axis lines=left,
    width=6cm,
    height=5.0cm, 
    legend entries={Java 9},  
    legend pos=north west]
\addplot[cyan,mark=*, mark options={fill=white}] table {re-java9.data};
\end{axis}
\end{tikzpicture}
\end{tabular}  
\end{center}
\newpage





\end{document}


%%% Local Variables: 
%%% mode: latex
%%% TeX-master: t
%%% End: