pep-material: changeset 351:591b9005157e

--- a/cws/main_cw03.tex	Wed Nov 04 14:55:49 2020 +0000
+++ b/cws/main_cw03.tex	Wed Nov 04 15:35:31 2020 +0000
@@ -1,444 +1,525 @@
 % !TEX program = xelatex
 \documentclass{article}
-\usepackage{chessboard}
-\usepackage[LSBC4,T1]{fontenc}
-\let\clipbox\relax
 \usepackage{../style}
 \usepackage{../langs}
 \usepackage{disclaimer}
+\usepackage{tikz}
+\usepackage{pgf}
+\usepackage{pgfplots}
+\usepackage{stackengine}
+%% \usepackage{accents}
+\newcommand\barbelow[1]{\stackunder[1.2pt]{#1}{\raisebox{-4mm}{\boldmath$\uparrow$}}}
+
+\begin{filecontents}{re-python2.data}
+1 0.033
+5 0.036
+10 0.034
+15 0.036
+18 0.059
+19 0.084 
+20 0.141
+21 0.248
+22 0.485
+23 0.878
+24 1.71
+25 3.40
+26 7.08
+27 14.12
+28 26.69
+\end{filecontents}
+
+\begin{filecontents}{re-java.data}
+5  0.00298
+10  0.00418
+15  0.00996
+16  0.01710
+17  0.03492
+18  0.03303
+19  0.05084
+20  0.10177
+21  0.19960
+22  0.41159
+23  0.82234
+24  1.70251
+25  3.36112
+26  6.63998
+27  13.35120
+28  29.81185
+\end{filecontents}
+
+\begin{filecontents}{re-js.data}
+5   0.061
+10  0.061
+15  0.061
+20  0.070
+23  0.131
+25  0.308
+26  0.564
+28  1.994
+30  7.648
+31  15.881 
+32  32.190
+\end{filecontents}
+
+\begin{filecontents}{re-java9.data}
+1000  0.01410
+2000  0.04882
+3000  0.10609
+4000  0.17456
+5000  0.27530
+6000  0.41116
+7000  0.53741
+8000  0.70261
+9000  0.93981
+10000 0.97419
+11000 1.28697
+12000 1.51387
+14000 2.07079
+16000 2.69846
+20000 4.41823
+24000 6.46077
+26000 7.64373
+30000 9.99446
+34000 12.966885
+38000 16.281621
+42000 19.180228
+46000 21.984721
+50000 26.950203
+60000 43.0327746
+\end{filecontents}
+
+\begin{filecontents}{re-swift.data}
+5   0.001
+10  0.001
+15  0.009
+20  0.178
+23  1.399
+24  2.893
+25  5.671
+26  11.357
+27  22.430
+\end{filecontents}
+
+\begin{filecontents}{re-dart.data}
+20 0.042
+21 0.084
+22 0.190
+23 0.340
+24 0.678
+25 1.369
+26 2.700
+27 5.462
+28 10.908
+29 21.725
+30 43.492
+\end{filecontents}
 
 \begin{document}
 
-\setchessboard{smallboard,
-               zero,
-               showmover=false,
-               boardfontencoding=LSBC4,
-               hlabelformat=\arabic{ranklabel},
-               vlabelformat=\arabic{filelabel}}
+% BF IDE
+% https://www.microsoft.com/en-us/p/brainf-ck/9nblgggzhvq5
+  
+\section*{Part 8 (Scala, 7 Marks)}
 
-\mbox{}\\[-18mm]\mbox{}
-
-\section*{Part 8 (Scala)}
-
-\mbox{}\hfill\textit{``The problem with object-oriented languages is they’ve got all this implicit,}\\
-\mbox{}\hfill\textit{environment that they carry around with them. You wanted a banana but}\\
-\mbox{}\hfill\textit{what you got was a gorilla holding the banana and the entire jungle.''}\smallskip\\
-\mbox{}\hfill\textit{ --- Joe Armstrong (creator of the Erlang programming language)}\medskip\bigskip
+%\mbox{}\hfill\textit{``[Google’s MapReduce] abstraction is inspired by the}\\
+%\mbox{}\hfill\textit{map and reduce primitives present in Lisp and many}\\
+%\mbox{}\hfill\textit{other functional languages.''}\smallskip\\
+%\mbox{}\hfill\textit{ --- Dean and Ghemawat, who designed this concept at Google}
+%\bigskip\medskip
 
 \noindent
-This part is about searching and backtracking. You are asked to
-implement Scala programs that solve various versions of the
-\textit{Knight's Tour Problem} on a chessboard. The preliminary part (4\%) is
-due on  \cwEIGHT{} at 4pm; the core part is due on \cwEIGHTa{} at 4pm.
-Note the core, more advanced, part might include material you have not
-yet seen in the first three lectures. \bigskip
+This part is about a regular expression matcher described by
+Brzozowski in 1964. This part is due on \cwEIGHTa{} at 5pm.  The
+background is that ``out-of-the-box'' regular expression matching in
+mainstream languages like Java, JavaScript and Python can sometimes be
+excruciatingly slow.  You are supposed to implement a regular
+expression matcher that is much, much faster. \bigskip
 
-\IMPORTANT{}
+\IMPORTANTNONE{}
+
+\noindent
 Also note that the running time of each part will be restricted to a
-maximum of 30 seconds on my laptop: If you calculate a result once,
-try to avoid to calculate the result again. Feel free to copy any code
-you need from files \texttt{knight1.scala}, \texttt{knight2.scala} and
-\texttt{knight3.scala}.
+maximum of 30 seconds on my laptop.  
 
 \DISCLAIMER{}
 
-\subsection*{Background}
-
-The \textit{Knight's Tour Problem} is about finding a tour such that
-the knight visits every field on an $n\times n$ chessboard once. For
-example on a $5\times 5$ chessboard, a knight's tour is:
-
-\chessboard[maxfield=d4, 
-            pgfstyle= {[base,at={\pgfpoint{0pt}{-0.5ex}}]text},
-            text = \small 24, markfield=Z4,
-            text = \small 11, markfield=a4,
-            text = \small  6, markfield=b4,
-            text = \small 17, markfield=c4,
-            text = \small  0, markfield=d4,
-            text = \small 19, markfield=Z3,
-            text = \small 16, markfield=a3,
-            text = \small 23, markfield=b3,
-            text = \small 12, markfield=c3,
-            text = \small  7, markfield=d3,
-            text = \small 10, markfield=Z2,
-            text = \small  5, markfield=a2,
-            text = \small 18, markfield=b2,
-            text = \small  1, markfield=c2,
-            text = \small 22, markfield=d2,
-            text = \small 15, markfield=Z1,
-            text = \small 20, markfield=a1,
-            text = \small  3, markfield=b1,
-            text = \small  8, markfield=c1,
-            text = \small 13, markfield=d1,
-            text = \small  4, markfield=Z0,
-            text = \small  9, markfield=a0,
-            text = \small 14, markfield=b0,
-            text = \small 21, markfield=c0,
-            text = \small  2, markfield=d0
-           ]
-           
-\noindent
-This tour starts in the right-upper corner, then moves to field
-$(3,2)$, then $(4,0)$ and so on. There are no knight's tours on
-$2\times 2$, $3\times 3$ and $4\times 4$ chessboards, but for every
-bigger board there is. 
-
-A knight's tour is called \emph{closed}, if the last step in the tour
-is within a knight's move to the beginning of the tour. So the above
-knight's tour is \underline{not} closed because the last
-step on field $(0, 4)$ is not within the reach of the first step on
-$(4, 4)$. It turns out there is no closed knight's tour on a $5\times
-5$ board. But there are on a $6\times 6$ board and on bigger ones, for
-example
-
-\chessboard[maxfield=e5, 
-            pgfstyle={[base,at={\pgfpoint{0pt}{-0.5ex}}]text},
-            text = \small 10, markfield=Z5,
-            text = \small  5, markfield=a5,
-            text = \small 18, markfield=b5,
-            text = \small 25, markfield=c5,
-            text = \small 16, markfield=d5,
-            text = \small  7, markfield=e5,
-            text = \small 31, markfield=Z4,
-            text = \small 26, markfield=a4,
-            text = \small  9, markfield=b4,
-            text = \small  6, markfield=c4,
-            text = \small 19, markfield=d4,
-            text = \small 24, markfield=e4,
-            % 4  11  30  17   8  15 
-            text = \small  4, markfield=Z3,
-            text = \small 11, markfield=a3,
-            text = \small 30, markfield=b3,
-            text = \small 17, markfield=c3,
-            text = \small  8, markfield=d3,
-            text = \small 15, markfield=e3,
-            %29  32  27   0  23  20 
-            text = \small 29, markfield=Z2,
-            text = \small 32, markfield=a2,
-            text = \small 27, markfield=b2,
-            text = \small  0, markfield=c2,
-            text = \small 23, markfield=d2,
-            text = \small 20, markfield=e2,
-            %12   3  34  21  14   1 
-            text = \small 12, markfield=Z1,
-            text = \small  3, markfield=a1,
-            text = \small 34, markfield=b1,
-            text = \small 21, markfield=c1,
-            text = \small 14, markfield=d1,
-            text = \small  1, markfield=e1,
-            %33  28  13   2  35  22 
-            text = \small 33, markfield=Z0,
-            text = \small 28, markfield=a0,
-            text = \small 13, markfield=b0,
-            text = \small  2, markfield=c0,
-            text = \small 35, markfield=d0,
-            text = \small 22, markfield=e0,
-            vlabel=false,
-            hlabel=false
-           ]
-
-
-\noindent
-where the 35th move can join up again with the 0th move.
-
-If you cannot remember how a knight moves in chess, or never played
-chess, below are all potential moves indicated for two knights, one on
-field $(2, 2)$ (blue moves) and another on $(7, 7)$ (red moves):
-
-{\chessboard[maxfield=g7,
-            color=blue!50,
-            linewidth=0.2em,
-            shortenstart=0.5ex,
-            shortenend=0.5ex,
-            markstyle=cross,
-            markfields={a4, c4, Z3, d3, Z1, d1, a0, c0},
-            color=red!50,
-            markfields={f5, e6},
-            setpieces={Ng7, Nb2},
-            boardfontsize=12pt,labelfontsize=9pt]}
-
 \subsection*{Reference Implementation}
 
-This Scala part comes with three reference implementations in form of
-\texttt{jar}-files. This allows you to run any test cases on your own
+This Scala assignment comes with a reference implementation in the form of
+a \texttt{jar}-file. This allows you to run any test cases on your own
 computer. For example you can call Scala on the command line with the
-option \texttt{-cp knight1.jar} and then query any function from the
-\texttt{knight1.scala} template file. As usual you have to
-prefix the calls with \texttt{CW8a}, \texttt{CW8b} and \texttt{CW8c}.
-Since some of the calls are time sensitive, I included some timing
-information. For example
-
-\begin{lstlisting}[language={},numbers=none,basicstyle=\ttfamily\small]
-$ scala -cp knight1.jar
-scala> CW8a.enum_tours(5, List((0, 0))).length
-Time needed: 1.722 secs.
-res0: Int = 304
-
-scala> CW8a.print_board(8, CW8a.first_tour(8, List((0, 0))).get)
-Time needed: 15.411 secs.
-
- 51  46  55  44  53   4  21  12 
- 56  43  52   3  22  13  24   5 
- 47  50  45  54  25  20  11  14 
- 42  57   2  49  40  23   6  19 
- 35  48  41  26  61  10  15  28 
- 58   1  36  39  32  27  18   7 
- 37  34  31  60   9  62  29  16 
-  0  59  38  33  30  17   8  63 
-\end{lstlisting}%$
-
-
-\subsection*{Hints}
-
-\noindent
-\textbf{Preliminary Part} useful list functions: \texttt{.contains(..)} checks
-whether an element is in a list, \texttt{.flatten} turns a list of
-lists into just a list, \texttt{\_::\_} puts an element on the head of
-the list, \texttt{.head} gives you the first element of a list (make
-sure the list is not \texttt{Nil}); a useful option function:
-\texttt{.isDefined} returns true, if an option is \texttt{Some(..)};
-anonymous functions can be constructed using \texttt{(x:Int) => ...},
-this function takes an \texttt{Int} as an argument.\medskip
-
-
-\noindent
-\textbf{Core Part} a useful list function: \texttt{.sortBy} sorts a list
-according to a component given by the function; a function can be
-tested to be tail-recursive by annotation \texttt{@tailrec}, which is
-made available by importing \texttt{scala.annotation.tailrec}.\medskip
-
-
+option \texttt{-cp re.jar} and then query any function from the
+\texttt{re.scala} template file. As usual you have to prefix the calls
+with \texttt{CW8c} or import this object.  Since some tasks
+are time sensitive, you can check the reference implementation as
+follows: if you want to know, for example, how long it takes to match
+strings of $a$'s using the regular expression $(a^*)^*\cdot b$ you can
+query as follows:
 
 
-\subsection*{Preliminary Part (4 Marks)}
+\begin{lstlisting}[xleftmargin=1mm,numbers=none,basicstyle=\ttfamily\small]
+$ scala -cp re.jar
+scala> import CW8c._  
+scala> for (i <- 0 to 5000000 by 500000) {
+  | println(f"$i: ${time_needed(2, matcher(EVIL, "a" * i))}%.5f secs.")
+  | }
+0: 0.00002 secs.
+500000: 0.10608 secs.
+1000000: 0.22286 secs.
+1500000: 0.35982 secs.
+2000000: 0.45828 secs.
+2500000: 0.59558 secs.
+3000000: 0.73191 secs.
+3500000: 0.83499 secs.
+4000000: 0.99149 secs.
+4500000: 1.15395 secs.
+5000000: 1.29659 secs.
+\end{lstlisting}%$
 
-You are asked to implement the knight's tour problem such that the
-dimension of the board can be changed.  Therefore most functions will
-take the dimension of the board as an argument.  The fun with this
-problem is that even for small chessboard dimensions it has already an
-incredibly large search space---finding a tour is like finding a
-needle in a haystack. In the first task we want to see how far we get
-with exhaustively exploring the complete search space for small
-chessboards.\medskip
+\subsection*{Preliminaries}
+
+The task is to implement a regular expression matcher that is based on
+derivatives of regular expressions. Most of the functions are defined by
+recursion over regular expressions and can be elegantly implemented
+using Scala's pattern-matching. The implementation should deal with the
+following regular expressions, which have been predefined in the file
+\texttt{re.scala}:
 
-\noindent
-Let us first fix the basic datastructures for the implementation.  The
-board dimension is an integer.
-A \emph{position} (or field) on the chessboard is
-a pair of integers, like $(0, 0)$. A \emph{path} is a list of
-positions. The first (or 0th move) in a path is the last element in
-this list; and the last move in the path is the first element. For
-example the path for the $5\times 5$ chessboard above is represented
-by
+\begin{center}
+\begin{tabular}{lcll}
+  $r$ & $::=$ & $\ZERO$     & cannot match anything\\
+      &   $|$ & $\ONE$      & can only match the empty string\\
+      &   $|$ & $c$         & can match a single character (in this case $c$)\\
+      &   $|$ & $r_1 + r_2$ & can match a string either with $r_1$ or with $r_2$\\
+  &   $|$ & $r_1\cdot r_2$ & can match the first part of a string with $r_1$ and\\
+          &  & & then the second part with $r_2$\\
+      &   $|$ & $r^*$       & can match a string with zero or more copies of $r$\\
+\end{tabular}
+\end{center}
 
-\[
-\texttt{List($\underbrace{\texttt{(0, 4)}}_{24}$,
-  $\underbrace{\texttt{(2, 3)}}_{23}$, ...,
-  $\underbrace{\texttt{(3, 2)}}_1$, $\underbrace{\texttt{(4, 4)}}_0$)}
-\]
+\noindent 
+Why? Regular expressions are
+one of the simplest ways to match patterns in text, and
+are endlessly useful for searching, editing and analysing data in all
+sorts of places (for example analysing network traffic in order to
+detect security breaches). However, you need to be fast, otherwise you
+will stumble over problems such as those recently reported at
 
-\noindent
-Suppose the dimension of a chessboard is $n$, then a path is a
-\emph{tour} if the length of the path is $n \times n$, each element
-occurs only once in the path, and each move follows the rules of how a
-knight moves (see above for the rules).
+{\small
+\begin{itemize}
+\item[$\bullet$] \url{https://blog.cloudflare.com/details-of-the-cloudflare-outage-on-july-2-2019}  
+\item[$\bullet$] \url{https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}
+\item[$\bullet$] \url{https://vimeo.com/112065252}
+\item[$\bullet$] \url{https://davidvgalbraith.com/how-i-fixed-atom}  
+\end{itemize}}
+
+% Knowing how to match regular expressions and strings will let you
+% solve a lot of problems that vex other humans.
 
 
-\subsubsection*{Tasks (file knight1.scala)}
-
-\begin{itemize}
-\item[(1)] Implement an \texttt{is\_legal} function that takes a
-  dimension, a path and a position as arguments and tests whether the
-  position is inside the board and not yet element in the
-  path. \hfill[1 Mark]
+\subsubsection*{Tasks (file re.scala)}
 
-\item[(2)] Implement a \texttt{legal\_moves} function that calculates for a
-  position all legal onward moves. If the onward moves are
-  placed on a circle, you should produce them starting from
-  ``12-o'clock'' following in clockwise order.  For example on an
-  $8\times 8$ board for a knight at position $(2, 2)$ and otherwise
-  empty board, the legal-moves function should produce the onward
-  positions in this order:
-
-  \begin{center}
-  \texttt{List((3,4), (4,3), (4,1), (3,0), (1,0), (0,1), (0,3), (1,4))}
-  \end{center}
-
-  If the board is not empty, then maybe some of the moves need to be
-  filtered out from this list.  For a knight on field $(7, 7)$ and an
-  empty board, the legal moves are
+The file \texttt{re.scala} already has a definition for regular
+expressions and also defines some handy shorthand notation for
+regular expressions. The notation in this document matches up
+with the code in the file as follows:
 
-  \begin{center}
-  \texttt{List((6,5), (5,6))}
-  \end{center}
-  \mbox{}\hfill[1 Mark]
-
-\item[(3)] Implement two recursive functions (\texttt{count\_tours} and
-  \texttt{enum\_tours}). They each take a dimension and a path as
-  arguments. They exhaustively search for tours starting
-  from the given path. The first function counts all possible 
-  tours (there can be none for certain board sizes) and the second
-  collects all tours in a list of paths. These functions will be
-  called with a path containing a single position---the starting field.
-  They are expected to extend this path so as to find all tours starting
-  from the given position.\\
-  \mbox{}\hfill[2 Marks]
-\end{itemize}
-
-\noindent \textbf{Test data:} For the marking, the functions in (3)
-will be called with board sizes up to $5 \times 5$. If you search
-for tours on a $5 \times 5$ board starting only from field $(0, 0)$,
-there are 304 of tours. If you try out every field of a $5 \times
-5$-board as a starting field and add up all tours, you obtain
-1728. A $6\times 6$ board is already too large to be searched
-exhaustively.\footnote{For your interest, the number of tours on
-  $6\times 6$, $7\times 7$ and $8\times 8$ are 6637920, 165575218320,
-  19591828170979904, respectively.}\smallskip
+\begin{center}
+  \begin{tabular}{rcl@{\hspace{10mm}}l}
+    & & code: & shorthand:\smallskip \\ 
+  $\ZERO$ & $\mapsto$ & \texttt{ZERO}\\
+  $\ONE$  & $\mapsto$ & \texttt{ONE}\\
+  $c$     & $\mapsto$ & \texttt{CHAR(c)}\\
+  $r_1 + r_2$ & $\mapsto$ & \texttt{ALT(r1, r2)} & \texttt{r1 | r2}\\
+  $r_1 \cdot r_2$ & $\mapsto$ & \texttt{SEQ(r1, r2)} & \texttt{r1 $\sim$ r2}\\
+  $r^*$ & $\mapsto$ &  \texttt{STAR(r)} & \texttt{r.\%}
+\end{tabular}    
+\end{center}  
 
 
-\subsection*{Core Part (6 Marks)}
+\begin{itemize}
+\item[(1)] Implement a function, called \textit{nullable}, by
+  recursion over regular expressions. This function tests whether a
+  regular expression can match the empty string. This means given a
+  regular expression it either returns true or false. The function
+  \textit{nullable}
+  is defined as follows:
+
+\begin{center}
+\begin{tabular}{lcl}
+$\textit{nullable}(\ZERO)$ & $\dn$ & $\textit{false}$\\
+$\textit{nullable}(\ONE)$  & $\dn$ & $\textit{true}$\\
+$\textit{nullable}(c)$     & $\dn$ & $\textit{false}$\\
+$\textit{nullable}(r_1 + r_2)$ & $\dn$ & $\textit{nullable}(r_1) \vee \textit{nullable}(r_2)$\\
+$\textit{nullable}(r_1 \cdot r_2)$ & $\dn$ & $\textit{nullable}(r_1) \wedge \textit{nullable}(r_2)$\\
+$\textit{nullable}(r^*)$ & $\dn$ & $\textit{true}$\\
+\end{tabular}
+\end{center}~\hfill[1 Mark]
+
+\item[(2)] Implement a function, called \textit{der}, by recursion over
+  regular expressions. It takes a character and a regular expression
+  as arguments and calculates the derivative of a regular expression according
+  to the rules:
+
+\begin{center}
+\begin{tabular}{lcl}
+$\textit{der}\;c\;(\ZERO)$ & $\dn$ & $\ZERO$\\
+$\textit{der}\;c\;(\ONE)$  & $\dn$ & $\ZERO$\\
+$\textit{der}\;c\;(d)$     & $\dn$ & $\textit{if}\; c = d\;\textit{then} \;\ONE \; \textit{else} \;\ZERO$\\
+$\textit{der}\;c\;(r_1 + r_2)$ & $\dn$ & $(\textit{der}\;c\;r_1) + (\textit{der}\;c\;r_2)$\\
+$\textit{der}\;c\;(r_1 \cdot r_2)$ & $\dn$ & $\textit{if}\;\textit{nullable}(r_1)$\\
+      & & $\textit{then}\;((\textit{der}\;c\;r_1)\cdot r_2) + (\textit{der}\;c\;r_2)$\\
+      & & $\textit{else}\;(\textit{der}\;c\;r_1)\cdot r_2$\\
+$\textit{der}\;c\;(r^*)$ & $\dn$ & $(\textit{der}\;c\;r)\cdot (r^*)$\\
+\end{tabular}
+\end{center}
+
+For example given the regular expression $r = (a \cdot b) \cdot c$, the derivatives
+w.r.t.~the characters $a$, $b$ and $c$ are
 
+\begin{center}
+  \begin{tabular}{lcll}
+    $\textit{der}\;a\;r$ & $=$ & $(\ONE \cdot b)\cdot c$ & \quad($= r'$)\\
+    $\textit{der}\;b\;r$ & $=$ & $(\ZERO \cdot b)\cdot c$\\
+    $\textit{der}\;c\;r$ & $=$ & $(\ZERO \cdot b)\cdot c$
+  \end{tabular}
+\end{center}
 
-\subsubsection*{Tasks (file knight1.scala cont.)}
+Let $r'$ stand for the first derivative, then taking the derivatives of $r'$
+w.r.t.~the characters $a$, $b$ and $c$ gives
+
+\begin{center}
+  \begin{tabular}{lcll}
+    $\textit{der}\;a\;r'$ & $=$ & $((\ZERO \cdot b) + \ZERO)\cdot c$ \\
+    $\textit{der}\;b\;r'$ & $=$ & $((\ZERO \cdot b) + \ONE)\cdot c$ & \quad($= r''$)\\
+    $\textit{der}\;c\;r'$ & $=$ & $((\ZERO \cdot b) + \ZERO)\cdot c$
+  \end{tabular}
+\end{center}
+
+One more example: Let $r''$ stand for the second derivative above,
+then taking the derivatives of $r''$ w.r.t.~the characters $a$, $b$
+and $c$ gives
+
+\begin{center}
+  \begin{tabular}{lcll}
+    $\textit{der}\;a\;r''$ & $=$ & $((\ZERO \cdot b) + \ZERO) \cdot c + \ZERO$ \\
+    $\textit{der}\;b\;r''$ & $=$ & $((\ZERO \cdot b) + \ZERO) \cdot c + \ZERO$\\
+    $\textit{der}\;c\;r''$ & $=$ & $((\ZERO \cdot b) + \ZERO) \cdot c + \ONE$ &
+    (is $\textit{nullable}$)                      
+  \end{tabular}
+\end{center}
+
+Note, the last derivative can match the empty string, that is it is \textit{nullable}.\\
+\mbox{}\hfill\mbox{[1 Mark]}
+
+\item[(3)] Implement the function \textit{simp}, which recursively
+  traverses a regular expression, and on the way up simplifies every
+  regular expression on the left (see below) to the regular expression
+  on the right, except it does not simplify inside ${}^*$-regular
+  expressions.
 
-\begin{itemize}
-\item[(4)] Implement a \texttt{first}-function. This function takes a list of
-  positions and a function $f$ as arguments; $f$ is the name we give to
-  this argument). The function $f$ takes a position as argument and
-  produces an optional path. So $f$'s type is \texttt{Pos =>
-    Option[Path]}. The idea behind the \texttt{first}-function is as follows:
+  \begin{center}
+\begin{tabular}{l@{\hspace{4mm}}c@{\hspace{4mm}}ll}
+$r \cdot \ZERO$ & $\mapsto$ & $\ZERO$\\ 
+$\ZERO \cdot r$ & $\mapsto$ & $\ZERO$\\ 
+$r \cdot \ONE$ & $\mapsto$ & $r$\\ 
+$\ONE \cdot r$ & $\mapsto$ & $r$\\ 
+$r + \ZERO$ & $\mapsto$ & $r$\\ 
+$\ZERO + r$ & $\mapsto$ & $r$\\ 
+$r + r$ & $\mapsto$ & $r$\\ 
+\end{tabular}
+  \end{center}
+
+  For example the regular expression
+  \[(r_1 + \ZERO) \cdot \ONE + ((\ONE + r_2) + r_3) \cdot (r_4 \cdot \ZERO)\]
+
+  simplifies to just $r_1$. \textbf{Hint:} Regular expressions can be
+  seen as trees and there are several methods for traversing
+  trees. One of them corresponds to the inside-out traversal, which is also
+  sometimes called post-order tra\-versal: you traverse inside the
+  tree and on the way up you apply simplification rules.
+  \textbf{Another Hint:}
+  Remember numerical expressions from school times---there you had expressions
+  like $u + \ldots + (1 \cdot x) - \ldots (z + (y \cdot 0)) \ldots$
+  and simplification rules that looked very similar to rules
+  above. You would simplify such numerical expressions by replacing
+  for example the $y \cdot 0$ by $0$, or $1\cdot x$ by $x$, and then
+  look whether more rules are applicable. If you organise the
+  simplification in an inside-out fashion, it is always clear which
+  simplification should be applied next.\hfill[1 Mark]
+
+\item[(4)] Implement two functions: The first, called \textit{ders},
+  takes a list of characters and a regular expression as arguments, and
+  builds the derivative w.r.t.~the list as follows:
+
+\begin{center}
+\begin{tabular}{lcl}
+$\textit{ders}\;(Nil)\;r$ & $\dn$ & $r$\\
+  $\textit{ders}\;(c::cs)\;r$  & $\dn$ &
+    $\textit{ders}\;cs\;(\textit{simp}(\textit{der}\;c\;r))$\\
+\end{tabular}
+\end{center}
+
+Note that this function is different from \textit{der}, which only
+takes a single character.
+
+The second function, called \textit{matcher}, takes a string and a
+regular expression as arguments. It builds first the derivatives
+according to \textit{ders} and after that tests whether the resulting
+derivative regular expression can match the empty string (using
+\textit{nullable}).  For example the \textit{matcher} will produce
+true for the regular expression $(a\cdot b)\cdot c$ and the string
+$abc$, but false if you give it the string $ab$. \hfill[1 Mark]
+
+\item[(5)] Implement a function, called \textit{size}, by recursion
+  over regular expressions. If a regular expression is seen as a tree,
+  then \textit{size} should return the number of nodes in such a
+  tree. Therefore this function is defined as follows:
+
+\begin{center}
+\begin{tabular}{lcl}
+$\textit{size}(\ZERO)$ & $\dn$ & $1$\\
+$\textit{size}(\ONE)$  & $\dn$ & $1$\\
+$\textit{size}(c)$     & $\dn$ & $1$\\
+$\textit{size}(r_1 + r_2)$ & $\dn$ & $1 + \textit{size}(r_1) + \textit{size}(r_2)$\\
+$\textit{size}(r_1 \cdot r_2)$ & $\dn$ & $1 + \textit{size}(r_1) + \textit{size}(r_2)$\\
+$\textit{size}(r^*)$ & $\dn$ & $1 + \textit{size}(r)$\\
+\end{tabular}
+\end{center}
+
+You can use \textit{size} in order to test how much the ``evil'' regular
+expression $(a^*)^* \cdot b$ grows when taking successive derivatives
+according to the letter $a$ without simplification and then compare it to
+taking the derivative and simplifying the result.  The sizes
+are given in \texttt{re.scala}. \hfill[1 Mark]
+
+\item[(6)] You do not have to implement anything specific under this
+  task.  The purpose here is that you will be marked for some ``power''
+  test cases. For example, can your matcher decide within 30 seconds
+  whether the regular expression $(a^*)^*\cdot b$ matches strings of the
+  form $aaa\ldots{}aaaa$, for say 1 million $a$'s? And does simplification
+  simplify the regular expression
 
   \[
-  \begin{array}{lcl}
-  \textit{first}(\texttt{Nil}, f) & \dn & \texttt{None}\\  
-  \textit{first}(x\!::\!xs, f) & \dn & \begin{cases}
-    f(x) & \textit{if}\;f(x) \not=\texttt{None}\\
-    \textit{first}(xs, f) & \textit{otherwise}\\
-                              \end{cases}
-  \end{array}
-  \]
+  \texttt{SEQ(SEQ(SEQ(..., ONE | ONE) , ONE | ONE), ONE | ONE)}
+  \]  
 
-  \noindent That is, we want to find the first position where the
-  result of $f$ is not \texttt{None}, if there is one. Note that
-  `inside' \texttt{first}, you do not (need to) know anything about
-  the argument $f$ except its type, namely \texttt{Pos =>
-    Option[Path]}. If you want to find out what the result of $f$ is
-  on a particular argument, say $x$, you can just write $f(x)$. 
-  There is one additional point however you should
-  take into account when implementing \texttt{first}: you will need to
-  calculate what the result of $f(x)$ is; your code should do this
-  only \textbf{once} and for as \textbf{few} elements in the list as
-  possible! Do not calculate $f(x)$ for all elements and then see which 
-  is the first \texttt{Some}.\\\mbox{}\hfill[1 Mark]
-  
-\item[(5)] Implement a \texttt{first\_tour} function that uses the
-  \texttt{first}-function from (4), and searches recursively for single tour.
-  As there might not be such a tour at all, the \texttt{first\_tour} function
-  needs to return a value of type
-  \texttt{Option[Path]}.\\\mbox{}\hfill[1 Mark]
+  \noindent correctly to just \texttt{ONE}, where \texttt{SEQ} is nested
+  50 or more times?\\
+  \mbox{}\hfill[2 Marks]
 \end{itemize}
 
-\noindent
-\textbf{Testing:} The \texttt{first\_tour} function will be called with board
-sizes of up to $8 \times 8$.
-\bigskip
+\subsection*{Background}
 
-%%\newpage
+Although easily implementable in Scala, the idea behind the derivative
+function might not be so easy to see. To understand its purpose
+better, assume a regular expression $r$ can match strings of the form
+$c\!::\!cs$ (that means strings which start with a character $c$ and have
+some rest, or tail, $cs$). If you take the derivative of $r$ with
+respect to the character $c$, then you obtain a regular expression
+that can match all the strings $cs$.  In other words, the regular
+expression $\textit{der}\;c\;r$ can match the same strings $c\!::\!cs$
+that can be matched by $r$, except that the $c$ is chopped off.
 
-\noindent
-As you should have seen in the earlier parts, a naive search for tours beyond
-$8 \times 8$ boards and also searching for closed tours even on small
-boards takes too much time. There is a heuristics, called \emph{Warnsdorf's
-Rule} that can speed up finding a tour. This heuristics states that a
-knight is moved so that it always proceeds to the field from which the
-knight will have the \underline{fewest} onward moves.  For example for
-a knight on field $(1, 3)$, the field $(0, 1)$ has the fewest possible
-onward moves, namely 2.
+Assume now $r$ can match the string $abc$. If you take the derivative
+according to $a$ then you obtain a regular expression that can match
+$bc$ (it is $abc$ where the $a$ has been chopped off). If you now
+build the derivative $\textit{der}\;b\;(\textit{der}\;a\;r)$ you
+obtain a regular expression that can match the string $c$ (it is $bc$
+where $b$ is chopped off). If you finally build the derivative of this
+according to $c$, that is
+$\textit{der}\;c\;(\textit{der}\;b\;(\textit{der}\;a\;r))$, you obtain
+a regular expression that can match the empty string. You can test
+whether this is indeed the case using the function \textit{nullable}, which is
+what your matcher is doing.
 
-\chessboard[maxfield=g7,
-            pgfstyle= {[base,at={\pgfpoint{0pt}{-0.5ex}}]text},
-            text = \small 3, markfield=Z5,
-            text = \small 7, markfield=b5,
-            text = \small 7, markfield=c4,
-            text = \small 7, markfield=c2,
-            text = \small 5, markfield=b1,
-            text = \small 2, markfield=Z1,
-            setpieces={Na3}]
+The purpose of the $\textit{simp}$ function is to keep the regular
+expressions small. Normally the derivative function makes the regular
+expression bigger (see the SEQ case and the example in (2)) and the
+algorithm would be slower and slower over time. The $\textit{simp}$
+function counters this increase in size and the result is that the
+algorithm is fast throughout.  By the way, this algorithm is by Janusz
+Brzozowski who came up with the idea of derivatives in 1964 in his PhD
+thesis.
 
-\noindent
-Warnsdorf's Rule states that the moves on the board above should be
-tried in the order
+\begin{center}\small
+\url{https://en.wikipedia.org/wiki/Janusz_Brzozowski_(computer_scientist)}
+\end{center}
+
 
-\[
-(0, 1), (0, 5), (2, 1), (2, 5), (3, 4), (3, 2)
-\]
+If you want to see how badly the regular expression matchers do in
+Java\footnote{Versions 8 and below; versions 9 and above do not seem to be as
+  catastrophic, but still much worse than the regular expression
+  matcher based on derivatives.}, JavaScript and Python with the
+``evil'' regular expression $(a^*)^*\cdot b$, then have a look at the
+graphs below (you can try it out for yourself: have a look at the files
+\texttt{catastrophic9.java}, \texttt{catastrophic.js},
+\texttt{catastrophic.py} etc.\ on KEATS). Compare this with the matcher you
+have implemented. How long can the string of $a$'s be in your matcher
+and still stay within the 30 seconds time limit?
 
-\noindent
-Whenever there are ties, the corresponding onward moves can be in any
-order.  When calculating the number of onward moves for each field, we
-do not count moves that revisit any field already visited.
-
-\subsubsection*{Tasks (file knight2.scala)}
-
-\begin{itemize}
-\item[(6)] Write a function \texttt{ordered\_moves} that calculates a list of
-  onward moves like in (2) but orders them according to 
-  Warnsdorf’s Rule. That means moves with the fewest legal onward moves
-  should come first (in order to be tried out first). \hfill[1 Mark]
+\begin{center}
+\begin{tabular}{@{}cc@{}}
+\multicolumn{2}{c}{Graph: $(a^*)^*\cdot b$ and strings 
+           $\underbrace{a\ldots a}_{n}$}\bigskip\\
   
-\item[(7)] Implement a \texttt{first\_closed\_tour\_heuristics}
-  function that searches for a single
-  \textbf{closed} tour on a $6\times 6$ board. It should try out
-  onward moves according to
-  the \texttt{ordered\_moves} function from (6). It is more likely to find
-  a solution when started in the middle of the board (that is
-  position $(dimension / 2, dimension / 2)$). \hfill[1 Mark]
-
-\item[(8)] Implement a \texttt{first\_tour\_heuristics} function
-  for boards up to
-  $30\times 30$.  It is the same function as in (7) but searches for
-  tours (not just closed tours). It might be called with any field on the
-  board as starting field.\\
-  %You have to be careful to write a
-  %tail-recursive function of the \texttt{first\_tour\_heuristics} function
-  %otherwise you will get problems with stack-overflows.\\
-  \mbox{}\hfill[1 Mark]
-\end{itemize}    
+\begin{tikzpicture}
+\begin{axis}[
+    xlabel={$n$},
+    x label style={at={(1.05,0.0)}},
+    ylabel={time in secs},
+    y label style={at={(0.06,0.5)}},
+    enlargelimits=false,
+    xtick={0,5,...,30},
+    xmax=33,
+    ymax=45,
+    ytick={0,5,...,40},
+    scaled ticks=false,
+    axis lines=left,
+    width=6cm,
+    height=5.5cm, 
+    legend entries={Python, Java 8, JavaScript, Swift, Dart},  
+    legend pos=north west,
+    legend cell align=left]
+\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
+\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
+\addplot[red,mark=*, mark options={fill=white}] table {re-js.data};
+\addplot[magenta,mark=*, mark options={fill=white}] table {re-swift.data};
+\addplot[brown,mark=*, mark options={fill=white}] table {re-dart.data};
+\end{axis}
+\end{tikzpicture}
+  & 
+\begin{tikzpicture}
+\begin{axis}[
+    xlabel={$n$},
+    x label style={at={(1.05,0.0)}},
+    ylabel={time in secs},
+    y label style={at={(0.06,0.5)}},
+    %enlargelimits=false,
+    %xtick={0,5000,...,30000},
+    xmax=65000,
+    ymax=45,
+    ytick={0,5,...,40},
+    scaled ticks=false,
+    axis lines=left,
+    width=6cm,
+    height=5.5cm, 
+    legend entries={Java 9},  
+    legend pos=north west]
+\addplot[cyan,mark=*, mark options={fill=white}] table {re-java9.data};
+\end{axis}
+\end{tikzpicture}
+\end{tabular}  
+\end{center}
+\newpage
 
-\subsubsection*{Task (file knight3.scala)}
-\begin{itemize}
-\item[(9)] Implement a function \texttt{tour\_on\_mega\_board} which is
-  the same function as in (8), \textbf{but} should be able to
-  deal with boards up to
-  $70\times 70$ \textbf{within 30 seconds} (on my laptop). This will be tested
-  by starting from field $(0, 0)$. You have to be careful to
-  write a tail-recursive function otherwise you will get problems
-  with stack-overflows. Please observe the requirements about
-  the submissions: no tricks involving \textbf{.par}.\medskip
-
-  The timelimit of 30 seconds is with respect to the laptop on which the
-  marking will happen. You can roughly estimate how well your
-  implementation performs by running \texttt{knight3.jar} on your
-  computer. For example the reference implementation shows
-  on my laptop:
-  
-  \begin{lstlisting}[language={},numbers=none,basicstyle=\ttfamily\small]
-$ scala -cp knight3.jar
-  
-scala> CW8c.tour_on_mega_board(70, List((0, 0)))
-Time needed: 9.484 secs.
-...<<long_list>>...
-\end{lstlisting}%$
-
-  \mbox{}\hfill[1 Mark]
-\end{itemize}  
-\bigskip
 
 
 
 
 \end{document}
 
+
 %%% Local Variables: 
 %%% mode: latex
 %%% TeX-master: t
author	Christian Urban <christian.urban@kcl.ac.uk>
	Wed, 04 Nov 2020 15:35:31 +0000
changeset 351	591b9005157e
parent 350	c5ad0e3f2a6d
child 352	97bcf8efe4e0