cws/cw04.tex
changeset 486 9c03b5e89a2a
parent 485 19b75e899d37
child 487 efad9725dfd8
equal deleted inserted replaced
485:19b75e899d37 486:9c03b5e89a2a
     1 % !TEX program = xelatex
       
     2 \documentclass{article}
       
     3 \usepackage{../style}
       
     4 \usepackage{../langs}
       
     5 \usepackage{disclaimer}
       
     6 \usepackage{tikz}
       
     7 \usepackage{pgf}
       
     8 \usepackage{pgfplots}
       
     9 \usepackage{stackengine}
       
    10 %% \usepackage{accents}
       
    11 \newcommand\barbelow[1]{\stackunder[1.2pt]{#1}{\raisebox{-4mm}{\boldmath$\uparrow$}}}
       
    12 
       
    13 \begin{filecontents}{re-python2.data}
       
    14 1 0.033
       
    15 5 0.036
       
    16 10 0.034
       
    17 15 0.036
       
    18 18 0.059
       
    19 19 0.084 
       
    20 20 0.141
       
    21 21 0.248
       
    22 22 0.485
       
    23 23 0.878
       
    24 24 1.71
       
    25 25 3.40
       
    26 26 7.08
       
    27 27 14.12
       
    28 28 26.69
       
    29 \end{filecontents}
       
    30 
       
    31 \begin{filecontents}{re-java.data}
       
    32 5  0.00298
       
    33 10  0.00418
       
    34 15  0.00996
       
    35 16  0.01710
       
    36 17  0.03492
       
    37 18  0.03303
       
    38 19  0.05084
       
    39 20  0.10177
       
    40 21  0.19960
       
    41 22  0.41159
       
    42 23  0.82234
       
    43 24  1.70251
       
    44 25  3.36112
       
    45 26  6.63998
       
    46 27  13.35120
       
    47 28  29.81185
       
    48 \end{filecontents}
       
    49 
       
    50 \begin{filecontents}{re-js.data}
       
    51 5   0.061
       
    52 10  0.061
       
    53 15  0.061
       
    54 20  0.070
       
    55 23  0.131
       
    56 25  0.308
       
    57 26  0.564
       
    58 28  1.994
       
    59 30  7.648
       
    60 31  15.881 
       
    61 32  32.190
       
    62 \end{filecontents}
       
    63 
       
    64 \begin{filecontents}{re-java9.data}
       
    65 1000  0.01410
       
    66 2000  0.04882
       
    67 3000  0.10609
       
    68 4000  0.17456
       
    69 5000  0.27530
       
    70 6000  0.41116
       
    71 7000  0.53741
       
    72 8000  0.70261
       
    73 9000  0.93981
       
    74 10000 0.97419
       
    75 11000 1.28697
       
    76 12000 1.51387
       
    77 14000 2.07079
       
    78 16000 2.69846
       
    79 20000 4.41823
       
    80 24000 6.46077
       
    81 26000 7.64373
       
    82 30000 9.99446
       
    83 34000 12.966885
       
    84 38000 16.281621
       
    85 42000 19.180228
       
    86 46000 21.984721
       
    87 50000 26.950203
       
    88 60000 43.0327746
       
    89 \end{filecontents}
       
    90 
       
    91 
       
    92 \begin{document}
       
    93 
       
    94 % BF IDE
       
    95 % https://www.microsoft.com/en-us/p/brainf-ck/9nblgggzhvq5
       
    96   
       
    97 \section*{Part 9 (Scala)}
       
    98 
       
    99 \mbox{}\hfill\textit{``[Google’s MapReduce] abstraction is inspired by the}\\
       
   100 \mbox{}\hfill\textit{map and reduce primitives present in Lisp and many}\\
       
   101 \mbox{}\hfill\textit{other functional language.''}\smallskip\\
       
   102 \mbox{}\hfill\textit{ --- Dean and Ghemawat, who designed this concept at Google}
       
   103 \bigskip\medskip
       
   104 
       
   105 \noindent
       
   106 This part is about the shunting yard algorithm by Dijkstra and a
       
   107 regular expression matcher by Brzozowski. The preliminary part (4\%) is due on
       
   108 \cwNINE{} at 4pm; the core, more advanced part, is due on \cwNINEa{}
       
   109 at 4pm. The preliminary part is about the Shunting Yard Algorithm that
       
   110 transforms the usual infix notation of arithmetic expressions into the
       
   111 postfix notation, which is for example used in compilers. In the core
       
   112 part, you are asked to implement a regular expression matcher based on
       
   113 derivatives of regular expressions. The background is that
       
   114 ``out-of-the-box'' regular expression matching in mainstream languages
       
   115 like Java, JavaScript and Python can sometimes be excruciatingly slow.
       
   116 You are supposed to implement an regular expression matcher that is
       
   117 much, much faster. \bigskip
       
   118 
       
   119 \IMPORTANT{}
       
   120 
       
   121 \noindent
       
   122 Also note that the running time of each part will be restricted to a
       
   123 maximum of 30 seconds on my laptop.  
       
   124 
       
   125 \DISCLAIMER{}
       
   126 
       
   127 \subsection*{Reference Implementation}
       
   128 
       
   129 This Scala assignment comes with three reference implementations in form of
       
   130 \texttt{jar}-files you can download from KEATS. This allows you to run any
       
   131 test cases on your own
       
   132 computer. For example you can call Scala on the command line with the
       
   133 option \texttt{-cp re.jar} and then query any function from the
       
   134 \texttt{re.scala} template file. As usual you have to
       
   135 prefix the calls with \texttt{CW9a}, \texttt{CW9b} and \texttt{CW9c}.
       
   136 Since some tasks are time sensitive, you can check the reference
       
   137 implementation as follows: if you want to know, for example, how long it takes
       
   138 to match strings of $a$'s using the regular expression $(a^*)^*\cdot b$
       
   139 you can query as follows:
       
   140 
       
   141 
       
   142 
       
   143 \begin{lstlisting}[xleftmargin=1mm,numbers=none,basicstyle=\ttfamily\small]
       
   144 $ scala -cp re.jar
       
   145 scala> import CW9c._  
       
   146 scala> for (i <- 0 to 5000000 by 500000) {
       
   147   | println(f"$i: ${time_needed(2, matcher(EVIL, "a" * i))}%.5f secs.")
       
   148   | }
       
   149 0: 0.00002 secs.
       
   150 500000: 0.10608 secs.
       
   151 1000000: 0.22286 secs.
       
   152 1500000: 0.35982 secs.
       
   153 2000000: 0.45828 secs.
       
   154 2500000: 0.59558 secs.
       
   155 3000000: 0.73191 secs.
       
   156 3500000: 0.83499 secs.
       
   157 4000000: 0.99149 secs.
       
   158 4500000: 1.15395 secs.
       
   159 5000000: 1.29659 secs.
       
   160 \end{lstlisting}%$
       
   161 
       
   162 
       
   163 \subsection*{Preliminary Part (4 Marks)}
       
   164 
       
   165 The \emph{Shunting Yard Algorithm} has been developed by Edsger Dijkstra,
       
   166 an influential computer scientist who developed many well-known
       
   167 algorithms. This algorithm transforms the usual infix notation of
       
   168 arithmetic expressions into the postfix notation, sometimes also
       
   169 called reverse Polish notation.
       
   170 
       
   171 Why on Earth do people use the postfix notation? It is much more
       
   172 convenient to work with the usual infix notation for arithmetic
       
   173 expressions. Most modern calculators (as opposed to the ones used 20
       
   174 years ago) understand infix notation. So why on Earth? \ldots{}Well,
       
   175 many computers under the hood, even nowadays, use postfix notation
       
   176 extensively. For example if you give to the Java compiler the
       
   177 expression $1 + ((2 * 3) + (4 - 3))$, it will generate the Java Byte
       
   178 code
       
   179 
       
   180 \begin{lstlisting}[language=JVMIS,numbers=none]
       
   181 ldc 1 
       
   182 ldc 2 
       
   183 ldc 3 
       
   184 imul 
       
   185 ldc 4 
       
   186 ldc 3 
       
   187 isub 
       
   188 iadd 
       
   189 iadd
       
   190 \end{lstlisting}
       
   191 
       
   192 \noindent
       
   193 where the command \texttt{ldc} loads a constant onto the stack, and \texttt{imul},
       
   194 \texttt{isub} and \texttt{iadd} are commands acting on the stack. Clearly this
       
   195 is the arithmetic expression in postfix notation.\bigskip
       
   196 
       
   197 \noindent
       
   198 The shunting yard algorithm processes an input token list using an
       
   199 operator stack and an output list. The input consists of numbers,
       
   200 operators ($+$, $-$, $*$, $/$) and parentheses, and for the purpose of
       
   201 the assignment we assume the input is always a well-formed expression
       
   202 in infix notation.  The calculation in the shunting yard algorithm uses
       
   203 information about the
       
   204 precedences of the operators (given in the template file). The
       
   205 algorithm processes the input token list as follows:
       
   206 
       
   207 \begin{itemize}
       
   208 \item If there is a number as input token, then this token is
       
   209   transferred directly to the output list. Then the rest of the input is
       
   210   processed.
       
   211 \item If there is an operator as input token, then you need to check
       
   212   what is on top of the operator stack. If there are operators with
       
   213   a higher or equal precedence, these operators are first popped off
       
   214   from the stack and moved to the output list. Then the operator from the input
       
   215   is pushed onto the stack and the rest of the input is processed.
       
   216 \item If the input is a left-parenthesis, you push it on to the stack
       
   217   and continue processing the input.
       
   218 \item If the input is a right-parenthesis, then you pop off all operators
       
   219   from the stack to the output list until you reach the left-parenthesis.
       
   220   Then you discharge the $($ and $)$ from the input and stack, and continue
       
   221   processing the input list.
       
   222 \item If the input is empty, then you move all remaining operators
       
   223   from the stack to the output list.  
       
   224 \end{itemize}  
       
   225 
       
   226 \subsubsection*{Tasks (file postfix.scala)}
       
   227 
       
   228 \begin{itemize}
       
   229 \item[(1)] Implement the shunting yard algorithm described above. The
       
   230   function, called \texttt{syard}, takes a list of tokens as first
       
   231   argument. The second and third arguments are the stack and output
       
   232   list represented as Scala lists. The most convenient way to
       
   233   implement this algorithm is to analyse what the input list, stack
       
   234   and output list look like in each step using pattern-matching. The
       
   235   algorithm transforms for example the input
       
   236 
       
   237   \[
       
   238   \texttt{List(3, +, 4, *, (, 2, -, 1, ))}
       
   239   \]
       
   240 
       
   241   into the postfix output
       
   242 
       
   243   \[
       
   244   \texttt{List(3, 4, 2, 1, -, *, +)}
       
   245   \]  
       
   246   
       
   247   You can assume the input list is always a  list representing
       
   248   a well-formed infix arithmetic expression.\hfill[1 Mark]
       
   249 
       
   250 \item[(2)] Implement a compute function that takes a postfix expression
       
   251   as argument and evaluates it generating an integer as result. It uses a
       
   252   stack to evaluate the postfix expression. The operators $+$, $-$, $*$
       
   253   are as usual; $/$ is division on integers, for example $7 / 3 = 2$.
       
   254   \hfill[1 Mark]
       
   255 \end{itemize}
       
   256 
       
   257 \subsubsection*{Task (file postfix2.scala)}
       
   258 
       
   259 \begin{itemize}
       
   260 \item[(3/4)] Extend the code in (7) and (8) to include the power
       
   261   operator.  This requires proper account of associativity of
       
   262   the operators. The power operator is right-associative, whereas the
       
   263   other operators are left-associative.  Left-associative operators
       
   264   are popped off if the precedence is bigger or equal, while
       
   265   right-associative operators are only popped off if the precedence is
       
   266   bigger. The compute function in this task should use
       
   267   \texttt{Long}s, rather than \texttt{Int}s.\hfill[2 Marks]
       
   268 \end{itemize}
       
   269 
       
   270 
       
   271 
       
   272 \subsection*{Core Part (6 Marks)}
       
   273 
       
   274 The task is to implement a regular expression matcher that is based on
       
   275 derivatives of regular expressions. Most of the functions are defined by
       
   276 recursion over regular expressions and can be elegantly implemented
       
   277 using Scala's pattern-matching. The implementation should deal with the
       
   278 following regular expressions, which have been predefined in the file
       
   279 \texttt{re.scala}:
       
   280 
       
   281 \begin{center}
       
   282 \begin{tabular}{lcll}
       
   283   $r$ & $::=$ & $\ZERO$     & cannot match anything\\
       
   284       &   $|$ & $\ONE$      & can only match the empty string\\
       
   285       &   $|$ & $c$         & can match a single character (in this case $c$)\\
       
   286       &   $|$ & $r_1 + r_2$ & can match a string either with $r_1$ or with $r_2$\\
       
   287   &   $|$ & $r_1\cdot r_2$ & can match the first part of a string with $r_1$ and\\
       
   288           &  & & then the second part with $r_2$\\
       
   289       &   $|$ & $r^*$       & can match a string with zero or more copies of $r$\\
       
   290 \end{tabular}
       
   291 \end{center}
       
   292 
       
   293 \noindent 
       
   294 Why? Regular expressions are
       
   295 one of the simplest ways to match patterns in text, and
       
   296 are endlessly useful for searching, editing and analysing data in all
       
   297 sorts of places (for example analysing network traffic in order to
       
   298 detect security breaches). However, you need to be fast, otherwise you
       
   299 will stumble over problems such as recently reported at
       
   300 
       
   301 {\small
       
   302 \begin{itemize}
       
   303 \item[$\bullet$] \url{https://blog.cloudflare.com/details-of-the-cloudflare-outage-on-july-2-2019}  
       
   304 \item[$\bullet$] \url{https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}
       
   305 \item[$\bullet$] \url{https://vimeo.com/112065252}
       
   306 \item[$\bullet$] \url{https://davidvgalbraith.com/how-i-fixed-atom}  
       
   307 \end{itemize}}
       
   308 
       
   309 % Knowing how to match regular expressions and strings will let you
       
   310 % solve a lot of problems that vex other humans.
       
   311 
       
   312 
       
   313 \subsubsection*{Tasks (file re.scala)}
       
   314 
       
   315 The file \texttt{re.scala} has already a definition for regular
       
   316 expressions and also defines some handy shorthand notation for
       
   317 regular expressions. The notation in this document matches up
       
   318 with the code in the file as follows:
       
   319 
       
   320 \begin{center}
       
   321   \begin{tabular}{rcl@{\hspace{10mm}}l}
       
   322     & & code: & shorthand:\smallskip \\ 
       
   323   $\ZERO$ & $\mapsto$ & \texttt{ZERO}\\
       
   324   $\ONE$  & $\mapsto$ & \texttt{ONE}\\
       
   325   $c$     & $\mapsto$ & \texttt{CHAR(c)}\\
       
   326   $r_1 + r_2$ & $\mapsto$ & \texttt{ALT(r1, r2)} & \texttt{r1 | r2}\\
       
   327   $r_1 \cdot r_2$ & $\mapsto$ & \texttt{SEQ(r1, r2)} & \texttt{r1 $\sim$ r2}\\
       
   328   $r^*$ & $\mapsto$ &  \texttt{STAR(r)} & \texttt{r.\%}
       
   329 \end{tabular}    
       
   330 \end{center}  
       
   331 
       
   332 
       
   333 \begin{itemize}
       
   334 \item[(5)] Implement a function, called \textit{nullable}, by
       
   335   recursion over regular expressions. This function tests whether a
       
   336   regular expression can match the empty string. This means given a
       
   337   regular expression it either returns true or false. The function
       
   338   \textit{nullable}
       
   339   is defined as follows:
       
   340 
       
   341 \begin{center}
       
   342 \begin{tabular}{lcl}
       
   343 $\textit{nullable}(\ZERO)$ & $\dn$ & $\textit{false}$\\
       
   344 $\textit{nullable}(\ONE)$  & $\dn$ & $\textit{true}$\\
       
   345 $\textit{nullable}(c)$     & $\dn$ & $\textit{false}$\\
       
   346 $\textit{nullable}(r_1 + r_2)$ & $\dn$ & $\textit{nullable}(r_1) \vee \textit{nullable}(r_2)$\\
       
   347 $\textit{nullable}(r_1 \cdot r_2)$ & $\dn$ & $\textit{nullable}(r_1) \wedge \textit{nullable}(r_2)$\\
       
   348 $\textit{nullable}(r^*)$ & $\dn$ & $\textit{true}$\\
       
   349 \end{tabular}
       
   350 \end{center}~\hfill[1 Mark]
       
   351 
       
   352 \item[(6)] Implement a function, called \textit{der}, by recursion over
       
   353   regular expressions. It takes a character and a regular expression
       
   354   as arguments and calculates the derivative of a regular expression according
       
   355   to the rules:
       
   356 
       
   357 \begin{center}
       
   358 \begin{tabular}{lcl}
       
   359 $\textit{der}\;c\;(\ZERO)$ & $\dn$ & $\ZERO$\\
       
   360 $\textit{der}\;c\;(\ONE)$  & $\dn$ & $\ZERO$\\
       
   361 $\textit{der}\;c\;(d)$     & $\dn$ & $\textit{if}\; c = d\;\textit{then} \;\ONE \; \textit{else} \;\ZERO$\\
       
   362 $\textit{der}\;c\;(r_1 + r_2)$ & $\dn$ & $(\textit{der}\;c\;r_1) + (\textit{der}\;c\;r_2)$\\
       
   363 $\textit{der}\;c\;(r_1 \cdot r_2)$ & $\dn$ & $\textit{if}\;\textit{nullable}(r_1)$\\
       
   364       & & $\textit{then}\;((\textit{der}\;c\;r_1)\cdot r_2) + (\textit{der}\;c\;r_2)$\\
       
   365       & & $\textit{else}\;(\textit{der}\;c\;r_1)\cdot r_2$\\
       
   366 $\textit{der}\;c\;(r^*)$ & $\dn$ & $(\textit{der}\;c\;r)\cdot (r^*)$\\
       
   367 \end{tabular}
       
   368 \end{center}
       
   369 
       
   370 For example given the regular expression $r = (a \cdot b) \cdot c$, the derivatives
       
   371 w.r.t.~the characters $a$, $b$ and $c$ are
       
   372 
       
   373 \begin{center}
       
   374   \begin{tabular}{lcll}
       
   375     $\textit{der}\;a\;r$ & $=$ & $(\ONE \cdot b)\cdot c$ & \quad($= r'$)\\
       
   376     $\textit{der}\;b\;r$ & $=$ & $(\ZERO \cdot b)\cdot c$\\
       
   377     $\textit{der}\;c\;r$ & $=$ & $(\ZERO \cdot b)\cdot c$
       
   378   \end{tabular}
       
   379 \end{center}
       
   380 
       
   381 Let $r'$ stand for the first derivative, then taking the derivatives of $r'$
       
   382 w.r.t.~the characters $a$, $b$ and $c$ gives
       
   383 
       
   384 \begin{center}
       
   385   \begin{tabular}{lcll}
       
   386     $\textit{der}\;a\;r'$ & $=$ & $((\ZERO \cdot b) + \ZERO)\cdot c$ \\
       
   387     $\textit{der}\;b\;r'$ & $=$ & $((\ZERO \cdot b) + \ONE)\cdot c$ & \quad($= r''$)\\
       
   388     $\textit{der}\;c\;r'$ & $=$ & $((\ZERO \cdot b) + \ZERO)\cdot c$
       
   389   \end{tabular}
       
   390 \end{center}
       
   391 
       
   392 One more example: Let $r''$ stand for the second derivative above,
       
   393 then taking the derivatives of $r''$ w.r.t.~the characters $a$, $b$
       
   394 and $c$ gives
       
   395 
       
   396 \begin{center}
       
   397   \begin{tabular}{lcll}
       
   398     $\textit{der}\;a\;r''$ & $=$ & $((\ZERO \cdot b) + \ZERO) \cdot c + \ZERO$ \\
       
   399     $\textit{der}\;b\;r''$ & $=$ & $((\ZERO \cdot b) + \ZERO) \cdot c + \ZERO$\\
       
   400     $\textit{der}\;c\;r''$ & $=$ & $((\ZERO \cdot b) + \ZERO) \cdot c + \ONE$ &
       
   401     (is $\textit{nullable}$)                      
       
   402   \end{tabular}
       
   403 \end{center}
       
   404 
       
   405 Note, the last derivative can match the empty string, that is it is \textit{nullable}.\\
       
   406 \mbox{}\hfill\mbox{[1 Mark]}
       
   407 
       
   408 \item[(7)] Implement the function \textit{simp}, which recursively
       
   409   traverses a regular expression, and on the way up simplifies every
       
   410   regular expression on the left (see below) to the regular expression
       
   411   on the right, except it does not simplify inside ${}^*$-regular
       
   412   expressions.
       
   413 
       
   414   \begin{center}
       
   415 \begin{tabular}{l@{\hspace{4mm}}c@{\hspace{4mm}}ll}
       
   416 $r \cdot \ZERO$ & $\mapsto$ & $\ZERO$\\ 
       
   417 $\ZERO \cdot r$ & $\mapsto$ & $\ZERO$\\ 
       
   418 $r \cdot \ONE$ & $\mapsto$ & $r$\\ 
       
   419 $\ONE \cdot r$ & $\mapsto$ & $r$\\ 
       
   420 $r + \ZERO$ & $\mapsto$ & $r$\\ 
       
   421 $\ZERO + r$ & $\mapsto$ & $r$\\ 
       
   422 $r + r$ & $\mapsto$ & $r$\\ 
       
   423 \end{tabular}
       
   424   \end{center}
       
   425 
       
   426   For example the regular expression
       
   427   \[(r_1 + \ZERO) \cdot \ONE + ((\ONE + r_2) + r_3) \cdot (r_4 \cdot \ZERO)\]
       
   428 
       
   429   simplifies to just $r_1$. \textbf{Hint:} Regular expressions can be
       
   430   seen as trees and there are several methods for traversing
       
   431   trees. One of them corresponds to the inside-out traversal, which is also
       
   432   sometimes called post-order tra\-versal: you traverse inside the
       
   433   tree and on the way up you apply simplification rules.
       
   434   \textbf{Another Hint:}
       
   435   Remember numerical expressions from school times---there you had expressions
       
   436   like $u + \ldots + (1 \cdot x) - \ldots (z + (y \cdot 0)) \ldots$
       
   437   and simplification rules that looked very similar to rules
       
   438   above. You would simplify such numerical expressions by replacing
       
   439   for example the $y \cdot 0$ by $0$, or $1\cdot x$ by $x$, and then
       
   440   look whether more rules are applicable. If you organise the
       
   441   simplification in an inside-out fashion, it is always clear which
       
   442   simplification should be applied next.\hfill[1 Mark]
       
   443 
       
   444 \item[(8)] Implement two functions: The first, called \textit{ders},
       
   445   takes a list of characters and a regular expression as arguments, and
       
   446   builds the derivative w.r.t.~the list as follows:
       
   447 
       
   448 \begin{center}
       
   449 \begin{tabular}{lcl}
       
   450 $\textit{ders}\;(Nil)\;r$ & $\dn$ & $r$\\
       
   451   $\textit{ders}\;(c::cs)\;r$  & $\dn$ &
       
   452     $\textit{ders}\;cs\;(\textit{simp}(\textit{der}\;c\;r))$\\
       
   453 \end{tabular}
       
   454 \end{center}
       
   455 
       
   456 Note that this function is different from \textit{der}, which only
       
   457 takes a single character.
       
   458 
       
   459 The second function, called \textit{matcher}, takes a string and a
       
   460 regular expression as arguments. It builds first the derivatives
       
   461 according to \textit{ders} and after that tests whether the resulting
       
   462 derivative regular expression can match the empty string (using
       
   463 \textit{nullable}).  For example the \textit{matcher} will produce
       
   464 true for the regular expression $(a\cdot b)\cdot c$ and the string
       
   465 $abc$, but false if you give it the string $ab$. \hfill[1 Mark]
       
   466 
       
   467 \item[(9)] Implement a function, called \textit{size}, by recursion
       
   468   over regular expressions. If a regular expression is seen as a tree,
       
   469   then \textit{size} should return the number of nodes in such a
       
   470   tree. Therefore this function is defined as follows:
       
   471 
       
   472 \begin{center}
       
   473 \begin{tabular}{lcl}
       
   474 $\textit{size}(\ZERO)$ & $\dn$ & $1$\\
       
   475 $\textit{size}(\ONE)$  & $\dn$ & $1$\\
       
   476 $\textit{size}(c)$     & $\dn$ & $1$\\
       
   477 $\textit{size}(r_1 + r_2)$ & $\dn$ & $1 + \textit{size}(r_1) + \textit{size}(r_2)$\\
       
   478 $\textit{size}(r_1 \cdot r_2)$ & $\dn$ & $1 + \textit{size}(r_1) + \textit{size}(r_2)$\\
       
   479 $\textit{size}(r^*)$ & $\dn$ & $1 + \textit{size}(r)$\\
       
   480 \end{tabular}
       
   481 \end{center}
       
   482 
       
   483 You can use \textit{size} in order to test how much the ``evil'' regular
       
   484 expression $(a^*)^* \cdot b$ grows when taking successive derivatives
       
   485 according the letter $a$ without simplification and then compare it to
       
   486 taking the derivative, but simplify the result.  The sizes
       
   487 are given in \texttt{re.scala}. \hfill[1 Mark]
       
   488 
       
   489 \item[(10)] You do not have to implement anything specific under this
       
   490   task.  The purpose here is that you will be marked for some ``power''
       
   491   test cases. For example can your matcher decide within 30 seconds
       
   492   whether the regular expression $(a^*)^*\cdot b$ matches strings of the
       
   493   form $aaa\ldots{}aaaa$, for say 1 Million $a$'s. And does simplification
       
   494   simplify the regular expression
       
   495 
       
   496   \[
       
   497   \texttt{SEQ(SEQ(SEQ(..., ONE | ONE) , ONE | ONE), ONE | ONE)}
       
   498   \]  
       
   499 
       
   500   \noindent correctly to just \texttt{ONE}, where \texttt{SEQ} is nested
       
   501   50 or more times?\\
       
   502   \mbox{}\hfill[1 Mark]
       
   503 \end{itemize}
       
   504 
       
   505 \subsection*{Background}
       
   506 
       
   507 Although easily implementable in Scala, the idea behind the derivative
       
   508 function might not so easy to be seen. To understand its purpose
       
   509 better, assume a regular expression $r$ can match strings of the form
       
   510 $c\!::\!cs$ (that means strings which start with a character $c$ and have
       
   511 some rest, or tail, $cs$). If you take the derivative of $r$ with
       
   512 respect to the character $c$, then you obtain a regular expression
       
   513 that can match all the strings $cs$.  In other words, the regular
       
   514 expression $\textit{der}\;c\;r$ can match the same strings $c\!::\!cs$
       
   515 that can be matched by $r$, except that the $c$ is chopped off.
       
   516 
       
   517 Assume now $r$ can match the string $abc$. If you take the derivative
       
   518 according to $a$ then you obtain a regular expression that can match
       
   519 $bc$ (it is $abc$ where the $a$ has been chopped off). If you now
       
   520 build the derivative $\textit{der}\;b\;(\textit{der}\;a\;r)$ you
       
   521 obtain a regular expression that can match the string $c$ (it is $bc$
       
   522 where $b$ is chopped off). If you finally build the derivative of this
       
   523 according $c$, that is
       
   524 $\textit{der}\;c\;(\textit{der}\;b\;(\textit{der}\;a\;r))$, you obtain
       
   525 a regular expression that can match the empty string. You can test
       
   526 whether this is indeed the case using the function nullable, which is
       
   527 what your matcher is doing.
       
   528 
       
   529 The purpose of the $\textit{simp}$ function is to keep the regular
       
   530 expressions small. Normally the derivative function makes the regular
       
   531 expression bigger (see the SEQ case and the example in (2)) and the
       
   532 algorithm would be slower and slower over time. The $\textit{simp}$
       
   533 function counters this increase in size and the result is that the
       
   534 algorithm is fast throughout.  By the way, this algorithm is by Janusz
       
   535 Brzozowski who came up with the idea of derivatives in 1964 in his PhD
       
   536 thesis.
       
   537 
       
   538 \begin{center}\small
       
   539 \url{https://en.wikipedia.org/wiki/Janusz_Brzozowski_(computer_scientist)}
       
   540 \end{center}
       
   541 
       
   542 
       
   543 If you want to see how badly the regular expression matchers do in
       
   544 Java\footnote{Version 8 and below; Version 9 and above does not seem to be as
       
   545   catastrophic, but still much worse than the regular expression
       
   546   matcher based on derivatives.}, JavaScript and Python with the
       
   547 `evil' regular expression $(a^*)^*\cdot b$, then have a look at the
       
   548 graphs below (you can try it out for yourself: have a look at the file
       
   549 \texttt{catastrophic9.java}, \texttt{catastrophic.js} and
       
   550 \texttt{catastrophic.py} on KEATS). Compare this with the matcher you
       
   551 have implemented. How long can the string of $a$'s be in your matcher
       
   552 and still stay within the 30 seconds time limit?
       
   553 
       
   554 \begin{center}
       
   555 \begin{tabular}{@{}cc@{}}
       
   556 \multicolumn{2}{c}{Graph: $(a^*)^*\cdot b$ and strings 
       
   557            $\underbrace{a\ldots a}_{n}$}\bigskip\\
       
   558   
       
   559 \begin{tikzpicture}
       
   560 \begin{axis}[
       
   561     xlabel={$n$},
       
   562     x label style={at={(1.05,0.0)}},
       
   563     ylabel={time in secs},
       
   564     y label style={at={(0.06,0.5)}},
       
   565     enlargelimits=false,
       
   566     xtick={0,5,...,30},
       
   567     xmax=33,
       
   568     ymax=45,
       
   569     ytick={0,5,...,40},
       
   570     scaled ticks=false,
       
   571     axis lines=left,
       
   572     width=6cm,
       
   573     height=5.5cm, 
       
   574     legend entries={Python, Java 8, JavaScript},  
       
   575     legend pos=north west,
       
   576     legend cell align=left]
       
   577 \addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
       
   578 \addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
       
   579 \addplot[red,mark=*, mark options={fill=white}] table {re-js.data};
       
   580 \end{axis}
       
   581 \end{tikzpicture}
       
   582   & 
       
   583 \begin{tikzpicture}
       
   584 \begin{axis}[
       
   585     xlabel={$n$},
       
   586     x label style={at={(1.05,0.0)}},
       
   587     ylabel={time in secs},
       
   588     y label style={at={(0.06,0.5)}},
       
   589     %enlargelimits=false,
       
   590     %xtick={0,5000,...,30000},
       
   591     xmax=65000,
       
   592     ymax=45,
       
   593     ytick={0,5,...,40},
       
   594     scaled ticks=false,
       
   595     axis lines=left,
       
   596     width=6cm,
       
   597     height=5.5cm, 
       
   598     legend entries={Java 9},  
       
   599     legend pos=north west]
       
   600 \addplot[cyan,mark=*, mark options={fill=white}] table {re-java9.data};
       
   601 \end{axis}
       
   602 \end{tikzpicture}
       
   603 \end{tabular}  
       
   604 \end{center}
       
   605 \newpage
       
   606 
       
   607 
       
   608 
       
   609 
       
   610 
       
   611 \end{document}
       
   612 
       
   613 
       
   614 %%% Local Variables: 
       
   615 %%% mode: latex
       
   616 %%% TeX-master: t
       
   617 %%% End: