cws/cw04.tex
changeset 218 22705d22c105
parent 111 cd6b9fe4bce5
child 221 9e7897f25e13
equal deleted inserted replaced
217:e689375abcc1 218:22705d22c105
     1 \documentclass{article}
     1 \documentclass{article}
     2 \usepackage{../style}
     2 \usepackage{../style}
     3 \usepackage{../langs}
     3 \usepackage{../langs}
       
     4 \usepackage{disclaimer}
       
     5 \usepackage{tikz}
       
     6 \usepackage{pgf}
       
     7 \usepackage{pgfplots}
       
     8 \usepackage{stackengine}
       
     9 %% \usepackage{accents}
       
    10 \newcommand\barbelow[1]{\stackunder[1.2pt]{#1}{\raisebox{-4mm}{\boldmath$\uparrow$}}}
       
    11 
       
    12 \begin{filecontents}{re-python2.data}
       
    13 1 0.033
       
    14 5 0.036
       
    15 10 0.034
       
    16 15 0.036
       
    17 18 0.059
       
    18 19 0.084 
       
    19 20 0.141
       
    20 21 0.248
       
    21 22 0.485
       
    22 23 0.878
       
    23 24 1.71
       
    24 25 3.40
       
    25 26 7.08
       
    26 27 14.12
       
    27 28 26.69
       
    28 \end{filecontents}
       
    29 
       
    30 \begin{filecontents}{re-java.data}
       
    31 5  0.00298
       
    32 10  0.00418
       
    33 15  0.00996
       
    34 16  0.01710
       
    35 17  0.03492
       
    36 18  0.03303
       
    37 19  0.05084
       
    38 20  0.10177
       
    39 21  0.19960
       
    40 22  0.41159
       
    41 23  0.82234
       
    42 24  1.70251
       
    43 25  3.36112
       
    44 26  6.63998
       
    45 27  13.35120
       
    46 28  29.81185
       
    47 \end{filecontents}
       
    48 
       
    49 \begin{filecontents}{re-java9.data}
       
    50 1000  0.01410
       
    51 2000  0.04882
       
    52 3000  0.10609
       
    53 4000  0.17456
       
    54 5000  0.27530
       
    55 6000  0.41116
       
    56 7000  0.53741
       
    57 8000  0.70261
       
    58 9000  0.93981
       
    59 10000 0.97419
       
    60 11000 1.28697
       
    61 12000 1.51387
       
    62 14000 2.07079
       
    63 16000 2.69846
       
    64 20000 4.41823
       
    65 24000 6.46077
       
    66 26000 7.64373
       
    67 30000 9.99446
       
    68 34000 12.966885
       
    69 38000 16.281621
       
    70 42000 19.180228
       
    71 46000 21.984721
       
    72 50000 26.950203
       
    73 60000 43.0327746
       
    74 \end{filecontents}
       
    75 
     4 
    76 
     5 \begin{document}
    77 \begin{document}
     6 
    78 
     7 \section*{Replacement Coursework 1 (Roman Numerals)}
    79 % BF IDE
     8 
    80 % https://www.microsoft.com/en-us/p/brainf-ck/9nblgggzhvq5
     9 This coursework is worth 10\%. It is about translating roman numerals
    81   
    10 into integers and also about validating roman numerals.  The coursework
    82 \section*{Coursework 8 (Regular Expressions and Brainf***)}
    11 is due on 2 February at 5pm.  Make sure the files you submit can be
    83 
    12 processed by just calling \texttt{scala <<filename.scala>>}.\bigskip
    84 This coursework is worth 10\%. It is about regular expressions,
       
    85 pattern matching and an interpreter. The first part is due on 30
       
    86 November at 11pm; the second, more advanced part, is due on 21
       
    87 December at 11pm. In the first part, you are asked to implement a
       
    88 regular expression matcher based on derivatives of regular
       
    89 expressions. The reason is that regular expression matching in Java
       
    90 and Python can sometimes be extremely slow. The advanced part is about
       
    91 an interpreter for a very simple programming language.\bigskip
       
    92 
       
    93 \IMPORTANT{}
    13 
    94 
    14 \noindent
    95 \noindent
    15 \textbf{Important:} Do not use any mutable data structures in your
    96 Also note that the running time of each part will be restricted to a
    16 submission! They are not needed. This menas you cannot use 
    97 maximum of 30 seconds on my laptop.
    17 \texttt{ListBuffer}s, for example. Do not use \texttt{return} in your
    98 
    18 code! It has a different meaning in Scala, than in Java.  Do not use
    99 \DISCLAIMER{}
    19 \texttt{var}! This declares a mutable variable.  Make sure the
   100 
    20 functions you submit are defined on the ``top-level'' of Scala, not
   101 
    21 inside a class or object. Also note that the running time will be
   102 \subsection*{Part 1 (6 Marks)}
    22 restricted to a maximum of 360 seconds on my laptop.
   103 
    23 
   104 The task is to implement a regular expression matcher that is based on
    24 
   105 derivatives of regular expressions. Most of the functions are defined by
    25 \subsection*{Disclaimer}
   106 recursion over regular expressions and can be elegantly implemented
    26 
   107 using Scala's pattern-matching. The implementation should deal with the
    27 It should be understood that the work you submit represents your own
   108 following regular expressions, which have been predefined in the file
    28 effort! You have not copied from anyone else. An exception is the
   109 \texttt{re.scala}:
    29 Scala code I showed during the lectures or uploaded to KEATS, which
   110 
    30 you can freely use.\bigskip
   111 \begin{center}
    31 
   112 \begin{tabular}{lcll}
    32 
   113   $r$ & $::=$ & $\ZERO$     & cannot match anything\\
    33 \subsection*{Part 1 (Translation)}
   114       &   $|$ & $\ONE$      & can only match the empty string\\
       
   115       &   $|$ & $c$         & can match a single character (in this case $c$)\\
       
   116       &   $|$ & $r_1 + r_2$ & can match a string either with $r_1$ or with $r_2$\\
       
   117   &   $|$ & $r_1\cdot r_2$ & can match the first part of a string with $r_1$ and\\
       
   118           &  & & then the second part with $r_2$\\
       
   119       &   $|$ & $r^*$       & can match zero or more times $r$\\
       
   120 \end{tabular}
       
   121 \end{center}
       
   122 
       
   123 \noindent 
       
   124 Why? Knowing how to match regular expressions and strings will let you
       
   125 solve a lot of problems that vex other humans. Regular expressions are
       
   126 one of the fastest and simplest ways to match patterns in text, and
       
   127 are endlessly useful for searching, editing and analysing data in all
       
   128 sorts of places (for example analysing network traffic in order to
       
   129 detect security breaches). However, you need to be fast, otherwise you
       
   130 will stumble over problems such as recently reported at
       
   131 
       
   132 {\small
       
   133 \begin{itemize}
       
   134 \item[$\bullet$] \url{http://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}
       
   135 \item[$\bullet$] \url{https://vimeo.com/112065252}
       
   136 \item[$\bullet$] \url{http://davidvgalbraith.com/how-i-fixed-atom/}  
       
   137 \end{itemize}}
       
   138 
       
   139 \subsubsection*{Tasks (file re.scala)}
       
   140 
       
   141 The file \texttt{re.scala} has already a definition for regular
       
   142 expressions and also defines some handy shorthand notation for
       
   143 regular expressions. The notation in this document matches up
       
   144 with the code in the file as follows:
       
   145 
       
   146 \begin{center}
       
   147   \begin{tabular}{rcl@{\hspace{10mm}}l}
       
   148     & & code: & shorthand:\smallskip \\ 
       
   149   $\ZERO$ & $\mapsto$ & \texttt{ZERO}\\
       
   150   $\ONE$  & $\mapsto$ & \texttt{ONE}\\
       
   151   $c$     & $\mapsto$ & \texttt{CHAR(c)}\\
       
   152   $r_1 + r_2$ & $\mapsto$ & \texttt{ALT(r1, r2)} & \texttt{r1 | r2}\\
       
   153   $r_1 \cdot r_2$ & $\mapsto$ & \texttt{SEQ(r1, r2)} & \texttt{r1 $\sim$ r2}\\
       
   154   $r^*$ & $\mapsto$ &  \texttt{STAR(r)} & \texttt{r.\%}
       
   155 \end{tabular}    
       
   156 \end{center}  
       
   157 
       
   158 
       
   159 \begin{itemize}
       
   160 \item[(1a)] Implement a function, called \textit{nullable}, by
       
   161   recursion over regular expressions. This function tests whether a
       
   162   regular expression can match the empty string. This means given a
       
   163   regular expression it either returns true or false. The function
       
   164   \textit{nullable}
       
   165   is defined as follows:
       
   166 
       
   167 \begin{center}
       
   168 \begin{tabular}{lcl}
       
   169 $\textit{nullable}(\ZERO)$ & $\dn$ & $\textit{false}$\\
       
   170 $\textit{nullable}(\ONE)$  & $\dn$ & $\textit{true}$\\
       
   171 $\textit{nullable}(c)$     & $\dn$ & $\textit{false}$\\
       
   172 $\textit{nullable}(r_1 + r_2)$ & $\dn$ & $\textit{nullable}(r_1) \vee \textit{nullable}(r_2)$\\
       
   173 $\textit{nullable}(r_1 \cdot r_2)$ & $\dn$ & $\textit{nullable}(r_1) \wedge \textit{nullable}(r_2)$\\
       
   174 $\textit{nullable}(r^*)$ & $\dn$ & $\textit{true}$\\
       
   175 \end{tabular}
       
   176 \end{center}~\hfill[1 Mark]
       
   177 
       
   178 \item[(1b)] Implement a function, called \textit{der}, by recursion over
       
   179   regular expressions. It takes a character and a regular expression
       
   180   as arguments and calculates the derivative regular expression according
       
   181   to the rules:
       
   182 
       
   183 \begin{center}
       
   184 \begin{tabular}{lcl}
       
   185 $\textit{der}\;c\;(\ZERO)$ & $\dn$ & $\ZERO$\\
       
   186 $\textit{der}\;c\;(\ONE)$  & $\dn$ & $\ZERO$\\
       
   187 $\textit{der}\;c\;(d)$     & $\dn$ & $\textit{if}\; c = d\;\textit{then} \;\ONE \; \textit{else} \;\ZERO$\\
       
   188 $\textit{der}\;c\;(r_1 + r_2)$ & $\dn$ & $(\textit{der}\;c\;r_1) + (\textit{der}\;c\;r_2)$\\
       
   189 $\textit{der}\;c\;(r_1 \cdot r_2)$ & $\dn$ & $\textit{if}\;\textit{nullable}(r_1)$\\
       
   190       & & $\textit{then}\;((\textit{der}\;c\;r_1)\cdot r_2) + (\textit{der}\;c\;r_2)$\\
       
   191       & & $\textit{else}\;(\textit{der}\;c\;r_1)\cdot r_2$\\
       
   192 $\textit{der}\;c\;(r^*)$ & $\dn$ & $(\textit{der}\;c\;r)\cdot (r^*)$\\
       
   193 \end{tabular}
       
   194 \end{center}
       
   195 
       
   196 For example given the regular expression $r = (a \cdot b) \cdot c$, the derivatives
       
   197 w.r.t.~the characters $a$, $b$ and $c$ are
       
   198 
       
   199 \begin{center}
       
   200   \begin{tabular}{lcll}
       
   201     $\textit{der}\;a\;r$ & $=$ & $(\ONE \cdot b)\cdot c$ & ($= r'$)\\
       
   202     $\textit{der}\;b\;r$ & $=$ & $(\ZERO \cdot b)\cdot c$\\
       
   203     $\textit{der}\;c\;r$ & $=$ & $(\ZERO \cdot b)\cdot c$
       
   204   \end{tabular}
       
   205 \end{center}
       
   206 
       
   207 Let $r'$ stand for the first derivative, then taking the derivatives of $r'$
       
   208 w.r.t.~the characters $a$, $b$ and $c$ gives
       
   209 
       
   210 \begin{center}
       
   211   \begin{tabular}{lcll}
       
   212     $\textit{der}\;a\;r'$ & $=$ & $((\ZERO \cdot b) + \ZERO)\cdot c$ \\
       
   213     $\textit{der}\;b\;r'$ & $=$ & $((\ZERO \cdot b) + \ONE)\cdot c$ & ($= r''$)\\
       
   214     $\textit{der}\;c\;r'$ & $=$ & $((\ZERO \cdot b) + \ZERO)\cdot c$
       
   215   \end{tabular}
       
   216 \end{center}
       
   217 
       
   218 One more example: Let $r''$ stand for the second derivative above,
       
   219 then taking the derivatives of $r''$ w.r.t.~the characters $a$, $b$
       
   220 and $c$ gives
       
   221 
       
   222 \begin{center}
       
   223   \begin{tabular}{lcll}
       
   224     $\textit{der}\;a\;r''$ & $=$ & $((\ZERO \cdot b) + \ZERO) \cdot c + \ZERO$ \\
       
   225     $\textit{der}\;b\;r''$ & $=$ & $((\ZERO \cdot b) + \ZERO) \cdot c + \ZERO$\\
       
   226     $\textit{der}\;c\;r''$ & $=$ & $((\ZERO \cdot b) + \ZERO) \cdot c + \ONE$ &
       
   227     (is $\textit{nullable}$)                      
       
   228   \end{tabular}
       
   229 \end{center}
       
   230 
       
   231 Note, the last derivative can match the empty string, that is it is \textit{nullable}.\\
       
   232 \mbox{}\hfill\mbox{[1 Mark]}
       
   233 
       
   234 \item[(1c)] Implement the function \textit{simp}, which recursively
       
   235   traverses a regular expression from the inside to the outside, and
       
   236   on the way simplifies every regular expression on the left (see
       
   237   below) to the regular expression on the right, except it does not
       
   238   simplify inside ${}^*$-regular expressions.
       
   239 
       
   240   \begin{center}
       
   241 \begin{tabular}{l@{\hspace{4mm}}c@{\hspace{4mm}}ll}
       
   242 $r \cdot \ZERO$ & $\mapsto$ & $\ZERO$\\ 
       
   243 $\ZERO \cdot r$ & $\mapsto$ & $\ZERO$\\ 
       
   244 $r \cdot \ONE$ & $\mapsto$ & $r$\\ 
       
   245 $\ONE \cdot r$ & $\mapsto$ & $r$\\ 
       
   246 $r + \ZERO$ & $\mapsto$ & $r$\\ 
       
   247 $\ZERO + r$ & $\mapsto$ & $r$\\ 
       
   248 $r + r$ & $\mapsto$ & $r$\\ 
       
   249 \end{tabular}
       
   250   \end{center}
       
   251 
       
   252   For example the regular expression
       
   253   \[(r_1 + \ZERO) \cdot \ONE + ((\ONE + r_2) + r_3) \cdot (r_4 \cdot \ZERO)\]
       
   254 
       
   255   simplifies to just $r_1$. \textbf{Hint:} Regular expressions can be
       
   256   seen as trees and there are several methods for traversing
       
   257   trees. One of them corresponds to the inside-out traversal, which is
       
   258   sometimes also called post-order traversal. Furthermore,
       
   259   remember numerical expressions from school times: there you had expressions
       
   260   like $u + \ldots + (1 \cdot x) - \ldots (z + (y \cdot 0)) \ldots$
       
   261   and simplification rules that looked very similar to rules
       
   262   above. You would simplify such numerical expressions by replacing
       
   263   for example the $y \cdot 0$ by $0$, or $1\cdot x$ by $x$, and then
       
   264   look whether more rules are applicable. If you organise the
       
   265   simplification in an inside-out fashion, it is always clear which
       
   266   rule should be applied next.\hfill[2 Marks]
       
   267 
       
   268 \item[(1d)] Implement two functions: The first, called \textit{ders},
       
   269   takes a list of characters and a regular expression as arguments, and
       
   270   builds the derivative w.r.t.~the list as follows:
       
   271 
       
   272 \begin{center}
       
   273 \begin{tabular}{lcl}
       
   274 $\textit{ders}\;(Nil)\;r$ & $\dn$ & $r$\\
       
   275   $\textit{ders}\;(c::cs)\;r$  & $\dn$ &
       
   276     $\textit{ders}\;cs\;(\textit{simp}(\textit{der}\;c\;r))$\\
       
   277 \end{tabular}
       
   278 \end{center}
       
   279 
       
   280 Note that this function is different from \textit{der}, which only
       
   281 takes a single character.
       
   282 
       
   283 The second function, called \textit{matcher}, takes a string and a
       
   284 regular expression as arguments. It builds first the derivatives
       
   285 according to \textit{ders} and after that tests whether the resulting
       
   286 derivative regular expression can match the empty string (using
       
   287 \textit{nullable}).  For example the \textit{matcher} will produce
       
   288 true for the regular expression $(a\cdot b)\cdot c$ and the string
       
   289 $abc$, but false if you give it the string $ab$. \hfill[1 Mark]
       
   290 
       
   291 \item[(1e)] Implement a function, called \textit{size}, by recursion
       
   292   over regular expressions. If a regular expression is seen as a tree,
       
   293   then \textit{size} should return the number of nodes in such a
       
   294   tree. Therefore this function is defined as follows:
       
   295 
       
   296 \begin{center}
       
   297 \begin{tabular}{lcl}
       
   298 $\textit{size}(\ZERO)$ & $\dn$ & $1$\\
       
   299 $\textit{size}(\ONE)$  & $\dn$ & $1$\\
       
   300 $\textit{size}(c)$     & $\dn$ & $1$\\
       
   301 $\textit{size}(r_1 + r_2)$ & $\dn$ & $1 + \textit{size}(r_1) + \textit{size}(r_2)$\\
       
   302 $\textit{size}(r_1 \cdot r_2)$ & $\dn$ & $1 + \textit{size}(r_1) + \textit{size}(r_2)$\\
       
   303 $\textit{size}(r^*)$ & $\dn$ & $1 + \textit{size}(r)$\\
       
   304 \end{tabular}
       
   305 \end{center}
       
   306 
       
   307 You can use \textit{size} in order to test how much the `evil' regular
       
   308 expression $(a^*)^* \cdot b$ grows when taking successive derivatives
       
   309 according the letter $a$ without simplification and then compare it to
       
   310 taking the derivative, but simplify the result.  The sizes
       
   311 are given in \texttt{re.scala}. \hfill[1 Mark]
       
   312 \end{itemize}
       
   313 
       
   314 \subsection*{Background}
       
   315 
       
   316 Although easily implementable in Scala, the idea behind the derivative
       
   317 function might not so easy to be seen. To understand its purpose
       
   318 better, assume a regular expression $r$ can match strings of the form
       
   319 $c\!::\!cs$ (that means strings which start with a character $c$ and have
       
   320 some rest, or tail, $cs$). If you take the derivative of $r$ with
       
   321 respect to the character $c$, then you obtain a regular expression
       
   322 that can match all the strings $cs$.  In other words, the regular
       
   323 expression $\textit{der}\;c\;r$ can match the same strings $c\!::\!cs$
       
   324 that can be matched by $r$, except that the $c$ is chopped off.
       
   325 
       
   326 Assume now $r$ can match the string $abc$. If you take the derivative
       
   327 according to $a$ then you obtain a regular expression that can match
       
   328 $bc$ (it is $abc$ where the $a$ has been chopped off). If you now
       
   329 build the derivative $\textit{der}\;b\;(\textit{der}\;a\;r)$ you
       
   330 obtain a regular expression that can match the string $c$ (it is $bc$
       
   331 where $b$ is chopped off). If you finally build the derivative of this
       
   332 according $c$, that is
       
   333 $\textit{der}\;c\;(\textit{der}\;b\;(\textit{der}\;a\;r))$, you obtain
       
   334 a regular expression that can match the empty string. You can test
       
   335 whether this is indeed the case using the function nullable, which is
       
   336 what your matcher is doing.
       
   337 
       
   338 The purpose of the $\textit{simp}$ function is to keep the regular
       
   339 expressions small. Normally the derivative function makes the regular
       
   340 expression bigger (see the SEQ case and the example in (1b)) and the
       
   341 algorithm would be slower and slower over time. The $\textit{simp}$
       
   342 function counters this increase in size and the result is that the
       
   343 algorithm is fast throughout.  By the way, this algorithm is by Janusz
       
   344 Brzozowski who came up with the idea of derivatives in 1964 in his PhD
       
   345 thesis.
       
   346 
       
   347 \begin{center}\small
       
   348 \url{https://en.wikipedia.org/wiki/Janusz_Brzozowski_(computer_scientist)}
       
   349 \end{center}
       
   350 
       
   351 
       
   352 If you want to see how badly the regular expression matchers do in
       
   353 Java\footnote{Version 8 and below; Version 9 does not seem to be as
       
   354   catastrophic, but still worse than the regular expression matcher
       
   355 based on derivatives.} and in Python with the `evil' regular
       
   356 expression $(a^*)^*\cdot b$, then have a look at the graphs below (you
       
   357 can try it out for yourself: have a look at the file
       
   358 \texttt{catastrophic.java} and \texttt{catastrophic.py} on
       
   359 KEATS). Compare this with the matcher you have implemented. How long
       
   360 can the string of $a$'s be in your matcher and still stay within the
       
   361 30 seconds time limit?
       
   362 
       
   363 \begin{center}
       
   364 \begin{tabular}{@{}cc@{}}
       
   365 \multicolumn{2}{c}{Graph: $(a^*)^*\cdot b$ and strings 
       
   366            $\underbrace{a\ldots a}_{n}$}\bigskip\\
       
   367   
       
   368 \begin{tikzpicture}
       
   369 \begin{axis}[
       
   370     xlabel={$n$},
       
   371     x label style={at={(1.05,0.0)}},
       
   372     ylabel={time in secs},
       
   373     y label style={at={(0.06,0.5)}},
       
   374     enlargelimits=false,
       
   375     xtick={0,5,...,30},
       
   376     xmax=33,
       
   377     ymax=45,
       
   378     ytick={0,5,...,40},
       
   379     scaled ticks=false,
       
   380     axis lines=left,
       
   381     width=6cm,
       
   382     height=5.5cm, 
       
   383     legend entries={Python, Java 8},  
       
   384     legend pos=north west]
       
   385 \addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
       
   386 \addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
       
   387 \end{axis}
       
   388 \end{tikzpicture}
       
   389   & 
       
   390 \begin{tikzpicture}
       
   391 \begin{axis}[
       
   392     xlabel={$n$},
       
   393     x label style={at={(1.05,0.0)}},
       
   394     ylabel={time in secs},
       
   395     y label style={at={(0.06,0.5)}},
       
   396     %enlargelimits=false,
       
   397     %xtick={0,5000,...,30000},
       
   398     xmax=65000,
       
   399     ymax=45,
       
   400     ytick={0,5,...,40},
       
   401     scaled ticks=false,
       
   402     axis lines=left,
       
   403     width=6cm,
       
   404     height=5.5cm, 
       
   405     legend entries={Java 9},  
       
   406     legend pos=north west]
       
   407 \addplot[cyan,mark=*, mark options={fill=white}] table {re-java9.data};
       
   408 \end{axis}
       
   409 \end{tikzpicture}
       
   410 \end{tabular}  
       
   411 \end{center}
       
   412 \newpage
       
   413 
       
   414 \subsection*{Part 2 (4 Marks)}
       
   415 
       
   416 Coming from Java or C++, you might think Scala is a quite esoteric
       
   417 programming language.  But remember, some serious companies have built
       
   418 their business on
       
   419 Scala.\footnote{\url{https://en.wikipedia.org/wiki/Scala_(programming_language)\#Companies}}
       
   420 And there are far, far more esoteric languages out there. One is
       
   421 called \emph{brainf***}. You are asked in this part to implement an
       
   422 interpreter for this language.
       
   423 
       
   424 Urban M\"uller developed brainf*** in 1993.  A close relative of this
       
   425 language was already introduced in 1964 by Corado B\"ohm, an Italian
       
   426 computer pioneer, who unfortunately died a few months ago. The main
       
   427 feature of brainf*** is its minimalistic set of instructions---just 8
       
   428 instructions in total and all of which are single characters. Despite
       
   429 the minimalism, this language has been shown to be Turing
       
   430 complete\ldots{}if this doesn't ring any bell with you: it roughly
       
   431 means that every algorithm we know can, in principle, be implemented in
       
   432 brainf***. It just takes a lot of determination and quite a lot of
       
   433 memory resources. Some relatively sophisticated sample programs in
       
   434 brainf*** are given in the file \texttt{bf.scala}.\bigskip
    34 
   435 
    35 \noindent
   436 \noindent
    36 Roman numerals are strings consisting of the letters $I$, $V$, $X$,
   437 As mentioned above, brainf*** has 8 single-character commands, namely
    37 $L$, $C$, $D$, and $M$. Such strings should be transformed into an
   438 \texttt{'>'}, \texttt{'<'}, \texttt{'+'}, \texttt{'-'}, \texttt{'.'},
    38 internal representation using the datatypes \texttt{RomanDigit} and
   439 \texttt{','}, \texttt{'['} and \texttt{']'}. Every other character is
    39 \texttt{RomanNumeral} (defined in \texttt{roman.scala}), and then from
   440 considered a comment.  Brainf*** operates on memory cells containing
    40 this internal representation converted into Integers.
   441 integers. For this it uses a single memory pointer that points at each
       
   442 stage to one memory cell. This pointer can be moved forward by one
       
   443 memory cell by using the command \texttt{'>'}, and backward by using
       
   444 \texttt{'<'}. The commands \texttt{'+'} and \texttt{'-'} increase,
       
   445 respectively decrease, by 1 the content of the memory cell to which
       
   446 the memory pointer currently points to. The commands for input/output
       
   447 are \texttt{','} and \texttt{'.'}. Output works by reading the content
       
   448 of the memory cell to which the memory pointer points to and printing
       
   449 it out as an ASCII character. Input works the other way, taking some
       
   450 user input and storing it in the cell to which the memory pointer
       
   451 points to. The commands \texttt{'['} and \texttt{']'} are looping
       
   452 constructs. Everything in between \texttt{'['} and \texttt{']'} is
       
   453 repeated until a counter (memory cell) reaches zero.  A typical
       
   454 program in brainf*** looks as follows:
       
   455 
       
   456 \begin{center}
       
   457 \begin{verbatim}
       
   458  ++++++++[>++++[>++>+++>+++>+<<<<-]>+>+>->>+[<]<-]>>.>---.+++++++
       
   459  ..+++.>>.<-.<.+++.------.--------.>>+.>++.
       
   460 \end{verbatim}
       
   461 \end{center}  
       
   462 
       
   463 \noindent
       
   464 This one prints out Hello World\ldots{}obviously. 
       
   465 
       
   466 \subsubsection*{Tasks (file bf.scala)}
    41 
   467 
    42 \begin{itemize}
   468 \begin{itemize}
    43 \item[(1)] First write a polymorphic function that recursively
   469 \item[(2a)] Brainf*** memory is represented by a \texttt{Map} from
    44   transforms a list of options into an option of a list. For example,
   470   integers to integers. The empty memory is represented by
    45   if you have the lists on the left-hand side, they should be transformed into
   471   \texttt{Map()}, that is nothing is stored in the
    46   the options on the right-hand side:
   472   memory. \texttt{Map(0 -> 1, 2 -> 3)} clearly stores \texttt{1} at
    47 
   473   memory location \texttt{0}; at \texttt{2} it stores \texttt{3}. The
    48   \begin{center}
   474   convention is that if we query the memory at a location that is
    49   \begin{tabular}{lcl}  
   475   \emph{not} defined in the \texttt{Map}, we return \texttt{0}. Write
    50     \texttt{List(Some(1), Some(2), Some(3))} & $\Rightarrow$ &
   476   a function, \texttt{sread}, that takes a memory (a \texttt{Map}) and
    51     \texttt{Some(List(1, 2, 3))} \\
   477   a memory pointer (an \texttt{Int}) as argument, and safely reads the
    52     \texttt{List(Some(1), None, Some(3))} & $\Rightarrow$ &
   478   corresponding memory location. If the \texttt{Map} is not defined at
    53     \texttt{None} \\
   479   the memory pointer, \texttt{sread} returns \texttt{0}.
    54     \texttt{List()} & $\Rightarrow$ & \texttt{Some(List())}
   480 
    55   \end{tabular}  
   481   Write another function \texttt{write}, which takes a memory, a
    56   \end{center}
   482   memory pointer and an integer value as argument and updates the
    57 
   483   \texttt{Map} with the value at the given memory location. As usual
    58   This means the function should produce \texttt{None} as soon
   484   the \texttt{Map} is not updated `in-place' but a new map is created
    59   as a \texttt{None} is inside the list. Otherwise it produces
   485   with the same data, except the value is stored at the given memory
    60   a list of all \texttt{Some}s. In case the list is empty, it
   486   pointer.\hfill[1 Mark]
    61   produces \texttt{Some} of the empty list. \hfill[1 Mark]
   487 
    62 
   488 \item[(2b)] Write two functions, \texttt{jumpRight} and
    63  
   489   \texttt{jumpLeft} that are needed to implement the loop constructs
    64 \item[(2)] Write first a function that converts the characters $I$, $V$,
   490   of brainf***. They take a program (a \texttt{String}) and a program
    65   $X$, $L$, $C$, $D$, and $M$ into an option of a \texttt{RomanDigit}.
   491   counter (an \texttt{Int}) as argument and move right (respectively
    66   If it is one of the roman digits, it should produce \texttt{Some};
   492   left) in the string in order to find the \textbf{matching}
    67   otherwise \texttt{None}.
   493   opening/closing bracket. For example, given the following program
       
   494   with the program counter indicated by an arrow:
       
   495 
       
   496   \begin{center}
       
   497   \texttt{--[\barbelow{.}.+>--],>,++}
       
   498   \end{center}
       
   499 
       
   500   then the matching closing bracket is in 9th position (counting from 0) and
       
   501   \texttt{jumpRight} is supposed to return the position just after this
    68   
   502   
    69   Next write a function that converts a string into a
   503   \begin{center}
    70   \texttt{RomanNumeral}.  Again, this function should return an
   504   \texttt{--[..+>--]\barbelow{,}>,++}
    71   \texttt{Option}: If the string consists of $I$, $V$, $X$, $L$, $C$,
   505   \end{center}
    72   $D$, and $M$ only, then it produces \texttt{Some}; otherwise if
   506 
    73   there is any other character in the string, it should produce
   507   meaning it jumps to after the loop. Similarly, if you are in 8th position
    74   \texttt{None}. The empty string is just the empty
   508   then \texttt{jumpLeft} is supposed to jump to just after the opening
    75   \texttt{RomanNumeral}, that is the empty list of
   509   bracket (that is jumping to the beginning of the loop):
    76   \texttt{RomanDigit}'s.  You should use the function under Task (1)
   510 
    77   to produce the result.  \hfill[2 Marks]
   511   \begin{center}
    78 
   512     \texttt{--[..+>-\barbelow{-}],>,++}
    79 \item[(3)] Write a recursive function \texttt{RomanNumral2Int} that
   513     \qquad$\stackrel{\texttt{jumpLeft}}{\longrightarrow}$\qquad
    80   converts a \texttt{RomanNumeral} into an integer. You can assume the
   514     \texttt{--[\barbelow{.}.+>--],>,++}
    81   generated integer will be between 0 and 3999.  The argument of the
   515   \end{center}
    82   function is a list of roman digits. It should look how this list
   516 
    83   starts and then calculate what the corresponding integer is for this
   517   Unfortunately we have to take into account that there might be
    84   ``start'' and add it with the integer for the rest of the list. That
   518   other opening and closing brackets on the `way' to find the
    85   means if the argument is of the form shown on the left-hand side, it
   519   matching bracket. For example in the brainf*** program
    86   should do the calculation on the right-hand side.
   520 
    87 
   521   \begin{center}
    88   \begin{center}
   522   \texttt{--[\barbelow{.}.[+>]--],>,++}
    89   \begin{tabular}{lcl}
   523   \end{center}
    90     $M::r$    & $\Rightarrow$ & $1000 + \text{roman numeral of rest}\; r$\\
   524 
    91     $C::M::r$ & $\Rightarrow$ & $900 + \text{roman numeral of rest}\; r$\\
   525   we do not want to return the index for the \texttt{'-'} in the 9th
    92     $D::r$    & $\Rightarrow$ & $500 + \text{roman numeral of rest}\; r$\\
   526   position, but the program counter for \texttt{','} in 12th
    93     $C::D::r$ & $\Rightarrow$ & $400 + \text{roman numeral of rest}\; r$\\
   527   position. The easiest to find out whether a bracket is matched is by
    94     $C::r$    & $\Rightarrow$ & $100 + \text{roman numeral of rest}\; r$\\
   528   using levels (which are the third argument in \texttt{jumpLeft} and
    95     $X::C::r$ & $\Rightarrow$ & $90 + \text{roman numeral of rest}\; r$\\
   529   \texttt{jumpLeft}). In case of \texttt{jumpRight} you increase the
    96     $L::r$    & $\Rightarrow$ & $50 + \text{roman numeral of rest}\; r$\\
   530   level by one whenever you find an opening bracket and decrease by
    97     $X::L::r$ & $\Rightarrow$ & $40 + \text{roman numeral of rest}\; r$\\
   531   one for a closing bracket. Then in \texttt{jumpRight} you are looking
    98     $X::r$    & $\Rightarrow$ & $10 + \text{roman numeral of rest}\; r$\\
   532   for the closing bracket on level \texttt{0}. For \texttt{jumpLeft} you
    99     $I::X::r$ & $\Rightarrow$ & $9 + \text{roman numeral of rest}\; r$\\
   533   do the opposite. In this way you can find \textbf{matching} brackets
   100     $V::r$    & $\Rightarrow$ & $5 + \text{roman numeral of rest}\; r$\\
   534   in strings such as
   101     $I::V::r$ & $\Rightarrow$ & $4 + \text{roman numeral of rest}\; r$\\
   535 
   102     $I::r$    & $\Rightarrow$ & $1 + \text{roman numeral of rest}\; r$
   536   \begin{center}
   103   \end{tabular}  
   537   \texttt{--[\barbelow{.}.[[-]+>[.]]--],>,++}
   104   \end{center}    
   538   \end{center}
   105 
   539 
   106   The empty list will be converted to integer $0$.\hfill[1 Mark]
   540   for which \texttt{jumpRight} should produce the position:
       
   541 
       
   542   \begin{center}
       
   543   \texttt{--[..[[-]+>[.]]--]\barbelow{,}>,++}
       
   544   \end{center}
       
   545 
       
   546   It is also possible that the position returned by \texttt{jumpRight} or
       
   547   \texttt{jumpLeft} is outside the string in cases where there are
       
   548   no matching brackets. For example
       
   549 
       
   550   \begin{center}
       
   551   \texttt{--[\barbelow{.}.[[-]+>[.]]--,>,++}
       
   552   \qquad$\stackrel{\texttt{jumpRight}}{\longrightarrow}$\qquad
       
   553   \texttt{--[..[[-]+>[.]]-->,++\barbelow{\;\phantom{+}}}
       
   554   \end{center}
       
   555   \hfill[1 Mark]
       
   556 
       
   557 
       
   558 \item[(2c)] Write a recursive function \texttt{run} that executes a
       
   559   brainf*** program. It takes a program, a program counter, a memory
       
   560   pointer and a memory as arguments. If the program counter is outside
       
   561   the program string, the execution stops and \texttt{run} returns the
       
   562   memory. If the program counter is inside the string, it reads the
       
   563   corresponding character and updates the program counter \texttt{pc},
       
   564   memory pointer \texttt{mp} and memory \texttt{mem} according to the
       
   565   rules shown in Figure~\ref{comms}. It then calls recursively
       
   566   \texttt{run} with the updated data.
       
   567 
       
   568   Write another function \texttt{start} that calls \texttt{run} with a
       
   569   given brainfu** program and memory, and the program counter and memory pointer
       
   570   set to~$0$. Like \texttt{run} it returns the memory after the execution
       
   571   of the program finishes. You can test your brainf**k interpreter with the
       
   572   Sierpinski triangle or the Hello world programs or have a look at
       
   573 
       
   574   \begin{center}
       
   575   \url{https://esolangs.org/wiki/Brainfuck}
       
   576   \end{center}\hfill[2 Marks]
   107   
   577   
   108 \item[(4)] Write a function that takes a string and if possible
   578   \begin{figure}[p]
   109   converts it into the internal representation. If successful, it then
   579   \begin{center}
   110   calculates the integer (an option of an integer) according to the
   580     \begin{tabular}{|@{}p{0.8cm}|l|}
   111   function in (3).  If this is not possible, then return
   581       \hline
   112   \texttt{None}.\hfill[1 Mark]
   582       \hfill\texttt{'>'} & \begin{tabular}[t]{@{}l@{\hspace{2mm}}l@{}}
   113 
   583                        $\bullet$ & $\texttt{pc} + 1$\\
   114 
   584                        $\bullet$ & $\texttt{mp} + 1$\\
   115 \item[(5)] The file \texttt{roman.txt} contains a list of roman numerals.
   585                        $\bullet$ & \texttt{mem} unchanged
   116   Read in these numerals, convert them into integers and then add them all
   586                      \end{tabular}\\\hline   
   117   up. The Scala function for reading a file is
   587       \hfill\texttt{'<'} & \begin{tabular}[t]{@{}l@{\hspace{2mm}}l@{}}
   118 
   588                        $\bullet$ & $\texttt{pc} + 1$\\
   119   \begin{center}
   589                        $\bullet$ & $\texttt{mp} - 1$\\
   120   \texttt{Source.fromFile("filename")("ISO-8859-9")}
   590                        $\bullet$ & \texttt{mem} unchanged
   121   \end{center}
   591                      \end{tabular}\\\hline   
   122 
   592       \hfill\texttt{'+'} & \begin{tabular}[t]{@{}l@{\hspace{2mm}}l@{}}
   123   Make sure you process the strings correctly by ignoring whitespaces
   593                        $\bullet$ & $\texttt{pc} + 1$\\
   124   where needed.\\ \mbox{}\hfill[1 Mark]
   594                        $\bullet$ & $\texttt{mp}$ unchanged\\
   125 \end{itemize}
   595                        $\bullet$ & \texttt{mem} updated with \texttt{mp -> mem(mp) + 1}\\
   126 
   596                      \end{tabular}\\\hline   
   127 
   597       \hfill\texttt{'-'} & \begin{tabular}[t]{@{}l@{\hspace{2mm}}l@{}}
   128 \subsection*{Part 2 (Validation)}
   598                        $\bullet$ & $\texttt{pc} + 1$\\
   129 
   599                        $\bullet$ & $\texttt{mp}$ unchanged\\
   130 As you can see the function under Task (3) can produce some unexpected
   600                        $\bullet$ & \texttt{mem} updated with \texttt{mp -> mem(mp) - 1}\\
   131 results. For example for $XXCIII$ it produces 103. The reason for this
   601                      \end{tabular}\\\hline   
   132 unexpected result is that $XXCIII$ is actually not a valid roman
   602       \hfill\texttt{'.'} & \begin{tabular}[t]{@{}l@{\hspace{2mm}}l@{}}
   133 number, neither is $IIII$ for 4 nor $MIM$ for 1999. Although actual
   603                        $\bullet$ & $\texttt{pc} + 1$\\
   134 Romans were not so fussy about this,\footnote{They happily used
   604                        $\bullet$ & $\texttt{mp}$ and \texttt{mem} unchanged\\
   135   numbers like $XIIX$ or $IIXX$ for 18.} but modern times declared
   605                        $\bullet$ & print out \,\texttt{mem(mp)} as a character\\
   136 that there are precise rules for what a valid roman number is, namely:
   606                      \end{tabular}\\\hline   
   137 
   607       \hfill\texttt{','} & \begin{tabular}[t]{@{}l@{\hspace{2mm}}l@{}}
   138 \begin{itemize}
   608                        $\bullet$ & $\texttt{pc} + 1$\\
   139 \item Repeatable roman digits are $I$, $X$, $C$ and $M$. The other ones
   609                        $\bullet$ & $\texttt{mp}$ unchanged\\
   140   are non-repeatable. Repeatable digits can be repeated upto 3 times in a
   610                        $\bullet$ & \texttt{mem} updated with \texttt{mp -> \textrm{input}}\\
   141   number (for example $MMM$ is OK); non-repeatable digits cannot be
   611                        \multicolumn{2}{@{}l}{the input is given by \texttt{Console.in.read().toByte}}
   142   repeated at all (for example $VV$ is excluded).
   612                      \end{tabular}\\\hline   
   143   
   613       \hfill\texttt{'['} & \begin{tabular}[t]{@{}l@{\hspace{2mm}}l@{}}
   144 \item If a smaller digits precedes a bigger digit, then $I$ can precede $V$ and $X$; $X$ can preced
   614                        \multicolumn{2}{@{}l}{if \texttt{mem(mp) == 0} then}\\
   145   $L$ and $C$; and $C$ can preced $D$ and $M$. No other combination is permitted in this case.
   615                        $\bullet$ & $\texttt{pc = jumpRight(prog, pc + 1, 0)}$\\
   146 
   616                        $\bullet$ & $\texttt{mp}$ and \texttt{mem} unchanged\medskip\\
   147 \item If a smaller digit precedes a bigger digit (for example $IV$), then the smaller number   
   617                        \multicolumn{2}{@{}l}{otherwise if \texttt{mem(mp) != 0} then}\\
   148   must be either the first digit in the number, or follow a digit which is at least 10 times its value.
   618                        $\bullet$ & $\texttt{pc} + 1$\\
   149   So $VIV$ is excluded, because $I$ follows $V$ and $I * 10$ is bigger than $V$; but $XIV$ is
   619                        $\bullet$ & $\texttt{mp}$ and \texttt{mem} unchanged\\
   150   allowed, because $I$ follows $X$ and $I * 10$ is equal to $X$.
   620                      \end{tabular}
   151 
   621                      \\\hline   
   152 \item Let us say two digits are called a \emph{compound} roman digit
   622       \hfill\texttt{']'} & \begin{tabular}[t]{@{}l@{\hspace{2mm}}l@{}}
   153   when a smaller digit precedes a bigger digit (so $IV$, $XL$, $CM$
   623                        \multicolumn{2}{@{}l}{if \texttt{mem(mp) != 0} then}\\
   154   for example). If a compound digit is followed by another digit, then
   624                        $\bullet$ & $\texttt{pc = jumpLeft(prog, pc - 1, 0)}$\\
   155   this digit must be smaller than the first digit in the compound
   625                        $\bullet$ & $\texttt{mp}$ and \texttt{mem} unchanged\medskip\\
   156   digit. For example $IXI$ is excluded, but $XLI$ is not.
   626                        \multicolumn{2}{@{}l}{otherwise if \texttt{mem(mp) == 0} then}\\
   157 
   627                        $\bullet$ & $\texttt{pc} + 1$\\
   158 \item The empty roman numeral is valid.  
   628                        $\bullet$ & $\texttt{mp}$ and \texttt{mem} unchanged\\
   159 \end{itemize}
   629                      \end{tabular}\\\hline   
   160 
   630       any other char & \begin{tabular}[t]{@{}l@{\hspace{2mm}}l@{}}
   161 \noindent
   631                          $\bullet$ & $\texttt{pc} + 1$\\
   162 The tasks in this part are as follows:
   632                          $\bullet$ & \texttt{mp} and \texttt{mem} unchanged
   163 
   633                        \end{tabular}\\
   164 \begin{itemize}
   634       \hline                 
   165 \item[(6)] Implement a recursive function \texttt{isValidNumeral} that
   635     \end{tabular}
   166   takes a \texttt{RomanNumeral} as argument and produces true if \textbf{all}
   636   \end{center}
   167   the rules above are satisfied, and otherwise false.
   637   \caption{The rules for how commands in the brainf*** language update the program counter \texttt{pc},
   168 
   638     memory pointer \texttt{mp} and memory \texttt{mem}.\label{comms}}
   169   Hint: It might be more convenient to test when the rules fail and then return false;
   639   \end{figure}
   170   return true in all other cases.
   640 \end{itemize}\bigskip  
   171   \mbox{}\hfill[2 Marks]
   641 
   172 
   642 
   173 \item[(7)] Write a recursive function that converts an Integer into a \texttt{RomanNumeral}.
   643 
   174   You can assume the function will only be called for integers between 0 and 3999.\mbox{}\hfill[1 Mark]
       
   175   
       
   176 \item[(8)] Write a function that reads a text file (for example \texttt{roman2.txt})
       
   177   containing valid and invalid roman numerals. Convert all valid roman numerals into
       
   178   integers, add them up and produce the result as a \texttt{RomanNumeral} (using the function
       
   179   from (7)). \hfill[1 Mark]
       
   180 \end{itemize}
       
   181   
       
   182 
   644 
   183 \end{document}
   645 \end{document}
   184 
   646 
   185 
   647 
   186 %%% Local Variables: 
   648 %%% Local Variables: