handouts/ho02.tex
changeset 414 065ca01b62ae
parent 412 1cef3924f7a2
child 415 4ae59fd3b174
equal deleted inserted replaced
413:b7221df9662a 414:065ca01b62ae
    22 match the strings).  To see the substantial differences in the left
    22 match the strings).  To see the substantial differences in the left
    23 and right plots below, note the different scales of the $x$-axes. 
    23 and right plots below, note the different scales of the $x$-axes. 
    24 
    24 
    25 
    25 
    26 \begin{center}
    26 \begin{center}
       
    27 Plots: $a^{?\{n\}} \cdot a^{\{n\}}$ and strings $\underbrace{a\ldots a}_{n}$\\
    27 \begin{tabular}{@{}cc@{}}
    28 \begin{tabular}{@{}cc@{}}
    28 \begin{tikzpicture}
    29 \begin{tikzpicture}
    29 \begin{axis}[
    30 \begin{axis}[
    30     xlabel={\small $a^{?\{n\}} \cdot a^{\{n\}}$ and strings $\underbrace{a\ldots a}_{n}$},
    31     xlabel={$n$},
       
    32     x label style={at={(1.05,0.0)}},
    31     ylabel={\small time in secs},
    33     ylabel={\small time in secs},
    32     enlargelimits=false,
    34     enlargelimits=false,
    33     xtick={0,5,...,30},
    35     xtick={0,5,...,30},
    34     xmax=33,
    36     xmax=33,
    35     ymax=35,
    37     ymax=35,
    48 \end{axis}
    50 \end{axis}
    49 \end{tikzpicture}
    51 \end{tikzpicture}
    50 &
    52 &
    51 \begin{tikzpicture}
    53 \begin{tikzpicture}
    52   \begin{axis}[
    54   \begin{axis}[
    53     xlabel={\small $a^{?\{n\}} \cdot a^{\{n\}}$ and strings $\underbrace{a\ldots a}_{n}$},
    55     xlabel={$n$},
       
    56     x label style={at={(1.1,0.05)}},
    54     ylabel={\small time in secs},
    57     ylabel={\small time in secs},
    55     enlargelimits=false,
    58     enlargelimits=false,
    56     xtick={0,3000,...,12000},
    59     xtick={0,3000,...,12000},
    57     xmax=12500,
    60     xmax=12500,
    58     ymax=35,
    61     ymax=35,
    67 \end{tikzpicture}
    70 \end{tikzpicture}
    68 \end{tabular}
    71 \end{tabular}
    69 \end{center}
    72 \end{center}
    70 
    73 
    71 \begin{center}
    74 \begin{center}
       
    75 Plots: $(a^*)^* \cdot b$ and strings $\underbrace{a\ldots a}_{n}$
    72 \begin{tabular}{@{}cc@{}}
    76 \begin{tabular}{@{}cc@{}}
    73 \begin{tikzpicture}
    77 \begin{tikzpicture}
    74 \begin{axis}[
    78 \begin{axis}[
    75     xlabel={$(a^*)^* \cdot b$ and strings $\underbrace{a\ldots a}_{n}$},
    79     xlabel={$n$},
       
    80     x label style={at={(1.05,0.0)}},
    76     ylabel={time in secs},
    81     ylabel={time in secs},
    77     enlargelimits=false,
    82     enlargelimits=false,
    78     xtick={0,5,...,30},
    83     xtick={0,5,...,30},
    79     xmax=33,
    84     xmax=33,
    80     ymax=35,
    85     ymax=35,
    90 \end{axis}
    95 \end{axis}
    91 \end{tikzpicture}
    96 \end{tikzpicture}
    92 &
    97 &
    93 \begin{tikzpicture}
    98 \begin{tikzpicture}
    94   \begin{axis}[
    99   \begin{axis}[
    95     xlabel={$(a^*)^* \cdot b$ and strings $\underbrace{a\ldots a}_{n}$}, 
   100     xlabel={$n$},
       
   101     x label style={at={(1.1,0.05)}},
    96     ylabel={time in secs},
   102     ylabel={time in secs},
    97     enlargelimits=false,
   103     enlargelimits=false,
    98     xtick={0,3000,...,12000},
   104     xtick={0,3000,...,12000},
    99     xmax=12500,
   105     xmax=12500,
   100     ymax=35,
   106     ymax=35,
   127 we cannot use the function $L$ directly for this, because in general
   133 we cannot use the function $L$ directly for this, because in general
   128 the set of strings $L$ returns is infinite (recall what $L(a^*)$ is).
   134 the set of strings $L$ returns is infinite (recall what $L(a^*)$ is).
   129 In such cases there is no way we can implement an exhaustive test for
   135 In such cases there is no way we can implement an exhaustive test for
   130 whether a string is member of this set or not. In contrast our
   136 whether a string is member of this set or not. In contrast our
   131 matching algorithm will operate on the regular expression $r$ and
   137 matching algorithm will operate on the regular expression $r$ and
   132 string $s$, only, which are both finite objects. Before we explain to
   138 string $s$, only, which are both finite objects. Before we explain 
   133 the matching algorithm, however, let us have a closer look at what it
   139 the matching algorithm, however, let us have a closer look at what it
   134 means when two regular expressions are equivalent.
   140 means when two regular expressions are equivalent.
   135 
   141 
   136 \subsection*{Regular Expression Equivalences}
   142 \subsection*{Regular Expression Equivalences}
   137 
   143 
   291 expression look like that can match just $s$? The definition
   297 expression look like that can match just $s$? The definition
   292 of this function is as follows:
   298 of this function is as follows:
   293 
   299 
   294 \begin{center}
   300 \begin{center}
   295 \begin{tabular}{l@ {\hspace{2mm}}c@ {\hspace{2mm}}l}
   301 \begin{tabular}{l@ {\hspace{2mm}}c@ {\hspace{2mm}}l}
   296   $der\, c\, (\ZERO)$      & $\dn$ & $\ZERO$\\
   302   $\textit{der}\, c\, (\ZERO)$      & $\dn$ & $\ZERO$\\
   297   $der\, c\, (\ONE)$         & $\dn$ & $\ZERO$ \\
   303   $\textit{der}\, c\, (\ONE)$         & $\dn$ & $\ZERO$ \\
   298   $der\, c\, (d)$                & $\dn$ & if $c = d$ then $\ONE$ else $\ZERO$\\
   304   $\textit{der}\, c\, (d)$                & $\dn$ & if $c = d$ then $\ONE$ else $\ZERO$\\
   299   $der\, c\, (r_1 + r_2)$        & $\dn$ & $der\, c\, r_1 + der\, c\, r_2$\\
   305   $\textit{der}\, c\, (r_1 + r_2)$        & $\dn$ & $\textit{der}\, c\, r_1 + \textit{der}\, c\, r_2$\\
   300   $der\, c\, (r_1 \cdot r_2)$  & $\dn$  & if $\textit{nullable} (r_1)$\\
   306   $\textit{der}\, c\, (r_1 \cdot r_2)$  & $\dn$  & if $\textit{nullable} (r_1)$\\
   301   & & then $(der\,c\,r_1) \cdot r_2 + der\, c\, r_2$\\ 
   307   & & then $(\textit{der}\,c\,r_1) \cdot r_2 + \textit{der}\, c\, r_2$\\ 
   302   & & else $(der\, c\, r_1) \cdot r_2$\\
   308   & & else $(\textit{der}\, c\, r_1) \cdot r_2$\\
   303   $der\, c\, (r^*)$          & $\dn$ & $(der\,c\,r) \cdot (r^*)$
   309   $\textit{der}\, c\, (r^*)$          & $\dn$ & $(\textit{der}\,c\,r) \cdot (r^*)$
   304   \end{tabular}
   310   \end{tabular}
   305 \end{center}
   311 \end{center}
   306 
   312 
   307 \noindent The first two clauses can be rationalised as
   313 \noindent The first two clauses can be rationalised as
   308 follows: recall that $der$ should calculate a regular
   314 follows: recall that $\textit{der}$ should calculate a regular
   309 expression so that given the ``input'' regular expression can
   315 expression so that given the ``input'' regular expression can
   310 match a string of the form $c\!::\!s$, we want a regular
   316 match a string of the form $c\!::\!s$, we want a regular
   311 expression for $s$. Since neither $\ZERO$ nor $\ONE$
   317 expression for $s$. Since neither $\ZERO$ nor $\ONE$
   312 can match a string of the form $c\!::\!s$, we return
   318 can match a string of the form $c\!::\!s$, we return
   313 $\ZERO$. In the third case we have to make a
   319 $\ZERO$. In the third case we have to make a
   318 return $\ZERO$ since no string of the $c\!::\!s$ can be
   324 return $\ZERO$ since no string of the $c\!::\!s$ can be
   319 matched. Next come the recursive cases, which are a bit more
   325 matched. Next come the recursive cases, which are a bit more
   320 involved. Fortunately, the $+$-case is still relatively
   326 involved. Fortunately, the $+$-case is still relatively
   321 straightforward: all strings of the form $c\!::\!s$ are either
   327 straightforward: all strings of the form $c\!::\!s$ are either
   322 matched by the regular expression $r_1$ or $r_2$. So we just
   328 matched by the regular expression $r_1$ or $r_2$. So we just
   323 have to recursively call $der$ with these two regular
   329 have to recursively call $\textit{der}$ with these two regular
   324 expressions and compose the results again with $+$. Makes
   330 expressions and compose the results again with $+$. Makes
   325 sense? 
   331 sense? 
   326 
   332 
   327 The $\cdot$-case is more complicated: if $r_1\cdot r_2$
   333 The $\cdot$-case is more complicated: if $r_1\cdot r_2$
   328 matches a string of the form $c\!::\!s$, then the first part
   334 matches a string of the form $c\!::\!s$, then the first part
   329 must be matched by $r_1$. Consequently, it makes sense to
   335 must be matched by $r_1$. Consequently, it makes sense to
   330 construct the regular expression for $s$ by calling $der$ with
   336 construct the regular expression for $s$ by calling $\textit{der}$ with
   331 $r_1$ and ``appending'' $r_2$. There is however one exception
   337 $r_1$ and ``appending'' $r_2$. There is however one exception
   332 to this simple rule: if $r_1$ can match the empty string, then
   338 to this simple rule: if $r_1$ can match the empty string, then
   333 all of $c\!::\!s$ is matched by $r_2$. So in case $r_1$ is
   339 all of $c\!::\!s$ is matched by $r_2$. So in case $r_1$ is
   334 nullable (that is can match the empty string) we have to allow
   340 nullable (that is can match the empty string) we have to allow
   335 the choice $der\,c\,r_2$ for calculating the regular
   341 the choice $\textit{der}\,c\,r_2$ for calculating the regular
   336 expression that can match $s$. Therefore we have to add the
   342 expression that can match $s$. Therefore we have to add the
   337 regular expression $der\,c\,r_2$ in the result. The $*$-case
   343 regular expression $\textit{der}\,c\,r_2$ in the result. The $*$-case
   338 is again simple: if $r^*$ matches a string of the form
   344 is again simple: if $r^*$ matches a string of the form
   339 $c\!::\!s$, then the first part must be ``matched'' by a
   345 $c\!::\!s$, then the first part must be ``matched'' by a
   340 single copy of $r$. Therefore we call recursively $der\,c\,r$
   346 single copy of $r$. Therefore we call recursively $\textit{der}\,c\,r$
   341 and ``append'' $r^*$ in order to match the rest of $s$.
   347 and ``append'' $r^*$ in order to match the rest of $s$. Still
   342 
   348 makes sense?
   343 If this did not make sense yet, here is another way to rationalise
   349 
   344 the definition of $der$ by considering the following operation
   350 If all this did not make sense yet, here is another way to rationalise
       
   351 the definition of $\textit{der}$ by considering the following operation
   345 on sets:
   352 on sets:
   346 
   353 
   347 \begin{equation}\label{Der}
   354 \begin{equation}\label{Der}
   348 Der\,c\,A\;\dn\;\{s\,|\,c\!::\!s \in A\}
   355 \textit{Der}\,c\,A\;\dn\;\{s\,|\,c\!::\!s \in A\}
   349 \end{equation}
   356 \end{equation}
   350 
   357 
   351 \noindent This operation essentially transforms a set of
   358 \noindent This operation essentially transforms a set of
   352 strings $A$ by filtering out all strings that do not start
   359 strings $A$ by filtering out all strings that do not start
   353 with $c$ and then strips off the $c$ from all the remaining
   360 with $c$ and then strips off the $c$ from all the remaining
   354 strings. For example suppose $A = \{f\!oo, bar, f\!rak\}$ then
   361 strings. For example suppose $A = \{f\!oo, bar, f\!rak\}$ then
   355 
   362 
   356 \[ Der\,f\,A = \{oo, rak\}\quad,\quad 
   363 \[ \textit{Der}\,f\,A = \{oo, rak\}\quad,\quad 
   357    Der\,b\,A = \{ar\} \quad \text{and} \quad 
   364    \textit{Der}\,b\,A = \{ar\} \quad \text{and} \quad 
   358    Der\,a\,A = \{\} 
   365    \textit{Der}\,a\,A = \{\} 
   359 \]
   366 \]
   360  
   367  
   361 \noindent
   368 \noindent
   362 Note that in the last case $Der$ is empty, because no string in $A$
   369 Note that in the last case $\textit{Der}$ is empty, because no string in $A$
   363 starts with $a$. With this operation we can state the following
   370 starts with $a$. With this operation we can state the following
   364 property about $der$:
   371 property about $\textit{der}$:
   365 
   372 
   366 \[
   373 \[
   367 L(der\,c\,r) = Der\,c\,(L(r))
   374 L(\textit{der}\,c\,r) = \textit{Der}\,c\,(L(r))
   368 \]
   375 \]
   369 
   376 
   370 \noindent
   377 \noindent
   371 This property clarifies what regular expression $der$ calculates,
   378 This property clarifies what regular expression $\textit{der}$ calculates,
   372 namely take the set of strings that $r$ can match (that is $L(r)$),
   379 namely take the set of strings that $r$ can match (that is $L(r)$),
   373 filter out all strings not starting with $c$ and strip off the $c$
   380 filter out all strings not starting with $c$ and strip off the $c$
   374 from the remaining strings---this is exactly the language that
   381 from the remaining strings---this is exactly the language that
   375 $der\,c\,r$ can match.
   382 $\textit{der}\,c\,r$ can match.
   376 
   383 
   377 If we want to find out whether the string $abc$ is matched by
   384 If we want to find out whether the string $abc$ is matched by
   378 the regular expression $r_1$ then we can iteratively apply $der$
   385 the regular expression $r_1$ then we can iteratively apply $\textit{der}$
   379 as follows
   386 as follows
   380 
   387 
   381 \begin{center}
   388 \begin{center}
   382 \begin{tabular}{rll}
   389 \begin{tabular}{rll}
   383 Input: $r_1$, $abc$\medskip\\
   390 Input: $r_1$, $abc$\medskip\\
   384 Step 1: & build derivative of $a$ and $r_1$ & $(r_2 = der\,a\,r_1)$\smallskip\\
   391 Step 1: & build derivative of $a$ and $r_1$ & $(r_2 = \textit{der}\,a\,r_1)$\smallskip\\
   385 Step 2: & build derivative of $b$ and $r_2$ & $(r_3 = der\,b\,r_2)$\smallskip\\
   392 Step 2: & build derivative of $b$ and $r_2$ & $(r_3 = \textit{der}\,b\,r_2)$\smallskip\\
   386 Step 3: & build derivative of $c$ and $r_3$ & $(r_4 = der\,b\,r_3)$\smallskip\\
   393 Step 3: & build derivative of $c$ and $r_3$ & $(r_4 = \textit{der}\,b\,r_3)$\smallskip\\
   387 Step 4: & the string is exhausted; test & ($\textit{nullable}(r_4)$)\\
   394 Step 4: & the string is exhausted; test & ($\textit{nullable}(r_4)$)\\
   388         & whether $r_4$ can recognise the\\
   395         & whether $r_4$ can recognise the\\
   389         & empty string\smallskip\\
   396         & empty string\smallskip\\
   390 Output: & result of this test $\Rightarrow \textit{true} \,\text{or}\, \textit{false}$\\        
   397 Output: & result of this test $\Rightarrow \textit{true} \,\text{or}\, \textit{false}$\\        
   391 \end{tabular}
   398 \end{tabular}
   392 \end{center}
   399 \end{center}
   393 
   400 
   394 \noindent Again the operation $Der$ might help to rationalise
   401 \noindent Again the operation $\textit{Der}$ might help to rationalise
   395 this algorithm. We want to know whether $abc \in L(r_1)$. We
   402 this algorithm. We want to know whether $abc \in L(r_1)$. We
   396 do not know yet---but let us assume it is. Then $Der\,a\,L(r_1)$
   403 do not know yet---but let us assume it is. Then $\textit{Der}\,a\,L(r_1)$
   397 builds the set where all the strings not starting with $a$ are
   404 builds the set where all the strings not starting with $a$ are
   398 filtered out. Of the remaining strings, the $a$ is stripped
   405 filtered out. Of the remaining strings, the $a$ is stripped
   399 off. So we should still have $bc$ in the set.
   406 off. So we should still have $bc$ in the set.
   400 Then we continue with filtering out all strings not
   407 Then we continue with filtering out all strings not
   401 starting with $b$ and stripping off the $b$ from the remaining
   408 starting with $b$ and stripping off the $b$ from the remaining
   402 strings, that means we build $Der\,b\,(Der\,a\,(L(r_1)))$.
   409 strings, that means we build $\textit{Der}\,b\,(\textit{Der}\,a\,(L(r_1)))$.
   403 Finally we filter out all strings not starting with $c$ and
   410 Finally we filter out all strings not starting with $c$ and
   404 strip off $c$ from the remaining string. This is
   411 strip off $c$ from the remaining string. This is
   405 $Der\,c\,(Der\,b\,(Der\,a\,(L(r))))$. Now if $abc$ was in the 
   412 $\textit{Der}\,c\,(\textit{Der}\,b\,(\textit{Der}\,a\,(L(r_1))))$. Now if $abc$ was in the 
   406 original set ($L(r_1)$), then in $Der\,c\,(Der\,b\,(Der\,a\,(L(r))))$ 
   413 original set ($L(r_1)$), then $\textit{Der}\,c\,(\textit{Der}\,b\,(\textit{Der}\,a\,(L(r_1))))$ 
   407 must contain the empty string. If not, then $abc$ was not in the 
   414 must contain the empty string. If not, then $abc$ was not in the 
   408 language we started with.
   415 language we started with.
   409 
   416 
   410 Our matching algorithm using $der$ and $\textit{nullable}$ works
   417 Our matching algorithm using $\textit{der}$ and $\textit{nullable}$ works
   411 similarly, just using regular expression instead of sets. For
   418 similarly, just using regular expression instead of sets. In order to
   412 this we need to extend the notion of derivatives from single
   419 define our algorithm we need to extend the notion of derivatives from single
   413 characters to strings. This can be done using the following
   420 characters to strings. This can be done using the following
   414 function, taking a string and regular expression as input and
   421 function, taking a string and a regular expression as input and
   415 a regular expression as output.
   422 a regular expression as output.
   416 
   423 
   417 \begin{center}
   424 \begin{center}
   418 \begin{tabular}{@ {}l@ {\hspace{2mm}}c@ {\hspace{2mm}}l@ {\hspace{-10mm}}l@ {}}
   425 \begin{tabular}{@ {}l@ {\hspace{2mm}}c@ {\hspace{2mm}}l@ {\hspace{-10mm}}l@ {}}
   419   $\textit{ders}\, []\, r$     & $\dn$ & $r$ & \\
   426   $\textit{ders}\, []\, r$     & $\dn$ & $r$ & \\
   420   $\textit{ders}\, (c\!::\!s)\, r$ & $\dn$ & $\textit{ders}\,s\,(der\,c\,r)$ & \\
   427   $\textit{ders}\, (c\!::\!s)\, r$ & $\dn$ & $\textit{ders}\,s\,(\textit{der}\,c\,r)$ & \\
   421   \end{tabular}
   428   \end{tabular}
   422 \end{center}
   429 \end{center}
   423 
   430 
   424 \noindent This function iterates $der$ taking one character at
   431 \noindent This function iterates $\textit{der}$ taking one character at
   425 the time from the original string until it is exhausted.
   432 the time from the original string until it is exhausted.
   426 Having $ders$ in place, we can finally define our matching
   433 Having $\textit{der}s$ in place, we can finally define our matching
   427 algorithm:
   434 algorithm:
   428 
   435 
   429 \[
   436 \[
   430 matches\,s\,r \dn \textit{nullable}(ders\,s\,r)
   437 \textit{matches}\,s\,r \dn \textit{nullable}(\textit{ders}\,s\,r)
   431 \]
   438 \]
   432 
   439 
   433 \noindent
   440 \noindent
   434 and we can claim that
   441 and we can claim that
   435 
   442 
   436 \[
   443 \[
   437 matches\,s\,r\quad\text{if and only if}\quad s\in L(r)
   444 \textit{matches}\,s\,r\quad\text{if and only if}\quad s\in L(r)
   438 \]
   445 \]
   439 
   446 
   440 \noindent holds, which means our algorithm satisfies the
   447 \noindent holds, which means our algorithm satisfies the
   441 specification. Of course we can claim many things\ldots
   448 specification. Of course we can claim many things\ldots
   442 whether the claim holds any water is a different question,
   449 whether the claim holds any water is a different question,
   443 which for example is the point of the Strand-2 Coursework.
   450 which for example is the point of the Strand-2 Coursework.
   444 
   451 
   445 This algorithm was introduced by Janus Brzozowski in 1964. Its
   452 This algorithm was introduced by Janus Brzozowski in 1964, but 
       
   453 is more widely known only in the last 10 or so years. Its
   446 main attractions are simplicity and being fast, as well as
   454 main attractions are simplicity and being fast, as well as
   447 being easily extendable for other regular expressions such as
   455 being easily extendable for other regular expressions such as
   448 $r^{\{n\}}$, $r^?$, $\sim{}r$ and so on (this is subject of
   456 $r^{\{n\}}$, $r^?$, $\sim{}r$ and so on (this is subject of
   449 Strand-1 Coursework 1).
   457 Strand-1 Coursework 1). 
   450 
   458 
   451 \subsection*{The Matching Algorithm in Scala}
   459 \subsection*{The Matching Algorithm in Scala}
   452 
   460 
   453 Another attraction of the algorithm is that it can be easily
   461 Another attraction of the algorithm is that it can be easily
   454 implemented in a functional programming language, like Scala.
   462 implemented in a functional programming language, like Scala.
   463   implement in functional languages, because their built-in pattern 
   471   implement in functional languages, because their built-in pattern 
   464   matching and recursion allow us to mimic the mathematical
   472   matching and recursion allow us to mimic the mathematical
   465   definitions very closely.\label{scala1}}
   473   definitions very closely.\label{scala1}}
   466 \end{figure}
   474 \end{figure}
   467 
   475 
   468 For running the algorithm with our favourite example, the evil
   476 
       
   477 Remember our second example involving the regular expression
       
   478 $(a^*)^* \cdot b$ which could not match strings of $n$ \texttt{a}s. 
       
   479 Java needed around 30 seconds to find this out a string with $n=28$.
       
   480 It seems our algorithm is doing rather well in comparison:
       
   481 
       
   482 \begin{center}
       
   483 \begin{tikzpicture}
       
   484 \begin{axis}[
       
   485     title={Plot: $(a^*)^* \cdot b$ and strings $\underbrace{a\ldots a}_{n}$},
       
   486     xlabel={$n$},
       
   487     x label style={at={(1.05,0.0)}},
       
   488     ylabel={time in secs},
       
   489     enlargelimits=false,
       
   490     xtick={0,500,...,4000},
       
   491     xmax=4100,
       
   492     ytick={0,5,...,30},
       
   493     scaled ticks=false,
       
   494     axis lines=left,
       
   495     width=10cm,
       
   496     height=4.5cm, 
       
   497     legend entries={Java,Scala V1},  
       
   498     legend pos=north east,
       
   499     legend cell align=left]
       
   500 \addplot[blue,mark=*, mark options={fill=white}] 
       
   501   table {re-java.data};
       
   502 \addplot[red,mark=triangle*,mark options={fill=white}]
       
   503   table {re2c.data};
       
   504 \end{axis}
       
   505 \end{tikzpicture}
       
   506 \end{center}
       
   507 
       
   508 \noindent
       
   509 This is no error: it hardly takes more than half a second for
       
   510 strings up to the length of 4000. After that we receive a
       
   511 StackOverflow exception, but still\ldots
       
   512 
       
   513 For running the algorithm with our first example, the evil
   469 regular expression $a^?{}^{\{n\}}a^{\{n\}}$, we need to implement
   514 regular expression $a^?{}^{\{n\}}a^{\{n\}}$, we need to implement
   470 the optional regular expression and the exactly $n$-times
   515 the optional regular expression and the exactly $n$-times
   471 regular expression. This can be done with the translations
   516 regular expression. This can be done with the translations
   472 
   517 
   473 \lstinputlisting[numbers=none]{../progs/app51.scala}
   518 \lstinputlisting[numbers=none]{../progs/app51.scala}
   474 
   519 
   475 \noindent Running the matcher with the example, we find it is
   520 \noindent Running the matcher with this example, we find it is
   476 slightly worse then the matcher in Ruby and Python.
   521 slightly worse then the matcher in Ruby and Python.
   477 Ooops\ldots
   522 Ooops\ldots
   478 
   523 
   479 \begin{center}
   524 \begin{center}
   480 \begin{tikzpicture}
   525 \begin{tikzpicture}
   481 \begin{axis}[
   526 \begin{axis}[    
   482     xlabel={\pcode{a}s},
   527     title={Plot: $a^{?\{n\}} \cdot a^{\{n\}}$ and strings $\underbrace{a\ldots a}_{n}$},
       
   528     xlabel={$n$},
       
   529     x label style={at={(1.05,0.0)}},
   483     ylabel={time in secs},
   530     ylabel={time in secs},
   484     enlargelimits=false,
   531     enlargelimits=false,
   485     xtick={0,5,...,30},
   532     xtick={0,5,...,30},
   486     xmax=30,
   533     xmax=30,
   487     ytick={0,5,...,30},
   534     ytick={0,5,...,30},
   530 \end{center}
   577 \end{center}
   531 
   578 
   532 \noindent With this fix we have a constant ``size'' regular
   579 \noindent With this fix we have a constant ``size'' regular
   533 expression for our running example no matter how large $n$ is.
   580 expression for our running example no matter how large $n$ is.
   534 This means we have to also add cases for \pcode{NTIMES} in the
   581 This means we have to also add cases for \pcode{NTIMES} in the
   535 functions $\textit{nullable}$ and $der$. Does the change have any
   582 functions $\textit{nullable}$ and $\textit{der}$. Does the change have any
   536 effect?
   583 effect?
   537 
   584 
   538 \begin{center}
   585 \begin{center}
   539 \begin{tikzpicture}
   586 \begin{tikzpicture}
   540 \begin{axis}[
   587 \begin{axis}[
   541     xlabel={\pcode{a}s},
   588     title={Plot: $a^{?\{n\}} \cdot a^{\{n\}}$ and strings $\underbrace{a\ldots a}_{n}$},
       
   589     xlabel={$n$},
       
   590     x label style={at={(1.01,0.0)}},
   542     ylabel={time in secs},
   591     ylabel={time in secs},
   543     enlargelimits=false,
   592     enlargelimits=false,
   544     xtick={0,100,...,1000},
   593     xtick={0,100,...,1000},
   545     xmax=1000,
   594     xmax=1100,
   546     ytick={0,5,...,30},
   595     ytick={0,5,...,30},
   547     scaled ticks=false,
   596     scaled ticks=false,
   548     axis lines=left,
   597     axis lines=left,
   549     width=9.5cm,
   598     width=10cm,
   550     height=5cm, 
   599     height=5cm, 
   551     legend entries={Python,Ruby,Scala V1,Scala V2},  
   600     legend entries={Python,Ruby,Scala V1,Scala V2},  
   552     legend pos=outer north east,
   601     legend pos=outer north east,
   553     legend cell align=left  
   602     legend cell align=left]
   554 ]
       
   555 \addplot[blue,mark=*, mark options={fill=white}] 
   603 \addplot[blue,mark=*, mark options={fill=white}] 
   556   table {re-python.data};
   604   table {re-python.data};
   557 \addplot[brown,mark=pentagon*, mark options={fill=white}] 
   605 \addplot[brown,mark=pentagon*, mark options={fill=white}] 
   558   table {re-ruby.data};  
   606   table {re-ruby.data};  
   559 \addplot[red,mark=triangle*,mark options={fill=white}] 
   607 \addplot[red,mark=triangle*,mark options={fill=white}] 
   564 \end{tikzpicture}
   612 \end{tikzpicture}
   565 \end{center}
   613 \end{center}
   566 
   614 
   567 \noindent Now we are talking business! The modified matcher 
   615 \noindent Now we are talking business! The modified matcher 
   568 can within 30 seconds handle regular expressions up to
   616 can within 30 seconds handle regular expressions up to
   569 $n = 950$ before a StackOverflow is raised. Python and Ruby 
   617 $n = 950$ before a StackOverflow is raised. Recall that Python and Ruby 
   570 (and our first version) could only handle $n = 27$ or so in 30 
   618 (and our first version, Scala V1) could only handle $n = 27$ or so in 30 
   571 second.
   619 seconds. There is no change for our second example 
   572 
   620 $(a^*)^* \cdot b$---so this is still good.
   573 SECOND EXAMPLE
   621 
   574 
   622 
   575 The moral is that our algorithm is rather sensitive to the
   623 The moral is that our algorithm is rather sensitive to the
   576 size of regular expressions it needs to handle. This is of
   624 size of regular expressions it needs to handle. This is of
   577 course obvious because both $\textit{nullable}$ and $der$ frequently
   625 course obvious because both $\textit{nullable}$ and $\textit{der}$ frequently
   578 need to traverse the whole regular expression. There seems,
   626 need to traverse the whole regular expression. There seems,
   579 however, one more issue for making the algorithm run faster.
   627 however, one more issue for making the algorithm run faster.
   580 The derivative function often produces ``useless''
   628 The derivative function often produces ``useless''
   581 $\ZERO$s and $\ONE$s. To see this, consider $r = ((a
   629 $\ZERO$s and $\ONE$s. To see this, consider $r = ((a
   582 \cdot b) + b)^*$ and the following two derivatives
   630 \cdot b) + b)^*$ and the following two derivatives
   583 
   631 
   584 \begin{center}
   632 \begin{center}
   585 \begin{tabular}{l}
   633 \begin{tabular}{l}
   586 $der\,a\,r = ((\ONE \cdot b) + \ZERO) \cdot r$\\
   634 $\textit{der}\,a\,r = ((\ONE \cdot b) + \ZERO) \cdot r$\\
   587 $der\,b\,r = ((\ZERO \cdot b) + \ONE)\cdot r$\\
   635 $\textit{der}\,b\,r = ((\ZERO \cdot b) + \ONE)\cdot r$\\
   588 $der\,c\,r = ((\ZERO \cdot b) + \ZERO)\cdot r$
   636 $\textit{der}\,c\,r = ((\ZERO \cdot b) + \ZERO)\cdot r$
   589 \end{tabular}
   637 \end{tabular}
   590 \end{center}
   638 \end{center}
   591 
   639 
   592 \noindent 
   640 \noindent 
   593 If we simplify them according to the simple rules from the
   641 If we simplify them according to the simple rules from the
   594 beginning, we can replace the right-hand sides by the 
   642 beginning, we can replace the right-hand sides by the 
   595 smaller equivalent regular expressions
   643 smaller equivalent regular expressions
   596 
   644 
   597 \begin{center}
   645 \begin{center}
   598 \begin{tabular}{l}
   646 \begin{tabular}{l}
   599 $der\,a\,r \equiv b \cdot r$\\
   647 $\textit{der}\,a\,r \equiv b \cdot r$\\
   600 $der\,b\,r \equiv r$\\
   648 $\textit{der}\,b\,r \equiv r$\\
   601 $der\,c\,r \equiv \ZERO$
   649 $\textit{der}\,c\,r \equiv \ZERO$
   602 \end{tabular}
   650 \end{tabular}
   603 \end{center}
   651 \end{center}
   604 
   652 
   605 \noindent I leave it to you to contemplate whether such a
   653 \noindent I leave it to you to contemplate whether such a
   606 simplification can have any impact on the correctness of our
   654 simplification can have any impact on the correctness of our
   626 Line~\ref{simpline}.\label{scala2}}
   674 Line~\ref{simpline}.\label{scala2}}
   627 \end{figure}
   675 \end{figure}
   628 
   676 
   629 \begin{center}
   677 \begin{center}
   630 \begin{tikzpicture}
   678 \begin{tikzpicture}
   631 \begin{axis}[xlabel={\pcode{a}s},ylabel={time in secs},
   679 \begin{axis}[
       
   680     title={Plot: $a^{?\{n\}} \cdot a^{\{n\}}$ and strings $\underbrace{a\ldots a}_{n}$},
       
   681     xlabel={$n$},
       
   682     x label style={at={(1.04,0.0)}},
       
   683     ylabel={time in secs},
   632     enlargelimits=false,
   684     enlargelimits=false,
   633     xtick={0,2000,...,12000},
   685     xtick={0,2000,...,12000},
   634     xmax=12000,
   686     xmax=13000,
   635     ytick={0,5,...,30},
   687     ytick={0,5,...,30},
   636     scaled ticks=false,
   688     scaled ticks=false,
   637     axis lines=left,
   689     axis lines=left,
   638     width=9cm,
   690     width=9cm,
   639     height=5cm, 
   691     height=5cm, 
   642 \addplot[black,mark=square*,mark options={fill=white}] table {re3.data};
   694 \addplot[black,mark=square*,mark options={fill=white}] table {re3.data};
   643 \end{axis}
   695 \end{axis}
   644 \end{tikzpicture}
   696 \end{tikzpicture}
   645 \end{center}
   697 \end{center}
   646 
   698 
   647 SECOND EXAMPLE
   699 \begin{center}
       
   700 \begin{tikzpicture}
       
   701 \begin{axis}[
       
   702     title={Plot: $(a^*)^* \cdot b$ and strings $\underbrace{a\ldots a}_{n}$},
       
   703     xlabel={$n$},
       
   704     x label style={at={(1.09,0.0)}},
       
   705     ylabel={time in secs},
       
   706     enlargelimits=false,
       
   707     %xtick={0,1e+6,...,6e+6},
       
   708     xmax=6200000,
       
   709     ytick={0,5,...,30},
       
   710     ymax=33,
       
   711     scaled ticks=false,
       
   712     axis lines=left,
       
   713     width=9cm,
       
   714     height=5cm, 
       
   715     legend entries={Scala V3}]
       
   716 \addplot[black,mark=square*,mark options={fill=white}] table {re3a.data};
       
   717 \end{axis}
       
   718 \end{tikzpicture}
       
   719 \end{center}
       
   720 
       
   721 \section{Epilogue}
       
   722 
       
   723 TBD
       
   724 
   648 
   725 
   649 \section*{Proofs}
   726 \section*{Proofs}
   650 
   727 
   651 You might not like doing proofs. But they serve a very
   728 You might not like doing proofs. But they serve a very
   652 important purpose in Computer Science: How can we be sure that
   729 important purpose in Computer Science: How can we be sure that
   784 
   861 
   785 \noindent
   862 \noindent
   786 Given this recipe, I let you show
   863 Given this recipe, I let you show
   787 
   864 
   788 \begin{equation}
   865 \begin{equation}
   789 Ders\,s\,(L(r)) = L(ders\,s\,r)
   866 \textit{Ders}\,s\,(L(r)) = L(\textit{ders}\,s\,r)
   790 \label{dersprop}
   867 \label{dersprop}
   791 \end{equation}
   868 \end{equation}
   792 
   869 
   793 \noindent by induction on $s$. Recall $Der$ is defined for 
   870 \noindent by induction on $s$. Recall $\textit{Der}$ is defined for 
   794 character---see \eqref{Der}; $Ders$ is similar, but for strings:
   871 character---see \eqref{Der}; $\textit{Ders}$ is similar, but for strings:
   795 
   872 
   796 \[
   873 \[
   797 Ders\,s\,A\;\dn\;\{s'\,|\,s @ s' \in A\}
   874 \textit{Ders}\,s\,A\;\dn\;\{s'\,|\,s @ s' \in A\}
   798 \]
   875 \]
   799 
   876 
   800 \noindent In this proof you can assume the following property
   877 \noindent In this proof you can assume the following property
   801 for $der$ and $Der$ has already been proved, that is you can
   878 for $der$ and $\textit{Der}$ has already been proved, that is you can
   802 assume
   879 assume
   803 
   880 
   804 \[
   881 \[
   805 L(der\,c\,r) = Der\,c\,(L(r))
   882 L(\textit{der}\,c\,r) = \textit{Der}\,c\,(L(r))
   806 \]
   883 \]
   807 
   884 
   808 \noindent holds (this would be of course a property that
   885 \noindent holds (this would be of course a property that
   809 needs to be proved in a side-lemma by induction on $r$).
   886 needs to be proved in a side-lemma by induction on $r$).
   810 
   887 
   819 \noindent That is the problem we want to solve. Thinking a 
   896 \noindent That is the problem we want to solve. Thinking a 
   820 little, you will see that this problem is equivalent to the 
   897 little, you will see that this problem is equivalent to the 
   821 following problem
   898 following problem
   822 
   899 
   823 \begin{equation}
   900 \begin{equation}
   824 [] \in Ders\,s\,(L(r))
   901 [] \in \textit{Ders}\,s\,(L(r))
   825 \label{dersstep}
   902 \label{dersstep}
   826 \end{equation}
   903 \end{equation}
   827 
   904 
   828 \noindent But we have shown above in \eqref{dersprop}, that
   905 \noindent But we have shown above in \eqref{dersprop}, that
   829 the $Ders$ can be replaced by $L(ders\ldots)$. That means 
   906 the $\textit{Ders}$ can be replaced by $L(\textit{ders}\ldots)$. That means 
   830 \eqref{dersstep} is equivalent to 
   907 \eqref{dersstep} is equivalent to 
   831 
   908 
   832 \begin{equation}
   909 \begin{equation}
   833 [] \in L(ders\,s\,r)
   910 [] \in L(\textit{ders}\,s\,r)
   834 \label{prefinalstep}
   911 \label{prefinalstep}
   835 \end{equation}
   912 \end{equation}
   836 
   913 
   837 \noindent We have also shown that testing whether the empty
   914 \noindent We have also shown that testing whether the empty
   838 string is in a language is equivalent to the $\textit{nullable}$
   915 string is in a language is equivalent to the $\textit{nullable}$
   839 function; see \eqref{nullableprop}. That means
   916 function; see \eqref{nullableprop}. That means
   840 \eqref{prefinalstep} is equivalent with
   917 \eqref{prefinalstep} is equivalent with
   841  
   918  
   842 \[
   919 \[
   843 \textit{nullable}(ders\,s\,r)
   920 \textit{nullable}(\textit{ders}\,s\,r)
   844 \] 
   921 \] 
   845 
   922 
   846 \noindent But this is just the definition of $matches$
   923 \noindent But this is just the definition of $matches$
   847 
   924 
   848 \[
   925 \[
   849 matches\,s\,r \dn nullable(ders\,s\,r)
   926 matches\,s\,r \dn nullable(\textit{ders}\,s\,r)
   850 \]
   927 \]
   851 
   928 
   852 \noindent In effect we have shown
   929 \noindent In effect we have shown
   853 
   930 
   854 \[
   931 \[