handouts/ho02.tex
changeset 504 5dc452d7c08e
parent 492 39b7ff2cf1bc
child 510 25580bf89ac0
equal deleted inserted replaced
503:f2d7b885b3e3 504:5dc452d7c08e
    10 
    10 
    11 
    11 
    12 \section*{Handout 2 (Regular Expression Matching)}
    12 \section*{Handout 2 (Regular Expression Matching)}
    13 
    13 
    14 This lecture is about implementing a more efficient regular expression
    14 This lecture is about implementing a more efficient regular expression
    15 matcher (the plots on the right)---more efficient than the matchers
    15 matcher (the plots on the right below)---more efficient than the
    16 from regular expression libraries in Ruby, Python and Java (the plots
    16 matchers from regular expression libraries in Ruby, Python and Java
    17 on the left). The first pair of plots show the running time for the
    17 (the plots on the left). The first pair of plots shows the running time
    18 regular expressions $a^?{}^{\{n\}}\cdot a^{\{n\}}$ and strings composed
    18 for the regular expression $(a^*)^*\cdot b$ and strings composed of
    19 of $n$ \pcode{a}s. The second pair of plots show the running time
    19 $n$ \pcode{a}s (meaning this regular expression actually does not
    20 for the regular expression $(a^*)^*\cdot b$ and also strings composed
    20 match the strings). The second pair of plots shows the running time for
    21 of $n$ \pcode{a}s (meaning this regular expression actually does not
    21 the regular expressions $a^?{}^{\{n\}}\cdot a^{\{n\}}$ and strings
       
    22 also composed of $n$ \pcode{a}s (this time the regular expressions
    22 match the strings).  To see the substantial differences in the left
    23 match the strings).  To see the substantial differences in the left
    23 and right plots below, note the different scales of the $x$-axes. 
    24 and right plots below, note the different scales of the $x$-axes.
    24 
    25 
       
    26 
       
    27 \begin{center}
       
    28 Graphs: $(a^*)^* \cdot b$ and strings $\underbrace{a\ldots a}_{n}$
       
    29 \begin{tabular}{@{}cc@{}}
       
    30 \begin{tikzpicture}
       
    31 \begin{axis}[
       
    32     xlabel={$n$},
       
    33     x label style={at={(1.05,0.0)}},
       
    34     ylabel={time in secs},
       
    35     enlargelimits=false,
       
    36     xtick={0,5,...,30},
       
    37     xmax=33,
       
    38     ymax=35,
       
    39     ytick={0,5,...,30},
       
    40     scaled ticks=false,
       
    41     axis lines=left,
       
    42     width=5cm,
       
    43     height=5cm, 
       
    44     legend entries={Java, Python},  
       
    45     legend pos=north west,
       
    46     legend cell align=left]
       
    47 \addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
       
    48 \addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
       
    49 \end{axis}
       
    50 \end{tikzpicture}
       
    51 &
       
    52 \begin{tikzpicture}
       
    53   \begin{axis}[
       
    54     xlabel={$n$},
       
    55     x label style={at={(1.1,0.0)}},
       
    56     %%xtick={0,1000000,...,5000000}, 
       
    57     ylabel={time in secs},
       
    58     enlargelimits=false,
       
    59     ymax=35,
       
    60     ytick={0,5,...,30},
       
    61     axis lines=left,
       
    62     %scaled ticks=false,
       
    63     width=6.5cm,
       
    64     height=5cm,
       
    65     legend entries={Our matcher},  
       
    66     legend pos=north east,
       
    67     legend cell align=left]
       
    68 %\addplot[green,mark=square*,mark options={fill=white}] table {re2a.data};    
       
    69 \addplot[black,mark=square*,mark options={fill=white}] table {re3a.data};
       
    70 \end{axis}
       
    71 \end{tikzpicture}
       
    72 \end{tabular}
       
    73 \end{center}\bigskip
    25 
    74 
    26 \begin{center}
    75 \begin{center}
    27 Graphs: $a^{?\{n\}} \cdot a^{\{n\}}$ and strings $\underbrace{a\ldots a}_{n}$\\
    76 Graphs: $a^{?\{n\}} \cdot a^{\{n\}}$ and strings $\underbrace{a\ldots a}_{n}$\\
    28 \begin{tabular}{@{}cc@{}}
    77 \begin{tabular}{@{}cc@{}}
    29 \begin{tikzpicture}
    78 \begin{tikzpicture}
    52   \begin{axis}[
   101   \begin{axis}[
    53     xlabel={$n$},
   102     xlabel={$n$},
    54     x label style={at={(1.1,0.05)}},
   103     x label style={at={(1.1,0.05)}},
    55     ylabel={\small time in secs},
   104     ylabel={\small time in secs},
    56     enlargelimits=false,
   105     enlargelimits=false,
    57     xtick={0,3000,...,9000},
   106     xtick={0,2500,...,11000},
    58     xmax=10000,
   107     xmax=12000,
    59     ymax=35,
   108     ymax=35,
    60     ytick={0,5,...,30},
   109     ytick={0,5,...,30},
    61     scaled ticks=false,
   110     scaled ticks=false,
    62     axis lines=left,
   111     axis lines=left,
    63     width=6.5cm,
   112     width=6.5cm,
    64     height=5cm]
   113     height=5cm,
    65 \addplot[green,mark=square*,mark options={fill=white}] table {re2.data};
   114     legend entries={Our matcher},  
       
   115     legend pos=north east,
       
   116     legend cell align=left]
       
   117 %\addplot[green,mark=square*,mark options={fill=white}] table {re2.data};
    66 \addplot[black,mark=square*,mark options={fill=white}] table {re3.data};
   118 \addplot[black,mark=square*,mark options={fill=white}] table {re3.data};
    67 \end{axis}
   119 \end{axis}
    68 \end{tikzpicture}
   120 \end{tikzpicture}
    69 \end{tabular}
   121 \end{tabular}
    70 \end{center}
   122 \end{center}
    71 
   123 \bigskip
    72 \begin{center}
   124 
    73 Graphs: $(a^*)^* \cdot b$ and strings $\underbrace{a\ldots a}_{n}$
   125 \noindent
    74 \begin{tabular}{@{}cc@{}}
   126 In what follows we will use these regular expressions and strings as
    75 \begin{tikzpicture}
   127 running examples. There will be several versions (V1, V2, V3,\ldots)
    76 \begin{axis}[
   128 of our matcher.\footnote{The corresponding files are
    77     xlabel={$n$},
   129   \texttt{re1.scala}, \texttt{re2.scala} and so on. As usual, you can
    78     x label style={at={(1.05,0.0)}},
   130   find the code on KEATS.}\bigskip
    79     ylabel={time in secs},
   131 
    80     enlargelimits=false,
   132 \noindent
    81     xtick={0,5,...,30},
       
    82     xmax=33,
       
    83     ymax=35,
       
    84     ytick={0,5,...,30},
       
    85     scaled ticks=false,
       
    86     axis lines=left,
       
    87     width=5cm,
       
    88     height=5cm, 
       
    89     legend entries={Java},  
       
    90     legend pos=north west,
       
    91     legend cell align=left]
       
    92 \addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
       
    93 \end{axis}
       
    94 \end{tikzpicture}
       
    95 &
       
    96 \begin{tikzpicture}
       
    97   \begin{axis}[
       
    98     xlabel={$n$},
       
    99     x label style={at={(1.05,0.0)}},
       
   100     ylabel={time in secs},
       
   101     enlargelimits=false,
       
   102     ymax=35,
       
   103     ytick={0,5,...,30},
       
   104     axis lines=left,
       
   105     scaled ticks=false,
       
   106     width=6.5cm,
       
   107     height=5cm]
       
   108 \addplot[green,mark=square*,mark options={fill=white}] table {re2a.data};    
       
   109 \addplot[black,mark=square*,mark options={fill=white}] table {re3a.data};
       
   110 \end{axis}
       
   111 \end{tikzpicture}
       
   112 \end{tabular}
       
   113 \end{center}\medskip
       
   114 
       
   115 \noindent
       
   116 We will use these regular expressions and strings
       
   117 as running examples.
       
   118 
       
   119 Having specified in the previous lecture what
   133 Having specified in the previous lecture what
   120 problem our regular expression matcher is supposed to solve,
   134 problem our regular expression matcher is supposed to solve,
   121 namely for any given regular expression $r$ and string $s$
   135 namely for any given regular expression $r$ and string $s$
   122 answer \textit{true} if and only if
   136 answer \textit{true} if and only if
   123 
   137 
   124 \[
   138 \[
   125 s \in L(r)
   139 s \in L(r)
   126 \]
   140 \]
   127 
   141 
   128 \noindent we can look at an algorithm to solve this problem. Clearly
   142 \noindent we can look for an algorithm to solve this problem. Clearly
   129 we cannot use the function $L$ directly for this, because in general
   143 we cannot use the function $L$ directly for this, because in general
   130 the set of strings $L$ returns is infinite (recall what $L(a^*)$ is).
   144 the set of strings $L$ returns is infinite (recall what $L(a^*)$ is).
   131 In such cases there is no way we can implement an exhaustive test for
   145 In such cases there is no way we can implement an exhaustive test for
   132 whether a string is member of this set or not. In contrast our
   146 whether a string is member of this set or not. In contrast our
   133 matching algorithm will operate on the regular expression $r$ and
   147 matching algorithm will operate on the regular expression $r$ and
   222 (r_1 + \ZERO) \cdot \ONE + ((\ONE + r_2) + r_3) \cdot (r_4 \cdot \ZERO)
   236 (r_1 + \ZERO) \cdot \ONE + ((\ONE + r_2) + r_3) \cdot (r_4 \cdot \ZERO)
   223 \label{big}
   237 \label{big}
   224 \end{equation}
   238 \end{equation}
   225 
   239 
   226 \noindent If we can find an equivalent regular expression that is
   240 \noindent If we can find an equivalent regular expression that is
   227 simpler (smaller for example), then this might potentially make our
   241 simpler (that usually means smaller), then this might potentially make
   228 matching algorithm run faster. We can look for such a simpler regular
   242 our matching algorithm run faster. We can look for such a simpler
   229 expression $r'$ because whether a string $s$ is in $L(r)$ or in
   243 regular expression $r'$ because whether a string $s$ is in $L(r)$ or
   230 $L(r')$ with $r\equiv r'$ will always give the same answer. In the
   244 in $L(r')$ with $r\equiv r'$ will always give the same answer. Yes?
   231 example above you will see that the regular expression is equivalent
   245 
   232 to just $r_1$. You can verify this by iteratively applying the
   246 In the example above you will see that the regular expression is
   233 simplification rules from above:
   247 equivalent to just $r_1$. You can verify this by iteratively applying
       
   248 the simplification rules from above:
   234 
   249 
   235 \begin{center}
   250 \begin{center}
   236 \begin{tabular}{ll}
   251 \begin{tabular}{ll}
   237  & $(r_1 + \ZERO) \cdot \ONE + ((\ONE + r_2) + r_3) \cdot 
   252  & $(r_1 + \ZERO) \cdot \ONE + ((\ONE + r_2) + r_3) \cdot 
   238 (\underline{r_4 \cdot \ZERO})$\smallskip\\
   253 (\underline{r_4 \cdot \ZERO})$\smallskip\\
   249 rule is applied. Our matching algorithm in the next section
   264 rule is applied. Our matching algorithm in the next section
   250 will often generate such ``useless'' $\ONE$s and
   265 will often generate such ``useless'' $\ONE$s and
   251 $\ZERO$s, therefore simplifying them away will make the
   266 $\ZERO$s, therefore simplifying them away will make the
   252 algorithm quite a bit faster.
   267 algorithm quite a bit faster.
   253 
   268 
       
   269 Finally here are three equivalences between regular expressions which are
       
   270 not so obvious:
       
   271 
       
   272 \begin{center}
       
   273 \begin{tabular}{rcl}
       
   274 $r^*$  & $\equiv$ & $1 + r\cdot r^*$\\
       
   275 $(r_1 + r_2)^*$  & $\equiv$ & $r_1^* \cdot (r_2\cdot r_1^*)^*$\\
       
   276 $(r_1 \cdot r_2)^*$ & $\equiv$ & $1 + r_1\cdot (r_2 \cdot r_1)^* \cdot r_2$\\
       
   277 \end{tabular}
       
   278 \end{center}
       
   279 
       
   280 \noindent
       
   281 We will not use them in our algorithm, but feel free to convince you
       
   282 that they hold. As an aside, there has been a lot of research about
       
   283 questions like: Can one always decide when two regular expressions are
       
   284 equivalent or not? What does an algorithm look like to decide this
       
   285 efficiently?
       
   286 
   254 \subsection*{The Matching Algorithm}
   287 \subsection*{The Matching Algorithm}
   255 
   288 
   256 The algorithm we will define below consists of two parts. One
   289 The algorithm we will define below consists of two parts. One
   257 is the function $\textit{nullable}$ which takes a regular expression as
   290 is the function $\textit{nullable}$ which takes a regular expression as
   258 argument and decides whether it can match the empty string
   291 argument and decides whether it can match the empty string
   284 
   317 
   285 The other function of our matching algorithm calculates a
   318 The other function of our matching algorithm calculates a
   286 \emph{derivative} of a regular expression. This is a function
   319 \emph{derivative} of a regular expression. This is a function
   287 which will take a regular expression, say $r$, and a
   320 which will take a regular expression, say $r$, and a
   288 character, say $c$, as arguments and returns a new regular
   321 character, say $c$, as arguments and returns a new regular
   289 expression. Be careful that the intuition behind this function
   322 expression. Be mindful that the intuition behind this function
   290 is not so easy to grasp on first reading. Essentially this
   323 is not so easy to grasp on first reading. Essentially this
   291 function solves the following problem: if $r$ can match a
   324 function solves the following problem: if $r$ can match a
   292 string of the form $c\!::\!s$, what does the regular
   325 string of the form $c\!::\!s$, what does a regular
   293 expression look like that can match just $s$? The definition
   326 expression look like that can match just $s$? The definition
   294 of this function is as follows:
   327 of this function is as follows:
   295 
   328 
   296 \begin{center}
   329 \begin{center}
   297 \begin{tabular}{l@ {\hspace{2mm}}c@ {\hspace{2mm}}l}
   330 \begin{tabular}{l@ {\hspace{2mm}}c@ {\hspace{2mm}}l}
   341 $c\!::\!s$, then the first part must be ``matched'' by a
   374 $c\!::\!s$, then the first part must be ``matched'' by a
   342 single copy of $r$. Therefore we call recursively $\textit{der}\,c\,r$
   375 single copy of $r$. Therefore we call recursively $\textit{der}\,c\,r$
   343 and ``append'' $r^*$ in order to match the rest of $s$. Still
   376 and ``append'' $r^*$ in order to match the rest of $s$. Still
   344 makes sense?
   377 makes sense?
   345 
   378 
   346 If all this did not make sense yet, here is another way to rationalise
   379 If all this did not make sense yet, here is another way to explain the
   347 the definition of $\textit{der}$ by considering the following operation
   380 definition of $\textit{der}$ by considering the following operation on
   348 on sets:
   381 sets:
   349 
   382 
   350 \begin{equation}\label{Der}
   383 \begin{equation}\label{Der}
   351 \textit{Der}\,c\,A\;\dn\;\{s\,|\,c\!::\!s \in A\}
   384 \textit{Der}\,c\,A\;\dn\;\{s\,|\,c\!::\!s \in A\}
   352 \end{equation}
   385 \end{equation}
   353 
   386 
   423   $\textit{ders}\, (c\!::\!s)\, r$ & $\dn$ & $\textit{ders}\,s\,(\textit{der}\,c\,r)$ & \\
   456   $\textit{ders}\, (c\!::\!s)\, r$ & $\dn$ & $\textit{ders}\,s\,(\textit{der}\,c\,r)$ & \\
   424   \end{tabular}
   457   \end{tabular}
   425 \end{center}
   458 \end{center}
   426 
   459 
   427 \noindent This function iterates $\textit{der}$ taking one character at
   460 \noindent This function iterates $\textit{der}$ taking one character at
   428 the time from the original string until it is exhausted.
   461 the time from the original string until the string is exhausted.
   429 Having $\textit{der}s$ in place, we can finally define our matching
   462 Having $\textit{der}s$ in place, we can finally define our matching
   430 algorithm:
   463 algorithm:
   431 
   464 
   432 \[
   465 \[
   433 \textit{matches}\,s\,r \dn \textit{nullable}(\textit{ders}\,s\,r)
   466 \textit{matches}\,s\,r \dn \textit{nullable}(\textit{ders}\,s\,r)
   459 Given the implementation of regular expressions in Scala shown
   492 Given the implementation of regular expressions in Scala shown
   460 in the first lecture and handout, the functions and subfunctions
   493 in the first lecture and handout, the functions and subfunctions
   461 for \pcode{matches} are shown in Figure~\ref{scala1}.
   494 for \pcode{matches} are shown in Figure~\ref{scala1}.
   462 
   495 
   463 \begin{figure}[p]
   496 \begin{figure}[p]
   464 \lstinputlisting{../progs/app5.scala}
   497   \lstinputlisting[numbers=left,linebackgroundcolor=
   465 \caption{Scala implementation of the \textit{nullable} and 
   498                   {\ifodd\value{lstnumber}\color{capri!3}\fi}]
       
   499                   {../progs/app5.scala}
       
   500 \caption{A Scala implementation of the \textit{nullable} and 
   466   derivative functions. These functions are easy to
   501   derivative functions. These functions are easy to
   467   implement in functional languages, because their built-in pattern 
   502   implement in functional languages. This is because pattern 
   468   matching and recursion allow us to mimic the mathematical
   503   matching and recursion allow us to mimic the mathematical
   469   definitions very closely.\label{scala1}}
   504   definitions very closely. Nearly all functional
       
   505   programming languages support pattern matching and
       
   506   recursion out of the box.\label{scala1}}
   470 \end{figure}
   507 \end{figure}
   471 
   508 
   472 
   509 
   473 %Remember our second example involving the regular expression
   510 %Remember our second example involving the regular expression
   474 %$(a^*)^* \cdot b$ which could not match strings of $n$ \texttt{a}s. 
   511 %$(a^*)^* \cdot b$ which could not match strings of $n$ \texttt{a}s. 
   505 %strings up to the length of 6500. After that we receive a
   542 %strings up to the length of 6500. After that we receive a
   506 %StackOverflow exception, but still\ldots
   543 %StackOverflow exception, but still\ldots
   507 
   544 
   508 For running the algorithm with our first example, the evil
   545 For running the algorithm with our first example, the evil
   509 regular expression $a^?{}^{\{n\}}a^{\{n\}}$, we need to implement
   546 regular expression $a^?{}^{\{n\}}a^{\{n\}}$, we need to implement
   510 the optional regular expression and the exactly $n$-times
   547 the optional regular expression and the `exactly $n$-times
   511 regular expression. This can be done with the translations
   548 regular expression'. This can be done with the translations
   512 
   549 
   513 \lstinputlisting[numbers=none]{../progs/app51.scala}
   550 \lstinputlisting[numbers=none]{../progs/app51.scala}
   514 
   551 
   515 \noindent Running the matcher with this example, we find it is
   552 \noindent Running the matcher with this example, we find it is
   516 slightly worse then the matcher in Ruby and Python.
   553 slightly worse then the matcher in Ruby and Python.
   539 \addplot[red,mark=triangle*,mark options={fill=white}] table {re1.data};  
   576 \addplot[red,mark=triangle*,mark options={fill=white}] table {re1.data};  
   540 \end{axis}
   577 \end{axis}
   541 \end{tikzpicture}
   578 \end{tikzpicture}
   542 \end{center}
   579 \end{center}
   543 
   580 
   544 \noindent Analysing this failure we notice that for
   581 \noindent Analysing this failure we notice that for $a^{\{n\}}$, for
   545 $a^{\{n\}}$ we generate quite big regular expressions:
   582 example, we generate quite big regular expressions:
   546 
   583 
   547 \begin{center}
   584 \begin{center}
   548 \begin{tabular}{rl}
   585 \begin{tabular}{rl}
   549 1: & $a$\\
   586 1: & $a$\\
   550 2: & $a\cdot a$\\
   587 2: & $a\cdot a$\\
   558 \noindent Our algorithm traverses such regular expressions at
   595 \noindent Our algorithm traverses such regular expressions at
   559 least once every time a derivative is calculated. So having
   596 least once every time a derivative is calculated. So having
   560 large regular expressions will cause problems. This problem
   597 large regular expressions will cause problems. This problem
   561 is aggravated by $a^?$ being represented as $a + \ONE$.
   598 is aggravated by $a^?$ being represented as $a + \ONE$.
   562 
   599 
   563 We can however fix this by having an explicit constructor for
   600 We can however fix this easily by having an explicit constructor for
   564 $r^{\{n\}}$. In Scala we would introduce a constructor like
   601 $r^{\{n\}}$. In Scala we would introduce a constructor like
   565 
   602 
   566 \begin{center}
   603 \begin{center}
   567 \code{case class NTIMES(r: Rexp, n: Int) extends Rexp}
   604 \code{case class NTIMES(r: Rexp, n: Int) extends Rexp}
   568 \end{center}
   605 \end{center}
   569 
   606 
   570 \noindent With this fix we have a constant ``size'' regular
   607 \noindent With this fix we have a constant ``size'' regular expression
   571 expression for our running example no matter how large $n$ is.
   608 for our running example no matter how large $n$ is (see the
   572 This means we have to also add cases for \pcode{NTIMES} in the
   609 \texttt{size} section in the implementations).  This means we have to
   573 functions $\textit{nullable}$ and $\textit{der}$. Does the change have any
   610 also add cases for \pcode{NTIMES} in the functions $\textit{nullable}$
   574 effect?
   611 and $\textit{der}$. Does the change have any effect?
   575 
   612 
   576 \begin{center}
   613 \begin{center}
   577 \begin{tikzpicture}
   614 \begin{tikzpicture}
   578 \begin{axis}[
   615 \begin{axis}[
   579     title={Graph: $a^{?\{n\}} \cdot a^{\{n\}}$ and strings $\underbrace{a\ldots a}_{n}$},
   616     title={Graph: $a^{?\{n\}} \cdot a^{\{n\}}$ and strings $\underbrace{a\ldots a}_{n}$},
   580     xlabel={$n$},
   617     xlabel={$n$},
   581     x label style={at={(1.01,0.0)}},
   618     x label style={at={(1.01,0.0)}},
   582     ylabel={time in secs},
   619     ylabel={time in secs},
   583     enlargelimits=false,
   620     enlargelimits=false,
   584     xtick={0,100,...,1000},
   621     xtick={0,200,...,1100},
   585     xmax=1100,
   622     xmax=1200,
   586     ytick={0,5,...,30},
   623     ytick={0,5,...,30},
   587     scaled ticks=false,
   624     scaled ticks=false,
   588     axis lines=left,
   625     axis lines=left,
   589     width=10cm,
   626     width=10cm,
   590     height=5cm, 
   627     height=5cm, 
   597 \addplot[green,mark=square*,mark options={fill=white}] table {re2.data};
   634 \addplot[green,mark=square*,mark options={fill=white}] table {re2.data};
   598 \end{axis}
   635 \end{axis}
   599 \end{tikzpicture}
   636 \end{tikzpicture}
   600 \end{center}
   637 \end{center}
   601 
   638 
   602 \noindent Now we are talking business! The modified matcher 
   639 \noindent Now we are talking business! The modified matcher can within
   603 can within 30 seconds handle regular expressions up to
   640 25 seconds handle regular expressions up to $n = 1,100$ before a
   604 $n = 950$ before a StackOverflow is raised. Recall that Python and Ruby 
   641 StackOverflow is raised. Recall that Python and Ruby (and our first
   605 (and our first version, Scala V1) could only handle $n = 27$ or so in 30 
   642 version, Scala V1) could only handle $n = 27$ or so in 30
   606 seconds. There is no change for our second example 
   643 seconds. We have not tried our algorithm on the second example $(a^*)^* \cdot
   607 $(a^*)^* \cdot b$---so this is still good.
   644 b$---but it is doing OK with it.
   608 
   645 
   609 
   646 
   610 The moral is that our algorithm is rather sensitive to the
   647 The moral is that our algorithm is rather sensitive to the
   611 size of regular expressions it needs to handle. This is of
   648 size of regular expressions it needs to handle. This is of
   612 course obvious because both $\textit{nullable}$ and $\textit{der}$ frequently
   649 course obvious because both $\textit{nullable}$ and $\textit{der}$ frequently
   613 need to traverse the whole regular expression. There seems,
   650 need to traverse the whole regular expression. There seems,
   614 however, one more issue for making the algorithm run faster.
   651 however, one more issue for making the algorithm run faster.
   615 The derivative function often produces ``useless''
   652 The derivative function often produces ``useless''
   616 $\ZERO$s and $\ONE$s. To see this, consider $r = ((a
   653 $\ZERO$s and $\ONE$s. To see this, consider $r = ((a
   617 \cdot b) + b)^*$ and the following two derivatives
   654 \cdot b) + b)^*$ and the following three derivatives
   618 
   655 
   619 \begin{center}
   656 \begin{center}
   620 \begin{tabular}{l}
   657 \begin{tabular}{l}
   621 $\textit{der}\,a\,r = ((\ONE \cdot b) + \ZERO) \cdot r$\\
   658 $\textit{der}\,a\,r = ((\ONE \cdot b) + \ZERO) \cdot r$\\
   622 $\textit{der}\,b\,r = ((\ZERO \cdot b) + \ONE)\cdot r$\\
   659 $\textit{der}\,b\,r = ((\ZERO \cdot b) + \ONE)\cdot r$\\
   623 $\textit{der}\,c\,r = ((\ZERO \cdot b) + \ZERO)\cdot r$
   660 $\textit{der}\,c\,r = ((\ZERO \cdot b) + \ZERO)\cdot r$
   624 \end{tabular}
   661 \end{tabular}
   625 \end{center}
   662 \end{center}
   626 
   663 
   627 \noindent 
   664 \noindent 
   628 If we simplify them according to the simple rules from the
   665 If we simplify them according to the simplification rules from the
   629 beginning, we can replace the right-hand sides by the 
   666 beginning, we can replace the right-hand sides by the smaller
   630 smaller equivalent regular expressions
   667 equivalent regular expressions
   631 
   668 
   632 \begin{center}
   669 \begin{center}
   633 \begin{tabular}{l}
   670 \begin{tabular}{l}
   634 $\textit{der}\,a\,r \equiv b \cdot r$\\
   671 $\textit{der}\,a\,r \equiv b \cdot r$\\
   635 $\textit{der}\,b\,r \equiv r$\\
   672 $\textit{der}\,b\,r \equiv r$\\
   636 $\textit{der}\,c\,r \equiv \ZERO$
   673 $\textit{der}\,c\,r \equiv \ZERO$
   637 \end{tabular}
   674 \end{tabular}
   638 \end{center}
   675 \end{center}
   639 
   676 
   640 \noindent I leave it to you to contemplate whether such a
   677 \noindent I leave it to you to contemplate whether such a
   641 simplification can have any impact on the correctness of our
   678 simplification can have any impact on the correctness of our algorithm
   642 algorithm (will it change any answers?). Figure~\ref{scala2}
   679 (will it change any answers?). Figure~\ref{scala2} gives a
   643 gives a simplification function that recursively traverses a
   680 simplification function that recursively traverses a regular
   644 regular expression and simplifies it according to the rules
   681 expression and simplifies it according to the rules given at the
   645 given at the beginning. There are only rules for $+$, $\cdot$
   682 beginning. There are only rules for $+$, $\cdot$ and $n$-times (the
   646 and $n$-times (the latter because we added it in the second
   683 latter because we added it in the second version of our
   647 version of our matcher). There is no rule for a star, because
   684 matcher). There is no simplification rule for a star, because
   648 empirical data and also a little thought showed that
   685 empirical data and also a little thought showed that simplifying under
   649 simplifying under a star is a waste of computation time. The
   686 a star is a waste of computation time. The simplification function
   650 simplification function will be called after every derivation.
   687 will be called after every derivation.  This additional step removes
   651 This additional step removes all the ``junk'' the derivative
   688 all the ``junk'' the derivative function introduced. Does this improve
   652 function introduced. Does this improve the speed? You bet!!
   689 the speed? You bet!!
   653 
   690 
   654 \begin{figure}[p]
   691 \begin{figure}[p]
   655 \lstinputlisting{../progs/app6.scala}
   692 \lstinputlisting[numbers=left,linebackgroundcolor=
       
   693   {\ifodd\value{lstnumber}\color{capri!3}\fi}]
       
   694                 {../progs/app6.scala}
   656 \caption{The simplification function and modified 
   695 \caption{The simplification function and modified 
   657 \texttt{ders}-function; this function now
   696 \texttt{ders}-function; this function now
   658 calls \texttt{der} first, but then simplifies
   697 calls \texttt{der} first, but then simplifies
   659 the resulting derivative regular expressions before
   698 the resulting derivative regular expressions before
   660 building the next derivative, see
   699 building the next derivative, see
   667     title={Graph: $a^{?\{n\}} \cdot a^{\{n\}}$ and strings $\underbrace{a\ldots a}_{n}$},
   706     title={Graph: $a^{?\{n\}} \cdot a^{\{n\}}$ and strings $\underbrace{a\ldots a}_{n}$},
   668     xlabel={$n$},
   707     xlabel={$n$},
   669     x label style={at={(1.04,0.0)}},
   708     x label style={at={(1.04,0.0)}},
   670     ylabel={time in secs},
   709     ylabel={time in secs},
   671     enlargelimits=false,
   710     enlargelimits=false,
   672     xtick={0,3000,...,9000},
   711     xtick={0,2500,...,10000},
   673     xmax=10000,
   712     xmax=12000,
   674     ytick={0,5,...,30},
   713     ytick={0,5,...,30},
   675     ymax=32,
   714     ymax=32,
   676     scaled ticks=false,
   715     scaled ticks=false,
   677     axis lines=left,
   716     axis lines=left,
   678     width=9cm,
   717     width=9cm,
   685 \end{axis}
   724 \end{axis}
   686 \end{tikzpicture}
   725 \end{tikzpicture}
   687 \end{center}
   726 \end{center}
   688 
   727 
   689 \noindent
   728 \noindent
   690 To reacap, Python and Ruby needed approximately 30 seconds to match
   729 To reacap, Python and Ruby needed approximately 30 seconds to match a
   691 a string of 28 \texttt{a}s and the regular expression $a^{?\{n\}} \cdot a^{\{n\}}$. 
   730 string of 28 \texttt{a}s and the regular expression $a^{?\{n\}} \cdot
   692 We need a third of this time to do the same with strings up to 12,000 \texttt{a}s. 
   731 a^{\{n\}}$.  We need a third of this time to do the same with strings
   693 Similarly, Java needed 30 seconds to find out the regular expression 
   732 up to 11,000 \texttt{a}s.  Similarly, Java and Python needed 30
   694 $(a^*)^* \cdot b$ does not match the string of 28 \texttt{a}s. We can do
   733 seconds to find out the regular expression $(a^*)^* \cdot b$ does not
   695 the same in approximately 5 seconds for strings of 6000000 \texttt{a}s:
   734 match the string of 28 \texttt{a}s. We can do the same in
       
   735 for strings composed of nearly 6,000,000 \texttt{a}s:
   696 
   736 
   697 
   737 
   698 \begin{center}
   738 \begin{center}
   699 \begin{tikzpicture}
   739 \begin{tikzpicture}
   700 \begin{axis}[
   740 \begin{axis}[
   701     title={Graph: $(a^*)^* \cdot b$ and strings $\underbrace{a\ldots a}_{n}$},
   741     title={Graph: $(a^*)^* \cdot b$ and strings $\underbrace{a\ldots a}_{n}$},
   702     xlabel={$n$},
   742     xlabel={$n$},
   703     x label style={at={(1.09,0.0)}},
       
   704     ylabel={time in secs},
   743     ylabel={time in secs},
   705     enlargelimits=false,
   744     enlargelimits=false,
   706     xmax=7700000,
   745     ymax=35,
   707     ytick={0,5,...,30},
   746     ytick={0,5,...,30},
   708     ymax=32,
   747     axis lines=left,
   709     %scaled ticks=false,
   748     %scaled ticks=false,
   710     axis lines=left,
   749     x label style={at={(1.09,0.0)}},
       
   750     %xmax=7700000,
   711     width=9cm,
   751     width=9cm,
   712     height=5cm, 
   752     height=5cm, 
   713     legend entries={Scala V2, Scala V3},
   753     legend entries={Scala V3},
   714     legend pos=outer north east,
   754     legend pos=outer north east,
   715     legend cell align=left]
   755     legend cell align=left]
   716 \addplot[green,mark=square*,mark options={fill=white}] table {re2a.data};
   756 %\addplot[green,mark=square*,mark options={fill=white}] table {re2a.data};
   717 \addplot[black,mark=square*,mark options={fill=white}] table {re3a.data};
   757 \addplot[black,mark=square*,mark options={fill=white}] table {re3a.data};
   718 \end{axis}
   758 \end{axis}
   719 \end{tikzpicture}
   759 \end{tikzpicture}
   720 \end{center}
   760 \end{center}
   721 
   761 
   722 \subsection*{Epilogue}
   762 \subsection*{Epilogue}
   723 
   763 
   724 (23/Aug/2016) I recently found another place where this algorithm can be 
   764 (23/Aug/2016) I recently found another place where this algorithm can
   725 sped (this idea is not integrated with what is coming next,
   765 be sped up (this idea is not integrated with what is coming next, but
   726 but I present it nonetheless). The idea is to define \texttt{ders}
   766 I present it nonetheless). The idea is to not define \texttt{ders}
   727 not such that it iterates the derivative character-by-character, but
   767 that it iterates the derivative character-by-character, but in bigger
   728 in bigger chunks. The resulting code for \texttt{ders2} looks as
   768 chunks. The resulting code for \texttt{ders2} looks as follows:
   729 follows:
       
   730 
   769 
   731 \lstinputlisting[numbers=none]{../progs/app52.scala} 
   770 \lstinputlisting[numbers=none]{../progs/app52.scala} 
   732 
   771 
   733 \noindent
   772 \noindent
   734 I have not fully understood why this version is much faster,
   773 I have not fully understood why this version is much faster,
   754     xmax=7100000,
   793     xmax=7100000,
   755     ytick={0,5,...,30},
   794     ytick={0,5,...,30},
   756     ymax=33,
   795     ymax=33,
   757     %scaled ticks=false,
   796     %scaled ticks=false,
   758     axis lines=left,
   797     axis lines=left,
   759     width=5.5cm,
   798     width=5.3cm,
   760     height=5cm, 
   799     height=5cm, 
   761     legend entries={Scala V3, Scala V4},
   800     legend entries={Scala V3, Scala V4},
   762     legend style={at={(0.1,-0.2)},anchor=north}]
   801     legend style={at={(0.1,-0.2)},anchor=north}]
   763 \addplot[black,mark=square*,mark options={fill=white}] table {re3.data};
   802 \addplot[black,mark=square*,mark options={fill=white}] table {re3.data};
   764 \addplot[purple,mark=square*,mark options={fill=white}] table {re4.data};
   803 \addplot[purple,mark=square*,mark options={fill=white}] table {re4.data};
   770     title={Graph: $(a^*)^* \cdot b$ and strings $\underbrace{a\ldots a}_{n}$},
   809     title={Graph: $(a^*)^* \cdot b$ and strings $\underbrace{a\ldots a}_{n}$},
   771     xlabel={$n$},
   810     xlabel={$n$},
   772     x label style={at={(1.09,0.0)}},
   811     x label style={at={(1.09,0.0)}},
   773     ylabel={time in secs},
   812     ylabel={time in secs},
   774     enlargelimits=false,
   813     enlargelimits=false,
   775     xmax=8100000,
   814     xmax=8200000,
   776     ytick={0,5,...,30},
   815     ytick={0,5,...,30},
   777     ymax=33,
   816     ymax=33,
   778     %scaled ticks=false,
   817     %scaled ticks=false,
   779     axis lines=left,
   818     axis lines=left,
   780     width=5.5cm,
   819     width=5.3cm,
   781     height=5cm, 
   820     height=5cm, 
   782     legend entries={Scala V3, Scala V4},
   821     legend entries={Scala V3, Scala V4},
   783     legend style={at={(0.1,-0.2)},anchor=north}]
   822     legend style={at={(0.1,-0.2)},anchor=north}]
   784 \addplot[black,mark=square*,mark options={fill=white}] table {re3a.data};
   823 \addplot[black,mark=square*,mark options={fill=white}] table {re3a.data};
   785 \addplot[purple,mark=square*,mark options={fill=white}] table {re4a.data};
   824 \addplot[purple,mark=square*,mark options={fill=white}] table {re4a.data};
   791 
   830 
   792 \section*{Proofs}
   831 \section*{Proofs}
   793 
   832 
   794 You might not like doing proofs. But they serve a very
   833 You might not like doing proofs. But they serve a very
   795 important purpose in Computer Science: How can we be sure that
   834 important purpose in Computer Science: How can we be sure that
   796 our algorithm matches its specification. We can try to test
   835 our algorithm matches its specification? We can try to test
   797 the algorithm, but that often overlooks corner cases and an
   836 the algorithm, but that often overlooks corner cases and an
   798 exhaustive testing is impossible (since there are infinitely
   837 exhaustive testing is impossible (since there are infinitely
   799 many inputs). Proofs allow us to ensure that an algorithm
   838 many inputs). Proofs allow us to ensure that an algorithm
   800 really meets its specification. 
   839 really meets its specification. 
   801 
   840 
   812         & $\mid$ & $r_1 \cdot r_2$      & sequence\\
   851         & $\mid$ & $r_1 \cdot r_2$      & sequence\\
   813         & $\mid$ & $r^*$                & star (zero or more)\\
   852         & $\mid$ & $r^*$                & star (zero or more)\\
   814   \end{tabular}
   853   \end{tabular}
   815 \end{center}
   854 \end{center}
   816 
   855 
   817 \noindent If you want to show a property $P(r)$ for all 
   856 \noindent If you want to show a property $P(r)$ for \emph{all} 
   818 regular expressions $r$, then you have to follow essentially 
   857 regular expressions $r$, then you have to follow essentially 
   819 the recipe:
   858 the recipe:
   820 
   859 
   821 \begin{itemize}
   860 \begin{itemize}
   822 \item $P$ has to hold for $\ZERO$, $\ONE$ and $c$
   861 \item $P$ has to hold for $\ZERO$, $\ONE$ and $c$
   864 \textit{nullable}(r_1 + r_2) \;\;\text{if and only if}\;\; 
   903 \textit{nullable}(r_1 + r_2) \;\;\text{if and only if}\;\; 
   865 []\in L(r_1 + r_2)
   904 []\in L(r_1 + r_2)
   866 \label{propalt}
   905 \label{propalt}
   867 \end{equation}
   906 \end{equation}
   868 
   907 
   869 \noindent The difference to the base cases is that in this
   908 \noindent The difference to the base cases is that in the inductive
   870 case we can already assume we proved
   909 cases we can already assume we proved $P$ for the components, that is
       
   910 we can assume.
   871 
   911 
   872 \begin{center}
   912 \begin{center}
   873 \begin{tabular}{l}
   913 \begin{tabular}{l}
   874 $\textit{nullable}(r_1) \;\;\text{if and only if}\;\; []\in L(r_1)$ and\\
   914 $\textit{nullable}(r_1) \;\;\text{if and only if}\;\; []\in L(r_1)$ and\\
   875 $\textit{nullable}(r_2) \;\;\text{if and only if}\;\; []\in L(r_2)$\\
   915 $\textit{nullable}(r_2) \;\;\text{if and only if}\;\; []\in L(r_2)$\\
   876 \end{tabular}
   916 \end{tabular}
   877 \end{center}
   917 \end{center}
   878 
   918 
   879 \noindent These are the induction hypotheses. To check this 
   919 \noindent These are called the induction hypotheses. To check this 
   880 case, we can start from $\textit{nullable}(r_1 + r_2)$, which by 
   920 case, we can start from $\textit{nullable}(r_1 + r_2)$, which by 
   881 definition is
   921 definition of $\textit{nullable}$ is
   882 
   922 
   883 \[
   923 \[
   884 \textit{nullable}(r_1) \vee \textit{nullable}(r_2)
   924 \textit{nullable}(r_1) \vee \textit{nullable}(r_2)
   885 \]
   925 \]
   886 
   926 
   899 
   939 
   900 \[
   940 \[
   901 [] \in L(r_1)\cup L(r_2)
   941 [] \in L(r_1)\cup L(r_2)
   902 \]
   942 \]
   903 
   943 
   904 \noindent but this is by definition of $L$ exactly $[] \in
   944 \noindent but this is by definition of $L$ exactly $[] \in L(r_1 +
   905 L(r_1 + r_2)$, which we needed to establish according to
   945 r_2)$, which we needed to establish according to statement in
   906 \eqref{propalt}. What we have shown is that starting from
   946 \eqref{propalt}. What we have shown is that starting from
   907 $\textit{nullable}(r_1 + r_2)$ we have done equivalent transformations
   947 $\textit{nullable}(r_1 + r_2)$ we have done equivalent transformations
   908 to end up with $[] \in L(r_1 + r_2)$. Consequently we have
   948 to end up with $[] \in L(r_1 + r_2)$. Consequently we have established
   909 established that $P(r_1 + r_2)$ holds.
   949 that $P(r_1 + r_2)$ holds.
   910 
   950 
   911 In order to complete the proof we would now need to look 
   951 In order to complete the proof we would now need to look 
   912 at the cases \mbox{$P(r_1\cdot r_2)$} and $P(r^*)$. Again I let you
   952 at the cases \mbox{$P(r_1\cdot r_2)$} and $P(r^*)$. Again I let you
   913 check the details.
   953 check the details.
   914 
   954 
   915 You might have to do induction proofs over strings. 
   955 You might also have to do induction proofs over strings. 
   916 That means you want to establish a property $P(s)$ for all
   956 That means you want to establish a property $P(s)$ for all
   917 strings $s$. For this remember strings are lists of 
   957 strings $s$. For this remember strings are lists of 
   918 characters. These lists can be either the empty list or a
   958 characters. These lists can be either the empty list or a
   919 list of the form $c::s$. If you want to perform an induction
   959 list of the form $c::s$. If you want to perform an induction
   920 proof for strings you need to consider the cases
   960 proof for strings you need to consider the cases
   946 
   986 
   947 \[
   987 \[
   948 L(\textit{der}\,c\,r) = \textit{Der}\,c\,(L(r))
   988 L(\textit{der}\,c\,r) = \textit{Der}\,c\,(L(r))
   949 \]
   989 \]
   950 
   990 
   951 \noindent holds (this would be of course a property that
   991 \noindent holds (this would be of course another property that needs
   952 needs to be proved in a side-lemma by induction on $r$).
   992 to be proved in a side-lemma by induction on $r$). This is a bit
       
   993 more challenging, but not impossible.
   953 
   994 
   954 To sum up, using reasoning like the one shown above allows us 
   995 To sum up, using reasoning like the one shown above allows us 
   955 to show the correctness of our algorithm. To see this,
   996 to show the correctness of our algorithm. To see this,
   956 start from the specification
   997 start from the specification
   957 
   998 
   966 \begin{equation}
  1007 \begin{equation}
   967 [] \in \textit{Ders}\,s\,(L(r))
  1008 [] \in \textit{Ders}\,s\,(L(r))
   968 \label{dersstep}
  1009 \label{dersstep}
   969 \end{equation}
  1010 \end{equation}
   970 
  1011 
   971 \noindent But we have shown above in \eqref{dersprop}, that
  1012 \noindent You agree?  But we have shown above in \eqref{dersprop},
   972 the $\textit{Ders}$ can be replaced by $L(\textit{ders}\ldots)$. That means 
  1013 that the $\textit{Ders}$ can be replaced by
   973 \eqref{dersstep} is equivalent to 
  1014 $L(\textit{ders}\ldots)$. That means \eqref{dersstep} is equivalent to
   974 
  1015 
   975 \begin{equation}
  1016 \begin{equation}
   976 [] \in L(\textit{ders}\,s\,r)
  1017 [] \in L(\textit{ders}\,s\,r)
   977 \label{prefinalstep}
  1018 \label{prefinalstep}
   978 \end{equation}
  1019 \end{equation}
   997 \[
  1038 \[
   998 matches\,s\,r\;\;\text{if and only if}\;\;
  1039 matches\,s\,r\;\;\text{if and only if}\;\;
   999 s\in L(r)
  1040 s\in L(r)
  1000 \]
  1041 \]
  1001 
  1042 
  1002 \noindent which is the property we set out to prove:
  1043 \noindent which is the property we set out to prove: our algorithm
  1003 our algorithm meets its specification. To have done
  1044 meets its specification. To have done so, requires a few induction
  1004 so, requires a few induction proofs about strings and
  1045 proofs about strings and regular expressions. Following the \emph{induction
  1005 regular expressions. Following the recipes is already a big 
  1046 recipes} is already a big step in actually performing these proofs.
  1006 step in performing these proofs.
  1047 If you do not believe it, proofs have helped me to make sure my code
       
  1048 is correct and in several instances prevented me of letting slip
       
  1049 embarassing mistakes into the `wild'.
  1007 
  1050 
  1008 \end{document}
  1051 \end{document}
  1009 
  1052 
  1010 
  1053 
  1011 
  1054