handouts/ho02.tex
changeset 478 48b842c997c7
parent 477 b78664a24f5d
child 479 52aa298211f6
child 480 9e42ccbbd1e6
equal deleted inserted replaced
477:b78664a24f5d 478:48b842c997c7
    11 
    11 
    12 
    12 
    13 \section*{Handout 2 (Regular Expression Matching)}
    13 \section*{Handout 2 (Regular Expression Matching)}
    14 
    14 
    15 This lecture is about implementing a more efficient regular expression
    15 This lecture is about implementing a more efficient regular expression
    16 matcher (the plots on the right)---more efficient than the matchers
    16 matcher (the plots on the right below)---more efficient than the
    17 from regular expression libraries in Ruby, Python and Java (the plots
    17 matchers from regular expression libraries in Ruby, Python and Java
    18 on the left). The first pair of plots show the running time for the
    18 (the plots on the left). The first pair of plots show the running time
    19 regular expressions $a^?{}^{\{n\}}\cdot a^{\{n\}}$ and strings composed
    19 for the regular expression $(a^*)^*\cdot b$ and strings composed of
    20 of $n$ \pcode{a}s. The second pair of plots show the running time
    20 $n$ \pcode{a}s (meaning this regular expression actually does not
    21 for the regular expression $(a^*)^*\cdot b$ and also strings composed
    21 match the strings). The second pair of plots show the running time for
    22 of $n$ \pcode{a}s (meaning this regular expression actually does not
    22 the regular expressions $a^?{}^{\{n\}}\cdot a^{\{n\}}$ and strings
       
    23 also composed of $n$ \pcode{a}s (this time the regular expressions
    23 match the strings).  To see the substantial differences in the left
    24 match the strings).  To see the substantial differences in the left
    24 and right plots below, note the different scales of the $x$-axes. 
    25 and right plots below, note the different scales of the $x$-axes.
    25 
    26 
       
    27 
       
    28 \begin{center}
       
    29 Graphs: $(a^*)^* \cdot b$ and strings $\underbrace{a\ldots a}_{n}$
       
    30 \begin{tabular}{@{}cc@{}}
       
    31 \begin{tikzpicture}
       
    32 \begin{axis}[
       
    33     xlabel={$n$},
       
    34     x label style={at={(1.05,0.0)}},
       
    35     ylabel={time in secs},
       
    36     enlargelimits=false,
       
    37     xtick={0,5,...,30},
       
    38     xmax=33,
       
    39     ymax=35,
       
    40     ytick={0,5,...,30},
       
    41     scaled ticks=false,
       
    42     axis lines=left,
       
    43     width=5cm,
       
    44     height=5cm, 
       
    45     legend entries={Java, Python},  
       
    46     legend pos=north west,
       
    47     legend cell align=left]
       
    48 \addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
       
    49 \addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
       
    50 \end{axis}
       
    51 \end{tikzpicture}
       
    52 &
       
    53 \begin{tikzpicture}
       
    54   \begin{axis}[
       
    55     xlabel={$n$},
       
    56     x label style={at={(1.05,0.0)}},
       
    57     ylabel={time in secs},
       
    58     enlargelimits=false,
       
    59     ymax=35,
       
    60     ytick={0,5,...,30},
       
    61     axis lines=left,
       
    62     scaled ticks=false,
       
    63     width=6.5cm,
       
    64     height=5cm,
       
    65     legend entries={Scala V3},  
       
    66     legend pos=north east,
       
    67     legend cell align=left]
       
    68 %\addplot[green,mark=square*,mark options={fill=white}] table {re2a.data};    
       
    69 \addplot[black,mark=square*,mark options={fill=white}] table {re3a.data};
       
    70 \end{axis}
       
    71 \end{tikzpicture}
       
    72 \end{tabular}
       
    73 \end{center}
    26 
    74 
    27 \begin{center}
    75 \begin{center}
    28 Graphs: $a^{?\{n\}} \cdot a^{\{n\}}$ and strings $\underbrace{a\ldots a}_{n}$\\
    76 Graphs: $a^{?\{n\}} \cdot a^{\{n\}}$ and strings $\underbrace{a\ldots a}_{n}$\\
    29 \begin{tabular}{@{}cc@{}}
    77 \begin{tabular}{@{}cc@{}}
    30 \begin{tikzpicture}
    78 \begin{tikzpicture}
    60     ymax=35,
   108     ymax=35,
    61     ytick={0,5,...,30},
   109     ytick={0,5,...,30},
    62     scaled ticks=false,
   110     scaled ticks=false,
    63     axis lines=left,
   111     axis lines=left,
    64     width=6.5cm,
   112     width=6.5cm,
    65     height=5cm]
   113     height=5cm,
    66 \addplot[green,mark=square*,mark options={fill=white}] table {re2.data};
   114     legend entries={Scala V3},  
       
   115     legend pos=north east,
       
   116     legend cell align=left]
       
   117 %\addplot[green,mark=square*,mark options={fill=white}] table {re2.data};
    67 \addplot[black,mark=square*,mark options={fill=white}] table {re3.data};
   118 \addplot[black,mark=square*,mark options={fill=white}] table {re3.data};
    68 \end{axis}
   119 \end{axis}
    69 \end{tikzpicture}
   120 \end{tikzpicture}
    70 \end{tabular}
   121 \end{tabular}
    71 \end{center}
   122 \end{center}
    72 
   123 \medskip
    73 \begin{center}
       
    74 Graphs: $(a^*)^* \cdot b$ and strings $\underbrace{a\ldots a}_{n}$
       
    75 \begin{tabular}{@{}cc@{}}
       
    76 \begin{tikzpicture}
       
    77 \begin{axis}[
       
    78     xlabel={$n$},
       
    79     x label style={at={(1.05,0.0)}},
       
    80     ylabel={time in secs},
       
    81     enlargelimits=false,
       
    82     xtick={0,5,...,30},
       
    83     xmax=33,
       
    84     ymax=35,
       
    85     ytick={0,5,...,30},
       
    86     scaled ticks=false,
       
    87     axis lines=left,
       
    88     width=5cm,
       
    89     height=5cm, 
       
    90     legend entries={Java, Python},  
       
    91     legend pos=north west,
       
    92     legend cell align=left]
       
    93 \addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
       
    94 \addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
       
    95 \end{axis}
       
    96 \end{tikzpicture}
       
    97 &
       
    98 \begin{tikzpicture}
       
    99   \begin{axis}[
       
   100     xlabel={$n$},
       
   101     x label style={at={(1.05,0.0)}},
       
   102     ylabel={time in secs},
       
   103     enlargelimits=false,
       
   104     ymax=35,
       
   105     ytick={0,5,...,30},
       
   106     axis lines=left,
       
   107     scaled ticks=false,
       
   108     width=6.5cm,
       
   109     height=5cm]
       
   110 \addplot[green,mark=square*,mark options={fill=white}] table {re2a.data};    
       
   111 \addplot[black,mark=square*,mark options={fill=white}] table {re3a.data};
       
   112 \end{axis}
       
   113 \end{tikzpicture}
       
   114 \end{tabular}
       
   115 \end{center}\medskip
       
   116 
   124 
   117 \noindent
   125 \noindent
   118 We will use these regular expressions and strings
   126 We will use these regular expressions and strings as running
   119 as running examples.
   127 examples. There will be several versions (V1, V2, V3,\ldots) of the
   120 
   128 algorithm.\footnote{The corresponding files are \texttt{re1.scala},
       
   129   \texttt{re2.scala} and so on. As usual, you can find the code on
       
   130   KEATS.}\bigskip
       
   131 
       
   132 \noindent
   121 Having specified in the previous lecture what
   133 Having specified in the previous lecture what
   122 problem our regular expression matcher is supposed to solve,
   134 problem our regular expression matcher is supposed to solve,
   123 namely for any given regular expression $r$ and string $s$
   135 namely for any given regular expression $r$ and string $s$
   124 answer \textit{true} if and only if
   136 answer \textit{true} if and only if
   125 
   137 
   569 
   581 
   570 \begin{center}
   582 \begin{center}
   571 \code{case class NTIMES(r: Rexp, n: Int) extends Rexp}
   583 \code{case class NTIMES(r: Rexp, n: Int) extends Rexp}
   572 \end{center}
   584 \end{center}
   573 
   585 
   574 \noindent With this fix we have a constant ``size'' regular
   586 \noindent With this fix we have a constant ``size'' regular expression
   575 expression for our running example no matter how large $n$ is.
   587 for our running example no matter how large $n$ is (see the
   576 This means we have to also add cases for \pcode{NTIMES} in the
   588 \texttt{size} section in the implementations).  This means we have to
   577 functions $\textit{nullable}$ and $\textit{der}$. Does the change have any
   589 also add cases for \pcode{NTIMES} in the functions $\textit{nullable}$
   578 effect?
   590 and $\textit{der}$. Does the change have any effect?
   579 
   591 
   580 \begin{center}
   592 \begin{center}
   581 \begin{tikzpicture}
   593 \begin{tikzpicture}
   582 \begin{axis}[
   594 \begin{axis}[
   583     title={Graph: $a^{?\{n\}} \cdot a^{\{n\}}$ and strings $\underbrace{a\ldots a}_{n}$},
   595     title={Graph: $a^{?\{n\}} \cdot a^{\{n\}}$ and strings $\underbrace{a\ldots a}_{n}$},
   601 \addplot[green,mark=square*,mark options={fill=white}] table {re2.data};
   613 \addplot[green,mark=square*,mark options={fill=white}] table {re2.data};
   602 \end{axis}
   614 \end{axis}
   603 \end{tikzpicture}
   615 \end{tikzpicture}
   604 \end{center}
   616 \end{center}
   605 
   617 
   606 \noindent Now we are talking business! The modified matcher 
   618 \noindent Now we are talking business! The modified matcher can within
   607 can within 25 seconds handle regular expressions up to
   619 25 seconds handle regular expressions up to $n = 1,100$ before a
   608 $n = 1,100$ before a StackOverflow is raised. Recall that Python and Ruby 
   620 StackOverflow is raised. Recall that Python and Ruby (and our first
   609 (and our first version, Scala V1) could only handle $n = 27$ or so in 30 
   621 version, Scala V1) could only handle $n = 27$ or so in 30
   610 seconds. There is no change for our second example 
   622 seconds. There is no change for our second example $(a^*)^* \cdot
   611 $(a^*)^* \cdot b$---so this is still good.
   623 b$---so this is still good.
   612 
   624 
   613 
   625 
   614 The moral is that our algorithm is rather sensitive to the
   626 The moral is that our algorithm is rather sensitive to the
   615 size of regular expressions it needs to handle. This is of
   627 size of regular expressions it needs to handle. This is of
   616 course obvious because both $\textit{nullable}$ and $\textit{der}$ frequently
   628 course obvious because both $\textit{nullable}$ and $\textit{der}$ frequently
   617 need to traverse the whole regular expression. There seems,
   629 need to traverse the whole regular expression. There seems,
   618 however, one more issue for making the algorithm run faster.
   630 however, one more issue for making the algorithm run faster.
   619 The derivative function often produces ``useless''
   631 The derivative function often produces ``useless''
   620 $\ZERO$s and $\ONE$s. To see this, consider $r = ((a
   632 $\ZERO$s and $\ONE$s. To see this, consider $r = ((a
   621 \cdot b) + b)^*$ and the following two derivatives
   633 \cdot b) + b)^*$ and the following three derivatives
   622 
   634 
   623 \begin{center}
   635 \begin{center}
   624 \begin{tabular}{l}
   636 \begin{tabular}{l}
   625 $\textit{der}\,a\,r = ((\ONE \cdot b) + \ZERO) \cdot r$\\
   637 $\textit{der}\,a\,r = ((\ONE \cdot b) + \ZERO) \cdot r$\\
   626 $\textit{der}\,b\,r = ((\ZERO \cdot b) + \ONE)\cdot r$\\
   638 $\textit{der}\,b\,r = ((\ZERO \cdot b) + \ONE)\cdot r$\\
   640 $\textit{der}\,c\,r \equiv \ZERO$
   652 $\textit{der}\,c\,r \equiv \ZERO$
   641 \end{tabular}
   653 \end{tabular}
   642 \end{center}
   654 \end{center}
   643 
   655 
   644 \noindent I leave it to you to contemplate whether such a
   656 \noindent I leave it to you to contemplate whether such a
   645 simplification can have any impact on the correctness of our
   657 simplification can have any impact on the correctness of our algorithm
   646 algorithm (will it change any answers?). Figure~\ref{scala2}
   658 (will it change any answers?). Figure~\ref{scala2} gives a
   647 gives a simplification function that recursively traverses a
   659 simplification function that recursively traverses a regular
   648 regular expression and simplifies it according to the rules
   660 expression and simplifies it according to the rules given at the
   649 given at the beginning. There are only rules for $+$, $\cdot$
   661 beginning. There are only rules for $+$, $\cdot$ and $n$-times (the
   650 and $n$-times (the latter because we added it in the second
   662 latter because we added it in the second version of our
   651 version of our matcher). There is no rule for a star, because
   663 matcher). There is no simplification rule for a star, because
   652 empirical data and also a little thought showed that
   664 empirical data and also a little thought showed that simplifying under
   653 simplifying under a star is a waste of computation time. The
   665 a star is a waste of computation time. The simplification function
   654 simplification function will be called after every derivation.
   666 will be called after every derivation.  This additional step removes
   655 This additional step removes all the ``junk'' the derivative
   667 all the ``junk'' the derivative function introduced. Does this improve
   656 function introduced. Does this improve the speed? You bet!!
   668 the speed? You bet!!
   657 
   669 
   658 \begin{figure}[p]
   670 \begin{figure}[p]
   659 \lstinputlisting[numbers=left,linebackgroundcolor=
   671 \lstinputlisting[numbers=left,linebackgroundcolor=
   660   {\ifodd\value{lstnumber}\color{capri!3}\fi}]
   672   {\ifodd\value{lstnumber}\color{capri!3}\fi}]
   661                 {../progs/app6.scala}
   673                 {../progs/app6.scala}
   673     title={Graph: $a^{?\{n\}} \cdot a^{\{n\}}$ and strings $\underbrace{a\ldots a}_{n}$},
   685     title={Graph: $a^{?\{n\}} \cdot a^{\{n\}}$ and strings $\underbrace{a\ldots a}_{n}$},
   674     xlabel={$n$},
   686     xlabel={$n$},
   675     x label style={at={(1.04,0.0)}},
   687     x label style={at={(1.04,0.0)}},
   676     ylabel={time in secs},
   688     ylabel={time in secs},
   677     enlargelimits=false,
   689     enlargelimits=false,
   678     xtick={0,3000,...,9000},
   690     xtick={0,2500,...,10000},
   679     xmax=10000,
   691     xmax=12000,
   680     ytick={0,5,...,30},
   692     ytick={0,5,...,30},
   681     ymax=32,
   693     ymax=32,
   682     scaled ticks=false,
   694     scaled ticks=false,
   683     axis lines=left,
   695     axis lines=left,
   684     width=9cm,
   696     width=9cm,
   691 \end{axis}
   703 \end{axis}
   692 \end{tikzpicture}
   704 \end{tikzpicture}
   693 \end{center}
   705 \end{center}
   694 
   706 
   695 \noindent
   707 \noindent
   696 To reacap, Python and Ruby needed approximately 30 seconds to match
   708 To reacap, Python and Ruby needed approximately 30 seconds to match a
   697 a string of 28 \texttt{a}s and the regular expression $a^{?\{n\}} \cdot a^{\{n\}}$. 
   709 string of 28 \texttt{a}s and the regular expression $a^{?\{n\}} \cdot
   698 We need a third of this time to do the same with strings up to 12,000 \texttt{a}s. 
   710 a^{\{n\}}$.  We need a third of this time to do the same with strings
   699 Similarly, Java needed 30 seconds to find out the regular expression 
   711 up to 11,000 \texttt{a}s.  Similarly, Java and Python needed 30
   700 $(a^*)^* \cdot b$ does not match the string of 28 \texttt{a}s. We can do
   712 seconds to find out the regular expression $(a^*)^* \cdot b$ does not
   701 the same in approximately 5 seconds for strings of 6000000 \texttt{a}s:
   713 match the string of 28 \texttt{a}s. We can do the same in
       
   714 for strings of 6,000,000 \texttt{a}s:
   702 
   715 
   703 
   716 
   704 \begin{center}
   717 \begin{center}
   705 \begin{tikzpicture}
   718 \begin{tikzpicture}
   706 \begin{axis}[
   719 \begin{axis}[
   707     title={Graph: $(a^*)^* \cdot b$ and strings $\underbrace{a\ldots a}_{n}$},
   720     title={Graph: $(a^*)^* \cdot b$ and strings $\underbrace{a\ldots a}_{n}$},
   708     xlabel={$n$},
   721     xlabel={$n$},
   709     x label style={at={(1.09,0.0)}},
       
   710     ylabel={time in secs},
   722     ylabel={time in secs},
   711     enlargelimits=false,
   723     enlargelimits=false,
   712     xmax=7700000,
   724     ymax=35,
   713     ytick={0,5,...,30},
   725     ytick={0,5,...,30},
   714     ymax=32,
       
   715     %scaled ticks=false,
       
   716     axis lines=left,
   726     axis lines=left,
       
   727     scaled ticks=false,
       
   728     x label style={at={(1.09,0.0)}},
       
   729     %xmax=7700000,
   717     width=9cm,
   730     width=9cm,
   718     height=5cm, 
   731     height=5cm, 
   719     legend entries={Scala V2, Scala V3},
   732     legend entries={Scala V3},
   720     legend pos=outer north east,
   733     legend pos=outer north east,
   721     legend cell align=left]
   734     legend cell align=left]
   722 \addplot[green,mark=square*,mark options={fill=white}] table {re2a.data};
   735 %\addplot[green,mark=square*,mark options={fill=white}] table {re2a.data};
   723 \addplot[black,mark=square*,mark options={fill=white}] table {re3a.data};
   736 \addplot[black,mark=square*,mark options={fill=white}] table {re3a.data};
   724 \end{axis}
   737 \end{axis}
   725 \end{tikzpicture}
   738 \end{tikzpicture}
   726 \end{center}
   739 \end{center}
   727 
   740 
   728 \subsection*{Epilogue}
   741 \subsection*{Epilogue}
   729 
   742 
   730 (23/Aug/2016) I recently found another place where this algorithm can be 
   743 (23/Aug/2016) I recently found another place where this algorithm can be 
   731 sped (this idea is not integrated with what is coming next,
   744 sped up (this idea is not integrated with what is coming next,
   732 but I present it nonetheless). The idea is to define \texttt{ders}
   745 but I present it nonetheless). The idea is to define \texttt{ders}
   733 not such that it iterates the derivative character-by-character, but
   746 not such that it iterates the derivative character-by-character, but
   734 in bigger chunks. The resulting code for \texttt{ders2} looks as
   747 in bigger chunks. The resulting code for \texttt{ders2} looks as
   735 follows:
   748 follows:
   736 
   749