handouts/ho02.tex
changeset 262 ee4304bc6350
parent 261 24531cfaa36a
child 263 92e6985018ae
equal deleted inserted replaced
261:24531cfaa36a 262:ee4304bc6350
     2 \usepackage{../style}
     2 \usepackage{../style}
     3 \usepackage{../langs}
     3 \usepackage{../langs}
     4 \usepackage{../graphics}
     4 \usepackage{../graphics}
     5 \usepackage{../data}
     5 \usepackage{../data}
     6 
     6 
       
     7 
     7 \begin{document}
     8 \begin{document}
     8 
     9 
     9 \section*{Handout 2}
    10 \section*{Handout 2}
    10 
    11 
    11 This lecture is about implementing a more efficient regular
    12 This lecture is about implementing a more efficient regular
    12 expression matcher (the plots on the right)---more efficient
    13 expression matcher (the plots on the right)---more efficient
    13 than the matchers from regular expression libraries in Ruby and
    14 than the matchers from regular expression libraries in Ruby and
    14 Python (the plots on the left). These plots show the running 
    15 Python (the plots on the left). These plots show the running 
    15 time for the evil regular expression $a?^{\{n\}}a^{\{n\}}$.
    16 time for the evil regular expression $a?^{\{n\}}a^{\{n\}}$.
    16 
    17 Note the different scales in each plot.
       
    18 
       
    19 \pgfplotsset{compat=1.11}
    17 \begin{center}
    20 \begin{center}
    18 \begin{tabular}{@{}cc@{}}
    21 \begin{tabular}{@{}cc@{}}
    19 \begin{tikzpicture}[y=.072cm, x=.12cm]
    22 \begin{tikzpicture}
    20  	%axis
    23 \begin{axis}[
    21 	\draw (0,0) -- coordinate (x axis mid) (30,0);
    24     xlabel={\pcode{a}s},
    22    \draw (0,0) -- coordinate (y axis mid) (0,30);
    25     ylabel={time in secs},
    23    %ticks
    26     enlargelimits=false,
    24    \foreach \x in {0,5,...,30}
    27     xtick={0,5,...,30},
    25      \draw (\x,1pt) -- (\x,-3pt) node[anchor=north] {\x};
    28     xmax=30,
    26    \foreach \y in {0,5,...,30}
    29     ymax=35,
    27      \draw (1pt,\y) -- (-3pt,\y) node[anchor=east] {\y}; 
    30     ytick={0,5,...,30},
    28 	%labels      
    31     scaled ticks=false,
    29 	\node[below=0.6cm] at (x axis mid) {number of \texttt{a}s};
    32     axis lines=left,
    30 	\node[rotate=90,left=0.9cm] at (y axis mid) {time in secs};
    33     width=5cm,
    31 	%plots
    34     height=5cm, 
    32 	\draw[color=blue] plot[mark=*] 
    35     legend entries={Python,Ruby},  
    33 		file {re-python.data};
    36     legend pos=north west,
    34 	\draw[color=brown] plot[mark=triangle*] 
    37     legend cell align=left  
    35 		file {re-ruby.data};	 
    38 ]
    36    %legend
    39 \addplot[blue,mark=*, mark options={fill=white}] 
    37 	\begin{scope}[shift={(4,20)}] 
    40   table {re-python.data};
    38 	\draw[color=blue] (0,0) -- 
    41 \addplot[brown,mark=pentagon*, mark options={fill=white}] 
    39 		plot[mark=*] (0.25,0) -- (0.5,0) 
    42   table {re-ruby.data};  
    40 		node[right]{\small Python};
    43 \end{axis}
    41 	\draw[yshift=-4mm, color=brown] (0,0) -- 
    44 \end{tikzpicture}
    42 		plot[mark=triangle*] (0.25,0) -- (0.5,0)
       
    43 		node[right]{\small Ruby};
       
    44 	\end{scope}	
       
    45 \end{tikzpicture} 
       
    46 &
    45 &
    47 \begin{tikzpicture}[y=.072cm, x=.0004cm]
    46 \begin{tikzpicture}
    48  	%axis
    47 \begin{axis}[
    49 	\draw (0,0) -- coordinate (x axis mid) (12000,0);
    48     xlabel={\pcode{a}s},
    50    \draw (0,0) -- coordinate (y axis mid) (0,30);
    49     ylabel={time in secs},
    51    %ticks
    50     enlargelimits=false,
    52    \foreach \x in {0,3000,...,12000}
    51     xtick={0,3000,...,12000},
    53      	\draw (\x,1pt) -- (\x,-3pt) node[anchor=north] {\x};
    52     xmax=12000,
    54    \foreach \y in {0,5,...,30}
    53     ymax=35,
    55      	\draw (1pt,\y) -- (-3pt,\y) node[anchor=east] {\y}; 
    54     ytick={0,5,...,30},
    56 	%labels      
    55     scaled ticks=false,
    57 	\node[below=0.6cm] at (x axis mid) {number of \texttt{a}s};
    56     axis lines=left,
    58 	\node[rotate=90,left=0.9cm] at (y axis mid) {time in secs};
    57     width=6.5cm,
    59 
    58     height=5cm
    60 	%plots
    59 ]
    61    \draw[color=green] plot[mark=square*, mark options={fill=white} ] 
    60 \addplot[green,mark=square*,mark options={fill=white}] table {re2b.data};
    62 		file {re2b.data};
    61 \addplot[black,mark=square*,mark options={fill=white}] table {re3.data};
    63 	\draw[color=black] plot[mark=square*, mark options={fill=white} ] 
    62 \end{axis}
    64 		file {re3.data};	 
       
    65 \end{tikzpicture}
    63 \end{tikzpicture}
    66 \end{tabular}
    64 \end{tabular}
    67 \end{center}\medskip
    65 \end{center}\medskip
    68 
    66 
    69 
    67 
   421 
   419 
   422 \lstinputlisting[numbers=none]{../progs/app51.scala}
   420 \lstinputlisting[numbers=none]{../progs/app51.scala}
   423 
   421 
   424 \noindent Running the matcher with the example, we find it is
   422 \noindent Running the matcher with the example, we find it is
   425 slightly worse then the matcher in Ruby and Python.
   423 slightly worse then the matcher in Ruby and Python.
   426 Ooops\ldots\medskip
   424 Ooops\ldots
       
   425 
       
   426 \pgfplotsset{compat=1.11}
       
   427 \begin{center}
       
   428 \begin{tikzpicture}
       
   429 \begin{axis}[
       
   430     xlabel={\pcode{a}s},
       
   431     ylabel={time in secs},
       
   432     enlargelimits=false,
       
   433     xtick={0,5,...,30},
       
   434     xmax=30,
       
   435     ytick={0,5,...,30},
       
   436     scaled ticks=false,
       
   437     axis lines=left,
       
   438     width=6cm,
       
   439     height=5cm, 
       
   440     legend entries={Python,Ruby,Scala V1},  
       
   441     legend pos=outer north east,
       
   442     legend cell align=left  
       
   443 ]
       
   444 \addplot[blue,mark=*, mark options={fill=white}] 
       
   445   table {re-python.data};
       
   446 \addplot[brown,mark=pentagon*, mark options={fill=white}] 
       
   447   table {re-ruby.data};  
       
   448 \addplot[red,mark=triangle*,mark options={fill=white}] 
       
   449   table {re1.data};  
       
   450 \end{axis}
       
   451 \end{tikzpicture}
       
   452 \end{center}
   427 
   453 
   428 \noindent Analysing this failure a bit we notice that 
   454 \noindent Analysing this failure a bit we notice that 
   429 for $a^{\{n\}}$ we generate quite big regular expressions:
   455 for $a^{\{n\}}$ we generate quite big regular expressions:
   430 
   456 
   431 \begin{center}
   457 \begin{center}
   433 1: & $a$\\
   459 1: & $a$\\
   434 2: & $a\cdot a$\\
   460 2: & $a\cdot a$\\
   435 3: & $a\cdot a\cdot a$\\
   461 3: & $a\cdot a\cdot a$\\
   436 & \ldots\\
   462 & \ldots\\
   437 13: & $a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a$\\
   463 13: & $a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a$\\
   438 & \ldots\\
   464 & \ldots
   439 20:
       
   440 \end{tabular}
   465 \end{tabular}
   441 \end{center}
   466 \end{center}
   442 
   467 
   443 \noindent Our algorithm traverses such regular expressions at
   468 \noindent Our algorithm traverses such regular expressions at
   444 least once every time a derivative is calculated. So having
   469 least once every time a derivative is calculated. So having
   445 large regular expressions, will cause problems. This problem
   470 large regular expressions will cause problems. This problem
   446 is aggravated with $a?$ being represented as $a + \epsilon$.
   471 is aggravated by $a?$ being represented as $a + \epsilon$.
   447 
   472 
   448 
   473 We can fix this by having an explicit constructor for 
       
   474 $r^{\{n\}}$. In Scala we would introduce a constructor like
       
   475 
       
   476 \begin{center}
       
   477 \code{case class NTIMES(r: Rexp, n: Int) extends Rexp}
       
   478 \end{center}
       
   479 
       
   480 \noindent With this we have a constant ``size'' regular
       
   481 expression for our running example no matter how large $n$ 
       
   482 is. This means we have to also add cases for $nullable$ and
       
   483 $der$. Does the change have any effect?
       
   484 
       
   485 \pgfplotsset{compat=1.11}
       
   486 \begin{center}
       
   487 \begin{tikzpicture}
       
   488 \begin{axis}[
       
   489     xlabel={\pcode{a}s},
       
   490     ylabel={time in secs},
       
   491     enlargelimits=false,
       
   492     xtick={0,100,...,1000},
       
   493     xmax=1000,
       
   494     ytick={0,5,...,30},
       
   495     scaled ticks=false,
       
   496     axis lines=left,
       
   497     width=10cm,
       
   498     height=5cm, 
       
   499     legend entries={Python,Ruby,Scala V1,Scala V2},  
       
   500     legend pos=outer north east,
       
   501     legend cell align=left  
       
   502 ]
       
   503 \addplot[blue,mark=*, mark options={fill=white}] 
       
   504   table {re-python.data};
       
   505 \addplot[brown,mark=pentagon*, mark options={fill=white}] 
       
   506   table {re-ruby.data};  
       
   507 \addplot[red,mark=triangle*,mark options={fill=white}] 
       
   508   table {re1.data};  
       
   509 \addplot[green,mark=square*,mark options={fill=white}] 
       
   510   table {re2b.data};
       
   511 \end{axis}
       
   512 \end{tikzpicture}
       
   513 \end{center}
       
   514 
       
   515 \noindent Now we are talking business! The modified matcher 
       
   516 can within 30 seconds handle regular expressions up to
       
   517 $n = 950$ before a StackOverflow is raised.
       
   518 
       
   519 The moral is that our algorithm is rather sensitive to the
       
   520 size of regular expressions it needs to handle. This is of
       
   521 course obvious because both $nullable$ and $der$ need to
       
   522 traverse the whole regular expression. There seems to be one
       
   523 more source of making the algorithm run faster. The derivative
       
   524 function often produces ``useless'' $\varnothing$s and
       
   525 $\epsilon$s. To see this, consider $r = ((a \cdot b) + b)^*$
       
   526 and the following two derivatives
       
   527 
       
   528 \begin{center}
       
   529 \begin{tabular}{l}
       
   530 $der\,a\,r = ((\epsilon \cdot b) + \varnothing) \cdot r$\\
       
   531 $der\,b\,r = ((\varnothing \cdot b) + \epsilon)\cdot r$\\
       
   532 $der\,c\,r = ((\varnothing \cdot b) + \varnothing)\cdot r$
       
   533 \end{tabular}
       
   534 \end{center}
       
   535 
       
   536 \noindent 
       
   537 If we simplify them according to the simple rules from the
       
   538 beginning, we can replace the right-hand sides by the 
       
   539 smaller equivalent regular expressions
       
   540 
       
   541 \begin{center}
       
   542 \begin{tabular}{l}
       
   543 $der\,a\,r \equiv b \cdot r$\\
       
   544 $der\,b\,r \equiv r$\\
       
   545 $der\,c\,r \equiv \varnothing$
       
   546 \end{tabular}
       
   547 \end{center}
       
   548 
       
   549 \noindent I leave it to you to contemplate whether such a
       
   550 simplification can have any impact on the correctness of our
       
   551 algorithm (will it change any answers?). Figure~\ref{scala2}
       
   552 give a simplification function that recursively traverses a
       
   553 regular expression and simplifies it according to the rules
       
   554 given at the beginning. There are only rules for $+$, $\cdot$
       
   555 and $n$-times (the latter because we added it in the second
       
   556 version of our matcher). There is no rule for a star, because
       
   557 empirical data and also a little thought showed that
       
   558 simplifying under a star is waste of computation time. The
       
   559 simplification function will be called after every derivation.
       
   560 This additional step removes all the ``junk'' the derivative
       
   561 function introduced. Does this improve the speed? You bet!!
       
   562 
       
   563 \begin{figure}[p]
       
   564 \lstinputlisting{../progs/app6.scala}
       
   565 \caption{The simplification function and modified 
       
   566 \pcode{ders}-function.\label{scala2}}
       
   567 \end{figure}
       
   568 
       
   569 \pgfplotsset{compat=1.11}
       
   570 \begin{center}
       
   571 \begin{tikzpicture}
       
   572 \begin{axis}[
       
   573     xlabel={\pcode{a}s},
       
   574     ylabel={time in secs},
       
   575     enlargelimits=false,
       
   576     xtick={0,2000,...,12000},
       
   577     xmax=12000,
       
   578     ytick={0,5,...,30},
       
   579     scaled ticks=false,
       
   580     axis lines=left,
       
   581     width=9cm,
       
   582     height=4cm, 
       
   583     legend entries={Scala V2,Scala V3},    
       
   584 ]
       
   585 \addplot[green,mark=square*,mark options={fill=white}] table {re2b.data};
       
   586 \addplot[black,mark=square*,mark options={fill=white}] table {re3.data};
       
   587 \end{axis}
       
   588 \end{tikzpicture}
       
   589 \end{center}
   449 
   590 
   450 \end{document}
   591 \end{document}
       
   592 
       
   593 
       
   594 
   451 
   595 
   452 %%% Local Variables: 
   596 %%% Local Variables: 
   453 %%% mode: latex
   597 %%% mode: latex
   454 %%% TeX-master: t
   598 %%% TeX-master: t
   455 %%% End: 
   599 %%% End: