# HG changeset patch # User Christian Urban # Date 1411939825 -3600 # Node ID ee4304bc6350096668dd8399e572a59ddb4ba2f2 # Parent 24531cfaa36aecb093d50a4b9625c52e3fd3716e updated handouts diff -r 24531cfaa36a -r ee4304bc6350 handouts/ho02.pdf Binary file handouts/ho02.pdf has changed diff -r 24531cfaa36a -r ee4304bc6350 handouts/ho02.tex --- a/handouts/ho02.tex Sun Sep 28 18:07:58 2014 +0100 +++ b/handouts/ho02.tex Sun Sep 28 22:30:25 2014 +0100 @@ -4,6 +4,7 @@ \usepackage{../graphics} \usepackage{../data} + \begin{document} \section*{Handout 2} @@ -13,56 +14,39 @@ than the matchers from regular expression libraries in Ruby and Python (the plots on the left). These plots show the running time for the evil regular expression $a?^{\{n\}}a^{\{n\}}$. +Note the different scales in each plot. +\pgfplotsset{compat=1.11} \begin{center} \begin{tabular}{@{}cc@{}} -\begin{tikzpicture}[y=.072cm, x=.12cm] - %axis - \draw (0,0) -- coordinate (x axis mid) (30,0); - \draw (0,0) -- coordinate (y axis mid) (0,30); - %ticks - \foreach \x in {0,5,...,30} - \draw (\x,1pt) -- (\x,-3pt) node[anchor=north] {\x}; - \foreach \y in {0,5,...,30} - \draw (1pt,\y) -- (-3pt,\y) node[anchor=east] {\y}; - %labels - \node[below=0.6cm] at (x axis mid) {number of \texttt{a}s}; - \node[rotate=90,left=0.9cm] at (y axis mid) {time in secs}; - %plots - \draw[color=blue] plot[mark=*] - file {re-python.data}; - \draw[color=brown] plot[mark=triangle*] - file {re-ruby.data}; - %legend - \begin{scope}[shift={(4,20)}] - \draw[color=blue] (0,0) -- - plot[mark=*] (0.25,0) -- (0.5,0) - node[right]{\small Python}; - \draw[yshift=-4mm, color=brown] (0,0) -- - plot[mark=triangle*] (0.25,0) -- (0.5,0) - node[right]{\small Ruby}; - \end{scope} -\end{tikzpicture} +\begin{tikzpicture} \begin{axis}[ xlabel={\pcode{a}s}, ylabel={time in secs}, + enlargelimits=false, + xtick={0,5,...,30}, + xmax=30, + ymax=35, ytick={0,5,...,30}, + scaled ticks=false, + axis lines=left, + width=5cm, + height=5cm, + legend entries={Python,Ruby}, + legend pos=north west, + legend cell align=left ] +\addplot[blue,mark=*, mark options={fill=white}] + table {re-python.data}; +\addplot[brown,mark=pentagon*, mark options={fill=white}] + table {re-ruby.data}; +\end{axis} \end{tikzpicture} & -\begin{tikzpicture}[y=.072cm, x=.0004cm] - %axis - \draw (0,0) -- coordinate (x axis mid) (12000,0); - \draw (0,0) -- coordinate (y axis mid) (0,30); - %ticks - \foreach \x in {0,3000,...,12000} - \draw (\x,1pt) -- (\x,-3pt) node[anchor=north] {\x}; - \foreach \y in {0,5,...,30} - \draw (1pt,\y) -- (-3pt,\y) node[anchor=east] {\y}; - %labels - \node[below=0.6cm] at (x axis mid) {number of \texttt{a}s}; - \node[rotate=90,left=0.9cm] at (y axis mid) {time in secs}; - - %plots - \draw[color=green] plot[mark=square*, mark options={fill=white} ] - file {re2b.data}; - \draw[color=black] plot[mark=square*, mark options={fill=white} ] - file {re3.data}; -\end{tikzpicture} +\begin{tikzpicture} \begin{axis}[ xlabel={\pcode{a}s}, ylabel={time in secs}, + enlargelimits=false, + xtick={0,3000,...,12000}, + xmax=12000, + ymax=35, ytick={0,5,...,30}, + scaled ticks=false, + axis lines=left, + width=6.5cm, + height=5cm ] \addplot[green,mark=square*,mark options={fill=white}] table {re2b.data}; +\addplot[black,mark=square*,mark options={fill=white}] table {re3.data}; \end{axis} \end{tikzpicture} \end{tabular} \end{center}\medskip @@ -423,7 +407,28 @@ \noindent Running the matcher with the example, we find it is slightly worse then the matcher in Ruby and Python. -Ooops\ldots\medskip +Ooops\ldots + +\pgfplotsset{compat=1.11} +\begin{center} +\begin{tikzpicture} \begin{axis}[ xlabel={\pcode{a}s}, ylabel={time in secs}, + enlargelimits=false, + xtick={0,5,...,30}, + xmax=30, ytick={0,5,...,30}, + scaled ticks=false, + axis lines=left, + width=6cm, + height=5cm, + legend entries={Python,Ruby,Scala V1}, + legend pos=outer north east, + legend cell align=left ] +\addplot[blue,mark=*, mark options={fill=white}] + table {re-python.data}; +\addplot[brown,mark=pentagon*, mark options={fill=white}] + table {re-ruby.data}; +\addplot[red,mark=triangle*,mark options={fill=white}] + table {re1.data}; \end{axis} \end{tikzpicture} +\end{center} \noindent Analysing this failure a bit we notice that for $a^{\{n\}}$ we generate quite big regular expressions: @@ -435,19 +440,121 @@ 3: & $a\cdot a\cdot a$\\ & \ldots\\ 13: & $a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a$\\ -& \ldots\\ -20: +& \ldots \end{tabular} \end{center} \noindent Our algorithm traverses such regular expressions at least once every time a derivative is calculated. So having -large regular expressions, will cause problems. This problem -is aggravated with $a?$ being represented as $a + \epsilon$. +large regular expressions will cause problems. This problem +is aggravated by $a?$ being represented as $a + \epsilon$. + +We can fix this by having an explicit constructor for +$r^{\{n\}}$. In Scala we would introduce a constructor like + +\begin{center} +\code{case class NTIMES(r: Rexp, n: Int) extends Rexp} +\end{center} + +\noindent With this we have a constant ``size'' regular +expression for our running example no matter how large $n$ +is. This means we have to also add cases for $nullable$ and +$der$. Does the change have any effect? + +\pgfplotsset{compat=1.11} +\begin{center} +\begin{tikzpicture} \begin{axis}[ xlabel={\pcode{a}s}, ylabel={time in secs}, + enlargelimits=false, + xtick={0,100,...,1000}, + xmax=1000, ytick={0,5,...,30}, + scaled ticks=false, + axis lines=left, + width=10cm, + height=5cm, + legend entries={Python,Ruby,Scala V1,Scala V2}, + legend pos=outer north east, + legend cell align=left ] +\addplot[blue,mark=*, mark options={fill=white}] + table {re-python.data}; +\addplot[brown,mark=pentagon*, mark options={fill=white}] + table {re-ruby.data}; +\addplot[red,mark=triangle*,mark options={fill=white}] + table {re1.data}; \addplot[green,mark=square*,mark options={fill=white}] + table {re2b.data}; \end{axis} \end{tikzpicture} +\end{center} + +\noindent Now we are talking business! The modified matcher +can within 30 seconds handle regular expressions up to +$n = 950$ before a StackOverflow is raised. + +The moral is that our algorithm is rather sensitive to the +size of regular expressions it needs to handle. This is of +course obvious because both $nullable$ and $der$ need to +traverse the whole regular expression. There seems to be one +more source of making the algorithm run faster. The derivative +function often produces ``useless'' $\varnothing$s and +$\epsilon$s. To see this, consider $r = ((a \cdot b) + b)^*$ +and the following two derivatives + +\begin{center} +\begin{tabular}{l} +$der\,a\,r = ((\epsilon \cdot b) + \varnothing) \cdot r$\\ +$der\,b\,r = ((\varnothing \cdot b) + \epsilon)\cdot r$\\ +$der\,c\,r = ((\varnothing \cdot b) + \varnothing)\cdot r$ +\end{tabular} +\end{center} + +\noindent +If we simplify them according to the simple rules from the +beginning, we can replace the right-hand sides by the +smaller equivalent regular expressions + +\begin{center} +\begin{tabular}{l} +$der\,a\,r \equiv b \cdot r$\\ +$der\,b\,r \equiv r$\\ +$der\,c\,r \equiv \varnothing$ +\end{tabular} +\end{center} + +\noindent I leave it to you to contemplate whether such a +simplification can have any impact on the correctness of our +algorithm (will it change any answers?). Figure~\ref{scala2} +give a simplification function that recursively traverses a +regular expression and simplifies it according to the rules +given at the beginning. There are only rules for $+$, $\cdot$ +and $n$-times (the latter because we added it in the second +version of our matcher). There is no rule for a star, because +empirical data and also a little thought showed that +simplifying under a star is waste of computation time. The +simplification function will be called after every derivation. +This additional step removes all the ``junk'' the derivative +function introduced. Does this improve the speed? You bet!! + +\begin{figure}[p] +\lstinputlisting{../progs/app6.scala} +\caption{The simplification function and modified +\pcode{ders}-function.\label{scala2}} +\end{figure} + +\pgfplotsset{compat=1.11} +\begin{center} +\begin{tikzpicture} \begin{axis}[ xlabel={\pcode{a}s}, ylabel={time in secs}, + enlargelimits=false, + xtick={0,2000,...,12000}, + xmax=12000, ytick={0,5,...,30}, + scaled ticks=false, + axis lines=left, + width=9cm, + height=4cm, + legend entries={Scala V2,Scala V3}, ] \addplot[green,mark=square*,mark options={fill=white}] table {re2b.data}; +\addplot[black,mark=square*,mark options={fill=white}] table {re3.data}; \end{axis} \end{tikzpicture} +\end{center} + +\end{document} -\end{document} %%% Local Variables: %%% mode: latex diff -r 24531cfaa36a -r ee4304bc6350 progs/app6.scala --- a/progs/app6.scala Sun Sep 28 18:07:58 2014 +0100 +++ b/progs/app6.scala Sun Sep 28 22:30:25 2014 +0100 @@ -1,16 +1,30 @@ -def der (r: Rexp, c: Char) : Rexp = r match { - case NULL => NULL - case EMPTY => NULL - case CHAR(d) => if (c == d) EMPTY else NULL - case ALT(r1, r2) => ALT(der(r1, c), der(r2, c)) - case SEQ(r1, r2) => - if (nullable(r1)) ALT(SEQ(der(r1, c), r2), der(r2, c)) - else SEQ(der(r1, c), r2) - case STAR(r) => SEQ(der(r, c), STAR(r)) +def simp(r: Rexp): Rexp = r match { + case ALT(r1, r2) => { + val r1s = simp(r1) + val r2s = simp(r2) + (r1s, r2s) match { + case (NULL, _) => r2s + case (_, NULL) => r1s + case _ => if (r1s == r2s) r1s else ALT(r1s, r2s) + } + } + case SEQ(r1, r2) => { + val r1s = simp(r1) + val r2s = simp(r2) + (r1s, r2s) match { + case (NULL, _) => NULL + case (_, NULL) => NULL + case (EMPTY, _) => r2s + case (_, EMPTY) => r1s + case _ => SEQ(r1s, r2s) + } + } + case NTIMES(r, n) => NTIMES(simp(r), n) + case r => r } def ders (s: List[Char], r: Rexp) : Rexp = s match { case Nil => r - case c::s => ders(s, der(c, r)) + case c::s => ders(s, simp(der(c, r))) }