--- a/handouts/ho02.tex Sun Sep 28 18:07:58 2014 +0100
+++ b/handouts/ho02.tex Sun Sep 28 22:30:25 2014 +0100
@@ -4,6 +4,7 @@
\usepackage{../graphics}
\usepackage{../data}
+
\begin{document}
\section*{Handout 2}
@@ -13,56 +14,39 @@
than the matchers from regular expression libraries in Ruby and
Python (the plots on the left). These plots show the running
time for the evil regular expression $a?^{\{n\}}a^{\{n\}}$.
+Note the different scales in each plot.
+\pgfplotsset{compat=1.11}
\begin{center}
\begin{tabular}{@{}cc@{}}
-\begin{tikzpicture}[y=.072cm, x=.12cm]
- %axis
- \draw (0,0) -- coordinate (x axis mid) (30,0);
- \draw (0,0) -- coordinate (y axis mid) (0,30);
- %ticks
- \foreach \x in {0,5,...,30}
- \draw (\x,1pt) -- (\x,-3pt) node[anchor=north] {\x};
- \foreach \y in {0,5,...,30}
- \draw (1pt,\y) -- (-3pt,\y) node[anchor=east] {\y};
- %labels
- \node[below=0.6cm] at (x axis mid) {number of \texttt{a}s};
- \node[rotate=90,left=0.9cm] at (y axis mid) {time in secs};
- %plots
- \draw[color=blue] plot[mark=*]
- file {re-python.data};
- \draw[color=brown] plot[mark=triangle*]
- file {re-ruby.data};
- %legend
- \begin{scope}[shift={(4,20)}]
- \draw[color=blue] (0,0) --
- plot[mark=*] (0.25,0) -- (0.5,0)
- node[right]{\small Python};
- \draw[yshift=-4mm, color=brown] (0,0) --
- plot[mark=triangle*] (0.25,0) -- (0.5,0)
- node[right]{\small Ruby};
- \end{scope}
-\end{tikzpicture}
+\begin{tikzpicture}
\begin{axis}[
xlabel={\pcode{a}s},
ylabel={time in secs},
+ enlargelimits=false,
+ xtick={0,5,...,30},
+ xmax=30,
+ ymax=35,
ytick={0,5,...,30},
+ scaled ticks=false,
+ axis lines=left,
+ width=5cm,
+ height=5cm,
+ legend entries={Python,Ruby},
+ legend pos=north west,
+ legend cell align=left
]
+\addplot[blue,mark=*, mark options={fill=white}]
+ table {re-python.data};
+\addplot[brown,mark=pentagon*, mark options={fill=white}]
+ table {re-ruby.data};
+\end{axis}
\end{tikzpicture}
&
-\begin{tikzpicture}[y=.072cm, x=.0004cm]
- %axis
- \draw (0,0) -- coordinate (x axis mid) (12000,0);
- \draw (0,0) -- coordinate (y axis mid) (0,30);
- %ticks
- \foreach \x in {0,3000,...,12000}
- \draw (\x,1pt) -- (\x,-3pt) node[anchor=north] {\x};
- \foreach \y in {0,5,...,30}
- \draw (1pt,\y) -- (-3pt,\y) node[anchor=east] {\y};
- %labels
- \node[below=0.6cm] at (x axis mid) {number of \texttt{a}s};
- \node[rotate=90,left=0.9cm] at (y axis mid) {time in secs};
-
- %plots
- \draw[color=green] plot[mark=square*, mark options={fill=white} ]
- file {re2b.data};
- \draw[color=black] plot[mark=square*, mark options={fill=white} ]
- file {re3.data};
-\end{tikzpicture}
+\begin{tikzpicture}
\begin{axis}[
xlabel={\pcode{a}s},
ylabel={time in secs},
+ enlargelimits=false,
+ xtick={0,3000,...,12000},
+ xmax=12000,
+ ymax=35,
ytick={0,5,...,30},
+ scaled ticks=false,
+ axis lines=left,
+ width=6.5cm,
+ height=5cm
]
\addplot[green,mark=square*,mark options={fill=white}] table {re2b.data};
+\addplot[black,mark=square*,mark options={fill=white}] table {re3.data};
\end{axis}
\end{tikzpicture}
\end{tabular}
\end{center}\medskip
@@ -423,7 +407,28 @@
\noindent Running the matcher with the example, we find it is
slightly worse then the matcher in Ruby and Python.
-Ooops\ldots\medskip
+Ooops\ldots
+
+\pgfplotsset{compat=1.11}
+\begin{center}
+\begin{tikzpicture}
\begin{axis}[
xlabel={\pcode{a}s},
ylabel={time in secs},
+ enlargelimits=false,
+ xtick={0,5,...,30},
+ xmax=30,
ytick={0,5,...,30},
+ scaled ticks=false,
+ axis lines=left,
+ width=6cm,
+ height=5cm,
+ legend entries={Python,Ruby,Scala V1},
+ legend pos=outer north east,
+ legend cell align=left
]
+\addplot[blue,mark=*, mark options={fill=white}]
+ table {re-python.data};
+\addplot[brown,mark=pentagon*, mark options={fill=white}]
+ table {re-ruby.data};
+\addplot[red,mark=triangle*,mark options={fill=white}]
+ table {re1.data};
\end{axis}
\end{tikzpicture}
+\end{center}
\noindent Analysing this failure a bit we notice that
for $a^{\{n\}}$ we generate quite big regular expressions:
@@ -435,19 +440,121 @@
3: & $a\cdot a\cdot a$\\
& \ldots\\
13: & $a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a$\\
-& \ldots\\
-20:
+& \ldots
\end{tabular}
\end{center}
\noindent Our algorithm traverses such regular expressions at
least once every time a derivative is calculated. So having
-large regular expressions, will cause problems. This problem
-is aggravated with $a?$ being represented as $a + \epsilon$.
+large regular expressions will cause problems. This problem
+is aggravated by $a?$ being represented as $a + \epsilon$.
+
+We can fix this by having an explicit constructor for
+$r^{\{n\}}$. In Scala we would introduce a constructor like
+
+\begin{center}
+\code{case class NTIMES(r: Rexp, n: Int) extends Rexp}
+\end{center}
+
+\noindent With this we have a constant ``size'' regular
+expression for our running example no matter how large $n$
+is. This means we have to also add cases for $nullable$ and
+$der$. Does the change have any effect?
+
+\pgfplotsset{compat=1.11}
+\begin{center}
+\begin{tikzpicture}
\begin{axis}[
xlabel={\pcode{a}s},
ylabel={time in secs},
+ enlargelimits=false,
+ xtick={0,100,...,1000},
+ xmax=1000,
ytick={0,5,...,30},
+ scaled ticks=false,
+ axis lines=left,
+ width=10cm,
+ height=5cm,
+ legend entries={Python,Ruby,Scala V1,Scala V2},
+ legend pos=outer north east,
+ legend cell align=left
]
+\addplot[blue,mark=*, mark options={fill=white}]
+ table {re-python.data};
+\addplot[brown,mark=pentagon*, mark options={fill=white}]
+ table {re-ruby.data};
+\addplot[red,mark=triangle*,mark options={fill=white}]
+ table {re1.data};
\addplot[green,mark=square*,mark options={fill=white}]
+ table {re2b.data};
\end{axis}
\end{tikzpicture}
+\end{center}
+
+\noindent Now we are talking business! The modified matcher
+can within 30 seconds handle regular expressions up to
+$n = 950$ before a StackOverflow is raised.
+
+The moral is that our algorithm is rather sensitive to the
+size of regular expressions it needs to handle. This is of
+course obvious because both $nullable$ and $der$ need to
+traverse the whole regular expression. There seems to be one
+more source of making the algorithm run faster. The derivative
+function often produces ``useless'' $\varnothing$s and
+$\epsilon$s. To see this, consider $r = ((a \cdot b) + b)^*$
+and the following two derivatives
+
+\begin{center}
+\begin{tabular}{l}
+$der\,a\,r = ((\epsilon \cdot b) + \varnothing) \cdot r$\\
+$der\,b\,r = ((\varnothing \cdot b) + \epsilon)\cdot r$\\
+$der\,c\,r = ((\varnothing \cdot b) + \varnothing)\cdot r$
+\end{tabular}
+\end{center}
+
+\noindent
+If we simplify them according to the simple rules from the
+beginning, we can replace the right-hand sides by the
+smaller equivalent regular expressions
+
+\begin{center}
+\begin{tabular}{l}
+$der\,a\,r \equiv b \cdot r$\\
+$der\,b\,r \equiv r$\\
+$der\,c\,r \equiv \varnothing$
+\end{tabular}
+\end{center}
+
+\noindent I leave it to you to contemplate whether such a
+simplification can have any impact on the correctness of our
+algorithm (will it change any answers?). Figure~\ref{scala2}
+give a simplification function that recursively traverses a
+regular expression and simplifies it according to the rules
+given at the beginning. There are only rules for $+$, $\cdot$
+and $n$-times (the latter because we added it in the second
+version of our matcher). There is no rule for a star, because
+empirical data and also a little thought showed that
+simplifying under a star is waste of computation time. The
+simplification function will be called after every derivation.
+This additional step removes all the ``junk'' the derivative
+function introduced. Does this improve the speed? You bet!!
+
+\begin{figure}[p]
+\lstinputlisting{../progs/app6.scala}
+\caption{The simplification function and modified
+\pcode{ders}-function.\label{scala2}}
+\end{figure}
+
+\pgfplotsset{compat=1.11}
+\begin{center}
+\begin{tikzpicture}
\begin{axis}[
xlabel={\pcode{a}s},
ylabel={time in secs},
+ enlargelimits=false,
+ xtick={0,2000,...,12000},
+ xmax=12000,
ytick={0,5,...,30},
+ scaled ticks=false,
+ axis lines=left,
+ width=9cm,
+ height=4cm,
+ legend entries={Scala V2,Scala V3},
]
\addplot[green,mark=square*,mark options={fill=white}] table {re2b.data};
+\addplot[black,mark=square*,mark options={fill=white}] table {re3.data};
\end{axis}
\end{tikzpicture}
+\end{center}
+
+\end{document}
-\end{document}
%%% Local Variables:
%%% mode: latex
--- a/progs/app6.scala Sun Sep 28 18:07:58 2014 +0100
+++ b/progs/app6.scala Sun Sep 28 22:30:25 2014 +0100
@@ -1,16 +1,30 @@
-def der (r: Rexp, c: Char) : Rexp = r match {
- case NULL => NULL
- case EMPTY => NULL
- case CHAR(d) => if (c == d) EMPTY else NULL
- case ALT(r1, r2) => ALT(der(r1, c), der(r2, c))
- case SEQ(r1, r2) =>
- if (nullable(r1)) ALT(SEQ(der(r1, c), r2), der(r2, c))
- else SEQ(der(r1, c), r2)
- case STAR(r) => SEQ(der(r, c), STAR(r))
+def simp(r: Rexp): Rexp = r match {
+ case ALT(r1, r2) => {
+ val r1s = simp(r1)
+ val r2s = simp(r2)
+ (r1s, r2s) match {
+ case (NULL, _) => r2s
+ case (_, NULL) => r1s
+ case _ => if (r1s == r2s) r1s else ALT(r1s, r2s)
+ }
+ }
+ case SEQ(r1, r2) => {
+ val r1s = simp(r1)
+ val r2s = simp(r2)
+ (r1s, r2s) match {
+ case (NULL, _) => NULL
+ case (_, NULL) => NULL
+ case (EMPTY, _) => r2s
+ case (_, EMPTY) => r1s
+ case _ => SEQ(r1s, r2s)
+ }
+ }
+ case NTIMES(r, n) => NTIMES(simp(r), n)
+ case r => r
}
def ders (s: List[Char], r: Rexp) : Rexp = s match {
case Nil => r
- case c::s => ders(s, der(c, r))
+ case c::s => ders(s, simp(der(c, r)))
}