updated handouts
authorChristian Urban <christian dot urban at kcl dot ac dot uk>
Sun, 28 Sep 2014 22:30:25 +0100
changeset 262 ee4304bc6350
parent 261 24531cfaa36a
child 263 92e6985018ae
updated handouts
handouts/ho02.pdf
handouts/ho02.tex
progs/app6.scala
Binary file handouts/ho02.pdf has changed
--- a/handouts/ho02.tex	Sun Sep 28 18:07:58 2014 +0100
+++ b/handouts/ho02.tex	Sun Sep 28 22:30:25 2014 +0100
@@ -4,6 +4,7 @@
 \usepackage{../graphics}
 \usepackage{../data}
 
+
 \begin{document}
 
 \section*{Handout 2}
@@ -13,56 +14,39 @@
 than the matchers from regular expression libraries in Ruby and
 Python (the plots on the left). These plots show the running 
 time for the evil regular expression $a?^{\{n\}}a^{\{n\}}$.
+Note the different scales in each plot.
 
+\pgfplotsset{compat=1.11}
 \begin{center}
 \begin{tabular}{@{}cc@{}}
-\begin{tikzpicture}[y=.072cm, x=.12cm]
- 	%axis
-	\draw (0,0) -- coordinate (x axis mid) (30,0);
-   \draw (0,0) -- coordinate (y axis mid) (0,30);
-   %ticks
-   \foreach \x in {0,5,...,30}
-     \draw (\x,1pt) -- (\x,-3pt) node[anchor=north] {\x};
-   \foreach \y in {0,5,...,30}
-     \draw (1pt,\y) -- (-3pt,\y) node[anchor=east] {\y}; 
-	%labels      
-	\node[below=0.6cm] at (x axis mid) {number of \texttt{a}s};
-	\node[rotate=90,left=0.9cm] at (y axis mid) {time in secs};
-	%plots
-	\draw[color=blue] plot[mark=*] 
-		file {re-python.data};
-	\draw[color=brown] plot[mark=triangle*] 
-		file {re-ruby.data};	 
-   %legend
-	\begin{scope}[shift={(4,20)}] 
-	\draw[color=blue] (0,0) -- 
-		plot[mark=*] (0.25,0) -- (0.5,0) 
-		node[right]{\small Python};
-	\draw[yshift=-4mm, color=brown] (0,0) -- 
-		plot[mark=triangle*] (0.25,0) -- (0.5,0)
-		node[right]{\small Ruby};
-	\end{scope}	
-\end{tikzpicture} 
+\begin{tikzpicture}
\begin{axis}[
    xlabel={\pcode{a}s},
    ylabel={time in secs},
+    enlargelimits=false,
+    xtick={0,5,...,30},
+    xmax=30,
+    ymax=35,
    ytick={0,5,...,30},
+    scaled ticks=false,
+    axis lines=left,
+    width=5cm,
+    height=5cm, 
+    legend entries={Python,Ruby},  
+    legend pos=north west,
+    legend cell align=left  
]
+\addplot[blue,mark=*, mark options={fill=white}] 
+  table {re-python.data};
+\addplot[brown,mark=pentagon*, mark options={fill=white}] 
+  table {re-ruby.data};  
+\end{axis}
\end{tikzpicture}
 &
-\begin{tikzpicture}[y=.072cm, x=.0004cm]
- 	%axis
-	\draw (0,0) -- coordinate (x axis mid) (12000,0);
-   \draw (0,0) -- coordinate (y axis mid) (0,30);
-   %ticks
-   \foreach \x in {0,3000,...,12000}
-     	\draw (\x,1pt) -- (\x,-3pt) node[anchor=north] {\x};
-   \foreach \y in {0,5,...,30}
-     	\draw (1pt,\y) -- (-3pt,\y) node[anchor=east] {\y}; 
-	%labels      
-	\node[below=0.6cm] at (x axis mid) {number of \texttt{a}s};
-	\node[rotate=90,left=0.9cm] at (y axis mid) {time in secs};
-
-	%plots
-   \draw[color=green] plot[mark=square*, mark options={fill=white} ] 
-		file {re2b.data};
-	\draw[color=black] plot[mark=square*, mark options={fill=white} ] 
-		file {re3.data};	 
-\end{tikzpicture}
+\begin{tikzpicture}
\begin{axis}[
    xlabel={\pcode{a}s},
    ylabel={time in secs},
+    enlargelimits=false,
+    xtick={0,3000,...,12000},
+    xmax=12000,
+    ymax=35,
    ytick={0,5,...,30},
+    scaled ticks=false,
+    axis lines=left,
+    width=6.5cm,
+    height=5cm
]
\addplot[green,mark=square*,mark options={fill=white}] table {re2b.data};
+\addplot[black,mark=square*,mark options={fill=white}] table {re3.data};
\end{axis}
\end{tikzpicture}
 \end{tabular}
 \end{center}\medskip
 
@@ -423,7 +407,28 @@
 
 \noindent Running the matcher with the example, we find it is
 slightly worse then the matcher in Ruby and Python.
-Ooops\ldots\medskip
+Ooops\ldots
+
+\pgfplotsset{compat=1.11}
+\begin{center}
+\begin{tikzpicture}
\begin{axis}[
    xlabel={\pcode{a}s},
    ylabel={time in secs},
+    enlargelimits=false,
+    xtick={0,5,...,30},
+    xmax=30,
    ytick={0,5,...,30},
+    scaled ticks=false,
+    axis lines=left,
+    width=6cm,
+    height=5cm, 
+    legend entries={Python,Ruby,Scala V1},  
+    legend pos=outer north east,
+    legend cell align=left  
]
+\addplot[blue,mark=*, mark options={fill=white}] 
+  table {re-python.data};
+\addplot[brown,mark=pentagon*, mark options={fill=white}] 
+  table {re-ruby.data};  
+\addplot[red,mark=triangle*,mark options={fill=white}] 
+  table {re1.data};  
\end{axis}
\end{tikzpicture}
+\end{center}
 
 \noindent Analysing this failure a bit we notice that 
 for $a^{\{n\}}$ we generate quite big regular expressions:
@@ -435,19 +440,121 @@
 3: & $a\cdot a\cdot a$\\
 & \ldots\\
 13: & $a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a$\\
-& \ldots\\
-20:
+& \ldots
 \end{tabular}
 \end{center}
 
 \noindent Our algorithm traverses such regular expressions at
 least once every time a derivative is calculated. So having
-large regular expressions, will cause problems. This problem
-is aggravated with $a?$ being represented as $a + \epsilon$.
+large regular expressions will cause problems. This problem
+is aggravated by $a?$ being represented as $a + \epsilon$.
+
+We can fix this by having an explicit constructor for 
+$r^{\{n\}}$. In Scala we would introduce a constructor like
+
+\begin{center}
+\code{case class NTIMES(r: Rexp, n: Int) extends Rexp}
+\end{center}
+
+\noindent With this we have a constant ``size'' regular
+expression for our running example no matter how large $n$ 
+is. This means we have to also add cases for $nullable$ and
+$der$. Does the change have any effect?
+
+\pgfplotsset{compat=1.11}
+\begin{center}
+\begin{tikzpicture}
\begin{axis}[
    xlabel={\pcode{a}s},
    ylabel={time in secs},
+    enlargelimits=false,
+    xtick={0,100,...,1000},
+    xmax=1000,
    ytick={0,5,...,30},
+    scaled ticks=false,
+    axis lines=left,
+    width=10cm,
+    height=5cm, 
+    legend entries={Python,Ruby,Scala V1,Scala V2},  
+    legend pos=outer north east,
+    legend cell align=left  
]
+\addplot[blue,mark=*, mark options={fill=white}] 
+  table {re-python.data};
+\addplot[brown,mark=pentagon*, mark options={fill=white}] 
+  table {re-ruby.data};  
+\addplot[red,mark=triangle*,mark options={fill=white}] 
+  table {re1.data};  
\addplot[green,mark=square*,mark options={fill=white}] 
+  table {re2b.data};
\end{axis}
\end{tikzpicture}
+\end{center}
+
+\noindent Now we are talking business! The modified matcher 
+can within 30 seconds handle regular expressions up to
+$n = 950$ before a StackOverflow is raised.
+
+The moral is that our algorithm is rather sensitive to the
+size of regular expressions it needs to handle. This is of
+course obvious because both $nullable$ and $der$ need to
+traverse the whole regular expression. There seems to be one
+more source of making the algorithm run faster. The derivative
+function often produces ``useless'' $\varnothing$s and
+$\epsilon$s. To see this, consider $r = ((a \cdot b) + b)^*$
+and the following two derivatives
+
+\begin{center}
+\begin{tabular}{l}
+$der\,a\,r = ((\epsilon \cdot b) + \varnothing) \cdot r$\\
+$der\,b\,r = ((\varnothing \cdot b) + \epsilon)\cdot r$\\
+$der\,c\,r = ((\varnothing \cdot b) + \varnothing)\cdot r$
+\end{tabular}
+\end{center}
+
+\noindent 
+If we simplify them according to the simple rules from the
+beginning, we can replace the right-hand sides by the 
+smaller equivalent regular expressions
+
+\begin{center}
+\begin{tabular}{l}
+$der\,a\,r \equiv b \cdot r$\\
+$der\,b\,r \equiv r$\\
+$der\,c\,r \equiv \varnothing$
+\end{tabular}
+\end{center}
+
+\noindent I leave it to you to contemplate whether such a
+simplification can have any impact on the correctness of our
+algorithm (will it change any answers?). Figure~\ref{scala2}
+give a simplification function that recursively traverses a
+regular expression and simplifies it according to the rules
+given at the beginning. There are only rules for $+$, $\cdot$
+and $n$-times (the latter because we added it in the second
+version of our matcher). There is no rule for a star, because
+empirical data and also a little thought showed that
+simplifying under a star is waste of computation time. The
+simplification function will be called after every derivation.
+This additional step removes all the ``junk'' the derivative
+function introduced. Does this improve the speed? You bet!!
+
+\begin{figure}[p]
+\lstinputlisting{../progs/app6.scala}
+\caption{The simplification function and modified 
+\pcode{ders}-function.\label{scala2}}
+\end{figure}
+
+\pgfplotsset{compat=1.11}
+\begin{center}
+\begin{tikzpicture}
\begin{axis}[
    xlabel={\pcode{a}s},
    ylabel={time in secs},
+    enlargelimits=false,
+    xtick={0,2000,...,12000},
+    xmax=12000,
    ytick={0,5,...,30},
+    scaled ticks=false,
+    axis lines=left,
+    width=9cm,
+    height=4cm, 
+    legend entries={Scala V2,Scala V3},    
]
\addplot[green,mark=square*,mark options={fill=white}] table {re2b.data};
+\addplot[black,mark=square*,mark options={fill=white}] table {re3.data};
\end{axis}
\end{tikzpicture}
+\end{center}
+
+\end{document}
 
 
 
-\end{document}
 
 %%% Local Variables: 
 %%% mode: latex
--- a/progs/app6.scala	Sun Sep 28 18:07:58 2014 +0100
+++ b/progs/app6.scala	Sun Sep 28 22:30:25 2014 +0100
@@ -1,16 +1,30 @@
-def der (r: Rexp, c: Char) : Rexp = r match {
-  case NULL => NULL
-  case EMPTY => NULL
-  case CHAR(d) => if (c == d) EMPTY else NULL
-  case ALT(r1, r2) => ALT(der(r1, c), der(r2, c))
-  case SEQ(r1, r2) => 
-    if (nullable(r1)) ALT(SEQ(der(r1, c), r2), der(r2, c))
-    else SEQ(der(r1, c), r2)
-  case STAR(r) => SEQ(der(r, c), STAR(r))
+def simp(r: Rexp): Rexp = r match {
+  case ALT(r1, r2) => {
+    val r1s = simp(r1)
+    val r2s = simp(r2)
+    (r1s, r2s) match {
+      case (NULL, _) => r2s
+      case (_, NULL) => r1s
+      case _ => if (r1s == r2s) r1s else ALT(r1s, r2s)
+    }
+  }
+  case SEQ(r1, r2) => {
+    val r1s = simp(r1)
+    val r2s = simp(r2)
+    (r1s, r2s) match {
+      case (NULL, _) => NULL
+      case (_, NULL) => NULL
+      case (EMPTY, _) => r2s
+      case (_, EMPTY) => r1s
+      case _ => SEQ(r1s, r2s)
+    }
+  }
+  case NTIMES(r, n) => NTIMES(simp(r), n)    
+  case r => r
 }
 
 def ders (s: List[Char], r: Rexp) : Rexp = s match {
   case Nil => r
-  case c::s => ders(s, der(c, r))
+  case c::s => ders(s, simp(der(c, r)))
 }