# HG changeset patch # User Christian Urban # Date 1471899943 -7200 # Node ID 065ca01b62aeb362aae9e9c3d55d6dd7801c1c4e # Parent b7221df9662a5d00f7821c730aa6cfa55d3227ea updated diff -r b7221df9662a -r 065ca01b62ae data.sty --- a/data.sty Mon Aug 22 09:14:22 2016 +0200 +++ b/data.sty Mon Aug 22 23:05:43 2016 +0200 @@ -137,6 +137,18 @@ 951 31.96038 \end{filecontents} +\begin{filecontents}{re2c.data} +1 0.00070 +501 0.02137 +1001 0.04070 +1501 0.08710 +2001 0.14441 +2501 0.21303 +3001 0.33987 +3501 0.43522 +4001 0.55873 +\end{filecontents} + \begin{filecontents}{re3.data} 1 0.001605 501 0.131066 @@ -164,6 +176,21 @@ 11501 7.95864 \end{filecontents} +\begin{filecontents}{re3a.data} +1 0.00006 +500001 0.12975 +1000001 0.38887 +1500001 0.61373 +2000001 0.78268 +2500001 0.98069 +3000001 1.41858 +3500001 1.68450 +4000001 1.84533 +4500001 1.91059 +5000001 1.98881 +5500001 2.66755 +6000001 3.16683 +\end{filecontents} \begin{filecontents}{nfa.data} 0 0.00099 diff -r b7221df9662a -r 065ca01b62ae handouts/ho01.pdf Binary file handouts/ho01.pdf has changed diff -r b7221df9662a -r 065ca01b62ae handouts/ho02.pdf Binary file handouts/ho02.pdf has changed diff -r b7221df9662a -r 065ca01b62ae handouts/ho02.tex --- a/handouts/ho02.tex Mon Aug 22 09:14:22 2016 +0200 +++ b/handouts/ho02.tex Mon Aug 22 23:05:43 2016 +0200 @@ -24,10 +24,12 @@ \begin{center} +Plots: $a^{?\{n\}} \cdot a^{\{n\}}$ and strings $\underbrace{a\ldots a}_{n}$\\ \begin{tabular}{@{}cc@{}} \begin{tikzpicture} \begin{axis}[ - xlabel={\small $a^{?\{n\}} \cdot a^{\{n\}}$ and strings $\underbrace{a\ldots a}_{n}$}, + xlabel={$n$}, + x label style={at={(1.05,0.0)}}, ylabel={\small time in secs}, enlargelimits=false, xtick={0,5,...,30}, @@ -50,7 +52,8 @@ & \begin{tikzpicture} \begin{axis}[ - xlabel={\small $a^{?\{n\}} \cdot a^{\{n\}}$ and strings $\underbrace{a\ldots a}_{n}$}, + xlabel={$n$}, + x label style={at={(1.1,0.05)}}, ylabel={\small time in secs}, enlargelimits=false, xtick={0,3000,...,12000}, @@ -69,10 +72,12 @@ \end{center} \begin{center} +Plots: $(a^*)^* \cdot b$ and strings $\underbrace{a\ldots a}_{n}$ \begin{tabular}{@{}cc@{}} \begin{tikzpicture} \begin{axis}[ - xlabel={$(a^*)^* \cdot b$ and strings $\underbrace{a\ldots a}_{n}$}, + xlabel={$n$}, + x label style={at={(1.05,0.0)}}, ylabel={time in secs}, enlargelimits=false, xtick={0,5,...,30}, @@ -92,7 +97,8 @@ & \begin{tikzpicture} \begin{axis}[ - xlabel={$(a^*)^* \cdot b$ and strings $\underbrace{a\ldots a}_{n}$}, + xlabel={$n$}, + x label style={at={(1.1,0.05)}}, ylabel={time in secs}, enlargelimits=false, xtick={0,3000,...,12000}, @@ -129,7 +135,7 @@ In such cases there is no way we can implement an exhaustive test for whether a string is member of this set or not. In contrast our matching algorithm will operate on the regular expression $r$ and -string $s$, only, which are both finite objects. Before we explain to +string $s$, only, which are both finite objects. Before we explain the matching algorithm, however, let us have a closer look at what it means when two regular expressions are equivalent. @@ -293,19 +299,19 @@ \begin{center} \begin{tabular}{l@ {\hspace{2mm}}c@ {\hspace{2mm}}l} - $der\, c\, (\ZERO)$ & $\dn$ & $\ZERO$\\ - $der\, c\, (\ONE)$ & $\dn$ & $\ZERO$ \\ - $der\, c\, (d)$ & $\dn$ & if $c = d$ then $\ONE$ else $\ZERO$\\ - $der\, c\, (r_1 + r_2)$ & $\dn$ & $der\, c\, r_1 + der\, c\, r_2$\\ - $der\, c\, (r_1 \cdot r_2)$ & $\dn$ & if $\textit{nullable} (r_1)$\\ - & & then $(der\,c\,r_1) \cdot r_2 + der\, c\, r_2$\\ - & & else $(der\, c\, r_1) \cdot r_2$\\ - $der\, c\, (r^*)$ & $\dn$ & $(der\,c\,r) \cdot (r^*)$ + $\textit{der}\, c\, (\ZERO)$ & $\dn$ & $\ZERO$\\ + $\textit{der}\, c\, (\ONE)$ & $\dn$ & $\ZERO$ \\ + $\textit{der}\, c\, (d)$ & $\dn$ & if $c = d$ then $\ONE$ else $\ZERO$\\ + $\textit{der}\, c\, (r_1 + r_2)$ & $\dn$ & $\textit{der}\, c\, r_1 + \textit{der}\, c\, r_2$\\ + $\textit{der}\, c\, (r_1 \cdot r_2)$ & $\dn$ & if $\textit{nullable} (r_1)$\\ + & & then $(\textit{der}\,c\,r_1) \cdot r_2 + \textit{der}\, c\, r_2$\\ + & & else $(\textit{der}\, c\, r_1) \cdot r_2$\\ + $\textit{der}\, c\, (r^*)$ & $\dn$ & $(\textit{der}\,c\,r) \cdot (r^*)$ \end{tabular} \end{center} \noindent The first two clauses can be rationalised as -follows: recall that $der$ should calculate a regular +follows: recall that $\textit{der}$ should calculate a regular expression so that given the ``input'' regular expression can match a string of the form $c\!::\!s$, we want a regular expression for $s$. Since neither $\ZERO$ nor $\ONE$ @@ -320,32 +326,33 @@ involved. Fortunately, the $+$-case is still relatively straightforward: all strings of the form $c\!::\!s$ are either matched by the regular expression $r_1$ or $r_2$. So we just -have to recursively call $der$ with these two regular +have to recursively call $\textit{der}$ with these two regular expressions and compose the results again with $+$. Makes sense? The $\cdot$-case is more complicated: if $r_1\cdot r_2$ matches a string of the form $c\!::\!s$, then the first part must be matched by $r_1$. Consequently, it makes sense to -construct the regular expression for $s$ by calling $der$ with +construct the regular expression for $s$ by calling $\textit{der}$ with $r_1$ and ``appending'' $r_2$. There is however one exception to this simple rule: if $r_1$ can match the empty string, then all of $c\!::\!s$ is matched by $r_2$. So in case $r_1$ is nullable (that is can match the empty string) we have to allow -the choice $der\,c\,r_2$ for calculating the regular +the choice $\textit{der}\,c\,r_2$ for calculating the regular expression that can match $s$. Therefore we have to add the -regular expression $der\,c\,r_2$ in the result. The $*$-case +regular expression $\textit{der}\,c\,r_2$ in the result. The $*$-case is again simple: if $r^*$ matches a string of the form $c\!::\!s$, then the first part must be ``matched'' by a -single copy of $r$. Therefore we call recursively $der\,c\,r$ -and ``append'' $r^*$ in order to match the rest of $s$. +single copy of $r$. Therefore we call recursively $\textit{der}\,c\,r$ +and ``append'' $r^*$ in order to match the rest of $s$. Still +makes sense? -If this did not make sense yet, here is another way to rationalise -the definition of $der$ by considering the following operation +If all this did not make sense yet, here is another way to rationalise +the definition of $\textit{der}$ by considering the following operation on sets: \begin{equation}\label{Der} -Der\,c\,A\;\dn\;\{s\,|\,c\!::\!s \in A\} +\textit{Der}\,c\,A\;\dn\;\{s\,|\,c\!::\!s \in A\} \end{equation} \noindent This operation essentially transforms a set of @@ -353,37 +360,37 @@ with $c$ and then strips off the $c$ from all the remaining strings. For example suppose $A = \{f\!oo, bar, f\!rak\}$ then -\[ Der\,f\,A = \{oo, rak\}\quad,\quad - Der\,b\,A = \{ar\} \quad \text{and} \quad - Der\,a\,A = \{\} +\[ \textit{Der}\,f\,A = \{oo, rak\}\quad,\quad + \textit{Der}\,b\,A = \{ar\} \quad \text{and} \quad + \textit{Der}\,a\,A = \{\} \] \noindent -Note that in the last case $Der$ is empty, because no string in $A$ +Note that in the last case $\textit{Der}$ is empty, because no string in $A$ starts with $a$. With this operation we can state the following -property about $der$: +property about $\textit{der}$: \[ -L(der\,c\,r) = Der\,c\,(L(r)) +L(\textit{der}\,c\,r) = \textit{Der}\,c\,(L(r)) \] \noindent -This property clarifies what regular expression $der$ calculates, +This property clarifies what regular expression $\textit{der}$ calculates, namely take the set of strings that $r$ can match (that is $L(r)$), filter out all strings not starting with $c$ and strip off the $c$ from the remaining strings---this is exactly the language that -$der\,c\,r$ can match. +$\textit{der}\,c\,r$ can match. If we want to find out whether the string $abc$ is matched by -the regular expression $r_1$ then we can iteratively apply $der$ +the regular expression $r_1$ then we can iteratively apply $\textit{der}$ as follows \begin{center} \begin{tabular}{rll} Input: $r_1$, $abc$\medskip\\ -Step 1: & build derivative of $a$ and $r_1$ & $(r_2 = der\,a\,r_1)$\smallskip\\ -Step 2: & build derivative of $b$ and $r_2$ & $(r_3 = der\,b\,r_2)$\smallskip\\ -Step 3: & build derivative of $c$ and $r_3$ & $(r_4 = der\,b\,r_3)$\smallskip\\ +Step 1: & build derivative of $a$ and $r_1$ & $(r_2 = \textit{der}\,a\,r_1)$\smallskip\\ +Step 2: & build derivative of $b$ and $r_2$ & $(r_3 = \textit{der}\,b\,r_2)$\smallskip\\ +Step 3: & build derivative of $c$ and $r_3$ & $(r_4 = \textit{der}\,b\,r_3)$\smallskip\\ Step 4: & the string is exhausted; test & ($\textit{nullable}(r_4)$)\\ & whether $r_4$ can recognise the\\ & empty string\smallskip\\ @@ -391,50 +398,50 @@ \end{tabular} \end{center} -\noindent Again the operation $Der$ might help to rationalise +\noindent Again the operation $\textit{Der}$ might help to rationalise this algorithm. We want to know whether $abc \in L(r_1)$. We -do not know yet---but let us assume it is. Then $Der\,a\,L(r_1)$ +do not know yet---but let us assume it is. Then $\textit{Der}\,a\,L(r_1)$ builds the set where all the strings not starting with $a$ are filtered out. Of the remaining strings, the $a$ is stripped off. So we should still have $bc$ in the set. Then we continue with filtering out all strings not starting with $b$ and stripping off the $b$ from the remaining -strings, that means we build $Der\,b\,(Der\,a\,(L(r_1)))$. +strings, that means we build $\textit{Der}\,b\,(\textit{Der}\,a\,(L(r_1)))$. Finally we filter out all strings not starting with $c$ and strip off $c$ from the remaining string. This is -$Der\,c\,(Der\,b\,(Der\,a\,(L(r))))$. Now if $abc$ was in the -original set ($L(r_1)$), then in $Der\,c\,(Der\,b\,(Der\,a\,(L(r))))$ +$\textit{Der}\,c\,(\textit{Der}\,b\,(\textit{Der}\,a\,(L(r_1))))$. Now if $abc$ was in the +original set ($L(r_1)$), then $\textit{Der}\,c\,(\textit{Der}\,b\,(\textit{Der}\,a\,(L(r_1))))$ must contain the empty string. If not, then $abc$ was not in the language we started with. -Our matching algorithm using $der$ and $\textit{nullable}$ works -similarly, just using regular expression instead of sets. For -this we need to extend the notion of derivatives from single +Our matching algorithm using $\textit{der}$ and $\textit{nullable}$ works +similarly, just using regular expression instead of sets. In order to +define our algorithm we need to extend the notion of derivatives from single characters to strings. This can be done using the following -function, taking a string and regular expression as input and +function, taking a string and a regular expression as input and a regular expression as output. \begin{center} \begin{tabular}{@ {}l@ {\hspace{2mm}}c@ {\hspace{2mm}}l@ {\hspace{-10mm}}l@ {}} $\textit{ders}\, []\, r$ & $\dn$ & $r$ & \\ - $\textit{ders}\, (c\!::\!s)\, r$ & $\dn$ & $\textit{ders}\,s\,(der\,c\,r)$ & \\ + $\textit{ders}\, (c\!::\!s)\, r$ & $\dn$ & $\textit{ders}\,s\,(\textit{der}\,c\,r)$ & \\ \end{tabular} \end{center} -\noindent This function iterates $der$ taking one character at +\noindent This function iterates $\textit{der}$ taking one character at the time from the original string until it is exhausted. -Having $ders$ in place, we can finally define our matching +Having $\textit{der}s$ in place, we can finally define our matching algorithm: \[ -matches\,s\,r \dn \textit{nullable}(ders\,s\,r) +\textit{matches}\,s\,r \dn \textit{nullable}(\textit{ders}\,s\,r) \] \noindent and we can claim that \[ -matches\,s\,r\quad\text{if and only if}\quad s\in L(r) +\textit{matches}\,s\,r\quad\text{if and only if}\quad s\in L(r) \] \noindent holds, which means our algorithm satisfies the @@ -442,11 +449,12 @@ whether the claim holds any water is a different question, which for example is the point of the Strand-2 Coursework. -This algorithm was introduced by Janus Brzozowski in 1964. Its +This algorithm was introduced by Janus Brzozowski in 1964, but +is more widely known only in the last 10 or so years. Its main attractions are simplicity and being fast, as well as being easily extendable for other regular expressions such as $r^{\{n\}}$, $r^?$, $\sim{}r$ and so on (this is subject of -Strand-1 Coursework 1). +Strand-1 Coursework 1). \subsection*{The Matching Algorithm in Scala} @@ -465,22 +473,65 @@ definitions very closely.\label{scala1}} \end{figure} -For running the algorithm with our favourite example, the evil + +Remember our second example involving the regular expression +$(a^*)^* \cdot b$ which could not match strings of $n$ \texttt{a}s. +Java needed around 30 seconds to find this out a string with $n=28$. +It seems our algorithm is doing rather well in comparison: + +\begin{center} +\begin{tikzpicture} +\begin{axis}[ + title={Plot: $(a^*)^* \cdot b$ and strings $\underbrace{a\ldots a}_{n}$}, + xlabel={$n$}, + x label style={at={(1.05,0.0)}}, + ylabel={time in secs}, + enlargelimits=false, + xtick={0,500,...,4000}, + xmax=4100, + ytick={0,5,...,30}, + scaled ticks=false, + axis lines=left, + width=10cm, + height=4.5cm, + legend entries={Java,Scala V1}, + legend pos=north east, + legend cell align=left] +\addplot[blue,mark=*, mark options={fill=white}] + table {re-java.data}; +\addplot[red,mark=triangle*,mark options={fill=white}] + table {re2c.data}; +\end{axis} +\end{tikzpicture} +\end{center} + +\noindent +This is no error: it hardly takes more than half a second for +strings up to the length of 4000. After that we receive a +StackOverflow exception, but still\ldots + +For running the algorithm with our first example, the evil regular expression $a^?{}^{\{n\}}a^{\{n\}}$, we need to implement the optional regular expression and the exactly $n$-times regular expression. This can be done with the translations \lstinputlisting[numbers=none]{../progs/app51.scala} -\noindent Running the matcher with the example, we find it is +\noindent Running the matcher with this example, we find it is slightly worse then the matcher in Ruby and Python. Ooops\ldots \begin{center} -\begin{tikzpicture} \begin{axis}[ xlabel={\pcode{a}s}, ylabel={time in secs}, +\begin{tikzpicture} +\begin{axis}[ + title={Plot: $a^{?\{n\}} \cdot a^{\{n\}}$ and strings $\underbrace{a\ldots a}_{n}$}, + xlabel={$n$}, + x label style={at={(1.05,0.0)}}, + ylabel={time in secs}, enlargelimits=false, xtick={0,5,...,30}, - xmax=30, ytick={0,5,...,30}, + xmax=30, + ytick={0,5,...,30}, scaled ticks=false, axis lines=left, width=6cm, @@ -493,7 +544,9 @@ \addplot[brown,mark=pentagon*, mark options={fill=white}] table {re-ruby.data}; \addplot[red,mark=triangle*,mark options={fill=white}] - table {re1.data}; \end{axis} \end{tikzpicture} + table {re1.data}; +\end{axis} +\end{tikzpicture} \end{center} \noindent Analysing this failure we notice that for @@ -525,41 +578,50 @@ \noindent With this fix we have a constant ``size'' regular expression for our running example no matter how large $n$ is. This means we have to also add cases for \pcode{NTIMES} in the -functions $\textit{nullable}$ and $der$. Does the change have any +functions $\textit{nullable}$ and $\textit{der}$. Does the change have any effect? \begin{center} -\begin{tikzpicture} \begin{axis}[ xlabel={\pcode{a}s}, ylabel={time in secs}, +\begin{tikzpicture} +\begin{axis}[ + title={Plot: $a^{?\{n\}} \cdot a^{\{n\}}$ and strings $\underbrace{a\ldots a}_{n}$}, + xlabel={$n$}, + x label style={at={(1.01,0.0)}}, + ylabel={time in secs}, enlargelimits=false, xtick={0,100,...,1000}, - xmax=1000, ytick={0,5,...,30}, + xmax=1100, + ytick={0,5,...,30}, scaled ticks=false, axis lines=left, - width=9.5cm, + width=10cm, height=5cm, legend entries={Python,Ruby,Scala V1,Scala V2}, legend pos=outer north east, - legend cell align=left ] + legend cell align=left] \addplot[blue,mark=*, mark options={fill=white}] table {re-python.data}; \addplot[brown,mark=pentagon*, mark options={fill=white}] table {re-ruby.data}; \addplot[red,mark=triangle*,mark options={fill=white}] - table {re1.data}; \addplot[green,mark=square*,mark options={fill=white}] - table {re2b.data}; \end{axis} \end{tikzpicture} + table {re1.data}; +\addplot[green,mark=square*,mark options={fill=white}] + table {re2b.data}; +\end{axis} +\end{tikzpicture} \end{center} \noindent Now we are talking business! The modified matcher can within 30 seconds handle regular expressions up to -$n = 950$ before a StackOverflow is raised. Python and Ruby -(and our first version) could only handle $n = 27$ or so in 30 -second. +$n = 950$ before a StackOverflow is raised. Recall that Python and Ruby +(and our first version, Scala V1) could only handle $n = 27$ or so in 30 +seconds. There is no change for our second example +$(a^*)^* \cdot b$---so this is still good. -SECOND EXAMPLE The moral is that our algorithm is rather sensitive to the size of regular expressions it needs to handle. This is of -course obvious because both $\textit{nullable}$ and $der$ frequently +course obvious because both $\textit{nullable}$ and $\textit{der}$ frequently need to traverse the whole regular expression. There seems, however, one more issue for making the algorithm run faster. The derivative function often produces ``useless'' @@ -568,9 +630,9 @@ \begin{center} \begin{tabular}{l} -$der\,a\,r = ((\ONE \cdot b) + \ZERO) \cdot r$\\ -$der\,b\,r = ((\ZERO \cdot b) + \ONE)\cdot r$\\ -$der\,c\,r = ((\ZERO \cdot b) + \ZERO)\cdot r$ +$\textit{der}\,a\,r = ((\ONE \cdot b) + \ZERO) \cdot r$\\ +$\textit{der}\,b\,r = ((\ZERO \cdot b) + \ONE)\cdot r$\\ +$\textit{der}\,c\,r = ((\ZERO \cdot b) + \ZERO)\cdot r$ \end{tabular} \end{center} @@ -581,9 +643,9 @@ \begin{center} \begin{tabular}{l} -$der\,a\,r \equiv b \cdot r$\\ -$der\,b\,r \equiv r$\\ -$der\,c\,r \equiv \ZERO$ +$\textit{der}\,a\,r \equiv b \cdot r$\\ +$\textit{der}\,b\,r \equiv r$\\ +$\textit{der}\,c\,r \equiv \ZERO$ \end{tabular} \end{center} @@ -613,10 +675,14 @@ \begin{center} \begin{tikzpicture} -\begin{axis}[xlabel={\pcode{a}s},ylabel={time in secs}, +\begin{axis}[ + title={Plot: $a^{?\{n\}} \cdot a^{\{n\}}$ and strings $\underbrace{a\ldots a}_{n}$}, + xlabel={$n$}, + x label style={at={(1.04,0.0)}}, + ylabel={time in secs}, enlargelimits=false, xtick={0,2000,...,12000}, - xmax=12000, + xmax=13000, ytick={0,5,...,30}, scaled ticks=false, axis lines=left, @@ -629,7 +695,32 @@ \end{tikzpicture} \end{center} -SECOND EXAMPLE +\begin{center} +\begin{tikzpicture} +\begin{axis}[ + title={Plot: $(a^*)^* \cdot b$ and strings $\underbrace{a\ldots a}_{n}$}, + xlabel={$n$}, + x label style={at={(1.09,0.0)}}, + ylabel={time in secs}, + enlargelimits=false, + %xtick={0,1e+6,...,6e+6}, + xmax=6200000, + ytick={0,5,...,30}, + ymax=33, + scaled ticks=false, + axis lines=left, + width=9cm, + height=5cm, + legend entries={Scala V3}] +\addplot[black,mark=square*,mark options={fill=white}] table {re3a.data}; +\end{axis} +\end{tikzpicture} +\end{center} + +\section{Epilogue} + +TBD + \section*{Proofs} @@ -771,23 +862,23 @@ Given this recipe, I let you show \begin{equation} -Ders\,s\,(L(r)) = L(ders\,s\,r) +\textit{Ders}\,s\,(L(r)) = L(\textit{ders}\,s\,r) \label{dersprop} \end{equation} -\noindent by induction on $s$. Recall $Der$ is defined for -character---see \eqref{Der}; $Ders$ is similar, but for strings: +\noindent by induction on $s$. Recall $\textit{Der}$ is defined for +character---see \eqref{Der}; $\textit{Ders}$ is similar, but for strings: \[ -Ders\,s\,A\;\dn\;\{s'\,|\,s @ s' \in A\} +\textit{Ders}\,s\,A\;\dn\;\{s'\,|\,s @ s' \in A\} \] \noindent In this proof you can assume the following property -for $der$ and $Der$ has already been proved, that is you can +for $der$ and $\textit{Der}$ has already been proved, that is you can assume \[ -L(der\,c\,r) = Der\,c\,(L(r)) +L(\textit{der}\,c\,r) = \textit{Der}\,c\,(L(r)) \] \noindent holds (this would be of course a property that @@ -806,16 +897,16 @@ following problem \begin{equation} -[] \in Ders\,s\,(L(r)) +[] \in \textit{Ders}\,s\,(L(r)) \label{dersstep} \end{equation} \noindent But we have shown above in \eqref{dersprop}, that -the $Ders$ can be replaced by $L(ders\ldots)$. That means +the $\textit{Ders}$ can be replaced by $L(\textit{ders}\ldots)$. That means \eqref{dersstep} is equivalent to \begin{equation} -[] \in L(ders\,s\,r) +[] \in L(\textit{ders}\,s\,r) \label{prefinalstep} \end{equation} @@ -825,13 +916,13 @@ \eqref{prefinalstep} is equivalent with \[ -\textit{nullable}(ders\,s\,r) +\textit{nullable}(\textit{ders}\,s\,r) \] \noindent But this is just the definition of $matches$ \[ -matches\,s\,r \dn nullable(ders\,s\,r) +matches\,s\,r \dn nullable(\textit{ders}\,s\,r) \] \noindent In effect we have shown diff -r b7221df9662a -r 065ca01b62ae progs/re2.scala --- a/progs/re2.scala Mon Aug 22 09:14:22 2016 +0200 +++ b/progs/re2.scala Mon Aug 22 23:05:43 2016 +0200 @@ -41,7 +41,8 @@ //optional: one or zero times def OPT(r: Rexp) = ALT(r, EMPTY) -def EVIL(n: Int) = SEQ(NTIMES(OPT(CHAR('a')), n), NTIMES(CHAR('a'), n)) +def EVIL1(n: Int) = SEQ(NTIMES(OPT(CHAR('a')), n), NTIMES(CHAR('a'), n)) +val EVIL2 = SEQ(STAR(CHAR('a')), CHAR('b')) def time_needed[T](i: Int, code: => T) = { val start = System.nanoTime() @@ -51,12 +52,17 @@ } //for (i <- 1 to 100) { -// println(i + ": " + "%.5f".format(time_needed(1, matches(EVIL(i), "a" * i)))) +// println(i + ": " + "%.5f".format(time_needed(1, matches(EVIL1(i), "a" * i)))) //} //a bit bolder test for (i <- 1 to 1000 by 50) { - println(i + " " + "%.5f".format(time_needed(1, matches(EVIL(i), "a" * i)))) + println(i + " " + "%.5f".format(time_needed(1, matches(EVIL1(i), "a" * i)))) } +for (i <- 1 to 4002 by 500) { + println(i + " " + "%.5f".format(time_needed(4, matches(EVIL2, "a" * i)))) +} + + diff -r b7221df9662a -r 065ca01b62ae progs/re4.scala --- a/progs/re4.scala Mon Aug 22 09:14:22 2016 +0200 +++ b/progs/re4.scala Mon Aug 22 23:05:43 2016 +0200 @@ -85,18 +85,9 @@ } -def ders2 (s: List[Char], r: Rexp) : Rexp = (s, r) match { - case (Nil, r) => r - case (s, ZERO) => ZERO - case (s, ONE) => if (s == Nil) ONE else ZERO - case (s, CHAR(c)) => if (s == List(c)) ONE else - if (s == Nil) CHAR(c) else ZERO - case (s, ALT(r1, r2)) => ALT(ders2(s, r2), ders2(s, r2)) - case (c::s, r) => ders2(s, der(c, r).simp) -} // main matcher function -def matcher(r: Rexp, s: String) : Boolean = nullable(ders2(s.toList, r)) +def matcher(r: Rexp, s: String) : Boolean = nullable(ders(s.toList, r)) //one or zero @@ -119,7 +110,7 @@ // println(i + " " + "%.5f".format(time_needed(10, matcher(EVIL1(i), "a" * i)))) //} -for (i <- 1 to 6000000 by 500000) { +for (i <- 1 to 6000001 by 500000) { println(i + " " + "%.5f".format(time_needed(10, matcher(EVIL2, "a" * i)))) } diff -r b7221df9662a -r 065ca01b62ae progs/re5.scala --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/progs/re5.scala Mon Aug 22 23:05:43 2016 +0200 @@ -0,0 +1,118 @@ +import scala.annotation.tailrec +import scala.language.implicitConversions + +abstract class Rexp { + def simp : Rexp = this +} + +case object ZERO extends Rexp +case object ONE extends Rexp +case class CHAR(c: Char) extends Rexp +case class ALT(r1: Rexp, r2: Rexp) extends Rexp { + override def simp = (r1.simp, r2.simp) match { + case (ZERO, r) => r + case (r, ZERO) => r + case (r, ONE) => if (nullable(r)) r else ALT(r, ONE) + case (ONE, r) => if (nullable(r)) r else ALT(r, ONE) + case (r1, r2) => if (r1 == r2) r1 else ALT(r1, r2) + } +} +case class SEQ(r1: Rexp, r2: Rexp) extends Rexp { + override def simp = (r1.simp, r2.simp) match { + case (ZERO, _) => ZERO + case (_, ZERO) => ZERO + case (ONE, r) => r + case (r, ONE) => r + case (r1, r2) => SEQ(r1, r2) + } +} +case class STAR(r: Rexp) extends Rexp { + override def simp = r.simp match { + case ZERO => ONE + case ONE => ONE + case r => STAR(r) + } +} +case class NTIMES(r: Rexp, n: Int) extends Rexp { + override def simp = if (n == 0) ONE else + r.simp match { + case ZERO => ZERO + case ONE => ONE + case r => NTIMES(r, n) + } +} + +// some convenience for typing in regular expressions +def charlist2rexp(s : List[Char]) : Rexp = s match { + case Nil => ONE + case c::Nil => CHAR(c) + case c::s => SEQ(CHAR(c), charlist2rexp(s)) +} +implicit def string2rexp(s : String) : Rexp = charlist2rexp(s.toList) + + +// nullable function: tests whether the regular +// expression can recognise the empty string +def nullable (r: Rexp) : Boolean = r match { + case ZERO => false + case ONE => true + case CHAR(_) => false + case ALT(r1, r2) => nullable(r1) || nullable(r2) + case SEQ(r1, r2) => nullable(r1) && nullable(r2) + case STAR(_) => true + case NTIMES(r, i) => if (i == 0) true else nullable(r) +} + +// derivative of a regular expression w.r.t. a character +def der (c: Char, r: Rexp) : Rexp = r match { + case ZERO => ZERO + case ONE => ZERO + case CHAR(d) => if (c == d) ONE else ZERO + case ALT(r1, r2) => ALT(der(c, r1), der(c, r2)) + case SEQ(r1, r2) => + if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2)) + else SEQ(der(c, r1), r2) + case STAR(r) => SEQ(der(c, r), STAR(r)) + case NTIMES(r, i) => + if (i == 0) ZERO else SEQ(der(c, r), NTIMES(r, i - 1)) +} + +// derivative w.r.t. a string (iterates der) +def ders2(s: List[Char], r: Rexp) : Rexp = (s, r) match { + case (Nil, r) => r + case (s, ZERO) => ZERO + case (s, ONE) => if (s == Nil) ONE else ZERO + case (s, CHAR(c)) => if (s == List(c)) ONE else + if (s == Nil) CHAR(c) else ZERO + case (s, ALT(r1, r2)) => ALT(ders2(s, r2), ders2(s, r2)) + case (c::s, r) => ders2(s, der(c, r).simp) +} + +// main matcher function +def matcher(r: Rexp, s: String) : Boolean = nullable(ders2(s.toList, r)) + + +//one or zero +def OPT(r: Rexp) = ALT(r, ONE) + +def EVIL1(n: Int) = SEQ(NTIMES(OPT("a"), n), NTIMES("a", n)) +val EVIL2 = SEQ(STAR("a"), "b") + +def time_needed[T](i: Int, code: => T) = { + val start = System.nanoTime() + for (j <- 1 to i) code + val end = System.nanoTime() + (end - start)/(i * 1.0e9) +} + +val i = 5000 +println(i + " " + "%.5f".format(time_needed(10, matcher(EVIL1(i), "a" * i)))) + +for (i <- 1 to 80000 by 10000) { + println(i + " " + "%.5f".format(time_needed(2, matcher(EVIL1(i), "a" * i)))) +} + +for (i <- 1 to 10000001 by 1000000) { + println(i + " " + "%.5f".format(time_needed(2, matcher(EVIL2, "a" * i)))) +} +