diff -r c8ce95067c1a -r 5c1fbb39c93e handouts/ho02.tex --- a/handouts/ho02.tex Tue Mar 22 17:09:24 2016 +0000 +++ b/handouts/ho02.tex Wed Apr 06 11:51:33 2016 +0100 @@ -4,8 +4,10 @@ \usepackage{../graphics} \usepackage{../data} -\pgfplotsset{compat=1.11} + \begin{document} +\fnote{\copyright{} Christian Urban, King's College London, 2014, 2015, 2016} + \section*{Handout 2 (Regular Expression Matching)} @@ -14,16 +16,18 @@ than the matchers from regular expression libraries in Ruby and Python (the plots on the left). These plots show the running time for the evil regular expression -$a^?{}^{\{n\}}\cdot a^{\{n\}}$ and strings composed of $n$ \pcode{a}s. -We will use this regular expression and strings as running -example. To see the substantial differences in the two plots -below, note the different scales of the $x$-axes. +$a^?{}^{\{n\}}\cdot a^{\{n\}}$ and strings composed of $n$ +\pcode{a}s. We will use this regular expression and strings as +running example. To see the substantial differences in the two +plots below, note the different scales of the $x$-axes. \begin{center} \begin{tabular}{@{}cc@{}} \begin{tikzpicture} -\begin{axis}[xlabel={\pcode{a}s},ylabel={time in secs}, +\begin{axis}[ + xlabel={strings of {\tt a}s}, + ylabel={time in secs}, enlargelimits=false, xtick={0,5,...,30}, xmax=33, @@ -44,7 +48,9 @@ \end{tikzpicture} & \begin{tikzpicture} - \begin{axis}[xlabel={\pcode{a}s},ylabel={time in secs}, + \begin{axis}[ + xlabel={strings of \texttt{a}s}, + ylabel={time in secs}, enlargelimits=false, xtick={0,3000,...,12000}, xmax=12500, @@ -119,15 +125,15 @@ \noindent I leave it to you to verify these equivalences and non-equivalences. It is also interesting to look at some -corner cases involving $\epsilon$ and $\varnothing$: +corner cases involving $\ONE$ and $\ZERO$: \begin{center} \begin{tabular}{rcl} -$a \cdot \varnothing$ & $\not\equiv$ & $a$\\ -$a + \epsilon$ & $\not\equiv$ & $a$\\ -$\epsilon$ & $\equiv$ & $\varnothing^*$\\ -$\epsilon^*$ & $\equiv$ & $\epsilon$\\ -$\varnothing^*$ & $\not\equiv$ & $\varnothing$\\ +$a \cdot \ZERO$ & $\not\equiv$ & $a$\\ +$a + \ONE$ & $\not\equiv$ & $a$\\ +$\ONE$ & $\equiv$ & $\ZERO^*$\\ +$\ONE^*$ & $\equiv$ & $\ONE$\\ +$\ZERO^*$ & $\not\equiv$ & $\ZERO$\\ \end{tabular} \end{center} @@ -140,20 +146,20 @@ \begin{center} \begin{tabular}{rcl} -$r + \varnothing$ & $\equiv$ & $r$\\ -$\varnothing + r$ & $\equiv$ & $r$\\ -$r \cdot \epsilon$ & $\equiv$ & $r$\\ -$\epsilon \cdot r$ & $\equiv$ & $r$\\ -$r \cdot \varnothing$ & $\equiv$ & $\varnothing$\\ -$\varnothing \cdot r$ & $\equiv$ & $\varnothing$\\ +$r + \ZERO$ & $\equiv$ & $r$\\ +$\ZERO + r$ & $\equiv$ & $r$\\ +$r \cdot \ONE$ & $\equiv$ & $r$\\ +$\ONE \cdot r$ & $\equiv$ & $r$\\ +$r \cdot \ZERO$ & $\equiv$ & $\ZERO$\\ +$\ZERO \cdot r$ & $\equiv$ & $\ZERO$\\ $r + r$ & $\equiv$ & $r$ \end{tabular} \end{center} \noindent which always hold no matter what the regular expression $r$ looks like. The first two are easy to verify -since $L(\varnothing)$ is the empty set. The next two are also -easy to verify since $L(\epsilon) = \{[]\}$ and appending the +since $L(\ZERO)$ is the empty set. The next two are also +easy to verify since $L(\ONE) = \{[]\}$ and appending the empty string to every string of another set, leaves the set unchanged. Be careful to fully comprehend the fifth and sixth equivalence: if you concatenate two sets of strings and one is @@ -167,7 +173,7 @@ example the regular expression \begin{equation} -(r_1 + \varnothing) \cdot \epsilon + ((\epsilon + r_2) + r_3) \cdot (r_4 \cdot \varnothing) +(r_1 + \ZERO) \cdot \ONE + ((\ONE + r_2) + r_3) \cdot (r_4 \cdot \ZERO) \label{big} \end{equation} @@ -182,21 +188,21 @@ \begin{center} \begin{tabular}{ll} - & $(r_1 + \varnothing) \cdot \epsilon + ((\epsilon + r_2) + r_3) \cdot -(\underline{r_4 \cdot \varnothing})$\smallskip\\ -$\equiv$ & $(r_1 + \varnothing) \cdot \epsilon + \underline{((\epsilon + r_2) + r_3) \cdot -\varnothing}$\smallskip\\ -$\equiv$ & $\underline{(r_1 + \varnothing) \cdot \epsilon} + \varnothing$\smallskip\\ -$\equiv$ & $(\underline{r_1 + \varnothing}) + \varnothing$\smallskip\\ -$\equiv$ & $\underline{r_1 + \varnothing}$\smallskip\\ + & $(r_1 + \ZERO) \cdot \ONE + ((\ONE + r_2) + r_3) \cdot +(\underline{r_4 \cdot \ZERO})$\smallskip\\ +$\equiv$ & $(r_1 + \ZERO) \cdot \ONE + \underline{((\ONE + r_2) + r_3) \cdot +\ZERO}$\smallskip\\ +$\equiv$ & $\underline{(r_1 + \ZERO) \cdot \ONE} + \ZERO$\smallskip\\ +$\equiv$ & $(\underline{r_1 + \ZERO}) + \ZERO$\smallskip\\ +$\equiv$ & $\underline{r_1 + \ZERO}$\smallskip\\ $\equiv$ & $r_1$\ \end{tabular} \end{center} \noindent In each step, I underlined where a simplification rule is applied. Our matching algorithm in the next section -will often generate such ``useless'' $\epsilon$s and -$\varnothing$s, therefore simplifying them away will make the +will often generate such ``useless'' $\ONE$s and +$\ZERO$s, therefore simplifying them away will make the algorithm quite a bit faster. \subsection*{The Matching Algorithm} @@ -209,8 +215,8 @@ \begin{center} \begin{tabular}{@ {}l@ {\hspace{2mm}}c@ {\hspace{2mm}}l@ {}} -$nullable(\varnothing)$ & $\dn$ & $\textit{false}$\\ -$nullable(\epsilon)$ & $\dn$ & $true$\\ +$nullable(\ZERO)$ & $\dn$ & $\textit{false}$\\ +$nullable(\ONE)$ & $\dn$ & $true$\\ $nullable(c)$ & $\dn$ & $\textit{false}$\\ $nullable(r_1 + r_2)$ & $\dn$ & $nullable(r_1) \vee nullable(r_2)$\\ $nullable(r_1 \cdot r_2)$ & $\dn$ & $nullable(r_1) \wedge nullable(r_2)$\\ @@ -243,9 +249,9 @@ \begin{center} \begin{tabular}{l@ {\hspace{2mm}}c@ {\hspace{2mm}}l} - $der\, c\, (\varnothing)$ & $\dn$ & $\varnothing$\\ - $der\, c\, (\epsilon)$ & $\dn$ & $\varnothing$ \\ - $der\, c\, (d)$ & $\dn$ & if $c = d$ then $\epsilon$ else $\varnothing$\\ + $der\, c\, (\ZERO)$ & $\dn$ & $\ZERO$\\ + $der\, c\, (\ONE)$ & $\dn$ & $\ZERO$ \\ + $der\, c\, (d)$ & $\dn$ & if $c = d$ then $\ONE$ else $\ZERO$\\ $der\, c\, (r_1 + r_2)$ & $\dn$ & $der\, c\, r_1 + der\, c\, r_2$\\ $der\, c\, (r_1 \cdot r_2)$ & $\dn$ & if $nullable (r_1)$\\ & & then $(der\,c\,r_1) \cdot r_2 + der\, c\, r_2$\\ @@ -258,14 +264,14 @@ follows: recall that $der$ should calculate a regular expression so that given the ``input'' regular expression can match a string of the form $c\!::\!s$, we want a regular -expression for $s$. Since neither $\varnothing$ nor $\epsilon$ +expression for $s$. Since neither $\ZERO$ nor $\ONE$ can match a string of the form $c\!::\!s$, we return -$\varnothing$. In the third case we have to make a +$\ZERO$. In the third case we have to make a case-distinction: In case the regular expression is $c$, then clearly it can recognise a string of the form $c\!::\!s$, just that $s$ is the empty string. Therefore we return the -$\epsilon$-regular expression. In the other case we again -return $\varnothing$ since no string of the $c\!::\!s$ can be +$\ONE$-regular expression. In the other case we again +return $\ZERO$ since no string of the $c\!::\!s$ can be matched. Next come the recursive cases, which are a bit more involved. Fortunately, the $+$-case is still relatively straightforward: all strings of the form $c\!::\!s$ are either @@ -292,9 +298,9 @@ the definition of $der$ by considering the following operation on sets: -\[ +\begin{equation}\label{Der} Der\,c\,A\;\dn\;\{s\,|\,c\!::\!s \in A\} -\] +\end{equation} \noindent This operation essentially transforms a set of strings $A$ by filtering out all strings that do not start @@ -406,7 +412,7 @@ \begin{figure}[p] \lstinputlisting{../progs/app5.scala} \caption{Scala implementation of the nullable and - derivatives functions. These functions are easy to + derivative functions. These functions are easy to implement in functional languages, because pattern matching and recursion allow us to mimic the mathematical definitions very closely.\label{scala1}} @@ -460,7 +466,7 @@ \noindent Our algorithm traverses such regular expressions at least once every time a derivative is calculated. So having large regular expressions will cause problems. This problem -is aggravated by $a^?$ being represented as $a + \epsilon$. +is aggravated by $a^?$ being represented as $a + \ONE$. We can however fix this by having an explicit constructor for $r^{\{n\}}$. In Scala we would introduce a constructor like @@ -508,14 +514,14 @@ need to traverse the whole regular expression. There seems, however, one more issue for making the algorithm run faster. The derivative function often produces ``useless'' -$\varnothing$s and $\epsilon$s. To see this, consider $r = ((a +$\ZERO$s and $\ONE$s. To see this, consider $r = ((a \cdot b) + b)^*$ and the following two derivatives \begin{center} \begin{tabular}{l} -$der\,a\,r = ((\epsilon \cdot b) + \varnothing) \cdot r$\\ -$der\,b\,r = ((\varnothing \cdot b) + \epsilon)\cdot r$\\ -$der\,c\,r = ((\varnothing \cdot b) + \varnothing)\cdot r$ +$der\,a\,r = ((\ONE \cdot b) + \ZERO) \cdot r$\\ +$der\,b\,r = ((\ZERO \cdot b) + \ONE)\cdot r$\\ +$der\,c\,r = ((\ZERO \cdot b) + \ZERO)\cdot r$ \end{tabular} \end{center} @@ -528,7 +534,7 @@ \begin{tabular}{l} $der\,a\,r \equiv b \cdot r$\\ $der\,b\,r \equiv r$\\ -$der\,c\,r \equiv \varnothing$ +$der\,c\,r \equiv \ZERO$ \end{tabular} \end{center} @@ -553,7 +559,7 @@ calls \texttt{der} first, but then simplifies the resulting derivative regular expressions before building the next derivative, see -Line~28.\label{scala2}} +Line~\ref{simpline}.\label{scala2}} \end{figure} \begin{center} @@ -590,8 +596,8 @@ \begin{center} \begin{tabular}{r@{\hspace{1mm}}r@{\hspace{1mm}}l@{\hspace{13mm}}l} - $r$ & $::=$ & $\varnothing$ & null\\ - & $\mid$ & $\epsilon$ & empty string / \texttt{""} / []\\ + $r$ & $::=$ & $\ZERO$ & null language\\ + & $\mid$ & $\ONE$ & empty string / \texttt{""} / []\\ & $\mid$ & $c$ & single character\\ & $\mid$ & $r_1 + r_2$ & alternative / choice\\ & $\mid$ & $r_1 \cdot r_2$ & sequence\\ @@ -604,7 +610,7 @@ the recipe: \begin{itemize} -\item $P$ has to hold for $\varnothing$, $\epsilon$ and $c$ +\item $P$ has to hold for $\ZERO$, $\ONE$ and $c$ (these are the base cases). \item $P$ has to hold for $r_1 + r_2$ under the assumption that $P$ already holds for $r_1$ and $r_2$. @@ -625,21 +631,21 @@ \noindent Let us say that this property is $P(r)$, then the first case -we need to check is whether $P(\varnothing)$ (see recipe +we need to check is whether $P(\ZERO)$ (see recipe above). So we have to show that \[ -nullable(\varnothing) \;\;\text{if and only if}\;\; -[]\in L(\varnothing) +nullable(\ZERO) \;\;\text{if and only if}\;\; +[]\in L(\ZERO) \] -\noindent whereby $nullable(\varnothing)$ is by definition of +\noindent whereby $nullable(\ZERO)$ is by definition of the function $nullable$ always $\textit{false}$. We also have -that $L(\varnothing)$ is by definition $\{\}$. It is +that $L(\ZERO)$ is by definition $\{\}$. It is impossible that the empty string $[]$ is in the empty set. Therefore also the right-hand side is false. Consequently we verified this case: both sides are false. We would still need -to do this for $P(\varepsilon)$ and $P(c)$. I leave this to +to do this for $P(\ONE)$ and $P(c)$. I leave this to you to verify. Next we need to check the inductive cases, for example @@ -718,9 +724,16 @@ \label{dersprop} \end{equation} -\noindent by induction on $s$. In this proof you can assume -the following property for $der$ and $Der$ has already been -proved, that is you can assume +\noindent by induction on $s$. Recall $Der$ is defined for +character---see \eqref{Der}; $Ders$ is similar, but for strings: + +\[ +Ders\,s\,A\;\dn\;\{s'\,|\,s @ s' \in A\} +\] + +\noindent In this proof you can assume the following property +for $der$ and $Der$ has already been proved, that is you can +assume \[ L(der\,c\,r) = Der\,c\,(L(r))