# HG changeset patch # User Christian Urban # Date 1381569158 -3600 # Node ID 1be892087df2f577a1536563e5458cbc90e2cd58 # Parent 6e7c3db9023d2a70ca219a963f817df46770d4d5 added diff -r 6e7c3db9023d -r 1be892087df2 handouts/ho02.pdf Binary file handouts/ho02.pdf has changed diff -r 6e7c3db9023d -r 1be892087df2 handouts/ho02.tex --- a/handouts/ho02.tex Sat Oct 12 09:13:42 2013 +0100 +++ b/handouts/ho02.tex Sat Oct 12 10:12:38 2013 +0100 @@ -57,11 +57,12 @@ \] \noindent -Clearly we cannot use the function $L$ directly in order to solve this problem, because in general -the set of strings $L$ returns is infinite (recall what $L(a^*)$ is). In such cases there is no algorithm -then can test exhaustively, whether a string is member of this set. +we can look at an algorithm to solve this problem. +Clearly we cannot use the function $L$ directly for this, because in general +the set of strings $L$ returns is infinite (recall what $L(a^*)$ is). In such cases there is no way we can implement +an exhaustive test for whether a string is member of this set or not. -The algorithm we define below consists of two parts. One is the function $nullable$ which takes a +The algorithm we will define below consists of two parts. One is the function $nullable$ which takes a regular expression as argument and decides whether it can match the empty string (this means it returns a boolean). This can be easily defined recursively as follows: @@ -84,11 +85,11 @@ \] \noindent -On the left-hand side we have a function we can implement; on the right we have its specification. +Note on the left-hand side we have a function we can implement; on the right we have its specification. -The other function is calculating a \emph{derivative} of a regular expression. This is a function +The other function of our matching algorithm calculates a \emph{derivative} of a regular expression. This is a function which will take a regular expression, say $r$, and a character, say $c$, as argument and return -a new regular expression. Beware that the intuition behind this function is not so easy to grasp on first +a new regular expression. Be careful that the intuition behind this function is not so easy to grasp on first reading. Essentially this function solves the following problem: if $r$ can match a string of the form $c\!::\!s$, what does the regular expression look like that can match just $s$. The definition of this function is as follows: @@ -133,7 +134,7 @@ \noindent which essentially transforms a set of strings $A$ by filtering out all strings that do not start with $c$ and then -strip off the $c$ from all the remaining strings. For example suppose $A = \{"f\!oo", "bar", "f\!rak"\}$ then +strips off the $c$ from all the remaining strings. For example suppose $A = \{"f\!oo", "bar", "f\!rak"\}$ then \[ Der\,f\,A = \{"oo", "rak"\}\quad,\quad Der\,b\,A = \{"ar"\} \quad \text{and} \quad @@ -150,10 +151,21 @@ \noindent This property clarifies what regular expression $der$ calculates, namely take the set of strings -that $r$ can match ($L(r)$), filter out all strings not starting with $c$ and strip off the $c$ from the +that $r$ can match (that is $L(r)$), filter out all strings not starting with $c$ and strip off the $c$ from the remaining strings---this is exactly the language that $der\,c\,r$ can match. -For our matching algorithm we need to lift the notion of derivatives from characters to strings. This can be +If we want to find out whether the string $"abc"$ is matched by the regular expression $r$ +then we can iteratively apply $Der$ as follows + +\begin{enumerate} +\item $Der\,a\,(L(r))$ +\item $Der\,b\,(Der\,a\,(L(r)))$ +\item $Der\,c\,(Der\,b\,(Der\,a\,(L(r))))$ +\end{enumerate} + +\noindent +In the last step we need to test whether the empty string is in the set. Our matching algorithm will work similarly, +just using regular expression instead of sets. For this we need to lift the notion of derivatives from characters to strings. This can be done using the following function, taking a string and regular expression as input and a regular expression as output. diff -r 6e7c3db9023d -r 1be892087df2 handouts/ho03.pdf Binary file handouts/ho03.pdf has changed diff -r 6e7c3db9023d -r 1be892087df2 handouts/ho03.tex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/handouts/ho03.tex Sat Oct 12 10:12:38 2013 +0100 @@ -0,0 +1,197 @@ +\documentclass{article} +\usepackage{charter} +\usepackage{hyperref} +\usepackage{amssymb} +\usepackage{amsmath} +\usepackage[T1]{fontenc} +\usepackage{listings} +\usepackage{xcolor} + +\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}% + +\definecolor{javared}{rgb}{0.6,0,0} % for strings +\definecolor{javagreen}{rgb}{0.25,0.5,0.35} % comments +\definecolor{javapurple}{rgb}{0.5,0,0.35} % keywords +\definecolor{javadocblue}{rgb}{0.25,0.35,0.75} % javadoc + +\lstdefinelanguage{scala}{ + morekeywords={abstract,case,catch,class,def,% + do,else,extends,false,final,finally,% + for,if,implicit,import,match,mixin,% + new,null,object,override,package,% + private,protected,requires,return,sealed,% + super,this,throw,trait,true,try,% + type,val,var,while,with,yield}, + otherkeywords={=>,<-,<\%,<:,>:,\#,@}, + sensitive=true, + morecomment=[l]{//}, + morecomment=[n]{/*}{*/}, + morestring=[b]", + morestring=[b]', + morestring=[b]""" +} + +\lstset{language=Scala, + basicstyle=\ttfamily, + keywordstyle=\color{javapurple}\bfseries, + stringstyle=\color{javagreen}, + commentstyle=\color{javagreen}, + morecomment=[s][\color{javadocblue}]{/**}{*/}, + numbers=left, + numberstyle=\tiny\color{black}, + stepnumber=1, + numbersep=10pt, + tabsize=2, + showspaces=false, + showstringspaces=false} + +\begin{document} + +\section*{Handout 2} + +Having specified what problem our matching algorithm, $match$, is supposed to solve, namely +for a given regular expression $r$ and string $s$ answer $true$ if and only if + +\[ +s \in L(r) +\] + +\noindent +Clearly we cannot use the function $L$ directly in order to solve this problem, because in general +the set of strings $L$ returns is infinite (recall what $L(a^*)$ is). In such cases there is no algorithm +then can test exhaustively, whether a string is member of this set. + +The algorithm we define below consists of two parts. One is the function $nullable$ which takes a +regular expression as argument and decides whether it can match the empty string (this means it returns a +boolean). This can be easily defined recursively as follows: + +\begin{center} +\begin{tabular}{@ {}l@ {\hspace{2mm}}c@ {\hspace{2mm}}l@ {}} +$nullable(\varnothing)$ & $\dn$ & $f\!\/alse$\\ +$nullable(\epsilon)$ & $\dn$ & $true$\\ +$nullable (c)$ & $\dn$ & $f\!alse$\\ +$nullable (r_1 + r_2)$ & $\dn$ & $nullable(r_1) \vee nullable(r_2)$\\ +$nullable (r_1 \cdot r_2)$ & $\dn$ & $nullable(r_1) \wedge nullable(r_2)$\\ +$nullable (r^*)$ & $\dn$ & $true$ \\ +\end{tabular} +\end{center} + +\noindent +The idea behind this function is that the following property holds: + +\[ +nullable(r) \;\;\text{if and only if}\;\; ""\in L(r) +\] + +\noindent +On the left-hand side we have a function we can implement; on the right we have its specification. + +The other function is calculating a \emph{derivative} of a regular expression. This is a function +which will take a regular expression, say $r$, and a character, say $c$, as argument and return +a new regular expression. Beware that the intuition behind this function is not so easy to grasp on first +reading. Essentially this function solves the following problem: if $r$ can match a string of the form +$c\!::\!s$, what does the regular expression look like that can match just $s$. The definition of this +function is as follows: + +\begin{center} +\begin{tabular}{@ {}l@ {\hspace{2mm}}c@ {\hspace{2mm}}l@ {\hspace{-10mm}}l@ {}} + $der\, c\, (\varnothing)$ & $\dn$ & $\varnothing$ & \\ + $der\, c\, (\epsilon)$ & $\dn$ & $\varnothing$ & \\ + $der\, c\, (d)$ & $\dn$ & if $c = d$ then $\epsilon$ else $\varnothing$ & \\ + $der\, c\, (r_1 + r_2)$ & $\dn$ & $der\, c\, r_1 + der\, c\, r_2$ & \\ + $der\, c\, (r_1 \cdot r_2)$ & $\dn$ & if $nullable (r_1)$\\ + & & then $(der\,c\,r_1) \cdot r_2 + der\, c\, r_2$\\ + & & else $(der\, c\, r_1) \cdot r_2$\\ + $der\, c\, (r^*)$ & $\dn$ & $(der\,c\,r) \cdot (r^*)$ & + \end{tabular} +\end{center} + +\noindent +The first two clauses can be rationalised as follows: recall that $der$ should calculate a regular +expression, if the ``input'' regular expression can match a string of the form $c\!::\!s$. Since neither +$\varnothing$ nor $\epsilon$ can match such a string we return $\varnothing$. In the third case +we have to make a case-distinction: In case the regular expression is $c$, then clearly it can recognise +a string of the form $c\!::\!s$, just that $s$ is the empty string. Therefore we return the $\epsilon$-regular +expression. In the other case we again return $\varnothing$ since no string of the $c\!::\!s$ can be matched. +The $+$-case is relatively straightforward: all strings of the form $c\!::\!s$ are either matched by the +regular expression $r_1$ or $r_2$. So we just have to recursively call $der$ with these two regular +expressions and compose the results again with $+$. The $\cdot$-case is more complicated: +if $r_1\cdot r_2$ matches a string of the form $c\!::\!s$, then the first part must be matched by $r_1$. +Consequently, it makes sense to construct the regular expression for $s$ by calling $der$ with $r_1$ and +``appending'' $r_2$. There is however one exception to this simple rule: if $r_1$ can match the empty +string, then all of $c\!::\!s$ is matched by $r_2$. So in case $r_1$ is nullable (that is can match the +empty string) we have to allow the choice $der\,c\,r_2$ for calculating the regular expression that can match +$s$. The $*$-case is again simple: if $r^*$ matches a string of the form $c\!::\!s$, then the first part must be +``matched'' by a single copy of $r$. Therefore we call recursively $der\,c\,r$ and ``append'' $r^*$ in order to +match the rest of $s$. + +Another way to rationalise the definition of $der$ is to consider the following operation on sets: + +\[ +Der\,c\,A\;\dn\;\{s\,|\,c\!::\!s \in A\} +\] + +\noindent +which essentially transforms a set of strings $A$ by filtering out all strings that do not start with $c$ and then +strip off the $c$ from all the remaining strings. For example suppose $A = \{"f\!oo", "bar", "f\!rak"\}$ then +\[ +Der\,f\,A = \{"oo", "rak"\}\quad,\quad +Der\,b\,A = \{"ar"\} \quad \text{and} \quad +Der\,a\,A = \varnothing +\] + +\noindent +Note that in the last case $Der$ is empty, because no string in $A$ starts with $a$. With this operation we can +state the following property about $der$: + +\[ +L(der\,c\,r) = Der\,c\,(L(r)) +\] + +\noindent +This property clarifies what regular expression $der$ calculates, namely take the set of strings +that $r$ can match ($L(r)$), filter out all strings not starting with $c$ and strip off the $c$ from the +remaining strings---this is exactly the language that $der\,c\,r$ can match. + +For our matching algorithm we need to lift the notion of derivatives from characters to strings. This can be +done using the following function, taking a string and regular expression as input and a regular expression +as output. + +\begin{center} +\begin{tabular}{@ {}l@ {\hspace{2mm}}c@ {\hspace{2mm}}l@ {\hspace{-10mm}}l@ {}} + $der\!s\, []\, r$ & $\dn$ & $r$ & \\ + $der\!s\, (c\!::\!s)\, r$ & $\dn$ & $der\!s\,s\,(der\,c\,r)$ & \\ + \end{tabular} +\end{center} + +\noindent +Having $ders$ in place, we can finally define our matching algorithm: + +\[ +match\,s\,r = nullable(ders\,s\,r) +\] + +\noindent +We claim that + +\[ +match\,s\,r\quad\text{if and only if}\quad s\in L(r) +\] + +\noindent +holds, which means our algorithm satisfies the specification. This algorithm was introduced by +Janus Brzozowski in 1964. Its main attractions are simplicity and being fast, as well as +being easily extendable for other regular expressions such as $r^{\{n\}}$, $r^?$, $\sim{}r$ and so on. + +\begin{figure}[p] +{\lstset{language=Scala}\texttt{\lstinputlisting{../progs/app5.scala}}} +{\lstset{language=Scala}\texttt{\lstinputlisting{../progs/app6.scala}}} +\caption{Scala implementation of the nullable and derivatives functions.} +\end{figure} + +\end{document} + +%%% Local Variables: +%%% mode: latex +%%% TeX-master: t +%%% End: