# HG changeset patch # User Christian Urban # Date 1459939893 -3600 # Node ID 5c1fbb39c93e0bfdc3d00c5cb4982335091a1a5d # Parent c8ce95067c1a3ae1f14c5113d67d5cb2a0420849 updated diff -r c8ce95067c1a -r 5c1fbb39c93e graphics.sty --- a/graphics.sty Tue Mar 22 17:09:24 2016 +0000 +++ b/graphics.sty Wed Apr 06 11:51:33 2016 +0100 @@ -10,6 +10,7 @@ \usepackage{graphicx} \usepackage{pgfplots} +\pgfplotsset{compat=1.11} \newenvironment{bubble}[1][]{% \addtolength{\leftmargini}{4mm}% diff -r c8ce95067c1a -r 5c1fbb39c93e handouts/ho01.pdf Binary file handouts/ho01.pdf has changed diff -r c8ce95067c1a -r 5c1fbb39c93e handouts/ho01.tex --- a/handouts/ho01.tex Tue Mar 22 17:09:24 2016 +0000 +++ b/handouts/ho01.tex Wed Apr 06 11:51:33 2016 +0100 @@ -10,7 +10,7 @@ %%https://www.reddit.com/r/ProgrammingLanguages/comments/43wlkq/formal_grammar_for_csh_tsch_sh_or_bash/ \begin{document} -\fnote{\copyright{} Christian Urban, 2014, 2015} +\fnote{\copyright{} Christian Urban, King's College London, 2014, 2015, 2016} \section*{Handout 1} @@ -26,10 +26,10 @@ followed by zero or more letters, numbers and underscores. Also often we face the problem that we are given a string (for example some user input) and want to know whether it matches a -particular pattern. In this way we can, for example, exclude -user input that would otherwise have nasty effects on our -program (crashing it or making it go into an infinite loop, if -not worse). +particular pattern---be it an email address, for example. In +this way we can exclude user input that would otherwise have +nasty effects on our program (crashing it or making it go into +an infinite loop, if not worse). \defn{Regular expressions} help with conveniently specifying such patterns. The idea behind regular expressions is that @@ -38,8 +38,8 @@ computer science. For example there is no convenient regular expression for describing the English language short of enumerating all English words. But they seem useful for -describing for example email addresses.\footnote{See ``8 -Regular Expressions You Should Know'' +describing for example simple email addresses.\footnote{See +``8 Regular Expressions You Should Know'' \url{http://goo.gl/5LoVX7}} Consider the following regular expression @@ -67,13 +67,19 @@ \noindent -But for example the following two do not: +But for example the following two do not \begin{lstlisting}[language={},numbers=none,keywordstyle=\color{black}] user@localserver disposable.style.email.with+symbol@example.com \end{lstlisting} +\noindent according to the regular expression we specified in +\eqref{email}. Whether this is intended or not is a different +question (the second email above is actually an acceptable +email address according to the RFC 5322 standard for email +addresses).
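If you want to check strings like the ones above yourself, a quick way is Scala's regular expression support. The pattern below is the one the email harvester in Figure~\ref{crawler3} uses; the value name and the example addresses are only for illustration.

\begin{lstlisting}[numbers=none]
val email_regex = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})"""

// matches() tests whether the whole string fits the pattern
println("christian.urban@kcl.ac.uk".matches(email_regex))   // true
println("user@localserver".matches(email_regex))             // false: no dot-part after the @
println("disposable.style.email.with+symbol@example.com"
           .matches(email_regex))                             // false: the + is not allowed
\end{lstlisting}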
+ As mentioned above, identifiers, or variables, in program code are often required to satisfy the constraints that they start with a letter and then can be followed by zero or more letters @@ -161,7 +167,9 @@ \begin{center} \begin{tikzpicture} -\begin{axis}[xlabel={\pcode{a}s},ylabel={time in secs}, +\begin{axis}[ + xlabel={strings of {\tt a}s}, + ylabel={time in secs}, enlargelimits=false, xtick={0,5,...,30}, xmax=33, @@ -216,7 +224,9 @@ \begin{center} \begin{tikzpicture} - \begin{axis}[xlabel={\pcode{a}s},ylabel={time in secs}, + \begin{axis}[ + xlabel={strings of \pcode{a}s}, + ylabel={time in secs}, enlargelimits=false, xtick={0,3000,...,12000}, xmax=12500, @@ -234,7 +244,7 @@ \subsection*{Basic Regular Expressions} -The regular expressions shown above for Scala, we +The regular expressions shown earlier for Scala, we will call \emph{extended regular expressions}. The ones we will mainly study in this module are \emph{basic regular expressions}, which by convention we will just call @@ -246,8 +256,8 @@ \begin{center} \begin{tabular}{r@{\hspace{1mm}}r@{\hspace{1mm}}l@{\hspace{13mm}}l} - $r$ & $::=$ & $\varnothing$ & null\\ - & $\mid$ & $\epsilon$ & empty string / \texttt{""} / []\\ + $r$ & $::=$ & $\ZERO$ & null language\\ + & $\mid$ & $\ONE$ & empty string / \texttt{""} / []\\ & $\mid$ & $c$ & single character\\ & $\mid$ & $r_1 + r_2$ & alternative / choice\\ & $\mid$ & $r_1 \cdot r_2$ & sequence\\ @@ -257,37 +267,36 @@ \noindent Because we overload our notation, there are some subtleties you should be aware of. When regular expressions -are referred to then $\varnothing$ does not stand for the -empty set: rather it is a particular pattern that does not -match any string. Similarly, in the context of regular -expressions, $\epsilon$ does not stand for the empty string -(as in many places in the literature) but for a regular -expression that matches the empty string. The letter $c$ -stands for any character from the alphabet at hand. Again in -the context of regular expressions, it is a particular pattern -that can match the specified character. You should also be -careful with our overloading of the star: assuming you have -read the handout about our basic mathematical notation, you -will see that in the context of languages (sets of strings) -the star stands for an operation on languages. Here $r^*$ -stands for a regular expression, which is different from the -operation on sets is defined as +are referred to then $\ZERO$ (in bold font) does not stand for +the number zero: rather it is a particular pattern that does +not match any string. Similarly, in the context of regular +expressions, $\ONE$ does not stand for the number one but for +a regular expression that matches the empty string. The letter +$c$ stands for any character from the alphabet at hand. Again +in the context of regular expressions, it is a particular +pattern that can match the specified character. You should +also be careful with our overloading of the star: assuming you +have read the handout about our basic mathematical notation, +you will see that in the context of languages (sets of +strings) the star stands for an operation on languages. 
Here +$r^*$ stands for a regular expression, which is different from +the operation on sets; the latter is defined as \[ -A^* \dn \bigcup_{0\le n} A^n +A\star \dn \bigcup_{0\le n} A^n \] \noindent Note that this expands to \[ -A^* \dn A^0 \cup A^1 \cup A^2 \cup A^3 \cup A^4 \cup \ldots +A\star \dn A^0 \cup A^1 \cup A^2 \cup A^3 \cup A^4 \cup \ldots \] \noindent which is equivalent to \[ -A^* \dn \{[]\} \cup A \cup A@A \cup A@A@A \cup A@A@A@A \cup \ldots +A\star \dn \{[]\} \cup A \cup A@A \cup A@A@A \cup A@A@A@A \cup \ldots \] \noindent @@ -309,9 +318,12 @@ in case of $+$ and $\cdot$ we actually do not care about the order and just write $r_1 + r_2 + r_3$, or $r_1 \cdot r_2 \cdot r_3$, respectively. The reasons for this will become -clear shortly. In the literature you will often find that the -choice $r_1 + r_2$ is written as $r_1\mid{}r_2$ or -$r_1\mid\mid{}r_2$. Also following the convention in the +clear shortly. + +In the literature you will often find that the choice $r_1 + +r_2$ is written as $r_1\mid{}r_2$ or $r_1\mid\mid{}r_2$. Also, +often our $\ZERO$ and $\ONE$ are written $\varnothing$ and +$\epsilon$, respectively. Following the convention in the literature, we will often omit the $\cdot$ altogether. This is to make some concrete regular expressions more readable. For example the regular expression for email addresses shown @@ -342,8 +354,8 @@ \begin{center} \begin{tabular}{rcl} -$\varnothing$ & $\mapsto$ & \texttt{NULL}\\ -$\epsilon$ & $\mapsto$ & \texttt{EMPTY}\\ +$\ZERO$ & $\mapsto$ & \texttt{ZERO}\\ +$\ONE$ & $\mapsto$ & \texttt{ONE}\\ $c$ & $\mapsto$ & \texttt{CHAR(c)}\\ $r_1 + r_2$ & $\mapsto$ & \texttt{ALT(r1, r2)}\\ $r_1 \cdot r_2$ & $\mapsto$ & \texttt{SEQ(r1, r2)}\\ @@ -362,7 +374,7 @@ \begin{center} \begin{tabular}{rcl} $r+$ & $\mapsto$ & $r\cdot r^*$\\ -$r?$ & $\mapsto$ & $\epsilon + r$\\ +$r?$ & $\mapsto$ & $\ONE + r$\\ $\backslash d$ & $\mapsto$ & $0 + 1 + 2 + \ldots + 9$\\ $[\text{\it a - z}]$ & $\mapsto$ & $a + b + \ldots + z$\\ \end{tabular} @@ -388,13 +400,13 @@ is defined as follows \begin{center} -\begin{tabular}{rcl} -$L(\varnothing)$ & $\dn$ & $\{\}$\\ -$L(\epsilon)$ & $\dn$ & $\{[]\}$\\ -$L(c)$ & $\dn$ & $\{[c]\}$\\ -$L(r_1+ r_2)$ & $\dn$ & $L(r_1) \cup L(r_2)$\\ +\begin{tabular}{rcll} +$L(\ZERO)$ & $\dn$ & $\{\}$\\ +$L(\ONE)$ & $\dn$ & $\{[]\}$\\ +$L(c)$ & $\dn$ & $\{"c"\}$ & or equivalently $\dn \{[c]\}$\\ +$L(r_1+ r_2)$ & $\dn$ & $L(r_1) \cup L(r_2)$\\ $L(r_1 \cdot r_2)$ & $\dn$ & $L(r_1) \,@\, L(r_2)$\\ -$L(r^*)$ & $\dn$ & $(L(r))^*$\\ +$L(r^*)$ & $\dn$ & $(L(r))\star$\\ \end{tabular} \end{center} @@ -443,9 +455,9 @@ which can match the strings $a$ and $b$. But also the regular expression $b + a$ would match the same strings. However, sometimes it is not so obvious whether two regular expressions -match the same strings: for example do $r^*$ and $\epsilon + r -\cdot r^*$ match the same strings? What about $\varnothing^*$ -and $\epsilon^*$? This suggests the following relation between +match the same strings: for example do $r^*$ and $\ONE + r +\cdot r^*$ match the same strings? What about $\ZERO^*$ +and $\ONE^*$? This suggests the following relation between \defn{equivalent regular expressions}: \[ @@ -460,12 +472,13 @@ whether \[ -\varnothing^* \equiv \epsilon^* +\ZERO^* \equiv \ONE^* \] -\noindent holds? Such equivalences will be important for our -matching algorithm, because we can use them to simplify -regular expressions. +\noindent holds or not? 
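One way to settle such questions is to unfold the definition of $L$ on both sides and compare the resulting sets. For the easy example from further above, $a + b \equiv b + a$ holds because of the one-line calculation

\[
L(a + b) \;=\; L(a) \cup L(b) \;=\; \{"a"\} \cup \{"b"\} \;=\; L(b) \cup L(a) \;=\; L(b + a)
\]

\noindent The questions above I leave to you.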
Such equivalences will be important +for our matching algorithm, because we can use them to +simplify regular expressions, which will mean we can speed +up the calculations. \subsection*{My Fascination for Regular Expressions} @@ -475,15 +488,15 @@ found out about them. I even have the vague recollection that I did not quite understand them during my study. If I remember correctly,\footnote{That was really a long time ago.} I got -utterly confused about $\epsilon$ and the empty -string.\footnote{Obviously the lecturer must have been bad.} -Since my study, I have used regular expressions for -implementing lexers and parsers as I have always been -interested in all kinds of programming languages and -compilers, which invariably need regular expression in some -form or shape. +utterly confused about $\ONE$ (which my lecturer wrote as +$\epsilon$) and the empty string.\footnote{Obviously the +lecturer must have been bad.} Since my study, I have used +regular expressions for implementing lexers and parsers as I +have always been interested in all kinds of programming +languages and compilers, which invariably need regular +expressions in some form or shape. -To understand my fascination nowadays with regular +To understand my fascination \emph{nowadays} with regular expressions, you need to know that my main scientific interest for the last 14 years has been with theorem provers. I am a core developer of one of @@ -533,16 +546,18 @@ What a feeling if you are an outsider to the subject! To conclude: Despite my early ignorance about regular -expressions, I find them now quite interesting. They have a -beautiful mathematical theory behind them. They have practical -importance (remember the shocking runtime of the regular -expression matchers in Python and Ruby in some instances). -People who are not very familiar with the mathematical -background of regular expressions get them consistently wrong. -The hope is that we can do better in the future---for example -by proving that the algorithms actually satisfy their -specification and that the corresponding implementations do -not contain any bugs. We are close, but not yet quite there. +expressions, I find them now very interesting. They have a +beautiful mathematical theory behind them, which can +sometimes be quite deep and contain hidden snares. They have +practical importance (remember the shocking runtime of the +regular expression matchers in Python and Ruby in some +instances). People who are not very familiar with the +mathematical background of regular expressions get them +consistently wrong. The hope is that we can do better in the +future---for example by proving that the algorithms actually +satisfy their specification and that the corresponding +implementations do not contain any bugs. We are close, but not +yet quite there. Notwithstanding my fascination, I am also happy to admit that regular expressions have their shortcomings. There are some well-known @@ -571,18 +586,28 @@ would not know---the only thing I can say to this regular expression is it is a monstrosity. However, this might actually be an argument against the RFC standard, rather than -against regular expressions. Still it is good to know that -some tasks in text processing just cannot be achieved by using -regular expressions. +against regular expressions. A similar argument is made in + +\begin{center} +\url{https://elliot.land/validating-an-email-address} +\end{center} + + +\noindent which explains some of the crazier parts of email +addresses. 
Still it is good to know that some tasks in text +processing just cannot be achieved by using regular +expressions. \begin{figure}[p] \lstinputlisting{../progs/crawler1.scala} + \caption{The Scala code for a simple web-crawler that checks for broken links in a web-page. It uses the regular expression -\texttt{http\_pattern} in Line~15 for recognising URL-addresses. -It finds all links using the library function -\texttt{findAllIn} in Line~21.\label{crawler1}} +\texttt{http\_pattern} in Line~\ref{httpline} for recognising +URL-addresses. It finds all links using the library function +\texttt{findAllIn} in Line~\ref{findallline}.\label{crawler1}} + \end{figure} @@ -592,9 +617,11 @@ \caption{A version of the web-crawler that only follows links in ``my'' domain---since these are the ones I am interested in fixing. It uses the regular expression \texttt{my\_urls} in -Line~16 to check for my name in the links. The main change is -in Lines~24--28 where there is a test whether URL is in ``my'' -domain or not.\label{crawler2}} +Line~\ref{myurlline} to check for my name in the links. The +main change is in +Lines~\ref{changestartline}--\ref{changeendline} where there +is a test whether the URL is in ``my'' domain or +not.\label{crawler2}} \end{figure} @@ -604,9 +631,10 @@ \caption{A small email harvester---whenever we download a web-page, we also check whether it contains any email addresses. For this we use the regular expression -\texttt{email\_pattern} in Line~15. The main change is in Line -30 where all email addresses that can be found in a page are -printed.\label{crawler3}} +\texttt{email\_pattern} in Line~\ref{emailline}. The main +change is in Line~\ref{mainline} where all email addresses +that can be found in a page are printed.\label{crawler3}} + \end{figure} \begin{figure}[p] @@ -617,7 +645,7 @@ \end{minipage} \end{center} -\caption{Nothing that can be said\ldots\label{monster}} +\caption{Nothing that can be said about this\ldots\label{monster}} \end{figure} diff -r c8ce95067c1a -r 5c1fbb39c93e handouts/ho02.pdf Binary file handouts/ho02.pdf has changed diff -r c8ce95067c1a -r 5c1fbb39c93e handouts/ho02.tex --- a/handouts/ho02.tex Tue Mar 22 17:09:24 2016 +0000 +++ b/handouts/ho02.tex Wed Apr 06 11:51:33 2016 +0100 @@ -4,8 +4,10 @@ \usepackage{../graphics} \usepackage{../data} -\pgfplotsset{compat=1.11} + \begin{document} +\fnote{\copyright{} Christian Urban, King's College London, 2014, 2015, 2016} + \section*{Handout 2 (Regular Expression Matching)} @@ -14,16 +16,18 @@ than the matchers from regular expression libraries in Ruby and Python (the plots on the left). These plots show the running time for the evil regular expression -$a^?{}^{\{n\}}\cdot a^{\{n\}}$ and strings composed of $n$ \pcode{a}s. -We will use this regular expression and strings as running -example. To see the substantial differences in the two plots -below, note the different scales of the $x$-axes. +$a^?{}^{\{n\}}\cdot a^{\{n\}}$ and strings composed of $n$ +\pcode{a}s. We will use this regular expression and strings as +a running example. To see the substantial differences in the two +plots below, note the different scales of the $x$-axes. 
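If you want to reproduce such measurements yourself, a rough sketch along the following lines suffices in Scala. It uses the JDK's built-in regular expression engine, which, like the engines in Python and Ruby, is based on backtracking; the helper \texttt{time} and the chosen values for $n$ are only one possible setup, not the exact code used for the plots.

\begin{lstlisting}[numbers=none]
// crude timing helper: elapsed time in seconds
def time[T](code: => T) : Double = {
  val start = System.nanoTime()
  code
  (System.nanoTime() - start) / 1.0e9
}

// the evil regular expression a?{n} a{n} against the string of n a's;
// expect the running times to explode for larger n
for (n <- 5 to 25 by 5) {
  val evil = "a?" * n + "a" * n
  val s = "a" * n
  println(s"$n ${time(s.matches(evil))} secs")
}
\end{lstlisting}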
\begin{center} \begin{tabular}{@{}cc@{}} \begin{tikzpicture} -\begin{axis}[xlabel={\pcode{a}s},ylabel={time in secs}, +\begin{axis}[ + xlabel={strings of {\tt a}s}, + ylabel={time in secs}, enlargelimits=false, xtick={0,5,...,30}, xmax=33, @@ -44,7 +48,9 @@ \end{tikzpicture} & \begin{tikzpicture} - \begin{axis}[xlabel={\pcode{a}s},ylabel={time in secs}, + \begin{axis}[ + xlabel={strings of \texttt{a}s}, + ylabel={time in secs}, enlargelimits=false, xtick={0,3000,...,12000}, xmax=12500, @@ -119,15 +125,15 @@ \noindent I leave it to you to verify these equivalences and non-equivalences. It is also interesting to look at some -corner cases involving $\epsilon$ and $\varnothing$: +corner cases involving $\ONE$ and $\ZERO$: \begin{center} \begin{tabular}{rcl} -$a \cdot \varnothing$ & $\not\equiv$ & $a$\\ -$a + \epsilon$ & $\not\equiv$ & $a$\\ -$\epsilon$ & $\equiv$ & $\varnothing^*$\\ -$\epsilon^*$ & $\equiv$ & $\epsilon$\\ -$\varnothing^*$ & $\not\equiv$ & $\varnothing$\\ +$a \cdot \ZERO$ & $\not\equiv$ & $a$\\ +$a + \ONE$ & $\not\equiv$ & $a$\\ +$\ONE$ & $\equiv$ & $\ZERO^*$\\ +$\ONE^*$ & $\equiv$ & $\ONE$\\ +$\ZERO^*$ & $\not\equiv$ & $\ZERO$\\ \end{tabular} \end{center} @@ -140,20 +146,20 @@ \begin{center} \begin{tabular}{rcl} -$r + \varnothing$ & $\equiv$ & $r$\\ -$\varnothing + r$ & $\equiv$ & $r$\\ -$r \cdot \epsilon$ & $\equiv$ & $r$\\ -$\epsilon \cdot r$ & $\equiv$ & $r$\\ -$r \cdot \varnothing$ & $\equiv$ & $\varnothing$\\ -$\varnothing \cdot r$ & $\equiv$ & $\varnothing$\\ +$r + \ZERO$ & $\equiv$ & $r$\\ +$\ZERO + r$ & $\equiv$ & $r$\\ +$r \cdot \ONE$ & $\equiv$ & $r$\\ +$\ONE \cdot r$ & $\equiv$ & $r$\\ +$r \cdot \ZERO$ & $\equiv$ & $\ZERO$\\ +$\ZERO \cdot r$ & $\equiv$ & $\ZERO$\\ $r + r$ & $\equiv$ & $r$ \end{tabular} \end{center} \noindent which always hold no matter what the regular expression $r$ looks like. The first two are easy to verify -since $L(\varnothing)$ is the empty set. The next two are also -easy to verify since $L(\epsilon) = \{[]\}$ and appending the +since $L(\ZERO)$ is the empty set. The next two are also +easy to verify since $L(\ONE) = \{[]\}$ and appending the empty string to every string of another set, leaves the set unchanged. 
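For instance, spelled out, the verification of the equivalence $r \cdot \ONE \equiv r$ is just the one-line calculation

\[
L(r \cdot \ONE) \;=\; L(r) \,@\, L(\ONE) \;=\; L(r) \,@\, \{[]\} \;=\; L(r)
\]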
Be careful to fully comprehend the fifth and sixth equivalence: if you concatenate two sets of strings and one is @@ -167,7 +173,7 @@ example the regular expression \begin{equation} -(r_1 + \varnothing) \cdot \epsilon + ((\epsilon + r_2) + r_3) \cdot (r_4 \cdot \varnothing) +(r_1 + \ZERO) \cdot \ONE + ((\ONE + r_2) + r_3) \cdot (r_4 \cdot \ZERO) \label{big} \end{equation} @@ -182,21 +188,21 @@ \begin{center} \begin{tabular}{ll} - & $(r_1 + \varnothing) \cdot \epsilon + ((\epsilon + r_2) + r_3) \cdot -(\underline{r_4 \cdot \varnothing})$\smallskip\\ -$\equiv$ & $(r_1 + \varnothing) \cdot \epsilon + \underline{((\epsilon + r_2) + r_3) \cdot -\varnothing}$\smallskip\\ -$\equiv$ & $\underline{(r_1 + \varnothing) \cdot \epsilon} + \varnothing$\smallskip\\ -$\equiv$ & $(\underline{r_1 + \varnothing}) + \varnothing$\smallskip\\ -$\equiv$ & $\underline{r_1 + \varnothing}$\smallskip\\ + & $(r_1 + \ZERO) \cdot \ONE + ((\ONE + r_2) + r_3) \cdot +(\underline{r_4 \cdot \ZERO})$\smallskip\\ +$\equiv$ & $(r_1 + \ZERO) \cdot \ONE + \underline{((\ONE + r_2) + r_3) \cdot +\ZERO}$\smallskip\\ +$\equiv$ & $\underline{(r_1 + \ZERO) \cdot \ONE} + \ZERO$\smallskip\\ +$\equiv$ & $(\underline{r_1 + \ZERO}) + \ZERO$\smallskip\\ +$\equiv$ & $\underline{r_1 + \ZERO}$\smallskip\\ $\equiv$ & $r_1$\ \end{tabular} \end{center} \noindent In each step, I underlined where a simplification rule is applied. Our matching algorithm in the next section -will often generate such ``useless'' $\epsilon$s and -$\varnothing$s, therefore simplifying them away will make the +will often generate such ``useless'' $\ONE$s and +$\ZERO$s; therefore simplifying them away will make the algorithm quite a bit faster. \subsection*{The Matching Algorithm} @@ -209,8 +215,8 @@ \begin{center} \begin{tabular}{@ {}l@ {\hspace{2mm}}c@ {\hspace{2mm}}l@ {}} -$nullable(\varnothing)$ & $\dn$ & $\textit{false}$\\ -$nullable(\epsilon)$ & $\dn$ & $true$\\ +$nullable(\ZERO)$ & $\dn$ & $\textit{false}$\\ +$nullable(\ONE)$ & $\dn$ & $true$\\ $nullable(c)$ & $\dn$ & $\textit{false}$\\ $nullable(r_1 + r_2)$ & $\dn$ & $nullable(r_1) \vee nullable(r_2)$\\ $nullable(r_1 \cdot r_2)$ & $\dn$ & $nullable(r_1) \wedge nullable(r_2)$\\ @@ -243,9 +249,9 @@ \begin{center} \begin{tabular}{l@ {\hspace{2mm}}c@ {\hspace{2mm}}l} - $der\, c\, (\varnothing)$ & $\dn$ & $\varnothing$\\ - $der\, c\, (\epsilon)$ & $\dn$ & $\varnothing$ \\ - $der\, c\, (d)$ & $\dn$ & if $c = d$ then $\epsilon$ else $\varnothing$\\ + $der\, c\, (\ZERO)$ & $\dn$ & $\ZERO$\\ + $der\, c\, (\ONE)$ & $\dn$ & $\ZERO$ \\ + $der\, c\, (d)$ & $\dn$ & if $c = d$ then $\ONE$ else $\ZERO$\\ $der\, c\, (r_1 + r_2)$ & $\dn$ & $der\, c\, r_1 + der\, c\, r_2$\\ $der\, c\, (r_1 \cdot r_2)$ & $\dn$ & if $nullable (r_1)$\\ & & then $(der\,c\,r_1) \cdot r_2 + der\, c\, r_2$\\ @@ -258,14 +264,14 @@ follows: recall that $der$ should calculate a regular expression so that, whenever the ``input'' regular expression can match a string of the form $c\!::\!s$, we want a regular -expression for $s$. Since neither $\varnothing$ nor $\epsilon$ +expression for $s$. Since neither $\ZERO$ nor $\ONE$ can match a string of the form $c\!::\!s$, we return -$\varnothing$. In the third case we have to make a +$\ZERO$. In the third case we have to make a case-distinction: In case the regular expression is $c$, then clearly it can recognise a string of the form $c\!::\!s$, just that $s$ is the empty string. Therefore we return the -$\epsilon$-regular expression. 
In the other case we again +return $\ZERO$ since no string of the form $c\!::\!s$ can be +matched. Next come the recursive cases, which are a bit more involved. Fortunately, the $+$-case is still relatively straightforward: all strings of the form $c\!::\!s$ are either @@ -292,9 +298,9 @@ the definition of $der$ by considering the following operation on sets: -\[ +\begin{equation}\label{Der} Der\,c\,A\;\dn\;\{s\,|\,c\!::\!s \in A\} -\] +\end{equation} \noindent This operation essentially transforms a set of strings $A$ by filtering out all strings that do not start @@ -406,7 +412,7 @@ \begin{figure}[p] \lstinputlisting{../progs/app5.scala} \caption{Scala implementation of the nullable and - derivatives functions. These functions are easy to + derivative functions. These functions are easy to implement in functional languages, because pattern matching and recursion allow us to mimic the mathematical definitions very closely.\label{scala1}} @@ -460,7 +466,7 @@ \noindent Our algorithm traverses such regular expressions at least once every time a derivative is calculated. So having large regular expressions will cause problems. This problem -is aggravated by $a^?$ being represented as $a + \epsilon$. +is aggravated by $a^?$ being represented as $a + \ONE$. We can however fix this by having an explicit constructor for $r^{\{n\}}$. In Scala we would introduce a constructor like @@ -508,14 +514,14 @@ need to traverse the whole regular expression. There seems, however, to be one more issue for making the algorithm run faster. The derivative function often produces ``useless'' -$\varnothing$s and $\epsilon$s. To see this, consider $r = ((a +$\ZERO$s and $\ONE$s. To see this, consider $r = ((a \cdot b) + b)^*$ and the following derivatives \begin{center} \begin{tabular}{l} -$der\,a\,r = ((\epsilon \cdot b) + \varnothing) \cdot r$\\ -$der\,b\,r = ((\varnothing \cdot b) + \epsilon)\cdot r$\\ -$der\,c\,r = ((\varnothing \cdot b) + \varnothing)\cdot r$ +$der\,a\,r = ((\ONE \cdot b) + \ZERO) \cdot r$\\ +$der\,b\,r = ((\ZERO \cdot b) + \ONE)\cdot r$\\ +$der\,c\,r = ((\ZERO \cdot b) + \ZERO)\cdot r$ \end{tabular} \end{center} @@ -528,7 +534,7 @@ \begin{tabular}{l} $der\,a\,r \equiv b \cdot r$\\ $der\,b\,r \equiv r$\\ -$der\,c\,r \equiv \varnothing$ +$der\,c\,r \equiv \ZERO$ \end{tabular} \end{center} @@ -553,7 +559,7 @@ calls \texttt{der} first, but then simplifies the resulting derivative regular expressions before building the next derivative, see -Line~28.\label{scala2}} +Line~\ref{simpline}.\label{scala2}} \end{figure} \begin{center} @@ -590,8 +596,8 @@ \begin{center} \begin{tabular}{r@{\hspace{1mm}}r@{\hspace{1mm}}l@{\hspace{13mm}}l} - $r$ & $::=$ & $\varnothing$ & null\\ - & $\mid$ & $\epsilon$ & empty string / \texttt{""} / []\\ + $r$ & $::=$ & $\ZERO$ & null language\\ + & $\mid$ & $\ONE$ & empty string / \texttt{""} / []\\ & $\mid$ & $c$ & single character\\ & $\mid$ & $r_1 + r_2$ & alternative / choice\\ & $\mid$ & $r_1 \cdot r_2$ & sequence\\ @@ -604,7 +610,7 @@ the recipe: \begin{itemize} -\item $P$ has to hold for $\varnothing$, $\epsilon$ and $c$ +\item $P$ has to hold for $\ZERO$, $\ONE$ and $c$ (these are the base cases). \item $P$ has to hold for $r_1 + r_2$ under the assumption that $P$ already holds for $r_1$ and $r_2$. 
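(As an aside, the function that such correctness arguments are ultimately about is the top-level matcher, which simply chains the functions \texttt{ders} and \texttt{nullable} from Figures~\ref{scala1} and \ref{scala2}. A minimal sketch is shown below; the name \texttt{matcher} is chosen here only for illustration.)

\begin{lstlisting}[numbers=none]
// does the regular expression r match the (whole) string s?
// build the derivative w.r.t. every character of s, then test
// whether the resulting regular expression can match the empty string
def matcher(r: Rexp, s: String) : Boolean =
  nullable(ders(s.toList, r))
\end{lstlisting}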
@@ -625,21 +631,21 @@ \noindent Let us say that this property is $P(r)$, then the first case -we need to check is whether $P(\varnothing)$ (see recipe +we need to check is whether $P(\ZERO)$ (see recipe above). So we have to show that \[ -nullable(\varnothing) \;\;\text{if and only if}\;\; -[]\in L(\varnothing) +nullable(\ZERO) \;\;\text{if and only if}\;\; +[]\in L(\ZERO) \] -\noindent whereby $nullable(\varnothing)$ is by definition of +\noindent whereby $nullable(\ZERO)$ is by definition of the function $nullable$ always $\textit{false}$. We also have -that $L(\varnothing)$ is by definition $\{\}$. It is +that $L(\ZERO)$ is by definition $\{\}$. It is impossible that the empty string $[]$ is in the empty set. Therefore also the right-hand side is false. Consequently we verified this case: both sides are false. We would still need -to do this for $P(\varepsilon)$ and $P(c)$. I leave this to +to do this for $P(\ONE)$ and $P(c)$. I leave this to you to verify. Next we need to check the inductive cases, for example @@ -718,9 +724,16 @@ \label{dersprop} \end{equation} -\noindent by induction on $s$. In this proof you can assume -the following property for $der$ and $Der$ has already been -proved, that is you can assume +\noindent by induction on $s$. Recall $Der$ is defined for +character---see \eqref{Der}; $Ders$ is similar, but for strings: + +\[ +Ders\,s\,A\;\dn\;\{s'\,|\,s @ s' \in A\} +\] + +\noindent In this proof you can assume the following property +for $der$ and $Der$ has already been proved, that is you can +assume \[ L(der\,c\,r) = Der\,c\,(L(r)) diff -r c8ce95067c1a -r 5c1fbb39c93e langs.sty --- a/langs.sty Tue Mar 22 17:09:24 2016 +0000 +++ b/langs.sty Wed Apr 06 11:51:33 2016 +0100 @@ -65,3 +65,5 @@ \newcommand{\pcode}[1]{\mbox{\lstset{language={},keywordstyle=\color{black}}\lstinline!#1!}} \newcommand{\scode}[1]{\mbox{\lstset{language={},basicstyle=\ttfamily\color{codegreen}}\lstinline!#1!}} \makeatother + +\lstset{escapeinside={(*@}{@*)}} diff -r c8ce95067c1a -r 5c1fbb39c93e progs/app5.scala --- a/progs/app5.scala Tue Mar 22 17:09:24 2016 +0000 +++ b/progs/app5.scala Wed Apr 06 11:51:33 2016 +0100 @@ -1,6 +1,6 @@ def nullable (r: Rexp) : Boolean = r match { - case NULL => false - case EMPTY => true + case ZERO => false + case ONE => true case CHAR(_) => false case ALT(r1, r2) => nullable(r1) || nullable(r2) case SEQ(r1, r2) => nullable(r1) && nullable(r2) @@ -8,9 +8,9 @@ } def der (c: Char, r: Rexp) : Rexp = r match { - case NULL => NULL - case EMPTY => NULL - case CHAR(d) => if (c == d) EMPTY else NULL + case ZERO => ZERO + case ONE => ZERO + case CHAR(d) => if (c == d) ONE else ZERO case ALT(r1, r2) => ALT(der(c, r1), der(c, r2)) case SEQ(r1, r2) => if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2)) diff -r c8ce95067c1a -r 5c1fbb39c93e progs/app51.scala --- a/progs/app51.scala Tue Mar 22 17:09:24 2016 +0000 +++ b/progs/app51.scala Wed Apr 06 11:51:33 2016 +0100 @@ -1,7 +1,7 @@ -def OPT(r: Rexp) = ALT(r, EMPTY) +def OPT(r: Rexp) = ALT(r, ONE) def NTIMES(r: Rexp, n: Int) : Rexp = n match { - case 0 => EMPTY + case 0 => ONE case 1 => r case n => SEQ(r, NTIMES(r, n - 1)) } diff -r c8ce95067c1a -r 5c1fbb39c93e progs/app6.scala --- a/progs/app6.scala Tue Mar 22 17:09:24 2016 +0000 +++ b/progs/app6.scala Wed Apr 06 11:51:33 2016 +0100 @@ -1,17 +1,17 @@ def simp(r: Rexp): Rexp = r match { case ALT(r1, r2) => { (simp(r1), simp(r2)) match { - case (NULL, r2s) => r2s - case (r1s, NULL) => r1s + case (ZERO, r2s) => r2s + case (r1s, ZERO) => r1s case (r1s, r2s) => if (r1s == r2s) 
r1s else ALT(r1s, r2s) } } case SEQ(r1, r2) => { (simp(r1), simp(r2)) match { - case (NULL, _) => NULL - case (_, NULL) => NULL - case (EMPTY, r2s) => r2s - case (r1s, EMPTY) => r1s + case (ZERO, _) => ZERO + case (_, ZERO) => ZERO + case (ONE, r2s) => r2s + case (r1s, ONE) => r1s case (r1s, r2s) => SEQ(r1s, r2s) } } @@ -21,6 +21,6 @@ def ders (s: List[Char], r: Rexp) : Rexp = s match { case Nil => r - case c::s => ders(s, simp(der(c, r))) + case c::s => ders(s, simp(der(c, r))) (*@\label{simpline}@*) } diff -r c8ce95067c1a -r 5c1fbb39c93e progs/crawler1.scala --- a/progs/crawler1.scala Tue Mar 22 17:09:24 2016 +0000 +++ b/progs/crawler1.scala Wed Apr 06 11:51:33 2016 +0100 @@ -12,13 +12,13 @@ } // regex for URLs -val http_pattern = """"https?://[^"]*"""".r +val http_pattern = """"https?://[^"]*"""".r (*@\label{httpline}@*) // drops the first and last character from a string def unquote(s: String) = s.drop(1).dropRight(1) def get_all_URLs(page: String) : Set[String] = - http_pattern.findAllIn(page).map(unquote).toSet + http_pattern.findAllIn(page).map(unquote).toSet (*@\label{findallline}@*) // naive version of crawl - searches until a given depth, diff -r c8ce95067c1a -r 5c1fbb39c93e progs/crawler2.scala --- a/progs/crawler2.scala Tue Mar 22 17:09:24 2016 +0000 +++ b/progs/crawler2.scala Wed Apr 06 11:51:33 2016 +0100 @@ -13,7 +13,7 @@ // regexes for URLs and "my" domain val http_pattern = """"https?://[^"]*"""".r -val my_urls = """urbanc""".r +val my_urls = """urbanc""".r (*@\label{myurlline}@*) def unquote(s: String) = s.drop(1).dropRight(1) @@ -21,11 +21,11 @@ http_pattern.findAllIn(page).map(unquote).toSet def crawl(url: String, n: Int) : Unit = { - if (n == 0) () + if (n == 0) () (*@\label{changestartline}@*) else if (my_urls.findFirstIn(url) == None) { println(s"Visiting: $n $url") get_page(url); () - } + } (*@\label{changeendline}@*) else { println(s"Visiting: $n $url") for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1) diff -r c8ce95067c1a -r 5c1fbb39c93e progs/crawler3.scala --- a/progs/crawler3.scala Tue Mar 22 17:09:24 2016 +0000 +++ b/progs/crawler3.scala Wed Apr 06 11:51:33 2016 +0100 @@ -12,7 +12,7 @@ // regexes for URLs, for "my" domain and for email addresses val http_pattern = """"https?://[^"]*"""".r -val email_pattern = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r +val email_pattern = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r (*@\label{emailline}@*) def unquote(s: String) = s.drop(1).dropRight(1) @@ -27,7 +27,7 @@ else { println(s"Visiting: $n $url") val page = get_page(url) - print_str(email_pattern.findAllIn(page).mkString("\n")) + print_str(email_pattern.findAllIn(page).mkString("\n")) (*@\label{mainline}@*) for (u <- get_all_URLs(page).par) crawl(u, n - 1) } } diff -r c8ce95067c1a -r 5c1fbb39c93e style.sty --- a/style.sty Tue Mar 22 17:09:24 2016 +0000 +++ b/style.sty Wed Apr 06 11:51:33 2016 +0100 @@ -1,5 +1,5 @@ \usepackage{xcolor} -\usepackage{fontspec} +%%\usepackage{fontspec} \usepackage[sc]{mathpazo} \usepackage{fontspec} \setmainfont[Ligatures=TeX]{Palatino Linotype}