handouts/ho01.tex
changeset 399 5c1fbb39c93e
parent 398 c8ce95067c1a
child 403 564f7584eff1
--- a/handouts/ho01.tex	Tue Mar 22 17:09:24 2016 +0000
+++ b/handouts/ho01.tex	Wed Apr 06 11:51:33 2016 +0100
@@ -10,7 +10,7 @@
 %%https://www.reddit.com/r/ProgrammingLanguages/comments/43wlkq/formal_grammar_for_csh_tsch_sh_or_bash/
 
 \begin{document}
-\fnote{\copyright{} Christian Urban, 2014, 2015}
+\fnote{\copyright{} Christian Urban, King's College London, 2014, 2015, 2016}
 
 \section*{Handout 1}
 
@@ -26,10 +26,10 @@
 followed by zero or more letters, numbers and underscores.
 Also often we face the problem that we are given a string (for
 example some user input) and want to know whether it matches a
-particular pattern. In this way we can, for example, exclude
-user input that would otherwise have nasty effects on our
-program (crashing it or making it go into an infinite loop, if
-not worse).
+particular pattern---be it an email address, for example. In
+this way we can exclude user input that would otherwise have
+nasty effects on our program (crashing it or making it go into
+an infinite loop, if not worse).
 
 \defn{Regular expressions} help with conveniently specifying
 such patterns. The idea behind regular expressions is that
@@ -38,8 +38,8 @@
 computer science. For example there is no convenient regular
 expression for describing the English language short of
 enumerating all English words. But they seem useful for
-describing for example email addresses.\footnote{See ``8
-Regular Expressions You Should Know''
+describing for example simple email addresses.\footnote{See
+``8 Regular Expressions You Should Know''
 \url{http://goo.gl/5LoVX7}} Consider the following regular
 expression
 
@@ -67,13 +67,19 @@
 
 
 \noindent
-But for example the following two do not:
+But for example the following two do not
 
 \begin{lstlisting}[language={},numbers=none,keywordstyle=\color{black}]
 user@localserver
 disposable.style.email.with+symbol@example.com
 \end{lstlisting}
 
+\noindent according to the regular expression we specified in
+\eqref{email}. Whether this is intended or not is a different
+question (the second email above is actually an acceptable
+email address acording to the RFC 5322 standard for email
+addresses).
+ 
 As mentioned above, identifiers, or variables, in program code
 are often required to satisfy the constraints that they start
 with a letter and then can be followed by zero or more letters
@@ -161,7 +167,9 @@
 
 \begin{center}
 \begin{tikzpicture}
-\begin{axis}[xlabel={\pcode{a}s},ylabel={time in secs},
+\begin{axis}[
+    xlabel={strings of {\tt a}s},
+    ylabel={time in secs},
     enlargelimits=false,
     xtick={0,5,...,30},
     xmax=33,
@@ -216,7 +224,9 @@
 
 \begin{center}
 \begin{tikzpicture}
-  \begin{axis}[xlabel={\pcode{a}s},ylabel={time in secs},
+  \begin{axis}[
+    xlabel={strings of \pcode{a}s},
+    ylabel={time in secs},
     enlargelimits=false,
     xtick={0,3000,...,12000},
     xmax=12500,
@@ -234,7 +244,7 @@
 
 \subsection*{Basic Regular Expressions}
 
-The regular expressions shown above for Scala, we
+The regular expressions shown earlier for Scala, we
 will call \emph{extended regular expressions}. The ones we
 will mainly study in this module are \emph{basic regular
 expressions}, which by convention we will just call
@@ -246,8 +256,8 @@
 
 \begin{center}
 \begin{tabular}{r@{\hspace{1mm}}r@{\hspace{1mm}}l@{\hspace{13mm}}l}
-  $r$ & $::=$ &   $\varnothing$         & null\\
-        & $\mid$ & $\epsilon$           & empty string / \texttt{""} / []\\
+  $r$ & $::=$ &    $\ZERO$          & null language\\
+        & $\mid$ & $\ONE$           & empty string / \texttt{""} / []\\
         & $\mid$ & $c$                  & single character\\
         & $\mid$ & $r_1 + r_2$          & alternative / choice\\
         & $\mid$ & $r_1 \cdot r_2$      & sequence\\
@@ -257,37 +267,36 @@
 
 \noindent Because we overload our notation, there are some
 subtleties you should be aware of. When regular expressions
-are referred to then $\varnothing$ does not stand for the
-empty set: rather it is a particular pattern that does not
-match any string. Similarly, in the context of regular
-expressions, $\epsilon$ does not stand for the empty string
-(as in many places in the literature) but for a regular
-expression that matches the empty string. The letter $c$
-stands for any character from the alphabet at hand. Again in
-the context of regular expressions, it is a particular pattern
-that can match the specified character. You should also be
-careful with our overloading of the star: assuming you have
-read the handout about our basic mathematical notation, you
-will see that in the context of languages (sets of strings)
-the star stands for an operation on languages. Here $r^*$
-stands for a regular expression, which is different from the
-operation on sets is defined as
+are referred to then $\ZERO$ (in bold font) does not stand for
+the number zero: rather it is a particular pattern that does
+not match any string. Similarly, in the context of regular
+expressions, $\ONE$ does not stand for the number one but for
+a regular expression that matches the empty string. The letter
+$c$ stands for any character from the alphabet at hand. Again
+in the context of regular expressions, it is a particular
+pattern that can match the specified character. You should
+also be careful with our overloading of the star: assuming you
+have read the handout about our basic mathematical notation,
+you will see that in the context of languages (sets of
+strings) the star stands for an operation on languages. Here
+$r^*$ stands for a regular expression, which is different from
+the operation on sets is defined as
 
 \[
-A^* \dn \bigcup_{0\le n} A^n
+A\star\dn \bigcup_{0\le n} A^n
 \]
 
 \noindent 
 Note that this expands to
 
 \[
-A^* \dn A^0 \cup A^1 \cup A^2 \cup A^3 \cup A^4 \cup \ldots
+A\star \dn A^0 \cup A^1 \cup A^2 \cup A^3 \cup A^4 \cup \ldots
 \]
 
 \noindent which is equivalent to
 
 \[
-A^* \dn \{[]\} \cup A \cup A@A \cup A@A@A \cup A@A@A@A \cup \ldots
+A\star \dn \{[]\} \cup A \cup A@A \cup A@A@A \cup A@A@A@A \cup \ldots
 \]
 
 \noindent 
@@ -309,9 +318,12 @@
 in case of $+$ and $\cdot$ we actually do not care about the
 order and just write $r_1 + r_2 + r_3$, or $r_1 \cdot r_2
 \cdot r_3$, respectively. The reasons for this will become
-clear shortly. In the literature you will often find that the
-choice $r_1 + r_2$ is written as $r_1\mid{}r_2$ or
-$r_1\mid\mid{}r_2$. Also following the convention in the
+clear shortly. 
+
+In the literature you will often find that the choice $r_1 +
+r_2$ is written as $r_1\mid{}r_2$ or $r_1\mid\mid{}r_2$. Also,
+often our $\ZERO$ and $\ONE$ are written $\varnothing$ and
+$\epsilon$, respectively. Following the convention in the
 literature, we will often omit the $\cdot$ all together. This
 is to make some concrete regular expressions more readable.
 For example the regular expression for email addresses shown
@@ -342,8 +354,8 @@
 
 \begin{center}
 \begin{tabular}{rcl}
-$\varnothing$ & $\mapsto$ & \texttt{NULL}\\
-$\epsilon$    & $\mapsto$ & \texttt{EMPTY}\\
+$\ZERO$       & $\mapsto$ & \texttt{ZERO}\\
+$\ONE$        & $\mapsto$ & \texttt{ONE}\\
 $c$           & $\mapsto$ & \texttt{CHAR(c)}\\
 $r_1 + r_2$   & $\mapsto$ & \texttt{ALT(r1, r2)}\\
 $r_1 \cdot r_2$ & $\mapsto$ & \texttt{SEQ(r1, r2)}\\
@@ -362,7 +374,7 @@
 \begin{center}
 \begin{tabular}{rcl}
 $r+$ & $\mapsto$ & $r\cdot r^*$\\
-$r?$ & $\mapsto$ & $\epsilon + r$\\
+$r?$ & $\mapsto$ & $\ONE + r$\\
 $\backslash d$ & $\mapsto$ & $0 + 1 + 2 + \ldots + 9$\\
 $[\text{\it a - z}]$ & $\mapsto$ & $a + b + \ldots + z$\\
 \end{tabular}
@@ -388,13 +400,13 @@
 is defined as follows
 
 \begin{center}
-\begin{tabular}{rcl}
-$L(\varnothing)$  & $\dn$ & $\{\}$\\
-$L(\epsilon)$     & $\dn$ & $\{[]\}$\\
-$L(c)$            & $\dn$ & $\{[c]\}$\\
-$L(r_1+ r_2)$     & $\dn$ & $L(r_1) \cup L(r_2)$\\
+\begin{tabular}{rcll}
+$L(\ZERO)$         & $\dn$ & $\{\}$\\
+$L(\ONE)$          & $\dn$ & $\{[]\}$\\
+$L(c)$             & $\dn$ & $\{"c"\}$ & or equivalently $\dn \{[c]\}$\\
+$L(r_1+ r_2)$      & $\dn$ & $L(r_1) \cup L(r_2)$\\
 $L(r_1 \cdot r_2)$ & $\dn$ & $L(r_1) \,@\, L(r_2)$\\
-$L(r^*)$           & $\dn$ & $(L(r))^*$\\
+$L(r^*)$           & $\dn$ & $(L(r))\star$\\
 \end{tabular}
 \end{center}
 
@@ -443,9 +455,9 @@
 which can match the strings $a$ and $b$. But also the regular
 expression $b + a$ would match the same strings. However,
 sometimes it is not so obvious whether two regular expressions
-match the same strings: for example do $r^*$ and $\epsilon + r
-\cdot r^*$ match the same strings? What about $\varnothing^*$
-and $\epsilon^*$? This suggests the following relation between
+match the same strings: for example do $r^*$ and $\ONE + r
+\cdot r^*$ match the same strings? What about $\ZERO^*$
+and $\ONE^*$? This suggests the following relation between
 \defn{equivalent regular expressions}: 
 
 \[
@@ -460,12 +472,13 @@
 whether
 
 \[
-\varnothing^* \equiv \epsilon^*
+\ZERO^* \equiv \ONE^*
 \]
 
-\noindent holds? Such equivalences will be important for our
-matching algorithm, because we can use them to simplify 
-regular expressions. 
+\noindent holds or not? Such equivalences will be important
+for our matching algorithm, because we can use them to
+simplify regular expressions, which will mean we can speed
+up the calculations. 
 
 \subsection*{My Fascination for Regular Expressions}
 
@@ -475,15 +488,15 @@
 found out about them. I even have the vague recollection that
 I did not quite understand them during my study. If I remember
 correctly,\footnote{That was really a long time ago.} I got
-utterly confused about $\epsilon$ and the empty
-string.\footnote{Obviously the lecturer must have been bad.}
-Since my study, I have used regular expressions for
-implementing lexers and parsers as I have always been
-interested in all kinds of programming languages and
-compilers, which invariably need regular expression in some
-form or shape. 
+utterly confused about $\ONE$ (which my lecturer wrote as
+$\epsilon$) and the empty string.\footnote{Obviously the
+lecturer must have been bad.} Since my study, I have used
+regular expressions for implementing lexers and parsers as I
+have always been interested in all kinds of programming
+languages and compilers, which invariably need regular
+expression in some form or shape. 
 
-To understand my fascination nowadays with regular
+To understand my fascination \emph{nowadays} with regular
 expressions, you need to know that my main scientific interest
 for the last 14 years has been with theorem provers. I am a
 core developer of one of
@@ -533,16 +546,18 @@
 What a feeling if you are an outsider to the subject!
 
 To conclude: Despite my early ignorance about regular
-expressions, I find them now quite interesting. They have a
-beautiful mathematical theory behind them. They have practical
-importance (remember the shocking runtime of the regular
-expression matchers in Python and Ruby in some instances).
-People who are not very familiar with the mathematical
-background of regular expressions get them consistently wrong.
-The hope is that we can do better in the future---for example
-by proving that the algorithms actually satisfy their
-specification and that the corresponding implementations do
-not contain any bugs. We are close, but not yet quite there.
+expressions, I find them now very interesting. They have a
+beautiful mathematical theory behind them, which can be
+sometimes quite deep and contain hidden snares. They have
+practical importance (remember the shocking runtime of the
+regular expression matchers in Python and Ruby in some
+instances). People who are not very familiar with the
+mathematical background of regular expressions get them
+consistently wrong. The hope is that we can do better in the
+future---for example by proving that the algorithms actually
+satisfy their specification and that the corresponding
+implementations do not contain any bugs. We are close, but not
+yet quite there.
 
 Notwithstanding my fascination, I am also happy to admit that regular
 expressions have their shortcomings. There are some well-known
@@ -571,18 +586,28 @@
 would not know---the only thing I can say to this regular
 expression is it is a monstrosity. However, this might
 actually be an argument against the RFC standard, rather than
-against regular expressions. Still it is good to know that
-some tasks in text processing just cannot be achieved by using
-regular expressions.
+against regular expressions. A similar argument is made in
+
+\begin{center}
+\url{https://elliot.land/validating-an-email-address}
+\end{center}
+
+
+\noindent which explains some of the crazier parts of email
+addresses. Still it is good to know that some tasks in text
+processing just cannot be achieved by using regular
+expressions.
 
 
 \begin{figure}[p]
 \lstinputlisting{../progs/crawler1.scala}
+
 \caption{The Scala code for a simple web-crawler that checks
 for broken links in a web-page. It uses the regular expression
-\texttt{http\_pattern} in Line~15 for recognising URL-addresses.
-It finds all links using the library function
-\texttt{findAllIn} in Line~21.\label{crawler1}}
+\texttt{http\_pattern} in Line~\ref{httpline} for recognising
+URL-addresses. It finds all links using the library function
+\texttt{findAllIn} in Line~\ref{findallline}.\label{crawler1}}
+
 \end{figure}
 
 
@@ -592,9 +617,11 @@
 \caption{A version of the web-crawler that only follows links
 in ``my'' domain---since these are the ones I am interested in
 to fix. It uses the regular expression \texttt{my\_urls} in
-Line~16 to check for my name in the links. The main change is
-in Lines~24--28 where there is a test whether URL is in ``my''
-domain or not.\label{crawler2}}
+Line~\ref{myurlline} to check for my name in the links. The
+main change is in
+Lines~\ref{changestartline}--\ref{changeendline} where there
+is a test whether URL is in ``my'' domain or
+not.\label{crawler2}}
 
 \end{figure}
 
@@ -604,9 +631,10 @@
 \caption{A small email harvester---whenever we download a
 web-page, we also check whether it contains any email
 addresses. For this we use the regular expression
-\texttt{email\_pattern} in Line~15. The main change is in Line
-30 where all email addresses that can be found in a page are
-printed.\label{crawler3}}
+\texttt{email\_pattern} in Line~\ref{emailline}. The main
+change is in Line~\ref{mainline} where all email addresses
+that can be found in a page are printed.\label{crawler3}}
+
 \end{figure}
 
 \begin{figure}[p]
@@ -617,7 +645,7 @@
 \end{minipage}
 \end{center}
 
-\caption{Nothing that can be said\ldots\label{monster}}
+\caption{Nothing that can be said this\ldots\label{monster}}
 \end{figure}