coursework/cw01.tex
author Christian Urban <christian dot urban at kcl dot ac dot uk>
Tue, 12 Jan 2016 02:18:58 +0000
changeset 395 e57d3d92b856
parent 358 b3129cff41e9
child 418 010c5a03dca2
permissions -rw-r--r--
updated

\documentclass{article}
\usepackage{../style}
\usepackage{../langs}

\begin{document}

\section*{Coursework 1 (Strand 1)}

This coursework is worth 5\% and is due on 20 October at
16:00. You are asked to implement a regular expression matcher
and submit a document containing the answers for the questions
below. You can do the implementation in any programming
language you like, but you need to submit the source code with
which you answered the questions, otherwise a mark of 0\% will
be awarded. You can submit your answers in a txt-file or pdf.



\subsubsection*{Disclaimer}

It should be understood that the work you submit represents
your own effort. You have not copied from anyone else. An
exception is the Scala code I showed during the lectures or
uploaded to KEATS, which you can use.\bigskip


\subsubsection*{Tasks}

The task is to implement a regular expression matcher based on
derivatives. The implementation should be able to deal with the usual
(basic) regular expressions

\[
\varnothing, \epsilon, c, r_1 + r_2, r_1 \cdot r_2, r^*
\]

\noindent
but also with the following extended regular expressions:

\begin{center}
\begin{tabular}{ll}
$[c_1 c_2 \ldots c_n]$ & a range of characters\\
$r^+$ & one or more times $r$\\
$r^?$ & optional $r$\\
$r^{\{n,m\}}$ & at least $n$-times $r$ but no more than $m$-times\\
$\sim{}r$ & not-regular expression of $r$\\
\end{tabular}
\end{center}

\noindent In the case of $r^{\{n,m\}}$ we have the convention
that $0 \le n \le m$. The meanings of the extended regular
expressions are

\begin{center}
\begin{tabular}{r@{\hspace{2mm}}c@{\hspace{2mm}}l}
$L([c_1 c_2 \ldots c_n])$ & $\dn$ & $\{[c_1], [c_2], \ldots, [c_n]\}$\\ 
$L(r^+)$                  & $\dn$ & $\bigcup_{1\le i}. L(r)^i$\\
$L(r^?)$                  & $\dn$ & $L(r) \cup \{[]\}$\\
$L(r^{\{n,m\}})$           & $\dn$ & $\bigcup_{n\le i \le m}. L(r)^i$\\
$L(\sim{}r)$              & $\dn$ & $\Sigma^* - L(r)$
\end{tabular}
\end{center}

\noindent whereby in the last clause the set $\Sigma^*$ stands
for the set of \emph{all} strings over the alphabet $\Sigma$
(in the implementation the alphabet can be just what is
represented by, say, the type \pcode{char}). So $\sim{}r$
means `all the strings that $r$ cannot match'. 

Be careful that your implementation of \textit{nullable} and
\textit{der} satisfies for every $r$ the following two
properties (see also Question 2):

\begin{itemize}
\item $\textit{nullable}(r)$ if and only if $[]\in L(r)$
\item $L(der\,c\,r) = Der\,c\,(L(r))$
\end{itemize}

\noindent {\bf Important!} Your implementation should have
explicit cases for the basic regular expressions, but also
explicit cases for the extended regular expressions. That
means do not treat the extended regular expressions by just
translating them into the basic ones. See also Question 2,
where you are asked to explicitly give the rules for
\textit{nullable} and \textit{der} for the extended regular
expressions.


\subsection*{Question 1 (unmarked)}

What is your King's email address (you will need it in
Question 3)?

\subsection*{Question 2 (marked with 2\%)}

This question does not require any implementation. From the
lectures you have seen the definitions for the functions
\textit{nullable} and \textit{der} for the basic regular
expressions. Give the rules for the extended regular
expressions:

\begin{center}
\begin{tabular}{@ {}l@ {\hspace{2mm}}c@ {\hspace{2mm}}l@ {}}
$\textit{nullable}([c_1 c_2 \ldots c_n])$  & $\dn$ & $?$\\
$\textit{nullable}(r^+)$                   & $\dn$ & $?$\\
$\textit{nullable}(r^?)$                   & $\dn$ & $?$\\
$\textit{nullable}(r^{\{n,m\}})$            & $\dn$ & $?$\\
$\textit{nullable}(\sim{}r)$               & $\dn$ & $?$\medskip\\
$der\, c\, ([c_1 c_2 \ldots c_n])$  & $\dn$ & $?$\\
$der\, c\, (r^+)$                   & $\dn$ & $?$\\
$der\, c\, (r^?)$                   & $\dn$ & $?$\\
$der\, c\, (r^{\{n,m\}})$            & $\dn$ & $?$\\
$der\, c\, (\sim{}r)$               & $\dn$ & $?$\\
\end{tabular}
\end{center}

\noindent
Remember your definitions have to satisfy the two properties

\begin{itemize}
\item $\textit{nullable}(r)$ if and only if $[]\in L(r)$
\item $L(der\,c\,r)) = Der\,c\,(L(r))$
\end{itemize}

\subsection*{Question 3 (marked with 1\%)}

Implement the following regular expression for email addresses

\[
([a\mbox{-}z0\mbox{-}9\_\!\_\,.-]^+)\cdot @\cdot ([a\mbox{-}z0\mbox{-}9\,.-]^+)\cdot .\cdot ([a\mbox{-}z\,.]^{\{2,6\}})
\]

\noindent and calculate the derivative according to your email
address. When calculating the derivative, simplify all regular
expressions as much as possible, but at least apply the
following six simplification rules:

\begin{center}
\begin{tabular}{l@{\hspace{2mm}}c@{\hspace{2mm}}ll}
$r \cdot \varnothing$ & $\mapsto$ & $\varnothing$\\ 
$\varnothing \cdot r$ & $\mapsto$ & $\varnothing$\\ 
$r \cdot \epsilon$ & $\mapsto$ & $r$\\ 
$\epsilon \cdot r$ & $\mapsto$ & $r$\\ 
$r + \varnothing$ & $\mapsto$ & $r$\\ 
$\varnothing + r$ & $\mapsto$ & $r$\\ 
$r + r$ & $\mapsto$ & $r$\\ 
\end{tabular}
\end{center}

\noindent
Write down your simplified derivative in the ``mathematicical''
notation using parentheses where necessary. That means you should
use the more readable infix notation $+$, $\cdot$ and $^*$, 
instead of code.
 
\subsection*{Question 4 (marked with 1\%)}

Suppose \textit{[a-z]} stands for the range regular expression
$[a,b,c,\ldots,z]$.  Consider the regular expression $/ \cdot * \cdot
(\sim{}([a\mbox{-}z]^* \cdot * \cdot / \cdot [a\mbox{-}z]^*)) \cdot *
\cdot /$ and decide wether the following four strings are matched by
this regular expression. Answer yes or no.

\begin{enumerate}
\item \texttt{"/**/"}
\item \texttt{"/*foobar*/"}
\item \texttt{"/*test*/test*/"}
\item \texttt{"/*test/*test*/"}
\end{enumerate}

\subsection*{Question 5 (marked with 1\%)}

Let $r_1$ be the regular expression $a\cdot a\cdot a$ and $r_2$ be
$(a^{\{19,19\}}) \cdot (a^?)$.  Decide whether the following three
strings consisting of $a$s only can be matched by $(r_1^+)^+$.
Similarly test them with $(r_2^+)^+$. Again answer in all six cases
with yes or no. \medskip

\noindent
These are strings are meant to be entirely made up of $a$s. Be careful
when copy-and-pasting the strings so as to not forgetting any $a$ and
to not introducing any other character.

\begin{enumerate}
\item \texttt{"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"}
\item \texttt{"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\ 
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\ 
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"}
\item \texttt{"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\ 
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\ 
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"}
\end{enumerate}


\end{document}

%%% Local Variables: 
%%% mode: latex
%%% TeX-master: t
%%% End: