# HG changeset patch # User Christian Urban # Date 1413594497 -3600 # Node ID 39aeca14af8cba74a8c5bc83cb8744db498bdb15 # Parent 2c50b8b5886c47ea1695a3b17ad56f20dcbf1f57 updated diff -r 2c50b8b5886c -r 39aeca14af8c handouts/ho04.pdf Binary file handouts/ho04.pdf has changed diff -r 2c50b8b5886c -r 39aeca14af8c handouts/ho04.tex --- a/handouts/ho04.tex Sat Oct 18 01:02:03 2014 +0100 +++ b/handouts/ho04.tex Sat Oct 18 02:08:17 2014 +0100 @@ -570,11 +570,139 @@ the (rectified) value. -\subsubsection*{Records and Tokenisation} +\subsubsection*{Records} + +Remember we want to tokenize input strings, that means +splitting strings into their ``word'' components. But +furthermore we want to classify each token as being a keyword +or identifier and so on. For this one more feature will be +required, which I call \emph{record}. While values record +precisely how a regular expression matches a string, +records can be used to focus on some particular +parts of the regular expression and forget about others. +Let us look at an example. + +Suppose you have the regular expression $ab + ac$. Clearly +this regular expression can only recognise two strings. But +suppose you are not interested whether it can recognise $ab$ +or $ac$, but rather if it matched, then what was the last +character of the matched string\ldots either $b$ or $c$. +You can do this by annotating the regular expression with +a record, written $(x:r)$, where $x$ is just an identifier +(in my implementation a plain string) and $r$ is a regular +expression. A record will be regarded as a regular expression. +The extended definition in Scala looks as follows: + +{\small\lstinputlisting[language=Scala] +{../progs/app03.scala}} + +\noindent Since we regard records as regular expressions +we need to extend the functions $nullable$ and $der$. +Similarly $mkeps$ and $inj$ need to be extended and they +sometimes can return a particular value for records. +This means we also need to extend the definition of values. +The extended definition in Scala looks as follows: + +{\small\lstinputlisting[language=Scala] +{../progs/app04.scala}} + +\noindent Let us now look at the purpose of records more +closely and lets return to our question whether the string +terminated in a $b$ or $c$. We can do this as follows: we +annotate the regular expression $ab + ac$ with a record +as follows + +\begin{center} +$a(x:b) + a(x:c)$ +\end{center} + +\noindent This regular expression can still only recognise +the strings $ab$ and $ac$, but we can now use a function +that takes a value and returns all records. I call this +function \emph{env} for environment\ldots it builds a list +of identifiers associated with their string. This function +can be defined as follows: + +\begin{center} +\begin{tabular}{lcl} + $env(Empty)$ & $\dn$ & $[]$\\ + $env(Char(c))$ & $\dn$ & $[]$\\ + $env(Left(v))$ & $\dn$ & $env(v)$\\ + $env(Right(v))$ & $\dn$ & $env(v)$\\ + $env(Seq(v_1,v_2))$& $\dn$ & $env(v_1) \,@\, env(v_2)$\\ + $env([v_1,\ldots ,v_n])$ & $\dn$ & + $env(v_1) \,@\ldots @\, env(v_n)$\\ + $env(Rec(x:v))$ & $\dn$ & $(x:|v|) :: env(v)$\\ +\end{tabular} +\end{center} -\newpage -Algorithm by Sulzmann, Lexing +\noindent where in the last clause we use the flatten function +defined earlier. The function $env$ ``picks'' out all +underlying strings where a record is given. Since there can be +more than one, the environment will potentially contain +many ``recordings''. If we now postprocess the value +calculated by $lex$ extracting all recordings using $env$, +we can answer the question whether the last element in the +string was an $b$ or a $c$. Lets see this in action: if +we use $ab + ac$ and $ac$ the calculated value will be + +\begin{center} +$Right(Seq(Char(a), Char(c)))$ +\end{center} + +\noindent If we use instead $a(x:b) + a(x:c)$ and +use the $env$ function to extract the recording for +$x$ we obtain + +\begin{center} +$[(x:c)]$ +\end{center} + +\noindent If we had given the string $ab$ instead, then the +record would have been $[(x:b)]$. The fun starts if we +iterate this. Consider the regular expression + +\begin{center} +$(a(x:b) + a(y:c))^*$ +\end{center} +\noindent and the string $ababacabacab$. This string is +clearly matched by the regular expression, but we are only +interested in the sequence of $b$s and $c$s. Using $env$ +we obtain + +\begin{center} +$[(x:b), (x:b), (y:c), (x:b), (y:c), (x:b)]$ +\end{center} + +\noindent While this feature might look silly, it is in fact +quite useful. For example if we want to match the name of +an email we might use the regular expression + +\[ +(name: [a\mbox{-}z0\mbox{-}9\_\!\_\,.-]^+)\cdot @\cdot +(domain: [a\mbox{-}z0\mbox{-}9\,.-]^+)\cdot .\cdot +(top\_level: [a\mbox{-}z\,.]^{\{2,6\}}) +\] + +\noindent Then if we match the email address + +\[ +\texttt{christian.urban@kcl.ac.uk} +\] + +\noindent we can use the $env$ function and find out +what the name, domain and top-level part of the email +address are: + +\begin{center} +$[(name:\texttt{christian.urban}), + (domain:\texttt{kcl}), + (top\_level:\texttt{ac.uk})]$ +\end{center} + +\noindent As you will see in the next lecture, this is now all +we need to tokenise an input string and classify each token. \end{document} %%% Local Variables: diff -r 2c50b8b5886c -r 39aeca14af8c progs/app03.scala --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/progs/app03.scala Sat Oct 18 02:08:17 2014 +0100 @@ -0,0 +1,8 @@ +abstract class Rexp +case object NULL extends Rexp +case object EMPTY extends Rexp +case class CHAR(c: Char) extends Rexp +case class ALT(r1: Rexp, r2: Rexp) extends Rexp +case class SEQ(r1: Rexp, r2: Rexp) extends Rexp +case class STAR(r: Rexp) extends Rexp +case class REC(x: String, r: Rexp) extends Rexp diff -r 2c50b8b5886c -r 39aeca14af8c progs/app04.scala --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/progs/app04.scala Sat Oct 18 02:08:17 2014 +0100 @@ -0,0 +1,8 @@ +abstract class Val +case object Empty extends Val +case class Char(c: Char) extends Val +case class Seq(v1: Val, v2: Val) extends Val +case class Left(v: Val) extends Val +case class Right(v: Val) extends Val +case class Stars(vs: List[Val]) extends Val +case class Rec(x: String, v: Val) extends Val \ No newline at end of file