--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/parser.scala Wed Oct 31 02:05:12 2012 +0000
@@ -0,0 +1,182 @@
+// abstract syntax of regular expressions, including negation (NOT)
+abstract class Rexp
+case object NULL extends Rexp // never nullable; no_more answers true for it
+case object EMPTY extends Rexp // nullable: recognises the empty string
+case class CHAR(c: Char) extends Rexp // the single character c
+case class ALT(r1: Rexp, r2: Rexp) extends Rexp // alternative: r1 or r2
+case class SEQ(r1: Rexp, r2: Rexp) extends Rexp // sequence: r1 followed by r2
+case class STAR(r: Rexp) extends Rexp // repetition of r; always nullable
+case class NOT(r: Rexp) extends Rexp // negation: nullable exactly when r is not
+// some convenience for typing in regular expressions:
+// charlist2rexp builds the regular expression that matches exactly the
+// given character sequence (Nil becomes EMPTY, one character becomes CHAR,
+// longer lists become right-nested SEQs)
+def charlist2rexp(s : List[Char]) : Rexp = s match {
+  case Nil => EMPTY
+  case c::Nil => CHAR(c)
+  case c::cs => SEQ(CHAR(c), charlist2rexp(cs))
+}
+// lets a string be used wherever a Rexp is expected
+implicit def string2rexp(s : String) : Rexp = charlist2rexp(s.toList)
+// nullable function: tests whether the regular
+// expression can recognise the empty string
+def nullable (r: Rexp) : Boolean = r match {
+  case NULL => false
+  case EMPTY => true
+  case CHAR(_) => false
+  case ALT(r1, r2) => nullable(r1) || nullable(r2)
+  case SEQ(r1, r2) => nullable(r1) && nullable(r2)
+  case STAR(_) => true
+  case NOT(r1) => !nullable(r1)
+}
+// tests whether a regular expression cannot recognise
+// anything any more; munch uses this on a derivative to
+// decide when to stop consuming input
+def no_more (r: Rexp) : Boolean = r match {
+  case NULL => true
+  case EMPTY => false
+  case CHAR(_) => false
+  case ALT(r1, r2) => no_more(r1) && no_more(r2)
+  // if r1 is nullable, r2 can still contribute, so both parts are checked
+  case SEQ(r1, r2) => if (nullable(r1)) (no_more(r1) && no_more(r2)) else no_more(r1)
+  case STAR(_) => false
+  // NOTE(review): simply negates the answer for the inner expression;
+  // confirm this is the intended treatment of NOT
+  case NOT(r1) => !no_more(r1)
+}
+// derivative of a regular expression w.r.t. a character:
+// the result describes what r can still recognise after c has been read
+def der (c: Char, r: Rexp) : Rexp = r match {
+  case NULL => NULL
+  case EMPTY => NULL
+  case CHAR(d) => if (c == d) EMPTY else NULL
+  case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))
+  case SEQ(r1, r2) =>
+    // a nullable r1 may be skipped, so c can also be consumed by r2
+    if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
+    else SEQ(der(c, r1), r2)
+  case STAR(r1) => SEQ(der(c, r1), STAR(r1))
+  case NOT(r1) => NOT(der(c, r1))
+}
+// regular expression for specifying
+// ranges of characters: matches any single character of s
+// (the empty list yields NULL, longer lists a right-nested ALT of CHARs)
+def RANGE(s : List[Char]) : Rexp = s match {
+  case Nil => NULL
+  case c::Nil => CHAR(c)
+  case c::cs => ALT(CHAR(c), RANGE(cs))
+}
+// one or more repetitions of r, encoded as r followed by STAR(r)
+def PLUS(r: Rexp) = SEQ(r, STAR(r))
+// some regular expressions
+val DIGIT = RANGE("0123456789".toList)
+val NONZERODIGIT = RANGE("123456789".toList)
+// NUMBER is used by lexing_rules but had no definition: a number is taken
+// to be a non-zero digit followed by any digits, or the single digit 0
+// NOTE(review): reconstructed from DIGIT/NONZERODIGIT - confirm the intended shape
+val NUMBER = ALT(SEQ(NONZERODIGIT, STAR(DIGIT)), CHAR('0'))
+val LPAREN = CHAR('(')
+val RPAREN = CHAR(')')
+// one or more blanks or newlines
+val WHITESPACE = PLUS(RANGE(" \n".toList))
+// a single arithmetic operator
+val OPS = RANGE("+-*".toList)
+// for classifying the strings that have been recognised
+abstract class Token
+case object T_WHITESPACE extends Token // removed again by tokenizer
+case object T_NUM extends Token // produced by the NUMBER lexing rule
+case class T_OP(s: String) extends Token // an operator together with its text
+case object T_LPAREN extends Token
+case object T_RPAREN extends Token
+case class T_NT(s: String) extends Token // a nonterminal named s, used by the grammar and parse1
+// a lexing rule: a regular expression paired with an action that turns
+// the matched characters into a token
+type Rule = (Rexp, List[Char] => Token)
+// aborts by throwing an IllegalArgumentException whose message contains s
+def error (s: String) = throw new IllegalArgumentException ("Cannot tokenize: " + s)
+// tries to match a prefix of the input s against r
+//   r      - what is still left to be matched
+//   action - builds the token from the characters matched
+//   s      - the input not yet consumed
+//   t      - the characters consumed so far, in input order
+// Consumes characters as long as the derivative can still recognise
+// something. When it stops (end of input, or the next character leads to a
+// dead end) the result is Some((remaining input, token)) if r is nullable
+// at that point, and None otherwise.
+def munch(r: Rexp, action: List[Char] => Token, s: List[Char], t: List[Char]) : Option[(List[Char], Token)] =
+  s match {
+    case Nil => if (nullable(r)) Some((Nil, action(t))) else None
+    case c::rest =>
+      // compute the derivative once instead of once per guard
+      val d = der(c, r)
+      if (!no_more(d)) munch(d, action, rest, t ::: List(c))
+      else if (nullable(r)) Some((s, action(t)))
+      else None
+  }
+// splits one token off the front of s: every rule is tried with munch and
+// the match leaving the shortest remaining input (i.e. the longest match)
+// wins; on a tie the rule listed first is chosen.
+// Throws IllegalArgumentException (via error) when no rule matches.
+def one_token (rs: List[Rule], s: List[Char]) : (List[Char], Token) = {
+  val somes = rs.map { case (r, action) => munch(r, action, s, Nil) }.flatten
+  if (somes.isEmpty) error(s.mkString) else somes.minBy(_._1.length)
+}
+// turns the whole input s into a list of tokens by repeatedly
+// splitting off one token with one_token until nothing is left
+// NOTE(review): a rule that matches the empty string would leave the
+// input unchanged and make this loop forever - confirm rules never do
+def tokenize (rs: List[Rule], s: List[Char]) : List[Token] = s match {
+  case Nil => Nil
+  case _ =>
+    val (rest, token) = one_token(rs, s)
+    token :: tokenize(rs, rest)
+}
+// front end of the lexer: tokenises the string s with the rules rs
+// and discards every T_WHITESPACE token from the result
+def tokenizer(rs: List[Rule], s: String) : List[Token] = {
+  val tokens = tokenize(rs, s.toList)
+  tokens.filter(tok => tok != T_WHITESPACE)
+}
+// lexing rules for arithmetic expressions
+// The WHITESPACE rule is required: the inputs contain blanks, and without
+// a matching rule one_token fails on the first of them. tokenizer removes
+// the resulting T_WHITESPACE tokens again.
+val lexing_rules: List[Rule]=
+  List((NUMBER, (s) => T_NUM),
+       (WHITESPACE, (s) => T_WHITESPACE),
+       (LPAREN, (s) => T_LPAREN),
+       (RPAREN, (s) => T_RPAREN),
+       (OPS, (s) => T_OP(s.mkString)))
+// examples: tokenise two arithmetic expressions and print the token lists
+println(tokenizer(lexing_rules, "2 + 3 * 4 + 1"))
+println(tokenizer(lexing_rules, "(2 + 3) * (4 + 1)"))
+// a grammar is a list of rules; each rule pairs the name of a nonterminal
+// (left-hand side) with the token sequence it can be derived into
+type Grammar = List[(String, List[Token])]
+// grammar for arithmetic expressions
+// (the single nonterminal E appears on right-hand sides as T_NT("E"))
+val grammar =
+ List ("E" -> List(T_NUM),
+ "E" -> List(T_NT("E"), T_OP("+"), T_NT("E")),
+ "E" -> List(T_NT("E"), T_OP("-"), T_NT("E")),
+ "E" -> List(T_NT("E"), T_OP("*"), T_NT("E")),
+ "E" -> List(T_LPAREN, T_NT("E"), T_RPAREN))
+// searches ts1 for the first occurrence of prefix and cuts it out
+//   ts1    - the list to search
+//   prefix - the sub-list to look for
+//   ts2    - elements already skipped, in reverse order (Nil at the start)
+// Returns Some((elements before the occurrence, elements after it)), with
+// ts2.reverse placed in front of the "before" part; returns None when ts1
+// is empty or does not contain prefix.
+def chop[A](ts1: List[A], prefix: List[A], ts2: List[A]) : Option[(List[A], List[A])] = {
+  val at = ts1.indexOfSlice(prefix)
+  if (ts1.isEmpty || at < 0) None
+  else Some((ts2.reverse ::: ts1.take(at), ts1.drop(at + prefix.length)))
+}
+// examples
+chop(List(1,2,3,4,5,6,7,8,9), List(4,5), Nil) // Some((List(1,2,3), List(6,7,8,9)))
+chop(List(1,2,3,4,5,6,7,8,9), List(3,5), Nil) // None: 3,5 does not occur contiguously
+// replaces the first occurrence of the sub-list out in ts by the list in;
+// returns None when out does not occur in ts
+def replace[A](ts: List[A], out: List[A], in: List [A]) =
+  chop(ts, out, Nil).map { case (before, after) => before ::: in ::: after }
+// naive bottom-up recogniser: succeeds if the token list ts can be reduced
+// to the single nonterminal E by repeatedly replacing the first occurrence
+// of some rule's right-hand side with its left-hand side.
+// Every applicable rule is tried, so the search backtracks over all
+// reduction orders and stops at the first one that succeeds.
+def parse1(g: Grammar, ts: List[Token]) : Boolean =
+  ts == List(T_NT("E")) || g.exists { case (lhs, rhs) =>
+    replace(ts, rhs, List(T_NT(lhs))).exists(parse1(g, _))
+  }
+// examples: the first two inputs are expressions of the grammar; the third
+// has a number directly followed by a parenthesis and is not
+// NOTE(review): each line prints an empty line and then evaluates parse1
+// without printing its result - presumably meant for the REPL; confirm
+println() ; parse1(grammar, tokenizer(lexing_rules, "2 + 3 * 4 + 1"))
+println() ; parse1(grammar, tokenizer(lexing_rules, "(2 + 3) * (4 + 1)"))
+println() ; parse1(grammar, tokenizer(lexing_rules, "(2 + 3) * 4 (4 + 1)"))
Binary file slides06.pdf has changed
--- a/slides06.tex Mon Oct 29 12:31:31 2012 +0000
+++ b/slides06.tex Wed Oct 31 02:05:12 2012 +0000
@@ -219,6 +219,26 @@
+``I hate coding. I do not want to look at code.''
+``I am appalled. You do not show code anymore.''
@@ -348,134 +368,33 @@
-\frametitle{\begin{tabular}{c}Last Week\end{tabular}}
-Last week I showed you\bigskip
-\item an algorithm for automata minimisation
-\item an algorithm for transforming a regular expression into an NFA
-\item an algorithm for transforming an NFA into a DFA (subset construction)
-\frametitle{\begin{tabular}{c}This Week\end{tabular}}
-Go over the algorithms again, but with two new things and \ldots\medskip
-\item with the example: what is the regular expression that accepts every string, except those ending
-in \bl{aa}?\medskip
-\item Go over the proof for \bl{$L(rev(r)) = Rev(L(r))$}.\medskip
-\item Anything else so far.
-\frametitle{\begin{tabular}{c}Proofs By Induction\end{tabular}}
+A (context-free) Grammar \bl{$G$} consists of
-\item \bl{$P$} holds for \bl{$\varnothing$}, \bl{$\epsilon$} and \bl{c}\bigskip
-\item \bl{$P$} holds for \bl{r$_1$ + r$_2$} under the assumption that \bl{$P$} already
-holds for \bl{r$_1$} and \bl{r$_2$}.\bigskip
-\item \bl{$P$} holds for \bl{r$_1$ $\cdot$ r$_2$} under the assumption that \bl{$P$} already
-holds for \bl{r$_1$} and \bl{r$_2$}.
-\item \bl{$P$} holds for \bl{r$^*$} under the assumption that \bl{$P$} already
-holds for \bl{r}.
+\item a finite set of nonterminal symbols (upper case)
+\item a finite set of terminal symbols or tokens (lower case)
+\item a start symbol (which must be a nonterminal)
+\item a set of rules
-\bl{$P(r):\;\;L(rev(r)) = Rev(L(r))$}
-What is the regular expression that accepts every string, except those ending
-in \bl{aa}?\pause\bigskip
-\bl{(a + b)$^*$ba}\\
-\bl{(a + b)$^*$ab}\\
-\bl{(a + b)$^*$bb}\\\pause
-What are the strings to be avoided?\pause\medskip
-\bl{(a + b)$^*$aa}
+\bl{$A \rightarrow \text{rhs}$}
-An NFA for \bl{(a + b)$^*$aa}
-\begin{tikzpicture}[scale=2, line width=0.5mm]
- \node[state, initial] (q0) at ( 0,1) {$q_0$};
- \node[state] (q1) at ( 1,1) {$q_1$};
- \node[state, accepting] (q2) at ( 2,1) {$q_2$};
- \path[->] (q0) edge node[above] {$a$} (q1)
- (q1) edge node[above] {$a$} (q2)
- (q0) edge [loop below] node {$a$} ()
- (q0) edge [loop above] node {$b$} ()
- ;
+where \bl{rhs} are sequences involving terminals and nonterminals.\medskip\pause
-Minimisation for DFAs\\
-Subset Construction for NFAs
-\frametitle{\begin{tabular}{c}DFA Minimisation\end{tabular}}
+We can also allow rules
+\bl{$A \rightarrow \text{rhs}_1 | \text{rhs}_2 | \ldots$}
-\item Take all pairs \bl{(q, p)} with \bl{q $\not=$ p}
-\item Mark all pairs that accepting and non-accepting states
-\item For all unmarked pairs \bl{(q, p)} and all characters \bl{c} tests wether
-\bl{($\delta$(q,c), $\delta$(p,c))}
-are marked. If yes, then also mark \bl{(q, p)}.
-\item Repeat last step until nothing changed.
-\item All unmarked pairs can be merged.
@@ -483,141 +402,46 @@
-Minimal DFA \only<1>{\bl{(a + b)$^*$aa}}\only<2->{\alert{not} \bl{(a + b)$^*$aa}}
-\begin{tikzpicture}[scale=2, line width=0.5mm]
- \only<1>{\node[state, initial] (q0) at ( 0,1) {$q_0$};}
- \only<2->{\node[state, initial,accepting] (q0) at ( 0,1) {$q_0$};}
- \only<1>{\node[state] (q1) at ( 1,1) {$q_1$};}
- \only<2->{\node[state,accepting] (q1) at ( 1,1) {$q_1$};}
- \only<1>{\node[state, accepting] (q2) at ( 2,1) {$q_2$};}
- \only<2->{\node[state] (q2) at ( 2,1) {$q_2$};}
- \path[->] (q0) edge[bend left] node[above] {$a$} (q1)
- (q1) edge[bend left] node[above] {$b$} (q0)
- (q2) edge[bend left=50] node[below] {$b$} (q0)
- (q1) edge node[above] {$a$} (q2)
- (q2) edge [loop right] node {$a$} ()
- (q0) edge [loop below] node {$b$} ()
- ;
+$S$ & $\rightarrow$ & $\epsilon$ \\
+$S$ & $\rightarrow$ & $aSa$ \\
+$S$ & $\rightarrow$ & $bSb$ \\
+$S$ & $\rightarrow$ & $\epsilon \;|\; aSa \;|\;bSb$ \\
-\onslide<3>{How to get from a DFA to a regular expression?}
-\begin{tikzpicture}[scale=2, line width=0.5mm]
- \only<1->{\node[state, initial] (q0) at ( 0,1) {$q_0$};}
- \only<1->{\node[state] (q1) at ( 1,1) {$q_1$};}
- \only<1->{\node[state] (q2) at ( 2,1) {$q_2$};}
- \path[->] (q0) edge[bend left] node[above] {$a$} (q1)
- (q1) edge[bend left] node[above] {$b$} (q0)
- (q2) edge[bend left=50] node[below] {$b$} (q0)
- (q1) edge node[above] {$a$} (q2)
- (q2) edge [loop right] node {$a$} ()
- (q0) edge [loop below] node {$b$} ()
- ;
-\begin{tabular}{r@ {\hspace{2mm}}c@ {\hspace{2mm}}l}
-\bl{$q_0$} & \bl{$=$} & \bl{$2\, q_0 + 3 \,q_1 + 4\, q_2$}\\
-\bl{$q_1$} & \bl{$=$} & \bl{$2 \,q_0 + 3\, q_1 + 1\, q_2$}\\
-\bl{$q_2$} & \bl{$=$} & \bl{$1\, q_0 + 5\, q_1 + 2\, q_2$}\\
-\begin{tikzpicture}[scale=2, line width=0.5mm]
- \only<1->{\node[state, initial] (q0) at ( 0,1) {$q_0$};}
- \only<1->{\node[state] (q1) at ( 1,1) {$q_1$};}
- \only<1->{\node[state] (q2) at ( 2,1) {$q_2$};}
- \path[->] (q0) edge[bend left] node[above] {$a$} (q1)
- (q1) edge[bend left] node[above] {$b$} (q0)
- (q2) edge[bend left=50] node[below] {$b$} (q0)
- (q1) edge node[above] {$a$} (q2)
- (q2) edge [loop right] node {$a$} ()
- (q0) edge [loop below] node {$b$} ()
- ;
-\begin{tabular}{r@ {\hspace{2mm}}c@ {\hspace{2mm}}l}
-\bl{$q_0$} & \bl{$=$} & \bl{$\epsilon + q_0\,b + q_1\,b + q_2\,b$}\\
-\bl{$q_1$} & \bl{$=$} & \bl{$q_0\,a$}\\
-\bl{$q_2$} & \bl{$=$} & \bl{$q_1\,a + q_2\,a$}\\
-Arden's Lemma:
-If \bl{$q = q\,r + s$}\; then\; \bl{$q = s\, r^*$}
-\frametitle{\begin{tabular}{c}Algorithms on Automata\end{tabular}}
-\item Reg $\rightarrow$ NFA: Thompson-McNaughton-Yamada method\medskip
-\item NFA $\rightarrow$ DFA: Subset Construction\medskip
-\item DFA $\rightarrow$ Reg: Brzozowski's Algebraic Method\medskip
-\item DFA minimisation: Hopcrofts Algorithm\medskip
-\item complement DFA
+\frametitle{\begin{tabular}{c}Arithmetic Expressions\end{tabular}}
-$E$ & $\rightarrow$ & $F + (F \cdot \qq*\qq \cdot F) + (F \cdot \qq\backslash\qq \cdot F)$\\
-$F$ & $\rightarrow$ & $T + (T \cdot \qq\texttt{+}\qq \cdot T) + (T \cdot \qq\texttt{-}\qq \cdot T)$\\
-$T$ & $\rightarrow$ & $num + (\qq\texttt{(}\qq \cdot E \cdot \qq\texttt{)}\qq)$\\
+$E$ & $\rightarrow$ & $num\_token$ \\
+$E$ & $\rightarrow$ & $E + E$ \\
+$E$ & $\rightarrow$ & $E - E$ \\
+$E$ & $\rightarrow$ & $E * E$ \\
+$E$ & $\rightarrow$ & $( E )$
-\bl{$E$}, \bl{$F$} and \bl{$T$} are non-terminals\\
-\bl{$E$} is start symbol\\
-\bl{$num$}, \bl{(}, \bl{)}, \bl{+} \ldots are terminals\bigskip\\
+\bl{\texttt{1 + 2 * 3 + 4}}
@@ -626,32 +450,33 @@
+\frametitle{\begin{tabular}{c}Parse Trees\end{tabular}}
-$E$ & $\rightarrow$ & $F + (F \cdot \qq*\qq \cdot F) + (F \cdot \qq\backslash\qq \cdot F)$\\
-$F$ & $\rightarrow$ & $T + (T \cdot \qq\texttt{+}\qq \cdot T) + (T \cdot \qq\texttt{-}\qq \cdot T)$\\
-$T$ & $\rightarrow$ & $num + (\qq\texttt{(}\qq \cdot E \cdot \qq\texttt{)}\qq)$\\
+$E$ & $\rightarrow$ & $F \;|\; F * F$\\
+$F$ & $\rightarrow$ & $T \;|\; T + T \;|\; T - T$\\
+$T$ & $\rightarrow$ & $num\_token \;|\; ( E )$\\
\begin{tikzpicture}[level distance=8mm, blue]
- \node {E}
- child {node {F}
- child {node {T}
- child {node {\qq(\qq\,E\,\qq)\qq}
- child {node{F \qq*\qq{} F}
- child {node {T} child {node {2}}}
- child {node {T} child {node {3}}}
+ \node {$E$}
+ child {node {$F$}
+ child {node {$T$}
+ child {node {(\,$E$\,)}
+ child {node{$F$ *{} $F$}
+ child {node {$T$} child {node {2}}}
+ child {node {$T$} child {node {3}}}
- child {node {\qq+\qq}}
- child {node {T}
- child {node {\qq(\qq\,E\,\qq)\qq}
- child {node {F}
- child {node {T \qq+\qq{} T}
+ child {node {+}}
+ child {node {$T$}
+ child {node {(\,$E$\,)}
+ child {node {$F$}
+ child {node {$T$ +{} $T$}
child {node {3}}
child {node {4}}
@@ -660,12 +485,59 @@
-\begin{textblock}{5}(1, 5)
+\begin{textblock}{5}(1, 6.5)
+\frametitle{\begin{tabular}{c}Ambiguous Grammars\end{tabular}}
+A grammar is \alert{ambiguous} if there is a string that has at least two parse trees.
+$E$ & $\rightarrow$ & $num\_token$ \\
+$E$ & $\rightarrow$ & $E + E$ \\
+$E$ & $\rightarrow$ & $E - E$ \\
+$E$ & $\rightarrow$ & $E * E$ \\
+$E$ & $\rightarrow$ & $( E )$
+\bl{\texttt{1 + 2 * 3 + 4}}
+\frametitle{\begin{tabular}{c}Chomsky Normal Form\end{tabular}}
+All rules must be of the form
+\bl{$A \rightarrow a$}
+\bl{$A \rightarrow BC$}