# HG changeset patch
# User Christian Urban
# Date 1353464416 0
# Node ID 9215b9fb88520b2b9e6586304d42219459829e63
# Parent  ade6af51402cc241b7e696ade5e06b6d95cf2e16
tuned

diff -r ade6af51402c -r 9215b9fb8852 app7.scala
--- a/app7.scala	Tue Nov 20 22:06:05 2012 +0000
+++ b/app7.scala	Wed Nov 21 02:20:16 2012 +0000
@@ -1,17 +1,14 @@
-def matches(r: Rexp, s: String) : Boolean =
-  nullable(derivs(r, s.toList))
+abstract class Parser[I, T] {
+  def parse(ts: I): Set[(T, I)]
+
+  def parse_all(ts: I) : Set[T] =
+    for ((head, tail) <- parse(ts); if (tail.isEmpty))
+      yield head
+
+  def || (right : => Parser[I, T]) : Parser[I, T] =
+    new AltParser(this, right)
+  def ==>[S] (f: => T => S) : Parser[I, S] =
+    new FunParser(this, f)
+}
-
-/* Examples */
-
-println(matches(SEQ(SEQ(CHAR('c'), CHAR('a')), CHAR('b')),"cab"))
-println(matches(STAR(CHAR('a')),"aaa"))
-
-/* Convenience using implicits */
-implicit def string2rexp(s : String) : Rexp = {
-  s.foldRight (EMPTY: Rexp) ( (c, r) => SEQ(CHAR(c), r) )
-}
-
-println(matches("cab" ,"cab"))
-println(matches(STAR("a"),"aaa"))
-println(matches(STAR("a"),"aaab"))
diff -r ade6af51402c -r 9215b9fb8852 app8.scala
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/app8.scala	Wed Nov 21 02:20:16 2012 +0000
@@ -0,0 +1,23 @@
+class SeqParser[I, T, S](p: => Parser[I, T],
+                         q: => Parser[I, S])
+  extends Parser[I, (T, S)] {
+  def parse(sb: I) =
+    for ((head1, tail1) <- p.parse(sb);
+         (head2, tail2) <- q.parse(tail1))
+      yield ((head1, head2), tail2)
+}
+
+class AltParser[I, T](p: => Parser[I, T],
+                      q: => Parser[I, T])
+  extends Parser[I, T] {
+  def parse(sb: I) = p.parse(sb) ++ q.parse(sb)
+}
+
+class FunParser[I, T, S](p: => Parser[I, T], f: T => S)
+  extends Parser[I, S] {
+  def parse(sb: I) =
+    for ((head, tail) <- p.parse(sb))
+      yield (f(head), tail)
+}
+
+
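[Editorial aside, not part of the changeset: a minimal sketch of how the combinators above compose. It assumes the Parser, SeqParser, AltParser and FunParser definitions from app7.scala/app8.scala are in scope; CharParser is a hypothetical helper introduced only for illustration.]

// hypothetical single-character parser over plain strings (I = String, T = Char)
case class CharParser(c: Char) extends Parser[String, Char] {
  def parse(sb: String) =
    if (sb.nonEmpty && sb.head == c) Set((c, sb.tail))
    else Set[(Char, String)]()
}

// 'a' followed by 'b', with a semantic action gluing the two characters
// back into a string; || and ==> come from the abstract Parser class
val ab   = (new SeqParser(CharParser('a'), CharParser('b'))) ==> ((p: (Char, Char)) => p._1.toString + p._2)
val aOrB = CharParser('a') || CharParser('b')

// parse returns all (parsed, unparsed) pairs, parse_all keeps complete parses:
//   ab.parse("abc")     == Set(("ab", "c"))
//   ab.parse_all("ab")  == Set("ab")
//   aOrB.parse("bc")    == Set(('b', "c"))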
diff -r ade6af51402c -r 9215b9fb8852 html-tokens.scala
--- a/html-tokens.scala	Tue Nov 20 22:06:05 2012 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,112 +0,0 @@
-import Console._
-
-// regular expressions including NOT
-abstract class Rexp {
-  def ~ (right : Rexp) = SEQ(this, right)
-  def || (right : Rexp) = ALT(this, right)
-}
-case object NULL extends Rexp
-case object EMPTY extends Rexp
-case class CHAR(c: Char) extends Rexp
-case class ALT(r1: Rexp, r2: Rexp) extends Rexp
-case class SEQ(r1: Rexp, r2: Rexp) extends Rexp
-case class STAR(r: Rexp) extends Rexp
-case class NOT(r: Rexp) extends Rexp
-
-// some convenience for typing in regular expressions
-def charlist2rexp(s : List[Char]) : Rexp = s match {
-  case Nil => EMPTY
-  case c::Nil => CHAR(c)
-  case c::s => SEQ(CHAR(c), charlist2rexp(s))
-}
-implicit def string2rexp(s : String) : Rexp = charlist2rexp(s.toList)
-
-
-// nullable function: tests whether the regular
-// expression can recognise the empty string
-def nullable (r: Rexp) : Boolean = r match {
-  case NULL => false
-  case EMPTY => true
-  case CHAR(_) => false
-  case ALT(r1, r2) => nullable(r1) || nullable(r2)
-  case SEQ(r1, r2) => nullable(r1) && nullable(r2)
-  case STAR(_) => true
-  case NOT(r) => !(nullable(r))
-}
-
-// tests whether a regular expression
-// cannot recognise more
-def no_more (r: Rexp) : Boolean = r match {
-  case NULL => true
-  case EMPTY => false
-  case CHAR(_) => false
-  case ALT(r1, r2) => no_more(r1) && no_more(r2)
-  case SEQ(r1, r2) => if (nullable(r1)) (no_more(r1) && no_more(r2)) else no_more(r1)
-  case STAR(_) => false
-  case NOT(r) => !(no_more(r))
-}
-
-
-// derivative of a regular expression w.r.t. a character
-def der (c: Char, r: Rexp) : Rexp = r match {
-  case NULL => NULL
-  case EMPTY => NULL
-  case CHAR(d) => if (c == d) EMPTY else NULL
-  case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))
-  case SEQ(r1, r2) =>
-    if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
-    else SEQ(der(c, r1), r2)
-  case STAR(r) => SEQ(der(c, r), STAR(r))
-  case NOT(r) => NOT(der (c, r))
-}
-
-def error (s: String) = "Could not lex " + s
-
-def munch(r: Rexp, s: List[Char], t: List[Char]) : Option[(List[Char], List[Char])] = {
-  //println("Look at" + s);
-  s match {
-    case Nil if (nullable(r)) => Some(Nil, t)
-    case Nil => None
-    case c::s if (no_more(der (c, r)) && nullable(r)) => Some(c::s, t)
-    case c::s if (no_more(der (c, r))) => None
-    case c::s => munch(der (c, r), s, t ::: List(c))
-  }
-}
-
-def one_string (regs: List[Rexp], s: List[Char]) : Either[(List[Char], List[Char]), String] = {
-  val somes = regs.map { munch(_, s, Nil) } .flatten
-  if (somes == Nil) Right(error(s.mkString)) else Left(somes sortBy (_._1.length) head)
-}
-
-def tokenize (regs: List[Rexp], s: List[Char]) : List[String] = s match {
-  case Nil => Nil
-  case _ => one_string(regs, s) match {
-    case Left((rest, s)) => { println("tokenized: " + s.mkString) ; s.mkString :: tokenize(regs, rest) }
-    case Right(msg) => { println(msg); sys.exit() }
-  }
-}
-
-
-// regular expression for specifying
-// ranges of characters
-def RANGE(s : List[Char]) : Rexp = s match {
-  case Nil => NULL
-  case c::Nil => CHAR(c)
-  case c::s => CHAR(c) || RANGE(s)
-}
-
-//one or more
-def PLUS(r: Rexp) = r ~ STAR(r)
-
-
-//some regular expressions
-val COLOUR = "BLACK" || "BLUE" || "CYAN" || "GREEN" || "MAGENTA" || "RED" || "WHITE" || "YELLOW"
-
-//BOLD || BLINK
-//INVISIBLE
-//RESET
-//REVERSED
-//UNDERLINED
-
-println(RED + BOLD + "test")
-println(RESET)
diff -r ade6af51402c -r 9215b9fb8852 html.scala
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/html.scala	Wed Nov 21 02:20:16 2012 +0000
@@ -0,0 +1,59 @@
+
+//:load matcher.scala
+
+// some regular expressions
+val SYM = RANGE("""ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz.,!?-{[()]}':;%0123456789""")
+val WORD = PLUS(SYM)
+
+val BTAG = SEQS("<", WORD, ">")
+val ETAG = SEQS("</", WORD, ">")
+
+val WHITESPACE = PLUS(RANGE(" \n"))
+
+// for classifying the strings that have been recognised
+abstract class Token
+case object T_WHITESPACE extends Token
+case class T_WORD(s: String) extends Token
+case class T_ETAG(s: String) extends Token
+case class T_BTAG(s: String) extends Token
+case class T_NT(s: String, rhs: List[Token]) extends Token
+
+val lexing_rules: List[Rule[Token]] =
+  List((BTAG, (s) => T_BTAG(s.mkString)),
+       (ETAG, (s) => T_ETAG(s.mkString)),
+       (WORD, (s) => T_WORD(s.mkString)),
+       (WHITESPACE, (s) => T_WHITESPACE))
+
+// the tokenizer
+val T = Tokenizer(lexing_rules)
+
+// width for printing
+val WIDTH = 60
+
+
+def interpret(ts: List[Token], c: Int, ctr: List[String]) : Unit = ts match {
+  case Nil => println(Console.RESET)
+  case T_WHITESPACE::rest => print(Console.RESET + " "); interpret(rest, c + 1, ctr)
+  case T_WORD(s)::rest => {
+    val newstr = Console.RESET + ctr.reverse.mkString + s
+    if (c + s.length < WIDTH) {
+      print(newstr);
+      interpret(rest, c + s.length, ctr)
+    }
+    else {
+      print("\n" + newstr)
+      interpret(rest, s.length, ctr)
+    }
+  }
+  case T_BTAG("<p>")::rest => print("\n"); interpret(rest, 0, ctr)
+  case T_ETAG("</p>")::rest => print("\n"); interpret(rest, 0, ctr)
+  case T_BTAG("<b>")::rest => interpret(rest, c, Console.BOLD :: ctr)
+  case T_BTAG("<u>")::rest => interpret(rest, c, Console.UNDERLINED :: ctr)
+  case T_BTAG("<cyan>")::rest => interpret(rest, c, Console.CYAN :: ctr)
+  case T_BTAG("<red>")::rest => interpret(rest, c, Console.RED :: ctr)
+  case T_BTAG("<blink>")::rest => interpret(rest, c, Console.BLINK :: ctr)
+  case T_ETAG(_)::rest => interpret(rest, c, ctr.tail)
+  case _::rest => interpret(rest, c, ctr)
+}
+
+interpret(T.fromFile("test.html"), 0, Nil)
diff -r ade6af51402c -r 9215b9fb8852 slides08.pdf
Binary file slides08.pdf has changed
diff -r ade6af51402c -r 9215b9fb8852 slides08.tex
--- a/slides08.tex	Tue Nov 20 22:06:05 2012 +0000
+++ b/slides08.tex	Wed Nov 21 02:20:16 2012 +0000
@@ -151,6 +151,360 @@
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \mode<presentation>{
 \begin{frame}[c]
+\frametitle{\begin{tabular}{c}Building a Web Browser\end{tabular}}
+
+Using a lexer: assume the following regular expressions
+
+\begin{center}
+\bl{\begin{tabular}{lcl}
+$SY\!M$ & $\dn$ & $(\text{a}..\text{zA}..\text{Z0}..\text{9}..)$\\
+$W\!O\!RD$ & $\dn$ & $SY\!M^+$\\
+$BT\!AG$ & $\dn$ & $<\!W\!O\!RD\!>$\\
+$ET\!AG$ & $\dn$ & $<\!/W\!O\!RD\!>$\\
+$W\!HIT\!E$ & $\dn$ & $\texttt{" "} + \texttt{"}\slash{}n\texttt{"}$\\
+\end{tabular}}
+\end{center}
+
+\end{frame}}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\mode<presentation>{
+\begin{frame}[c]
+\frametitle{\begin{tabular}{c}Interpreting a List of Tokens\end{tabular}}
+
+\begin{itemize}
+\item text should be formatted consistently up to a specified width, say 60 characters
+\item potential line breaks are inserted by the formatter
+\item repeated whitespaces are ``condensed'' to a single whitespace
+\item \bl{$<\!p\!>$} \bl{$<\!\slash{}p\!>$} start/end paragraph
+\item \bl{$<\!b\!>$} \bl{$<\!\slash{}b\!>$} start/end bold
+\item \bl{$<\!red\!>$} \bl{$<\!\slash{}red\!>$} start/end red (cyan, etc)
+
+
+\end{itemize}
+
+\end{frame}}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\mode<presentation>{
+\begin{frame}[c]
+\frametitle{\begin{tabular}{c}Interpreting a List of Tokens\end{tabular}}
+
+The lexer cannot prevent errors like
+
+\begin{center}
+\bl{$<\!b\!>$} \ldots \bl{$<\!p\!>$} \ldots \bl{$<\!\slash{}b\!>$} \ldots \bl{$<\!\slash{}p\!>$}
+\end{center}
+
+or
+
+\begin{center}
+\bl{$<\!\slash{}b\!>$} \ldots \bl{$<\!b\!>$}
+\end{center}
+
+
+\end{frame}}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\mode<presentation>{
+\begin{frame}[c]
+\frametitle{\begin{tabular}{c}Parser Combinators\end{tabular}}
+
+Parser combinators: \bigskip
+
+\begin{minipage}{1.1\textwidth}
+\begin{center}
+\mbox{}\hspace{-12mm}\mbox{}$\underbrace{\text{list of tokens}}_{\text{input}}$ \bl{$\Rightarrow$}
+$\underbrace{\text{set of (parsed input, unparsed input)}}_{\text{output}}$
+\end{center}
+\end{minipage}\bigskip
+
+\begin{itemize}
+\item sequencing
+\item alternative
+\item semantic action
+\end{itemize}
+
+\end{frame}}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\mode<presentation>{
+\begin{frame}[c]
+
+Alternative parser (code \bl{$p\;||\;q$})\bigskip
+
+\begin{itemize}
+\item apply \bl{$p$} and also \bl{$q$}; then combine the outputs
+\end{itemize}
+
+\begin{center}
+\large \bl{$p(\text{input}) \cup q(\text{input})$}
+\end{center}
+
+
+\end{frame}}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\mode<presentation>{
+\begin{frame}[c]
+
+Sequence parser (code \bl{$p\sim q$})\bigskip
+
+\begin{itemize}
+\item apply \bl{$p$} first, producing a set of pairs
+\item then apply \bl{$q$} to the unparsed parts
+\item then combine the results:\\ \mbox{}\;\;((output$_1$, output$_2$), unparsed part)
+\end{itemize}
+
+\begin{center}
+\begin{tabular}{l}
+\large \bl{$\{((o_1, o_2), u_2) \;|\;$}\\[2mm]
+\large\mbox{}\hspace{15mm} \bl{$(o_1, u_1) \in p(\text{input}) \wedge$}\\[2mm]
+\large\mbox{}\hspace{15mm} \bl{$(o_2, u_2) \in q(u_1)\}$}
+\end{tabular}
+\end{center}
+
+
+\end{frame}}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\mode<presentation>{
+\begin{frame}[c]
+
+Function parser (code \bl{$p \Longrightarrow f$})\bigskip
+
+\begin{itemize}
+\item apply \bl{$p$}, producing a set of pairs
+\item then apply the function \bl{$f$} to each first component
+\end{itemize}
+
+\begin{center}
+\begin{tabular}{l}
+\large \bl{$\{(f(o_1), u_1) \;|\; (o_1, u_1) \in p(\text{input})\}$}
+\end{tabular}
+\end{center}
+
+
+\end{frame}}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\mode<presentation>{
+\begin{frame}[c]
+
+Token parser:\bigskip
+
+\begin{itemize}
+\item if the input is
+
+\begin{center}
+\large \bl{$tok_1:: tok_2 :: \ldots :: tok_n$}
+\end{center}
+
+then return
+
+\begin{center}
+\large \bl{$\{(tok_1,\; tok_2 :: \ldots :: tok_n)\}$}
+\end{center}
+
+or
+
+\begin{center}
+\large \bl{$\{\}$}
+\end{center}
+
+if \bl{$tok_1$} is not the right token we are looking for
+\end{itemize}
+
+
+\end{frame}}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\mode<presentation>{
+\begin{frame}[c]
+
+Number-Token parser:\bigskip
+
+\begin{itemize}
+\item if the input is
+
+\begin{center}
+\large \bl{$num\_tok(42):: tok_2 :: \ldots :: tok_n$}
+\end{center}
+
+then return
+
+\begin{center}
+\large \bl{$\{(42,\; tok_2 :: \ldots :: tok_n)\}$}
+\end{center}
+
+or
+
+\begin{center}
+\large \bl{$\{\}$}
+\end{center}
+
+if the first token is not a number token
+\end{itemize}\pause
+
+\begin{center}
+list of tokens \bl{$\Rightarrow$} set of (\alert{int}, list of tokens)
+\end{center}
+
+\end{frame}}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\mode<presentation>{
+\begin{frame}[c]
+
+\begin{itemize}
+\item if the input is
+
+\begin{center}
+\begin{tabular}{l}
+\large \bl{$num\_tok(42)::$}\\
+\hspace{7mm}\large \bl{$num\_tok(3) ::$}\\
+\hspace{14mm}\large \bl{$tok_3 :: \ldots :: tok_n$}
+\end{tabular}
+\end{center}
+
+and the parser is
+
+\begin{center}
+\bl{$ntp \sim ntp$}
+\end{center}
+
+the successful output will be
+
+\begin{center}
+\large \bl{$\{((42, 3),\; tok_3 :: \ldots :: tok_n)\}$}
+\end{center}\pause
+
+Now we can form
+\begin{center}
+\bl{$(ntp \sim ntp) \Longrightarrow f$}
+\end{center}
+
+where \bl{$f$} is the semantic action (what to do with the pair)
+
+\end{itemize}
+
+\end{frame}}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\mode<presentation>{
+\begin{frame}[c]
+\frametitle{\begin{tabular}{c}Semantic Actions\end{tabular}}
+
+Addition
+
+\begin{center}
+\bl{$T \sim + \sim E \Longrightarrow \underbrace{f((x,y), z) \Rightarrow x + z}_{\text{semantic action}}$}
+\end{center}\pause
+
+Multiplication
+
+\begin{center}
+\bl{$F \sim * \sim T \Longrightarrow f((x,y), z) \Rightarrow x * z$}
+\end{center}\pause
+
+Parenthesis
+
+\begin{center}
+\bl{$\text{(} \sim E \sim \text{)} \Longrightarrow f((x,y), z) \Rightarrow y$}
+\end{center}
+
+\end{frame}}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
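[Editorial sketch, not part of the changeset: the number-token parser and the addition action from the preceding slides, written with the classes from app7.scala/app8.scala. Tok, NumTok, OpTok and NumParser are hypothetical stand-ins introduced only for this example; they are not the Token classes from html.scala.]

// hypothetical token type and a number-token parser (I = List[Tok], T = Int)
abstract class Tok
case class NumTok(n: Int) extends Tok
case class OpTok(s: String) extends Tok

object NumParser extends Parser[List[Tok], Int] {
  def parse(ts: List[Tok]) = ts match {
    case NumTok(n) :: rest => Set((n, rest))
    case _ => Set[(Int, List[Tok])]()
  }
}

// "ntp ~ ntp" from the slides, built with SeqParser, followed by a semantic
// action that adds the two parsed numbers
val addTwo =
  (new SeqParser(NumParser, NumParser)) ==> ((p: (Int, Int)) => p._1 + p._2)

// addTwo.parse(List(NumTok(42), NumTok(3), OpTok(";")))
//   == Set((45, List(OpTok(";"))))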
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\mode<presentation>{
+\begin{frame}[c]
+\frametitle{\begin{tabular}{c}Types of Parsers\end{tabular}}
+
+\begin{itemize}
+\item {\bf Sequencing}: if \bl{$p$} returns results of type \bl{$T$}, and \bl{$q$} results of type \bl{$S$},
+then \bl{$p \sim q$} returns results of type
+
+\begin{center}
+\bl{$T \times S$}
+\end{center}\pause
+
+\item {\bf Alternative}: if \bl{$p$} returns results of type \bl{$T$} then \bl{$q$} \alert{must} also have results of type \bl{$T$},
+and \bl{$p \;||\; q$} returns results of type
+
+\begin{center}
+\bl{$T$}
+\end{center}\pause
+
+\item {\bf Semantic Action}: if \bl{$p$} returns results of type \bl{$T$} and \bl{$f$} is a function from
+\bl{$T$} to \bl{$S$}, then
+\bl{$p \Longrightarrow f$} returns results of type
+
+\begin{center}
+\bl{$S$}
+\end{center}
+
+\end{itemize}
+
+\end{frame}}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\mode<presentation>{
+\begin{frame}[c]
+\frametitle{\begin{tabular}{c}Input Types of Parsers\end{tabular}}
+
+\begin{itemize}
+\item input: \alert{list of tokens}
+\item output: set of (output\_type, \alert{list of tokens})
+\end{itemize}\bigskip\pause
+
+actually it can be any input type as long as it is a kind of sequence
+(for example a string)
+
+\end{frame}}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\mode<presentation>{
+\begin{frame}[c]
+{\lstset{language=Scala}\fontsize{10}{12}\selectfont
+\texttt{\lstinputlisting{app7.scala}}}
+\end{frame}}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\mode<presentation>{
+\begin{frame}[c]
+{\lstset{language=Scala}\fontsize{10}{12}\selectfont
+\texttt{\lstinputlisting{app7.scala}}}
+\end{frame}}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\mode<presentation>{
+\begin{frame}[c]
+{\lstset{language=Scala}\fontsize{10}{12}\selectfont
+\texttt{\lstinputlisting{app8.scala}}}
+\end{frame}}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\mode<presentation>{
+\begin{frame}[c]
 \frametitle{\begin{tabular}{c}Two Grammars\end{tabular}}
 
 Which languages are recognised by the following two grammars?