handouts/ho06.tex
changeset 799 85267be9a5ed
parent 595 4bf0096bc06b
child 937 dc5ab66b11cc
equal deleted inserted replaced
798:aaf0bd0a211d 799:85267be9a5ed
     2 % !TEX program = xelatex
     2 % !TEX program = xelatex
     3 \documentclass{article}
     3 \documentclass{article}
     4 \usepackage{../style}
     4 \usepackage{../style}
     5 \usepackage{../langs}
     5 \usepackage{../langs}
     6 \usepackage{../grammar}
     6 \usepackage{../grammar}
       
     7 \usepackage{../graphics}
     7 
     8 
     8 \begin{document}
     9 \begin{document}
     9 
    10 
    10 \section*{Handout 6 (Parser Combinators)}
    11 \section*{Handout 6 (Parser Combinators)}
    11 
    12 
   242 new AltParser(CharParser('a'), CharParser('b'))
   243 new AltParser(CharParser('a'), CharParser('b'))
   243 \end{lstlisting}
   244 \end{lstlisting}
   244 \end{center}
   245 \end{center}
   245 
   246 
   246 \noindent Later on we will use Scala mechanism for introducing some
   247 \noindent Later on we will use Scala mechanism for introducing some
   247 more readable shorthand notation for this, like \texttt{"a" |
   248 more readable shorthand notation for this, like \texttt{p"a" ||
   248   "b"}. Let us look in detail at what this parser combinator produces
   249   p"b"}. Let us look in detail at what this parser combinator produces
   249 with some sample strings.
   250 with some sample strings.
   250 
   251 
   251 \begin{center}
   252 \begin{center}
   252 \begin{tabular}{rcl}
   253 \begin{tabular}{rcl}
   253 input strings & & output\medskip\\
   254 input strings & & output\medskip\\
   334 to an operator).  If any of the runs of \textit{p} and \textit{q}
   335 to an operator).  If any of the runs of \textit{p} and \textit{q}
   335 fail, that is produce the empty set, then \texttt{parse} will also
   336 fail, that is produce the empty set, then \texttt{parse} will also
   336 produce the empty set.
   337 produce the empty set.
   337 
   338 
   338 With the shorthand notation we shall introduce later for the sequence
   339 With the shorthand notation we shall introduce later for the sequence
   339 parser combinator, we can write for example \pcode{"a" ~ "b"}, which
   340 parser combinator, we can write for example \pcode{p"a" ~ p"b"}, which
   340 is the parser combinator that first recognises the character
   341 is the parser combinator that first recognises the character
   341 \texttt{a} from a string and then \texttt{b}. Let us look again at
   342 \texttt{a} from a string and then \texttt{b}. Let us look again at
   342 some examples of how this parser combinator processes some strings:
   343 some examples of how this parser combinator processes some strings:
   343 
   344 
   344 \begin{center}
   345 \begin{center}
   358 \emph{not} a simple pair \texttt{(ab, cde)} as one might erroneously
   359 \emph{not} a simple pair \texttt{(ab, cde)} as one might erroneously
   359 expect.  The parser returns the empty set in the other examples,
   360 expect.  The parser returns the empty set in the other examples,
   360 because they do not fit with what the parser is supposed to parse.
   361 because they do not fit with what the parser is supposed to parse.
   361 
   362 
   362 
   363 
   363 A slightly more complicated parser is \pcode{("a" | "b") ~ "c"} which
   364 A slightly more complicated parser is \pcode{(p"a" || p"b") ~ p"c"} which
   364 parses as first character either an \texttt{a} or \texttt{b}, followed
   365 parses as first character either an \texttt{a} or \texttt{b}, followed
   365 by a \texttt{c}. This parser produces the following outputs.
   366 by a \texttt{c}. This parser produces the following outputs.
   366 
   367 
   367 \begin{center}
   368 \begin{center}
   368 \begin{tabular}{rcl}
   369 \begin{tabular}{rcl}
   372 \texttt{\Grid{abde}} & $\rightarrow$ & $\{\}$
   373 \texttt{\Grid{abde}} & $\rightarrow$ & $\{\}$
   373 \end{tabular}
   374 \end{tabular}
   374 \end{center}
   375 \end{center}
   375 
   376 
   376 \noindent
   377 \noindent
   377 Now consider the parser \pcode{("a" ~ "b") ~ "c"} which parses
   378 Now consider the parser \pcode{(p"a" ~ p"b") ~ p"c"} which parses
   378 \texttt{a}, \texttt{b}, \texttt{c} in sequence. This parser produces
   379 \texttt{a}, \texttt{b}, \texttt{c} in sequence. This parser produces
   379 the following outputs.
   380 the following outputs.
   380 
   381 
   381 \begin{center}
   382 \begin{center}
   382 \begin{tabular}{rcl}
   383 \begin{tabular}{rcl}
   390 
   391 
   391 \noindent The second and third example fail, because something is
   392 \noindent The second and third example fail, because something is
   392 ``missing'' in the sequence we are looking for. The first succeeds but
   393 ``missing'' in the sequence we are looking for. The first succeeds but
   393 notice how the results nest with sequences: the parsed part is a
   394 notice how the results nest with sequences: the parsed part is a
   394 nested pair of the form \pcode{((a, b), c)}. If we nest the sequence
   395 nested pair of the form \pcode{((a, b), c)}. If we nest the sequence
   395 parser differently, say \pcode{"a" ~ ("b" ~ "c")}, then also
   396 parser differently, say \pcode{p"a" ~ (p"b" ~ p"c")}, then also
   396 our output pairs nest differently
   397 our output pairs nest differently
   397 
   398 
   398 \begin{center}
   399 \begin{center}
   399 \begin{tabular}{rcl}
   400 \begin{tabular}{rcl}
   400 input strings & & output\medskip\\
   401 input strings & & output\medskip\\
   402 \end{tabular}
   403 \end{tabular}
   403 \end{center}
   404 \end{center}
   404 
   405 
   405 \noindent
   406 \noindent
   406 Two more examples: first consider the parser
   407 Two more examples: first consider the parser
   407 \pcode{("a" ~ "a") ~ "a"} and the input \pcode{aaaa}:
   408 \pcode{(p"a" ~ p"a") ~ p"a"} and the input \pcode{aaaa}:
   408 
   409 
   409 \begin{center}
   410 \begin{center}
   410 \begin{tabular}{rcl}
   411 \begin{tabular}{rcl}
   411 input string & & output\medskip\\
   412 input string & & output\medskip\\
   412 \texttt{\Grid{aaaa}} & $\rightarrow$ & 
   413 \texttt{\Grid{aaaa}} & $\rightarrow$ & 
   414 \end{tabular}
   415 \end{tabular}
   415 \end{center}
   416 \end{center}
   416 
   417 
   417 \noindent Notice again how the results nest deeper and deeper as pairs (the
   418 \noindent Notice again how the results nest deeper and deeper as pairs (the
   418 last \pcode{a} is in the unprocessed part). To consume everything of
   419 last \pcode{a} is in the unprocessed part). To consume everything of
   419 this string we can use the parser \pcode{(("a" ~ "a") ~ "a") ~
   420 this string we can use the parser \pcode{((p"a" ~ p"a") ~ p"a") ~
   420   "a"}. Then the output is as follows:
   421   p"a"}. Then the output is as follows:
   421 
   422 
   422 \begin{center}
   423 \begin{center}
   423 \begin{tabular}{rcl}
   424 \begin{tabular}{rcl}
   424 input string & & output\medskip\\
   425 input string & & output\medskip\\
   425 \texttt{\Grid{aaaa}} & $\rightarrow$ & 
   426 \texttt{\Grid{aaaa}} & $\rightarrow$ & 
   442 ultimately-unsuccessful-parses. The main point is that the sequence
   443 ultimately-unsuccessful-parses. The main point is that the sequence
   443 parser combinator returns pairs that can nest according to the
   444 parser combinator returns pairs that can nest according to the
   444 nesting of the component parsers.
   445 nesting of the component parsers.
   445 
   446 
   446 
   447 
   447 Consider also carefully that constructing a parser such \pcode{"a" |
   448 Consider also carefully that constructing a parser such \pcode{p"a" ||
   448   ("a" ~ "b")} will result in a typing error. The intention with this
   449   (p"a" ~ p"b")} will result in a typing error. The intention with this
   449 parser is that we want to parse either an \texttt{a}, or an \texttt{a}
   450 parser is that we want to parse either an \texttt{a}, or an \texttt{a}
   450 followed by a \texttt{b}. However, the first parser has as output type
   451 followed by a \texttt{b}. However, the first parser has as output type
   451 a single character (recall the type of \texttt{CharParser}), but the
   452 a single character (recall the type of \texttt{CharParser}), but the
   452 second parser produces a pair of characters as output. The alternative
   453 second parser produces a pair of characters as output. The alternative
   453 parser is required to have both component parsers to have the same
   454 parser is required to have both component parsers to have the same
   460 actually combine two smaller parsers, but applies a function to the result
   461 actually combine two smaller parsers, but applies a function to the result
   461 of a parser.  It is implemented in Scala as follows
   462 of a parser.  It is implemented in Scala as follows
   462 
   463 
   463 \begin{center}
   464 \begin{center}
   464 \begin{lstlisting}[language=Scala,basicstyle=\small\ttfamily, numbers=none]
   465 \begin{lstlisting}[language=Scala,basicstyle=\small\ttfamily, numbers=none]
   465 class FunParser[I, T, S]
   466 class MapParser[I, T, S]
   466          (p: => Parser[I, T], 
   467          (p: => Parser[I, T], 
   467           f: T => S) extends Parser[I, S] {
   468           f: T => S) extends Parser[I, S] {
   468   def parse(in: I) = 
   469   def parse(in: I) = 
   469     for ((head, tail) <- p.parse(in)) yield (f(head), tail)
   470     for ((head, tail) <- p.parse(in)) yield (f(head), tail)
   470 }
   471 }
   478 produces sets of type \texttt{Set[(T, I)]}. The semantic action
   479 produces sets of type \texttt{Set[(T, I)]}. The semantic action
   479 combinator then applies the function \texttt{f} to all the `processed'
   480 combinator then applies the function \texttt{f} to all the `processed'
   480 parser outputs. Since this function is of type \texttt{T => S}, we
   481 parser outputs. Since this function is of type \texttt{T => S}, we
   481 obtain a parser with output type \texttt{S}. Again Scala lets us
   482 obtain a parser with output type \texttt{S}. Again Scala lets us
   482 introduce some shorthand notation for this parser
   483 introduce some shorthand notation for this parser
   483 combinator. Therefore we will write short \texttt{p ==> f} for it.
   484 combinator. Therefore we will write short \texttt{p.map(f)} for it.
   484 
   485 
   485 What are semantic actions good for? Well, they allow you to transform
   486 What are semantic actions good for? Well, they allow you to transform
   486 the parsed input into datastructures you can use for further
   487 the parsed input into datastructures you can use for further
   487 processing. A simple (contrived) example would be to transform parsed
   488 processing. A simple (contrived) example would be to transform parsed
   488 characters into ASCII numbers. Suppose we define a function \texttt{f}
   489 characters into ASCII numbers. Suppose we define a function \texttt{f}
   501 We then can run the following two parsers on the input \texttt{cbd}:
   502 We then can run the following two parsers on the input \texttt{cbd}:
   502 
   503 
   503 \begin{center}
   504 \begin{center}
   504 \begin{lstlisting}[language=Scala,basicstyle=\small\ttfamily, numbers=none]
   505 \begin{lstlisting}[language=Scala,basicstyle=\small\ttfamily, numbers=none]
   505 c.parse("cbd")
   506 c.parse("cbd")
   506 (c ==> f).parse("cbd")
   507 c.map(f).parse("cbd")
   507 \end{lstlisting}
   508 \end{lstlisting}
   508 \end{center}
   509 \end{center}
   509 
   510 
   510 \noindent
   511 \noindent
   511 In the first line we obtain the expected result \texttt{Set(('c',
   512 In the first line we obtain the expected result \texttt{Set(('c',
   563 Scala). We want to convert this string into the corresponding
   564 Scala). We want to convert this string into the corresponding
   564 \texttt{Int}. We can do this as follows using a semantic action
   565 \texttt{Int}. We can do this as follows using a semantic action
   565 
   566 
   566 \begin{center}
   567 \begin{center}
   567 \begin{lstlisting}[language=Scala,basicstyle=\small\ttfamily, numbers=none]
   568 \begin{lstlisting}[language=Scala,basicstyle=\small\ttfamily, numbers=none]
   568 (NumParser ==> (s => s.toInt)).parse("123abc")
   569 NumParser.map{s => s.toInt}.parse("123abc")
   569 \end{lstlisting}
   570 \end{lstlisting}
   570 \end{center}  
   571 \end{center}  
   571 
   572 
   572 \noindent
   573 \noindent
   573 The function in the semantic action converts a string into an
   574 The function in the semantic action converts a string into an
   583 will make our life much easier when writing actual parsers. We can define
   584 will make our life much easier when writing actual parsers. We can define
   584 some implicits which allow us to write
   585 some implicits which allow us to write
   585 
   586 
   586 \begin{center}
   587 \begin{center}
   587 \begin{tabular}{ll}  
   588 \begin{tabular}{ll}  
   588   \pcode{p | q} & alternative parser\\
   589   \pcode{p || q} & alternative parser\\
   589   \pcode{p ~ q} & sequence parser\\ 
   590   \pcode{p ~ q} & sequence parser\\ 
   590   \pcode{p ==> f} & semantic action parser
   591   \pcode{p.map(f)} & semantic action parser
   591 \end{tabular}
   592 \end{tabular}
   592 \end{center}
   593 \end{center}
   593 
   594 
   594 \noindent
   595 \noindent
   595 as well as to use plain strings for specifying simple string parsers.
   596 as well as to use string interpolations for specifying simple string parsers.
   596 
   597 
   597 The idea is that this shorthand notation allows us to easily translate
   598 The idea is that this shorthand notation allows us to easily translate
   598 context-free grammars into code. For example recall our context-free
   599 context-free grammars into code. For example recall our context-free
   599 grammar for palindromes:
   600 grammar for palindromes:
   600 
   601 
   604 
   605 
   605 \noindent
   606 \noindent
   606 Each alternative in this grammar translates into an alternative parser
   607 Each alternative in this grammar translates into an alternative parser
   607 combinator.  The $\cdot$ can be translated to a sequence parser
   608 combinator.  The $\cdot$ can be translated to a sequence parser
   608 combinator. The parsers for $a$, $b$ and $\epsilon$ can be simply
   609 combinator. The parsers for $a$, $b$ and $\epsilon$ can be simply
   609 written as \texttt{"a"}, \texttt{"b"} and \texttt{""}.
   610 written as \texttt{p"a"}, \texttt{p"b"} and \texttt{p""}.
   610 
   611 
   611 
   612 
   612 \subsubsection*{How to build parsers using parser combinators?}
   613 \subsubsection*{How to build parsers using parser combinators?}
   613 
   614 
   614 The beauty of parser combinators is the ease with which they can be
   615 The beauty of parser combinators is the ease with which they can be
   618 The first idea would be to translate it into the following code
   619 The first idea would be to translate it into the following code
   619 
   620 
   620 \begin{center}
   621 \begin{center}
   621 \begin{lstlisting}[language=Scala,basicstyle=\small\ttfamily, numbers=none]
   622 \begin{lstlisting}[language=Scala,basicstyle=\small\ttfamily, numbers=none]
   622 lazy val Pal : Parser[String, String] = 
   623 lazy val Pal : Parser[String, String] = 
   623   (("a" ~ Pal ~ "a") | ("b" ~ Pal ~ "b") | "a" | "b" | "")
   624   ((p"a" ~ Pal ~ p"a") || (p"b" ~ Pal ~ p"b") || p"a" || p"b" || p"")
   624 \end{lstlisting}
   625 \end{lstlisting}
   625 \end{center}
   626 \end{center}
   626 
   627 
   627 \noindent
   628 \noindent
   628 Unfortunately, this does not quite work yet as it produces a typing
   629 Unfortunately, this does not quite work yet as it produces a typing
   629 error. The reason is that the parsers \texttt{"a"}, \texttt{"b"} and
   630 error. The reason is that the parsers \texttt{p"a"}, \texttt{p"b"} and
   630 \texttt{""} all produce strings as output type and therefore can be
   631 \texttt{p""} all produce strings as output type and therefore can be
   631 put into an alternative \texttt{...| "a" | "b" | ""}. But both
   632 put into an alternative \texttt{...|| p"a" || p"b" || p""}. But both
   632 sequence parsers \pcode{"a" ~ Pal ~ "a"} and \pcode{"b" ~ Pal ~ "b"}
   633 sequence parsers \pcode{p"a" ~ Pal ~ p"a"} and \pcode{p"b" ~ Pal ~ p"b"}
   633 produce pairs of the form
   634 produce pairs of the form
   634 
   635 
   635 \begin{center}
   636 \begin{center}
   636 (((\texttt{a}-part, \texttt{Pal}-part), \texttt{a}-part), unprocessed part)
   637 (((\texttt{a}-part, \texttt{Pal}-part), \texttt{a}-part), unprocessed part)
   637 \end{center}
   638 \end{center}
   642 ``flattens'' these pairs and appends the corresponding strings, like
   643 ``flattens'' these pairs and appends the corresponding strings, like
   643 
   644 
   644 \begin{center}
   645 \begin{center}
   645 \begin{lstlisting}[language=Scala,basicstyle=\small\ttfamily, numbers=none]
   646 \begin{lstlisting}[language=Scala,basicstyle=\small\ttfamily, numbers=none]
   646 lazy val Pal : Parser[String, String] =  
   647 lazy val Pal : Parser[String, String] =  
   647   (("a" ~ Pal ~ "a") ==> { case ((x, y), z) => x + y + z } |
   648   ((p"a" ~ Pal ~ p"a").map{ case ((x, y), z) => x + y + z } ||
   648    ("b" ~ Pal ~ "b") ==> { case ((x, y), z) => x + y + z } |
   649    (p"b" ~ Pal ~ p"b").map{ case ((x, y), z) => x + y + z } ||
   649     "a" | "b" | "")
   650     p"a" || p"b" || p"")
   650 \end{lstlisting}
   651 \end{lstlisting}
   651 \end{center}
   652 \end{center}
   652 
   653 
   653 \noindent
   654 \noindent
   654 How does this work? Well, recall again what the pairs look like for
   655 How does this work? Well, recall again what the pairs look like for
   655 the parser \pcode{"a" ~ Pal ~ "a"}.  The pattern in the semantic
   656 the parser \pcode{p"a" ~ Pal ~ p"a"}.  The pattern in the semantic
   656 action matches the nested pairs (the \texttt{x} with the
   657 action matches the nested pairs (the \texttt{x} with the
   657 \texttt{a}-part and so on).  Unfortunately when we have such nested
   658 \texttt{a}-part and so on).  Unfortunately when we have such nested
   658 pairs, Scala requires us to define the function using the
   659 pairs, Scala requires us to define the function using the
   659 \pcode{case}-syntax
   660 \pcode{case}-syntax
   660 
   661 
   668 If we have more sequence parser combinators or have them differently nested,
   669 If we have more sequence parser combinators or have them differently nested,
   669 then the pattern in the semantic action needs to be adjusted accordingly.
   670 then the pattern in the semantic action needs to be adjusted accordingly.
   670 The action we implement above is to concatenate all three strings, which
   671 The action we implement above is to concatenate all three strings, which
   671 means after the semantic action is applied the output type of the parser 
   672 means after the semantic action is applied the output type of the parser 
   672 is \texttt{String}, which means it fits with the alternative parsers
   673 is \texttt{String}, which means it fits with the alternative parsers
   673 \texttt{...| "a" | "b" | ""}.
   674 \texttt{...|| p"a" || p"b" || p""}.
   674 
   675 
   675 If we run the parser above with \pcode{Pal.parse_all("abaaaba")} we obtain
   676 If we run the parser above with \pcode{Pal.parse_all("abaaaba")} we obtain
   676 as result the \pcode{Set(abaaaba)}, which indicates that the string is a palindrome
   677 as result the \pcode{Set(abaaaba)}, which indicates that the string is a palindrome
   677 (an empty set would mean something is wrong). But also notice what the
   678 (an empty set would mean something is wrong). But also notice what the
   678 intermediate results are generated by \pcode{Pal.parse("abaaaba")}
   679 intermediate results are generated by \pcode{Pal.parse("abaaaba")}
   770 
   771 
   771 \begin{center}
   772 \begin{center}
   772   \begin{lstlisting}[language=Scala,basicstyle=\small\ttfamily,
   773   \begin{lstlisting}[language=Scala,basicstyle=\small\ttfamily,
   773     xleftmargin=0mm, numbers=none]
   774     xleftmargin=0mm, numbers=none]
   774 lazy val P : Parser[String, String] = 
   775 lazy val P : Parser[String, String] = 
   775   "(" ~ P ~ ")" ~ P ==> { case (((_,x),_),y) => "{" + x + "}" + y } | ""
   776   (p"(" ~ P ~ p")" ~ P).map{ case (((_,x),_),y) => "{" + x + "}" + y } || p""
   776 \end{lstlisting}
   777 \end{lstlisting}
   777 \end{center}
   778 \end{center}
   778 
   779 
   779 \noindent
   780 \noindent
   780 Here we define a function where which ignores the parentheses in the
   781 Here we define a function where which ignores the parentheses in the
   813 write something like
   814 write something like
   814 
   815 
   815 \begin{center}
   816 \begin{center}
   816 \begin{lstlisting}[language=Scala,basicstyle=\small\ttfamily, numbers=none]
   817 \begin{lstlisting}[language=Scala,basicstyle=\small\ttfamily, numbers=none]
   817 lazy val E: Parser[String, Int] = 
   818 lazy val E: Parser[String, Int] = 
   818   (E ~ "+" ~ E ==> { case ((x, y), z) => x + z} |
   819   ((E ~ p"+" ~ E).map{ case ((x, y), z) => x + z} ||
   819    E ~ "-" ~ E ==> { case ((x, y), z) => x - z} |
   820    (E ~ p"-" ~ E).map{ case ((x, y), z) => x - z} ||
   820    E ~ "*" ~ E ==> { case ((x, y), z) => x * z} |
   821    (E ~ p"*" ~ E).map{ case ((x, y), z) => x * z} ||
   821    "(" ~ E ~ ")" ==> { case ((x, y), z) => y} |
   822    (p"(" ~ E ~ p")").map{ case ((x, y), z) => y} ||
   822    NumParserInt)
   823    NumParserInt)
   823 \end{lstlisting}
   824 \end{lstlisting}
   824 \end{center}
   825 \end{center}
   825 
   826 
   826 \noindent
   827 \noindent
   868 
   869 
   869 
   870 
   870 \begin{center}
   871 \begin{center}
   871 \begin{lstlisting}[language=Scala,basicstyle=\small\ttfamily, numbers=none]
   872 \begin{lstlisting}[language=Scala,basicstyle=\small\ttfamily, numbers=none]
   872 lazy val E: Parser[String, Int] = 
   873 lazy val E: Parser[String, Int] = 
   873   (T ~ "+" ~ E) ==> { case ((x, y), z) => x + z } |
   874   (T ~ p"+" ~ E).map{ case ((x, y), z) => x + z } ||
   874   (T ~ "-" ~ E) ==> { case ((x, y), z) => x - z } | T 
   875   (T ~ p"-" ~ E).map{ case ((x, y), z) => x - z } || T 
   875 lazy val T: Parser[String, Int] = 
   876 lazy val T: Parser[String, Int] = 
   876   (F ~ "*" ~ T) ==> { case ((x, y), z) => x * z } | F
   877   (F ~ p"*" ~ T).map{ case ((x, y), z) => x * z } || F
   877 lazy val F: Parser[String, Int] = 
   878 lazy val F: Parser[String, Int] = 
   878   ("(" ~ E ~ ")") ==> { case ((x, y), z) => y } | NumParserInt
   879   (p"(" ~ E ~ p")").map{ case ((x, y), z) => y } || NumParserInt
   879 \end{lstlisting}
   880 \end{lstlisting}
   880 \end{center}
   881 \end{center}
   881 
   882 
   882 \noindent
   883 \noindent
   883 Let us try out some examples:
   884 Let us try out some examples: