author | Christian Urban <christian dot urban at kcl dot ac dot uk> |
Mon, 13 Oct 2014 06:26:30 +0100 | |
changeset 277 | 8eb3261294ba |
parent 217 | cd6066f1056a |
child 292 | 7ed2a25dd115 |
permissions | -rw-r--r-- |
173
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
1 |
\documentclass{article} |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
2 |
\usepackage{hyperref} |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
3 |
\usepackage{amssymb} |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
4 |
\usepackage{amsmath} |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
5 |
\usepackage[T1]{fontenc} |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
6 |
\usepackage{tikz} |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
7 |
\usetikzlibrary{arrows} |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
8 |
\usetikzlibrary{automata} |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
9 |
\usetikzlibrary{shapes} |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
10 |
\usetikzlibrary{shadows} |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
11 |
\usetikzlibrary{positioning} |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
12 |
\usetikzlibrary{calc} |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
13 |
\usetikzlibrary{fit} |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
14 |
\usetikzlibrary{backgrounds} |
217
cd6066f1056a
updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
183
diff
changeset
|
15 |
\usepackage{../langs} |
cd6066f1056a
updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
183
diff
changeset
|
16 |
|
173
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
17 |
|
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
18 |
\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}% |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
19 |
|
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
20 |
\newcommand\grid[1]{% |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
21 |
\begin{tikzpicture}[baseline=(char.base)] |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
22 |
\path[use as bounding box] |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
23 |
(0,0) rectangle (1em,1em); |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
24 |
\draw[red!50, fill=red!20] |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
25 |
(0,0) rectangle (1em,1em); |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
26 |
\node[inner sep=1pt,anchor=base west] |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
27 |
(char) at (0em,\gridraiseamount) {#1}; |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
28 |
\end{tikzpicture}} |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
29 |
\newcommand\gridraiseamount{0.12em} |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
30 |
|
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
31 |
\makeatletter |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
32 |
\newcommand\Grid[1]{% |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
33 |
\@tfor\z:=#1\do{\grid{\z}}} |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
34 |
\makeatother |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
35 |
|
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
36 |
\newcommand\Vspace[1][.3em]{% |
183
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
37 |
\mbox{\kern.06em\vrule height.3ex}% |
173
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
38 |
\vbox{\hrule width#1}% |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
39 |
\hbox{\vrule height.3ex}} |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
40 |
|
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
41 |
\def\VS{\Vspace[0.6em]} |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
42 |
|
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
43 |
\begin{document} |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
44 |
|
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
45 |
\section*{Handout 6} |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
46 |
|
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
47 |
While regular expressions are very useful for lexing and for recognising |
183
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
48 |
many patterns in strings (like email addresses), they have their limitations. For |
173
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
49 |
example there is no regular expression that can recognise the language |
183
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
50 |
$a^nb^n$. Another example for which there exists no regular expression is the language of well-parenthesised |
173
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
51 |
expressions. In languages like Lisp, which use parentheses rather |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
52 |
extensively, it might be of interest whether the following two expressions |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
53 |
are well-parenthesised (the left one is, the right one is not): |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
54 |
|
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
55 |
\begin{center} |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
56 |
$(((()()))())$ \hspace{10mm} $(((()()))()))$ |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
57 |
\end{center} |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
58 |
|
183
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
59 |
\noindent |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
60 |
Not being able to solve such recognition problems is a serious limitation. |
173
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
61 |
In order to solve such recognition problems, we need more powerful |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
62 |
techniques than regular expressions. We will in particular look at \emph{context-free |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
63 |
languages}. They include the regular languages as the picture below shows: |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
64 |
|
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
65 |
|
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
66 |
\begin{center} |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
67 |
\begin{tikzpicture} |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
68 |
[rect/.style={draw=black!50, top color=white,bottom color=black!20, rectangle, very thick, rounded corners}] |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
69 |
|
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
70 |
\draw (0,0) node [rect, text depth=30mm, text width=46mm] {all languages}; |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
71 |
\draw (0,-0.4) node [rect, text depth=20mm, text width=44mm] {decidable languages}; |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
72 |
\draw (0,-0.65) node [rect, text depth=13mm] {context-sensitive languages}; |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
73 |
\draw (0,-0.84) node [rect, text depth=7mm, text width=35mm] {context-free languages}; |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
74 |
\draw (0,-1.05) node [rect] {regular languages}; |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
75 |
\end{tikzpicture} |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
76 |
\end{center} |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
77 |
|
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
78 |
\noindent |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
79 |
Context-free languages play an important role in `day-to-day' text processing and in |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
80 |
programming languages. Context-free languages are usually specified by grammars. |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
81 |
For example a grammar for well-parenthesised expressions is |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
82 |
|
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
83 |
\begin{center} |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
84 |
$P \;\;\rightarrow\;\; ( \cdot P \cdot ) \cdot P \;|\; \epsilon$ |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
85 |
\end{center} |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
86 |
|
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
87 |
\noindent |
183
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
88 |
In general grammars consist of finitely many rules built up from \emph{terminal symbols} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
89 |
(usually lower-case letters) and \emph{non-terminal symbols} (upper-case letters). Rules |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
90 |
have the shape |
173
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
91 |
|
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
92 |
\begin{center} |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
93 |
$NT \;\;\rightarrow\;\; \textit{rhs}$ |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
94 |
\end{center} |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
95 |
|
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
96 |
\noindent |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
97 |
where on the left-hand side is a single non-terminal and on the right a string consisting |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
98 |
of both terminals and non-terminals including the $\epsilon$-symbol for indicating the |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
99 |
empty string. We use the convention to separate components on |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
100 |
the right-hand side by using the $\cdot$ symbol, as in the grammar for well-parenthesised expressions. |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
101 |
We also use the convention to use $|$ as a shorthand notation for several rules. For example |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
102 |
|
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
103 |
\begin{center} |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
104 |
$NT \;\;\rightarrow\;\; \textit{rhs}_1 \;|\; \textit{rhs}_2$ |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
105 |
\end{center} |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
106 |
|
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
107 |
\noindent |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
108 |
means that the non-terminal $NT$ can be replaced by either $\textit{rhs}_1$ or $\textit{rhs}_2$. |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
109 |
If there is more than one non-terminal on the left-hand side of the rules, then we need to indicate |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
110 |
what is the \emph{starting} symbol of the grammar. For example the grammar for arithmetic expressions |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
111 |
can be given as follows |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
112 |
|
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
113 |
\begin{center} |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
114 |
\begin{tabular}{lcl} |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
115 |
$E$ & $\rightarrow$ & $N$ \\ |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
116 |
$E$ & $\rightarrow$ & $E \cdot + \cdot E$ \\ |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
117 |
$E$ & $\rightarrow$ & $E \cdot - \cdot E$ \\ |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
118 |
$E$ & $\rightarrow$ & $E \cdot * \cdot E$ \\ |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
119 |
$E$ & $\rightarrow$ & $( \cdot E \cdot )$\\ |
175
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
120 |
$N$ & $\rightarrow$ & $N \cdot N \;|\; 0 \;|\; 1 \;|\; \ldots \;|\; 9$ |
173
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
121 |
\end{tabular} |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
122 |
\end{center} |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
123 |
|
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
124 |
\noindent |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
125 |
where $E$ is the starting symbol. A \emph{derivation} for a grammar |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
126 |
starts with the starting symbol of the grammar and in each step replaces one |
175
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
127 |
non-terminal by a right-hand side of a rule. A derivation ends with a string |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
128 |
in which only terminal symbols are left. For example a derivation for the |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
129 |
string $(1 + 2) + 3$ is as follows: |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
130 |
|
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
131 |
\begin{center} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
132 |
\begin{tabular}{lll} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
133 |
$E$ & $\rightarrow$ & $E+E$\\ |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
134 |
& $\rightarrow$ & $(E)+E$\\ |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
135 |
& $\rightarrow$ & $(E+E)+E$\\ |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
136 |
& $\rightarrow$ & $(E+E)+N$\\ |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
137 |
& $\rightarrow$ & $(E+E)+3$\\ |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
138 |
& $\rightarrow$ & $(N+E)+3$\\ |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
139 |
& $\rightarrow^+$ & $(1+2)+3$\\ |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
140 |
\end{tabular} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
141 |
\end{center} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
142 |
|
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
143 |
\noindent |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
144 |
The \emph{language} of a context-free grammar $G$ with start symbol $S$ |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
145 |
is defined as the set of strings derivable by a derivation, that is |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
146 |
|
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
147 |
\begin{center} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
148 |
$\{c_1\ldots c_n \;|\; S \rightarrow^* c_1\ldots c_n \;\;\text{with all} \; c_i \;\text{being terminals}\}$ |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
149 |
\end{center} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
150 |
|
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
151 |
\noindent |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
152 |
A \emph{parse-tree} encodes how a string is derived with the starting symbol on |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
153 |
top and each non-terminal containing a subtree for how it is replaced in a derivation. |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
154 |
The parse tree for the string $(1 + 23)+4$ is as follows: |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
155 |
|
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
156 |
\begin{center} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
157 |
\begin{tikzpicture}[level distance=8mm, black] |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
158 |
\node {$E$} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
159 |
child {node {$E$} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
160 |
child {node {$($}} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
161 |
child {node {$E$} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
162 |
child {node {$E$} child {node {$N$} child {node {$1$}}}} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
163 |
child {node {$+$}} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
164 |
child {node {$E$} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
165 |
child {node {$N$} child {node {$2$}}} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
166 |
child {node {$N$} child {node {$3$}}} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
167 |
} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
168 |
} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
169 |
child {node {$)$}} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
170 |
} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
171 |
child {node {$+$}} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
172 |
child {node {$E$} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
173 |
child {node {$N$} child {node {$4$}}} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
174 |
}; |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
175 |
\end{tikzpicture} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
176 |
\end{center} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
177 |
|
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
178 |
\noindent |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
179 |
We are often interested in these parse-trees since they encode the structure of |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
180 |
how a string is derived by a grammar. Before we come to the problem of constructing |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
181 |
such parse-trees, we need to consider the following two properties of grammars. |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
182 |
A grammar is \emph{left-recursive} if there is a derivation starting from a non-terminal, say |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
183 |
$NT$, which leads to a string which again starts with $NT$. This means a derivation of the |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
184 |
form |
173
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
185 |
|
175
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
186 |
\begin{center} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
187 |
$NT \rightarrow \ldots \rightarrow NT \cdot \ldots$ |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
188 |
\end{center} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
189 |
|
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
190 |
\noindent |
183
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
191 |
It can be easily seen that the grammar above for arithmetic expressions is left-recursive: |
175
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
192 |
for example the rules $E \rightarrow E\cdot + \cdot E$ and $N \rightarrow N\cdot N$ |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
193 |
show that this grammar is left-recursive. Some algorithms cannot cope with left-recursive |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
194 |
grammars. Fortunately every left-recursive grammar can be transformed into one that is |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
195 |
not left-recursive, although this transformation might make the grammar less human-readable. |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
196 |
For example if we want to give a non-left-recursive grammar for numbers we might |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
197 |
specify |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
198 |
|
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
199 |
\begin{center} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
200 |
$N \;\;\rightarrow\;\; 0\;|\;\ldots\;|\;9\;|\;0\cdot N\;|\;1\cdot N\;|\;\ldots\;|\;9\cdot N$ |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
201 |
\end{center} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
202 |
|
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
203 |
\noindent |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
204 |
Using this grammar we can still derive every number string, but we will never be able |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
205 |
to obtain a derivation of the form $N \rightarrow \ldots \rightarrow N \cdot \ldots$. |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
206 |
|
183
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
207 |
The other property we have to watch out for is when a grammar is |
175
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
208 |
\emph{ambiguous}. A grammar is said to be ambiguous if there are two parse-trees |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
209 |
for one string. Again the grammar for arithmetic expressions shown above is ambiguous. |
183
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
210 |
While the shown parse tree for the string $(1 + 23) + 4$ is unique, this is not the case in |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
211 |
general. For example there are two parse |
175
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
212 |
trees for the string $1 + 2 + 3$, namely |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
213 |
|
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
214 |
\begin{center} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
215 |
\begin{tabular}{c@{\hspace{10mm}}c} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
216 |
\begin{tikzpicture}[level distance=8mm, black] |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
217 |
\node {$E$} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
218 |
child {node {$E$} child {node {$N$} child {node {$1$}}}} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
219 |
child {node {$+$}} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
220 |
child {node {$E$} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
221 |
child {node {$E$} child {node {$N$} child {node {$2$}}}} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
222 |
child {node {$+$}} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
223 |
child {node {$E$} child {node {$N$} child {node {$3$}}}} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
224 |
} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
225 |
; |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
226 |
\end{tikzpicture} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
227 |
& |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
228 |
\begin{tikzpicture}[level distance=8mm, black] |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
229 |
\node {$E$} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
230 |
child {node {$E$} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
231 |
child {node {$E$} child {node {$N$} child {node {$1$}}}} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
232 |
child {node {$+$}} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
233 |
child {node {$E$} child {node {$N$} child {node {$2$}}}} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
234 |
} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
235 |
child {node {$+$}} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
236 |
child {node {$E$} child {node {$N$} child {node {$3$}}}} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
237 |
; |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
238 |
\end{tikzpicture} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
239 |
\end{tabular} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
240 |
\end{center} |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
241 |
|
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
242 |
\noindent |
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
243 |
In particular in programming languages we will try to avoid ambiguous |
176
3c2653fc8b5a
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
175
diff
changeset
|
244 |
grammars because two different parse-trees for a string mean a program can |
3c2653fc8b5a
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
175
diff
changeset
|
245 |
be interpreted in two different ways. In such cases we have to somehow make sure |
175
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
246 |
the two different ways do not matter, or disambiguate the grammar in |
183
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
247 |
some other way (for example making the $+$ left-associative). Unfortunately already |
175
5801e8c0e528
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
248 |
the problem of deciding whether a grammar |
176
3c2653fc8b5a
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
175
diff
changeset
|
249 |
is ambiguous or not is in general undecidable. |
3c2653fc8b5a
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
175
diff
changeset
|
250 |
|
3c2653fc8b5a
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
175
diff
changeset
|
251 |
Let us now turn to the problem of generating a parse-tree for a grammar and string. |
3c2653fc8b5a
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
175
diff
changeset
|
252 |
In what follows we explain \emph{parser combinators}, because they are easy |
3c2653fc8b5a
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
175
diff
changeset
|
253 |
to implement and closely resemble grammar rules. Imagine that a grammar |
3c2653fc8b5a
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
175
diff
changeset
|
254 |
describes the strings of natural numbers, such as the grammar $N$ shown above. |
3c2653fc8b5a
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
175
diff
changeset
|
255 |
For all such strings we want to generate the parse-trees or later on we actually |
177
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
256 |
want to extract the meaning of these strings, that is the concrete integers ``behind'' |
183
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
257 |
these strings. In Scala the parser combinators will be functions of type |
176
3c2653fc8b5a
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
175
diff
changeset
|
258 |
|
3c2653fc8b5a
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
175
diff
changeset
|
259 |
\begin{center} |
177
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
260 |
\texttt{I $\Rightarrow$ Set[(T, I)]} |
176
3c2653fc8b5a
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
175
diff
changeset
|
261 |
\end{center} |
3c2653fc8b5a
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
175
diff
changeset
|
262 |
|
3c2653fc8b5a
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
175
diff
changeset
|
263 |
\noindent |
177
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
264 |
that is they take as input something of type \texttt{I}, typically a list of tokens or a string, |
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
265 |
and return a set of pairs. The first component of these pairs corresponds to what the |
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
266 |
parser combinator was able to process from the input and the second is the unprocessed |
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
267 |
part of the input. As we shall see shortly, a parser combinator might return more than one such pair, |
183
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
268 |
with the idea that there are potentially several ways to interpret the input. As a concrete |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
269 |
example, consider the case where the input is of type string, say the string |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
270 |
|
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
271 |
\begin{center} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
272 |
\tt\Grid{iffoo\VS testbar} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
273 |
\end{center} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
274 |
|
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
275 |
\noindent |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
276 |
We might have a parser combinator which tries to interpret this string as a keyword (\texttt{if}) or |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
277 |
an identifier (\texttt{iffoo}). Then the output will be the set |
177
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
278 |
|
183
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
279 |
\begin{center} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
280 |
$\left\{ \left(\texttt{\Grid{if}}\,,\, \texttt{\Grid{foo\VS testbar}}\right), |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
281 |
\left(\texttt{\Grid{iffoo}}\,,\, \texttt{\Grid{\VS testbar}}\right) \right\}$ |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
282 |
\end{center} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
283 |
|
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
284 |
\noindent |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
285 |
where the first pair means the parser could recognise \texttt{if} from the input and leaves |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
286 |
the rest as `unprocessed' as the second component of the pair; in the other case |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
287 |
it could recognise \texttt{iffoo} and leaves \texttt{\VS testbar} as unprocessed. If the parser |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
288 |
cannot recognise anything from the input then parser combinators just return the empty |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
289 |
set $\varnothing$. This will indicate something ``went wrong''. |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
290 |
|
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
291 |
The main attraction is that we can easily build parser combinators out of smaller components |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
292 |
following very closely the structure of a grammar. In order to implement this in an object |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
293 |
oriented programming language, like Scala, we need to specify an abstract class for parser |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
294 |
combinators. This abstract class requires the implementation of the function |
177
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
295 |
\texttt{parse} taking an argument of type \texttt{I} and returning a set of type |
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
296 |
\mbox{\texttt{Set[(T, I)]}}. |
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
297 |
|
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
298 |
\begin{center} |
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
299 |
\begin{lstlisting}[language=Scala,basicstyle=\small\ttfamily, numbers=none] |
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
300 |
abstract class Parser[I, T] { |
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
301 |
def parse(ts: I): Set[(T, I)] |
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
302 |
|
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
303 |
def parse_all(ts: I): Set[T] = |
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
304 |
for ((head, tail) <- parse(ts); if (tail.isEmpty)) |
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
305 |
yield head |
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
306 |
} |
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
307 |
\end{lstlisting} |
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
308 |
\end{center} |
176
3c2653fc8b5a
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
175
diff
changeset
|
309 |
|
177
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
310 |
\noindent |
183
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
311 |
From the function \texttt{parse} we can then ``centrally'' derive the function \texttt{parse\_all}, |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
312 |
which just filters out all pairs whose second component is not empty (that is, it still has some |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
313 |
unprocessed part). The reason is that at the end of parsing we are only interested in the |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
314 |
results where all the input has been consumed and no unprocessed part is left. |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
315 |
|
177
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
316 |
One of the simplest parser combinators recognises just a character, say $c$, |
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
317 |
from the beginning of strings. Its behaviour is as follows: |
176
3c2653fc8b5a
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
175
diff
changeset
|
318 |
|
177
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
319 |
\begin{itemize} |
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
320 |
\item if the input string $s$ starts with the character $c$, it returns |
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
321 |
the set $\{(c, \textit{tail of}\; s)\}$ |
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
322 |
\item otherwise it returns the empty set $\varnothing$ |
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
323 |
\end{itemize} |
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
324 |
|
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
325 |
\noindent |
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
326 |
The input type of this simple parser combinator for characters is |
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
327 |
\texttt{String} and the output type \mbox{\texttt{Set[(Char, String)]}}. |
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
328 |
The code in Scala is as follows: |
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
329 |
|
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
330 |
\begin{center} |
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
331 |
\begin{lstlisting}[language=Scala,basicstyle=\small\ttfamily, numbers=none] |
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
332 |
case class CharParser(c: Char) extends Parser[String, Char] { |
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
333 |
def parse(sb: String) = |
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
334 |
if (sb.head == c) Set((c, sb.tail)) else Set() |
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
335 |
} |
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
336 |
\end{lstlisting} |
53def1fbf472
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
176
diff
changeset
|
337 |
\end{center} |
176
3c2653fc8b5a
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
175
diff
changeset
|
338 |
|
183
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
339 |
\noindent |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
340 |
The \texttt{parse} function tests whether the first character of the |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
341 |
input string \texttt{sb} is equal to \texttt{c}. If yes, then it splits the |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
342 |
string into the recognised part \texttt{c} and the unprocessed part |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
343 |
\texttt{sb.tail}. In case \texttt{sb} does not start with \texttt{c} then |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
344 |
the parser returns the empty set (in Scala \texttt{Set()}). |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
345 |
|
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
346 |
More interesting are the parser combinators that build larger parsers |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
347 |
out of smaller component parsers. For example the alternative |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
348 |
parser combinator is as follows. |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
349 |
|
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
350 |
\begin{center} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
351 |
\begin{lstlisting}[language=Scala,basicstyle=\small\ttfamily, numbers=none] |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
352 |
class AltParser[I, T] |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
353 |
(p: => Parser[I, T], |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
354 |
q: => Parser[I, T]) extends Parser[I, T] { |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
355 |
def parse(sb: I) = p.parse(sb) ++ q.parse(sb) |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
356 |
} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
357 |
\end{lstlisting} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
358 |
\end{center} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
359 |
|
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
360 |
\noindent |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
361 |
The types of this parser combinator are polymorphic (we just have \texttt{I} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
362 |
for the input type, and \texttt{T} for the output type). The alternative parser |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
363 |
builds a new parser out of two existing parser combinators \texttt{p} and \texttt{q}. |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
364 |
Both need to be able to process input of type \texttt{I} and return the same |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
365 |
output type \texttt{Set[(T, I)]}. (There is an interesting detail of Scala, namely the |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
366 |
\texttt{=>} in front of the types of \texttt{p} and \texttt{q}. They will prevent the |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
367 |
evaluation of the arguments before they are used. This is often called |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
368 |
\emph{lazy evaluation} of the arguments.) The alternative parser should run |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
369 |
the input with the first parser \texttt{p} (producing a set of outputs) and then |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
370 |
run the same input with \texttt{q}. The result should then be just the union |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
371 |
of both sets, which is the operation \texttt{++} in Scala. |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
372 |
|
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
373 |
This parser combinator already allows us to construct a parser that recognises either |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
374 |
a character \texttt{a} or \texttt{b}, as |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
375 |
|
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
376 |
\begin{center} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
377 |
\begin{lstlisting}[language=Scala,basicstyle=\small\ttfamily, numbers=none] |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
378 |
new AltParser(CharParser('a'), CharParser('b')) |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
379 |
\end{lstlisting} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
380 |
\end{center} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
381 |
|
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
382 |
\noindent |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
383 |
Scala allows us to introduce some more readable shorthand notation for this, like \texttt{'a' || 'b'}. |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
384 |
We can call this parser combinator with the strings |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
385 |
|
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
386 |
\begin{center} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
387 |
\begin{tabular}{rcl} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
388 |
input string & & output\medskip\\ |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
389 |
\texttt{\Grid{ac}} & $\rightarrow$ & $\left\{(\texttt{\Grid{a}}, \texttt{\Grid{c}})\right\}$\\ |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
390 |
\texttt{\Grid{bc}} & $\rightarrow$ & $\left\{(\texttt{\Grid{b}}, \texttt{\Grid{c}})\right\}$\\ |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
391 |
\texttt{\Grid{cc}} & $\rightarrow$ & $\varnothing$ |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
392 |
\end{tabular} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
393 |
\end{center} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
394 |
|
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
395 |
\noindent |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
396 |
We receive in the first two cases a successful output (that is a non-empty set). |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
397 |
|
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
398 |
A bit more interesting is the \emph{sequence parser combinator} implemented in |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
399 |
Scala as follows: |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
400 |
|
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
401 |
\begin{center} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
402 |
\begin{lstlisting}[language=Scala,basicstyle=\small\ttfamily, numbers=none] |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
403 |
class SeqParser[I, T, S] |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
404 |
(p: => Parser[I, T], |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
405 |
q: => Parser[I, S]) extends Parser[I, (T, S)] { |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
406 |
def parse(sb: I) = |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
407 |
for ((head1, tail1) <- p.parse(sb); |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
408 |
(head2, tail2) <- q.parse(tail1)) |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
409 |
yield ((head1, head2), tail2) |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
410 |
} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
411 |
\end{lstlisting} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
412 |
\end{center} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
413 |
|
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
414 |
\noindent |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
415 |
This parser takes as input two parsers, \texttt{p} and \texttt{q}. It implements \texttt{parse} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
416 |
as follows: first run the parser \texttt{p} on the input, producing a set of pairs (\texttt{head1}, \texttt{tail1}). |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
417 |
The \texttt{tail1} stands for the unprocessed parts left over by \texttt{p}. |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
418 |
Let \texttt{q} run on these unprocessed parts |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
419 |
producing again a set of pairs. The output of the sequence parser combinator is then a set |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
420 |
containing pairs where the first components are again pairs, namely what the first parser could parse |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
421 |
together with what the second parser could parse; the second component is the unprocessed |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
422 |
part left over after running the second parser \texttt{q}. Therefore the input type of |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
423 |
the sequence parser combinator is as usual \texttt{I}, but the output type is |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
424 |
|
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
425 |
\begin{center} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
426 |
\texttt{Set[((T, S), I)]} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
427 |
\end{center} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
428 |
|
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
429 |
Scala allows us to provide some |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
430 |
shorthand notation for the sequence parser combinator. So we can write for |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
431 |
example \texttt{'a' $\sim$ 'b'}, which is the |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
432 |
parser combinator that first consumes the character \texttt{a} from a string and then \texttt{b}. |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
433 |
Calling this parser combinator with the strings |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
434 |
|
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
435 |
\begin{center} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
436 |
\begin{tabular}{rcl} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
437 |
input string & & output\medskip\\ |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
438 |
\texttt{\Grid{abc}} & $\rightarrow$ & $\left\{((\texttt{\Grid{a}}, \texttt{\Grid{b}}), \texttt{\Grid{c}})\right\}$\\ |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
439 |
\texttt{\Grid{bac}} & $\rightarrow$ & $\varnothing$\\ |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
440 |
\texttt{\Grid{ccc}} & $\rightarrow$ & $\varnothing$ |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
441 |
\end{tabular} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
442 |
\end{center} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
443 |
|
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
444 |
\noindent |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
445 |
A slightly more complicated parser is \texttt{('a' || 'b') $\sim$ 'b'} which parses as first character either |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
446 |
an \texttt{a} or \texttt{b} followed by a \texttt{b}. This parser produces the following results. |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
447 |
|
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
448 |
\begin{center} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
449 |
\begin{tabular}{rcl} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
450 |
input string & & output\medskip\\ |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
451 |
\texttt{\Grid{abc}} & $\rightarrow$ & $\left\{((\texttt{\Grid{a}}, \texttt{\Grid{b}}), \texttt{\Grid{c}})\right\}$\\ |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
452 |
\texttt{\Grid{bbc}} & $\rightarrow$ & $\left\{((\texttt{\Grid{b}}, \texttt{\Grid{b}}), \texttt{\Grid{c}})\right\}$\\ |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
453 |
\texttt{\Grid{aac}} & $\rightarrow$ & $\varnothing$ |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
454 |
\end{tabular} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
455 |
\end{center} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
456 |
|
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
457 |
Note carefully that constructing the parser \texttt{'a' || ('a' $\sim$ 'b')} will result in a typing error. |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
458 |
The first parser has as output type a single character (recall the type of \texttt{CharParser}), |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
459 |
but the second parser produces a pair of characters as output. The alternative parser is however |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
460 |
required to have both component parsers of the same type. We will see later how we can |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
461 |
build this parser without the typing error. |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
462 |
|
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
463 |
The next parser combinator does not actually combine smaller parsers, but applies |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
464 |
a function to the result of the parser. It is implemented in Scala as follows: |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
465 |
|
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
466 |
\begin{center} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
467 |
\begin{lstlisting}[language=Scala,basicstyle=\small\ttfamily, numbers=none] |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
468 |
class FunParser[I, T, S] |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
469 |
(p: => Parser[I, T], |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
470 |
f: T => S) extends Parser[I, S] { |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
471 |
def parse(sb: I) = |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
472 |
for ((head, tail) <- p.parse(sb)) yield (f(head), tail) |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
473 |
} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
474 |
\end{lstlisting} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
475 |
\end{center} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
476 |
|
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
477 |
|
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
478 |
\noindent |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
479 |
This parser combinator takes a parser \texttt{p} with output type \texttt{T} as |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
480 |
input as well as a function \texttt{f} with type \texttt{T => S}. The parser \texttt{p} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
481 |
produces sets of pairs of type \texttt{(T, I)}. The \texttt{FunParser} combinator then |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
482 |
applies the function \texttt{f} to all the parser outputs. Since this function |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
483 |
is of type \texttt{T => S}, we obtain a parser with output type \texttt{S}. |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
484 |
Again Scala lets us introduce some shorthand notation for this parser combinator. |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
485 |
Therefore we will write \texttt{p ==> f} for it. |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
486 |
|
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
487 |
%\bigskip |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
488 |
%takes advantage of the full generality---have a look |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
489 |
%what it produces if we call it with the string \texttt{abc} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
490 |
% |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
491 |
%\begin{center} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
492 |
%\begin{tabular}{rcl} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
493 |
%input string & & output\medskip\\ |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
494 |
%\texttt{\Grid{abc}} & $\rightarrow$ & $\left\{((\texttt{\Grid{a}}, \texttt{\Grid{b}}), \texttt{\Grid{c}})\right\}$\\ |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
495 |
%\texttt{\Grid{bbc}} & $\rightarrow$ & $\left\{((\texttt{\Grid{b}}, \texttt{\Grid{b}}), \texttt{\Grid{c}})\right\}$\\ |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
496 |
%\texttt{\Grid{aac}} & $\rightarrow$ & $\varnothing$ |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
497 |
%\end{tabular} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
498 |
%\end{center} |
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
499 |
|
b17eff695c7f
added new stuff
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
177
diff
changeset
|
500 |
|
173
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
501 |
|
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
502 |
\end{document} |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
503 |
|
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
504 |
%%% Local Variables: |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
505 |
%%% mode: latex |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
506 |
%%% TeX-master: t |
7cfb7a6f7c99
added slides
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
507 |
%%% End: |