afl-material: handouts/ho05.tex@7b7736bea3ca (annotated)

665 6d74d2a0a4b0 updated Christian Urban <urbanc@in.tum.de> parents: 618 diff changeset	1	% !TEX program = xelatex
173 7cfb7a6f7c99 added slides Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff changeset	2	\documentclass{article}
297 5c51839c88fd updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 292 diff changeset	3	\usepackage{../style}
217 cd6066f1056a updated handouts Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 183 diff changeset	4	\usepackage{../langs}
459 780486571e38 updated Christian Urban <urbanc@in.tum.de> parents: 385 diff changeset	5	\usepackage{../grammar}
173 7cfb7a6f7c99 added slides Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff changeset	6
545 76a98ed71a2a updated Christian Urban <urbanc@in.tum.de> parents: 459 diff changeset	7	% epsilon and left-recursion elimination
76a98ed71a2a updated Christian Urban <urbanc@in.tum.de> parents: 459 diff changeset	8	% http://www.mollypages.org/page/grammar/index.mp
76a98ed71a2a updated Christian Urban <urbanc@in.tum.de> parents: 459 diff changeset	9
618 f4818c95a32e updated Christian Urban <urbanc@in.tum.de> parents: 582 diff changeset	10	%% parsing scala files
f4818c95a32e updated Christian Urban <urbanc@in.tum.de> parents: 582 diff changeset	11	%%https://scalameta.org/
f4818c95a32e updated Christian Urban <urbanc@in.tum.de> parents: 582 diff changeset	12
173 7cfb7a6f7c99 added slides Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff changeset	13	\begin{document}
7cfb7a6f7c99 added slides Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff changeset	14
385 7f8516ff408d updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 362 diff changeset	15	\section*{Handout 5 (Grammars \& Parser)}
173 7cfb7a6f7c99 added slides Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff changeset	16
680 eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	17	While regular expressions are very useful for lexing and for recognising
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	18	many patterns in strings (like email addresses), they have their
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	19	limitations. For example there is no regular expression that can
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	20	recognise the language $a^nb^n$ (where you have strings with $n$ $a$'s
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	21	followed by the same amount of $b$'s). Another example for which there
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	22	exists no regular expression is the language of well-parenthesised
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	23	expressions. In languages like Lisp, which use parentheses rather
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	24	extensively, it might be of interest to know whether the following two
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	25	expressions are well-parenthesised or not (the left one is, the right
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	26	one is not):
173 7cfb7a6f7c99 added slides Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff changeset	27
7cfb7a6f7c99 added slides Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff changeset	28	\begin{center}
7cfb7a6f7c99 added slides Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff changeset	29	$(((()()))())$ \hspace{10mm} $(((()()))()))$
7cfb7a6f7c99 added slides Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff changeset	30	\end{center}
7cfb7a6f7c99 added slides Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff changeset	31
362 57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	32	\noindent Not being able to solve such recognition problems is
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	33	a serious limitation. In order to solve such recognition
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	34	problems, we need more powerful techniques than regular
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	35	expressions. We will in particular look at \emph{context-free
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	36	languages}. They include the regular languages as the picture
582 d236e75e1d55 updated Christian Urban <urbanc@in.tum.de> parents: 545 diff changeset	37	below about language classes shows:
173 7cfb7a6f7c99 added slides Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff changeset	38
7cfb7a6f7c99 added slides Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff changeset	39
7cfb7a6f7c99 added slides Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff changeset	40	\begin{center}
7cfb7a6f7c99 added slides Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff changeset	41	\begin{tikzpicture}
297 5c51839c88fd updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 292 diff changeset	42	[rect/.style={draw=black!50,
5c51839c88fd updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 292 diff changeset	43	top color=white,bottom color=black!20,
5c51839c88fd updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 292 diff changeset	44	rectangle, very thick, rounded corners}]
173 7cfb7a6f7c99 added slides Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff changeset	45
297 5c51839c88fd updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 292 diff changeset	46	\draw (0,0) node [rect, text depth=30mm, text width=46mm] {\small all languages};
5c51839c88fd updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 292 diff changeset	47	\draw (0,-0.4) node [rect, text depth=20mm, text width=44mm] {\small decidable languages};
5c51839c88fd updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 292 diff changeset	48	\draw (0,-0.65) node [rect, text depth=13mm] {\small context sensitive languages};
5c51839c88fd updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 292 diff changeset	49	\draw (0,-0.84) node [rect, text depth=7mm, text width=35mm] {\small context-free languages};
5c51839c88fd updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 292 diff changeset	50	\draw (0,-1.05) node [rect] {\small regular languages};
173 7cfb7a6f7c99 added slides Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff changeset	51	\end{tikzpicture}
7cfb7a6f7c99 added slides Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff changeset	52	\end{center}
7cfb7a6f7c99 added slides Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff changeset	53
582 d236e75e1d55 updated Christian Urban <urbanc@in.tum.de> parents: 545 diff changeset	54	\noindent Each ``bubble'' stands for a set of languages (remember
d236e75e1d55 updated Christian Urban <urbanc@in.tum.de> parents: 545 diff changeset	55	languages are sets of strings). As indicated the set of regular
680 eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	56	languages is fully included inside the context-free languages,
582 d236e75e1d55 updated Christian Urban <urbanc@in.tum.de> parents: 545 diff changeset	57	meaning every regular language is also context-free, but not vice
665 6d74d2a0a4b0 updated Christian Urban <urbanc@in.tum.de> parents: 618 diff changeset	58	versa. Below I will let you think, for example, what the context-free
582 d236e75e1d55 updated Christian Urban <urbanc@in.tum.de> parents: 545 diff changeset	59	grammar is for the language corresponding to the regular expression
d236e75e1d55 updated Christian Urban <urbanc@in.tum.de> parents: 545 diff changeset	60	$(aaa)^*a$.
d236e75e1d55 updated Christian Urban <urbanc@in.tum.de> parents: 545 diff changeset	61
d236e75e1d55 updated Christian Urban <urbanc@in.tum.de> parents: 545 diff changeset	62	Because of their convenience, context-free languages play an important
d236e75e1d55 updated Christian Urban <urbanc@in.tum.de> parents: 545 diff changeset	63	role in `day-to-day' text processing and in programming
d236e75e1d55 updated Christian Urban <urbanc@in.tum.de> parents: 545 diff changeset	64	languages. Context-free in this setting means that ``words'' have one
680 eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	65	meaning only and this meaning is independent from the context
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	66	the ``words'' appear in. For example ambiguity issues like
582 d236e75e1d55 updated Christian Urban <urbanc@in.tum.de> parents: 545 diff changeset	67
d236e75e1d55 updated Christian Urban <urbanc@in.tum.de> parents: 545 diff changeset	68	\begin{center}
665 6d74d2a0a4b0 updated Christian Urban <urbanc@in.tum.de> parents: 618 diff changeset	69	\tt Time flies like an arrow; fruit flies like bananas.
582 d236e75e1d55 updated Christian Urban <urbanc@in.tum.de> parents: 545 diff changeset	70	\end{center}
d236e75e1d55 updated Christian Urban <urbanc@in.tum.de> parents: 545 diff changeset	71
d236e75e1d55 updated Christian Urban <urbanc@in.tum.de> parents: 545 diff changeset	72	\noindent
680 eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	73	from natural languages where the meaning of \emph{flies} depends on the
665 6d74d2a0a4b0 updated Christian Urban <urbanc@in.tum.de> parents: 618 diff changeset	74	surrounding \emph{context} are avoided as much as possible.
582 d236e75e1d55 updated Christian Urban <urbanc@in.tum.de> parents: 545 diff changeset	75
d236e75e1d55 updated Christian Urban <urbanc@in.tum.de> parents: 545 diff changeset	76	Context-free languages are usually specified by grammars. For example
d236e75e1d55 updated Christian Urban <urbanc@in.tum.de> parents: 545 diff changeset	77	a grammar for well-parenthesised expressions can be given as follows:
173 7cfb7a6f7c99 added slides Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff changeset	78
459 780486571e38 updated Christian Urban <urbanc@in.tum.de> parents: 385 diff changeset	79	\begin{plstx}[margin=3cm]
780486571e38 updated Christian Urban <urbanc@in.tum.de> parents: 385 diff changeset	80	: \meta{P} ::= ( \cdot \meta{P} \cdot ) \cdot \meta{P}
780486571e38 updated Christian Urban <urbanc@in.tum.de> parents: 385 diff changeset	81	\| \epsilon\\
780486571e38 updated Christian Urban <urbanc@in.tum.de> parents: 385 diff changeset	82	\end{plstx}
780486571e38 updated Christian Urban <urbanc@in.tum.de> parents: 385 diff changeset	83
362 57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	84	\noindent
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	85	or a grammar for recognising strings consisting of ones is
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	86
459 780486571e38 updated Christian Urban <urbanc@in.tum.de> parents: 385 diff changeset	87	\begin{plstx}[margin=3cm]
780486571e38 updated Christian Urban <urbanc@in.tum.de> parents: 385 diff changeset	88	: \meta{O} ::= 1 \cdot \meta{O}
780486571e38 updated Christian Urban <urbanc@in.tum.de> parents: 385 diff changeset	89	\| 1\\
780486571e38 updated Christian Urban <urbanc@in.tum.de> parents: 385 diff changeset	90	\end{plstx}
362 57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	91
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	92	In general grammars consist of finitely many rules built up
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	93	from \emph{terminal symbols} (usually lower-case letters) and
582 d236e75e1d55 updated Christian Urban <urbanc@in.tum.de> parents: 545 diff changeset	94	\emph{non-terminal symbols} (upper-case letters written in
d236e75e1d55 updated Christian Urban <urbanc@in.tum.de> parents: 545 diff changeset	95	bold like \meta{A}, \meta{N} and so on). Rules have
362 57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	96	the shape
173 7cfb7a6f7c99 added slides Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff changeset	97
459 780486571e38 updated Christian Urban <urbanc@in.tum.de> parents: 385 diff changeset	98	\begin{plstx}[margin=3cm]
780486571e38 updated Christian Urban <urbanc@in.tum.de> parents: 385 diff changeset	99	: \meta{NT} ::= rhs\\
780486571e38 updated Christian Urban <urbanc@in.tum.de> parents: 385 diff changeset	100	\end{plstx}
173 7cfb7a6f7c99 added slides Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff changeset	101
362 57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	102	\noindent where on the left-hand side is a single non-terminal
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	103	and on the right a string consisting of both terminals and
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	104	non-terminals including the $\epsilon$-symbol for indicating
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	105	the empty string. We use the convention to separate components
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	106	on the right-hand side by using the $\cdot$ symbol, as in the
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	107	grammar for well-parenthesised expressions. We also use the
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	108	convention to use $\|$ as a shorthand notation for several
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	109	rules. For example
173 7cfb7a6f7c99 added slides Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff changeset	110
459 780486571e38 updated Christian Urban <urbanc@in.tum.de> parents: 385 diff changeset	111	\begin{plstx}[margin=3cm]
780486571e38 updated Christian Urban <urbanc@in.tum.de> parents: 385 diff changeset	112	: \meta{NT} ::= rhs_1
780486571e38 updated Christian Urban <urbanc@in.tum.de> parents: 385 diff changeset	113	\| rhs_2\\
780486571e38 updated Christian Urban <urbanc@in.tum.de> parents: 385 diff changeset	114	\end{plstx}
173 7cfb7a6f7c99 added slides Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff changeset	115
459 780486571e38 updated Christian Urban <urbanc@in.tum.de> parents: 385 diff changeset	116	\noindent means that the non-terminal \meta{NT} can be replaced by
362 57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	117	either $\textit{rhs}_1$ or $\textit{rhs}_2$. If there is more
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	118	than one non-terminal on the left-hand side of the rules, then
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	119	we need to indicate what is the \emph{starting} symbol of the
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	120	grammar. For example the grammar for arithmetic expressions
173 7cfb7a6f7c99 added slides Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff changeset	121	can be given as follows
7cfb7a6f7c99 added slides Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff changeset	122
459 780486571e38 updated Christian Urban <urbanc@in.tum.de> parents: 385 diff changeset	123	\begin{plstx}[margin=3cm,one per line]
780486571e38 updated Christian Urban <urbanc@in.tum.de> parents: 385 diff changeset	124	\mbox{\rm (1)}: \meta{E} ::= \meta{N}\\
780486571e38 updated Christian Urban <urbanc@in.tum.de> parents: 385 diff changeset	125	\mbox{\rm (2)}: \meta{E} ::= \meta{E} \cdot + \cdot \meta{E}\\
780486571e38 updated Christian Urban <urbanc@in.tum.de> parents: 385 diff changeset	126	\mbox{\rm (3)}: \meta{E} ::= \meta{E} \cdot - \cdot \meta{E}\\
780486571e38 updated Christian Urban <urbanc@in.tum.de> parents: 385 diff changeset	127	\mbox{\rm (4)}: \meta{E} ::= \meta{E} \cdot * \cdot \meta{E}\\
780486571e38 updated Christian Urban <urbanc@in.tum.de> parents: 385 diff changeset	128	\mbox{\rm (5)}: \meta{E} ::= ( \cdot \meta{E} \cdot )\\
780486571e38 updated Christian Urban <urbanc@in.tum.de> parents: 385 diff changeset	129	\mbox{\rm (6\ldots)}: \meta{N} ::= \meta{N} \cdot \meta{N}
780486571e38 updated Christian Urban <urbanc@in.tum.de> parents: 385 diff changeset	130	\mid 0 \mid 1 \mid \ldots \mid 9\\
780486571e38 updated Christian Urban <urbanc@in.tum.de> parents: 385 diff changeset	131	\end{plstx}
173 7cfb7a6f7c99 added slides Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff changeset	132
459 780486571e38 updated Christian Urban <urbanc@in.tum.de> parents: 385 diff changeset	133	\noindent where \meta{E} is the starting symbol. A
362 57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	134	\emph{derivation} for a grammar starts with the starting
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	135	symbol of the grammar and in each step replaces one
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	136	non-terminal by a right-hand side of a rule. A derivation ends
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	137	with a string in which only terminal symbols are left. For
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	138	example a derivation for the string $(1 + 2) + 3$ is as
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	139	follows:
175 5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	140
5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	141	\begin{center}
362 57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	142	\begin{tabular}{lll@{\hspace{2cm}}l}
459 780486571e38 updated Christian Urban <urbanc@in.tum.de> parents: 385 diff changeset	143	\meta{E} & $\rightarrow$ & $\meta{E}+\meta{E}$ & by (2)\\
780486571e38 updated Christian Urban <urbanc@in.tum.de> parents: 385 diff changeset	144	& $\rightarrow$ & $(\meta{E})+\meta{E}$ & by (5)\\
780486571e38 updated Christian Urban <urbanc@in.tum.de> parents: 385 diff changeset	145	& $\rightarrow$ & $(\meta{E}+\meta{E})+\meta{E}$ & by (2)\\
780486571e38 updated Christian Urban <urbanc@in.tum.de> parents: 385 diff changeset	146	& $\rightarrow$ & $(\meta{E}+\meta{E})+\meta{N}$ & by (1)\\
780486571e38 updated Christian Urban <urbanc@in.tum.de> parents: 385 diff changeset	147	& $\rightarrow$ & $(\meta{E}+\meta{E})+3$ & by (6\dots)\\
780486571e38 updated Christian Urban <urbanc@in.tum.de> parents: 385 diff changeset	148	& $\rightarrow$ & $(\meta{N}+\meta{E})+3$ & by (1)\\
362 57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	149	& $\rightarrow^+$ & $(1+2)+3$ & by (1, 6\ldots)\\
175 5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	150	\end{tabular}
5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	151	\end{center}
5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	152
362 57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	153	\noindent where on the right it is indicated which
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	154	grammar rule has been applied. In the last step we
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	155	merged several steps into one.
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	156
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	157	The \emph{language} of a context-free grammar $G$
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	158	with start symbol $S$ is defined as the set of strings
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	159	derivable by a derivation, that is
175 5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	160
5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	161	\begin{center}
5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	162	$\{c_1\ldots c_n \;\|\; S \rightarrow^* c_1\ldots c_n \;\;\text{with all} \; c_i \;\text{being terminals}\}$
5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	163	\end{center}
5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	164
5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	165	\noindent
680 eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	166	A \emph{parse-tree} encodes how a string is derived with the starting
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	167	symbol on top and each non-terminal containing a subtree for how it is
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	168	replaced in a derivation. The parse tree for the string $(1 + 23)+4$ is
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	169	as follows:
175 5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	170
5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	171	\begin{center}
5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	172	\begin{tikzpicture}[level distance=8mm, black]
665 6d74d2a0a4b0 updated Christian Urban <urbanc@in.tum.de> parents: 618 diff changeset	173	\node {\meta{E}}
6d74d2a0a4b0 updated Christian Urban <urbanc@in.tum.de> parents: 618 diff changeset	174	child {node {\meta{E} }
175 5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	175	child {node {$($}}
665 6d74d2a0a4b0 updated Christian Urban <urbanc@in.tum.de> parents: 618 diff changeset	176	child {node {\meta{E} }
6d74d2a0a4b0 updated Christian Urban <urbanc@in.tum.de> parents: 618 diff changeset	177	child {node {\meta{E} } child {node {\meta{N} } child {node {$1$}}}}
175 5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	178	child {node {$+$}}
665 6d74d2a0a4b0 updated Christian Urban <urbanc@in.tum.de> parents: 618 diff changeset	179	child {node {\meta{E} } child {node {\meta{N} }
6d74d2a0a4b0 updated Christian Urban <urbanc@in.tum.de> parents: 618 diff changeset	180	child {node {\meta{N} } child {node {$2$}}}
6d74d2a0a4b0 updated Christian Urban <urbanc@in.tum.de> parents: 618 diff changeset	181	child {node {\meta{N} } child {node {$3$}}}
175 5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	182	}}
5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	183	}
5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	184	child {node {$)$}}
5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	185	}
5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	186	child {node {$+$}}
665 6d74d2a0a4b0 updated Christian Urban <urbanc@in.tum.de> parents: 618 diff changeset	187	child {node {\meta{E} }
6d74d2a0a4b0 updated Christian Urban <urbanc@in.tum.de> parents: 618 diff changeset	188	child {node {\meta{N} } child {node {$4$}}}
175 5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	189	};
5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	190	\end{tikzpicture}
5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	191	\end{center}
5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	192
362 57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	193	\noindent We are often interested in these parse-trees since
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	194	they encode the structure of how a string is derived by a
680 eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	195	grammar.
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	196
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	197	Before we come to the problem of constructing such parse-trees, we need
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	198	to consider the following two properties of grammars. A grammar is
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	199	\emph{left-recursive} if there is a derivation starting from a
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	200	non-terminal, say \meta{NT}, which leads to a string which again starts
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	201	with \meta{NT}. This means a derivation of the form:
173 7cfb7a6f7c99 added slides Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff changeset	202
175 5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	203	\begin{center}
665 6d74d2a0a4b0 updated Christian Urban <urbanc@in.tum.de> parents: 618 diff changeset	204	$\meta{NT} \rightarrow \ldots \rightarrow \meta{NT} \cdot \ldots$
175 5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	205	\end{center}
5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	206
680 eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	207	\noindent It can be easily seen that the grammar above for arithmetic
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	208	expressions is left-recursive: for example the rules $\meta{E}
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	209	\rightarrow \meta{E}\cdot + \cdot \meta{E}$ and $\meta{N} \rightarrow
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	210	\meta{N}\cdot \meta{N}$ show that this grammar is left-recursive. But
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	211	note that left-recursiveness can involve more than one step in the
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	212	derivation. The problem with left-recursive grammars is that some
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	213	algorithms cannot cope with them: with left-recursive grammars they will
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	214	fall into a loop. Fortunately every left-recursive grammar can be
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	215	transformed into one that is not left-recursive, although this
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	216	transformation might make the grammar less ``human-readable''. For
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	217	example if we want to give a non-left-recursive grammar for numbers we
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	218	might specify
175 5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	219
5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	220	\begin{center}
665 6d74d2a0a4b0 updated Christian Urban <urbanc@in.tum.de> parents: 618 diff changeset	221	$\meta{N} \;\;\rightarrow\;\; 0\;\|\;\ldots\;\|\;9\;\|\;
6d74d2a0a4b0 updated Christian Urban <urbanc@in.tum.de> parents: 618 diff changeset	222	1\cdot \meta{N}\;\|\;2\cdot \meta{N}\;\|\;\ldots\;\|\;9\cdot \meta{N}$
175 5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	223	\end{center}
5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	224
362 57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	225	\noindent Using this grammar we can still derive every number
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	226	string, but there will never be a derivation of the
665 6d74d2a0a4b0 updated Christian Urban <urbanc@in.tum.de> parents: 618 diff changeset	227	form $\meta{N} \to \ldots \to \meta{N} \cdot \ldots$.
175 5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	228
362 57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	229	The other property we have to watch out for is when a grammar
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	230	is \emph{ambiguous}. A grammar is said to be ambiguous if
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	231	there are two parse-trees for one string. Again the grammar
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	232	for arithmetic expressions shown above is ambiguous. While the
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	233	shown parse tree for the string $(1 + 23) + 4$ is unique, this
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	234	is not the case in general. For example there are two parse
175 5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	235	trees for the string $1 + 2 + 3$, namely
5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	236
5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	237	\begin{center}
5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	238	\begin{tabular}{c@{\hspace{10mm}}c}
5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	239	\begin{tikzpicture}[level distance=8mm, black]
665 6d74d2a0a4b0 updated Christian Urban <urbanc@in.tum.de> parents: 618 diff changeset	240	\node {\meta{E} }
6d74d2a0a4b0 updated Christian Urban <urbanc@in.tum.de> parents: 618 diff changeset	241	child {node {\meta{E} } child {node {\meta{N} } child {node {$1$}}}}
175 5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	242	child {node {$+$}}
665 6d74d2a0a4b0 updated Christian Urban <urbanc@in.tum.de> parents: 618 diff changeset	243	child {node {\meta{E} }
6d74d2a0a4b0 updated Christian Urban <urbanc@in.tum.de> parents: 618 diff changeset	244	child {node {\meta{E} } child {node {\meta{N} } child {node {$2$}}}}
175 5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	245	child {node {$+$}}
665 6d74d2a0a4b0 updated Christian Urban <urbanc@in.tum.de> parents: 618 diff changeset	246	child {node {\meta{E} } child {node {\meta{N} } child {node {$3$}}}}
175 5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	247	}
5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	248	;
5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	249	\end{tikzpicture}
5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	250	&
5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	251	\begin{tikzpicture}[level distance=8mm, black]
665 6d74d2a0a4b0 updated Christian Urban <urbanc@in.tum.de> parents: 618 diff changeset	252	\node {\meta{E} }
6d74d2a0a4b0 updated Christian Urban <urbanc@in.tum.de> parents: 618 diff changeset	253	child {node {\meta{E} }
6d74d2a0a4b0 updated Christian Urban <urbanc@in.tum.de> parents: 618 diff changeset	254	child {node {\meta{E} } child {node {\meta{N} } child {node {$1$}}}}
175 5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	255	child {node {$+$}}
665 6d74d2a0a4b0 updated Christian Urban <urbanc@in.tum.de> parents: 618 diff changeset	256	child {node {\meta{E} } child {node {\meta{N} } child {node {$2$}}}}
175 5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	257	}
5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	258	child {node {$+$}}
665 6d74d2a0a4b0 updated Christian Urban <urbanc@in.tum.de> parents: 618 diff changeset	259	child {node {\meta{E} } child {node {\meta{N} } child {node {$3$}}}}
175 5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	260	;
5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	261	\end{tikzpicture}
5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	262	\end{tabular}
5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	263	\end{center}
5801e8c0e528 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 173 diff changeset	264
680 eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	265	\noindent In particular in programming languages we will try to avoid
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	266	ambiguous grammars because two different parse-trees for a string mean a
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	267	program can be interpreted in two different ways. In such cases we have
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	268	to somehow make sure the two different ways do not matter, or
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	269	disambiguate the grammar in some other way (for example making the $+$
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	270	left-associative). Unfortunately already the problem of deciding whether
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	271	a grammar is ambiguous or not is in general undecidable. But in simple
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	272	instances (the ones we deal with in this module) one can usually see when
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	273	a grammar is ambiguous.
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	274
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	275	\subsection*{Removing Left-Recursion}
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	276
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	277	Let us come back to the problem of left-recursion and consider the
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	278	following grammar for binary numbers:
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	279
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	280	\begin{plstx}[margin=1cm]
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	281	: \meta{B} ::= \meta{B} \cdot \meta{B} \| 0 \| 1\\
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	282	\end{plstx}
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	283
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	284	\noindent
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	285	It is clear that this grammar can create all binary numbers, but
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	286	it is also clear that this grammar is left-recursive. Giving this
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	287	grammar as is to parser combinators will result in an infinite
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	288	loop. Fortunately, every left-recursive grammar can be translated
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	289	into one that is not left-recursive with the help of some
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	290	transformation rules. Suppose we identified the ``offensive''
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	291	rule, then we can separate the grammar into this offensive rule
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	292	and the ``rest'':
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	293
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	294	\begin{plstx}[margin=1cm]
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	295	: \meta{B} ::= \underbrace{\meta{B} \cdot \meta{B}}_{\textit{lft-rec}}
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	296	\| \underbrace{0 \;\;\|\;\; 1}_{\textit{rest}}\\
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	297	\end{plstx}
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	298
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	299	\noindent
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	300	To make the idea of the transformation clearer, suppose the left-recursive
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	301	rule is of the form $\meta{B}\alpha$ (the left-recursive non-terminal
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	302	followed by something called $\alpha$) and the ``rest'' is called $\beta$.
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	303	That means our grammar looks schematically as follows
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	304
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	305	\begin{plstx}[margin=1cm]
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	306	: \meta{B} ::= \meta{B} \cdot \alpha \| \beta\\
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	307	\end{plstx}
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	308
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	309	\noindent
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	310	To get rid of the left-recursion, we are required to introduce
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	311	a new non-terminal, say $\meta{B'}$ and transform the rule
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	312	as follows:
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	313
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	314	\begin{plstx}[margin=1cm]
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	315	: \meta{B} ::= \beta \cdot \meta{B'}\\
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	316	: \meta{B'} ::= \alpha \cdot \meta{B'} \| \epsilon\\
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	317	\end{plstx}
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	318
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	319	\noindent
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	320	In our example of binary numbers we would after the transformation
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	321	end up with the rules
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	322
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	323	\begin{plstx}[margin=1cm]
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	324	: \meta{B} ::= 0 \cdot \meta{B'} \| 1 \cdot \meta{B'}\\
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	325	: \meta{B'} ::= \meta{B} \cdot \meta{B'} \| \epsilon\\
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	326	\end{plstx}
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	327
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	328	\noindent
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	329	A little thought should convince you that this grammar still derives
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	330	all the binary numbers (for example 0 and 1 are derivable because $\meta{B'}$
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	331	can be $\epsilon$). Less clear might be why this grammar is non-left recursive.
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	332	For $\meta{B'}$ it is relatively clear because we will never be
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	333	able to derive things like
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	334
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	335	\begin{center}
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	336	$\meta{B'} \rightarrow\ldots\rightarrow \meta{B'}\cdot\ldots$
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	337	\end{center}
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	338
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	339	\noindent
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	340	because there will always be a $\meta{B}$ in front of a $\meta{B'}$, and
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	341	$\meta{B}$ now has always a $0$ or $1$ in front, so a $\meta{B'}$ can
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	342	never be in the first place. The reasoning is similar for $\meta{B}$:
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	343	the $0$ and $1$ in the rule for $\meta{B}$ ``protect'' it from becoming
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	344	left-recursive. This transformation does not mean the grammar is the
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	345	simplest non-left-recursive grammar for binary numbers. For example the
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	346	following grammar would do as well
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	347
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	348	\begin{plstx}[margin=1cm]
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	349	: \meta{B} ::= 0 \cdot \meta{B} \| 1 \cdot \meta{B} \| 0 \| 1\\
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	350	\end{plstx}
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	351
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	352	\noindent
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	353	The point is that we can in principle transform every left-recursive
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	354	grammar into one that is non-left-recursive. This explains why often
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	355	the following grammar is used for arithmetic expressions:
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	356
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	357	\begin{plstx}[margin=1cm]
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	358	: \meta{E} ::= \meta{T} \| \meta{T} \cdot + \cdot \meta{E} \| \meta{T} \cdot - \cdot \meta{E}\\
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	359	: \meta{T} ::= \meta{F} \| \meta{F} \cdot * \cdot \meta{T}\\
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	360	: \meta{F} ::= num\_token \| ( \cdot \meta{E} \cdot )\\
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	361	\end{plstx}
176 3c2653fc8b5a updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 175 diff changeset	362
680 eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	363	\noindent
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	364	In this grammar all $\meta{E}$xpressions, $\meta{T}$erms and $\meta{F}$actors
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	365	are in some way protected from being left-recursive. For example if you
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	366	start with $\meta{E}$ you can derive another one by going through $\meta{T}$, then
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	367	$\meta{F}$, but then $\meta{E}$ is protected by the open-parenthesis.
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	368
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	369	\subsection*{Removing $\epsilon$-Rules and CYK-Algorithm}
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	370
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	371	I showed above that the non-left-recursive grammar for binary numbers is
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	372
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	373	\begin{plstx}[margin=1cm]
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	374	: \meta{B} ::= 0 \cdot \meta{B'} \| 1 \cdot \meta{B'}\\
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	375	: \meta{B'} ::= \meta{B} \cdot \meta{B'} \| \epsilon\\
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	376	\end{plstx}
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	377
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	378	\noindent
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	379	The transformation made the original grammar non-left-recursive, but at
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	380	the expense of introducing an $\epsilon$ in the second rule. Having an
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	381	explicit $\epsilon$-rule is annoying too, not in terms of looping, but in
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	382	terms of efficiency. The reason is that the $\epsilon$-rule always
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	383	applies but since it recognises the empty string, it does not make any
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	384	progress with recognising a string. Better are rules like $( \cdot
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	385	\meta{E} \cdot )$ where something of the input is consumed. Getting
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	386	rid of $\epsilon$-rules is also important for the CYK parsing algorithm,
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	387	which can give us an insight into the complexity class of parsing.
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	388
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	389	It turns out we can also by some generic transformations eliminate
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	390	$\epsilon$-rules from grammars. Consider again the grammar above for
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	391	binary numbers where we have a rule $\meta{B'} ::= \epsilon$. In this case
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	392	we look for rules of the (generic) form \mbox{$\meta{A} ::=
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	393	\alpha\cdot\meta{B'}\cdot\beta$}. That is there are rules that use
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	394	$\meta{B'}$ and something ($\alpha$) is in front of $\meta{B'}$ and
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	395	something follows ($\beta$). Such rules need to be supplemented by
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	396	additional rules of the form \mbox{$\meta{A} ::= \alpha\cdot\beta$}.
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	397	In our running example there are the two rules for $\meta{B}$ which
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	398	fall into this category
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	399
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	400	\begin{plstx}[margin=1cm]
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	401	: \meta{B} ::= 0 \cdot \meta{B'} \| 1 \cdot \meta{B'}\\
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	402	\end{plstx}
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	403
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	404	\noindent To follow the general scheme of the transformation,
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	405	the $\alpha$ is either $0$ or $1$, and the $\beta$ happens
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	406	to be empty. So we need to generate new rules of the form
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	407	\mbox{$\meta{A} ::= \alpha\cdot\beta$}, which in our particular
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	408	example means we obtain
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	409
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	410	\begin{plstx}[margin=1cm]
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	411	: \meta{B} ::= 0 \cdot \meta{B'} \| 1 \cdot \meta{B'} \| 0 \| 1\\
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	412	\end{plstx}
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	413
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	414	\noindent
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	415	Unfortunately $\meta{B'}$ is also used in the rule
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	416
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	417	\begin{plstx}[margin=1cm]
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	418	: \meta{B'} ::= \meta{B} \cdot \meta{B'}\\
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	419	\end{plstx}
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	420
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	421	\noindent
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	422	For this we repeat the transformation, giving
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	423
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	424	\begin{plstx}[margin=1cm]
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	425	: \meta{B'} ::= \meta{B} \cdot \meta{B'} \| \meta{B}\\
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	426	\end{plstx}
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	427
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	428	\noindent
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	429	In this case $\alpha$ was substituted with $\meta{B}$ and $\beta$
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	430	was again empty. Once no rule is left over, we can simply throw
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	431	away the $\epsilon$ rule. This gives the grammar
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	432
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	433	\begin{plstx}[margin=1cm]
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	434	: \meta{B} ::= 0 \cdot \meta{B'} \| 1 \cdot \meta{B'} \| 0 \| 1\\
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	435	: \meta{B'} ::= \meta{B} \cdot \meta{B'} \| \meta{B}\\
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	436	\end{plstx}
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	437
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	438	\noindent
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	439	I let you think about whether this grammar can still recognise all
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	440	binary numbers and whether this grammar is non-left-recursive. The
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	441	precise statement for the transformation of removing $\epsilon$-rules is
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	442	that if the original grammar was able to recognise only non-empty
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	443	strings, then the transformed grammar will be equivalent (matching the
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	444	same set of strings); if the original grammar was able to match the
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	445	empty string, then the transformed grammar will be able to match the
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	446	same strings, \emph{except} the empty string. So the $\epsilon$-removal
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	447	does not preserve equivalence of grammars, but the small defect with the
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	448	empty string is not important for practical purposes.
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	449
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	450	So why are these transformations all useful? Well apart from making the
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	451	parser combinators work (remember they cannot deal with left-recursion and
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	452	are inefficient with $\epsilon$-rules), a second reason is that they help
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	453	with getting any insight into the complexity of the parsing problem.
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	454	The parser combinators are very easy to implement, but are far from the
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	455	most efficient way of processing input (they can blow up exponentially
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	456	with ambiguous grammars). The question remains what is the best possible
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	457	complexity for parsing? It turns out that this is $O(n^3)$ for context-free
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	458	languages.
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	459
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	460	To answer the question about complexity, let me describe next the CYK
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	461	algorithm (named after the authors Cocke--Younger--Kasami). This algorithm
681 7b7736bea3ca updated Christian Urban <urbanc@in.tum.de> parents: 680 diff changeset	462	works with grammars that are in \emph{Chomsky normalform}. In Chomsky
7b7736bea3ca updated Christian Urban <urbanc@in.tum.de> parents: 680 diff changeset	463	normalform all rules must be of the form $\meta{A} ::= a$, where $a$ is
7b7736bea3ca updated Christian Urban <urbanc@in.tum.de> parents: 680 diff changeset	464	a terminal, or $\meta{A} ::= \meta{B}\cdot \meta{C}$, where $\meta{B}$ and
7b7736bea3ca updated Christian Urban <urbanc@in.tum.de> parents: 680 diff changeset	465	$\meta{C}$ need to be non-terminals. And no rule can contain $\epsilon$.
7b7736bea3ca updated Christian Urban <urbanc@in.tum.de> parents: 680 diff changeset	466	The following grammar is in Chomsky normalform:
7b7736bea3ca updated Christian Urban <urbanc@in.tum.de> parents: 680 diff changeset	467
7b7736bea3ca updated Christian Urban <urbanc@in.tum.de> parents: 680 diff changeset	468	\begin{plstx}[margin=1cm]
7b7736bea3ca updated Christian Urban <urbanc@in.tum.de> parents: 680 diff changeset	469	: \meta{S\/} ::= \meta{N}\cdot \meta{P}\\
7b7736bea3ca updated Christian Urban <urbanc@in.tum.de> parents: 680 diff changeset	470	: \meta{P\/} ::= \meta{V}\cdot \meta{N}\\
7b7736bea3ca updated Christian Urban <urbanc@in.tum.de> parents: 680 diff changeset	471	: \meta{N\/} ::= \meta{N}\cdot \meta{N}\\
7b7736bea3ca updated Christian Urban <urbanc@in.tum.de> parents: 680 diff changeset	472	: \meta{N\/} ::= \meta{A}\cdot \meta{N}\\
7b7736bea3ca updated Christian Urban <urbanc@in.tum.de> parents: 680 diff changeset	473	: \meta{N\/} ::= \texttt{student} \| \texttt{trainer} \| \texttt{team}
7b7736bea3ca updated Christian Urban <urbanc@in.tum.de> parents: 680 diff changeset	474	\| \texttt{trains}\\
7b7736bea3ca updated Christian Urban <urbanc@in.tum.de> parents: 680 diff changeset	475	: \meta{V\/} ::= \texttt{trains} \| \texttt{team}\\
7b7736bea3ca updated Christian Urban <urbanc@in.tum.de> parents: 680 diff changeset	476	: \meta{A\/} ::= \texttt{The} \| \texttt{the}\\
7b7736bea3ca updated Christian Urban <urbanc@in.tum.de> parents: 680 diff changeset	477	\end{plstx}
7b7736bea3ca updated Christian Urban <urbanc@in.tum.de> parents: 680 diff changeset	478
7b7736bea3ca updated Christian Urban <urbanc@in.tum.de> parents: 680 diff changeset	479	\noindent
7b7736bea3ca updated Christian Urban <urbanc@in.tum.de> parents: 680 diff changeset	480	where $\meta{S}$ is the start symbol and $\meta{S}$, $\meta{P}$,
7b7736bea3ca updated Christian Urban <urbanc@in.tum.de> parents: 680 diff changeset	481	$\meta{N}$, $\meta{V}$ and $\meta{A}$ are non-terminals. The ``words''
7b7736bea3ca updated Christian Urban <urbanc@in.tum.de> parents: 680 diff changeset	482	are terminals. The rough idea behind this grammar is that $\meta{S}$
7b7736bea3ca updated Christian Urban <urbanc@in.tum.de> parents: 680 diff changeset	483	stands for a sentence, $\meta{P}$ is a predicate, $\meta{N}$ is a noun
7b7736bea3ca updated Christian Urban <urbanc@in.tum.de> parents: 680 diff changeset	484	and so on. For example the rule \mbox{$\meta{P} ::= \meta{V}\cdot
7b7736bea3ca updated Christian Urban <urbanc@in.tum.de> parents: 680 diff changeset	485	\meta{N}$} states that a predicate can be a verb followed by a noun.
7b7736bea3ca updated Christian Urban <urbanc@in.tum.de> parents: 680 diff changeset	486	Now the question is whether the string
7b7736bea3ca updated Christian Urban <urbanc@in.tum.de> parents: 680 diff changeset	487
7b7736bea3ca updated Christian Urban <urbanc@in.tum.de> parents: 680 diff changeset	488	\begin{center}
7b7736bea3ca updated Christian Urban <urbanc@in.tum.de> parents: 680 diff changeset	489	\texttt{The trainer trains the student team}
7b7736bea3ca updated Christian Urban <urbanc@in.tum.de> parents: 680 diff changeset	490	\end{center}
7b7736bea3ca updated Christian Urban <urbanc@in.tum.de> parents: 680 diff changeset	491
7b7736bea3ca updated Christian Urban <urbanc@in.tum.de> parents: 680 diff changeset	492	\noindent
7b7736bea3ca updated Christian Urban <urbanc@in.tum.de> parents: 680 diff changeset	493	is recognised by the grammar. The CYK algorithm starts with the
7b7736bea3ca updated Christian Urban <urbanc@in.tum.de> parents: 680 diff changeset	494	following triangular data structure.
680 eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	495
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	496	TBD
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	497
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	498	\end{document}
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	499
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	500
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	501	%%% Parser combinators are now part of handout 6
459 780486571e38 updated Christian Urban <urbanc@in.tum.de> parents: 385 diff changeset	502
780486571e38 updated Christian Urban <urbanc@in.tum.de> parents: 385 diff changeset	503	\subsection*{Parser Combinators}
780486571e38 updated Christian Urban <urbanc@in.tum.de> parents: 385 diff changeset	504
362 57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	505	Let us now turn to the problem of generating a parse-tree for
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	506	a grammar and string. In what follows we explain \emph{parser
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	507	combinators}, because they are easy to implement and closely
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	508	resemble grammar rules. Imagine that a grammar describes the
665 6d74d2a0a4b0 updated Christian Urban <urbanc@in.tum.de> parents: 618 diff changeset	509	strings of natural numbers, such as the grammar \meta{N} shown
362 57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	510	above. For all such strings we want to generate the
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	511	parse-trees or later on we actually want to extract the
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	512	meaning of these strings, that is the concrete integers
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	513	``behind'' these strings. In Scala the parser combinators will
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	514	be functions of type
176 3c2653fc8b5a updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 175 diff changeset	515
3c2653fc8b5a updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 175 diff changeset	516	\begin{center}
177 53def1fbf472 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 176 diff changeset	517	\texttt{I $\Rightarrow$ Set[(T, I)]}
176 3c2653fc8b5a updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 175 diff changeset	518	\end{center}
3c2653fc8b5a updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 175 diff changeset	519
362 57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	520	\noindent that is they take as input something of type
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	521	\texttt{I}, typically a list of tokens or a string, and return
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	522	a set of pairs. The first component of these pairs corresponds
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	523	to what the parser combinator was able to process from the
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	524	input and the second is the unprocessed part of the input. As
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	525	we shall see shortly, a parser combinator might return more
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	526	than one such pair, with the idea that there are potentially
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	527	several ways how to interpret the input. As a concrete
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	528	example, consider the case where the input is of type string,
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	529	say the string
183 b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	530
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	531	\begin{center}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	532	\tt\Grid{iffoo\VS testbar}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	533	\end{center}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	534
362 57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	535	\noindent We might have a parser combinator which tries to
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	536	interpret this string as a keyword (\texttt{if}) or an
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	537	identifier (\texttt{iffoo}). Then the output will be the set
177 53def1fbf472 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 176 diff changeset	538
183 b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	539	\begin{center}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	540	$\left\{ \left(\texttt{\Grid{if}}\,,\, \texttt{\Grid{foo\VS testbar}}\right),
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	541	\left(\texttt{\Grid{iffoo}}\,,\, \texttt{\Grid{\VS testbar}}\right) \right\}$
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	542	\end{center}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	543
362 57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	544	\noindent where the first pair means the parser could
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	545	recognise \texttt{if} from the input and leaves the rest as
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	546	`unprocessed' as the second component of the pair; in the
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	547	other case it could recognise \texttt{iffoo} and leaves
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	548	\texttt{\VS testbar} as unprocessed. If the parser cannot
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	549	recognise anything from the input then parser combinators just
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	550	return the empty set $\{\}$. This will indicate
57ea439feaff updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 360 diff changeset	551	something ``went wrong''.
183 b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	552
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	553	The main attraction is that we can easily build parser combinators out of smaller components
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	554	following very closely the structure of a grammar. In order to implement this in an object
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	555	oriented programming language, like Scala, we need to specify an abstract class for parser
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	556	combinators. This abstract class requires the implementation of the function
177 53def1fbf472 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 176 diff changeset	557	\texttt{parse}, which takes an argument of type \texttt{I} and returns a set of type
53def1fbf472 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 176 diff changeset	558	\mbox{\texttt{Set[(T, I)]}}.
53def1fbf472 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 176 diff changeset	559
53def1fbf472 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 176 diff changeset	560	\begin{center}
53def1fbf472 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 176 diff changeset	561	\begin{lstlisting}[language=Scala,basicstyle=\small\ttfamily, numbers=none]
53def1fbf472 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 176 diff changeset	562	abstract class Parser[I, T] {
53def1fbf472 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 176 diff changeset	563	def parse(ts: I): Set[(T, I)]
53def1fbf472 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 176 diff changeset	564
53def1fbf472 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 176 diff changeset	565	def parse_all(ts: I): Set[T] =
53def1fbf472 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 176 diff changeset	566	for ((head, tail) <- parse(ts); if (tail.isEmpty))
53def1fbf472 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 176 diff changeset	567	yield head
53def1fbf472 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 176 diff changeset	568	}
53def1fbf472 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 176 diff changeset	569	\end{lstlisting}
53def1fbf472 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 176 diff changeset	570	\end{center}
176 3c2653fc8b5a updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 175 diff changeset	571
177 53def1fbf472 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 176 diff changeset	572	\noindent
183 b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	573	From the function \texttt{parse} we can then ``centrally'' derive the function \texttt{parse\_all},
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	574	which just filters out all pairs whose second component is not empty (that is, still has some
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	575	unprocessed part). The reason is that at the end of parsing we are only interested in the
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	576	results where all the input has been consumed and no unprocessed part is left.
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	577
177 53def1fbf472 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 176 diff changeset	578	One of the simplest parser combinators recognises just a character, say $c$,
53def1fbf472 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 176 diff changeset	579	from the beginning of strings. Its behaviour is as follows:
176 3c2653fc8b5a updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 175 diff changeset	580
177 53def1fbf472 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 176 diff changeset	581	\begin{itemize}
53def1fbf472 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 176 diff changeset	582	\item if the input string $s$ starts with the character $c$, it returns
53def1fbf472 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 176 diff changeset	583	the set $\{(c, \textit{tail of}\; s)\}$
53def1fbf472 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 176 diff changeset	584	\item otherwise it returns the empty set $\varnothing$
53def1fbf472 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 176 diff changeset	585	\end{itemize}
53def1fbf472 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 176 diff changeset	586
53def1fbf472 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 176 diff changeset	587	\noindent
53def1fbf472 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 176 diff changeset	588	The input type of this simple parser combinator for characters is
53def1fbf472 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 176 diff changeset	589	\texttt{String} and the output type \mbox{\texttt{Set[(Char, String)]}}.
53def1fbf472 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 176 diff changeset	590	The code in Scala is as follows:
53def1fbf472 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 176 diff changeset	591
53def1fbf472 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 176 diff changeset	592	\begin{center}
53def1fbf472 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 176 diff changeset	593	\begin{lstlisting}[language=Scala,basicstyle=\small\ttfamily, numbers=none]
53def1fbf472 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 176 diff changeset	594	case class CharParser(c: Char) extends Parser[String, Char] {
53def1fbf472 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 176 diff changeset	595	def parse(sb: String) =
53def1fbf472 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 176 diff changeset	596	if (sb.head == c) Set((c, sb.tail)) else Set()
53def1fbf472 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 176 diff changeset	597	}
53def1fbf472 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 176 diff changeset	598	\end{lstlisting}
53def1fbf472 updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 176 diff changeset	599	\end{center}
176 3c2653fc8b5a updated Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 175 diff changeset	600
183 b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	601	\noindent
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	602	The \texttt{parse} function tests whether the first character of the
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	603	input string \texttt{sb} is equal to \texttt{c}. If yes, then it splits the
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	604	string into the recognised part \texttt{c} and the unprocessed part
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	605	\texttt{sb.tail}. In case \texttt{sb} does not start with \texttt{c} then
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	606	the parser returns the empty set (in Scala \texttt{Set()}).
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	607
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	608	More interesting are the parser combinators that build larger parsers
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	609	out of smaller component parsers. For example the alternative
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	610	parser combinator is as follows.
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	611
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	612	\begin{center}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	613	\begin{lstlisting}[language=Scala,basicstyle=\small\ttfamily, numbers=none]
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	614	class AltParser[I, T]
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	615	(p: => Parser[I, T],
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	616	q: => Parser[I, T]) extends Parser[I, T] {
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	617	def parse(sb: I) = p.parse(sb) ++ q.parse(sb)
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	618	}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	619	\end{lstlisting}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	620	\end{center}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	621
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	622	\noindent
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	623	The types of this parser combinator are polymorphic (we just have \texttt{I}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	624	for the input type, and \texttt{T} for the output type). The alternative parser
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	625	builds a new parser out of two existing parser combinators \texttt{p} and \texttt{q}.
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	626	Both need to be able to process input of type \texttt{I} and return the same
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	627	output type \texttt{Set[(T, I)]}. (There is an interesting detail of Scala, namely the
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	628	\texttt{=>} in front of the types of \texttt{p} and \texttt{q}. They will prevent the
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	629	evaluation of the arguments before they are used. This is often called
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	630	\emph{lazy evaluation} of the arguments.) The alternative parser should run
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	631	the input with the first parser \texttt{p} (producing a set of outputs) and then
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	632	run the same input with \texttt{q}. The result should be then just the union
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	633	of both sets, which is the operation \texttt{++} in Scala.
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	634
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	635	This parser combinator already allows us to construct a parser that recognises either
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	636	a character \texttt{a} or \texttt{b}, as
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	637
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	638	\begin{center}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	639	\begin{lstlisting}[language=Scala,basicstyle=\small\ttfamily, numbers=none]
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	640	new AltParser(CharParser('a'), CharParser('b'))
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	641	\end{lstlisting}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	642	\end{center}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	643
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	644	\noindent
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	645	Scala allows us to introduce some more readable shorthand notation for this, like \texttt{'a' \|\| 'b'}.
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	646	We can call this parser combinator with the strings
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	647
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	648	\begin{center}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	649	\begin{tabular}{rcl}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	650	input string & & output\medskip\\
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	651	\texttt{\Grid{ac}} & $\rightarrow$ & $\left\{(\texttt{\Grid{a}}, \texttt{\Grid{c}})\right\}$\\
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	652	\texttt{\Grid{bc}} & $\rightarrow$ & $\left\{(\texttt{\Grid{b}}, \texttt{\Grid{c}})\right\}$\\
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	653	\texttt{\Grid{cc}} & $\rightarrow$ & $\varnothing$
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	654	\end{tabular}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	655	\end{center}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	656
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	657	\noindent
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	658	We receive in the first two cases a successful output (that is a non-empty set).
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	659
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	660	A bit more interesting is the \emph{sequence parser combinator} implemented in
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	661	Scala as follows:
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	662
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	663	\begin{center}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	664	\begin{lstlisting}[language=Scala,basicstyle=\small\ttfamily, numbers=none]
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	665	class SeqParser[I, T, S]
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	666	(p: => Parser[I, T],
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	667	q: => Parser[I, S]) extends Parser[I, (T, S)] {
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	668	def parse(sb: I) =
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	669	for ((head1, tail1) <- p.parse(sb);
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	670	(head2, tail2) <- q.parse(tail1))
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	671	yield ((head1, head2), tail2)
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	672	}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	673	\end{lstlisting}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	674	\end{center}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	675
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	676	\noindent
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	677	This parser takes as input two parsers, \texttt{p} and \texttt{q}. It implements \texttt{parse}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	678	as follows: let us first run the parser \texttt{p} on the input producing a set of pairs (\texttt{head1}, \texttt{tail1}).
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	679	The \texttt{tail1} stands for the unprocessed parts left over by \texttt{p}.
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	680	Let \texttt{q} run on these unprocessed parts
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	681	producing again a set of pairs. The output of the sequence parser combinator is then a set
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	682	containing pairs where the first components are again pairs, namely what the first parser could parse
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	683	together with what the second parser could parse; the second component is the unprocessed
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	684	part left over after running the second parser \texttt{q}. Therefore the input type of
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	685	the sequence parser combinator is as usual \texttt{I}, but the output type is
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	686
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	687	\begin{center}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	688	\texttt{Set[((T, S), I)]}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	689	\end{center}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	690
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	691	Scala allows us to provide some
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	692	shorthand notation for the sequence parser combinator. So we can write for
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	693	example \texttt{'a' $\sim$ 'b'}, which is the
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	694	parser combinator that first consumes the character \texttt{a} from a string and then \texttt{b}.
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	695	Calling this parser combinator with the strings below produces the following outputs:
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	696
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	697	\begin{center}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	698	\begin{tabular}{rcl}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	699	input string & & output\medskip\\
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	700	\texttt{\Grid{abc}} & $\rightarrow$ & $\left\{((\texttt{\Grid{a}}, \texttt{\Grid{b}}), \texttt{\Grid{c}})\right\}$\\
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	701	\texttt{\Grid{bac}} & $\rightarrow$ & $\varnothing$\\
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	702	\texttt{\Grid{ccc}} & $\rightarrow$ & $\varnothing$
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	703	\end{tabular}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	704	\end{center}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	705
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	706	\noindent
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	707	A slightly more complicated parser is \texttt{('a' \|\| 'b') $\sim$ 'b'} which parses as first character either
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	708	an \texttt{a} or \texttt{b} followed by a \texttt{b}. This parser produces the following results.
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	709
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	710	\begin{center}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	711	\begin{tabular}{rcl}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	712	input string & & output\medskip\\
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	713	\texttt{\Grid{abc}} & $\rightarrow$ & $\left\{((\texttt{\Grid{a}}, \texttt{\Grid{b}}), \texttt{\Grid{c}})\right\}$\\
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	714	\texttt{\Grid{bbc}} & $\rightarrow$ & $\left\{((\texttt{\Grid{b}}, \texttt{\Grid{b}}), \texttt{\Grid{c}})\right\}$\\
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	715	\texttt{\Grid{aac}} & $\rightarrow$ & $\varnothing$
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	716	\end{tabular}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	717	\end{center}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	718
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	719	Note carefully that constructing the parser \texttt{'a' \|\| ('a' $\sim$ 'b')} will result in a typing error.
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	720	The first parser has as output type a single character (recall the type of \texttt{CharParser}),
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	721	but the second parser produces a pair of characters as output. The alternative parser is however
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	722	required to have both component parsers of the same type. We will see later how we can
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	723	build this parser without the typing error.
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	724
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	725	The next parser combinator does not actually combine smaller parsers, but applies
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	726	a function to the result of the parser. It is implemented in Scala as follows
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	727
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	728	\begin{center}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	729	\begin{lstlisting}[language=Scala,basicstyle=\small\ttfamily, numbers=none]
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	730	class FunParser[I, T, S]
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	731	(p: => Parser[I, T],
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	732	f: T => S) extends Parser[I, S] {
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	733	def parse(sb: I) =
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	734	for ((head, tail) <- p.parse(sb)) yield (f(head), tail)
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	735	}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	736	\end{lstlisting}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	737	\end{center}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	738
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	739
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	740	\noindent
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	741	This parser combinator takes a parser \texttt{p} with output type \texttt{T} as
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	742	input as well as a function \texttt{f} with type \texttt{T => S}. The parser \texttt{p}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	743	produces a set of pairs of type \texttt{(T, I)}. The \texttt{FunParser} combinator then
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	744	applies the function \texttt{f} to all the parser outputs. Since this function
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	745	is of type \texttt{T => S}, we obtain a parser with output type \texttt{S}.
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	746	Again Scala lets us introduce some shorthand notation for this parser combinator.
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	747	Therefore we will write \texttt{p ==> f} for it.
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	748
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	749	%\bigskip
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	750	%takes advantage of the full generality---have a look
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	751	%what it produces if we call it with the string \texttt{abc}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	752	%
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	753	%\begin{center}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	754	%\begin{tabular}{rcl}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	755	%input string & & output\medskip\\
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	756	%\texttt{\Grid{abc}} & $\rightarrow$ & $\left\{((\texttt{\Grid{a}}, \texttt{\Grid{b}}), \texttt{\Grid{c}})\right\}$\\
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	757	%\texttt{\Grid{bbc}} & $\rightarrow$ & $\left\{((\texttt{\Grid{b}}, \texttt{\Grid{b}}), \texttt{\Grid{c}})\right\}$\\
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	758	%\texttt{\Grid{aac}} & $\rightarrow$ & $\varnothing$
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	759	%\end{tabular}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	760	%\end{center}
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	761
b17eff695c7f added new stuff Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 177 diff changeset	762
173 7cfb7a6f7c99 added slides Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff changeset	763
680 eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	764
eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	765
173 7cfb7a6f7c99 added slides Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff changeset	766
7cfb7a6f7c99 added slides Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff changeset	767	%%% Local Variables:
7cfb7a6f7c99 added slides Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff changeset	768	%%% mode: latex
7cfb7a6f7c99 added slides Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff changeset	769	%%% TeX-master: t
7cfb7a6f7c99 added slides Christian Urban <christian dot urban at kcl dot ac dot uk> parents: diff changeset	770	%%% End:
680 eecc4d5a2172 updated Christian Urban <urbanc@in.tum.de> parents: 665 diff changeset	771

author	Christian Urban <urbanc@in.tum.de>
	Wed, 06 Nov 2019 17:09:58 +0000 (2019-11-06)
changeset 681	7b7736bea3ca
parent 680	eecc4d5a2172
child 682	553b4d4e3719
permissions	-rw-r--r--