author | Christian Urban <christian dot urban at kcl dot ac dot uk> |
Tue, 28 Oct 2014 06:01:00 +0000 | |
changeset 291 | 201c2c6d8696 |
parent 217 | cd6066f1056a |
child 292 | 7ed2a25dd115 |
permissions | -rw-r--r-- |
151
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
1 |
\documentclass{article} |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
2 |
\usepackage{hyperref} |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
3 |
\usepackage{amssymb} |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
4 |
\usepackage{amsmath} |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
5 |
\usepackage[T1]{fontenc} |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
6 |
\usepackage{tikz} |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
7 |
\usetikzlibrary{arrows} |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
8 |
\usetikzlibrary{automata} |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
9 |
\usetikzlibrary{shapes} |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
10 |
\usetikzlibrary{shadows} |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
11 |
\usetikzlibrary{positioning} |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
12 |
\usetikzlibrary{calc} |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
13 |
\usetikzlibrary{fit} |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
14 |
\usetikzlibrary{backgrounds} |
217
cd6066f1056a
updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
173
diff
changeset
|
15 |
\usepackage{../langs} |
151
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
16 |
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
17 |
\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}% |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
18 |
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
19 |
|
153
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
20 |
\newcommand\grid[1]{% |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
21 |
\begin{tikzpicture}[baseline=(char.base)] |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
22 |
\path[use as bounding box] |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
23 |
(0,0) rectangle (1em,1em); |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
24 |
\draw[red!50, fill=red!20] |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
25 |
(0,0) rectangle (1em,1em); |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
26 |
\node[inner sep=1pt,anchor=base west] |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
27 |
(char) at (0em,\gridraiseamount) {#1}; |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
28 |
\end{tikzpicture}} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
29 |
\newcommand\gridraiseamount{0.12em} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
30 |
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
31 |
\makeatletter |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
32 |
\newcommand\Grid[1]{% |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
33 |
\@tfor\z:=#1\do{\grid{\z}}} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
34 |
\makeatother |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
35 |
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
36 |
\newcommand\Vspace[1][.3em]{% |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
37 |
\mbox{\kern.06em\vrule height.3ex}% |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
38 |
\vbox{\hrule width#1}% |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
39 |
\hbox{\vrule height.3ex}} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
40 |
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
41 |
\def\VS{\Vspace[0.6em]} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
42 |
|
151
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
43 |
\begin{document} |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
44 |
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
45 |
\section*{Handout 5} |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
46 |
|
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
47 |
Whenever you want to design a new programming language or implement a compiler for an |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
48 |
existing language, the first task is to fix the basic ``words'' of the language. For example what are the |
159
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
49 |
keywords, or reserved words, of the language, what are permitted identifiers, numbers, expressions and so |
153
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
50 |
on. One convenient way to do this is, of |
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
51 |
course, by using regular expressions. |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
52 |
|
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
53 |
In this course we want to take a closer look at the |
159
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
54 |
WHILE programming language. This is a simple imperative programming language consisting of arithmetic |
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
55 |
expressions, assignments, if-statements and loops. For example the Fibonacci program can be |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
56 |
written in this language as follows: |
151
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
57 |
|
153
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
58 |
\begin{center} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
59 |
\mbox{\lstinputlisting[language=while]{../progs/fib.while}} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
60 |
\end{center} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
61 |
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
62 |
\noindent |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
63 |
The keywords in this language will be |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
64 |
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
65 |
\begin{center} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
66 |
\texttt{while}, \texttt{if}, \texttt{then}, \texttt{else}, \texttt{write}, \texttt{read} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
67 |
\end{center} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
68 |
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
69 |
\noindent |
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
70 |
In addition we will have some common operators, such as \texttt{<}, \texttt{>}, \texttt{:=} and so on, as well as numbers |
159
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
71 |
and strings (which we however ignore for the moment). We also need to specify what the ``whitespace'' |
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
72 |
is in our programming language and what comments should look like. |
159
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
73 |
As a first try, we might specify the regular expressions for our language roughly as follows |
153
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
74 |
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
75 |
\begin{center} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
76 |
\begin{tabular}{lcl} |
159
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
77 |
$\textit{LETTER}$ & $:=$ & $\texttt{a} + \texttt{A} + \texttt{b} + \texttt{B} + \ldots$\\ |
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
78 |
$\textit{DIGIT}$ & $:=$ & $\texttt{0} + \texttt{1} + \texttt{2} + \ldots$\\ |
153
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
79 |
$\textit{KEYWORD}$ & $:=$ & $\texttt{while} + \texttt{if} + \texttt{then} + \texttt{else} + \dots$\\ |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
80 |
$\textit{IDENT}$ & $:=$ & $\textit{LETTER} \cdot (\textit{LETTER} + \textit{DIGIT} + {\_})^*$\\ |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
81 |
$\textit{OP}$ & $:=$ & $\texttt{:=} + \texttt{<} + \ldots$\\ |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
82 |
$\textit{NUM}$ & $:=$ & $\textit{DIGIT}^+$\\ |
159
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
83 |
$\textit{WHITESPACE}$ & $:=$ & $("\hspace{2mm}" + \backslash\texttt{n})^+$ |
153
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
84 |
\end{tabular} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
85 |
\end{center} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
86 |
|
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
87 |
Having the regular expressions in place, the problem we have to solve is: |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
88 |
given a string of our programming language, which regular expression |
159
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
89 |
matches which part of the string. By solving this problem, we can split up a string |
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
90 |
of our language into components. |
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
91 |
For example given the input string |
153
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
92 |
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
93 |
\begin{center} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
94 |
\texttt{\Grid{if\VS true\VS then\VS x+2\VS else\VS x+3}} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
95 |
\end{center} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
96 |
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
97 |
\noindent |
161
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
98 |
we expect it is split up as follows |
153
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
99 |
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
100 |
\begin{center} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
101 |
\tt |
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
102 |
\Grid{if}\, |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
103 |
\Grid{\VS}\, |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
104 |
\Grid{true}\, |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
105 |
\Grid{\VS}\, |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
106 |
\Grid{then}\, |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
107 |
\Grid{\VS}\, |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
108 |
\Grid{x}\, |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
109 |
\Grid{+}\, |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
110 |
\Grid{2}\, |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
111 |
\Grid{\VS}\, |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
112 |
\Grid{else}\, |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
113 |
\Grid{\VS}\, |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
114 |
\Grid{x}\, |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
115 |
\Grid{+}\, |
153
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
116 |
\Grid{3} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
117 |
\end{center} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
118 |
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
119 |
\noindent |
161
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
120 |
This process of splitting up an input string into components is often called \emph{lexing} or \emph{scanning}. |
153
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
121 |
It is usually the first phase of a compiler. Note that the separation into words cannot, in general, |
161
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
122 |
be done by just looking at whitespaces: while \texttt{if} and \texttt{true} are separated by a whitespace |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
123 |
in the example above, this is not always the case. As can be seen the three components in \texttt{x+2} are |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
124 |
not separated by any whitespace. Another reason for recognising whitespaces explicitly is |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
125 |
that in some languages, for example Python, whitespaces matters, that is carry meaning. However in |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
126 |
our small language we will eventually just filter out all whitespaces and also all comments. |
153
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
127 |
|
162
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
128 |
Lexing not just separates a string into its components, but also classifies the components, meaning it explicitly records that \texttt{if} is a keyword, \VS{} a whitespace, \texttt{true} an identifier and so on. |
161
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
129 |
For the moment, though, we will only focus on the simpler problem of just splitting up |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
130 |
a string into components. |
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
131 |
|
159
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
132 |
There are a few subtleties we need to consider first. For example, say the string is |
154
51d6b8b828c4
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
153
diff
changeset
|
133 |
|
51d6b8b828c4
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
153
diff
changeset
|
134 |
\begin{center} |
161
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
135 |
\texttt{\Grid{iffoo\VS}}\;\ldots |
154
51d6b8b828c4
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
153
diff
changeset
|
136 |
\end{center} |
51d6b8b828c4
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
153
diff
changeset
|
137 |
|
51d6b8b828c4
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
153
diff
changeset
|
138 |
\noindent |
159
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
139 |
then there are two possibilities for how it can be split up: either we regard the input as the keyword \texttt{if} followed |
154
51d6b8b828c4
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
153
diff
changeset
|
140 |
by the identifier \texttt{foo} (both regular expressions match) or we regard \texttt{iffoo} as a |
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
141 |
single identifier. The choice that is often made in lexers is to look for the longest possible match. |
162
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
142 |
This leaves \texttt{iffoo} as the only match in this case (since it is longer than \texttt{if}). |
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
143 |
|
161
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
144 |
Unfortunately, the convention about the longest match does not yet make the process |
159
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
145 |
of lexing completely deterministic. Consider the string |
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
146 |
|
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
147 |
\begin{center} |
161
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
148 |
\texttt{\Grid{then\VS}}\;\dots |
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
149 |
\end{center} |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
150 |
|
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
151 |
\noindent |
159
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
152 |
Clearly, this string should be identified as a keyword. The problem is that also the regular expression \textit{IDENT} for identifiers matches this string. To overcome this ambiguity we need to rank our |
156
95eaee695636
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
155
diff
changeset
|
153 |
regular expressions. In our running example we just use the ranking |
95eaee695636
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
155
diff
changeset
|
154 |
|
95eaee695636
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
155
diff
changeset
|
155 |
\[ |
95eaee695636
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
155
diff
changeset
|
156 |
\textit{KEYWORD} < \textit{IDENT} < \textit{OP} < \ldots |
95eaee695636
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
155
diff
changeset
|
157 |
\] |
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
158 |
|
156
95eaee695636
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
155
diff
changeset
|
159 |
\noindent |
160
8134c3b981e0
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
159
diff
changeset
|
160 |
So even if both regular expressions match in the example above, |
8134c3b981e0
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
159
diff
changeset
|
161 |
we give preference to the regular expression for keywords. |
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
162 |
|
161
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
163 |
Let us see how our algorithm for lexing works in detail. In addition to the functions $nullable$ |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
164 |
and $der$, it will use the function \emph{zeroable} defined as follows: |
157
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
165 |
|
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
166 |
\begin{center} |
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
167 |
\begin{tabular}{@ {}l@ {\hspace{2mm}}c@ {\hspace{2mm}}l@ {}} |
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
168 |
$zeroable(\varnothing)$ & $\dn$ & $true$\\ |
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
169 |
$zeroable(\epsilon)$ & $\dn$ & $f\!alse$\\ |
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
170 |
$zeroable (c)$ & $\dn$ & $f\!alse$\\ |
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
171 |
$zeroable (r_1 + r_2)$ & $\dn$ & $zeroable(r_1) \wedge zeroable(r_2)$ \\ |
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
172 |
$zeroable (r_1 \cdot r_2)$ & $\dn$ & $zeroable(r_1) \vee zeroable(r_2)$ \\ |
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
173 |
$zeroable (r^*)$ & $\dn$ & $f\!alse$\\ |
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
174 |
\end{tabular} |
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
175 |
\end{center} |
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
176 |
|
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
177 |
\noindent |
161
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
178 |
Recall that the function $nullable(r)$ tests whether a regular expression |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
179 |
can match the empty string. The function $zeroable$, on the other hand, tests whether a regular |
158
77e8397ec2ec
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
157
diff
changeset
|
180 |
expression cannot match anything at all. The mathematical way of stating this |
160
8134c3b981e0
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
159
diff
changeset
|
181 |
property is |
157
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
182 |
|
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
183 |
\begin{center} |
160
8134c3b981e0
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
159
diff
changeset
|
184 |
$zeroable(r)$ if and only if $L(r) = \varnothing$ |
157
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
185 |
\end{center} |
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
186 |
|
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
187 |
\noindent |
161
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
188 |
For what follows let us fix a set of regular expressions $rs$ as being |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
189 |
|
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
190 |
\begin{center} |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
191 |
\textit{KEYWORD}, \textit{IDENT}, \textit{WHITESPACE} |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
192 |
\end{center} |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
193 |
|
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
194 |
\noindent |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
195 |
specifying the ``words'' |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
196 |
of our programming language. The algorithm takes as input the $rs$ and a string, say |
158
77e8397ec2ec
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
157
diff
changeset
|
197 |
|
77e8397ec2ec
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
157
diff
changeset
|
198 |
\begin{center} |
161
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
199 |
\texttt{\grid{$c_1$}\grid{$c_2$}\grid{$c_3$}\grid{$c_4$}}\;\ldots |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
200 |
\end{center} |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
201 |
|
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
202 |
\noindent |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
203 |
and tries to chop off one word from the beginning of the string. If none of the |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
204 |
regular expression in $rs$ matches, we will just return |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
205 |
the empty string. |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
206 |
|
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
207 |
The crucial idea in the algorithm is to build the derivatives of all regular expressions in $rs$ with respect |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
208 |
to the first character $c_1$. Then we take the results and continue with |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
209 |
building the derivatives with respect to $c_2$ until we have either exhausted our |
162
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
210 |
input string or all of the regular expressions are ``zeroable''. Suppose the input string is |
161
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
211 |
|
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
212 |
\begin{center} |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
213 |
\texttt{\Grid{if2\VS}}\;\dots |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
214 |
\end{center} |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
215 |
|
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
216 |
\noindent |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
217 |
then building the derivatives with respect to \texttt{i} gives |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
218 |
|
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
219 |
\begin{center} |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
220 |
\begin{tabular}{l|c} |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
221 |
& $zeroable$\\\hline |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
222 |
$der\;\texttt{i}\;(\textit{KEYWORD})$ & no\\ |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
223 |
$der\;\texttt{i}\;(\textit{IDENT})$ & no\\ |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
224 |
$der\;\texttt{i}\;(\textit{WHITESPACE})$ & yes\\ |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
225 |
\end{tabular} |
158
77e8397ec2ec
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
157
diff
changeset
|
226 |
\end{center} |
77e8397ec2ec
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
157
diff
changeset
|
227 |
|
77e8397ec2ec
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
157
diff
changeset
|
228 |
\noindent |
162
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
229 |
We can eliminate \textit{WHITESPACE} as a potential candidate, because |
161
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
230 |
no derivative can go from $zeroable = \text{yes}$ to no. That leaves the other |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
231 |
two regular expressions as potential candidate and we have to consider the |
162
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
232 |
next character, \texttt{f}, from the input string |
161
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
233 |
|
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
234 |
\begin{center} |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
235 |
\begin{tabular}{l|c} |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
236 |
& $zeroable$\\\hline |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
237 |
$der\;\texttt{f}\;(der\;\texttt{i}\;(\textit{KEYWORD}))$ & no\\ |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
238 |
$der\;\texttt{f}\;(der\;\texttt{i}\;(\textit{IDENT}))$ & no\\ |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
239 |
\end{tabular} |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
240 |
\end{center} |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
241 |
|
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
242 |
\noindent |
162
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
243 |
Since both are `no', we have to continue with \texttt{2} from the input string |
161
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
244 |
|
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
245 |
\begin{center} |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
246 |
\begin{tabular}{l|c} |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
247 |
& $zeroable$\\\hline |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
248 |
$der\;\texttt{2}\;(der\;\texttt{f}\;(der\;\texttt{i}\;(\textit{KEYWORD})))$ & yes\\ |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
249 |
$der\;\texttt{2}\;(der\;\texttt{f}\;(der\;\texttt{i}\;(\textit{IDENT})))$ & no\\ |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
250 |
\end{tabular} |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
251 |
\end{center} |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
252 |
|
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
253 |
\noindent |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
254 |
Although we now know that the beginning is definitely an \textit{IDENT}, we do not yet |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
255 |
know how much of the input string should be considered as an \textit{IDENT}. So we |
162
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
256 |
still have to continue and consider the next derivative. |
161
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
257 |
|
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
258 |
\begin{center} |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
259 |
\begin{tabular}{l|c} |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
260 |
& $zeroable$\\\hline |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
261 |
$der\;\texttt{\VS}\;(der\;\texttt{2}\;(der\;\texttt{f}\;(der\;\texttt{i}\;(\textit{IDENT}))))$ & yes\\ |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
262 |
\end{tabular} |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
263 |
\end{center} |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
264 |
|
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
265 |
\noindent |
162
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
266 |
Since the answer is now `yes' also in this case, we can stop: once all derivatives are |
161
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
267 |
zeroable, we know the regular expressions cannot match any more letters from |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
268 |
the input. In this case we only have to go back to the derivative that is nullable. In this case it |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
269 |
is |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
270 |
|
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
271 |
\begin{center} |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
272 |
$der\;\texttt{2}\;(der\;\texttt{f}\;(der\;\texttt{i}\;(\textit{IDENT})))$ |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
273 |
\end{center} |
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
274 |
|
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
275 |
\noindent |
162
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
276 |
which means we recognised an identifier. In case where there is a choice of more than one |
161
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
277 |
regular expressions that are nullable, then we choose the one with the highest precedence. |
162
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
278 |
You can try out such a case with the input string |
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
279 |
|
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
280 |
\begin{center} |
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
281 |
\texttt{\Grid{then\VS}}\;\dots |
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
282 |
\end{center} |
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
283 |
|
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
284 |
\noindent |
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
285 |
which can both be recognised as a keyword, but also an identifier. |
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
286 |
|
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
287 |
While in the example above the last nullable derivative is the one directly before |
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
288 |
the derivative turns zeroable, this is not always the case. Imagine, identifiers can be |
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
289 |
letters, as permuted by the regular expression \textit{IDENT}, but must end with an |
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
290 |
undercore. |
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
291 |
|
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
292 |
\begin{center} |
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
293 |
\begin{tabular}{lcl} |
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
294 |
$\textit{NEWIDENT}$ & $:=$ & $\textit{LETTER} \cdot (\textit{LETTER} + \textit{DIGIT} + {\_})^* \cdot \_$\\ |
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
295 |
\end{tabular} |
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
296 |
\end{center} |
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
297 |
|
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
298 |
\noindent |
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
299 |
If we use \textit{NEWIDENT} with the input string |
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
300 |
|
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
301 |
\begin{center} |
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
302 |
\texttt{\Grid{iffoo\VS}}\;\ldots |
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
303 |
\end{center} |
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
304 |
|
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
305 |
\noindent |
163
89d6d89d9844
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
162
diff
changeset
|
306 |
then it will only become $zeroable$ after the $\VS$ has been analysed. In this case we have to go back to the |
162
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
307 |
first \texttt{f} because only |
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
308 |
|
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
309 |
\begin{center} |
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
310 |
$der\;\texttt{f}\;(der\;\texttt{i}\;(\textit{KEYWORD}))$ |
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
311 |
\end{center} |
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
312 |
|
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
313 |
\noindent |
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
314 |
is nullable. As a result we recognise successfully the keyword \texttt{if} and the remaining |
edcd84c7b491
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
161
diff
changeset
|
315 |
string needs to be consumed by other regular expressions or lead to a lexing error. |
161
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
316 |
|
bfcf838dda4d
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
160
diff
changeset
|
317 |
|
157
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
318 |
|
151
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
319 |
\end{document} |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
320 |
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
321 |
%%% Local Variables: |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
322 |
%%% mode: latex |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
323 |
%%% TeX-master: t |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
324 |
%%% End: |