handouts/ho05.tex
author Christian Urban <christian dot urban at kcl dot ac dot uk>
Tue, 28 Oct 2014 06:01:00 +0000
changeset 291 201c2c6d8696
parent 217 cd6066f1056a
child 292 7ed2a25dd115
permissions -rw-r--r--
updated
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
151
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
     1
\documentclass{article}
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
     2
\usepackage{hyperref}
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
     3
\usepackage{amssymb}
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
     4
\usepackage{amsmath}
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
     5
\usepackage[T1]{fontenc}
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
     6
\usepackage{tikz}
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
     7
\usetikzlibrary{arrows}
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
     8
\usetikzlibrary{automata}
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
     9
\usetikzlibrary{shapes}
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    10
\usetikzlibrary{shadows}
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    11
\usetikzlibrary{positioning}
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    12
\usetikzlibrary{calc}
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    13
\usetikzlibrary{fit}
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    14
\usetikzlibrary{backgrounds}
217
cd6066f1056a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 173
diff changeset
    15
\usepackage{../langs}
151
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    16
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    17
\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    18
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    19
	
153
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    20
\newcommand\grid[1]{%
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    21
\begin{tikzpicture}[baseline=(char.base)]
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    22
  \path[use as bounding box]
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    23
    (0,0) rectangle (1em,1em);
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    24
  \draw[red!50, fill=red!20]
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    25
    (0,0) rectangle (1em,1em);
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    26
  \node[inner sep=1pt,anchor=base west]
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    27
    (char) at (0em,\gridraiseamount) {#1};
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    28
\end{tikzpicture}}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    29
\newcommand\gridraiseamount{0.12em}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    30
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    31
\makeatletter
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    32
\newcommand\Grid[1]{%
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    33
  \@tfor\z:=#1\do{\grid{\z}}}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    34
\makeatother	
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    35
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    36
\newcommand\Vspace[1][.3em]{%
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    37
  \mbox{\kern.06em\vrule height.3ex}%
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    38
  \vbox{\hrule width#1}%
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    39
  \hbox{\vrule height.3ex}}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    40
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    41
\def\VS{\Vspace[0.6em]}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    42
	
151
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    43
\begin{document}
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    44
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    45
\section*{Handout 5}
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    46
155
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
    47
Whenever you want to design a new programming language or implement a compiler for an
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
    48
existing language, the first task is to fix the basic ``words'' of the language. For example what are the 
159
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 158
diff changeset
    49
keywords, or reserved words, of the language, what are permitted identifiers, numbers, expressions and so 
153
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    50
on. One convenient way to do this is, of 
155
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
    51
course, by using regular expressions. 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
    52
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
    53
In this course we want to take a closer look at the 
159
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 158
diff changeset
    54
WHILE programming language. This is a simple imperative programming language consisting of arithmetic
155
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
    55
expressions, assignments, if-statements and loops. For example the Fibonacci program can be
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
    56
written in this language as follows:
151
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    57
153
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    58
\begin{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    59
\mbox{\lstinputlisting[language=while]{../progs/fib.while}}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    60
\end{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    61
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    62
\noindent
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    63
The keywords in this language will be
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    64
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    65
\begin{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    66
\texttt{while}, \texttt{if}, \texttt{then}, \texttt{else}, \texttt{write}, \texttt{read}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    67
\end{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    68
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    69
\noindent
155
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
    70
In addition we will have some common operators, such as \texttt{<}, \texttt{>}, \texttt{:=} and so on, as well as numbers
159
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 158
diff changeset
    71
and strings (which we however ignore for the moment). We also need to specify what the ``whitespace''
155
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
    72
is in our programming language and what comments should look like.
159
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 158
diff changeset
    73
As a first try, we might specify the regular expressions for our language roughly as follows
153
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    74
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    75
\begin{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    76
\begin{tabular}{lcl}
159
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 158
diff changeset
    77
$\textit{LETTER}$ & $:=$ & $\texttt{a} + \texttt{A} + \texttt{b} + \texttt{B} + \ldots$\\
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 158
diff changeset
    78
$\textit{DIGIT}$ & $:=$ & $\texttt{0} + \texttt{1} + \texttt{2} + \ldots$\\
153
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    79
$\textit{KEYWORD}$ & $:=$ & $\texttt{while}  + \texttt{if} + \texttt{then} + \texttt{else} + \dots$\\
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    80
$\textit{IDENT}$ & $:=$ & $\textit{LETTER} \cdot (\textit{LETTER} + \textit{DIGIT} + {\_})^*$\\ 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    81
$\textit{OP}$      &  $:=$ & $\texttt{:=} + \texttt{<} + \ldots$\\
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    82
$\textit{NUM}$   &  $:=$ & $\textit{DIGIT}^+$\\
159
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 158
diff changeset
    83
$\textit{WHITESPACE}$ & $:=$ & $("\hspace{2mm}" + \backslash\texttt{n})^+$
153
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    84
\end{tabular}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    85
\end{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    86
155
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
    87
Having the regular expressions in place, the problem we have to solve is: 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
    88
given a string of our programming language, which regular expression 
159
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 158
diff changeset
    89
matches which part of the string. By solving this problem, we can split up a string 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 158
diff changeset
    90
of our language into components. 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 158
diff changeset
    91
For example given the input string
153
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    92
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    93
\begin{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    94
\texttt{\Grid{if\VS true\VS then\VS x+2\VS else\VS x+3}}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    95
\end{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    96
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    97
\noindent
161
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
    98
we expect it is split up as follows
153
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    99
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   100
\begin{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   101
\tt
155
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   102
\Grid{if}\,
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   103
\Grid{\VS}\, 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   104
\Grid{true}\, 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   105
\Grid{\VS}\, 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   106
\Grid{then}\, 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   107
\Grid{\VS}\, 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   108
\Grid{x}\, 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   109
\Grid{+}\, 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   110
\Grid{2}\,
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   111
\Grid{\VS}\, 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   112
\Grid{else}\,
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   113
\Grid{\VS}\,
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   114
\Grid{x}\,
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   115
\Grid{+}\,
153
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   116
\Grid{3}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   117
\end{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   118
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   119
\noindent
161
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   120
This process of splitting up an input string into components is often called \emph{lexing} or \emph{scanning}.
153
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   121
It is usually the first phase of a compiler. Note that the separation into words cannot, in general, 
161
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   122
be done by just looking at whitespaces: while \texttt{if} and \texttt{true} are separated by a whitespace
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   123
in the example above, this is not always the case. As can be seen the three components in \texttt{x+2} are 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   124
not separated by any whitespace. Another reason for recognising whitespaces explicitly is
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   125
that in some languages, for example Python, whitespaces matters, that is carry meaning. However in 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   126
our small language we will eventually just filter out all whitespaces and also all comments.
153
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   127
162
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   128
Lexing not just separates a string into its components, but also classifies the components, meaning it explicitly records that \texttt{if} is a keyword,  \VS{} a whitespace, \texttt{true} an identifier and so on.
161
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   129
For the moment, though, we will only focus on the simpler problem of just splitting up 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   130
a string into components.
155
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   131
159
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 158
diff changeset
   132
There are a few subtleties  we need to consider first. For example, say the string is
154
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 153
diff changeset
   133
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 153
diff changeset
   134
\begin{center}
161
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   135
\texttt{\Grid{iffoo\VS}}\;\ldots
154
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 153
diff changeset
   136
\end{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 153
diff changeset
   137
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 153
diff changeset
   138
\noindent
159
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 158
diff changeset
   139
then there are two possibilities for how it can be split up: either we regard the input as the keyword \texttt{if} followed
154
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 153
diff changeset
   140
by the identifier \texttt{foo} (both regular expressions match) or we regard \texttt{iffoo} as a 
155
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   141
single identifier. The choice that is often made in lexers is to look for the longest possible match.
162
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   142
This leaves  \texttt{iffoo} as the only match in this case (since it is longer than \texttt{if}).
155
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   143
161
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   144
Unfortunately, the convention about the longest match does not yet make the process 
159
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 158
diff changeset
   145
of lexing completely deterministic. Consider the string
155
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   146
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   147
\begin{center}
161
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   148
\texttt{\Grid{then\VS}}\;\dots
155
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   149
\end{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   150
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   151
\noindent
159
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 158
diff changeset
   152
Clearly, this string should be identified as a keyword. The problem is that also the regular expression \textit{IDENT} for identifiers matches this string. To overcome this ambiguity we need to rank our 
156
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 155
diff changeset
   153
regular expressions. In our running example we just use the ranking
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 155
diff changeset
   154
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 155
diff changeset
   155
\[
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 155
diff changeset
   156
\textit{KEYWORD} < \textit{IDENT} < \textit{OP} < \ldots
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 155
diff changeset
   157
\]
155
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   158
156
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 155
diff changeset
   159
\noindent
160
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 159
diff changeset
   160
So even if both regular expressions match in the example above,
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 159
diff changeset
   161
we give preference to the regular expression for keywords.
155
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   162
161
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   163
Let us see how our algorithm for lexing works in detail. In addition to the functions $nullable$
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   164
and $der$, it will  use the function \emph{zeroable} defined as follows:
157
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   165
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   166
\begin{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   167
\begin{tabular}{@ {}l@ {\hspace{2mm}}c@ {\hspace{2mm}}l@ {}}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   168
$zeroable(\varnothing)$      & $\dn$ & $true$\\
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   169
$zeroable(\epsilon)$           & $\dn$ &  $f\!alse$\\
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   170
$zeroable (c)$                    & $\dn$ &  $f\!alse$\\
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   171
$zeroable (r_1 + r_2)$        & $\dn$ &  $zeroable(r_1) \wedge zeroable(r_2)$ \\ 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   172
$zeroable (r_1 \cdot r_2)$  & $\dn$ &  $zeroable(r_1) \vee zeroable(r_2)$ \\
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   173
$zeroable (r^*)$                  & $\dn$ & $f\!alse$\\
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   174
\end{tabular}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   175
\end{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   176
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   177
\noindent
161
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   178
Recall that the function $nullable(r)$ tests whether a regular expression
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   179
can match the empty string. The function $zeroable$, on the other hand, tests whether a regular
158
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 157
diff changeset
   180
expression cannot match anything at all. The mathematical way of stating this
160
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 159
diff changeset
   181
property is
157
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   182
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   183
\begin{center}
160
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 159
diff changeset
   184
$zeroable(r)$ if and only if $L(r) = \varnothing$
157
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   185
\end{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   186
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   187
\noindent
161
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   188
For what follows let us fix a set of regular expressions $rs$ as being 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   189
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   190
\begin{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   191
\textit{KEYWORD}, \textit{IDENT}, \textit{WHITESPACE}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   192
\end{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   193
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   194
\noindent
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   195
specifying the ``words'' 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   196
of our programming language. The algorithm takes as input the $rs$ and a string, say
158
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 157
diff changeset
   197
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 157
diff changeset
   198
\begin{center}
161
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   199
\texttt{\grid{$c_1$}\grid{$c_2$}\grid{$c_3$}\grid{$c_4$}}\;\ldots
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   200
\end{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   201
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   202
\noindent
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   203
and tries to chop off one word from the beginning of the string. If none of the
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   204
regular expression in $rs$ matches, we will just return
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   205
the empty string.
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   206
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   207
The crucial idea in the algorithm is to build the derivatives of all regular expressions in $rs$ with respect
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   208
to the first character $c_1$. Then we take the results and continue with 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   209
building the derivatives with respect to $c_2$ until we have either exhausted our 
162
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   210
input string or all of the regular expressions are ``zeroable''.  Suppose the input string is 
161
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   211
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   212
\begin{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   213
\texttt{\Grid{if2\VS}}\;\dots
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   214
\end{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   215
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   216
\noindent
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   217
then building the derivatives with respect to \texttt{i} gives
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   218
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   219
\begin{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   220
\begin{tabular}{l|c}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   221
 & $zeroable$\\\hline
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   222
 $der\;\texttt{i}\;(\textit{KEYWORD})$      & no\\
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   223
 $der\;\texttt{i}\;(\textit{IDENT})$              & no\\
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   224
 $der\;\texttt{i}\;(\textit{WHITESPACE})$ & yes\\
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   225
\end{tabular}
158
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 157
diff changeset
   226
\end{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 157
diff changeset
   227
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 157
diff changeset
   228
\noindent
162
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   229
We can eliminate \textit{WHITESPACE} as a potential candidate, because
161
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   230
no derivative can go from $zeroable = \text{yes}$ to no. That leaves the other
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   231
two regular expressions as potential candidate and we have to consider the
162
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   232
next character, \texttt{f}, from the input string
161
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   233
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   234
\begin{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   235
\begin{tabular}{l|c}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   236
 & $zeroable$\\\hline
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   237
 $der\;\texttt{f}\;(der\;\texttt{i}\;(\textit{KEYWORD}))$      & no\\
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   238
 $der\;\texttt{f}\;(der\;\texttt{i}\;(\textit{IDENT}))$              & no\\
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   239
\end{tabular}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   240
\end{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   241
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   242
\noindent
162
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   243
Since both are `no', we have to continue with \texttt{2} from the input string
161
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   244
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   245
\begin{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   246
\begin{tabular}{l|c}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   247
 & $zeroable$\\\hline
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   248
 $der\;\texttt{2}\;(der\;\texttt{f}\;(der\;\texttt{i}\;(\textit{KEYWORD})))$      & yes\\
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   249
 $der\;\texttt{2}\;(der\;\texttt{f}\;(der\;\texttt{i}\;(\textit{IDENT})))$              & no\\
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   250
\end{tabular}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   251
\end{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   252
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   253
\noindent
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   254
Although we now know that the beginning is definitely an \textit{IDENT}, we do not yet
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   255
know how much of the input string should be considered as an \textit{IDENT}. So we
162
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   256
still have to continue and consider the next derivative.
161
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   257
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   258
\begin{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   259
\begin{tabular}{l|c}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   260
 & $zeroable$\\\hline
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   261
 $der\;\texttt{\VS}\;(der\;\texttt{2}\;(der\;\texttt{f}\;(der\;\texttt{i}\;(\textit{IDENT}))))$              & yes\\
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   262
\end{tabular}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   263
\end{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   264
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   265
\noindent
162
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   266
Since the answer is now `yes' also in this case, we can stop: once all derivatives are
161
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   267
zeroable, we know the regular expressions cannot match any more letters from
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   268
the input. In this case we only have to go back to the derivative that is nullable.  In this case it
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   269
is
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   270
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   271
\begin{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   272
$der\;\texttt{2}\;(der\;\texttt{f}\;(der\;\texttt{i}\;(\textit{IDENT})))$ 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   273
\end{center} 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   274
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   275
\noindent
162
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   276
which means we recognised an identifier. In case where there is a choice of more than one
161
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   277
regular expressions that are nullable, then we choose the one with the highest precedence. 
162
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   278
You can try out such a case with the input string
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   279
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   280
\begin{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   281
\texttt{\Grid{then\VS}}\;\dots
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   282
\end{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   283
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   284
\noindent
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   285
which can both be recognised as a keyword, but also an identifier. 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   286
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   287
While in the example above the last nullable derivative is the one directly before
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   288
the derivative turns zeroable, this is not always the case. Imagine, identifiers can be 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   289
letters, as permuted by the regular expression \textit{IDENT}, but must end with an 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   290
undercore.
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   291
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   292
\begin{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   293
\begin{tabular}{lcl}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   294
$\textit{NEWIDENT}$ & $:=$ & $\textit{LETTER} \cdot (\textit{LETTER} + \textit{DIGIT} + {\_})^* \cdot \_$\\ 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   295
\end{tabular}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   296
\end{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   297
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   298
\noindent
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   299
If we use \textit{NEWIDENT} with the input string
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   300
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   301
\begin{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   302
\texttt{\Grid{iffoo\VS}}\;\ldots
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   303
\end{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   304
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   305
\noindent
163
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 162
diff changeset
   306
then it will only become $zeroable$ after the $\VS$ has been analysed. In this case we have to go back to the
162
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   307
first \texttt{f} because only 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   308
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   309
\begin{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   310
 $der\;\texttt{f}\;(der\;\texttt{i}\;(\textit{KEYWORD}))$  
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   311
\end{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   312
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   313
\noindent
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   314
is nullable. As a result we recognise successfully the keyword \texttt{if} and the remaining
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 161
diff changeset
   315
string needs to be consumed by other regular expressions or lead to a lexing error.
161
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   316
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 160
diff changeset
   317
157
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   318
151
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
   319
\end{document}
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
   320
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
   321
%%% Local Variables: 
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
   322
%%% mode: latex
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
   323
%%% TeX-master: t
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
   324
%%% End: