handouts/ho05.tex
author Christian Urban <christian dot urban at kcl dot ac dot uk>
Sat, 26 Oct 2013 01:22:21 +0100
changeset 158 77e8397ec2ec
parent 157 b6eee9571a63
child 159 ae5ceef5355e
permissions -rw-r--r--
added
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
151
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
     1
\documentclass{article}
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
     2
\usepackage{charter}
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
     3
\usepackage{hyperref}
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
     4
\usepackage{amssymb}
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
     5
\usepackage{amsmath}
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
     6
\usepackage[T1]{fontenc}
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
     7
\usepackage{listings}
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
     8
\usepackage{xcolor}
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
     9
\usepackage{tikz}
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    10
\usetikzlibrary{arrows}
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    11
\usetikzlibrary{automata}
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    12
\usetikzlibrary{shapes}
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    13
\usetikzlibrary{shadows}
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    14
\usetikzlibrary{positioning}
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    15
\usetikzlibrary{calc}
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    16
\usetikzlibrary{fit}
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    17
\usetikzlibrary{backgrounds}
153
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    18
\usepackage{fontspec}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    19
\setmonofont{Consolas}
151
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    20
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    21
\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    22
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    23
\definecolor{javared}{rgb}{0.6,0,0} % for strings
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    24
\definecolor{javagreen}{rgb}{0.25,0.5,0.35} % comments
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    25
\definecolor{javapurple}{rgb}{0.5,0,0.35} % keywords
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    26
\definecolor{javadocblue}{rgb}{0.25,0.35,0.75} % javadoc
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    27
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    28
\lstdefinelanguage{scala}{
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    29
  morekeywords={abstract,case,catch,class,def,%
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    30
    do,else,extends,false,final,finally,%
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    31
    for,if,implicit,import,match,mixin,%
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    32
    new,null,object,override,package,%
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    33
    private,protected,requires,return,sealed,%
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    34
    super,this,throw,trait,true,try,%
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    35
    type,val,var,while,with,yield},
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    36
  otherkeywords={=>,<-,<\%,<:,>:,\#,@},
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    37
  sensitive=true,
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    38
  morecomment=[l]{//},
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    39
  morecomment=[n]{/*}{*/},
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    40
  morestring=[b]",
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    41
  morestring=[b]',
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    42
  morestring=[b]"""
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    43
}
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    44
153
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    45
\lstdefinelanguage{while}{
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    46
  morekeywords={while, if, then. else, read, write},
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    47
  otherkeywords={=>,<-,<\%,<:,>:,\#,@},
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    48
  sensitive=true,
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    49
  morecomment=[l]{//},
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    50
  morecomment=[n]{/*}{*/},
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    51
  morestring=[b]",
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    52
  morestring=[b]',
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    53
  morestring=[b]"""
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    54
}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    55
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    56
151
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    57
\lstset{language=Scala,
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    58
	basicstyle=\ttfamily,
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    59
	keywordstyle=\color{javapurple}\bfseries,
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    60
	stringstyle=\color{javagreen},
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    61
	commentstyle=\color{javagreen},
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    62
	morecomment=[s][\color{javadocblue}]{/**}{*/},
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    63
	numbers=left,
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    64
	numberstyle=\tiny\color{black},
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    65
	stepnumber=1,
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    66
	numbersep=10pt,
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    67
	tabsize=2,
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    68
	showspaces=false,
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    69
	showstringspaces=false}
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    70
	
153
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    71
\newcommand\grid[1]{%
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    72
\begin{tikzpicture}[baseline=(char.base)]
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    73
  \path[use as bounding box]
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    74
    (0,0) rectangle (1em,1em);
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    75
  \draw[red!50, fill=red!20]
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    76
    (0,0) rectangle (1em,1em);
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    77
  \node[inner sep=1pt,anchor=base west]
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    78
    (char) at (0em,\gridraiseamount) {#1};
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    79
\end{tikzpicture}}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    80
\newcommand\gridraiseamount{0.12em}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    81
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    82
\makeatletter
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    83
\newcommand\Grid[1]{%
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    84
  \@tfor\z:=#1\do{\grid{\z}}}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    85
\makeatother	
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    86
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    87
\newcommand\Vspace[1][.3em]{%
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    88
  \mbox{\kern.06em\vrule height.3ex}%
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    89
  \vbox{\hrule width#1}%
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    90
  \hbox{\vrule height.3ex}}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    91
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    92
\def\VS{\Vspace[0.6em]}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
    93
	
151
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    94
\begin{document}
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    95
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    96
\section*{Handout 5}
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    97
155
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
    98
Whenever you want to design a new programming language or implement a compiler for an
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
    99
existing language, the first task is to fix the basic ``words'' of the language. For example what are the 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   100
keywords, or reserved words, of the language, what are permitted identifiers, numbers and so 
153
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   101
on. One convenient way to do this is, of 
155
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   102
course, by using regular expressions. 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   103
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   104
In this course we want to take a closer look at the 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   105
WHILE-language. This is a simple imperative programming language consisting of arithmetic
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   106
expressions, assignments, if-statements and loops. For example the Fibonacci program can be
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   107
written in this language as follows:
151
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
   108
153
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   109
\begin{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   110
\mbox{\lstinputlisting[language=while]{../progs/fib.while}}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   111
\end{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   112
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   113
\noindent
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   114
The keywords in this language will be
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   115
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   116
\begin{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   117
\texttt{while}, \texttt{if}, \texttt{then}, \texttt{else}, \texttt{write}, \texttt{read}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   118
\end{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   119
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   120
\noindent
155
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   121
In addition we will have some common operators, such as \texttt{<}, \texttt{>}, \texttt{:=} and so on, as well as numbers
153
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   122
and strings (which we however ignore for the moment). We also need to specify what the ``white space''
155
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   123
is in our programming language and what comments should look like.
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   124
As a first try, we specify the regular expressions for our language roughly as follows
153
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   125
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   126
\begin{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   127
\begin{tabular}{lcl}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   128
$\textit{KEYWORD}$ & $:=$ & $\texttt{while}  + \texttt{if} + \texttt{then} + \texttt{else} + \dots$\\
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   129
$\textit{IDENT}$ & $:=$ & $\textit{LETTER} \cdot (\textit{LETTER} + \textit{DIGIT} + {\_})^*$\\ 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   130
$\textit{OP}$      &  $:=$ & $\texttt{:=} + \texttt{<} + \ldots$\\
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   131
$\textit{NUM}$   &  $:=$ & $\textit{DIGIT}^+$\\
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   132
$\textit{WHITESPACE}$ & $:=$ & $"\hspace{2mm}" + \backslash\texttt{n}$
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   133
\end{tabular}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   134
\end{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   135
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   136
\noindent
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   137
with the usual meanings for the regular expressions $\textit{LETTER}$ and $\textit{DIGIT}$.
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   138
155
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   139
Having the regular expressions in place, the problem we have to solve is: 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   140
given a string of our programming language, which regular expression 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   141
matches which part of the string. In this way we can split up a string into components. 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   142
For example we expect that the input string
153
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   143
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   144
\begin{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   145
\texttt{\Grid{if\VS true\VS then\VS x+2\VS else\VS x+3}}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   146
\end{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   147
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   148
\noindent
155
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   149
is split up as follows
153
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   150
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   151
\begin{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   152
\tt
155
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   153
\Grid{if}\,
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   154
\Grid{\VS}\, 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   155
\Grid{true}\, 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   156
\Grid{\VS}\, 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   157
\Grid{then}\, 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   158
\Grid{\VS}\, 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   159
\Grid{x}\, 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   160
\Grid{+}\, 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   161
\Grid{2}\,
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   162
\Grid{\VS}\, 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   163
\Grid{else}\,
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   164
\Grid{\VS}\,
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   165
\Grid{x}\,
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   166
\Grid{+}\,
153
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   167
\Grid{3}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   168
\end{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   169
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   170
\noindent
155
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   171
This process of splitting an input string into components is often called \emph{lexing} or \emph{scanning}.
153
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   172
It is usually the first phase of a compiler. Note that the separation into words cannot, in general, 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   173
be done by looking at whitespaces: while \texttt{if} and \texttt{true} are separated by a whitespace,
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   174
the components in \texttt{x+2} are not. Another reason for recognising whitespaces explicitly is
155
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   175
that in some languages, for example Python, whitespace matters. However in our small language we will eventually just filter out all whitespaces and also comments.
153
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 152
diff changeset
   176
154
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 153
diff changeset
   177
Lexing will not just separate the input into its components, but also classify the components, that
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 153
diff changeset
   178
is explicitly record that \texttt{it} is a keyword,  \VS{} a whitespace, \texttt{true} an identifier and so on.
155
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   179
For the moment, though, we will only focus on the simpler problem of splitting a string into components.
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   180
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   181
There are a few subtleties  we need to consider first. For example, say the input string is
154
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 153
diff changeset
   182
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 153
diff changeset
   183
\begin{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 153
diff changeset
   184
\texttt{\Grid{iffoo\VS\ldots}}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 153
diff changeset
   185
\end{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 153
diff changeset
   186
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 153
diff changeset
   187
\noindent
155
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   188
then there are two possibilities how it can be split up: either we regard the input as the keyword \texttt{if} followed
154
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 153
diff changeset
   189
by the identifier \texttt{foo} (both regular expressions match) or we regard \texttt{iffoo} as a 
155
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   190
single identifier. The choice that is often made in lexers is to look for the longest possible match.
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   191
This leaves  \texttt{iffoo} as the only match (since it is longer than \texttt{if}).
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   192
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   193
Unfortuantely, the convention of the longs match does not yet make the whole process 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   194
completely deterministic. Consider the input string
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   195
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   196
\begin{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   197
\texttt{\Grid{then\VS\dots}}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   198
\end{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   199
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   200
\noindent
156
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 155
diff changeset
   201
Clearly, this string should clearly be identified as a keyword. The problem is that also the regular expression \textit{IDENT} for identifiers would also match this string. To overcome this ambiguity we need to rank our 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 155
diff changeset
   202
regular expressions. In our running example we just use the ranking
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 155
diff changeset
   203
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 155
diff changeset
   204
\[
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 155
diff changeset
   205
\textit{KEYWORD} < \textit{IDENT} < \textit{OP} < \ldots
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 155
diff changeset
   206
\]
155
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   207
156
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 155
diff changeset
   208
\noindent
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 155
diff changeset
   209
and so on. So even if both regular expressions match in the example above,
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 155
diff changeset
   210
we can give the regular expression for \ref{Page ??} as follows
155
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 154
diff changeset
   211
156
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 155
diff changeset
   212
Let us see how our algorithm for lexing works in detail. The regular 
157
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   213
expressions and their ranking are shown above. For our algorithm it will  
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   214
be helpful to have a look at the function \emph{zeroable}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   215
defined as follows:
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   216
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   217
\begin{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   218
\begin{tabular}{@ {}l@ {\hspace{2mm}}c@ {\hspace{2mm}}l@ {}}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   219
$zeroable(\varnothing)$      & $\dn$ & $true$\\
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   220
$zeroable(\epsilon)$           & $\dn$ &  $f\!alse$\\
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   221
$zeroable (c)$                    & $\dn$ &  $f\!alse$\\
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   222
$zeroable (r_1 + r_2)$        & $\dn$ &  $zeroable(r_1) \wedge zeroable(r_2)$ \\ 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   223
$zeroable (r_1 \cdot r_2)$  & $\dn$ &  $zeroable(r_1) \vee zeroable(r_2)$ \\
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   224
$zeroable (r^*)$                  & $\dn$ & $f\!alse$\\
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   225
\end{tabular}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   226
\end{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   227
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   228
\noindent
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   229
In contrast to the function $nullable(r)$, which test whether a regular expression
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   230
can match the empty string, the $zeroable$ function identifies whether a regular
158
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 157
diff changeset
   231
expression cannot match anything at all. The mathematical way of stating this
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 157
diff changeset
   232
is 
157
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   233
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   234
\begin{center}
158
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 157
diff changeset
   235
$s \in zeroable(s)$ implies $L(r) = \varnothing$
157
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   236
\end{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   237
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   238
\noindent
158
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 157
diff changeset
   239
Let us fix a set of regular expressions $rs$.
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 157
diff changeset
   240
The crucial idea of the algorithm is to take the input string, say
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 157
diff changeset
   241
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 157
diff changeset
   242
\begin{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 157
diff changeset
   243
\texttt{\grid{$c_1$}\grid{$c_2$}\grid{$c_3$}\grid{$c_4$}\grid{\ldots}}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 157
diff changeset
   244
\end{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 157
diff changeset
   245
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 157
diff changeset
   246
\noindent
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 157
diff changeset
   247
and build the derivative of all regular expressions in $rs$ with respect
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 157
diff changeset
   248
to the first character. Then we take the result and continue with $c_2$
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 157
diff changeset
   249
until we have either exhausted our input string or all of the regula
157
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 156
diff changeset
   250
151
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
   251
\end{document}
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
   252
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
   253
%%% Local Variables: 
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
   254
%%% mode: latex
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
   255
%%% TeX-master: t
df229ec49b22 added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
   256
%%% End: