author | Christian Urban <christian dot urban at kcl dot ac dot uk> |
Sat, 26 Oct 2013 01:22:21 +0100 | |
changeset 158 | 77e8397ec2ec |
parent 157 | b6eee9571a63 |
child 159 | ae5ceef5355e |
permissions | -rw-r--r-- |
151
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
1 |
\documentclass{article} |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
2 |
\usepackage{charter} |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
3 |
\usepackage{hyperref} |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
4 |
\usepackage{amssymb} |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
5 |
\usepackage{amsmath} |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
6 |
\usepackage[T1]{fontenc} |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
7 |
\usepackage{listings} |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
8 |
\usepackage{xcolor} |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
9 |
\usepackage{tikz} |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
10 |
\usetikzlibrary{arrows} |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
11 |
\usetikzlibrary{automata} |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
12 |
\usetikzlibrary{shapes} |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
13 |
\usetikzlibrary{shadows} |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
14 |
\usetikzlibrary{positioning} |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
15 |
\usetikzlibrary{calc} |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
16 |
\usetikzlibrary{fit} |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
17 |
\usetikzlibrary{backgrounds} |
153
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
18 |
\usepackage{fontspec} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
19 |
\setmonofont{Consolas} |
151
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
20 |
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
21 |
\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}% |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
22 |
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
23 |
\definecolor{javared}{rgb}{0.6,0,0} % for strings |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
24 |
\definecolor{javagreen}{rgb}{0.25,0.5,0.35} % comments |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
25 |
\definecolor{javapurple}{rgb}{0.5,0,0.35} % keywords |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
26 |
\definecolor{javadocblue}{rgb}{0.25,0.35,0.75} % javadoc |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
27 |
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
28 |
\lstdefinelanguage{scala}{ |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
29 |
morekeywords={abstract,case,catch,class,def,% |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
30 |
do,else,extends,false,final,finally,% |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
31 |
for,if,implicit,import,match,mixin,% |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
32 |
new,null,object,override,package,% |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
33 |
private,protected,requires,return,sealed,% |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
34 |
super,this,throw,trait,true,try,% |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
35 |
type,val,var,while,with,yield}, |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
36 |
otherkeywords={=>,<-,<\%,<:,>:,\#,@}, |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
37 |
sensitive=true, |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
38 |
morecomment=[l]{//}, |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
39 |
morecomment=[n]{/*}{*/}, |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
40 |
morestring=[b]", |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
41 |
morestring=[b]', |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
42 |
morestring=[b]""" |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
43 |
} |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
44 |
|
153
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
45 |
\lstdefinelanguage{while}{ |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
46 |
morekeywords={while, if, then. else, read, write}, |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
47 |
otherkeywords={=>,<-,<\%,<:,>:,\#,@}, |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
48 |
sensitive=true, |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
49 |
morecomment=[l]{//}, |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
50 |
morecomment=[n]{/*}{*/}, |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
51 |
morestring=[b]", |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
52 |
morestring=[b]', |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
53 |
morestring=[b]""" |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
54 |
} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
55 |
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
56 |
|
151
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
57 |
\lstset{language=Scala, |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
58 |
basicstyle=\ttfamily, |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
59 |
keywordstyle=\color{javapurple}\bfseries, |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
60 |
stringstyle=\color{javagreen}, |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
61 |
commentstyle=\color{javagreen}, |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
62 |
morecomment=[s][\color{javadocblue}]{/**}{*/}, |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
63 |
numbers=left, |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
64 |
numberstyle=\tiny\color{black}, |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
65 |
stepnumber=1, |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
66 |
numbersep=10pt, |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
67 |
tabsize=2, |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
68 |
showspaces=false, |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
69 |
showstringspaces=false} |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
70 |
|
153
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
71 |
\newcommand\grid[1]{% |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
72 |
\begin{tikzpicture}[baseline=(char.base)] |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
73 |
\path[use as bounding box] |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
74 |
(0,0) rectangle (1em,1em); |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
75 |
\draw[red!50, fill=red!20] |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
76 |
(0,0) rectangle (1em,1em); |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
77 |
\node[inner sep=1pt,anchor=base west] |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
78 |
(char) at (0em,\gridraiseamount) {#1}; |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
79 |
\end{tikzpicture}} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
80 |
\newcommand\gridraiseamount{0.12em} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
81 |
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
82 |
\makeatletter |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
83 |
\newcommand\Grid[1]{% |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
84 |
\@tfor\z:=#1\do{\grid{\z}}} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
85 |
\makeatother |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
86 |
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
87 |
\newcommand\Vspace[1][.3em]{% |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
88 |
\mbox{\kern.06em\vrule height.3ex}% |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
89 |
\vbox{\hrule width#1}% |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
90 |
\hbox{\vrule height.3ex}} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
91 |
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
92 |
\def\VS{\Vspace[0.6em]} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
93 |
|
151
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
94 |
\begin{document} |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
95 |
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
96 |
\section*{Handout 5} |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
97 |
|
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
98 |
Whenever you want to design a new programming language or implement a compiler for an |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
99 |
existing language, the first task is to fix the basic ``words'' of the language. For example what are the |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
100 |
keywords, or reserved words, of the language, what are permitted identifiers, numbers and so |
153
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
101 |
on. One convenient way to do this is, of |
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
102 |
course, by using regular expressions. |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
103 |
|
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
104 |
In this course we want to take a closer look at the |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
105 |
WHILE-language. This is a simple imperative programming language consisting of arithmetic |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
106 |
expressions, assignments, if-statements and loops. For example the Fibonacci program can be |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
107 |
written in this language as follows: |
151
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
108 |
|
153
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
109 |
\begin{center} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
110 |
\mbox{\lstinputlisting[language=while]{../progs/fib.while}} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
111 |
\end{center} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
112 |
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
113 |
\noindent |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
114 |
The keywords in this language will be |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
115 |
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
116 |
\begin{center} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
117 |
\texttt{while}, \texttt{if}, \texttt{then}, \texttt{else}, \texttt{write}, \texttt{read} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
118 |
\end{center} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
119 |
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
120 |
\noindent |
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
121 |
In addition we will have some common operators, such as \texttt{<}, \texttt{>}, \texttt{:=} and so on, as well as numbers |
153
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
122 |
and strings (which we however ignore for the moment). We also need to specify what the ``white space'' |
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
123 |
is in our programming language and what comments should look like. |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
124 |
As a first try, we specify the regular expressions for our language roughly as follows |
153
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
125 |
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
126 |
\begin{center} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
127 |
\begin{tabular}{lcl} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
128 |
$\textit{KEYWORD}$ & $:=$ & $\texttt{while} + \texttt{if} + \texttt{then} + \texttt{else} + \dots$\\ |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
129 |
$\textit{IDENT}$ & $:=$ & $\textit{LETTER} \cdot (\textit{LETTER} + \textit{DIGIT} + {\_})^*$\\ |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
130 |
$\textit{OP}$ & $:=$ & $\texttt{:=} + \texttt{<} + \ldots$\\ |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
131 |
$\textit{NUM}$ & $:=$ & $\textit{DIGIT}^+$\\ |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
132 |
$\textit{WHITESPACE}$ & $:=$ & $"\hspace{2mm}" + \backslash\texttt{n}$ |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
133 |
\end{tabular} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
134 |
\end{center} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
135 |
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
136 |
\noindent |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
137 |
with the usual meanings for the regular expressions $\textit{LETTER}$ and $\textit{DIGIT}$. |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
138 |
|
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
139 |
Having the regular expressions in place, the problem we have to solve is: |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
140 |
given a string of our programming language, which regular expression |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
141 |
matches which part of the string. In this way we can split up a string into components. |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
142 |
For example we expect that the input string |
153
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
143 |
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
144 |
\begin{center} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
145 |
\texttt{\Grid{if\VS true\VS then\VS x+2\VS else\VS x+3}} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
146 |
\end{center} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
147 |
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
148 |
\noindent |
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
149 |
is split up as follows |
153
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
150 |
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
151 |
\begin{center} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
152 |
\tt |
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
153 |
\Grid{if}\, |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
154 |
\Grid{\VS}\, |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
155 |
\Grid{true}\, |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
156 |
\Grid{\VS}\, |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
157 |
\Grid{then}\, |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
158 |
\Grid{\VS}\, |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
159 |
\Grid{x}\, |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
160 |
\Grid{+}\, |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
161 |
\Grid{2}\, |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
162 |
\Grid{\VS}\, |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
163 |
\Grid{else}\, |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
164 |
\Grid{\VS}\, |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
165 |
\Grid{x}\, |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
166 |
\Grid{+}\, |
153
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
167 |
\Grid{3} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
168 |
\end{center} |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
169 |
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
170 |
\noindent |
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
171 |
This process of splitting an input string into components is often called \emph{lexing} or \emph{scanning}. |
153
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
172 |
It is usually the first phase of a compiler. Note that the separation into words cannot, in general, |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
173 |
be done by looking at whitespaces: while \texttt{if} and \texttt{true} are separated by a whitespace, |
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
174 |
the components in \texttt{x+2} are not. Another reason for recognising whitespaces explicitly is |
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
175 |
that in some languages, for example Python, whitespace matters. However in our small language we will eventually just filter out all whitespaces and also comments. |
153
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
176 |
|
154
51d6b8b828c4
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
153
diff
changeset
|
177 |
Lexing will not just separate the input into its components, but also classify the components, that |
51d6b8b828c4
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
153
diff
changeset
|
178 |
is explicitly record that \texttt{it} is a keyword, \VS{} a whitespace, \texttt{true} an identifier and so on. |
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
179 |
For the moment, though, we will only focus on the simpler problem of splitting a string into components. |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
180 |
|
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
181 |
There are a few subtleties we need to consider first. For example, say the input string is |
154
51d6b8b828c4
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
153
diff
changeset
|
182 |
|
51d6b8b828c4
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
153
diff
changeset
|
183 |
\begin{center} |
51d6b8b828c4
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
153
diff
changeset
|
184 |
\texttt{\Grid{iffoo\VS\ldots}} |
51d6b8b828c4
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
153
diff
changeset
|
185 |
\end{center} |
51d6b8b828c4
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
153
diff
changeset
|
186 |
|
51d6b8b828c4
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
153
diff
changeset
|
187 |
\noindent |
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
188 |
then there are two possibilities how it can be split up: either we regard the input as the keyword \texttt{if} followed |
154
51d6b8b828c4
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
153
diff
changeset
|
189 |
by the identifier \texttt{foo} (both regular expressions match) or we regard \texttt{iffoo} as a |
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
190 |
single identifier. The choice that is often made in lexers is to look for the longest possible match. |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
191 |
This leaves \texttt{iffoo} as the only match (since it is longer than \texttt{if}). |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
192 |
|
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
193 |
Unfortuantely, the convention of the longs match does not yet make the whole process |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
194 |
completely deterministic. Consider the input string |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
195 |
|
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
196 |
\begin{center} |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
197 |
\texttt{\Grid{then\VS\dots}} |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
198 |
\end{center} |
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
199 |
|
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
200 |
\noindent |
156
95eaee695636
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
155
diff
changeset
|
201 |
Clearly, this string should clearly be identified as a keyword. The problem is that also the regular expression \textit{IDENT} for identifiers would also match this string. To overcome this ambiguity we need to rank our |
95eaee695636
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
155
diff
changeset
|
202 |
regular expressions. In our running example we just use the ranking |
95eaee695636
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
155
diff
changeset
|
203 |
|
95eaee695636
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
155
diff
changeset
|
204 |
\[ |
95eaee695636
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
155
diff
changeset
|
205 |
\textit{KEYWORD} < \textit{IDENT} < \textit{OP} < \ldots |
95eaee695636
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
155
diff
changeset
|
206 |
\] |
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
207 |
|
156
95eaee695636
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
155
diff
changeset
|
208 |
\noindent |
95eaee695636
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
155
diff
changeset
|
209 |
and so on. So even if both regular expressions match in the example above, |
95eaee695636
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
155
diff
changeset
|
210 |
we can give the regular expression for \ref{Page ??} as follows |
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
211 |
|
156
95eaee695636
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
155
diff
changeset
|
212 |
Let us see how our algorithm for lexing works in detail. The regular |
157
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
213 |
expressions and their ranking are shown above. For our algorithm it will |
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
214 |
be helpful to have a look at the function \emph{zeroable} |
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
215 |
defined as follows: |
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
216 |
|
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
217 |
\begin{center} |
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
218 |
\begin{tabular}{@ {}l@ {\hspace{2mm}}c@ {\hspace{2mm}}l@ {}} |
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
219 |
$zeroable(\varnothing)$ & $\dn$ & $true$\\ |
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
220 |
$zeroable(\epsilon)$ & $\dn$ & $f\!alse$\\ |
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
221 |
$zeroable (c)$ & $\dn$ & $f\!alse$\\ |
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
222 |
$zeroable (r_1 + r_2)$ & $\dn$ & $zeroable(r_1) \wedge zeroable(r_2)$ \\ |
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
223 |
$zeroable (r_1 \cdot r_2)$ & $\dn$ & $zeroable(r_1) \vee zeroable(r_2)$ \\ |
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
224 |
$zeroable (r^*)$ & $\dn$ & $f\!alse$\\ |
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
225 |
\end{tabular} |
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
226 |
\end{center} |
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
227 |
|
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
228 |
\noindent |
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
229 |
In contrast to the function $nullable(r)$, which test whether a regular expression |
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
230 |
can match the empty string, the $zeroable$ function identifies whether a regular |
158
77e8397ec2ec
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
157
diff
changeset
|
231 |
expression cannot match anything at all. The mathematical way of stating this |
77e8397ec2ec
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
157
diff
changeset
|
232 |
is |
157
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
233 |
|
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
234 |
\begin{center} |
158
77e8397ec2ec
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
157
diff
changeset
|
235 |
$s \in zeroable(s)$ implies $L(r) = \varnothing$ |
157
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
236 |
\end{center} |
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
237 |
|
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
238 |
\noindent |
158
77e8397ec2ec
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
157
diff
changeset
|
239 |
Let us fix a set of regular expressions $rs$. |
77e8397ec2ec
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
157
diff
changeset
|
240 |
The crucial idea of the algorithm is to take the input string, say |
77e8397ec2ec
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
157
diff
changeset
|
241 |
|
77e8397ec2ec
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
157
diff
changeset
|
242 |
\begin{center} |
77e8397ec2ec
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
157
diff
changeset
|
243 |
\texttt{\grid{$c_1$}\grid{$c_2$}\grid{$c_3$}\grid{$c_4$}\grid{\ldots}} |
77e8397ec2ec
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
157
diff
changeset
|
244 |
\end{center} |
77e8397ec2ec
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
157
diff
changeset
|
245 |
|
77e8397ec2ec
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
157
diff
changeset
|
246 |
\noindent |
77e8397ec2ec
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
157
diff
changeset
|
247 |
and build the derivative of all regular expressions in $rs$ with respect |
77e8397ec2ec
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
157
diff
changeset
|
248 |
to the first character. Then we take the result and continue with $c_2$ |
77e8397ec2ec
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
157
diff
changeset
|
249 |
until we have either exhausted our input string or all of the regula |
157
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
250 |
|
151
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
251 |
\end{document} |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
252 |
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
253 |
%%% Local Variables: |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
254 |
%%% mode: latex |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
255 |
%%% TeX-master: t |
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
256 |
%%% End: |