| author | Christian Urban <christian dot urban at kcl dot ac dot uk> |
| Sat, 26 Oct 2013 10:02:29 +0100 | |
| changeset 159 | ae5ceef5355e |
| parent 158 | 77e8397ec2ec |
| child 160 | 8134c3b981e0 |
| permissions | -rw-r--r-- |
|
151
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
1 |
\documentclass{article}
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
2 |
\usepackage{charter}
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
3 |
\usepackage{hyperref}
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
4 |
\usepackage{amssymb}
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
5 |
\usepackage{amsmath}
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
6 |
\usepackage[T1]{fontenc}
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
7 |
\usepackage{listings}
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
8 |
\usepackage{xcolor}
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
9 |
\usepackage{tikz}
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
10 |
\usetikzlibrary{arrows}
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
11 |
\usetikzlibrary{automata}
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
12 |
\usetikzlibrary{shapes}
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
13 |
\usetikzlibrary{shadows}
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
14 |
\usetikzlibrary{positioning}
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
15 |
\usetikzlibrary{calc}
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
16 |
\usetikzlibrary{fit}
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
17 |
\usetikzlibrary{backgrounds}
|
|
153
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
18 |
\usepackage{fontspec}
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
19 |
\setmonofont{Consolas}
|
|
151
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
20 |
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
21 |
\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
22 |
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
23 |
\definecolor{javared}{rgb}{0.6,0,0} % for strings
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
24 |
\definecolor{javagreen}{rgb}{0.25,0.5,0.35} % comments
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
25 |
\definecolor{javapurple}{rgb}{0.5,0,0.35} % keywords
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
26 |
\definecolor{javadocblue}{rgb}{0.25,0.35,0.75} % javadoc
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
27 |
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
28 |
\lstdefinelanguage{scala}{
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
29 |
morekeywords={abstract,case,catch,class,def,%
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
30 |
do,else,extends,false,final,finally,% |
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
31 |
for,if,implicit,import,match,mixin,% |
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
32 |
new,null,object,override,package,% |
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
33 |
private,protected,requires,return,sealed,% |
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
34 |
super,this,throw,trait,true,try,% |
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
35 |
type,val,var,while,with,yield}, |
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
36 |
otherkeywords={=>,<-,<\%,<:,>:,\#,@},
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
37 |
sensitive=true, |
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
38 |
morecomment=[l]{//},
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
39 |
morecomment=[n]{/*}{*/},
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
40 |
morestring=[b]", |
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
41 |
morestring=[b]', |
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
42 |
morestring=[b]""" |
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
43 |
} |
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
44 |
|
|
153
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
45 |
\lstdefinelanguage{while}{
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
46 |
morekeywords={while, if, then. else, read, write},
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
47 |
otherkeywords={=>,<-,<\%,<:,>:,\#,@},
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
48 |
sensitive=true, |
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
49 |
morecomment=[l]{//},
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
50 |
morecomment=[n]{/*}{*/},
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
51 |
morestring=[b]", |
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
52 |
morestring=[b]', |
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
53 |
morestring=[b]""" |
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
54 |
} |
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
55 |
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
56 |
|
|
151
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
57 |
\lstset{language=Scala,
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
58 |
basicstyle=\ttfamily, |
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
59 |
keywordstyle=\color{javapurple}\bfseries,
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
60 |
stringstyle=\color{javagreen},
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
61 |
commentstyle=\color{javagreen},
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
62 |
morecomment=[s][\color{javadocblue}]{/**}{*/},
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
63 |
numbers=left, |
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
64 |
numberstyle=\tiny\color{black},
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
65 |
stepnumber=1, |
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
66 |
numbersep=10pt, |
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
67 |
tabsize=2, |
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
68 |
showspaces=false, |
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
69 |
showstringspaces=false} |
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
70 |
|
|
153
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
71 |
\newcommand\grid[1]{%
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
72 |
\begin{tikzpicture}[baseline=(char.base)]
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
73 |
\path[use as bounding box] |
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
74 |
(0,0) rectangle (1em,1em); |
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
75 |
\draw[red!50, fill=red!20] |
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
76 |
(0,0) rectangle (1em,1em); |
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
77 |
\node[inner sep=1pt,anchor=base west] |
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
78 |
(char) at (0em,\gridraiseamount) {#1};
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
79 |
\end{tikzpicture}}
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
80 |
\newcommand\gridraiseamount{0.12em}
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
81 |
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
82 |
\makeatletter |
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
83 |
\newcommand\Grid[1]{%
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
84 |
\@tfor\z:=#1\do{\grid{\z}}}
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
85 |
\makeatother |
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
86 |
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
87 |
\newcommand\Vspace[1][.3em]{%
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
88 |
\mbox{\kern.06em\vrule height.3ex}%
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
89 |
\vbox{\hrule width#1}%
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
90 |
\hbox{\vrule height.3ex}}
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
91 |
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
92 |
\def\VS{\Vspace[0.6em]}
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
93 |
|
|
151
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
94 |
\begin{document}
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
95 |
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
96 |
\section*{Handout 5}
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
97 |
|
|
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
98 |
Whenever you want to design a new programming language or implement a compiler for an |
|
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
99 |
existing language, the first task is to fix the basic ``words'' of the language. For example what are the |
|
159
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
100 |
keywords, or reserved words, of the language, what are permitted identifiers, numbers, expressions and so |
|
153
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
101 |
on. One convenient way to do this is, of |
|
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
102 |
course, by using regular expressions. |
|
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
103 |
|
|
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
104 |
In this course we want to take a closer look at the |
|
159
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
105 |
WHILE programming language. This is a simple imperative programming language consisting of arithmetic |
|
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
106 |
expressions, assignments, if-statements and loops. For example the Fibonacci program can be |
|
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
107 |
written in this language as follows: |
|
151
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
108 |
|
|
153
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
109 |
\begin{center}
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
110 |
\mbox{\lstinputlisting[language=while]{../progs/fib.while}}
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
111 |
\end{center}
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
112 |
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
113 |
\noindent |
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
114 |
The keywords in this language will be |
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
115 |
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
116 |
\begin{center}
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
117 |
\texttt{while}, \texttt{if}, \texttt{then}, \texttt{else}, \texttt{write}, \texttt{read}
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
118 |
\end{center}
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
119 |
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
120 |
\noindent |
|
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
121 |
In addition we will have some common operators, such as \texttt{<}, \texttt{>}, \texttt{:=} and so on, as well as numbers
|
|
159
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
122 |
and strings (which we however ignore for the moment). We also need to specify what the ``whitespace'' |
|
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
123 |
is in our programming language and what comments should look like. |
|
159
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
124 |
As a first try, we might specify the regular expressions for our language roughly as follows |
|
153
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
125 |
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
126 |
\begin{center}
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
127 |
\begin{tabular}{lcl}
|
|
159
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
128 |
$\textit{LETTER}$ & $:=$ & $\texttt{a} + \texttt{A} + \texttt{b} + \texttt{B} + \ldots$\\
|
|
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
129 |
$\textit{DIGIT}$ & $:=$ & $\texttt{0} + \texttt{1} + \texttt{2} + \ldots$\\
|
|
153
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
130 |
$\textit{KEYWORD}$ & $:=$ & $\texttt{while} + \texttt{if} + \texttt{then} + \texttt{else} + \dots$\\
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
131 |
$\textit{IDENT}$ & $:=$ & $\textit{LETTER} \cdot (\textit{LETTER} + \textit{DIGIT} + {\_})^*$\\
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
132 |
$\textit{OP}$ & $:=$ & $\texttt{:=} + \texttt{<} + \ldots$\\
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
133 |
$\textit{NUM}$ & $:=$ & $\textit{DIGIT}^+$\\
|
|
159
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
134 |
$\textit{WHITESPACE}$ & $:=$ & $("\hspace{2mm}" + \backslash\texttt{n})^+$
|
|
153
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
135 |
\end{tabular}
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
136 |
\end{center}
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
137 |
|
|
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
138 |
Having the regular expressions in place, the problem we have to solve is: |
|
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
139 |
given a string of our programming language, which regular expression |
|
159
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
140 |
matches which part of the string. By solving this problem, we can split up a string |
|
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
141 |
of our language into components. |
|
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
142 |
For example given the input string |
|
153
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
143 |
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
144 |
\begin{center}
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
145 |
\texttt{\Grid{if\VS true\VS then\VS x+2\VS else\VS x+3}}
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
146 |
\end{center}
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
147 |
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
148 |
\noindent |
|
159
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
149 |
we expect it will be split up as follows |
|
153
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
150 |
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
151 |
\begin{center}
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
152 |
\tt |
|
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
153 |
\Grid{if}\,
|
|
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
154 |
\Grid{\VS}\,
|
|
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
155 |
\Grid{true}\,
|
|
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
156 |
\Grid{\VS}\,
|
|
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
157 |
\Grid{then}\,
|
|
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
158 |
\Grid{\VS}\,
|
|
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
159 |
\Grid{x}\,
|
|
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
160 |
\Grid{+}\,
|
|
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
161 |
\Grid{2}\,
|
|
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
162 |
\Grid{\VS}\,
|
|
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
163 |
\Grid{else}\,
|
|
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
164 |
\Grid{\VS}\,
|
|
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
165 |
\Grid{x}\,
|
|
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
166 |
\Grid{+}\,
|
|
153
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
167 |
\Grid{3}
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
168 |
\end{center}
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
169 |
|
|
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
170 |
\noindent |
|
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
171 |
This process of splitting an input string into components is often called \emph{lexing} or \emph{scanning}.
|
|
153
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
172 |
It is usually the first phase of a compiler. Note that the separation into words cannot, in general, |
|
159
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
173 |
be done by just looking at whitespaces: while \texttt{if} and \texttt{true} are separated by a whitespace,
|
|
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
174 |
this is not always the case. As can be seen the three components in \texttt{x+2} are not separated by any
|
|
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
175 |
whitespace. Another reason for recognising whitespaces explicitly is |
|
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
176 |
that in some languages, for example Python, whitespace matters. However in our small language we will eventually just filter out all whitespaces and also all comments. |
|
153
70ab41cb610e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
152
diff
changeset
|
177 |
|
|
159
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
178 |
Lexing will not just separate a string into its components, but also classify the components, that |
|
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
179 |
is explicitly record that \texttt{if} is a keyword, \VS{} a whitespace, \texttt{true} an identifier and so on.
|
|
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
180 |
For the moment, though, we will only focus on the simpler problem of just splitting a string into components. |
|
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
181 |
|
|
159
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
182 |
There are a few subtleties we need to consider first. For example, say the string is |
|
154
51d6b8b828c4
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
153
diff
changeset
|
183 |
|
|
51d6b8b828c4
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
153
diff
changeset
|
184 |
\begin{center}
|
|
51d6b8b828c4
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
153
diff
changeset
|
185 |
\texttt{\Grid{iffoo\VS\ldots}}
|
|
51d6b8b828c4
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
153
diff
changeset
|
186 |
\end{center}
|
|
51d6b8b828c4
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
153
diff
changeset
|
187 |
|
|
51d6b8b828c4
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
153
diff
changeset
|
188 |
\noindent |
|
159
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
189 |
then there are two possibilities for how it can be split up: either we regard the input as the keyword \texttt{if} followed
|
|
154
51d6b8b828c4
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
153
diff
changeset
|
190 |
by the identifier \texttt{foo} (both regular expressions match) or we regard \texttt{iffoo} as a
|
|
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
191 |
single identifier. The choice that is often made in lexers is to look for the longest possible match. |
|
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
192 |
This leaves \texttt{iffoo} as the only match (since it is longer than \texttt{if}).
|
|
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
193 |
|
|
159
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
194 |
Unfortunately, the convention about the longest match does not yet make the whole process |
|
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
195 |
of lexing completely deterministic. Consider the string |
|
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
196 |
|
|
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
197 |
\begin{center}
|
|
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
198 |
\texttt{\Grid{then\VS\dots}}
|
|
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
199 |
\end{center}
|
|
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
200 |
|
|
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
201 |
\noindent |
|
159
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
202 |
Clearly, this string should be identified as a keyword. The problem is that also the regular expression \textit{IDENT} for identifiers matches this string. To overcome this ambiguity we need to rank our
|
|
156
95eaee695636
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
155
diff
changeset
|
203 |
regular expressions. In our running example we just use the ranking |
|
95eaee695636
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
155
diff
changeset
|
204 |
|
|
95eaee695636
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
155
diff
changeset
|
205 |
\[ |
|
95eaee695636
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
155
diff
changeset
|
206 |
\textit{KEYWORD} < \textit{IDENT} < \textit{OP} < \ldots
|
|
95eaee695636
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
155
diff
changeset
|
207 |
\] |
|
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
208 |
|
|
156
95eaee695636
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
155
diff
changeset
|
209 |
\noindent |
|
95eaee695636
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
155
diff
changeset
|
210 |
and so on. So even if both regular expressions match in the example above, |
|
95eaee695636
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
155
diff
changeset
|
211 |
we can give the regular expression for \ref{Page ??} as follows
|
|
155
9b2d128765e1
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
154
diff
changeset
|
212 |
|
|
156
95eaee695636
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
155
diff
changeset
|
213 |
Let us see how our algorithm for lexing works in detail. The regular |
|
157
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
214 |
expressions and their ranking are shown above. For our algorithm it will |
|
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
215 |
be helpful to have a look at the function \emph{zeroable}
|
|
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
216 |
defined as follows: |
|
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
217 |
|
|
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
218 |
\begin{center}
|
|
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
219 |
\begin{tabular}{@ {}l@ {\hspace{2mm}}c@ {\hspace{2mm}}l@ {}}
|
|
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
220 |
$zeroable(\varnothing)$ & $\dn$ & $true$\\ |
|
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
221 |
$zeroable(\epsilon)$ & $\dn$ & $f\!alse$\\ |
|
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
222 |
$zeroable (c)$ & $\dn$ & $f\!alse$\\ |
|
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
223 |
$zeroable (r_1 + r_2)$ & $\dn$ & $zeroable(r_1) \wedge zeroable(r_2)$ \\ |
|
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
224 |
$zeroable (r_1 \cdot r_2)$ & $\dn$ & $zeroable(r_1) \vee zeroable(r_2)$ \\ |
|
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
225 |
$zeroable (r^*)$ & $\dn$ & $f\!alse$\\ |
|
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
226 |
\end{tabular}
|
|
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
227 |
\end{center}
|
|
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
228 |
|
|
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
229 |
\noindent |
|
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
230 |
In contrast to the function $nullable(r)$, which test whether a regular expression |
|
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
231 |
can match the empty string, the $zeroable$ function identifies whether a regular |
|
158
77e8397ec2ec
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
157
diff
changeset
|
232 |
expression cannot match anything at all. The mathematical way of stating this |
|
77e8397ec2ec
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
157
diff
changeset
|
233 |
is |
|
157
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
234 |
|
|
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
235 |
\begin{center}
|
|
158
77e8397ec2ec
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
157
diff
changeset
|
236 |
$s \in zeroable(s)$ implies $L(r) = \varnothing$ |
|
157
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
237 |
\end{center}
|
|
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
238 |
|
|
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
239 |
\noindent |
|
158
77e8397ec2ec
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
157
diff
changeset
|
240 |
Let us fix a set of regular expressions $rs$. |
|
77e8397ec2ec
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
157
diff
changeset
|
241 |
The crucial idea of the algorithm is to take the input string, say |
|
77e8397ec2ec
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
157
diff
changeset
|
242 |
|
|
77e8397ec2ec
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
157
diff
changeset
|
243 |
\begin{center}
|
|
77e8397ec2ec
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
157
diff
changeset
|
244 |
\texttt{\grid{$c_1$}\grid{$c_2$}\grid{$c_3$}\grid{$c_4$}\grid{\ldots}}
|
|
77e8397ec2ec
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
157
diff
changeset
|
245 |
\end{center}
|
|
77e8397ec2ec
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
157
diff
changeset
|
246 |
|
|
77e8397ec2ec
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
157
diff
changeset
|
247 |
\noindent |
|
77e8397ec2ec
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
157
diff
changeset
|
248 |
and build the derivative of all regular expressions in $rs$ with respect |
|
159
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
249 |
to the first character $c_1$. Then we take the result and continue with $c_2$ |
|
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
250 |
until we have either exhausted our input string or all of the regular expressions |
|
ae5ceef5355e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
158
diff
changeset
|
251 |
are ``zeroable''. |
|
157
b6eee9571a63
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
156
diff
changeset
|
252 |
|
|
151
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
253 |
\end{document}
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
254 |
|
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
255 |
%%% Local Variables: |
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
256 |
%%% mode: latex |
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
257 |
%%% TeX-master: t |
|
df229ec49b22
added ho
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff
changeset
|
258 |
%%% End: |