handouts/ho02.tex
author Christian Urban <christian dot urban at kcl dot ac dot uk>
Tue, 22 Mar 2016 17:09:24 +0000
changeset 398 c8ce95067c1a
parent 394 2f9fe225ecc8
child 399 5c1fbb39c93e
permissions -rw-r--r--
updated
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
123
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
     1
\documentclass{article}
251
5b5a68df6d16 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 217
diff changeset
     2
\usepackage{../style}
217
cd6066f1056a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 140
diff changeset
     3
\usepackage{../langs}
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
     4
\usepackage{../graphics}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
     5
\usepackage{../data}
123
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
     6
268
18bef085a7ca updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 263
diff changeset
     7
\pgfplotsset{compat=1.11}
123
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
     8
\begin{document}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
     9
272
1446bc47a294 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 268
diff changeset
    10
\section*{Handout 2 (Regular Expression Matching)}
123
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    11
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    12
This lecture is about implementing a more efficient regular
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    13
expression matcher (the plots on the right)---more efficient
325
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
    14
than the matchers from regular expression libraries in Ruby
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
    15
and Python (the plots on the left). These plots show the
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
    16
running time for the evil regular expression
394
2f9fe225ecc8 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 343
diff changeset
    17
$a^?{}^{\{n\}}\cdot a^{\{n\}}$ and strings composed of $n$ \pcode{a}s.
325
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
    18
We will use this regular expression and strings as running
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
    19
example. To see the substantial differences in the two plots
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
    20
below, note the different scales of the $x$-axes. 
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    21
263
92e6985018ae updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 262
diff changeset
    22
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    23
\begin{center}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    24
\begin{tabular}{@{}cc@{}}
268
18bef085a7ca updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 263
diff changeset
    25
\begin{tikzpicture}
18bef085a7ca updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 263
diff changeset
    26
\begin{axis}[xlabel={\pcode{a}s},ylabel={time in secs},
262
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    27
    enlargelimits=false,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    28
    xtick={0,5,...,30},
291
201c2c6d8696 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 272
diff changeset
    29
    xmax=33,
268
18bef085a7ca updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 263
diff changeset
    30
    ymax=35,
18bef085a7ca updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 263
diff changeset
    31
    ytick={0,5,...,30},
262
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    32
    scaled ticks=false,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    33
    axis lines=left,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    34
    width=5cm,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    35
    height=5cm, 
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    36
    legend entries={Python,Ruby},  
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    37
    legend pos=north west,
268
18bef085a7ca updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 263
diff changeset
    38
    legend cell align=left]
262
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    39
\addplot[blue,mark=*, mark options={fill=white}] 
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    40
  table {re-python.data};
291
201c2c6d8696 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 272
diff changeset
    41
\addplot[brown,mark=triangle*, mark options={fill=white}] 
262
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    42
  table {re-ruby.data};  
268
18bef085a7ca updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 263
diff changeset
    43
\end{axis}
18bef085a7ca updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 263
diff changeset
    44
\end{tikzpicture}
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    45
&
268
18bef085a7ca updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 263
diff changeset
    46
\begin{tikzpicture}
18bef085a7ca updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 263
diff changeset
    47
  \begin{axis}[xlabel={\pcode{a}s},ylabel={time in secs},
262
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    48
    enlargelimits=false,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    49
    xtick={0,3000,...,12000},
291
201c2c6d8696 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 272
diff changeset
    50
    xmax=12500,
201c2c6d8696 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 272
diff changeset
    51
    ymax=35,
201c2c6d8696 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 272
diff changeset
    52
    ytick={0,5,...,30},
262
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    53
    scaled ticks=false,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    54
    axis lines=left,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    55
    width=6.5cm,
268
18bef085a7ca updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 263
diff changeset
    56
    height=5cm]
18bef085a7ca updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 263
diff changeset
    57
\addplot[green,mark=square*,mark options={fill=white}] table {re2b.data};
18bef085a7ca updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 263
diff changeset
    58
\addplot[black,mark=square*,mark options={fill=white}] table {re3.data};
18bef085a7ca updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 263
diff changeset
    59
\end{axis}
18bef085a7ca updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 263
diff changeset
    60
\end{tikzpicture}
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    61
\end{tabular}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    62
\end{center}\medskip
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    63
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    64
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    65
\noindent Having specified in the previous lecture what
325
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
    66
problem our regular expression matcher is supposed to solve,
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
    67
namely for any given regular expression $r$ and string $s$
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
    68
answer \textit{true} if and only if
123
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    69
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    70
\[
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    71
s \in L(r)
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    72
\]
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    73
251
5b5a68df6d16 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 217
diff changeset
    74
\noindent we can look at an algorithm to solve this problem.
5b5a68df6d16 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 217
diff changeset
    75
Clearly we cannot use the function $L$ directly for this,
5b5a68df6d16 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 217
diff changeset
    76
because in general the set of strings $L$ returns is infinite
5b5a68df6d16 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 217
diff changeset
    77
(recall what $L(a^*)$ is). In such cases there is no way we
5b5a68df6d16 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 217
diff changeset
    78
can implement an exhaustive test for whether a string is
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    79
member of this set or not. In contrast our matching algorithm
325
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
    80
will operate on the regular expression $r$ and string $s$,
334
fd89a63e9db3 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 333
diff changeset
    81
only, which are both finite objects. Before we come to the matching
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    82
algorithm, however, let us have a closer look at what it means
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    83
when two regular expressions are equivalent.
258
1e4da6d2490c updated programs
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 251
diff changeset
    84
1e4da6d2490c updated programs
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 251
diff changeset
    85
\subsection*{Regular Expression Equivalences}
123
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    86
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    87
We already defined in Handout 1 what it means for two regular
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    88
expressions to be equivalent, namely if their meaning is the
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    89
same language:
258
1e4da6d2490c updated programs
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 251
diff changeset
    90
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    91
\[
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    92
r_1 \equiv r_2 \;\dn\; L(r_1) = L(r_2)
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    93
\]
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    94
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    95
\noindent
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    96
It is relatively easy to verify that some concrete equivalences
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    97
hold, for example
124
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 123
diff changeset
    98
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 123
diff changeset
    99
\begin{center}
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   100
\begin{tabular}{rcl}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   101
$(a + b)  + c$ & $\equiv$ & $a + (b + c)$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   102
$a + a$        & $\equiv$ & $a$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   103
$a + b$        & $\equiv$ & $b + a$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   104
$(a \cdot b) \cdot c$ & $\equiv$ & $a \cdot (b \cdot c)$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   105
$c \cdot (a + b)$ & $\equiv$ & $(c \cdot a) + (c \cdot b)$\\
124
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 123
diff changeset
   106
\end{tabular}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 123
diff changeset
   107
\end{center}
123
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
   108
124
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 123
diff changeset
   109
\noindent
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   110
but also easy to verify that the following regular expressions
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   111
are \emph{not} equivalent
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   112
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   113
\begin{center}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   114
\begin{tabular}{rcl}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   115
$a \cdot a$ & $\not\equiv$ & $a$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   116
$a + (b \cdot c)$ & $\not\equiv$ & $(a + b) \cdot (a + c)$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   117
\end{tabular}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   118
\end{center}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   119
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   120
\noindent I leave it to you to verify these equivalences and
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   121
non-equivalences. It is also interesting to look at some
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   122
corner cases involving $\epsilon$ and $\varnothing$:
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   123
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   124
\begin{center}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   125
\begin{tabular}{rcl}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   126
$a \cdot \varnothing$ & $\not\equiv$ & $a$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   127
$a + \epsilon$ & $\not\equiv$ & $a$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   128
$\epsilon$ & $\equiv$ & $\varnothing^*$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   129
$\epsilon^*$ & $\equiv$ & $\epsilon$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   130
$\varnothing^*$ & $\not\equiv$ & $\varnothing$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   131
\end{tabular}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   132
\end{center}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   133
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   134
\noindent Again I leave it to you to make sure you agree
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   135
with these equivalences and non-equivalences. 
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   136
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   137
318
7975e4f0d4de updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 296
diff changeset
   138
For our matching algorithm however the following seven
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   139
equivalences will play an important role:
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   140
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   141
\begin{center}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   142
\begin{tabular}{rcl}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   143
$r + \varnothing$  & $\equiv$ & $r$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   144
$\varnothing + r$  & $\equiv$ & $r$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   145
$r \cdot \epsilon$ & $\equiv$ & $r$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   146
$\epsilon \cdot r$     & $\equiv$ & $r$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   147
$r \cdot \varnothing$ & $\equiv$ & $\varnothing$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   148
$\varnothing \cdot r$ & $\equiv$ & $\varnothing$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   149
$r + r$ & $\equiv$ & $r$
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   150
\end{tabular}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   151
\end{center}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   152
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   153
\noindent which always hold no matter what the regular
325
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   154
expression $r$ looks like. The first two are easy to verify
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   155
since $L(\varnothing)$ is the empty set. The next two are also
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   156
easy to verify since $L(\epsilon) = \{[]\}$ and appending the
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   157
empty string to every string of another set, leaves the set
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   158
unchanged. Be careful to fully comprehend the fifth and sixth
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   159
equivalence: if you concatenate two sets of strings and one is
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   160
the empty set, then the concatenation will also be the empty
343
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   161
set. To see this, check the definition of $\_ @ \_$. The
325
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   162
last equivalence is again trivial.
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   163
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   164
What will be important later on is that we can orient these
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   165
equivalences and read them from left to right. In this way we
325
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   166
can view them as \emph{simplification rules}. Consider for 
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   167
example the regular expression 
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   168
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   169
\begin{equation}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   170
(r_1 + \varnothing) \cdot \epsilon + ((\epsilon + r_2) + r_3) \cdot (r_4 \cdot \varnothing)
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   171
\label{big}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   172
\end{equation}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   173
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   174
\noindent If we can find an equivalent regular expression that
325
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   175
is simpler (smaller for example), then this might potentially
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   176
make our matching algorithm run faster. The reason is that
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   177
whether a string $s$ is in $L(r)$ or in $L(r')$ with $r\equiv
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   178
r'$ will always give the same answer. In the example above you
343
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   179
will see that the regular expression is equivalent to just $r_1$.
325
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   180
You can verify this by iteratively applying the simplification
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   181
rules from above:
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   182
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   183
\begin{center}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   184
\begin{tabular}{ll}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   185
 & $(r_1 + \varnothing) \cdot \epsilon + ((\epsilon + r_2) + r_3) \cdot 
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   186
(\underline{r_4 \cdot \varnothing})$\smallskip\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   187
$\equiv$ & $(r_1 + \varnothing) \cdot \epsilon + \underline{((\epsilon + r_2) + r_3) \cdot 
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   188
\varnothing}$\smallskip\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   189
$\equiv$ & $\underline{(r_1 + \varnothing) \cdot \epsilon} + \varnothing$\smallskip\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   190
$\equiv$ & $(\underline{r_1 + \varnothing}) + \varnothing$\smallskip\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   191
$\equiv$ & $\underline{r_1 + \varnothing}$\smallskip\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   192
$\equiv$ & $r_1$\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   193
\end{tabular}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   194
\end{center}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   195
296
796b9b81ac8d updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 291
diff changeset
   196
\noindent In each step, I underlined where a simplification
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   197
rule is applied. Our matching algorithm in the next section
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   198
will often generate such ``useless'' $\epsilon$s and
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   199
$\varnothing$s, therefore simplifying them away will make the
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   200
algorithm quite a bit faster.
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   201
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   202
\subsection*{The Matching Algorithm}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   203
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   204
The algorithm we will define below consists of two parts. One
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   205
is the function $nullable$ which takes a regular expression as
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   206
argument and decides whether it can match the empty string
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   207
(this means it returns a boolean in Scala). This can be easily
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   208
defined recursively as follows:
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   209
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   210
\begin{center}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   211
\begin{tabular}{@ {}l@ {\hspace{2mm}}c@ {\hspace{2mm}}l@ {}}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   212
$nullable(\varnothing)$      & $\dn$ & $\textit{false}$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   213
$nullable(\epsilon)$         & $\dn$ & $true$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   214
$nullable(c)$                & $\dn$ & $\textit{false}$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   215
$nullable(r_1 + r_2)$     & $\dn$ &  $nullable(r_1) \vee nullable(r_2)$\\ 
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   216
$nullable(r_1 \cdot r_2)$ & $\dn$ &  $nullable(r_1) \wedge nullable(r_2)$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   217
$nullable(r^*)$              & $\dn$ & $true$ \\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   218
\end{tabular}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   219
\end{center}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   220
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   221
\noindent The idea behind this function is that the following
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   222
property holds:
124
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 123
diff changeset
   223
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 123
diff changeset
   224
\[
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   225
nullable(r) \;\;\text{if and only if}\;\; []\in L(r)
124
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 123
diff changeset
   226
\]
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 123
diff changeset
   227
325
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   228
\noindent Note on the left-hand side of the if-and-only-if we
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   229
have a function we can implement; on the right we have its
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   230
specification (which we cannot implement in a programming
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   231
language).
124
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 123
diff changeset
   232
258
1e4da6d2490c updated programs
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 251
diff changeset
   233
The other function of our matching algorithm calculates a
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   234
\emph{derivative} of a regular expression. This is a function
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   235
which will take a regular expression, say $r$, and a
332
4755ad4b457b updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 325
diff changeset
   236
character, say $c$, as argument and returns a new regular
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   237
expression. Be careful that the intuition behind this function
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   238
is not so easy to grasp on first reading. Essentially this
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   239
function solves the following problem: if $r$ can match a
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   240
string of the form $c\!::\!s$, what does the regular
325
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   241
expression look like that can match just $s$? The definition
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   242
of this function is as follows:
125
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   243
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   244
\begin{center}
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   245
\begin{tabular}{l@ {\hspace{2mm}}c@ {\hspace{2mm}}l}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   246
  $der\, c\, (\varnothing)$      & $\dn$ & $\varnothing$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   247
  $der\, c\, (\epsilon)$         & $\dn$ & $\varnothing$ \\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   248
  $der\, c\, (d)$                & $\dn$ & if $c = d$ then $\epsilon$ else $\varnothing$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   249
  $der\, c\, (r_1 + r_2)$        & $\dn$ & $der\, c\, r_1 + der\, c\, r_2$\\
125
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   250
  $der\, c\, (r_1 \cdot r_2)$  & $\dn$  & if $nullable (r_1)$\\
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   251
  & & then $(der\,c\,r_1) \cdot r_2 + der\, c\, r_2$\\ 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   252
  & & else $(der\, c\, r_1) \cdot r_2$\\
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   253
  $der\, c\, (r^*)$          & $\dn$ & $(der\,c\,r) \cdot (r^*)$
125
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   254
  \end{tabular}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   255
\end{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   256
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   257
\noindent The first two clauses can be rationalised as
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   258
follows: recall that $der$ should calculate a regular
325
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   259
expression so that given the ``input'' regular expression can
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   260
match a string of the form $c\!::\!s$, we want a regular
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   261
expression for $s$. Since neither $\varnothing$ nor $\epsilon$
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   262
can match a string of the form $c\!::\!s$, we return
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   263
$\varnothing$. In the third case we have to make a
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   264
case-distinction: In case the regular expression is $c$, then
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   265
clearly it can recognise a string of the form $c\!::\!s$, just
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   266
that $s$ is the empty string. Therefore we return the
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   267
$\epsilon$-regular expression. In the other case we again
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   268
return $\varnothing$ since no string of the $c\!::\!s$ can be
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   269
matched. Next come the recursive cases, which are a bit more
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   270
involved. Fortunately, the $+$-case is still relatively
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   271
straightforward: all strings of the form $c\!::\!s$ are either
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   272
matched by the regular expression $r_1$ or $r_2$. So we just
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   273
have to recursively call $der$ with these two regular
332
4755ad4b457b updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 325
diff changeset
   274
expressions and compose the results again with $+$. Makes
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   275
sense? The $\cdot$-case is more complicated: if $r_1\cdot r_2$
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   276
matches a string of the form $c\!::\!s$, then the first part
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   277
must be matched by $r_1$. Consequently, it makes sense to
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   278
construct the regular expression for $s$ by calling $der$ with
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   279
$r_1$ and ``appending'' $r_2$. There is however one exception
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   280
to this simple rule: if $r_1$ can match the empty string, then
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   281
all of $c\!::\!s$ is matched by $r_2$. So in case $r_1$ is
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   282
nullable (that is can match the empty string) we have to allow
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   283
the choice $der\,c\,r_2$ for calculating the regular
325
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   284
expression that can match $s$. Therefore we have to add the
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   285
regular expression $der\,c\,r_2$ in the result. The $*$-case
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   286
is again simple: if $r^*$ matches a string of the form
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   287
$c\!::\!s$, then the first part must be ``matched'' by a
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   288
single copy of $r$. Therefore we call recursively $der\,c\,r$
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   289
and ``append'' $r^*$ in order to match the rest of $s$.
125
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   290
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   291
If this did not make sense, here is another way to rationalise
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   292
the definition of $der$ by considering the following operation
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   293
on sets:
125
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   294
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   295
\[
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   296
Der\,c\,A\;\dn\;\{s\,|\,c\!::\!s \in A\}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   297
\]
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   298
291
201c2c6d8696 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 272
diff changeset
   299
\noindent This operation essentially transforms a set of
201c2c6d8696 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 272
diff changeset
   300
strings $A$ by filtering out all strings that do not start
201c2c6d8696 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 272
diff changeset
   301
with $c$ and then strips off the $c$ from all the remaining
201c2c6d8696 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 272
diff changeset
   302
strings. For example suppose $A = \{f\!oo, bar, f\!rak\}$ then
343
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   303
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   304
\[ Der\,f\,A = \{oo, rak\}\quad,\quad 
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   305
   Der\,b\,A = \{ar\} \quad \text{and} \quad 
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   306
   Der\,a\,A = \{\} 
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   307
\]
125
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   308
 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   309
\noindent
258
1e4da6d2490c updated programs
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 251
diff changeset
   310
Note that in the last case $Der$ is empty, because no string in $A$
1e4da6d2490c updated programs
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 251
diff changeset
   311
starts with $a$. With this operation we can state the following
1e4da6d2490c updated programs
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 251
diff changeset
   312
property about $der$:
125
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   313
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   314
\[
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   315
L(der\,c\,r) = Der\,c\,(L(r))
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   316
\]
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   317
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   318
\noindent
258
1e4da6d2490c updated programs
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 251
diff changeset
   319
This property clarifies what regular expression $der$ calculates,
1e4da6d2490c updated programs
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 251
diff changeset
   320
namely take the set of strings that $r$ can match (that is $L(r)$),
1e4da6d2490c updated programs
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 251
diff changeset
   321
filter out all strings not starting with $c$ and strip off the $c$
1e4da6d2490c updated programs
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 251
diff changeset
   322
from the remaining strings---this is exactly the language that
1e4da6d2490c updated programs
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 251
diff changeset
   323
$der\,c\,r$ can match.
125
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   324
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   325
If we want to find out whether the string $abc$ is matched by
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   326
the regular expression $r_1$ then we can iteratively apply $der$
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   327
as follows
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   328
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   329
\begin{center}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   330
\begin{tabular}{rll}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   331
Input: $r_1$, $abc$\medskip\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   332
Step 1: & build derivative of $a$ and $r_1$ & $(r_2 = der\,a\,r_1)$\smallskip\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   333
Step 2: & build derivative of $b$ and $r_2$ & $(r_3 = der\,b\,r_2)$\smallskip\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   334
Step 3: & build derivative of $c$ and $r_3$ & $(r_4 = der\,b\,r_3)$\smallskip\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   335
Step 4: & the string is exhausted; test & ($nullable(r_4)$)\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   336
        & whether $r_4$ can recognise the\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   337
        & empty string\smallskip\\
332
4755ad4b457b updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 325
diff changeset
   338
Output: & result of this test $\Rightarrow true \,\text{or}\, \textit{false}$\\        
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   339
\end{tabular}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   340
\end{center}
140
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 133
diff changeset
   341
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   342
\noindent Again the operation $Der$ might help to rationalise
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   343
this algorithm. We want to know whether $abc \in L(r_1)$. We
325
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   344
do not know yet---but let us assume it is. Then $Der\,a\,L(r_1)$
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   345
builds the set where all the strings not starting with $a$ are
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   346
filtered out. Of the remaining strings, the $a$ is stripped
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   347
off. Then we continue with filtering out all strings not
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   348
starting with $b$ and stripping off the $b$ from the remaining
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   349
strings, that means we build $Der\,b\,(Der\,a\,(L(r_1)))$.
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   350
Finally we filter out all strings not starting with $c$ and
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   351
strip off $c$ from the remaining string. This is
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   352
$Der\,c\,(Der\,b\,(Der\,a\,(L(r))))$. Now if $abc$ was in the 
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   353
original set ($L(r_1)$), then in $Der\,c\,(Der\,b\,(Der\,a\,(L(r))))$ 
325
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   354
must be the empty string. If not, then $abc$ was not in the 
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   355
language we started with.
140
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 133
diff changeset
   356
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   357
Our matching algorithm using $der$ and $nullable$ works
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   358
similarly, just using regular expression instead of sets. For
325
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   359
this we need to extend the notion of derivatives from single
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   360
characters to strings. This can be done using the following
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   361
function, taking a string and regular expression as input and
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   362
a regular expression as output.
125
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   363
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   364
\begin{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   365
\begin{tabular}{@ {}l@ {\hspace{2mm}}c@ {\hspace{2mm}}l@ {\hspace{-10mm}}l@ {}}
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   366
  $\textit{ders}\, []\, r$     & $\dn$ & $r$ & \\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   367
  $\textit{ders}\, (c\!::\!s)\, r$ & $\dn$ & $\textit{ders}\,s\,(der\,c\,r)$ & \\
125
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   368
  \end{tabular}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   369
\end{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   370
325
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   371
\noindent This function iterates $der$ taking one character at
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   372
the time from the original string until it is exhausted.
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   373
Having $ders$ in place, we can finally define our matching
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   374
algorithm:
125
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   375
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   376
\[
325
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   377
matches\,s\,r \dn nullable(ders\,s\,r)
125
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   378
\]
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   379
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   380
\noindent
325
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   381
and we can claim that
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   382
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   383
\[
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   384
matches\,s\,r\quad\text{if and only if}\quad s\in L(r)
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   385
\]
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   386
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   387
\noindent holds, which means our algorithm satisfies the
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   388
specification. Of course we can claim many things\ldots
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   389
whether the claim holds any water is a different question,
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   390
which for example is the point of the Strand-2 Coursework.
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   391
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   392
This algorithm was introduced by Janus Brzozowski in 1964. Its
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   393
main attractions are simplicity and being fast, as well as
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   394
being easily extendable for other regular expressions such as
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   395
$r^{\{n\}}$, $r^?$, $\sim{}r$ and so on (this is subject of
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   396
Strand-1 Coursework 1).
258
1e4da6d2490c updated programs
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 251
diff changeset
   397
1e4da6d2490c updated programs
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 251
diff changeset
   398
\subsection*{The Matching Algorithm in Scala}
1e4da6d2490c updated programs
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 251
diff changeset
   399
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   400
Another attraction of the algorithm is that it can be easily
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   401
implemented in a functional programming language, like Scala.
296
796b9b81ac8d updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 291
diff changeset
   402
Given the implementation of regular expressions in Scala shown
796b9b81ac8d updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 291
diff changeset
   403
in the first lecture and handout, the functions and subfunctions
796b9b81ac8d updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 291
diff changeset
   404
for \pcode{matches} are shown in Figure~\ref{scala1}.
126
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 125
diff changeset
   405
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 125
diff changeset
   406
\begin{figure}[p]
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   407
\lstinputlisting{../progs/app5.scala}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   408
\caption{Scala implementation of the nullable and 
325
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   409
  derivatives functions. These functions are easy to
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   410
  implement in functional languages, because pattern 
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   411
  matching and recursion allow us to mimic the mathematical
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   412
  definitions very closely.\label{scala1}}
126
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 125
diff changeset
   413
\end{figure}
123
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
   414
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   415
For running the algorithm with our favourite example, the evil
394
2f9fe225ecc8 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 343
diff changeset
   416
regular expression $a^?{}^{\{n\}}a^{\{n\}}$, we need to implement
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   417
the optional regular expression and the exactly $n$-times
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   418
regular expression. This can be done with the translations
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   419
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   420
\lstinputlisting[numbers=none]{../progs/app51.scala}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   421
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   422
\noindent Running the matcher with the example, we find it is
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   423
slightly worse then the matcher in Ruby and Python.
262
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   424
Ooops\ldots
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   425
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   426
\begin{center}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   427
\begin{tikzpicture}
\begin{axis}[
    xlabel={\pcode{a}s},
    ylabel={time in secs},
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   428
    enlargelimits=false,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   429
    xtick={0,5,...,30},
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   430
    xmax=30,
    ytick={0,5,...,30},
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   431
    scaled ticks=false,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   432
    axis lines=left,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   433
    width=6cm,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   434
    height=5cm, 
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   435
    legend entries={Python,Ruby,Scala V1},  
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   436
    legend pos=outer north east,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   437
    legend cell align=left  
]
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   438
\addplot[blue,mark=*, mark options={fill=white}] 
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   439
  table {re-python.data};
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   440
\addplot[brown,mark=pentagon*, mark options={fill=white}] 
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   441
  table {re-ruby.data};  
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   442
\addplot[red,mark=triangle*,mark options={fill=white}] 
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   443
  table {re1.data};  
\end{axis}
\end{tikzpicture}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   444
\end{center}
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   445
325
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   446
\noindent Analysing this failure we notice that for
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   447
$a^{\{n\}}$ we generate quite big regular expressions:
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   448
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   449
\begin{center}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   450
\begin{tabular}{rl}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   451
1: & $a$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   452
2: & $a\cdot a$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   453
3: & $a\cdot a\cdot a$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   454
& \ldots\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   455
13: & $a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a$\\
262
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   456
& \ldots
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   457
\end{tabular}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   458
\end{center}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   459
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   460
\noindent Our algorithm traverses such regular expressions at
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   461
least once every time a derivative is calculated. So having
262
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   462
large regular expressions will cause problems. This problem
343
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   463
is aggravated by $a^?$ being represented as $a + \epsilon$.
262
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   464
325
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   465
We can however fix this by having an explicit constructor for
262
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   466
$r^{\{n\}}$. In Scala we would introduce a constructor like
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   467
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   468
\begin{center}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   469
\code{case class NTIMES(r: Rexp, n: Int) extends Rexp}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   470
\end{center}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   471
325
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   472
\noindent With this fix we have a constant ``size'' regular
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   473
expression for our running example no matter how large $n$ is.
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   474
This means we have to also add cases for \pcode{NTIMES} in the
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   475
functions $nullable$ and $der$. Does the change have any
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   476
effect?
262
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   477
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   478
\begin{center}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   479
\begin{tikzpicture}
\begin{axis}[
    xlabel={\pcode{a}s},
    ylabel={time in secs},
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   480
    enlargelimits=false,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   481
    xtick={0,100,...,1000},
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   482
    xmax=1000,
    ytick={0,5,...,30},
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   483
    scaled ticks=false,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   484
    axis lines=left,
263
92e6985018ae updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 262
diff changeset
   485
    width=9.5cm,
262
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   486
    height=5cm, 
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   487
    legend entries={Python,Ruby,Scala V1,Scala V2},  
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   488
    legend pos=outer north east,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   489
    legend cell align=left  
]
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   490
\addplot[blue,mark=*, mark options={fill=white}] 
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   491
  table {re-python.data};
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   492
\addplot[brown,mark=pentagon*, mark options={fill=white}] 
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   493
  table {re-ruby.data};  
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   494
\addplot[red,mark=triangle*,mark options={fill=white}] 
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   495
  table {re1.data};  
\addplot[green,mark=square*,mark options={fill=white}] 
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   496
  table {re2b.data};
\end{axis}
\end{tikzpicture}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   497
\end{center}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   498
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   499
\noindent Now we are talking business! The modified matcher 
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   500
can within 30 seconds handle regular expressions up to
343
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   501
$n = 950$ before a StackOverflow is raised. Python and Ruby 
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   502
(and our first version) could only handle $n = 27$ or so in 30 
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   503
second.
262
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   504
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   505
The moral is that our algorithm is rather sensitive to the
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   506
size of regular expressions it needs to handle. This is of
325
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   507
course obvious because both $nullable$ and $der$ frequently
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   508
need to traverse the whole regular expression. There seems,
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   509
however, one more issue for making the algorithm run faster.
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   510
The derivative function often produces ``useless''
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   511
$\varnothing$s and $\epsilon$s. To see this, consider $r = ((a
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   512
\cdot b) + b)^*$ and the following two derivatives
262
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   513
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   514
\begin{center}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   515
\begin{tabular}{l}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   516
$der\,a\,r = ((\epsilon \cdot b) + \varnothing) \cdot r$\\
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   517
$der\,b\,r = ((\varnothing \cdot b) + \epsilon)\cdot r$\\
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   518
$der\,c\,r = ((\varnothing \cdot b) + \varnothing)\cdot r$
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   519
\end{tabular}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   520
\end{center}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   521
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   522
\noindent 
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   523
If we simplify them according to the simple rules from the
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   524
beginning, we can replace the right-hand sides by the 
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   525
smaller equivalent regular expressions
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   526
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   527
\begin{center}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   528
\begin{tabular}{l}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   529
$der\,a\,r \equiv b \cdot r$\\
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   530
$der\,b\,r \equiv r$\\
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   531
$der\,c\,r \equiv \varnothing$
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   532
\end{tabular}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   533
\end{center}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   534
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   535
\noindent I leave it to you to contemplate whether such a
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   536
simplification can have any impact on the correctness of our
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   537
algorithm (will it change any answers?). Figure~\ref{scala2}
296
796b9b81ac8d updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 291
diff changeset
   538
gives a simplification function that recursively traverses a
262
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   539
regular expression and simplifies it according to the rules
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   540
given at the beginning. There are only rules for $+$, $\cdot$
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   541
and $n$-times (the latter because we added it in the second
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   542
version of our matcher). There is no rule for a star, because
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   543
empirical data and also a little thought showed that
325
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   544
simplifying under a star is a waste of computation time. The
262
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   545
simplification function will be called after every derivation.
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   546
This additional step removes all the ``junk'' the derivative
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   547
function introduced. Does this improve the speed? You bet!!
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   548
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   549
\begin{figure}[p]
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   550
\lstinputlisting{../progs/app6.scala}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   551
\caption{The simplification function and modified 
325
794c599cee53 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 318
diff changeset
   552
\texttt{ders}-function; this function now
333
8890852e18b7 updated coursework
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 332
diff changeset
   553
calls \texttt{der} first, but then simplifies
343
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   554
the resulting derivative regular expressions before
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   555
building the next derivative, see
333
8890852e18b7 updated coursework
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 332
diff changeset
   556
Line~28.\label{scala2}}
262
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   557
\end{figure}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   558
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   559
\begin{center}
268
18bef085a7ca updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 263
diff changeset
   560
\begin{tikzpicture}
18bef085a7ca updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 263
diff changeset
   561
\begin{axis}[xlabel={\pcode{a}s},ylabel={time in secs},
262
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   562
    enlargelimits=false,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   563
    xtick={0,2000,...,12000},
268
18bef085a7ca updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 263
diff changeset
   564
    xmax=12000,
18bef085a7ca updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 263
diff changeset
   565
    ytick={0,5,...,30},
262
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   566
    scaled ticks=false,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   567
    axis lines=left,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   568
    width=9cm,
343
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   569
    height=5cm, 
268
18bef085a7ca updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 263
diff changeset
   570
    legend entries={Scala V2,Scala V3}]
18bef085a7ca updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 263
diff changeset
   571
\addplot[green,mark=square*,mark options={fill=white}] table {re2b.data};
18bef085a7ca updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 263
diff changeset
   572
\addplot[black,mark=square*,mark options={fill=white}] table {re3.data};
18bef085a7ca updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 263
diff changeset
   573
\end{axis}
18bef085a7ca updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 263
diff changeset
   574
\end{tikzpicture}
262
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   575
\end{center}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   576
334
fd89a63e9db3 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 333
diff changeset
   577
\section*{Proofs}
fd89a63e9db3 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 333
diff changeset
   578
339
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   579
You might not like doing proofs. But they serve a very
343
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   580
important purpose in Computer Science: How can we be sure that
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   581
our algorithm matches its specification. We can try to test
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   582
the algorithm, but that often overlooks corner cases and an
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   583
exhaustive testing is impossible (since there are infinitely
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   584
many inputs). Proofs allow us to ensure that an algorithm
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   585
really meets its specification. 
338
f16120cb4e19 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 334
diff changeset
   586
339
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   587
For the programs we look at in this module, the proofs will
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   588
mostly by some form of induction. Remember that regular
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   589
expressions are defined as 
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   590
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   591
\begin{center}
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   592
\begin{tabular}{r@{\hspace{1mm}}r@{\hspace{1mm}}l@{\hspace{13mm}}l}
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   593
  $r$ & $::=$ &   $\varnothing$         & null\\
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   594
        & $\mid$ & $\epsilon$           & empty string / \texttt{""} / []\\
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   595
        & $\mid$ & $c$                  & single character\\
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   596
        & $\mid$ & $r_1 + r_2$          & alternative / choice\\
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   597
        & $\mid$ & $r_1 \cdot r_2$      & sequence\\
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   598
        & $\mid$ & $r^*$                & star (zero or more)\\
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   599
  \end{tabular}
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   600
\end{center}
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   601
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   602
\noindent If you want to show a property $P(r)$ for all 
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   603
regular expressions $r$, then you have to follow essentially 
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   604
the recipe:
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   605
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   606
\begin{itemize}
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   607
\item $P$ has to hold for $\varnothing$, $\epsilon$ and $c$
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   608
 (these are the base cases).
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   609
\item $P$ has to hold for $r_1 + r_2$ under the assumption 
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   610
   that $P$ already holds for $r_1$ and $r_2$.
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   611
\item $P$ has to hold for $r_1 \cdot r_2$ under the 
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   612
  assumption that $P$ already holds for $r_1$ and $r_2$.
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   613
\item $P$ has to hold for $r^*$ under the assumption 
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   614
  that $P$ already holds for $r$.
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   615
\end{itemize}
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   616
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   617
\noindent 
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   618
A simple proof is for example showing the following 
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   619
property:
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   620
343
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   621
\begin{equation}
339
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   622
nullable(r) \;\;\text{if and only if}\;\; []\in L(r)
343
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   623
\label{nullableprop}
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   624
\end{equation}
339
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   625
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   626
\noindent
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   627
Let us say that this property is $P(r)$, then the first case
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   628
we need to check is whether $P(\varnothing)$ (see recipe 
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   629
above). So we have to show that
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   630
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   631
\[
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   632
nullable(\varnothing) \;\;\text{if and only if}\;\; 
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   633
[]\in L(\varnothing)
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   634
\]
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   635
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   636
\noindent whereby $nullable(\varnothing)$ is by definition of
343
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   637
the function $nullable$ always $\textit{false}$. We also have
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   638
that $L(\varnothing)$ is by definition $\{\}$. It is
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   639
impossible that the empty string $[]$ is in the empty set.
339
bc395ccfba7f updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 338
diff changeset
   640
Therefore also the right-hand side is false. Consequently we
343
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   641
verified this case: both sides are false. We would still need
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   642
to do this for $P(\varepsilon)$ and $P(c)$. I leave this to
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   643
you to verify.
340
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   644
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   645
Next we need to check the inductive cases, for example
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   646
$P(r_1 + r_2)$, which is
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   647
343
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   648
\begin{equation}
340
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   649
nullable(r_1 + r_2) \;\;\text{if and only if}\;\; 
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   650
[]\in L(r_1 + r_2)
343
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   651
\label{propalt}
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   652
\end{equation}
340
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   653
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   654
\noindent The difference to the base cases is that in this
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   655
case we can already assume we proved
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   656
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   657
\begin{center}
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   658
\begin{tabular}{l}
343
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   659
$nullable(r_1) \;\;\text{if and only if}\;\; []\in L(r_1)$ and\\
340
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   660
$nullable(r_2) \;\;\text{if and only if}\;\; []\in L(r_2)$\\
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   661
\end{tabular}
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   662
\end{center}
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   663
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   664
\noindent These are the induction hypotheses. To check this 
343
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   665
case, we can start from $nullable(r_1 + r_2)$, which by 
340
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   666
definition is
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   667
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   668
\[
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   669
nullable(r_1) \vee nullable(r_2)
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   670
\]
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   671
343
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   672
\noindent Using the two induction hypotheses from above,
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   673
we can transform this into 
340
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   674
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   675
\[
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   676
[] \in L(r_1) \vee []\in(r_2)
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   677
\]
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   678
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   679
\noindent We just replaced the $nullable(\ldots)$ parts by
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   680
the equivalent $[] \in L(\ldots)$ from the induction 
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   681
hypotheses. A bit of thinking convinces you that if
343
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   682
$[] \in L(r_1) \vee []\in L(r_2)$ then the empty string
340
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   683
must be in the union $L(r_1)\cup L(r_2)$, that is
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   684
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   685
\[
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   686
[] \in L(r_1)\cup L(r_2)
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   687
\]
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   688
343
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   689
\noindent but this is by definition of $L$ exactly $[] \in
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   690
L(r_1 + r_2)$, which we needed to establish according to
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   691
\eqref{propalt}. What we have shown is that starting from
340
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   692
$nullable(r_1 + r_2)$ we have done equivalent transformations
343
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   693
to end up with $[] \in L(r_1 + r_2)$. Consequently we have
340
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   694
established that $P(r_1 + r_2)$ holds.
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   695
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   696
In order to complete the proof we would now need to look 
343
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   697
at the cases \mbox{$P(r_1\cdot r_2)$} and $P(r^*)$. Again I let you
340
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   698
check the details.
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   699
343
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   700
You might have to do induction proofs over strings. 
340
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   701
That means you want to establish a property $P(s)$ for all
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   702
strings $s$. For this remember strings are lists of 
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   703
characters. These lists can be either the empty list or a
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   704
list of the form $c::s$. If you want to perform an induction
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   705
proof for strings you need to consider the cases
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   706
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   707
\begin{itemize}
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   708
\item $P$ has to hold for $[]$ (this is the base case).
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   709
\item $P$ has to hold for $c::s$ under the assumption 
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   710
   that $P$ already holds for $s$.
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   711
\end{itemize}
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   712
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   713
\noindent
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   714
Given this recipe, I let you show
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   715
343
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   716
\begin{equation}
340
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   717
Ders\,s\,(L(r)) = L(ders\,s\,r)
343
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   718
\label{dersprop}
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   719
\end{equation}
340
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   720
343
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   721
\noindent by induction on $s$. In this proof you can assume
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   722
the following property for $der$ and $Der$ has already been
340
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   723
proved, that is you can assume
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   724
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   725
\[
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   726
L(der\,c\,r) = Der\,c\,(L(r))
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   727
\]
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   728
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   729
\noindent holds (this would be of course a property that
c49122dbcdd1 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 339
diff changeset
   730
needs to be proved in a side-lemma by induction on $r$).
338
f16120cb4e19 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 334
diff changeset
   731
343
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   732
To sum up, using reasoning like the one shown above allows us 
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   733
to show the correctness of our algorithm. To see this,
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   734
start from the specification
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   735
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   736
\[
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   737
s \in L(r)
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   738
\]
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   739
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   740
\noindent That is the problem we want to solve. Thinking a 
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   741
little, you will see that this problem is equivalent to the 
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   742
following problem
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   743
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   744
\begin{equation}
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   745
[] \in Ders\,s\,(L(r))
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   746
\label{dersstep}
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   747
\end{equation}
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   748
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   749
\noindent But we have shown above in \eqref{dersprop}, that
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   750
the $Ders$ can be replaced by $L(ders\ldots)$. That means 
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   751
\eqref{dersstep} is equivalent to 
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   752
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   753
\begin{equation}
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   754
[] \in L(ders\,s\,r)
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   755
\label{prefinalstep}
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   756
\end{equation}
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   757
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   758
\noindent We have also shown that testing whether the empty
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   759
string is in a language is equivalent to the $nullable$
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   760
function; see \eqref{nullableprop}. That means
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   761
\eqref{prefinalstep} is equivalent with
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   762
 
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   763
\[
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   764
nullable(ders\,s\,r)
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   765
\] 
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   766
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   767
\noindent But this is just the definition of $matches$
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   768
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   769
\[
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   770
matches\,s\,r \dn nullable(ders\,s\,r)
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   771
\]
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   772
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   773
\noindent In effect we have shown
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   774
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   775
\[
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   776
matches\,s\,r\;\;\text{if and only if}\;\;
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   777
s\in L(r)
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   778
\]
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   779
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   780
\noindent which is the property we set out to prove:
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   781
our algorithm meets its specification. To have done
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   782
so, requires a few induction proofs about strings and
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   783
regular expressions. Following the recipes is already a big 
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   784
step in performing these proofs.
539b2e88f5b9 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 340
diff changeset
   785
262
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   786
\end{document}
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   787
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   788
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   789
123
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
   790
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
   791
%%% Local Variables: 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
   792
%%% mode: latex
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
   793
%%% TeX-master: t
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
   794
%%% End: