handouts/ho02.tex
author Christian Urban <christian dot urban at kcl dot ac dot uk>
Sun, 28 Sep 2014 22:30:25 +0100
changeset 262 ee4304bc6350
parent 261 24531cfaa36a
child 263 92e6985018ae
permissions -rw-r--r--
updated handouts
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
123
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
     1
\documentclass{article}
251
5b5a68df6d16 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 217
diff changeset
     2
\usepackage{../style}
217
cd6066f1056a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 140
diff changeset
     3
\usepackage{../langs}
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
     4
\usepackage{../graphics}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
     5
\usepackage{../data}
123
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
     6
262
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
     7
123
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
     8
\begin{document}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
     9
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    10
\section*{Handout 2}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    11
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    12
This lecture is about implementing a more efficient regular
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    13
expression matcher (the plots on the right)---more efficient
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    14
than the matchers from regular expression libraries in Ruby and
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    15
Python (the plots on the left). These plots show the running 
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    16
time for the evil regular expression $a?^{\{n\}}a^{\{n\}}$.
262
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    17
Note the different scales in each plot.
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    18
262
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    19
\pgfplotsset{compat=1.11}
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    20
\begin{center}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    21
\begin{tabular}{@{}cc@{}}
262
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    22
\begin{tikzpicture}
\begin{axis}[
    xlabel={\pcode{a}s},
    ylabel={time in secs},
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    23
    enlargelimits=false,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    24
    xtick={0,5,...,30},
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    25
    xmax=30,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    26
    ymax=35,
    ytick={0,5,...,30},
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    27
    scaled ticks=false,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    28
    axis lines=left,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    29
    width=5cm,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    30
    height=5cm, 
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    31
    legend entries={Python,Ruby},  
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    32
    legend pos=north west,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    33
    legend cell align=left  
]
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    34
\addplot[blue,mark=*, mark options={fill=white}] 
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    35
  table {re-python.data};
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    36
\addplot[brown,mark=pentagon*, mark options={fill=white}] 
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    37
  table {re-ruby.data};  
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    38
\end{axis}
\end{tikzpicture}
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    39
&
262
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    40
\begin{tikzpicture}
\begin{axis}[
    xlabel={\pcode{a}s},
    ylabel={time in secs},
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    41
    enlargelimits=false,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    42
    xtick={0,3000,...,12000},
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    43
    xmax=12000,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    44
    ymax=35,
    ytick={0,5,...,30},
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    45
    scaled ticks=false,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    46
    axis lines=left,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    47
    width=6.5cm,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    48
    height=5cm
]
\addplot[green,mark=square*,mark options={fill=white}] table {re2b.data};
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
    49
\addplot[black,mark=square*,mark options={fill=white}] table {re3.data};
\end{axis}
\end{tikzpicture}
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    50
\end{tabular}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    51
\end{center}\medskip
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    52
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    53
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    54
\noindent Having specified in the previous lecture what
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    55
problem our regular expression matcher, which we will call
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    56
\pcode{matches}, is supposed to solve, namely for any given
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    57
regular expression $r$ and string $s$ answer \textit{true} if
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    58
and only if
123
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    59
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    60
\[
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    61
s \in L(r)
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    62
\]
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    63
251
5b5a68df6d16 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 217
diff changeset
    64
\noindent we can look at an algorithm to solve this problem.
5b5a68df6d16 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 217
diff changeset
    65
Clearly we cannot use the function $L$ directly for this,
5b5a68df6d16 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 217
diff changeset
    66
because in general the set of strings $L$ returns is infinite
5b5a68df6d16 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 217
diff changeset
    67
(recall what $L(a^*)$ is). In such cases there is no way we
5b5a68df6d16 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 217
diff changeset
    68
can implement an exhaustive test for whether a string is
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    69
member of this set or not. In contrast our matching algorithm
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    70
will mainly operate on the regular expression $r$ and string
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    71
$s$, which are both finite. Before we come to the matching
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    72
algorithm, however, let us have a closer look at what it means
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    73
when two regular expressions are equivalent.
258
1e4da6d2490c updated programs
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 251
diff changeset
    74
1e4da6d2490c updated programs
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 251
diff changeset
    75
\subsection*{Regular Expression Equivalences}
123
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    76
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    77
We already defined in Handout 1 what it means for two regular
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    78
expressions to be equivalent, namely if their meaning is the
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    79
same language:
258
1e4da6d2490c updated programs
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 251
diff changeset
    80
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    81
\[
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    82
r_1 \equiv r_2 \;\dn\; L(r_1) = L(r_2)
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    83
\]
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    84
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    85
\noindent
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    86
It is relatively easy to verify that some concrete equivalences
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    87
hold, for example
124
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 123
diff changeset
    88
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 123
diff changeset
    89
\begin{center}
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    90
\begin{tabular}{rcl}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    91
$(a + b)  + c$ & $\equiv$ & $a + (b + c)$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    92
$a + a$        & $\equiv$ & $a$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    93
$a + b$        & $\equiv$ & $b + a$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    94
$(a \cdot b) \cdot c$ & $\equiv$ & $a \cdot (b \cdot c)$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
    95
$c \cdot (a + b)$ & $\equiv$ & $(c \cdot a) + (c \cdot b)$\\
124
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 123
diff changeset
    96
\end{tabular}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 123
diff changeset
    97
\end{center}
123
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
    98
124
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 123
diff changeset
    99
\noindent
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   100
but also easy to verify that the following regular expressions
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   101
are \emph{not} equivalent
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   102
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   103
\begin{center}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   104
\begin{tabular}{rcl}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   105
$a \cdot a$ & $\not\equiv$ & $a$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   106
$a + (b \cdot c)$ & $\not\equiv$ & $(a + b) \cdot (a + c)$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   107
\end{tabular}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   108
\end{center}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   109
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   110
\noindent I leave it to you to verify these equivalences and
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   111
non-equivalences. It is also interesting to look at some
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   112
corner cases involving $\epsilon$ and $\varnothing$:
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   113
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   114
\begin{center}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   115
\begin{tabular}{rcl}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   116
$a \cdot \varnothing$ & $\not\equiv$ & $a$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   117
$a + \epsilon$ & $\not\equiv$ & $a$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   118
$\epsilon$ & $\equiv$ & $\varnothing^*$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   119
$\epsilon^*$ & $\equiv$ & $\epsilon$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   120
$\varnothing^*$ & $\not\equiv$ & $\varnothing$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   121
\end{tabular}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   122
\end{center}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   123
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   124
\noindent Again I leave it to you to make sure you agree
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   125
with these equivalences and non-equivalences. 
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   126
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   127
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   128
For our matching algorithm however the following six
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   129
equivalences will play an important role:
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   130
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   131
\begin{center}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   132
\begin{tabular}{rcl}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   133
$r + \varnothing$  & $\equiv$ & $r$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   134
$\varnothing + r$  & $\equiv$ & $r$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   135
$r \cdot \epsilon$ & $\equiv$ & $r$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   136
$\epsilon \cdot r$     & $\equiv$ & $r$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   137
$r \cdot \varnothing$ & $\equiv$ & $\varnothing$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   138
$\varnothing \cdot r$ & $\equiv$ & $\varnothing$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   139
$r + r$ & $\equiv$ & $r$
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   140
\end{tabular}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   141
\end{center}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   142
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   143
\noindent which always hold no matter what the regular
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   144
expression $r$ looks like. The first are easy to verify since
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   145
$L(\varnothing)$ is the empty set. The next two are also easy 
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   146
to verify since $L(\epsilon) = \{[]\}$ and appending the empty 
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   147
string to every string of another set, leaves the set 
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   148
unchanged. Be careful to fully comprehend the fifth and 
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   149
sixth equivalence: if you concatenate two sets of strings
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   150
and one is the empty set, then the concatenation will also be
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   151
the empty set. Check the definition of \pcode{_ @ _}.
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   152
The last equivalence is again trivial.
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   153
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   154
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   155
What will be important later on is that we can orient these
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   156
equivalences and read them from left to right. In this way we
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   157
can view them as \emph{simplification rules}. Suppose for 
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   158
example the regular expression 
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   159
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   160
\begin{equation}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   161
(r_1 + \varnothing) \cdot \epsilon + ((\epsilon + r_2) + r_3) \cdot (r_4 \cdot \varnothing)
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   162
\label{big}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   163
\end{equation}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   164
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   165
\noindent If we can find an equivalent regular expression that
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   166
is simpler (smaller for example), then this might potentially 
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   167
make our matching algorithm is faster. The reason is that 
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   168
whether a string $s$ is in $L(r)$ or in $L(r')$ with $r\equiv r'$
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   169
will always give the same answer. In the example above you 
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   170
will see that the regular expression is equivalent to $r_1$
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   171
if you iteratively apply the simplification rules from above:
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   172
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   173
\begin{center}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   174
\begin{tabular}{ll}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   175
 & $(r_1 + \varnothing) \cdot \epsilon + ((\epsilon + r_2) + r_3) \cdot 
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   176
(\underline{r_4 \cdot \varnothing})$\smallskip\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   177
$\equiv$ & $(r_1 + \varnothing) \cdot \epsilon + \underline{((\epsilon + r_2) + r_3) \cdot 
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   178
\varnothing}$\smallskip\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   179
$\equiv$ & $\underline{(r_1 + \varnothing) \cdot \epsilon} + \varnothing$\smallskip\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   180
$\equiv$ & $(\underline{r_1 + \varnothing}) + \varnothing$\smallskip\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   181
$\equiv$ & $\underline{r_1 + \varnothing}$\smallskip\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   182
$\equiv$ & $r_1$\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   183
\end{tabular}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   184
\end{center}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   185
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   186
\noindent In each step I underlined where a simplification
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   187
rule is applied. Our matching algorithm in the next section
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   188
will often generate such ``useless'' $\epsilon$s and
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   189
$\varnothing$s, therefore simplifying them away will make the
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   190
algorithm quite a bit faster.
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   191
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   192
\subsection*{The Matching Algorithm}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   193
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   194
The algorithm we will define below consists of two parts. One
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   195
is the function $nullable$ which takes a regular expression as
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   196
argument and decides whether it can match the empty string
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   197
(this means it returns a boolean in Scala). This can be easily
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   198
defined recursively as follows:
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   199
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   200
\begin{center}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   201
\begin{tabular}{@ {}l@ {\hspace{2mm}}c@ {\hspace{2mm}}l@ {}}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   202
$nullable(\varnothing)$      & $\dn$ & $\textit{false}$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   203
$nullable(\epsilon)$         & $\dn$ & $true$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   204
$nullable(c)$                & $\dn$ & $\textit{false}$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   205
$nullable(r_1 + r_2)$     & $\dn$ &  $nullable(r_1) \vee nullable(r_2)$\\ 
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   206
$nullable(r_1 \cdot r_2)$ & $\dn$ &  $nullable(r_1) \wedge nullable(r_2)$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   207
$nullable(r^*)$              & $\dn$ & $true$ \\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   208
\end{tabular}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   209
\end{center}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   210
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   211
\noindent The idea behind this function is that the following
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   212
property holds:
124
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 123
diff changeset
   213
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 123
diff changeset
   214
\[
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   215
nullable(r) \;\;\text{if and only if}\;\; []\in L(r)
124
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 123
diff changeset
   216
\]
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 123
diff changeset
   217
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   218
\noindent Note on the left-hand side we have a function we can
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   219
implement; on the right we have its specification (which we
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   220
cannot implement in a programming language).
124
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 123
diff changeset
   221
258
1e4da6d2490c updated programs
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 251
diff changeset
   222
The other function of our matching algorithm calculates a
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   223
\emph{derivative} of a regular expression. This is a function
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   224
which will take a regular expression, say $r$, and a
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   225
character, say $c$, as argument and return a new regular
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   226
expression. Be careful that the intuition behind this function
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   227
is not so easy to grasp on first reading. Essentially this
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   228
function solves the following problem: if $r$ can match a
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   229
string of the form $c\!::\!s$, what does the regular
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   230
expression look like that can match just $s$. The definition
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   231
of this function is as follows:
125
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   232
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   233
\begin{center}
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   234
\begin{tabular}{l@ {\hspace{2mm}}c@ {\hspace{2mm}}l}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   235
  $der\, c\, (\varnothing)$      & $\dn$ & $\varnothing$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   236
  $der\, c\, (\epsilon)$         & $\dn$ & $\varnothing$ \\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   237
  $der\, c\, (d)$                & $\dn$ & if $c = d$ then $\epsilon$ else $\varnothing$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   238
  $der\, c\, (r_1 + r_2)$        & $\dn$ & $der\, c\, r_1 + der\, c\, r_2$\\
125
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   239
  $der\, c\, (r_1 \cdot r_2)$  & $\dn$  & if $nullable (r_1)$\\
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   240
  & & then $(der\,c\,r_1) \cdot r_2 + der\, c\, r_2$\\ 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   241
  & & else $(der\, c\, r_1) \cdot r_2$\\
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   242
  $der\, c\, (r^*)$          & $\dn$ & $(der\,c\,r) \cdot (r^*)$
125
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   243
  \end{tabular}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   244
\end{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   245
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   246
\noindent The first two clauses can be rationalised as
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   247
follows: recall that $der$ should calculate a regular
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   248
expression, if the ``input'' regular expression can match a
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   249
string of the form $c\!::\!s$. Since neither $\varnothing$ nor
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   250
$\epsilon$ can match such a string we return $\varnothing$. In
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   251
the third case we have to make a case-distinction: In case the
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   252
regular expression is $c$, then clearly it can recognise a
258
1e4da6d2490c updated programs
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 251
diff changeset
   253
string of the form $c\!::\!s$, just that $s$ is the empty
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   254
string. Therefore we return the $\epsilon$-regular expression.
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   255
In the other case we again return $\varnothing$ since no
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   256
string of the $c\!::\!s$ can be matched. Next come the
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   257
recursive cases. Fortunately, the $+$-case is still relatively
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   258
straightforward: all strings of the form $c\!::\!s$ are either
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   259
matched by the regular expression $r_1$ or $r_2$. So we just
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   260
have to recursively call $der$ with these two regular
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   261
expressions and compose the results again with $+$. Yes, makes
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   262
sense? The $\cdot$-case is more complicated: if $r_1\cdot r_2$
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   263
matches a string of the form $c\!::\!s$, then the first part
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   264
must be matched by $r_1$. Consequently, it makes sense to
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   265
construct the regular expression for $s$ by calling $der$ with
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   266
$r_1$ and ``appending'' $r_2$. There is however one exception
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   267
to this simple rule: if $r_1$ can match the empty string, then
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   268
all of $c\!::\!s$ is matched by $r_2$. So in case $r_1$ is
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   269
nullable (that is can match the empty string) we have to allow
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   270
the choice $der\,c\,r_2$ for calculating the regular
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   271
expression that can match $s$. Therefore we have to 
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   272
add the regular expression $der\,c\,r_2$.
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   273
The $*$-case is again simple:
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   274
if $r^*$ matches a string of the form $c\!::\!s$, then the
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   275
first part must be ``matched'' by a single copy of $r$.
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   276
Therefore we call recursively $der\,c\,r$ and ``append'' $r^*$
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   277
in order to match the rest of $s$.
125
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   278
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   279
If this did not make sense, here is another way to rationalise
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   280
the definition of $der$ by considering the following operation
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   281
on sets:
125
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   282
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   283
\[
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   284
Der\,c\,A\;\dn\;\{s\,|\,c\!::\!s \in A\}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   285
\]
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   286
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   287
\noindent
258
1e4da6d2490c updated programs
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 251
diff changeset
   288
which essentially transforms a set of strings $A$ by filtering out all
1e4da6d2490c updated programs
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 251
diff changeset
   289
strings that do not start with $c$ and then strips off the $c$ from
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   290
all the remaining strings. For example suppose $A = \{f\!oo, bar,
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   291
f\!rak\}$ then
125
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   292
\[
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   293
Der\,f\,A = \{oo, rak\}\quad,\quad
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   294
Der\,b\,A = \{ar\}  \quad \text{and} \quad
125
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   295
Der\,a\,A = \varnothing
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   296
\]
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   297
 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   298
\noindent
258
1e4da6d2490c updated programs
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 251
diff changeset
   299
Note that in the last case $Der$ is empty, because no string in $A$
1e4da6d2490c updated programs
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 251
diff changeset
   300
starts with $a$. With this operation we can state the following
1e4da6d2490c updated programs
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 251
diff changeset
   301
property about $der$:
125
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   302
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   303
\[
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   304
L(der\,c\,r) = Der\,c\,(L(r))
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   305
\]
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   306
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   307
\noindent
258
1e4da6d2490c updated programs
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 251
diff changeset
   308
This property clarifies what regular expression $der$ calculates,
1e4da6d2490c updated programs
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 251
diff changeset
   309
namely take the set of strings that $r$ can match (that is $L(r)$),
1e4da6d2490c updated programs
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 251
diff changeset
   310
filter out all strings not starting with $c$ and strip off the $c$
1e4da6d2490c updated programs
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 251
diff changeset
   311
from the remaining strings---this is exactly the language that
1e4da6d2490c updated programs
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 251
diff changeset
   312
$der\,c\,r$ can match.
125
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   313
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   314
If we want to find out whether the string $abc$ is matched by
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   315
the regular expression $r_1$ then we can iteratively apply $der$
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   316
as follows
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   317
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   318
\begin{center}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   319
\begin{tabular}{rll}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   320
Input: $r_1$, $abc$\medskip\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   321
Step 1: & build derivative of $a$ and $r_1$ & $(r_2 = der\,a\,r_1)$\smallskip\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   322
Step 2: & build derivative of $b$ and $r_2$ & $(r_3 = der\,b\,r_2)$\smallskip\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   323
Step 3: & build derivative of $c$ and $r_3$ & $(r_4 = der\,b\,r_3)$\smallskip\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   324
Step 4: & the string is exhausted; test & ($nullable(r_4)$)\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   325
        & whether $r_4$ can recognise the\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   326
        & empty string\smallskip\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   327
Output: & result of the test $\Rightarrow true \,\text{or}\, \textit{false}$\\        
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   328
\end{tabular}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   329
\end{center}
140
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 133
diff changeset
   330
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   331
\noindent Again the operation $Der$ might help to rationalise
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   332
this algorithm. We want to know whether $abc \in L(r_1)$. We
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   333
do not know yet. But lets assume it is. Then $Der\,a\,L(r_1)$
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   334
builds the set where all the strings not starting with $a$ are
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   335
filtered out. Of the remaining strings, the $a$ is stripped
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   336
off. Then we continue with filtering out all strings not
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   337
starting with $b$ and stripping off the $b$ from the remaining
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   338
strings, that means we build $Der\,b\,(Der\,a\,(L(r_1)))$.
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   339
Finally we filter out all strings not starting with $c$ and
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   340
strip off $c$ from the remaining string. This is
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   341
$Der\,c\,(Der\,b\,(Der\,a\,(L(r))))$. Now if $abc$ was in the 
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   342
original set ($L(r_1)$), then in $Der\,c\,(Der\,b\,(Der\,a\,(L(r))))$ 
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   343
must be the empty string. If not then $abc$ was not in the 
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   344
language we started with.
140
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 133
diff changeset
   345
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   346
Our matching algorithm using $der$ and $nullable$ works
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   347
similarly, just using regular expression instead of sets. For
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   348
this we need to extend the notion of derivatives from
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   349
characters to strings. This can be done using the following
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   350
function, taking a string and regular expression as input and
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   351
a regular expression as output.
125
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   352
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   353
\begin{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   354
\begin{tabular}{@ {}l@ {\hspace{2mm}}c@ {\hspace{2mm}}l@ {\hspace{-10mm}}l@ {}}
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   355
  $\textit{ders}\, []\, r$     & $\dn$ & $r$ & \\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   356
  $\textit{ders}\, (c\!::\!s)\, r$ & $\dn$ & $\textit{ders}\,s\,(der\,c\,r)$ & \\
125
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   357
  \end{tabular}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   358
\end{center}
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   359
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   360
\noindent This function essentially iterates $der$ taking one
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   361
character at the time from the original string until it is
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   362
exhausted. Having $ders$ in place, we can finally define our
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   363
matching algorithm:
125
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   364
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   365
\[
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   366
matches\,s\,r = nullable(ders\,s\,r)
125
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   367
\]
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   368
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 124
diff changeset
   369
\noindent
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   370
We can claim that
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   371
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   372
\[
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   373
matches\,s\,r\quad\text{if and only if}\quad s\in L(r)
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   374
\]
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   375
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   376
\noindent holds, which means our algorithm satisfies the
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   377
specification. Of course we can claim many things\ldots
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   378
whether the claim holds any water is a different question,
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   379
which for example is the point of the Strand-2 Coursework.
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   380
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   381
This algorithm was introduced by Janus Brzozowski in 1964. Its
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   382
main attractions are simplicity and being fast, as well as
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   383
being easily extendable for other regular expressions such as
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   384
$r^{\{n\}}$, $r^?$, $\sim{}r$ and so on (this is subject of
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   385
Strand-1 Coursework 1).
258
1e4da6d2490c updated programs
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 251
diff changeset
   386
1e4da6d2490c updated programs
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 251
diff changeset
   387
\subsection*{The Matching Algorithm in Scala}
1e4da6d2490c updated programs
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 251
diff changeset
   388
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   389
Another attraction of the algorithm is that it can be easily
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   390
implemented in a functional programming language, like Scala.
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   391
Given the implementation of regular expressions in Scala given
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   392
in the first lecture and handout, the functions for
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   393
\pcode{matches} are shown in Figure~\ref{scala1}.
126
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 125
diff changeset
   394
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 125
diff changeset
   395
\begin{figure}[p]
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   396
\lstinputlisting{../progs/app5.scala}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   397
\caption{Scala implementation of the nullable and 
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   398
  derivatives functions.\label{scala1}}
126
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 125
diff changeset
   399
\end{figure}
123
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
   400
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   401
For running the algorithm with our favourite example, the evil
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   402
regular expression $a?^{\{n\}}a^{\{n\}}$, we need to implement
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   403
the optional regular expression and the exactly $n$-times
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   404
regular expression. This can be done with the translations
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   405
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   406
\lstinputlisting[numbers=none]{../progs/app51.scala}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   407
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   408
\noindent Running the matcher with the example, we find it is
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   409
slightly worse then the matcher in Ruby and Python.
262
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   410
Ooops\ldots
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   411
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   412
\pgfplotsset{compat=1.11}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   413
\begin{center}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   414
\begin{tikzpicture}
\begin{axis}[
    xlabel={\pcode{a}s},
    ylabel={time in secs},
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   415
    enlargelimits=false,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   416
    xtick={0,5,...,30},
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   417
    xmax=30,
    ytick={0,5,...,30},
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   418
    scaled ticks=false,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   419
    axis lines=left,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   420
    width=6cm,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   421
    height=5cm, 
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   422
    legend entries={Python,Ruby,Scala V1},  
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   423
    legend pos=outer north east,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   424
    legend cell align=left  
]
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   425
\addplot[blue,mark=*, mark options={fill=white}] 
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   426
  table {re-python.data};
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   427
\addplot[brown,mark=pentagon*, mark options={fill=white}] 
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   428
  table {re-ruby.data};  
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   429
\addplot[red,mark=triangle*,mark options={fill=white}] 
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   430
  table {re1.data};  
\end{axis}
\end{tikzpicture}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   431
\end{center}
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   432
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   433
\noindent Analysing this failure a bit we notice that 
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   434
for $a^{\{n\}}$ we generate quite big regular expressions:
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   435
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   436
\begin{center}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   437
\begin{tabular}{rl}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   438
1: & $a$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   439
2: & $a\cdot a$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   440
3: & $a\cdot a\cdot a$\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   441
& \ldots\\
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   442
13: & $a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a$\\
262
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   443
& \ldots
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   444
\end{tabular}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   445
\end{center}
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   446
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   447
\noindent Our algorithm traverses such regular expressions at
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   448
least once every time a derivative is calculated. So having
262
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   449
large regular expressions will cause problems. This problem
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   450
is aggravated by $a?$ being represented as $a + \epsilon$.
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   451
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   452
We can fix this by having an explicit constructor for 
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   453
$r^{\{n\}}$. In Scala we would introduce a constructor like
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   454
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   455
\begin{center}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   456
\code{case class NTIMES(r: Rexp, n: Int) extends Rexp}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   457
\end{center}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   458
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   459
\noindent With this we have a constant ``size'' regular
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   460
expression for our running example no matter how large $n$ 
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   461
is. This means we have to also add cases for $nullable$ and
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   462
$der$. Does the change have any effect?
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   463
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   464
\pgfplotsset{compat=1.11}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   465
\begin{center}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   466
\begin{tikzpicture}
\begin{axis}[
    xlabel={\pcode{a}s},
    ylabel={time in secs},
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   467
    enlargelimits=false,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   468
    xtick={0,100,...,1000},
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   469
    xmax=1000,
    ytick={0,5,...,30},
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   470
    scaled ticks=false,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   471
    axis lines=left,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   472
    width=10cm,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   473
    height=5cm, 
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   474
    legend entries={Python,Ruby,Scala V1,Scala V2},  
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   475
    legend pos=outer north east,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   476
    legend cell align=left  
]
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   477
\addplot[blue,mark=*, mark options={fill=white}] 
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   478
  table {re-python.data};
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   479
\addplot[brown,mark=pentagon*, mark options={fill=white}] 
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   480
  table {re-ruby.data};  
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   481
\addplot[red,mark=triangle*,mark options={fill=white}] 
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   482
  table {re1.data};  
\addplot[green,mark=square*,mark options={fill=white}] 
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   483
  table {re2b.data};
\end{axis}
\end{tikzpicture}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   484
\end{center}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   485
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   486
\noindent Now we are talking business! The modified matcher 
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   487
can within 30 seconds handle regular expressions up to
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   488
$n = 950$ before a StackOverflow is raised.
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   489
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   490
The moral is that our algorithm is rather sensitive to the
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   491
size of regular expressions it needs to handle. This is of
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   492
course obvious because both $nullable$ and $der$ need to
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   493
traverse the whole regular expression. There seems to be one
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   494
more source of making the algorithm run faster. The derivative
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   495
function often produces ``useless'' $\varnothing$s and
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   496
$\epsilon$s. To see this, consider $r = ((a \cdot b) + b)^*$
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   497
and the following two derivatives
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   498
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   499
\begin{center}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   500
\begin{tabular}{l}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   501
$der\,a\,r = ((\epsilon \cdot b) + \varnothing) \cdot r$\\
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   502
$der\,b\,r = ((\varnothing \cdot b) + \epsilon)\cdot r$\\
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   503
$der\,c\,r = ((\varnothing \cdot b) + \varnothing)\cdot r$
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   504
\end{tabular}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   505
\end{center}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   506
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   507
\noindent 
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   508
If we simplify them according to the simple rules from the
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   509
beginning, we can replace the right-hand sides by the 
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   510
smaller equivalent regular expressions
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   511
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   512
\begin{center}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   513
\begin{tabular}{l}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   514
$der\,a\,r \equiv b \cdot r$\\
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   515
$der\,b\,r \equiv r$\\
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   516
$der\,c\,r \equiv \varnothing$
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   517
\end{tabular}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   518
\end{center}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   519
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   520
\noindent I leave it to you to contemplate whether such a
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   521
simplification can have any impact on the correctness of our
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   522
algorithm (will it change any answers?). Figure~\ref{scala2}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   523
give a simplification function that recursively traverses a
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   524
regular expression and simplifies it according to the rules
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   525
given at the beginning. There are only rules for $+$, $\cdot$
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   526
and $n$-times (the latter because we added it in the second
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   527
version of our matcher). There is no rule for a star, because
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   528
empirical data and also a little thought showed that
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   529
simplifying under a star is waste of computation time. The
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   530
simplification function will be called after every derivation.
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   531
This additional step removes all the ``junk'' the derivative
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   532
function introduced. Does this improve the speed? You bet!!
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   533
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   534
\begin{figure}[p]
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   535
\lstinputlisting{../progs/app6.scala}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   536
\caption{The simplification function and modified 
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   537
\pcode{ders}-function.\label{scala2}}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   538
\end{figure}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   539
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   540
\pgfplotsset{compat=1.11}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   541
\begin{center}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   542
\begin{tikzpicture}
\begin{axis}[
    xlabel={\pcode{a}s},
    ylabel={time in secs},
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   543
    enlargelimits=false,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   544
    xtick={0,2000,...,12000},
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   545
    xmax=12000,
    ytick={0,5,...,30},
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   546
    scaled ticks=false,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   547
    axis lines=left,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   548
    width=9cm,
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   549
    height=4cm, 
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   550
    legend entries={Scala V2,Scala V3},    
]
\addplot[green,mark=square*,mark options={fill=white}] table {re2b.data};
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   551
\addplot[black,mark=square*,mark options={fill=white}] table {re3.data};
\end{axis}
\end{tikzpicture}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   552
\end{center}
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   553
ee4304bc6350 updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 261
diff changeset
   554
\end{document}
261
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   555
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   556
24531cfaa36a updated handouts
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 259
diff changeset
   557
123
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
   558
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
   559
%%% Local Variables: 
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
   560
%%% mode: latex
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
   561
%%% TeX-master: t
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
diff changeset
   562
%%% End: