496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1
(*<*)
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 2
theory Paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 3
imports
569
+ − 4
"Posix.LexerSimp"
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 5
"Posix.FBound"
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 6
"HOL-Library.LaTeXsugar"
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 7
begin
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 8
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 9
declare [[show_question_marks = false]]
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 10
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 11
notation (latex output)
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 12
If ("(\<^latex>\<open>\\textrm{\<close>if\<^latex>\<open>}\<close> (_)/ \<^latex>\<open>\\textrm{\<close>then\<^latex>\<open>}\<close> (_)/ \<^latex>\<open>\\textrm{\<close>else\<^latex>\<open>}\<close> (_))" 10) and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 13
Cons ("_\<^latex>\<open>\\mbox{$\\,$}\<close>::\<^latex>\<open>\\mbox{$\\,$}\<close>_" [75,73] 73)
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 14
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 15
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 16
abbreviation
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 17
"der_syn r c \<equiv> der c r"
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 18
abbreviation
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 19
"ders_syn r s \<equiv> ders s r"
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 20
abbreviation
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 21
"bder_syn r c \<equiv> bder c r"
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 22
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 23
notation (latex output)
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 24
der_syn ("_\\_" [79, 1000] 76) and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 25
ders_syn ("_\\_" [79, 1000] 76) and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 26
bder_syn ("_\\_" [79, 1000] 76) and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 27
bders ("_\\_" [79, 1000] 76) and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 28
bders_simp ("_\\\<^sub>b\<^sub>s\<^sub>i\<^sub>m\<^sub>p _" [79, 1000] 76) and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 29
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 30
ZERO ("\<^bold>0" 81) and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 31
ONE ("\<^bold>1" 81) and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 32
CH ("_" [1000] 80) and
569
+ − 33
ALT ("_ + _" [76,76] 77) and
+ − 34
SEQ ("_ \<cdot> _" [78,78] 78) and
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 35
STAR ("_\<^sup>*" [79] 78) and
569
+ − 36
NTIMES ("_\<^sup>{\<^bsup>_\<^esup>\<^sup>}" [79] 78) and
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 37
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 38
val.Void ("Empty" 78) and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 39
val.Char ("Char _" [1000] 78) and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 40
val.Left ("Left _" [79] 78) and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 41
val.Right ("Right _" [1000] 78) and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 42
val.Seq ("Seq _ _" [79,79] 78) and
569
+ − 43
val.Stars ("Stars _" [1000] 78) and
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 44
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 45
Prf ("\<turnstile> _ : _" [75,75] 75) and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 46
Posix ("'(_, _') \<rightarrow> _" [63,75,75] 75) and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 47
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 48
flat ("|_|" [75] 74) and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 49
flats ("|_|" [72] 74) and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 50
injval ("inj _ _ _" [79,77,79] 76) and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 51
mkeps ("mkeps _" [79] 76) and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 52
length ("len _" [73] 73) and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 53
set ("_" [73] 73) and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 54
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 55
AZERO ("ZERO" 81) and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 56
AONE ("ONE _" [79] 78) and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 57
ACHAR ("CHAR _ _" [79, 79] 80) and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 58
AALTs ("ALTs _ _" [77,77] 78) and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 59
ASEQ ("SEQ _ _ _" [79, 79,79] 78) and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 60
ASTAR ("STAR _ _" [79, 79] 78) and
569
+ − 61
ANTIMES ("NT _ _ _" [79, 79, 79] 78) and
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 62
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 63
code ("code _" [79] 74) and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 64
intern ("_\<^latex>\<open>\\mbox{$^\\uparrow$}\<close>" [900] 80) and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 65
erase ("_\<^latex>\<open>\\mbox{$^\\downarrow$}\<close>" [1000] 74) and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 66
bnullable ("bnullable _" [1000] 80) and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 67
bsimp_AALTs ("bsimpALT _ _" [10,1000] 80) and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 68
bsimp_ASEQ ("bsimpSEQ _ _ _" [10,1000,1000] 80) and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 69
bmkeps ("bmkeps _" [1000] 80) and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 70
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 71
eq1 ("_ \<approx> _" [77,77] 76) and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 72
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 73
srewrite ("_\<^latex>\<open>\\mbox{$\\,\\stackrel{s}{\\leadsto}$}\<close> _" [71, 71] 80) and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 74
rrewrites ("_ \<^latex>\<open>\\mbox{$\\,\\leadsto^*$}\<close> _" [71, 71] 80) and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 75
srewrites ("_ \<^latex>\<open>\\mbox{$\\,\\stackrel{s}{\\leadsto}^*$}\<close> _" [71, 71] 80) and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 76
blexer_simp ("blexer\<^sup>+" 1000)
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 77
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 78
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 79
lemma better_retrieve:
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 80
shows "rs \<noteq> Nil ==> retrieve (AALTs bs (r#rs)) (Left v) = bs @ retrieve r v"
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 81
and "rs \<noteq> Nil ==> retrieve (AALTs bs (r#rs)) (Right v) = bs @ retrieve (AALTs [] rs) v"
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 82
apply (metis list.exhaust retrieve.simps(4))
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 83
by (metis list.exhaust retrieve.simps(5))
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 84
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 85
569
+ − 86
lemma better_retrieve2:
+ − 87
shows "retrieve (ANTIMES bs r (n + 1)) (Stars (v#vs)) =
+ − 88
bs @ [Z] @ retrieve r v @ retrieve (ANTIMES [] r n) (Stars vs)"
+ − 89
by auto
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 90
(*>*)
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 91
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 92
section {* Introduction *}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 93
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 94
text {*
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 95
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 96
In the last fifteen or so years, Brzozowski's derivatives of regular
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 97
expressions have sparked quite a bit of interest in the functional
569
+ − 98
programming and theorem prover communities.
563
+ − 99
Derivatives of a
642
+ − 100
regular expressions, written @{term "der c r"}, give a simple solution
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 101
to the problem of matching a string @{term s} with a regular
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 102
expression @{term r}: if the derivative of @{term r} w.r.t.\ (in
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 103
succession) all the characters of the string matches the empty string,
569
+ − 104
then @{term r} matches @{term s} (and {\em vice versa}).
+ − 105
The beauty of
+ − 106
Brzozowski's derivatives \cite{Brzozowski1964} is that they are neatly
+ − 107
expressible in any functional language, and easily definable and
+ − 108
reasoned about in theorem provers---the definitions just consist of
+ − 109
inductive datatypes and simple recursive functions. Another attractive
+ − 110
feature of derivatives is that they can be easily extended to \emph{bounded}
+ − 111
regular expressions, such as @{term "r"}$^{\{@{text n}\}}$ or @{term r}$^{\{..@{text n}\}}$, where numbers or
+ − 112
intervals of numbers specify how many times a regular expression should be used
+ − 113
during matching.
+ − 114
+ − 115
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 116
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 117
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 118
However, there are two difficulties with derivative-based matchers:
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 119
First, Brzozowski's original matcher only generates a yes/no answer
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 120
for whether a regular expression matches a string or not. This is too
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 121
little information in the context of lexing where separate tokens must
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 122
be identified and also classified (for example as keywords
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 123
or identifiers). Sulzmann and Lu~\cite{Sulzmann2014} overcome this
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 124
difficulty by cleverly extending Brzozowski's matching
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 125
algorithm. Their extended version generates additional information on
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 126
\emph{how} a regular expression matches a string following the POSIX
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 127
rules for regular expression matching. They achieve this by adding a
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 128
second ``phase'' to Brzozowski's algorithm involving an injection
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 129
function. In our own earlier work we provided the formal
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 130
specification of what POSIX matching means and proved in Isabelle/HOL
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 131
the correctness
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 132
of Sulzmann and Lu's extended algorithm accordingly
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 133
\cite{AusafDyckhoffUrban2016}.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 134
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 135
The second difficulty is that Brzozowski's derivatives can
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 136
grow to arbitrarily big sizes. For example if we start with the
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 137
regular expression \mbox{@{text "(a + aa)\<^sup>*"}} and take
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 138
successive derivatives according to the character $a$, we end up with
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 139
a sequence of ever-growing derivatives like
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 140
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 141
\def\ll{\stackrel{\_\backslash{} a}{\longrightarrow}}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 142
\begin{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 143
\begin{tabular}{rll}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 144
$(a + aa)^*$ & $\ll$ & $(\ONE + \ONE{}a) \cdot (a + aa)^*$\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 145
& $\ll$ & $(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* \;+\; (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 146
& $\ll$ & $(\ZERO + \ZERO{}a + \ZERO) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^* \;+\; $\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 147
& & $\qquad(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 148
& $\ll$ & \ldots \hspace{15mm}(regular expressions of sizes 98, 169, 283, 468, 767, \ldots)
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 149
\end{tabular}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 150
\end{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 151
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 152
\noindent where after around 35 steps we run out of memory on a
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 153
typical computer (we shall define shortly the precise details of our
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 154
regular expressions and the derivative operation). Clearly, the
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 155
notation involving $\ZERO$s and $\ONE$s already suggests
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 156
simplification rules that can be applied to regular regular
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 157
expressions, for example $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 158
\Rightarrow r$, $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 159
r$. While such simple-minded simplifications have been proved in our
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 160
earlier work to preserve the correctness of Sulzmann and Lu's
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 161
algorithm \cite{AusafDyckhoffUrban2016}, they unfortunately do
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 162
\emph{not} help with limiting the growth of the derivatives shown
569
+ − 163
above: the growth is slowed, but some derivatives can still grow rather
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 164
quickly beyond any finite bound.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 165
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 166
569
+ − 167
Sulzmann and Lu address this ``growth problem'' in a second algorithm
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 168
\cite{Sulzmann2014} where they introduce bitcoded
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 169
regular expressions. In this version, POSIX values are
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 170
represented as bitsequences and such sequences are incrementally generated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 171
when derivatives are calculated. The compact representation
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 172
of bitsequences and regular expressions allows them to define a more
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 173
``aggressive'' simplification method that keeps the size of the
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 174
derivatives finitely bounded no matter what the length of the string is.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 175
They make some informal claims about the correctness and linear behaviour
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 176
of this version, but do not provide any supporting proof arguments, not
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 177
even ``pencil-and-paper'' arguments. They write about their bitcoded
615
+ − 178
\emph{incremental parsing method} (that is the algorithm to be fixed and formalised
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 179
in this paper):
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 180
%
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 181
\begin{quote}\it
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 182
``Correctness Claim: We further claim that the incremental parsing
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 183
method [..] in combination with the simplification steps [..]
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 184
yields POSIX parse trees. We have tested this claim
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 185
extensively [..] but yet
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 186
have to work out all proof details.'' \cite[Page 14]{Sulzmann2014}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 187
\end{quote}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 188
615
+ − 189
\noindent{}\textbf{Contributions:} We fill this gap by implementing in Isabelle/HOL
+ − 190
our version of the derivative-based lexing algorithm of Sulzmann and Lu
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 191
\cite{Sulzmann2014} where regular expressions are annotated with
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 192
bitsequences. We define the crucial simplification function as a
569
+ − 193
recursive function, without the need of a fixpoint operation. One objective of
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 194
the simplification function is to remove duplicates of regular
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 195
expressions. For this Sulzmann and Lu use in their paper the standard
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 196
@{text nub} function from Haskell's list library, but this function
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 197
does not achieve the intended objective with bitcoded regular expressions. The
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 198
reason is that in the bitcoded setting, each copy generally has a
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 199
different bitcode annotation---so @{text nub} would never ``fire''.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 200
Inspired by Scala's library for lists, we shall instead use a @{text
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 201
distinctWith} function that finds duplicates under an ``erasing'' function
616
+ − 202
that deletes bitcodes before comparing regular expressions.
615
+ − 203
We shall also introduce our \emph{own} arguments and definitions for
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 204
establishing the correctness of the bitcoded algorithm when
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 205
simplifications are included. Finally we
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 206
establish that the size of derivatives can be finitely bounded.\medskip%\footnote{ In this paper, we shall first briefly introduce the basic notions
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 207
%of regular expressions and describe the definition
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 208
%of POSIX lexing from our earlier work \cite{AusafDyckhoffUrban2016}. This serves
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 209
%as a reference point for what correctness means in our Isabelle/HOL proofs. We shall then prove
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 210
%the correctness for the bitcoded algorithm without simplification, and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 211
%after that extend the proof to include simplification.}\smallskip
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 212
%\mbox{}\\[-10mm]
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 213
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 214
\noindent In this paper, we shall first briefly introduce the basic notions
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 215
of regular expressions and describe the definition
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 216
of POSIX lexing from our earlier work \cite{AusafDyckhoffUrban2016}. This serves
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 217
as a reference point for what correctness means in our Isabelle/HOL proofs. We shall then prove
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 218
the correctness for the bitcoded algorithm without simplification, and
643
+ − 219
after that extend the proof to include simplification.
+ − 220
Our Isabelle code including the results from Sec.~5 is available
+ − 221
from \textcolor{darkblue}{\url{https://github.com/urbanchr/posix}}.
+ − 222
\mbox{}\\[-6mm]
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 223
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 224
*}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 225
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 226
section {* Background *}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 227
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 228
text {*
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 229
In our Isabelle/HOL formalisation strings are lists of characters with
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 230
the empty string being represented by the empty list, written $[]$,
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 231
and list-cons being written as $\_\!::\!\_\,$; string
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 232
concatenation is $\_ \,@\, \_\,$. We often use the usual
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 233
bracket notation for lists also for strings; for example a string
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 234
consisting of just a single character $c$ is written $[c]$.
569
+ − 235
Our regular expressions are defined as the following inductive
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 236
datatype:\\[-4mm]
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 237
%
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 238
\begin{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 239
@{text "r ::="} \;
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 240
@{const "ZERO"} $\mid$
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 241
@{const "ONE"} $\mid$
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 242
@{term "CH c"} $\mid$
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 243
@{term "ALT r\<^sub>1 r\<^sub>2"} $\mid$
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 244
@{term "SEQ r\<^sub>1 r\<^sub>2"} $\mid$
563
+ − 245
@{term "STAR r"} $\mid$
+ − 246
@{term "NTIMES r n"}
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 247
\end{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 248
569
+ − 249
\noindent where @{const ZERO} stands for the regular expression that
+ − 250
does not match any string, @{const ONE} for the regular expression
+ − 251
that matches only the empty string and @{term c} for matching a
+ − 252
character literal. The constructors $+$ and $\cdot$ represent
+ − 253
alternatives and sequences, respectively. We sometimes omit the
+ − 254
$\cdot$ in a sequence regular expression for brevity. The
+ − 255
\emph{language} of a regular expression, written $L(r)$, is defined
+ − 256
as usual and we omit giving the definition here (see for example
+ − 257
\cite{AusafDyckhoffUrban2016}).
+ − 258
+ − 259
In our work here we also add to the usual ``basic'' regular
+ − 260
expressions the \emph{bounded} regular expression @{term "NTIMES r
+ − 261
n"} where the @{term n} specifies that @{term r} should match
643
+ − 262
exactly @{term n}-times (it is not included in Sulzmann and Lu's original work). For brevity we omit the other bounded
644
+ − 263
regular expressions @{text "r"}$^{\{..\textit{n}\}}$,
+ − 264
@{text "r"}$^{\{\textit{n}..\}}$
+ − 265
and @{text "r"}$^{\{\textit{n}..\textit{m}\}}$ which specify intervals for how many
569
+ − 266
times @{text r} should match. The results presented in this paper
+ − 267
extend straightforwardly to them too. The importance of the bounded
+ − 268
regular expressions is that they are often used in practical
+ − 269
applications, such as Snort (a system for detecting network
599
+ − 270
intrusions) and also in XML Schema definitions. According to Bj\"{o}rklund et
569
+ − 271
al~\cite{BjorklundMartensTimm2015}, bounded regular expressions
+ − 272
occur frequently in the latter and can have counters of up to
+ − 273
ten million. The problem is that tools based on the classic notion
644
+ − 274
of automata need to expand @{text "r"}$^{\{\textit{n}\}}$ into @{text n}
569
+ − 275
connected copies of the automaton for @{text r}. This leads to very
+ − 276
inefficient matching algorithms or algorithms that consume large
+ − 277
amounts of memory. A classic example is the regular expression
+ − 278
\mbox{@{term "SEQ (SEQ (STAR (ALT a b)) a) (NTIMES (ALT a b) n)"}}
+ − 279
where the minimal DFA requires at least $2^{n + 1}$ states (see
+ − 280
\cite{CountingSet2020}). Therefore regular expression matching
+ − 281
libraries that rely on the classic notion of DFAs often impose
+ − 282
adhoc limits for bounded regular expressions: For example in the
571
+ − 283
regular expression matching library in the Go language and also in Google's RE2 library the regular expression
569
+ − 284
@{term "NTIMES a 1001"} is not permitted, because no counter can be
643
+ − 285
above 1000; and in the regular expression library in Rust
569
+ − 286
expressions such as @{text "a\<^bsup>{1000}{100}{5}\<^esup>"} give an error
642
+ − 287
message for being too big. Up until recently,\footnote{up until version 1.5.4 of the regex
+ − 288
library in Rust; see also CVE-2022-24713.} Rust
643
+ − 289
however happily generated automata for regular expressions such as
642
+ − 290
@{text "a\<^bsup>{0}{4294967295}\<^esup>"}. This was due to a bug
+ − 291
in the algorithm that decides when a regular expression is acceptable
643
+ − 292
or too big according to Rust's classification (it did not account for the fact that @{text "a\<^bsup>{0}\<^esup>"} and similar examples can match the empty string). We shall come back to
642
+ − 293
this example later in the paper.
+ − 294
These problems can of course be solved in matching
569
+ − 295
algorithms where automata go beyond the classic notion and for
643
+ − 296
instance include explicit counters (e.g.~\cite{CountingSet2020}).
569
+ − 297
The point here is that Brzozowski derivatives and the algorithms by
+ − 298
Sulzmann and Lu can be straightforwardly extended to deal with
+ − 299
bounded regular expressions and moreover the resulting code
+ − 300
still consists of only simple recursive functions and inductive
+ − 301
datatypes. Finally, bounded regular expressions
+ − 302
do not destroy our finite boundedness property, which we shall
642
+ − 303
prove later on.
+ − 304
+ − 305
%, because during the lexing process counters will only be
569
+ − 306
%decremented.
+ − 307
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 308
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 309
Central to Brzozowski's regular expression matcher are two functions
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 310
called @{text nullable} and \emph{derivative}. The latter is written
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 311
$r\backslash c$ for the derivative of the regular expression $r$
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 312
w.r.t.~the character $c$. Both functions are defined by recursion over
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 313
regular expressions.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 314
%
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 315
\begin{center}
615
+ − 316
\begin{tabular}{c @ {\hspace{-9mm}}c}
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 317
\begin{tabular}{r@ {\hspace{2mm}}c@ {\hspace{2mm}}l}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 318
@{thm (lhs) der.simps(1)} & $\dn$ & @{thm (rhs) der.simps(1)}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 319
@{thm (lhs) der.simps(2)} & $\dn$ & @{thm (rhs) der.simps(2)}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 320
@{thm (lhs) der.simps(3)} & $\dn$ & @{thm (rhs) der.simps(3)}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 321
@{thm (lhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 322
@{thm (lhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{text "if"} @{term "nullable(r\<^sub>1)"}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 323
& & @{text "then"} @{term "ALT (SEQ (der c r\<^sub>1) r\<^sub>2) (der c r\<^sub>2)"}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 324
& & @{text "else"} @{term "SEQ (der c r\<^sub>1) r\<^sub>2"}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 325
% & & @{thm (rhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]}\\
569
+ − 326
@{thm (lhs) der.simps(6)} & $\dn$ & @{thm (rhs) der.simps(6)}\\
+ − 327
@{thm (lhs) der.simps(7)} & $\dn$ & @{thm (rhs) der.simps(7)}
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 328
\end{tabular}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 329
&
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 330
\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 331
@{thm (lhs) nullable.simps(1)} & $\dn$ & @{thm (rhs) nullable.simps(1)}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 332
@{thm (lhs) nullable.simps(2)} & $\dn$ & @{thm (rhs) nullable.simps(2)}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 333
@{thm (lhs) nullable.simps(3)} & $\dn$ & @{thm (rhs) nullable.simps(3)}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 334
@{thm (lhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 335
@{thm (lhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
569
+ − 336
@{thm (lhs) nullable.simps(6)} & $\dn$ & @{thm (rhs) nullable.simps(6)}\\
+ − 337
@{thm (lhs) nullable.simps(7)} & $\dn$ & @{text "if"} @{text "n = 0"}\\
+ − 338
& & @{text "then"} @{const "True"}\\
+ − 339
& & @{text "else"} @{term "nullable r"}
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 340
\end{tabular}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 341
\end{tabular}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 342
\end{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 343
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 344
\noindent
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 345
We can extend this definition to give derivatives w.r.t.~strings,
615
+ − 346
namely as @{thm (lhs) ders.simps(1)}$\dn$@{thm (rhs) ders.simps(1)}
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 347
and @{thm (lhs) ders.simps(2)}$\dn$@{thm (rhs) ders.simps(2)}.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 348
Using @{text nullable} and the derivative operation, we can
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 349
define a simple regular expression matcher, namely
615
+ − 350
$@{text "match s r"} \dn @{term nullable}(r\backslash s)$.
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 351
This is essentially Brzozowski's algorithm from 1964. Its
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 352
main virtue is that the algorithm can be easily implemented as a
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 353
functional program (either in a functional programming language or in
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 354
a theorem prover). The correctness of @{text match} amounts to
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 355
establishing the property:%\footnote{It is a fun exercise to formally prove this property in a theorem prover.}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 356
%
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 357
\begin{proposition}\label{matchcorr}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 358
@{text "match s r"} \;\;\text{if and only if}\;\; $s \in L(r)$
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 359
\end{proposition}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 360
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 361
\noindent
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 362
It is a fun exercise to formally prove this property in a theorem prover.
569
+ − 363
We are aware
+ − 364
of a mechanised correctness proof of Brzozowski's derivative-based matcher in HOL4 by
+ − 365
Owens and Slind~\cite{Owens2008}. Another one in Isabelle/HOL is part
+ − 366
of the work by Krauss and Nipkow~\cite{Krauss2011}. And another one
+ − 367
in Coq is given by Coquand and Siles \cite{Coquand2012}.
+ − 368
Also Ribeiro and Du Bois give one in Agda~\cite{RibeiroAgda2017}.
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 369
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 370
The novel idea of Sulzmann and Lu is to extend this algorithm for
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 371
lexing, where it is important to find out which part of the string
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 372
is matched by which part of the regular expression.
615
+ − 373
For this Sulzmann and Lu presented two lexing algorithms in their
+ − 374
paper~\cite{Sulzmann2014}. The first algorithm consists of two phases: first a
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 375
matching phase (which is Brzozowski's algorithm) and then a value
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 376
construction phase. The values encode \emph{how} a regular expression
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 377
matches a string. \emph{Values} are defined as the inductive datatype
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 378
%
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 379
\begin{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 380
@{text "v ::="}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 381
@{const "Void"} $\mid$
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 382
@{term "val.Char c"} $\mid$
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 383
@{term "Left v"} $\mid$
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 384
@{term "Right v"} $\mid$
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 385
@{term "Seq v\<^sub>1 v\<^sub>2"} $\mid$
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 386
@{term "Stars vs"}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 387
\end{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 388
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 389
\noindent where we use @{term vs} to stand for a list of values. The
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 390
string underlying a value can be calculated by a @{const flat}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 391
function, written @{term "flat DUMMY"}. It traverses a value and
616
+ − 392
collects the characters contained in it (see \cite{AusafDyckhoffUrban2016}).
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 393
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 394
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 395
Sulzmann and Lu also define inductively an
569
+ − 396
inhabitation relation that associates values to regular expressions. Our
616
+ − 397
version of this relation is defined by the following six rules:
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 398
%
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 399
\begin{center}
615
+ − 400
\begin{tabular}{@ {}l@ {}}
644
+ − 401
@{thm[mode=Axiom] Prf.intros(4)}\qquad
+ − 402
@{thm[mode=Rule] Prf.intros(2)[of "v\<^sub>1" "r\<^sub>1" "r\<^sub>2"]}\qquad
+ − 403
@{thm[mode=Rule] Prf.intros(3)[of "v\<^sub>2" "r\<^sub>2" "r\<^sub>1"]}\qquad
615
+ − 404
@{thm[mode=Rule] Prf.intros(1)[of "v\<^sub>1" "r\<^sub>1" "v\<^sub>2" "r\<^sub>2"]}\medskip\\
569
+ − 405
@{thm[mode=Axiom] Prf.intros(5)[of "c"]}\qquad
+ − 406
@{thm[mode=Rule] Prf.intros(6)[of "vs" "r"]}\qquad
+ − 407
$\mprset{flushleft}\inferrule{
+ − 408
@{thm (prem 1) Prf.intros(7)[of "vs\<^sub>1" "r" "vs\<^sub>2" "n"]}\\\\
642
+ − 409
@{thm (prem 2) Prf.intros(7)[of "vs\<^sub>1" "r" "vs\<^sub>2" "n"]}\quad
569
+ − 410
@{thm (prem 3) Prf.intros(7)[of "vs\<^sub>1" "r" "vs\<^sub>2" "n"]}
+ − 411
}
+ − 412
{@{thm (concl) Prf.intros(7)[of "vs\<^sub>1" "r" "vs\<^sub>2" "n"]}}
+ − 413
$
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 414
\end{tabular}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 415
\end{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 416
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 417
\noindent Note that no values are associated with the regular expression
569
+ − 418
@{term ZERO}, since it cannot match any string. Interesting is our version
+ − 419
of the rule for @{term "STAR r"} where we require that each value
+ − 420
in @{term vs} flattens to a non-empty string. This means if @{term "STAR r"} ``fires''
+ − 421
one or more times, then each copy needs to match a non-empty string.
+ − 422
Similarly, in the rule
+ − 423
for @{term "NTIMES r n"} we require that the length of the list
+ − 424
@{term "vs\<^sub>1 @ vs\<^sub>2"} equals @{term n} (meaning the regular expression
+ − 425
@{text r} matches @{text n}-times) and that the first segment of this list
+ − 426
contains values that flatten to non-empty strings followed by a segment that
+ − 427
only contains values that flatten to the empty string.
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 428
It is routine to establish how values ``inhabiting'' a regular
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 429
expression correspond to the language of a regular expression, namely
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 430
@{thm L_flat_Prf}.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 431
616
+ − 432
In general there is more than one value inhabiting a regular
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 433
expression (meaning regular expressions can typically match more
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 434
than one string). But even when fixing a string from the language of the
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 435
regular expression, there are generally more than one way of how the
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 436
regular expression can match this string. POSIX lexing is about
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 437
identifying the unique value for a given regular expression and a
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 438
string that satisfies the informal POSIX rules (see
499
+ − 439
\cite{POSIX,Kuklewicz,OkuiSuzuki2010,Sulzmann2014,Vansummeren2006}).
+ − 440
%\footnote{POSIX
+ − 441
% lexing acquired its name from the fact that the corresponding
+ − 442
% rules were described as part of the POSIX specification for
+ − 443
% Unix-like operating systems \cite{POSIX}.}
+ − 444
Sometimes these
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 445
informal rules are called \emph{maximal munch rule} and \emph{rule priority}.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 446
One contribution of our earlier paper is to give a convenient
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 447
specification for what POSIX values are (the inductive rules are shown in
616
+ − 448
Fig~\ref{POSIXrules}).
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 449
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 450
\begin{figure}[t]
499
+ − 451
\begin{center}\small%
569
+ − 452
\begin{tabular}{@ {\hspace{-2mm}}c@ {}}
642
+ − 453
\\[-8.5mm]
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 454
@{thm[mode=Axiom] Posix.intros(1)}\<open>P\<close>@{term "ONE"} \quad
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 455
@{thm[mode=Axiom] Posix.intros(2)}\<open>P\<close>@{term "c"}\quad
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 456
@{thm[mode=Rule] Posix.intros(3)[of "s" "r\<^sub>1" "v" "r\<^sub>2"]}\<open>P+L\<close>\quad
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 457
@{thm[mode=Rule] Posix.intros(4)[of "s" "r\<^sub>2" "v" "r\<^sub>1"]}\<open>P+R\<close>\medskip\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 458
$\mprset{flushleft}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 459
\inferrule
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 460
{@{thm (prem 1) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \qquad
642
+ − 461
@{thm (prem 2) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \qquad
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 462
@{thm (prem 3) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}
642
+ − 463
{@{thm (concl) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}$\<open>PS\<close>\medskip\\
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 464
@{thm[mode=Axiom] Posix.intros(7)}\<open>P[]\<close>\qquad
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 465
$\mprset{flushleft}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 466
\inferrule
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 467
{@{thm (prem 1) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 468
@{thm (prem 2) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 469
@{thm (prem 3) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \\\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 470
@{thm (prem 4) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}
642
+ − 471
{@{thm (concl) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}$\<open>P\<star>\<close>\medskip\\
569
+ − 472
\mprset{sep=4mm}
642
+ − 473
@{thm[mode=Rule] Posix.intros(9)}\<open>Pn[]\<close>\quad\;
569
+ − 474
$\mprset{flushleft}
+ − 475
\inferrule
+ − 476
{@{thm (prem 1) Posix.intros(8)[of "s\<^sub>1" "r" "v" "s\<^sub>2" n "vs"]} \qquad
+ − 477
@{thm (prem 2) Posix.intros(8)[of "s\<^sub>1" "r" "v" "s\<^sub>2" n "vs"]} \qquad
+ − 478
@{thm (prem 3) Posix.intros(8)[of "s\<^sub>1" "r" "v" "s\<^sub>2" n "vs"]} \\\\
+ − 479
@{thm (prem 4) Posix.intros(8)[of "s\<^sub>1" "r" "v" "s\<^sub>2" n "vs"]}}
+ − 480
{@{thm (concl) Posix.intros(8)[of "s\<^sub>1" "r" "v" "s\<^sub>2" n "vs"]}}$\<open>Pn+\<close>
+ − 481
\\[-4mm]
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 482
\end{tabular}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 483
\end{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 484
\caption{The inductive definition of POSIX values taken from our earlier paper \cite{AusafDyckhoffUrban2016}. The ternary relation, written $(s, r) \rightarrow v$, formalises the notion
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 485
of given a string $s$ and a regular
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 486
expression $r$ what is the unique value $v$ that satisfies the informal POSIX constraints for
569
+ − 487
regular expression matching.\smallskip}\label{POSIXrules}
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 488
\end{figure}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 489
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 490
The clever idea by Sulzmann and Lu \cite{Sulzmann2014} in their first algorithm is to define
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 491
an injection function on values that mirrors (but inverts) the
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 492
construction of the derivative on regular expressions. Essentially it
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 493
injects back a character into a value.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 494
For this they define two functions called @{text mkeps} and @{text inj}:
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 495
%
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 496
\begin{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 497
\begin{tabular}{@ {}l@ {}}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 498
\begin{tabular}{@ {}lcl@ {}}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 499
@{thm (lhs) mkeps.simps(1)} & $\dn$ & @{thm (rhs) mkeps.simps(1)}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 500
@{thm (lhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 501
@{thm (lhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 502
@{thm (lhs) mkeps.simps(4)} & $\dn$ & @{thm (rhs) mkeps.simps(4)}\\
642
+ − 503
@{thm (lhs) mkeps.simps(5)} & $\dn$ & @{thm (rhs) mkeps.simps(5)}
615
+ − 504
\end{tabular}
642
+ − 505
%\end{tabular}
+ − 506
%\end{center}
+ − 507
\smallskip\\
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 508
642
+ − 509
%\begin{center}
+ − 510
%\begin{tabular}{@ {}cc@ {}}
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 511
\begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}@ {}}
642
+ − 512
@{thm (lhs) injval.simps(1)} & $\dn$ & @{thm (rhs) injval.simps(1)[of "c"]}\\
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 513
@{thm (lhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]} & $\dn$ &
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 514
@{thm (rhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 515
@{thm (lhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$ &
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 516
@{thm (rhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\
615
+ − 517
%!
+ − 518
@{thm (lhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$
+ − 519
& @{thm (rhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\
+ − 520
@{thm (lhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$
+ − 521
& @{thm (rhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\
+ − 522
@{thm (lhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$ &
+ − 523
@{thm (rhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\
+ − 524
%!
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 525
@{thm (lhs) injval.simps(7)[of "r" "c" "v" "vs"]} & $\dn$
569
+ − 526
& @{thm (rhs) injval.simps(7)[of "r" "c" "v" "vs"]}\\
+ − 527
@{thm (lhs) injval.simps(8)[of "r" "n" "c" "v" "vs"]} & $\dn$
+ − 528
& @{thm (rhs) injval.simps(8)[of "r" "n" "c" "v" "vs"]}
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 529
\end{tabular}
615
+ − 530
%!&
+ − 531
%!\begin{tabular}{@ {\hspace{-3mm}}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}@ {}}
+ − 532
+ − 533
%!\end{tabular}
+ − 534
%!\end{tabular}
+ − 535
\end{tabular}%!\smallskip
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 536
\end{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 537
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 538
\noindent
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 539
The function @{text mkeps} is run when the last derivative is nullable, that is
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 540
the string to be matched is in the language of the regular expression. It generates
569
+ − 541
a value for how the last derivative can match the empty string. In case
+ − 542
of @{term "NTIMES r n"} we use the function @{term replicate} in order to generate
+ − 543
a list of exactly @{term n} copies, which is the length of the list we expect in this
642
+ − 544
case. The injection function\footnote{While the character argument @{text c} is not
+ − 545
strictly necessary in the @{text inj}-function for the fragment of regular expressions we
644
+ − 546
use in this paper, it is necessary for extended regular expressions. For example for the range regular expression of the form @{text "[a-z]"}.
642
+ − 547
We therefore keep this argument from the original formulation of @{text inj} by Sulzmann and Lu.}
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 548
then calculates the corresponding value for each intermediate derivative until
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 549
a value for the original regular expression is generated.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 550
Graphically the algorithm by
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 551
Sulzmann and Lu can be illustrated by the following picture %in Figure~\ref{Sulz}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 552
where the path from the left to the right involving @{term derivatives}/@{const
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 553
nullable} is the first phase of the algorithm (calculating successive
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 554
\Brz's derivatives) and @{const mkeps}/@{text inj}, the path from right to
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 555
left, the second phase.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 556
%
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 557
\begin{center}
642
+ − 558
\begin{tikzpicture}[scale=0.85,node distance=8mm,
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 559
every node/.style={minimum size=6mm}]
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 560
\node (r1) {@{term "r\<^sub>1"}};
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 561
\node (r2) [right=of r1]{@{term "r\<^sub>2"}};
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 562
\draw[->,line width=1mm](r1)--(r2) node[above,midway] {@{term "der a DUMMY"}};
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 563
\node (r3) [right=of r2]{@{term "r\<^sub>3"}};
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 564
\draw[->,line width=1mm](r2)--(r3) node[above,midway] {@{term "der b DUMMY"}};
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 565
\node (r4) [right=of r3]{@{term "r\<^sub>4"}};
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 566
\draw[->,line width=1mm](r3)--(r4) node[above,midway] {@{term "der c DUMMY"}};
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 567
\draw (r4) node[anchor=west] {\;\raisebox{3mm}{@{term nullable}}};
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 568
\node (v4) [below=of r4]{@{term "v\<^sub>4"}};
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 569
\draw[->,line width=1mm](r4) -- (v4);
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 570
\node (v3) [left=of v4] {@{term "v\<^sub>3"}};
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 571
\draw[->,line width=1mm](v4)--(v3) node[below,midway] {\<open>inj r\<^sub>3 c\<close>};
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 572
\node (v2) [left=of v3]{@{term "v\<^sub>2"}};
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 573
\draw[->,line width=1mm](v3)--(v2) node[below,midway] {\<open>inj r\<^sub>2 b\<close>};
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 574
\node (v1) [left=of v2] {@{term "v\<^sub>1"}};
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 575
\draw[->,line width=1mm](v2)--(v1) node[below,midway] {\<open>inj r\<^sub>1 a\<close>};
642
+ − 576
\draw (r4) node[anchor=north west] {\;\raisebox{-5mm}{@{term "mkeps"}}};
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 577
\end{tikzpicture}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 578
\end{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 579
%
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 580
\noindent
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 581
The picture shows the steps required when a
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 582
regular expression, say @{text "r\<^sub>1"}, matches the string @{term
499
+ − 583
"[a,b,c]"}. The first lexing algorithm by Sulzmann and Lu can be defined as:%\\[-8mm]
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 584
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 585
% \begin{figure}[t]
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 586
%\begin{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 587
%\begin{tikzpicture}[scale=1,node distance=1cm,
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 588
% every node/.style={minimum size=6mm}]
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 589
%\node (r1) {@{term "r\<^sub>1"}};
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 590
%\node (r2) [right=of r1]{@{term "r\<^sub>2"}};
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 591
%\draw[->,line width=1mm](r1)--(r2) node[above,midway] {@{term "der a DUMMY"}};
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 592
%\node (r3) [right=of r2]{@{term "r\<^sub>3"}};
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 593
%\draw[->,line width=1mm](r2)--(r3) node[above,midway] {@{term "der b DUMMY"}};
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 594
%\node (r4) [right=of r3]{@{term "r\<^sub>4"}};
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 595
%\draw[->,line width=1mm](r3)--(r4) node[above,midway] {@{term "der c DUMMY"}};
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 596
%\draw (r4) node[anchor=west] {\;\raisebox{3mm}{@{term nullable}}};
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 597
%\node (v4) [below=of r4]{@{term "v\<^sub>4"}};
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 598
%\draw[->,line width=1mm](r4) -- (v4);
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 599
%\node (v3) [left=of v4] {@{term "v\<^sub>3"}};
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 600
%\draw[->,line width=1mm](v4)--(v3) node[below,midway] {\<open>inj r\<^sub>3 c\<close>};
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 601
%\node (v2) [left=of v3]{@{term "v\<^sub>2"}};
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 602
%\draw[->,line width=1mm](v3)--(v2) node[below,midway] {\<open>inj r\<^sub>2 b\<close>};
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 603
%\node (v1) [left=of v2] {@{term "v\<^sub>1"}};
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 604
%\draw[->,line width=1mm](v2)--(v1) node[below,midway] {\<open>inj r\<^sub>1 a\<close>};
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 605
%\draw (r4) node[anchor=north west] {\;\raisebox{-8mm}{@{term "mkeps"}}};
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 606
%\end{tikzpicture}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 607
%\end{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 608
%\mbox{}\\[-13mm]
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 609
%
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 610
%\caption{The two phases of the first algorithm by Sulzmann \& Lu \cite{Sulzmann2014},
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 611
%matching the string @{term "[a,b,c]"}. The first phase (the arrows from
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 612
%left to right) is \Brz's matcher building successive derivatives. If the
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 613
%last regular expression is @{term nullable}, then the functions of the
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 614
%second phase are called (the top-down and right-to-left arrows): first
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 615
%@{term mkeps} calculates a value @{term "v\<^sub>4"} witnessing
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 616
%how the empty string has been recognised by @{term "r\<^sub>4"}. After
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 617
%that the function @{term inj} ``injects back'' the characters of the string into
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 618
%the values. The value @{term "v\<^sub>1"} is the result of the algorithm representing
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 619
%the POSIX value for this string and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 620
%regular expression.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 621
%\label{Sulz}}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 622
%\end{figure}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 623
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 624
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 625
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 626
\begin{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 627
\begin{tabular}{lcl}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 628
@{thm (lhs) lexer.simps(1)} & $\dn$ & @{thm (rhs) lexer.simps(1)}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 629
@{thm (lhs) lexer.simps(2)} & $\dn$ & @{text "case"} @{term "lexer (der c r) s"} @{text of}
642
+ − 630
@{term "None"} @{text "\<Rightarrow>"} @{term None}
+ − 631
%%& & \hspace{27mm}
+ − 632
$|\;$ @{term "Some v"} @{text "\<Rightarrow>"} @{term "Some (injval r c v)"}
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 633
\end{tabular}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 634
\end{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 635
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 636
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 637
We have shown in our earlier paper \cite{AusafDyckhoffUrban2016} that
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 638
this algorithm is correct, that is it generates POSIX values. The
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 639
central property we established relates the derivative operation to the
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 640
injection function.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 641
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 642
\begin{proposition}\label{Posix2}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 643
\textit{If} $(s,\; r\backslash c) \rightarrow v$ \textit{then} $(c :: s,\; r) \rightarrow$ \textit{inj} $r\; c\; v$.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 644
\end{proposition}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 645
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 646
\noindent
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 647
With this in place we were able to prove:
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 648
499
+ − 649
\begin{proposition}\mbox{}\label{lexercorrect}
642
+ − 650
\textrm{(1)}\; @{thm (lhs) lexer_correct_None} if and only if @{thm (rhs) lexer_correct_None}.\\
+ − 651
\mbox{\hspace{29.5mm}}\textrm{(2)}\; @{thm (lhs) lexer_correct_Some} if and only if @{thm (rhs) lexer_correct_Some}.
499
+ − 652
%
+ − 653
% \smallskip\\
+ − 654
%\begin{tabular}{ll}
642
+ − 655
%(1) & @{thm (lhs) lexer_correct_None} \;if and only if\; @{thm (rhs) lexer_correct_None}\\
+ − 656
%(2) & @{thm (lhs) lexer_correct_Some} \;if and only if\; @{thm (rhs) lexer_correct_Some}\\
499
+ − 657
%\end{tabular}
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 658
\end{proposition}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 659
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 660
\noindent
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 661
In fact we have shown that, in the success case, the generated POSIX value $v$ is
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 662
unique and in the failure case that there is no POSIX value $v$ that satisfies
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 663
$(s, r) \rightarrow v$. While the algorithm is correct, it is excruciatingly
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 664
slow in cases where the derivatives grow arbitrarily (recall the example from the
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 665
Introduction). However it can be used as a convenient reference point for the correctness
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 666
proof of the second algorithm by Sulzmann and Lu, which we shall describe next.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 667
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 668
*}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 669
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 670
section {* Bitcoded Regular Expressions and Derivatives *}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 671
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 672
text {*
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 673
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 674
In the second part of their paper \cite{Sulzmann2014},
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 675
Sulzmann and Lu describe another algorithm that also generates POSIX
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 676
values but dispenses with the second phase where characters are
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 677
injected ``back'' into values. For this they annotate bitcodes to
615
+ − 678
regular expressions, which we define in Isabelle/HOL as the datatype\medskip
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 679
642
+ − 680
\begin{center}
+ − 681
%\noindent
+ − 682
%\begin{minipage}{0.9\textwidth}
615
+ − 683
\,@{term breg} $\,::=\,$ @{term "AZERO"}
+ − 684
$\,\mid$ @{term "AONE bs"}
+ − 685
$\,\mid$ @{term "ACHAR bs c"}
+ − 686
$\,\mid$ @{term "AALTs bs rs"}
+ − 687
$\,\mid$ @{term "ASEQ bs r\<^sub>1 r\<^sub>2"}
+ − 688
$\,\mid$ @{term "ASTAR bs r"}
642
+ − 689
$\,\mid$ \mbox{@{term "ANTIMES bs r n"}}%\hfill\mbox{}
+ − 690
%\end{minipage}\medskip
+ − 691
\end{center}
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 692
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 693
\noindent where @{text bs} stands for bitsequences; @{text r},
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 694
@{text "r\<^sub>1"} and @{text "r\<^sub>2"} for bitcoded regular
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 695
expressions; and @{text rs} for lists of bitcoded regular
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 696
expressions. The binary alternative @{text "ALT bs r\<^sub>1 r\<^sub>2"}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 697
is just an abbreviation for \mbox{@{text "ALTs bs [r\<^sub>1, r\<^sub>2]"}}.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 698
For bitsequences we use lists made up of the
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 699
constants @{text Z} and @{text S}. The idea with bitcoded regular
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 700
expressions is to incrementally generate the value information (for
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 701
example @{text Left} and @{text Right}) as bitsequences. For this
578
+ − 702
Sulzmann and Lu follow Nielsen and Henglein \cite{NielsenHenglein2011}
+ − 703
and define a coding
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 704
function for how values can be coded into bitsequences.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 705
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 706
\begin{center}
615
+ − 707
\begin{tabular}{c@ {\hspace{5mm}}c}
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 708
\begin{tabular}{lcl}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 709
@{thm (lhs) code.simps(1)} & $\dn$ & @{thm (rhs) code.simps(1)}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 710
@{thm (lhs) code.simps(2)} & $\dn$ & @{thm (rhs) code.simps(2)}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 711
@{thm (lhs) code.simps(3)} & $\dn$ & @{thm (rhs) code.simps(3)}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 712
@{thm (lhs) code.simps(4)} & $\dn$ & @{thm (rhs) code.simps(4)}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 713
\end{tabular}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 714
&
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 715
\begin{tabular}{lcl}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 716
@{thm (lhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]} & $\dn$ & @{thm (rhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 717
@{thm (lhs) code.simps(6)} & $\dn$ & @{thm (rhs) code.simps(6)}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 718
@{thm (lhs) code.simps(7)} & $\dn$ & @{thm (rhs) code.simps(7)}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 719
\mbox{\phantom{XX}}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 720
\end{tabular}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 721
\end{tabular}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 722
\end{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 723
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 724
\noindent
642
+ − 725
As can be seen, this coding is ``lossy'' in the sense that it does not
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 726
record explicitly character values and also not sequence values (for
642
+ − 727
them it just appends two bitsequences). However, the
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 728
different alternatives for @{text Left}, respectively @{text Right}, are recorded as @{text Z} and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 729
@{text S} followed by some bitsequence. Similarly, we use @{text Z} to indicate
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 730
if there is still a value coming in the list of @{text Stars}, whereas @{text S}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 731
indicates the end of the list. The lossiness makes the process of
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 732
decoding a bit more involved, but the point is that if we have a
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 733
regular expression \emph{and} a bitsequence of a corresponding value,
615
+ − 734
then we can always decode the value accurately~(see Fig.~\ref{decode}).
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 735
The function \textit{decode} checks whether all of the bitsequence is
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 736
consumed and returns the corresponding value as @{term "Some v"}; otherwise
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 737
it fails with @{text "None"}. We can establish that for a value $v$
616
+ − 738
inhabiting a regular expression $r$, the decoding of its
578
+ − 739
bitsequence never fails (see also \cite{NielsenHenglein2011}).
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 740
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 741
%The decoding can be
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 742
%defined by using two functions called $\textit{decode}'$ and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 743
%\textit{decode}:
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 744
615
+ − 745
\begin{figure}[t]
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 746
\begin{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 747
\begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
599
+ − 748
$\textit{decode}'\,bs\,(\ONE)$ & $\;\dn\;$ & $(\Empty, bs)$\\
+ − 749
$\textit{decode}'\,bs\,(c)$ & $\;\dn\;$ & $(\Char\,c, bs)$\\
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 750
$\textit{decode}'\,(\Z\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 751
$\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}\;
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 752
(\Left\,v, bs_1)$\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 753
$\textit{decode}'\,(\S\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 754
$\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_2\;\textit{in}\;
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 755
(\Right\,v, bs_1)$\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 756
$\textit{decode}'\,bs\;(r_1\cdot r_2)$ & $\dn$ &
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 757
$\textit{let}\,(v_1, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}$\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 758
& & $\textit{let}\,(v_2, bs_2) = \textit{decode}'\,bs_1\,r_2$
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 759
\hspace{2mm}$\textit{in}\;(\Seq\,v_1\,v_2, bs_2)$\\
569
+ − 760
$\textit{decode}'\,(\S\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\
+ − 761
$\textit{decode}'\,(\Z\!::\!bs)\,(r^*)$ & $\dn$ &
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 762
$\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r\;\textit{in}$\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 763
& & $\textit{let}\,(\Stars\,vs, bs_2) = \textit{decode}'\,bs_1\,r^*$
569
+ − 764
\hspace{2mm}$\textit{in}\;(\Stars\,v\!::\!vs, bs_2)$\\
615
+ − 765
$\textit{decode}'\,bs\,(r^{\{n\}})$ & $\dn$ & $\textit{decode}'\,bs\,r^*$\smallskip\smallskip\\
599
+ − 766
+ − 767
$\textit{decode}\,bs\,r$ & $\dn$ &
+ − 768
$\textit{let}\,(v, bs') = \textit{decode}'\,bs\,r\;\textit{in}$\\
615
+ − 769
& & $\;\;\;\,\textit{if}\;bs' = []\;\textit{then}\;\textit{Some}\,v\;\textit{else}\;\textit{None}$\\[-5mm]
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 770
\end{tabular}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 771
\end{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 772
\caption{Two functions, called $\textit{decode}'$ and \textit{decode}, for decoding a value from a bitsequence with the help of a regular expression.\\[-5mm]}\label{decode}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 773
\end{figure}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 774
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 775
%\noindent
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 776
%The function \textit{decode} checks whether all of the bitsequence is
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 777
%consumed and returns the corresponding value as @{term "Some v"}; otherwise
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 778
%it fails with @{text "None"}. We can establish that for a value $v$
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 779
%inhabited by a regular expression $r$, the decoding of its
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 780
%bitsequence never fails.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 781
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 782
\begin{lemma}\label{codedecode}\it
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 783
If $\;\vdash v : r$ then
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 784
$\;\textit{decode}\,(\textit{code}\, v)\,r = \textit{Some}\, v$.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 785
\end{lemma}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 786
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 787
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 788
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 789
Sulzmann and Lu define the function \emph{internalise}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 790
in order to transform (standard) regular expressions into annotated
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 791
regular expressions. We write this operation as $r^\uparrow$.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 792
This internalisation uses the following
615
+ − 793
\emph{fuse} function.\medskip
+ − 794
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 795
%
615
+ − 796
%!\begin{center}
+ − 797
\noindent\begin{minipage}{\textwidth}
+ − 798
\begin{tabular}{@ {}c@ {\hspace{3mm}}c@ {}}
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 799
\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 800
$\textit{fuse}\,bs\,(\textit{ZERO})$ & $\dn$ & $\textit{ZERO}$\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 801
$\textit{fuse}\,bs\,(\textit{ONE}\,bs')$ & $\dn$ &
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 802
$\textit{ONE}\,(bs\,@\,bs')$\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 803
$\textit{fuse}\,bs\,(\textit{CHAR}\,bs'\,c)$ & $\dn$ &
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 804
$\textit{CHAR}\,(bs\,@\,bs')\,c$
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 805
\end{tabular}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 806
&
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 807
\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 808
$\textit{fuse}\,bs\,(\textit{ALTs}\,bs'\,rs)$ & $\dn$ &
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 809
$\textit{ALTs}\,(bs\,@\,bs')\,rs$\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 810
$\textit{fuse}\,bs\,(\textit{SEQ}\,bs'\,r_1\,r_2)$ & $\dn$ &
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 811
$\textit{SEQ}\,(bs\,@\,bs')\,r_1\,r_2$\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 812
$\textit{fuse}\,bs\,(\textit{STAR}\,bs'\,r)$ & $\dn$ &
569
+ − 813
$\textit{STAR}\,(bs\,@\,bs')\,r$\\
+ − 814
$\textit{fuse}\,bs\,(\textit{NT}\,bs'\,r\,n)$ & $\dn$ &
+ − 815
$\textit{NT}\,(bs\,@\,bs')\,r\,n$
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 816
\end{tabular}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 817
\end{tabular}
615
+ − 818
\end{minipage}\medskip
+ − 819
%!\end{center}
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 820
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 821
\noindent
616
+ − 822
This function ``fuses'' a bitsequence to the topmost constructor of an bitcoded regular expressions.
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 823
A regular expression can then be \emph{internalised} into a bitcoded
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 824
regular expression as follows:
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 825
%
615
+ − 826
%!\begin{center}
+ − 827
%!\begin{tabular}{@ {}c@ {\hspace{2mm}}c@ {\hspace{2mm}}c@ {}}
+ − 828
%!\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
+ − 829
%!$(\ZERO)^\uparrow$ & $\dn$ & $\textit{ZERO}$\\
+ − 830
%!$(\ONE)^\uparrow$ & $\dn$ & $\textit{ONE}\,[]$
+ − 831
%!\end{tabular}
+ − 832
%!&
+ − 833
%!\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
+ − 834
%!$(c)^\uparrow$ & $\dn$ & $\textit{CHAR}\,[]\,c$\\
+ − 835
%!$(r^*)^\uparrow$ & $\dn$ & $\textit{STAR}\;[]\,r^\uparrow$\\
+ − 836
%!$(r^{\{n\}})^\uparrow$ & $\dn$ &
+ − 837
%! $\textit{NT}\;[]\,r^\uparrow\,n$
+ − 838
%!\end{tabular}
+ − 839
%!&
+ − 840
%!\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
+ − 841
%!$(r_1 + r_2)^\uparrow$ & $\dn$ &
+ − 842
%! $\textit{ALT}\;[]\,(\textit{fuse}\,[\Z]\,r_1^\uparrow)\,
+ − 843
%! (\textit{fuse}\,[\S]\,r_2^\uparrow)$\\
+ − 844
%!$(r_1\cdot r_2)^\uparrow$ & $\dn$ &
+ − 845
%! $\textit{SEQ}\;[]\,r_1^\uparrow\,r_2^\uparrow$
+ − 846
%!\end{tabular}
+ − 847
%!\end{tabular}
+ − 848
%!\end{center}
+ − 849
%!
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 850
\begin{center}
615
+ − 851
\begin{tabular}{@ {}c@ {\hspace{7mm}}c@ {}}
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 852
\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 853
$(\ZERO)^\uparrow$ & $\dn$ & $\textit{ZERO}$\\
615
+ − 854
$(\ONE)^\uparrow$ & $\dn$ & $\textit{ONE}\,[]$\\
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 855
$(c)^\uparrow$ & $\dn$ & $\textit{CHAR}\,[]\,c$\\
569
+ − 856
$(r^*)^\uparrow$ & $\dn$ & $\textit{STAR}\;[]\,r^\uparrow$\\
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 857
\end{tabular}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 858
&
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 859
\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 860
$(r_1 + r_2)^\uparrow$ & $\dn$ &
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 861
$\textit{ALT}\;[]\,(\textit{fuse}\,[\Z]\,r_1^\uparrow)\,
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 862
(\textit{fuse}\,[\S]\,r_2^\uparrow)$\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 863
$(r_1\cdot r_2)^\uparrow$ & $\dn$ &
615
+ − 864
$\textit{SEQ}\;[]\,r_1^\uparrow\,r_2^\uparrow$\\
+ − 865
$(r^{\{n\}})^\uparrow$ & $\dn$ &
+ − 866
$\textit{NT}\;[]\,r^\uparrow\,n$
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 867
\end{tabular}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 868
\end{tabular}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 869
\end{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 870
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 871
\noindent
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 872
There is also an \emph{erase}-function, written $r^\downarrow$, which
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 873
transforms a bitcoded regular expression into a (standard) regular
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 874
expression by just erasing the annotated bitsequences. We omit the
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 875
straightforward definition. For defining the algorithm, we also need
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 876
the functions \textit{bnullable} and \textit{bmkeps}(\textit{s}), which are the
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 877
``lifted'' versions of \textit{nullable} and \textit{mkeps} acting on
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 878
bitcoded regular expressions.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 879
%
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 880
\begin{center}
643
+ − 881
\begin{tabular}{@ {\hspace{-1mm}}c@ {\hspace{10mm}}c@ {}}
569
+ − 882
\begin{tabular}{@ {}l@ {\hspace{0.5mm}}c@ {\hspace{1mm}}l}
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 883
$\textit{bnullable}\,(\textit{ZERO})$ & $\dn$ & $\textit{False}$\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 884
$\textit{bnullable}\,(\textit{ONE}\,bs)$ & $\dn$ & $\textit{True}$\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 885
$\textit{bnullable}\,(\textit{CHAR}\,bs\,c)$ & $\dn$ & $\textit{False}$\\
615
+ − 886
$\textit{bnullable}\,(\textit{ALTs}\,bs\,\rs)$ & $\dn$ &\\
+ − 887
\multicolumn{3}{l}{$\quad\exists\, r \in \rs. \,\textit{bnullable}\,r$}\\
+ − 888
$\textit{bnullable}\,(\textit{SEQ}\,bs\,r_1\,r_2)$ & $\dn$ &\\
+ − 889
\multicolumn{3}{l}{$\quad\textit{bnullable}\,r_1\wedge \textit{bnullable}\,r_2$}\\
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 890
$\textit{bnullable}\,(\textit{STAR}\,bs\,r)$ & $\dn$ &
569
+ − 891
$\textit{True}$\\
+ − 892
$\textit{bnullable}\,(\textit{NT}\,bs\,r\,n)$ & $\dn$ &\\
615
+ − 893
\multicolumn{3}{l}{$\quad\textit{if}\;n = 0\;\textit{then}\;\textit{True}\;
569
+ − 894
\textit{else}\;\textit{bnullable}\,r$}\\
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 895
\end{tabular}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 896
&
569
+ − 897
\begin{tabular}{@ {}l@ {\hspace{0.5mm}}c@ {\hspace{1mm}}l@ {}}
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 898
$\textit{bmkeps}\,(\textit{ONE}\,bs)$ & $\dn$ & $bs$\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 899
$\textit{bmkeps}\,(\textit{ALTs}\,bs\,\rs)$ & $\dn$ &
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 900
$bs\,@\,\textit{bmkepss}\,\rs$\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 901
$\textit{bmkeps}\,(\textit{SEQ}\,bs\,r_1\,r_2)$ & $\dn$ &\\
615
+ − 902
\multicolumn{3}{l}{$\quad{}bs \,@\,\textit{bmkeps}\,r_1\,@\, \textit{bmkeps}\,r_2$}\\
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 903
$\textit{bmkeps}\,(\textit{STAR}\,bs\,r)$ & $\dn$ &
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 904
$bs \,@\, [\S]$\\
569
+ − 905
$\textit{bmkeps}\,(\textit{NT}\,bs\,r\,n)$ & $\dn$ &\\
615
+ − 906
\multicolumn{3}{l@ {}}{$\quad\textit{if}\;n=0\;\textit{then}\;bs \,@\, [\S]$}\\
+ − 907
\multicolumn{3}{l@ {}}{$\quad\textit{else}\,bs \,@\, [\Z] \,@\, \textit{bmkeps}\,r\,@\,
569
+ − 908
\textit{bmkeps}\,(@{term "ANTIMES [] r (n - 1)"})$}\\
+ − 909
$\textit{bmkepss}\,(r\!::\!\rs)$ & $\dn$ &\\
615
+ − 910
\multicolumn{3}{l}{$\quad\textit{if}\;\textit{bnullable}\,r\,\textit{then}\;\textit{bmkeps}\,r\;
569
+ − 911
\textit{else}\;\textit{bmkepss}\,\rs$}
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 912
\end{tabular}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 913
\end{tabular}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 914
\end{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 915
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 916
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 917
\noindent
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 918
The key function in the bitcoded algorithm is the derivative of a
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 919
bitcoded regular expression. This derivative function calculates the
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 920
derivative but at the same time also the incremental part of the bitsequences
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 921
that contribute to constructing a POSIX value.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 922
%
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 923
\begin{center}
615
+ − 924
\begin{tabular}{@ {}l@ {\hspace{-1mm}}cl@ {}}
569
+ − 925
$(\textit{ZERO})\backslash c$ & $\;\dn\;$ & $\textit{ZERO}$\\
+ − 926
$(\textit{ONE}\;bs)\backslash c$ & $\;\dn\;$ & $\textit{ZERO}$\\
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 927
$(\textit{CHAR}\;bs\,d)\backslash c$ & $\dn$ &
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 928
$\textit{if}\;c=d\; \;\textit{then}\;
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 929
\textit{ONE}\;bs\;\textit{else}\;\textit{ZERO}$\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 930
$(\textit{ALTs}\;bs\,\rs)\backslash c$ & $\dn$ &
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 931
$\textit{ALTs}\,bs\,(\mathit{map}\,(\_\backslash c)\,\rs)$\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 932
$(\textit{SEQ}\;bs\,r_1\,r_2)\backslash c$ & $\dn$ &
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 933
$\textit{if}\;\textit{bnullable}\,r_1$\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 934
& &$\textit{then}\;\textit{ALT}\,bs\;(\textit{SEQ}\,[]\,(r_1\backslash c)\,r_2)$
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 935
$\;(\textit{fuse}\,(\textit{bmkeps}\,r_1)\,(r_2\backslash c))$\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 936
& &$\textit{else}\;\textit{SEQ}\,bs\,(r_1\backslash c)\,r_2$\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 937
$(\textit{STAR}\,bs\,r)\backslash c$ & $\dn$ &
569
+ − 938
$\textit{SEQ}\;(bs\,@\,[\Z])\,(r\backslash c)\,
+ − 939
(\textit{STAR}\,[]\,r)$\\
+ − 940
$(\textit{NT}\,bs\,r\,n)\backslash c$ & $\dn$ &
+ − 941
$\textit{if}\;n = 0\; \textit{then}\;\textit{ZERO}\;\textit{else}\;
+ − 942
\textit{SEQ}\;(bs\,@\,[\Z])\,(r\backslash c)\,
+ − 943
(\textit{NT}\,[]\,r\,(n - 1))$
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 944
\end{tabular}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 945
\end{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 946
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 947
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 948
\noindent
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 949
This function can also be extended to strings, written $r\backslash s$,
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 950
just like the standard derivative. We omit the details. Finally we
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 951
can define Sulzmann and Lu's bitcoded lexer, which we call \textit{blexer}:
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 952
%
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 953
\begin{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 954
\begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 955
$\textit{blexer}\;r\,s$ & $\dn$ &
642
+ − 956
$\textit{let}\;r_{der} = (r^\uparrow)\backslash s\;\textit{in}$\\
+ − 957
& & $\;\;\;\textit{if}\; \textit{bnullable}(r_{der})$
+ − 958
$\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,r_{der})\,r$
+ − 959
$\;\textit{else}\;\textit{None}$
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 960
\end{tabular}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 961
\end{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 962
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 963
\noindent
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 964
This bitcoded lexer first internalises the regular expression $r$ and then
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 965
builds the bitcoded derivative according to $s$. If the derivative is
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 966
(b)nullable the string is in the language of $r$ and it extracts the bitsequence using the
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 967
$\textit{bmkeps}$ function. Finally it decodes the bitsequence into a value. If
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 968
the derivative is \emph{not} nullable, then $\textit{None}$ is
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 969
returned. We can show that this way of calculating a value
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 970
generates the same result as \textit{lexer}.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 971
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 972
Before we can proceed we need to define a helper function, called
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 973
\textit{retrieve}, which Sulzmann and Lu introduced for the correctness proof.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 974
%
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 975
\begin{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 976
\begin{tabular}{lcl}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 977
@{thm (lhs) retrieve.simps(1)} & $\dn$ & @{thm (rhs) retrieve.simps(1)}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 978
@{thm (lhs) retrieve.simps(2)} & $\dn$ & @{thm (rhs) retrieve.simps(2)}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 979
@{thm (lhs) retrieve.simps(3)} & $\dn$ & @{thm (rhs) retrieve.simps(3)}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 980
@{thm (lhs) better_retrieve(1)} & $\dn$ & @{thm (rhs) better_retrieve(1)}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 981
@{thm (lhs) better_retrieve(2)} & $\dn$ & @{thm (rhs) better_retrieve(2)}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 982
@{thm (lhs) retrieve.simps(6)[of _ "r\<^sub>1" "r\<^sub>2" "v\<^sub>1" "v\<^sub>2"]}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 983
& $\dn$ & @{thm (rhs) retrieve.simps(6)[of _ "r\<^sub>1" "r\<^sub>2" "v\<^sub>1" "v\<^sub>2"]}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 984
@{thm (lhs) retrieve.simps(7)} & $\dn$ & @{thm (rhs) retrieve.simps(7)}\\
569
+ − 985
@{thm (lhs) retrieve.simps(8)} & $\dn$ & @{thm (rhs) retrieve.simps(8)}\\
+ − 986
@{thm (lhs) retrieve.simps(9)} & $\dn$ & @{thm (rhs) retrieve.simps(9)}\\
+ − 987
@{thm (lhs) better_retrieve2} & $\dn$ & @{thm (rhs) better_retrieve2}
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 988
\end{tabular}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 989
\end{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 990
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 991
\noindent
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 992
The idea behind this function is to retrieve a possibly partial
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 993
bitsequence from a bitcoded regular expression, where the retrieval is
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 994
guided by a value. For example if the value is $\Left$ then we
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 995
descend into the left-hand side of an alternative in order to
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 996
assemble the bitcode. Similarly for
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 997
$\Right$. The property we can show is that for a given $v$ and $r$
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 998
with $\vdash v : r$, the retrieved bitsequence from the internalised
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 999
regular expression is equal to the bitcoded version of $v$.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1000
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1001
\begin{lemma}\label{retrievecode}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1002
If $\vdash v : r$ then $\textit{code}\, v = \textit{retrieve}\,(r^\uparrow)\,v$.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1003
\end{lemma}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1004
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1005
\noindent
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1006
We also need some auxiliary facts about how the bitcoded operations
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1007
relate to the ``standard'' operations on regular expressions. For
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1008
example if we build a bitcoded derivative and erase the result, this
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1009
is the same as if we first erase the bitcoded regular expression and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1010
then perform the ``standard'' derivative operation.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1011
499
+ − 1012
\begin{lemma}\label{bnullable}%\mbox{}\smallskip\\
+ − 1013
\textit{(1)} $(r\backslash s)^\downarrow = (r^\downarrow)\backslash s$\\
642
+ − 1014
\mbox{\hspace{21.5mm}}\textit{(2)} $\textit{bnullable}(r)$ iff $\textit{nullable}(r^\downarrow)$\\
+ − 1015
\mbox{\hspace{21.5mm}}\textit{(3)} $\textit{bmkeps}(r) = \textit{retrieve}\,r\,(\textit{mkeps}\,(r^\downarrow))$ provided $\textit{nullable}(r^\downarrow)$
499
+ − 1016
%\begin{tabular}{ll}
+ − 1017
%\textit{(1)} & $(r\backslash s)^\downarrow = (r^\downarrow)\backslash s$\\
+ − 1018
%\textit{(2)} & $\textit{bnullable}(r)$ iff $\textit{nullable}(r^\downarrow)$\\
642
+ − 1019
%\textit{(3)} & $\textit{bmkeps}(r) = \textit{retrieve}\,r\,(\textit{mkeps}\,(r^\downarrow))$ \;provided\; $\textit{nullable}(r^\downarrow)$.
499
+ − 1020
%\end{tabular}
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1021
\end{lemma}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1022
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1023
%\begin{proof}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1024
% All properties are by induction on annotated regular expressions.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1025
% %There are no interesting cases.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1026
%\end{proof}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1027
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1028
\noindent
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1029
The only difficulty left for the correctness proof is that the bitcoded algorithm
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1030
has only a ``forward phase'' where POSIX values are generated incrementally.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1031
We can achieve the same effect with @{text lexer} (which has two phases) by stacking up injection
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1032
functions during the forward phase. An auxiliary function, called $\textit{flex}$,
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1033
allows us to recast the rules of $\lexer$ in terms of a single
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1034
phase and stacked up injection functions.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1035
%
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1036
\begin{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1037
\begin{tabular}{@ {}l@ {}c@ {}l@ {\hspace{10mm}}l@ {}c@ {}l@ {}}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1038
$\textit{flex}\;r\,f\,[]$ & $\dn$ & $f$ &
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1039
$\textit{flex}\;r\,f\,(c\!::\!s)$ & $\dn$ &
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1040
$\textit{flex}\,(r\backslash c)\,(\lambda v.\,f\,(\inj\,r\,c\,v))\,s$
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1041
\end{tabular}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1042
\end{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1043
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1044
\noindent
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1045
The point of this function is that when
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1046
reaching the end of the string, we just need to apply the stacked up
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1047
injection functions to the value generated by @{text mkeps}.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1048
Using this function we can recast the success case in @{text lexer}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1049
as follows:
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1050
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1051
\begin{lemma}\label{flex}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1052
If @{text "lexer r s = Some v"} \;then\; @{text "v = "}$\,\textit{flex}\,r\,id\,s\,
642
+ − 1053
(\textit{mkeps}\,(r\backslash s))$.
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1054
\end{lemma}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1055
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1056
\noindent
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1057
Note we did not redefine \textit{lexer}, we just established that the
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1058
value generated by \textit{lexer} can also be obtained by a different
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1059
method. While this different method is not efficient (we essentially
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1060
need to traverse the string $s$ twice, once for building the
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1061
derivative $r\backslash s$ and another time for stacking up injection
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1062
functions), it helps us with proving
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1063
that incrementally building up values in @{text blexer} generates the same result.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1064
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1065
This brings us to our main lemma in this section: if we calculate a
616
+ − 1066
derivative, say $r\backslash s$, and have a value, say $v$, inhabiting
+ − 1067
this derivative, then we can produce the result @{text lexer} generates
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1068
by applying this value to the stacked-up injection functions
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1069
that $\textit{flex}$ assembles. The lemma establishes that this is the same
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1070
value as if we build the annotated derivative $r^\uparrow\backslash s$
616
+ − 1071
and then retrieve the bitcoded version of @{text v}, followed by a
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1072
decoding step.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1073
615
+ − 1074
\begin{lemma}[Main Lemma]\label{mainlemma}\mbox{}\\\it
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1075
If $\vdash v : r\backslash s$ then
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1076
$\textit{Some}\,(\textit{flex}\,r\,\textit{id}\,s\,v) =
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1077
\textit{decode}(\textit{retrieve}\,(r^\uparrow \backslash s)\,v)\,r$
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1078
\end{lemma}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1079
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1080
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1081
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1082
\noindent
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1083
%With this lemma in place,
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1084
We can then prove the correctness of \textit{blexer}---it indeed
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1085
produces the same result as \textit{lexer}.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1086
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1087
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1088
\begin{theorem}\label{thmone}
599
+ − 1089
$\textit{blexer}\,r\,s = \textit{lexer}\,r\,s$
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1090
\end{theorem}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1091
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1092
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1093
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1094
\noindent This establishes that the bitcoded algorithm \emph{without}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1095
simplification produces correct results. This was
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1096
only conjectured by Sulzmann and Lu in their paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1097
\cite{Sulzmann2014}. The next step is to add simplifications.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1098
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1099
*}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1100
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1101
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1102
section {* Simplification *}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1103
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1104
text {*
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1105
569
+ − 1106
Derivatives as calculated by Brzozowski's method are usually more
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1107
complex regular expressions than the initial one; the result is
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1108
that derivative-based matching and lexing algorithms are
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1109
often abysmally slow if the ``growth problem'' is not addressed. As Sulzmann and Lu wrote, various
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1110
optimisations are possible, such as the simplifications
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1111
$\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r \Rightarrow r$,
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1112
$\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow r$. While these
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1113
simplifications can considerably speed up the two algorithms in many
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1114
cases, they do not solve fundamentally the growth problem with
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1115
derivatives. To see this let us return to the example from the
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1116
Introduction that shows the derivatives for \mbox{@{text "(a + aa)\<^sup>*"}}.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1117
If we delete in the 3rd step all $\ZERO{}s$ and $\ONE$s according to
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1118
the simplification rules shown above we obtain
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1119
%
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1120
%%
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1121
%
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1122
\begin{equation}\def\xll{\xrightarrow{\_\backslash{} [a, a, a]}}\label{derivex}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1123
(a + aa)^* \quad\xll\quad
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1124
\underbrace{\mbox{$(\ONE + a) \cdot (a + aa)^*$}}_{r} \;+\;
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1125
((a + aa)^* + \underbrace{\mbox{$(\ONE + a) \cdot (a + aa)^*$}}_{r})
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1126
\end{equation}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1127
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1128
\noindent This is a simpler derivative, but unfortunately we
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1129
cannot make any further simplifications. This is a problem because
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1130
the outermost alternatives contains two copies of the same
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1131
regular expression (underlined with $r$). These copies will
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1132
spawn new copies in later derivative steps and they in turn even more copies. This
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1133
destroys any hope of taming the size of the derivatives. But the
642
+ − 1134
second copy of $r$ in~\eqref{derivex} will never contribute to a
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1135
value, because POSIX lexing will always prefer matching a string
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1136
with the first copy. So it could be safely removed without affecting the correctness of the algorithm.
615
+ − 1137
The issue with the simple-minded
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1138
simplification rules above is that the rule $r + r \Rightarrow r$
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1139
will never be applicable because as can be seen in this example the
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1140
regular expressions are not next to each other but separated by another regular expression.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1141
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1142
But here is where Sulzmann and Lu's representation of generalised
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1143
alternatives in the bitcoded algorithm shines: in @{term
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1144
"ALTs bs rs"} we can define a more aggressive simplification by
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1145
recursively simplifying all regular expressions in @{text rs} and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1146
then analyse the resulting list and remove any duplicates.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1147
Another advantage with the bitsequences in bitcoded regular
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1148
expressions is that they can be easily modified such that simplification does not
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1149
interfere with the value constructions. For example we can ``flatten'', or
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1150
de-nest, or spill out, @{text ALTs} as follows
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1151
%
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1152
\[
642
+ − 1153
@{text "ALTs bs\<^sub>1 ((ALTs bs\<^sub>2 rs\<^sub>2) :: rs\<^sub>1)"}
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1154
\quad\xrightarrow{bsimp}\quad
616
+ − 1155
@{text "ALTs bs\<^sub>1 ((map (fuse bs\<^sub>2) rs\<^sub>2) @ rs\<^sub>1)"}
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1156
\]
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1157
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1158
\noindent
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1159
where we just need to fuse the bitsequence that has accumulated in @{text "bs\<^sub>2"}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1160
to the alternatives in @{text "rs\<^sub>2"}. As we shall show below this will
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1161
ensure that the correct value corresponding to the original (unsimplified)
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1162
regular expression can still be extracted. %In this way the value construction
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1163
%is not affected by simplification.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1164
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1165
However there is one problem with the definition for the more
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1166
aggressive simplification rules described by Sulzmann and Lu. Recasting
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1167
their definition with our syntax they define the step of removing
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1168
duplicates as
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1169
%
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1170
\[ @{text "bsimp (ALTs bs rs)"} \dn @{text "ALTs
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1171
bs (nub (map bsimp rs))"}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1172
\]
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1173
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1174
\noindent where they first recursively simplify the regular
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1175
expressions in @{text rs} (using @{text map}) and then use
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1176
Haskell's @{text nub}-function to remove potential
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1177
duplicates. While this makes sense when considering the example
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1178
shown in \eqref{derivex}, @{text nub} is the inappropriate
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1179
function in the case of bitcoded regular expressions. The reason
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1180
is that in general the elements in @{text rs} will have a
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1181
different annotated bitsequence and in this way @{text nub}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1182
will never find a duplicate to be removed. One correct way to
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1183
handle this situation is to first \emph{erase} the regular
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1184
expressions when comparing potential duplicates. This is inspired
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1185
by Scala's list functions of the form \mbox{@{text "distinctWith rs eq
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1186
acc"}} where @{text eq} is an user-defined equivalence relation that
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1187
compares two elements in @{text rs}. We define this function in
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1188
Isabelle/HOL as
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1189
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1190
\begin{center}
615
+ − 1191
\begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1192
@{thm (lhs) distinctWith.simps(1)} & $\dn$ & @{thm (rhs) distinctWith.simps(1)}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1193
@{thm (lhs) distinctWith.simps(2)} & $\dn$ &
642
+ − 1194
@{text "if (\<exists> y \<in> acc. eq x y)"}
+ − 1195
@{text "then distinctWith xs eq acc"}\\
615
+ − 1196
& & @{text "else x :: distinctWith xs eq ({x} \<union> acc)"}
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1197
\end{tabular}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1198
\end{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1199
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1200
\noindent where we scan the list from left to right (because we
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1201
have to remove later copies). In @{text distinctWith}, @{text eq} is intended to be an
616
+ − 1202
equivalence relation for bitcoded regular expressions and @{text acc} is an accumulator for bitcoded regular
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1203
expressions---essentially a set of regular expressions that we have already seen
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1204
while scanning the list. Therefore we delete an element, say @{text x},
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1205
from the list provided a @{text "y"} with @{text "y"} being equivalent to @{text x} is already in the accumulator;
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1206
otherwise we keep @{text x} and scan the rest of the list but
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1207
add @{text "x"} as another ``seen'' element to @{text acc}. We will use
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1208
@{term distinctWith} where @{text eq} is an equivalence that deletes bitsequences from bitcoded regular expressions
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1209
before comparing the components. One way to define this in Isabelle/HOL is by the following recursive function from
616
+ − 1210
bitcoded regular expressions to @{text bool}:
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1211
%
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1212
\begin{center}
642
+ − 1213
\begin{tabular}{@ {}cc@ {}}
+ − 1214
\begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {\hspace{1mm}}l@ {\hspace{1mm}}}
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1215
@{thm (lhs) eq1.simps(1)} & $\dn$ & @{thm (rhs) eq1.simps(1)}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1216
@{thm (lhs) eq1.simps(2)[of DUMMY DUMMY]} & $\dn$ & @{thm (rhs) eq1.simps(2)[of DUMMY DUMMY]}\\
642
+ − 1217
@{thm (lhs) eq1.simps(7)[of DUMMY "r\<^sub>1" DUMMY "r\<^sub>2"]} & $\dn$ & @{thm (rhs) eq1.simps(7)[of DUMMY "r\<^sub>1" DUMMY "r\<^sub>2"]}\\
+ − 1218
@{thm (lhs) eq1.simps(5)[of DUMMY DUMMY]} & $\dn$ & @{thm (rhs) eq1.simps(5)[of DUMMY DUMMY]}\\
+ − 1219
\mbox{}
+ − 1220
\end{tabular}
+ − 1221
&
+ − 1222
\begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {\hspace{1mm}}l@ {}}
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1223
@{thm (lhs) eq1.simps(3)[of DUMMY "c" DUMMY "d"]} & $\dn$ & @{thm (rhs) eq1.simps(3)[of DUMMY "c" DUMMY "d"]}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1224
@{thm (lhs) eq1.simps(4)[of DUMMY "r\<^sub>1\<^sub>1" "r\<^sub>1\<^sub>2" DUMMY "r\<^sub>2\<^sub>1" "r\<^sub>2\<^sub>2"]} & $\dn$ &
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1225
@{thm (rhs) eq1.simps(4)[of DUMMY "r\<^sub>1\<^sub>1" "r\<^sub>1\<^sub>2" DUMMY "r\<^sub>2\<^sub>1" "r\<^sub>2\<^sub>2"]}\\
569
+ − 1226
@{thm (lhs) eq1.simps(8)[of DUMMY "r\<^sub>1" "n\<^sub>1" DUMMY "r\<^sub>2" "n\<^sub>2"]} & $\dn$ & @{thm (rhs) eq1.simps(8)[of DUMMY "r\<^sub>1" "n\<^sub>1" DUMMY "r\<^sub>2" "n\<^sub>2"]}\\
642
+ − 1227
@{thm (lhs) eq1.simps(6)[of DUMMY "r\<^sub>1" "rs\<^sub>1" DUMMY "r\<^sub>2" "rs\<^sub>2"]} & $\dn$ & \\
+ − 1228
\multicolumn{3}{r}{@{thm (rhs) eq1.simps(6)[of DUMMY "r\<^sub>1" "rs\<^sub>1" DUMMY "r\<^sub>2" "rs\<^sub>2"]}}
+ − 1229
\end{tabular}
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1230
\end{tabular}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1231
\end{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1232
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1233
\noindent
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1234
where all other cases are set to @{text False}.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1235
This equivalence is clearly a computationally more expensive operation than @{text nub},
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1236
but is needed in order to make the removal of unnecessary copies
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1237
to work properly.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1238
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1239
Our simplification function depends on three more helper functions, one is called
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1240
@{text flts} and analyses lists of regular expressions coming from alternatives.
642
+ − 1241
It is defined by four clauses as follows:
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1242
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1243
\begin{center}
642
+ − 1244
\begin{tabular}{c}
+ − 1245
@{thm (lhs) flts.simps(1)} $\dn$ @{thm (rhs) flts.simps(1)} \qquad
+ − 1246
@{thm (lhs) flts.simps(2)} $\dn$ @{thm (rhs) flts.simps(2)} \qquad
+ − 1247
@{text "flts ((ALTs bs' rs') :: rs"}
+ − 1248
%@{ thm (lhs) flts.simps(3)[of "bs'" "rs'"]}
+ − 1249
$\dn$ @{thm (rhs) flts.simps(3)[of "bs'" "rs'"]}\smallskip\\
+ − 1250
@{text "flts (r :: rs)"} $\dn$ @{text "r :: flts rs"}\hspace{5mm}(otherwise)
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1251
\end{tabular}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1252
\end{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1253
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1254
\noindent
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1255
The second clause of @{text flts} removes all instances of @{text ZERO} in alternatives and
642
+ − 1256
the third ``de-nests'' alternatives (but retains the
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1257
bitsequence @{text "bs'"} accumulated in the inner alternative). There are
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1258
some corner cases to be considered when the resulting list inside an alternative is
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1259
empty or a singleton list. We take care of those cases in the
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1260
@{text "bsimpALTs"} function; similarly we define a helper function that simplifies
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1261
sequences according to the usual rules about @{text ZERO}s and @{text ONE}s:
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1262
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1263
\begin{center}
615
+ − 1264
\begin{tabular}{@ {}c@ {\hspace{4mm}}c@ {}}
+ − 1265
\begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1266
@{text "bsimpALTs bs []"} & $\dn$ & @{text "ZERO"}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1267
@{text "bsimpALTs bs [r]"} & $\dn$ & @{text "fuse bs r"}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1268
@{text "bsimpALTs bs rs"} & $\dn$ & @{text "ALTs bs rs"}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1269
\mbox{}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1270
\end{tabular}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1271
&
615
+ − 1272
\begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1273
@{text "bsimpSEQ bs _ ZERO"} & $\dn$ & @{text "ZERO"}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1274
@{text "bsimpSEQ bs ZERO _"} & $\dn$ & @{text "ZERO"}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1275
@{text "bsimpSEQ bs\<^sub>1 (ONE bs\<^sub>2) r\<^sub>2"}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1276
& $\dn$ & @{text "fuse (bs\<^sub>1 @ bs\<^sub>2) r\<^sub>2"}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1277
@{text "bsimpSEQ bs r\<^sub>1 r\<^sub>2"} & $\dn$ & @{text "SEQ bs r\<^sub>1 r\<^sub>2"}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1278
\end{tabular}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1279
\end{tabular}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1280
\end{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1281
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1282
\noindent
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1283
With this in place we can define our simplification function as
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1284
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1285
\begin{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1286
\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1287
@{thm (lhs) bsimp.simps(1)[of "bs" "r\<^sub>1" "r\<^sub>2"]} & $\dn$ &
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1288
@{thm (rhs) bsimp.simps(1)[of "bs" "r\<^sub>1" "r\<^sub>2"]}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1289
@{thm (lhs) bsimp.simps(2)[of "bs" _]} & $\dn$ & @{text "bsimpALTs bs (distinctWith (flts (map bsimp rs)) \<approx> \<emptyset>)"}\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1290
@{text "bsimp r"} & $\dn$ & @{text r}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1291
\end{tabular}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1292
\end{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1293
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1294
\noindent
569
+ − 1295
We believe our recursive function @{term bsimp} simplifies bitcoded regular
642
+ − 1296
expressions as intended by Sulzmann and Lu with the small addition of removing ``useless'' @{text ONE}s
+ − 1297
in sequence regular
+ − 1298
expressions. There is no point in applying the
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1299
@{text bsimp} function repeatedly (like the simplification in their paper which needs to be
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1300
applied until a fixpoint is reached) because we can show that @{term bsimp} is idempotent,
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1301
that is
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1302
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1303
\begin{proposition}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1304
@{term "bsimp (bsimp r) = bsimp r"}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1305
\end{proposition}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1306
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1307
\noindent
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1308
This can be proved by induction on @{text r} but requires a detailed analysis
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1309
that the de-nesting of alternatives always results in a flat list of regular
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1310
expressions. We omit the details since it does not concern the correctness proof.
569
+ − 1311
%It might be interesting to not that we do not simplify inside @{term STAR} and
+ − 1312
%@{text NT}: the reason is that we want to keep the
+ − 1313
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1314
Next we can include simplification after each derivative step leading to the
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1315
following notion of bitcoded derivatives:
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1316
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1317
\begin{center}
616
+ − 1318
\begin{tabular}{c@ {\hspace{10mm}}c}
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1319
\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1320
@{thm (lhs) bders_simp.simps(1)} & $\dn$ & @{thm (rhs) bders_simp.simps(1)}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1321
\end{tabular}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1322
&
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1323
\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1324
@{thm (lhs) bders_simp.simps(2)} & $\dn$ & @{thm (rhs) bders_simp.simps(2)}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1325
\end{tabular}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1326
\end{tabular}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1327
\end{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1328
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1329
\noindent
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1330
and use it in the improved lexing algorithm defined as
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1331
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1332
\begin{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1333
\begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1334
$\textit{blexer}^+\;r\,s$ & $\dn$ &
615
+ − 1335
$\textit{let}\;r_{der} = (r^\uparrow)\backslash_{bsimp}\, s\;\textit{in}$\\
642
+ − 1336
& & $\;\;\;\textit{if}\; \textit{bnullable}(r_{der})$
+ − 1337
$\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,r_{der})\,r$ $\;\textit{else}\;\textit{None}$
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1338
\end{tabular}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1339
\end{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1340
616
+ − 1341
\noindent
+ − 1342
Note that in $\textit{blexer}^+$ the derivative $r_{der}$ is calculated
+ − 1343
using the simplifying derivative $\_\,\backslash_{bsimp}\,\_$.
+ − 1344
The remaining task is to show that @{term blexer} and
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1345
@{term "blexer_simp"} generate the same answers.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1346
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1347
When we first
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1348
attempted this proof we encountered a problem with the idea
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1349
in Sulzmann and Lu's paper where the argument seems to be to appeal
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1350
again to the @{text retrieve}-function defined for the unsimplified version
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1351
of the algorithm. But
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1352
this does not work, because desirable properties such as
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1353
%
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1354
\[
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1355
@{text "retrieve r v = retrieve (bsimp r) v"}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1356
\]
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1357
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1358
\noindent do not hold under simplification---this property
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1359
essentially purports that we can retrieve the same value from a
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1360
simplified version of the regular expression. To start with @{text retrieve}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1361
depends on the fact that the value @{text v} corresponds to the
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1362
structure of the regular expression @{text r}---but the whole point of simplification
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1363
is to ``destroy'' this structure by making the regular expression simpler.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1364
To see this consider the regular expression @{term "r = ALT r' ZERO"} and a corresponding
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1365
value @{text "v = Left v'"}. If we annotate bitcodes to @{text "r"}, then
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1366
we can use @{text retrieve} with @{text r} and @{text v} in order to extract a corresponding
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1367
bitsequence. The reason that this works is that @{text r} is an alternative
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1368
regular expression and @{text v} a corresponding @{text "Left"}-value. However, if we simplify
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1369
@{text r}, then @{text v} does not correspond to the shape of the regular
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1370
expression anymore. So unless one can somehow
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1371
synchronise the change in the simplified regular expressions with
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1372
the original POSIX value, there is no hope of appealing to @{text retrieve} in the
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1373
correctness argument for @{term blexer_simp}.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1374
642
+ − 1375
For our proof we found it more helpful to introduce the rewriting systems shown in
616
+ − 1376
Fig~\ref{SimpRewrites}. The idea is to generate
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1377
simplified regular expressions in small steps (unlike the @{text bsimp}-function which
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1378
does the same in a big step), and show that each of
616
+ − 1379
the small steps preserves the bitcodes that lead to the POSIX value.
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1380
The rewrite system is organised such that $\leadsto$ is for bitcoded regular
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1381
expressions and $\stackrel{s}{\leadsto}$ for lists of bitcoded regular
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1382
expressions. The former essentially implements the simplifications of
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1383
@{text "bsimpSEQ"} and @{text flts}; while the latter implements the
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1384
simplifications in @{text "bsimpALTs"}. We can show that any bitcoded
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1385
regular expression reduces in zero or more steps to the simplified
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1386
regular expression generated by @{text bsimp}:
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1387
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1388
\begin{lemma}\label{lemone}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1389
@{thm[mode=IfThen] rewrites_to_bsimp}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1390
\end{lemma}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1391
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1392
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1393
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1394
\noindent
569
+ − 1395
We can also show that this rewrite system preserves @{term bnullable}, that
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1396
is simplification does not affect nullability:
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1397
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1398
\begin{lemma}\label{lembnull}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1399
@{thm[mode=IfThen] bnullable0(1)[of "r\<^sub>1" "r\<^sub>2"]}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1400
\end{lemma}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1401
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1402
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1403
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1404
\noindent
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1405
From this, we can show that @{text bmkeps} will produce the same bitsequence
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1406
as long as one of the bitcoded regular expressions in $\leadsto$ is nullable (this lemma
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1407
establishes the missing fact we were not able to establish using @{text retrieve}, as suggested
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1408
in the paper by Sulzmannn and Lu).
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1409
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1410
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1411
\begin{lemma}\label{lemthree}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1412
@{thm[mode=IfThen] rewrite_bmkeps_aux(1)[of "r\<^sub>1" "r\<^sub>2"]}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1413
\end{lemma}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1414
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1415
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1416
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1417
\noindent
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1418
Crucial is also the fact that derivative steps and simplification steps can be interleaved,
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1419
which is shown by the fact that $\leadsto$ is preserved under derivatives.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1420
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1421
\begin{lemma}\label{bderlem}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1422
@{thm[mode=IfThen] rewrite_preserves_bder(1)[of "r\<^sub>1" "r\<^sub>2"]}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1423
\end{lemma}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1424
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1425
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1426
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1427
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1428
\noindent
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1429
Using this fact together with Lemma~\ref{lemone} allows us to prove the central lemma
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1430
that the unsimplified
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1431
derivative (with a string @{term s}) reduces to the simplified derivative (with the same string).
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1432
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1433
\begin{lemma}\label{lemtwo}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1434
@{thm[mode=IfThen] central}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1435
\end{lemma}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1436
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1437
%\begin{proof}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1438
%By reverse induction on @{term s} generalising over @{text r}.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1439
%\end{proof}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1440
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1441
\noindent
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1442
With these lemmas in place we can finally establish that @{term "blexer_simp"} and @{term "blexer"}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1443
generate the same value, and using Theorem~\ref{thmone} from the previous section that this value
599
+ − 1444
is indeed the POSIX value as generated by \textit{lexer}.
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1445
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1446
\begin{theorem}
599
+ − 1447
@{thm[mode=IfThen] main_blexer_simp[symmetric]} \; (@{text "= lexer r s"}\; by Thm.~\ref{thmone})
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1448
\end{theorem}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1449
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1450
%\begin{proof}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1451
%By unfolding the definitions and using Lemmas~\ref{lemtwo} and \ref{lemthree}.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1452
%\end{proof}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1453
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1454
\noindent
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1455
This means that if the algorithm is called with a regular expression @{term r} and a string
498
+ − 1456
@{term s} with $@{term s} \in L(@{term r})$, it will return @{term "Some v"} for the unique
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1457
@{term v} we defined by the POSIX relation $(@{term s}, @{term r}) \rightarrow @{term v}$; otherwise
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1458
the algorithm returns @{term "None"} when $s \not\in L(r)$ and no such @{text v} exists.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1459
This completes the correctness proof for the second POSIX lexing algorithm by Sulzmann and Lu.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1460
The interesting point of this algorithm is that the sizes of derivatives do not grow arbitrarily big but
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1461
can be finitely bounded, which
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1462
we shall show next.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1463
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1464
\begin{figure}[t]
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1465
\begin{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1466
\begin{tabular}{@ {\hspace{-8mm}}c@ {}}
643
+ − 1467
\\[-8mm]
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1468
@{thm[mode=Axiom] bs1[of _ "r\<^sub>2"]}$S\ZERO{}_l$\quad
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1469
@{thm[mode=Axiom] bs2[of _ "r\<^sub>1"]}$S\ZERO{}_r$\quad
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1470
@{thm[mode=Axiom] bs3[of "bs\<^sub>1" "bs\<^sub>2"]}$S\ONE$\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1471
@{thm[mode=Rule] bs4[of "r\<^sub>1" "r\<^sub>2" _ "r\<^sub>3"]}SL\qquad
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1472
@{thm[mode=Rule] bs5[of "r\<^sub>3" "r\<^sub>4" _ "r\<^sub>1"]}SR\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1473
@{thm[mode=Axiom] bs6}$A0$\quad
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1474
@{thm[mode=Axiom] bs7}$A1$\quad
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1475
@{thm[mode=Rule] bs10[of "rs\<^sub>1" "rs\<^sub>2"]}$AL$\\
569
+ − 1476
@{thm[mode=Rule] rrewrite_srewrite.ss2[of "rs\<^sub>1" "rs\<^sub>2"]}$LT$\quad
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1477
@{thm[mode=Rule] ss3[of "r\<^sub>1" "r\<^sub>2"]}$LH$\quad
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1478
@{thm[mode=Axiom] ss4}$L\ZERO$\quad
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1479
@{thm[mode=Axiom] ss5[of "bs" "rs\<^sub>1" "rs\<^sub>2"]}$LS$\medskip\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1480
@{thm[mode=Rule] ss6[of "r\<^sub>2" "r\<^sub>1" "rs\<^sub>1" "rs\<^sub>2" "rs\<^sub>3"]}$LD$\\[-5mm]
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1481
\end{tabular}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1482
\end{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1483
\caption{The rewrite rules that generate simplified regular expressions
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1484
in small steps: @{term "rrewrite r\<^sub>1 r\<^sub>2"} is for bitcoded regular
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1485
expressions and @{term "srewrite rs\<^sub>1 rs\<^sub>2"} for \emph{lists} of bitcoded
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1486
expressions. Interesting is the $LD$ rule that allows copies of regular
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1487
expressions to be removed provided a regular expression earlier in the list can
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1488
match the same strings.}\label{SimpRewrites}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1489
\end{figure}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1490
*}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1491
615
+ − 1492
section {* Finite Bound for the Size of Derivatives *}
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1493
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1494
text {*
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1495
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1496
In this section let us sketch our argument for why the size of the simplified
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1497
derivatives with the aggressive simplification function can be finitely bounded. Suppose
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1498
we have a size function for bitcoded regular expressions, written
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1499
$\llbracket r\rrbracket$, which counts the number of nodes if we regard $r$ as a tree
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1500
(we omit the precise definition; ditto for lists $\llbracket r\!s\rrbracket$). For this we show that for every $r$
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1501
there exists a bound $N$
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1502
such that
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1503
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1504
\begin{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1505
$\forall s. \; \llbracket@{term "bders_simp r s"}\rrbracket \leq N$
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1506
\end{center}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1507
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1508
\noindent
569
+ − 1509
Note that the bound $N$ is a bound for \emph{all} strings, no matter how long they are.
+ − 1510
We establish this bound by induction on $r$. The base cases for @{term AZERO},
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1511
@{term "AONE bs"} and @{term "ACHAR bs c"} are straightforward. The interesting case is
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1512
for sequences of the form @{term "ASEQ bs r\<^sub>1 r\<^sub>2"}. In this case our induction
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1513
hypotheses state $\exists N_1. \forall s. \; \llbracket{}@{term "bders_simp r\<^sub>1 s"}\rrbracket \leq N_1$ and
615
+ − 1514
$\exists N_2. \forall s. \; \llbracket@{term "bders_simp r\<^sub>2 s"}\rrbracket \leq N_2$. We can reason as follows\medskip
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1515
%
615
+ − 1516
%!\begin{center}
+ − 1517
+ − 1518
\noindent\begin{minipage}{\textwidth}
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1519
\begin{tabular}{lcll}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1520
& & $ \llbracket@{term "bders_simp (ASEQ bs r\<^sub>1 r\<^sub>2) s"}\rrbracket$\\
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1521
& $ = $ & $\llbracket bsimp\,(\textit{ALTs}\;bs\;((@{term "ASEQ [] (bders_simp r\<^sub>1 s) r\<^sub>2"}) ::
615
+ − 1522
[@{term "bders_simp r\<^sub>2 s'"} \;|\; s' \in \textit{Suf}(@{text r}_1, s)]))\rrbracket $ & (1) \\
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1523
& $\leq$ &
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1524
$\llbracket\textit{distinctWith}\,((@{term "ASEQ [] (bders_simp r\<^sub>1 s) r\<^sub>2"}) ::
615
+ − 1525
[@{term "bders_simp r\<^sub>2 s'"} \;|\; s' \in \textit{Suf}(@{text r}_1, s)])\,\approx\;@{term "{}"}\rrbracket + 1 $ & (2) \\
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1526
& $\leq$ & $\llbracket@{term "ASEQ [] (bders_simp r\<^sub>1 s) r\<^sub>2"}\rrbracket +
615
+ − 1527
\llbracket\textit{distinctWith}\,[@{term "bders_simp r\<^sub>2 s'"} \;|\; s' \in \textit{Suf}(@{text r}_1, s)]\,\approx\;@{term "{}"}\rrbracket + 1 $ & (3) \\
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1528
& $\leq$ & $N_1 + \llbracket r_2\rrbracket + 2 +
615
+ − 1529
\llbracket\textit{distinctWith}\,[@{term "bders_simp r\<^sub>2 s'"} \;|\; s' \in \textit{Suf}(@{text r}_1, s)]\,\approx\;@{term "{}"}\rrbracket$ & (4)\\
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1530
& $\leq$ & $N_1 + \llbracket r_2\rrbracket + 2 + l_{N_{2}} * N_{2}$ & (5)
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1531
\end{tabular}
615
+ − 1532
\end{minipage}\medskip
+ − 1533
%!\end{center}
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1534
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1535
% tell Chengsong about Indian paper of closed forms of derivatives
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1536
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1537
\noindent
616
+ − 1538
where in (1) the set $\textit{Suf}(@{text "r"}_1, s)$ are all the suffixes of $s$ where @{term "bders_simp r\<^sub>1 s'"} is nullable ($s'$ being a suffix of $s$).
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1539
In (3) we know that $\llbracket@{term "ASEQ [] (bders_simp r\<^sub>1 s) r\<^sub>2"}\rrbracket$ is
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1540
bounded by $N_1 + \llbracket{}r_2\rrbracket + 1$. In (5) we know the list comprehension contains only regular expressions of size smaller
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1541
than $N_2$. The list length after @{text distinctWith} is bounded by a number, which we call $l_{N_2}$. It stands
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1542
for the number of distinct regular expressions smaller than $N_2$ (there can only be finitely many of them).
642
+ − 1543
We reason similarly for @{text STAR} and @{text NT}.\smallskip
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1544
499
+ − 1545
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1546
Clearly we give in this finiteness argument (Step (5)) a very loose bound that is
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1547
far from the actual bound we can expect. We can do better than this, but this does not improve
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1548
the finiteness property we are proving. If we are interested in a polynomial bound,
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1549
one would hope to obtain a similar tight bound as for partial
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1550
derivatives introduced by Antimirov \cite{Antimirov95}. After all the idea with
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1551
@{text distinctWith} is to maintain a ``set'' of alternatives (like the sets in
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1552
partial derivatives). Unfortunately to obtain the exact same bound would mean
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1553
we need to introduce simplifications, such as
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1554
$(r_1 + r_2) \cdot r_3 \longrightarrow (r_1 \cdot r_3) + (r_2 \cdot r_3)$,
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1555
which exist for partial derivatives. However, if we introduce them in our
569
+ − 1556
setting we would lose the POSIX property of our calculated values. For example
+ − 1557
given the regular expressions @{term "SEQ (ALT a ab) (ALT b ONE)"} and the string $[a, b]$, then our
+ − 1558
algorithm generates the following correct POSIX value
+ − 1559
%
+ − 1560
\[
+ − 1561
@{term "Seq (Right (Seq (Char a) (Char b))) (Right Empty)"}
+ − 1562
\]
+ − 1563
+ − 1564
\noindent
+ − 1565
Essentially it matches the string with the longer @{text "Right"}-alternative in the
+ − 1566
first sequence (and then the `rest' with the empty regular expression @{const ONE} from the second sequence).
+ − 1567
If we add the simplification above, then we obtain the following value
+ − 1568
@{term "Seq (Left (Char a)) (Left (Char b))"}
+ − 1569
where the @{text "Left"}-alternatives get priority. However, this violates
+ − 1570
the POSIX rules and we have not been able to
615
+ − 1571
reconcile this problem. Therefore we leave better bounds for future work.%!\\[-6.5mm]
+ − 1572
644
+ − 1573
Note also that while Antimirov was able to give a bound on the \emph{size}
615
+ − 1574
of his partial derivatives~\cite{Antimirov95}, Brzozowski gave a bound
+ − 1575
on the \emph{number} of derivatives, provided they are quotient via
643
+ − 1576
ACI rules \cite{Brzozowski1964}. Brzozowski's result is crucial when one
642
+ − 1577
uses his derivatives for obtaining a DFA (it essentially bounds
615
+ − 1578
the number of states). However, this result does \emph{not}
+ − 1579
transfer to our setting where we are interested in the \emph{size} of the
642
+ − 1580
derivatives. For example it is \emph{not} true for our derivatives that the
+ − 1581
set of derivatives $r \backslash s$ for a given $r$ and all strings
617
+ − 1582
$s$ is finite (even when simplified). This is because for our annotated regular expressions
615
+ − 1583
the bitcode annotation is dependent on the number of iterations that
642
+ − 1584
are needed for @{text STAR}-regular expressions. This is not a problem for us: Since we intend to do lexing
615
+ − 1585
by calculating (as fast as possible) derivatives, the bound on the size
642
+ − 1586
of the derivatives is important, not their number. % of derivatives.
615
+ − 1587
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1588
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1589
*}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1590
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1591
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1592
section {* Conclusion *}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1593
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1594
text {*
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1595
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1596
We set out in this work to prove in Isabelle/HOL the correctness of
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1597
the second POSIX lexing algorithm by Sulzmann and Lu
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1598
\cite{Sulzmann2014}. This follows earlier work where we established
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1599
the correctness of the first algorithm
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1600
\cite{AusafDyckhoffUrban2016}. In the earlier work we needed to
643
+ − 1601
introduce our own specification for POSIX values,
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1602
because the informal definition given by Sulzmann and Lu did not
616
+ − 1603
stand up to formal proof. Also for the second algorithm we needed
642
+ − 1604
to introduce our own definitions and proof ideas in order to
+ − 1605
establish the correctness. Our interest in the second algorithm
+ − 1606
lies in the fact that by using bitcoded regular expressions and an
+ − 1607
aggressive simplification method there is a chance that the
+ − 1608
derivatives can be kept universally small (we established in this
+ − 1609
paper that for a given $r$ they can be kept finitely bounded for
643
+ − 1610
\emph{all} strings). Our formalisation is approximately 7500 lines of
642
+ − 1611
Isabelle code. A little more than half of this code concerns the finiteness bound
+ − 1612
obtained in Section 5. This slight ``bloat'' in the latter part is because
+ − 1613
we had to introduce an intermediate datatype for annotated regular expressions and repeat many
643
+ − 1614
definitions for this intermediate datatype. But overall we think this made our
+ − 1615
formalisation work smoother. The code of our formalisation can be found at
+ − 1616
\textcolor{darkblue}{\url{https://github.com/urbanchr/posix}}.
499
+ − 1617
%This is important if one is after
+ − 1618
%an efficient POSIX lexing algorithm based on derivatives.
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1619
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1620
Having proved the correctness of the POSIX lexing algorithm, which
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1621
lessons have we learned? Well, we feel this is a very good example
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1622
where formal proofs give further insight into the matter at
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1623
hand. For example it is very hard to see a problem with @{text nub}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1624
vs @{text distinctWith} with only experimental data---one would still
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1625
see the correct result but find that simplification does not simplify in well-chosen, but not
499
+ − 1626
obscure, examples.
+ − 1627
%We found that from an implementation
+ − 1628
%point-of-view it is really important to have the formal proofs of
+ − 1629
%the corresponding properties at hand.
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1630
642
+ − 1631
With the results reported here, we can of course only make a claim
+ − 1632
about the correctness of the algorithm and the sizes of the
569
+ − 1633
derivatives, not about the efficiency or runtime of our version of
642
+ − 1634
Sulzmann and Lu's algorithm. But we found the size is an important
569
+ − 1635
first indicator about efficiency: clearly if the derivatives can
+ − 1636
grow to arbitrarily big sizes and the algorithm needs to traverse
+ − 1637
the derivatives possibly several times, then the algorithm will be
642
+ − 1638
slow---excruciatingly slow that is. Other works seem to make
+ − 1639
stronger claims, but during our formalisation work we have
+ − 1640
developed a healthy suspicion when for example experimental data is
+ − 1641
used to back up efficiency claims. For instance Sulzmann and Lu
+ − 1642
write about their equivalent of @{term blexer_simp} \textit{``...we
+ − 1643
can incrementally compute bitcoded parse trees in linear time in
+ − 1644
the size of the input''} \cite[Page 14]{Sulzmann2014}. Given the
+ − 1645
growth of the derivatives in some cases even after aggressive
+ − 1646
simplification, this is a hard to believe claim. A similar claim
+ − 1647
about a theoretical runtime of @{text "O(n\<^sup>2)"} for one
+ − 1648
specific list of regular expressions is made for the Verbatim
+ − 1649
lexer, which calculates tokens according to POSIX
569
+ − 1650
rules~\cite{verbatim}. For this, Verbatim uses Brzozowski's
642
+ − 1651
derivatives like in our work. About their empirical data, the authors write:
+ − 1652
\textit{``The results of our empirical tests [..] confirm that
+ − 1653
Verbatim has @{text "O(n\<^sup>2)"} time complexity.''}
569
+ − 1654
\cite[Section~VII]{verbatim}. While their correctness proof for
+ − 1655
Verbatim is formalised in Coq, the claim about the runtime
642
+ − 1656
complexity is only supported by some empirical evidence obtained by
569
+ − 1657
using the code extraction facilities of Coq. Given our observation
642
+ − 1658
with the ``growth problem'' of derivatives, this runtime bound
+ − 1659
is unlikely to hold universally: indeed we tried out their extracted OCaml
+ − 1660
code with the example \mbox{@{text "(a + aa)\<^sup>*"}} as a single
+ − 1661
lexing rule, and it took for us around 5 minutes to tokenise a
+ − 1662
string of 40 $a$'s and that increased to approximately 19 minutes
+ − 1663
when the string is 50 $a$'s long. Taking into account that
+ − 1664
derivatives are not simplified in the Verbatim lexer, such numbers
+ − 1665
are not surprising. Clearly our result of having finite
+ − 1666
derivatives might sound rather weak in this context but we think
+ − 1667
such efficiency claims really require further scrutiny. The
+ − 1668
contribution of this paper is to make sure derivatives do not grow
+ − 1669
arbitrarily big (universally). In the example \mbox{@{text "(a +
+ − 1670
aa)\<^sup>*"}}, \emph{all} derivatives have a size of 17 or
569
+ − 1671
less. The result is that lexing a string of, say, 50\,000 a's with
+ − 1672
the regular expression \mbox{@{text "(a + aa)\<^sup>*"}} takes
+ − 1673
approximately 10 seconds with our Scala implementation of the
+ − 1674
presented algorithm.
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1675
569
+ − 1676
Finally, let us come back to the point about bounded regular
+ − 1677
expressions. We have in this paper only shown that @{term "NTIMES r
+ − 1678
n"} can be included, but all our results extend straightforwardly
+ − 1679
also to the other bounded regular expressions. We find bounded
+ − 1680
regular expressions fit naturally into the setting of Brzozowski
+ − 1681
derivatives and the bitcoded regular expressions by Sulzmann and Lu.
+ − 1682
In contrast bounded regular expressions are often the Achilles'
+ − 1683
heel in regular expression matchers that use the traditional
+ − 1684
automata-based approach to lexing, primarily because they need to expand the
+ − 1685
counters of bounded regular expressions into $n$-connected copies
+ − 1686
of an automaton. This is not needed in Sulzmann and Lu's algorithm.
+ − 1687
To see the difference consider for example the regular expression @{term "SEQ (NTIMES a
+ − 1688
1001) (STAR a)"}, which is not permitted in the Go language because
+ − 1689
the counter is too big. In contrast we have no problem with
+ − 1690
matching this regular expression with, say 50\,000 a's, because the
+ − 1691
counters can be kept compact. In fact, the overall size of the
+ − 1692
derivatives is never greater than 5 in this example. Even in the
+ − 1693
example from Section 2, where Rust raises an error message, namely
+ − 1694
\mbox{@{text "a\<^bsup>{1000}{100}{5}\<^esup>"}}, the maximum size for
+ − 1695
our derivatives is a moderate 14.
+ − 1696
642
+ − 1697
Let us also return to the example @{text
+ − 1698
"a\<^bsup>{0}{4294967295}\<^esup>"} which until recently Rust
+ − 1699
deemed acceptable. But this was due to a bug. It turns out that it took Rust
+ − 1700
more than 11 minutes to generate an automaton for this regular
+ − 1701
expression and then to determine that a string of just one(!)
+ − 1702
@{text a} does \emph{not} match this regular expression. Therefore
+ − 1703
it is probably a wise choice that in newer versions of Rust's
+ − 1704
regular expression library such regular expressions are declared as
643
+ − 1705
``too big'' and raise an error message. While this is clearly
642
+ − 1706
a contrived example, the safety guaranties Rust wants to provide necessitate
+ − 1707
this conservative approach.
+ − 1708
However, with the derivatives and simplifications we have shown
+ − 1709
in this paper, the example can be solved with ease:
+ − 1710
it essentially only involves operations on
+ − 1711
integers and our Scala implementation takes only a few seconds to
+ − 1712
find out that this string, or even much larger strings, do not match.
+ − 1713
569
+ − 1714
Let us also compare our work to the verified Verbatim++ lexer where
+ − 1715
the authors of the Verbatim lexer introduced a number of
+ − 1716
improvements and optimisations, for example memoisation
+ − 1717
\cite{verbatimpp}. However, unlike Verbatim, which works with
+ − 1718
derivatives like in our work, Verbatim++ compiles first a regular
+ − 1719
expression into a DFA. While this makes lexing fast in many cases,
+ − 1720
with examples of bounded regular expressions like
+ − 1721
\mbox{@{text "a\<^bsup>{100}{5}\<^esup>"}}
+ − 1722
one needs to represent them as
+ − 1723
sequences of $a \cdot a \cdot \ldots \cdot a$ (500 a's in sequence). We have run
+ − 1724
their extracted code with such a regular expression as a
+ − 1725
single lexing rule and a string of 50\,000 a's---lexing in this case
+ − 1726
takes approximately 5~minutes. We are not aware of any better
616
+ − 1727
translation using the traditional notion of DFAs so that we can improve on this. Therefore we
569
+ − 1728
prefer to stick with calculating derivatives, but attempt to make
+ − 1729
this calculation (in the future) as fast as possible. What we can guaranty
+ − 1730
with the presented work is that the maximum size of the derivatives
616
+ − 1731
for \mbox{@{text "a\<^bsup>{100}{5}\<^esup>"}$\,\cdot\,$@{term "STAR a"}} is never bigger than 9. This means our Scala
642
+ − 1732
implementation again only needs a few seconds for this example and matching 50\,000 a's, say.
569
+ − 1733
%
+ − 1734
%
+ − 1735
%Possible ideas are
+ − 1736
%zippers which have been used in the context of parsing of
+ − 1737
%context-free grammars \cite{zipperparser}.
643
+ − 1738
%\\[-5mm] %\smallskip
642
+ − 1739
643
+ − 1740
%\noindent
+ − 1741
%Our Isabelle code including the results from Sec.~5 is available from \url{https://github.com/urbanchr/posix}.
+ − 1742
%%\\[-10mm]
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1743
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1744
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1745
%%\bibliographystyle{plain}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1746
\bibliography{root}
615
+ − 1747
*}
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1748
615
+ − 1749
(*<*)
+ − 1750
text {*
569
+ − 1751
\newpage
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1752
\appendix
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1753
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1754
\section{Some Proofs}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1755
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1756
While we have formalised \emph{all} results in Isabelle/HOL, below we shall give some
569
+ − 1757
rough details of our reasoning in ``English'' if this is helpful for reviewing.
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1758
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1759
\begin{proof}[Proof of Lemma~\ref{codedecode}]
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1760
This follows from the property that
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1761
$\textit{decode}'\,((\textit{code}\,v) \,@\, bs)\,r = (v, bs)$ holds
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1762
for any bit-sequence $bs$ and $\vdash v : r$. This property can be
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1763
easily proved by induction on $\vdash v : r$.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1764
\end{proof}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1765
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1766
\begin{proof}[Proof of Lemma~\ref{mainlemma}]
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1767
This can be proved by induction on $s$ and generalising over
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1768
$v$. The interesting point is that we need to prove this in the
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1769
reverse direction for $s$. This means instead of cases $[]$ and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1770
$c\!::\!s$, we have cases $[]$ and $s\,@\,[c]$ where we unravel the
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1771
string from the back.\footnote{Isabelle/HOL provides an induction principle
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1772
for this way of performing the induction.}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1773
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1774
The case for $[]$ is routine using Lemmas~\ref{codedecode}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1775
and~\ref{retrievecode}. In the case $s\,@\,[c]$, we can infer from
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1776
the assumption that $\vdash v : (r\backslash s)\backslash c$
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1777
holds. Hence by Prop.~\ref{Posix2} we know that
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1778
(*) $\vdash \inj\,(r\backslash s)\,c\,v : r\backslash s$ holds too.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1779
By definition of $\textit{flex}$ we can unfold the left-hand side
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1780
to be
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1781
\[
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1782
\textit{Some}\,(\textit{flex}\;r\,\textit{id}\,(s\,@\,[c])\,v) =
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1783
\textit{Some}\,(\textit{flex}\;r\,\textit{id}\,s\,(\inj\,(r\backslash s)\,c\,v))
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1784
\]
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1785
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1786
\noindent
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1787
By IH and (*) we can rewrite the right-hand side to
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1788
$\textit{decode}\,(\textit{retrieve}\,(r^\uparrow\backslash s)\;
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1789
(\inj\,(r\backslash s)\,c\,\,v))\,r$ which is equal to
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1790
$\textit{decode}\,(\textit{retrieve}\, (r^\uparrow\backslash
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1791
(s\,@\,[c]))\,v)\,r$ as required. The last rewrite step is possible
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1792
because we generalised over $v$ in our induction.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1793
\end{proof}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1794
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1795
\begin{proof}[Proof of Theorem~\ref{thmone}]
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1796
We can first expand both sides using Lem.~\ref{flex} and the
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1797
definition of \textit{blexer}. This gives us two
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1798
\textit{if}-statements, which we need to show to be equal. By
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1799
Lemma~\ref{bnullable}\textit{(2)} we know the \textit{if}-tests coincide:
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1800
$\textit{bnullable}(r^\uparrow\backslash s) \;\textit{iff}\;
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1801
\nullable(r\backslash s)$.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1802
For the \textit{if}-branch suppose $r_d \dn r^\uparrow\backslash s$ and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1803
$d \dn r\backslash s$. We have (*) @{text "nullable d"}. We can then show
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1804
by Lemma~\ref{bnullable}\textit{(3)} that
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1805
%
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1806
\[
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1807
\textit{decode}(\textit{bmkeps}\:r_d)\,r =
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1808
\textit{decode}(\textit{retrieve}\,r_d\,(\textit{mkeps}\,d))\,r
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1809
\]
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1810
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1811
\noindent
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1812
where the right-hand side is equal to
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1813
$\textit{Some}\,(\textit{flex}\,r\,\textit{id}\,s\,(\textit{mkeps}\,
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1814
d))$ by Lemma~\ref{mainlemma} (we know
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1815
$\vdash \textit{mkeps}\,d : d$ by (*)). This shows the
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1816
\textit{if}-branches return the same value. In the
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1817
\textit{else}-branches both \textit{lexer} and \textit{blexer} return
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1818
\textit{None}. Therefore we can conclude the proof.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1819
\end{proof}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1820
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1821
\begin{proof}[Proof of Lemma~\ref{lemone}]
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1822
By induction on @{text r}. For this we can use the properties
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1823
@{thm fltsfrewrites} and @{text "rs"}$\;\stackrel{s}{\leadsto}^*$@{text "distinctWith rs \<approx>"} @{term "{}"}. The latter uses
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1824
repeated applications of the $LD$ rule which allows the removal
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1825
of duplicates that can recognise the same strings.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1826
\end{proof}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1827
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1828
\begin{proof}[Proof of Lemma~\ref{lembnull}]
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1829
Straightforward mutual induction on the definition of $\leadsto$ and $\stackrel{s}{\leadsto}$.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1830
The only interesting case is the rule $LD$ where the property holds since by the side-conditions of that rule the empty string will
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1831
be in both @{text "L (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ [r\<^sub>2] @ rs\<^sub>c)"} and
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1832
@{text "L (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ rs\<^sub>c)"}.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1833
\end{proof}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1834
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1835
\begin{proof}[Proof of Lemma \ref{lemthree}]
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1836
By straightforward mutual induction on the definition of $\leadsto$ and $\stackrel{s}{\leadsto}$.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1837
Again the only interesting case is the rule $LD$ where we need to ensure that
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1838
@{text "bmkeps (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ [r\<^sub>2] @ rs\<^sub>c) =
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1839
bmkeps (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ rs\<^sub>c)"} holds.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1840
This is indeed the case because according to the POSIX rules the
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1841
generated bitsequence is determined by the first alternative that can match the
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1842
string (in this case being nullable).
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1843
\end{proof}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1844
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1845
\begin{proof}[Proof of Lemma~\ref{bderlem}]
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1846
By straightforward mutual induction on the definition of $\leadsto$ and $\stackrel{s}{\leadsto}$.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1847
The case for $LD$ holds because @{term "L (erase (bder c r\<^sub>2)) \<subseteq> L (erase (bder c r\<^sub>1))"}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1848
if and only if @{term "L (erase (r\<^sub>2)) \<subseteq> L (erase (r\<^sub>1))"}.
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1849
\end{proof}
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1850
*}
615
+ − 1851
(*>*)
496
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1852
(*<*)
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1853
end
Christian Urban <christian.urban@kcl.ac.uk>
parents:
diff
changeset
+ − 1854
(*>*)