author | Christian Urban <christian.urban@kcl.ac.uk> |
Tue, 08 Feb 2022 14:29:41 +0000 | |
changeset 423 | b7199d6c672d |
parent 420 | b66a4305749c |
child 424 | 2416fdec6396 |
permissions | -rw-r--r-- |
396 | 1 |
(*<*) |
2 |
theory Paper |
|
3 |
imports |
|
4 |
"../Lexer" |
|
5 |
"../Simplifying" |
|
6 |
"../Positions" |
|
7 |
"../SizeBound4" |
|
8 |
"HOL-Library.LaTeXsugar" |
|
9 |
begin |
|
397
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
10 |
|
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
11 |
declare [[show_question_marks = false]] |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
12 |
|
398 | 13 |
notation (latex output) |
14 |
If ("(\<^latex>\<open>\\textrm{\<close>if\<^latex>\<open>}\<close> (_)/ \<^latex>\<open>\\textrm{\<close>then\<^latex>\<open>}\<close> (_)/ \<^latex>\<open>\\textrm{\<close>else\<^latex>\<open>}\<close> (_))" 10) and |
|
15 |
Cons ("_\<^latex>\<open>\\mbox{$\\,$}\<close>::\<^latex>\<open>\\mbox{$\\,$}\<close>_" [75,73] 73) |
|
16 |
||
17 |
||
397
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
18 |
abbreviation |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
19 |
"der_syn r c \<equiv> der c r" |
418
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
20 |
abbreviation |
423 | 21 |
"ders_syn r s \<equiv> ders s r" |
22 |
abbreviation |
|
418
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
23 |
"bder_syn r c \<equiv> bder c r" |
397
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
24 |
|
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
25 |
notation (latex output) |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
26 |
der_syn ("_\\_" [79, 1000] 76) and |
423 | 27 |
ders_syn ("_\\_" [79, 1000] 76) and |
418
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
28 |
bder_syn ("_\\_" [79, 1000] 76) and |
420 | 29 |
bders ("_\\_" [79, 1000] 76) and |
30 |
bders_simp ("_\\\<^sub>s\<^sub>i\<^sub>m\<^sub>p _" [79, 1000] 76) and |
|
397
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
31 |
|
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
32 |
ZERO ("\<^bold>0" 81) and |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
33 |
ONE ("\<^bold>1" 81) and |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
34 |
CH ("_" [1000] 80) and |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
35 |
ALT ("_ + _" [77,77] 78) and |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
36 |
SEQ ("_ \<cdot> _" [77,77] 78) and |
423 | 37 |
STAR ("_\<^sup>*" [79] 78) and |
402
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
38 |
|
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
39 |
val.Void ("Empty" 78) and |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
40 |
val.Char ("Char _" [1000] 78) and |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
41 |
val.Left ("Left _" [79] 78) and |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
42 |
val.Right ("Right _" [1000] 78) and |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
43 |
val.Seq ("Seq _ _" [79,79] 78) and |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
44 |
val.Stars ("Stars _" [79] 78) and |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
45 |
|
420 | 46 |
Prf ("\<turnstile> _ : _" [75,75] 75) and |
402
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
47 |
Posix ("'(_, _') \<rightarrow> _" [63,75,75] 75) and |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
48 |
|
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
49 |
flat ("|_|" [75] 74) and |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
50 |
flats ("|_|" [72] 74) and |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
51 |
injval ("inj _ _ _" [79,77,79] 76) and |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
52 |
mkeps ("mkeps _" [79] 76) and |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
53 |
length ("len _" [73] 73) and |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
54 |
set ("_" [73] 73) and |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
55 |
|
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
56 |
AZERO ("ZERO" 81) and |
405
3cfea5bb5e23
updated some of the text and cardinality proof
Christian Urban <christian.urban@kcl.ac.uk>
parents:
402
diff
changeset
|
57 |
AONE ("ONE _" [79] 78) and |
402
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
58 |
ACHAR ("CHAR _ _" [79, 79] 80) and |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
59 |
AALTs ("ALTs _ _" [77,77] 78) and |
405
3cfea5bb5e23
updated some of the text and cardinality proof
Christian Urban <christian.urban@kcl.ac.uk>
parents:
402
diff
changeset
|
60 |
ASEQ ("SEQ _ _ _" [79, 79,79] 78) and |
402
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
61 |
ASTAR ("STAR _ _" [79, 79] 78) and |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
62 |
|
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
63 |
code ("code _" [79] 74) and |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
64 |
intern ("_\<^latex>\<open>\\mbox{$^\\uparrow$}\<close>" [900] 80) and |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
65 |
erase ("_\<^latex>\<open>\\mbox{$^\\downarrow$}\<close>" [1000] 74) and |
418
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
66 |
bnullable ("bnullable _" [1000] 80) and |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
67 |
bmkeps ("bmkeps _" [1000] 80) and |
405
3cfea5bb5e23
updated some of the text and cardinality proof
Christian Urban <christian.urban@kcl.ac.uk>
parents:
402
diff
changeset
|
68 |
|
418
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
69 |
srewrite ("_\<^latex>\<open>\\mbox{$\\,\\stackrel{s}{\\leadsto}$}\<close> _" [71, 71] 80) and |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
70 |
rrewrites ("_ \<^latex>\<open>\\mbox{$\\,\\leadsto^*$}\<close> _" [71, 71] 80) and |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
71 |
blexer_simp ("blexer\<^sup>+" 1000) |
402
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
72 |
|
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
73 |
|
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
74 |
lemma better_retrieve: |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
75 |
shows "rs \<noteq> Nil ==> retrieve (AALTs bs (r#rs)) (Left v) = bs @ retrieve r v" |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
76 |
and "rs \<noteq> Nil ==> retrieve (AALTs bs (r#rs)) (Right v) = bs @ retrieve (AALTs [] rs) v" |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
77 |
apply (metis list.exhaust retrieve.simps(4)) |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
78 |
by (metis list.exhaust retrieve.simps(5)) |
397
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
79 |
|
396 | 80 |
(*>*) |
397
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
81 |
|
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
82 |
section {* Introduction *} |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
83 |
|
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
84 |
text {* |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
85 |
|
400 | 86 |
In the last fifteen or so years, Brzozowski's derivatives of regular |
87 |
expressions have sparked quite a bit of interest in the functional |
|
88 |
programming and theorem prover communities. The beauty of |
|
89 |
Brzozowski's derivatives \cite{Brzozowski1964} is that they are neatly |
|
90 |
expressible in any functional language, and easily definable and |
|
91 |
reasoned about in theorem provers---the definitions just consist of |
|
420 | 92 |
inductive datatypes and simple recursive functions. Derivatives of a |
93 |
regular expression, written @{term "der c r"}, give a simple solution |
|
94 |
to the problem of matching a string @{term s} with a regular |
|
95 |
expression @{term r}: if the derivative of @{term r} w.r.t.\ (in |
|
96 |
succession) all the characters of the string matches the empty string, |
|
97 |
then @{term r} matches @{term s} (and {\em vice versa}). We are aware |
|
423 | 98 |
of a mechanised correctness proof of Brzozowski's derivative-based matcher in HOL4 by |
420 | 99 |
Owens and Slind~\cite{Owens2008}. Another one in Isabelle/HOL is part |
100 |
of the work by Krauss and Nipkow \cite{Krauss2011}. And another one |
|
101 |
in Coq is given by Coquand and Siles \cite{Coquand2012}. |
|
423 | 102 |
Also Ribeiro and Du Bois give one in Agda \cite{RibeiroAgda2017}. |
420 | 103 |
|
104 |
||
423 | 105 |
However, there are two difficulties with derivative-based matchers: |
106 |
First, Brzozowski's original matcher only generates a yes/no answer |
|
107 |
for whether a regular expression matches a string or not. This is too |
|
108 |
little information in the context of lexing where separate tokens must |
|
109 |
be identified and also classified (for example as keywords |
|
110 |
or identifiers). Sulzmann and Lu~\cite{Sulzmann2014} overcome this |
|
111 |
difficulty by cleverly extending Brzozowski's matching |
|
112 |
algorithm. Their extended version generates additional information on |
|
113 |
\emph{how} a regular expression matches a string following the POSIX |
|
114 |
rules for regular expression matching. They achieve this by adding a |
|
115 |
second ``phase'' to Brzozowski's algorithm involving an injection |
|
116 |
function. In our own earlier work we provided the formal |
|
117 |
specification of what POSIX matching means and proved in Isabelle/HOL |
|
118 |
the correctness |
|
119 |
of Sulzmann and Lu's extended algorithm accordingly |
|
120 |
\cite{AusafDyckhoffUrban2016}. |
|
400 | 121 |
|
423 | 122 |
The second difficulty is that Brzozowski's derivatives can |
420 | 123 |
grow to arbitrarily big sizes. For example if we start with the |
124 |
regular expression \mbox{@{text "(a + aa)\<^sup>*"}} and take |
|
125 |
successive derivatives according to the character $a$, we end up with |
|
126 |
a sequence of ever-growing derivatives like |
|
127 |
||
128 |
\def\ll{\stackrel{\_\backslash{} a}{\longrightarrow}} |
|
129 |
\begin{center} |
|
130 |
\begin{tabular}{rll} |
|
131 |
$(a + aa)^*$ & $\ll$ & $(\ONE + \ONE{}a) \cdot (a + aa)^*$\\ |
|
132 |
& $\ll$ & $(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* \;+\; (\ONE + \ONE{}a) \cdot (a + aa)^*$\\ |
|
133 |
& $\ll$ & $(\ZERO + \ZERO{}a + \ZERO) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^* \;+\; $\\ |
|
134 |
& & $\qquad(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^*$\\ |
|
423 | 135 |
& $\ll$ & \ldots \hspace{15mm}(regular expressions of sizes 98, 169, 283, 468, 767, \ldots) |
420 | 136 |
\end{tabular} |
137 |
\end{center} |
|
138 |
||
139 |
\noindent where after around 35 steps we run out of memory on a |
|
423 | 140 |
typical computer (we shall define shortly the precise details of our |
141 |
regular expressions and the derivative operation). Clearly, the |
|
142 |
notation involving $\ZERO$s and $\ONE$s already suggests |
|
143 |
simplification rules that can be applied to regular regular |
|
144 |
expressions, for example $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r |
|
145 |
\Rightarrow r$, $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow |
|
146 |
r$. While such simple-minded simplifications have been proved in our |
|
147 |
earlier work to preserve the correctness of Sulzmann and Lu's |
|
148 |
algorithm \cite{AusafDyckhoffUrban2016}, they unfortunately do |
|
149 |
\emph{not} help with limiting the growth of the derivatives shown |
|
150 |
above: the growth is slowed, but the derivatives can still grow quickly |
|
151 |
beyond any finite bound. |
|
420 | 152 |
|
153 |
||
154 |
||
423 | 155 |
Sulzmann and Lu overcome this ``growth problem'' in a second algorithm |
156 |
\cite{Sulzmann2014} where they introduce bitcoded |
|
157 |
regular expressions. In this version, POSIX values are |
|
158 |
represented as bitsequences and such sequences are incrementally generated |
|
159 |
when derivatives are calculated. The compact representation |
|
160 |
of bitsequences and regular expressions allows them to define a more |
|
161 |
``aggressive'' simplification method that keeps the size of the |
|
162 |
derivatives finite no matter what the length of the string is. |
|
163 |
They make some informal claims about the correctness and linear behaviour |
|
164 |
of this version, but do not provide any supporting proof arguments, not |
|
165 |
even ``pencil-and-paper'' arguments. They write about their bitcoded |
|
166 |
\emph{incremental parsing method} (that is the algorithm to be formalised |
|
167 |
in this paper): |
|
420 | 168 |
|
169 |
\begin{quote}\it |
|
170 |
``Correctness Claim: We further claim that the incremental parsing |
|
171 |
method [..] in combination with the simplification steps [..] |
|
172 |
yields POSIX parse trees. We have tested this claim |
|
173 |
extensively [..] but yet |
|
174 |
have to work out all proof details.'' |
|
175 |
\end{quote} |
|
176 |
||
423 | 177 |
\noindent{}\textbf{Contributions:} We have implemented in Isabelle/HOL |
178 |
the derivative-based lexing algorithm of Sulzmann and Lu |
|
179 |
\cite{Sulzmann2014} where regular expressions are annotated with |
|
180 |
bitsequences. We define the crucial simplification function as a |
|
181 |
recursive function, instead of a fix-point operation. One objective of |
|
182 |
the simplification function is to remove duplicates of regular |
|
183 |
expressions. For this Sulzmann and Lu use in their paper the standard |
|
184 |
@{text nub} function from Haskell's list library, but this function |
|
185 |
does not achieve the intended objective with bitcoded regular expressions. The |
|
186 |
reason is that in the bitcoded setting, each copy generally has a |
|
187 |
different bitcode annotation---so @{text nub} would never ``fire''. |
|
188 |
Inspired by Scala's library for lists, we shall instead use a @{text |
|
189 |
distinctBy} function that finds duplicates under an erasing function |
|
190 |
that deletes bitcodes. |
|
191 |
We shall also introduce our own argument and definitions for |
|
192 |
establishing the correctness of the bitcoded algorithm when |
|
193 |
simplifications are included.\medskip |
|
420 | 194 |
|
423 | 195 |
\noindent In this paper, we shall first briefly introduce the basic notions |
196 |
of regular expressions and describe the basic definitions |
|
197 |
of POSIX lexing from our earlier work \cite{AusafDyckhoffUrban2016}. This serves |
|
198 |
as a reference point for what correctness means in our Isabelle/HOL proofs. We shall then prove |
|
199 |
the correctness for the bitcoded algorithm without simplification, and |
|
200 |
after that extend the proof to include simplification. |
|
400 | 201 |
|
423 | 202 |
*} |
203 |
||
204 |
section {* Background *} |
|
397
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
205 |
|
423 | 206 |
text {* |
207 |
In our Isabelle/HOL formalisation strings are lists of characters with |
|
208 |
the empty string being represented by the empty list, written $[]$, |
|
209 |
and list-cons being written as $\_\!::\!\_\,$; string |
|
210 |
concatenation is $\_ \,@\, \_\,$. We often use the usual |
|
211 |
bracket notation for lists also for strings; for example a string |
|
212 |
consisting of just a single character $c$ is written $[c]$. |
|
213 |
Our regular expressions are defined as usual as the elements of the following inductive |
|
214 |
datatype: |
|
397
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
215 |
|
423 | 216 |
\begin{center} |
217 |
@{text "r :="} |
|
218 |
@{const "ZERO"} $\mid$ |
|
219 |
@{const "ONE"} $\mid$ |
|
220 |
@{term "CH c"} $\mid$ |
|
221 |
@{term "ALT r\<^sub>1 r\<^sub>2"} $\mid$ |
|
222 |
@{term "SEQ r\<^sub>1 r\<^sub>2"} $\mid$ |
|
223 |
@{term "STAR r"} |
|
224 |
\end{center} |
|
397
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
225 |
|
423 | 226 |
\noindent where @{const ZERO} stands for the regular expression that does |
227 |
not match any string, @{const ONE} for the regular expression that matches |
|
228 |
only the empty string and @{term c} for matching a character literal. |
|
229 |
The constructors $+$ and $\cdot$ represent alternatives and sequences, respectively. |
|
230 |
The |
|
231 |
\emph{language} of a regular expression, written $L$, is defined as usual |
|
232 |
and we omit giving the definition here (see for example \cite{AusafDyckhoffUrban2016}). |
|
420 | 233 |
|
423 | 234 |
Central to Brzozowski's regular expression matcher are two functions |
235 |
called @{text nullable} and \emph{derivative}. The latter is written |
|
236 |
$r\backslash c$ for the derivative of the regular expression $r$ |
|
237 |
w.r.t.~the character $c$. Both functions are defined by recursion over |
|
238 |
regular expressions. |
|
420 | 239 |
|
402
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
240 |
\begin{center} |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
241 |
\begin{tabular}{cc} |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
242 |
\begin{tabular}{r@ {\hspace{2mm}}c@ {\hspace{2mm}}l} |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
243 |
@{thm (lhs) der.simps(1)} & $\dn$ & @{thm (rhs) der.simps(1)}\\ |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
244 |
@{thm (lhs) der.simps(2)} & $\dn$ & @{thm (rhs) der.simps(2)}\\ |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
245 |
@{thm (lhs) der.simps(3)} & $\dn$ & @{thm (rhs) der.simps(3)}\\ |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
246 |
@{thm (lhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]}\\ |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
247 |
@{thm (lhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{text "if"} @{term "nullable(r\<^sub>1)"}\\ |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
248 |
& & @{text "then"} @{term "ALT (SEQ (der c r\<^sub>1) r\<^sub>2) (der c r\<^sub>2)"}\\ |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
249 |
& & @{text "else"} @{term "SEQ (der c r\<^sub>1) r\<^sub>2"}\\ |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
250 |
% & & @{thm (rhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]}\\ |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
251 |
@{thm (lhs) der.simps(6)} & $\dn$ & @{thm (rhs) der.simps(6)} |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
252 |
\end{tabular} |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
253 |
& |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
254 |
\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l} |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
255 |
@{thm (lhs) nullable.simps(1)} & $\dn$ & @{thm (rhs) nullable.simps(1)}\\ |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
256 |
@{thm (lhs) nullable.simps(2)} & $\dn$ & @{thm (rhs) nullable.simps(2)}\\ |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
257 |
@{thm (lhs) nullable.simps(3)} & $\dn$ & @{thm (rhs) nullable.simps(3)}\\ |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
258 |
@{thm (lhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\ |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
259 |
@{thm (lhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\ |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
260 |
@{thm (lhs) nullable.simps(6)} & $\dn$ & @{thm (rhs) nullable.simps(6)}\medskip\\ |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
261 |
\end{tabular} |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
262 |
\end{tabular} |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
263 |
\end{center} |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
264 |
|
423 | 265 |
\noindent |
266 |
We can extend this definition to give derivatives w.r.t.~strings: |
|
267 |
||
268 |
\begin{center} |
|
269 |
\begin{tabular}{cc} |
|
270 |
\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l} |
|
271 |
@{thm (lhs) ders.simps(1)} & $\dn$ & @{thm (rhs) ders.simps(1)} |
|
272 |
\end{tabular} |
|
273 |
& |
|
274 |
\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l} |
|
275 |
@{thm (lhs) ders.simps(2)} & $\dn$ & @{thm (rhs) ders.simps(2)} |
|
276 |
\end{tabular} |
|
277 |
\end{tabular} |
|
278 |
\end{center} |
|
279 |
||
280 |
\noindent |
|
281 |
Using @{text nullable} and the derivative operation, we can |
|
282 |
define the following simple regular expression matcher: |
|
283 |
% |
|
284 |
\[ |
|
285 |
@{text "match s r"} \;\dn\; @{term nullable}(r\backslash s) |
|
286 |
\] |
|
287 |
||
288 |
\noindent This is essentially Brzozowski's algorithm from 1964. Its |
|
289 |
main virtue is that the algorithm can be easily implemented as a |
|
290 |
functional program (either in a functional programming language or in |
|
291 |
a theorem prover). The correctness proof for @{text match} amounts to |
|
292 |
establishing the property |
|
293 |
% |
|
294 |
\begin{proposition}\label{matchcorr} |
|
295 |
@{text "match s r"} \;\;\text{if and only if}\;\; $s \in L(r)$ |
|
296 |
\end{proposition} |
|
297 |
||
298 |
\noindent |
|
299 |
It is a fun exercise to formaly prove this property in a theorem prover. |
|
300 |
||
301 |
The novel idea of Sulzmann and Lu is to extend this algorithm for |
|
302 |
lexing, where it is important to find out which part of the string |
|
303 |
is matched by which part of the regular expression. |
|
304 |
For this Sulzmann and Lu presented two lexing algorithms in their paper |
|
305 |
\cite{Sulzmann2014}. This first algorithm consists of two phases: first a |
|
306 |
matching phase (which is Brzozowski's algorithm) and then a value |
|
307 |
construction phase. The values encode \emph{how} a regular expression |
|
308 |
matches a string. \emph{Values} are defined as the inductive datatype |
|
309 |
||
310 |
\begin{center} |
|
311 |
@{text "v :="} |
|
312 |
@{const "Void"} $\mid$ |
|
313 |
@{term "val.Char c"} $\mid$ |
|
314 |
@{term "Left v"} $\mid$ |
|
315 |
@{term "Right v"} $\mid$ |
|
316 |
@{term "Seq v\<^sub>1 v\<^sub>2"} $\mid$ |
|
317 |
@{term "Stars vs"} |
|
318 |
\end{center} |
|
319 |
||
320 |
\noindent where we use @{term vs} to stand for a list of values. The |
|
321 |
string underlying a value can be calculated by a @{const flat} |
|
322 |
function, written @{term "flat DUMMY"}. It traverses a value and |
|
323 |
collects the characters contained in it. Sulzmann and Lu also define inductively an |
|
324 |
inhabitation relation that associates values to regular expressions: |
|
325 |
||
326 |
\begin{center} |
|
327 |
\begin{tabular}{c} |
|
328 |
\\[-8mm] |
|
329 |
@{thm[mode=Axiom] Prf.intros(4)} \qquad |
|
330 |
@{thm[mode=Axiom] Prf.intros(5)[of "c"]}\\[4mm] |
|
331 |
@{thm[mode=Rule] Prf.intros(2)[of "v\<^sub>1" "r\<^sub>1" "r\<^sub>2"]} \qquad |
|
332 |
@{thm[mode=Rule] Prf.intros(3)[of "v\<^sub>2" "r\<^sub>1" "r\<^sub>2"]}\\[4mm] |
|
333 |
@{thm[mode=Rule] Prf.intros(1)[of "v\<^sub>1" "r\<^sub>1" "v\<^sub>2" "r\<^sub>2"]} \qquad |
|
334 |
@{thm[mode=Rule] Prf.intros(6)[of "vs" "r"]} |
|
335 |
\end{tabular} |
|
336 |
\end{center} |
|
337 |
||
338 |
\noindent Note that no values are associated with the regular expression |
|
339 |
@{term ZERO}, since it cannot match any string. |
|
340 |
It is routine to establish how values ``inhabiting'' a regular |
|
341 |
expression correspond to the language of a regular expression, namely |
|
342 |
||
343 |
\begin{proposition} |
|
344 |
@{thm L_flat_Prf} |
|
345 |
\end{proposition} |
|
346 |
||
347 |
In general there is more than one value inhabited by a regular |
|
348 |
expression (meaning regular expressions can typically match more |
|
349 |
than one string). But even when fixing a string from the language of the |
|
350 |
regular expression, there are generally more than one way of how the |
|
351 |
regular expression can match this string. POSIX lexing is about |
|
352 |
identifying the unique value for a given regular expression and a |
|
353 |
string that satisfies the informal POSIX rules (see |
|
354 |
\cite{POSIX,Kuklewicz,OkuiSuzuki2010,Sulzmann2014,Vansummeren2006}).\footnote{POSIX |
|
355 |
lexing acquired its name from the fact that the corresponding |
|
356 |
rules were described as part of the POSIX specification for |
|
357 |
Unix-like operating systems \cite{POSIX}.} Sometimes these |
|
358 |
informal rules are called \emph{maximal much rule} and \emph{rule priority}. |
|
359 |
One contribution of our earlier paper is to give a convenient |
|
360 |
specification for what a POSIX value is (the inductive rules are shown in |
|
361 |
Figure~\ref{POSIXrules}). |
|
397
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
362 |
|
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
363 |
\begin{figure}[t] |
423 | 364 |
\begin{center} |
365 |
\begin{tabular}{c} |
|
366 |
@{thm[mode=Axiom] Posix.intros(1)}\<open>P\<close>@{term "ONE"} \qquad |
|
367 |
@{thm[mode=Axiom] Posix.intros(2)}\<open>P\<close>@{term "c"}\medskip\\ |
|
368 |
@{thm[mode=Rule] Posix.intros(3)[of "s" "r\<^sub>1" "v" "r\<^sub>2"]}\<open>P+L\<close>\qquad |
|
369 |
@{thm[mode=Rule] Posix.intros(4)[of "s" "r\<^sub>2" "v" "r\<^sub>1"]}\<open>P+R\<close>\medskip\\ |
|
370 |
$\mprset{flushleft} |
|
371 |
\inferrule |
|
372 |
{@{thm (prem 1) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \qquad |
|
373 |
@{thm (prem 2) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \\\\ |
|
374 |
@{thm (prem 3) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}} |
|
375 |
{@{thm (concl) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}$\<open>PS\<close>\medskip\smallskip\\ |
|
376 |
@{thm[mode=Axiom] Posix.intros(7)}\<open>P[]\<close>\qquad |
|
377 |
$\mprset{flushleft} |
|
378 |
\inferrule |
|
379 |
{@{thm (prem 1) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad |
|
380 |
@{thm (prem 2) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad |
|
381 |
@{thm (prem 3) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \\\\ |
|
382 |
@{thm (prem 4) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}} |
|
383 |
{@{thm (concl) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}$\<open>P\<star>\<close>\\[-4mm] |
|
384 |
\end{tabular} |
|
385 |
\end{center} |
|
386 |
\caption{The inductive definition of POSIX values taken from our earlier paper \cite{AusafDyckhoffUrban2016}. The ternary relation, written $(s, r) \rightarrow v$, formalises the notion |
|
387 |
of given a string $s$ and a regular |
|
388 |
expression $r$ what is the unique value $v$ that satisfies the informal POSIX constraints for |
|
389 |
regular expression matching.}\label{POSIXrules} |
|
390 |
\end{figure} |
|
391 |
||
392 |
The clever idea by Sulzmann and Lu \cite{Sulzmann2014} in their first algorithm is to define |
|
393 |
an injection function on values that mirrors (but inverts) the |
|
394 |
construction of the derivative on regular expressions. Essentially it |
|
395 |
injects back a character into a value. |
|
396 |
For this they define two functions called @{text mkeps} and @{text inj}: |
|
397 |
||
398 |
\begin{center} |
|
399 |
\begin{tabular}{l} |
|
400 |
\begin{tabular}{lcl} |
|
401 |
@{thm (lhs) mkeps.simps(1)} & $\dn$ & @{thm (rhs) mkeps.simps(1)}\\ |
|
402 |
@{thm (lhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]}\\ |
|
403 |
@{thm (lhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]}\\ |
|
404 |
@{thm (lhs) mkeps.simps(4)} & $\dn$ & @{thm (rhs) mkeps.simps(4)}\\ |
|
405 |
\end{tabular}\smallskip\\ |
|
406 |
||
407 |
\begin{tabular}{lcl} |
|
408 |
@{thm (lhs) injval.simps(1)} & $\dn$ & @{thm (rhs) injval.simps(1)}\\ |
|
409 |
@{thm (lhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]} & $\dn$ & |
|
410 |
@{thm (rhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]}\\ |
|
411 |
@{thm (lhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$ & |
|
412 |
@{thm (rhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\ |
|
413 |
@{thm (lhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$ |
|
414 |
& @{thm (rhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\ |
|
415 |
@{thm (lhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$ |
|
416 |
& @{thm (rhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\ |
|
417 |
@{thm (lhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$ |
|
418 |
& @{thm (rhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\ |
|
419 |
@{thm (lhs) injval.simps(7)[of "r" "c" "v" "vs"]} & $\dn$ |
|
420 |
& @{thm (rhs) injval.simps(7)[of "r" "c" "v" "vs"]} |
|
421 |
\end{tabular} |
|
422 |
\end{tabular} |
|
423 |
\end{center} |
|
424 |
||
425 |
\noindent |
|
426 |
The function @{text mkeps} is called when the last derivative is nullable, that is |
|
427 |
the string to be matched is in the language of the regular expression. It generates |
|
428 |
a value for how the last derivative can match the empty string. The injection function |
|
429 |
then calculates the corresponding value for each intermediate derivative until |
|
430 |
a value for the original regular expression is generated. |
|
431 |
Graphically the algorithm by |
|
432 |
Sulzmann and Lu can be illustrated by the picture in Figure~\ref{Sulz} |
|
433 |
where the path from the left to the right involving @{term derivatives}/@{const |
|
434 |
nullable} is the first phase of the algorithm (calculating successive |
|
435 |
\Brz's derivatives) and @{const mkeps}/@{text inj}, the path from right to |
|
436 |
left, the second phase. This picture shows the steps required when a |
|
437 |
regular expression, say @{text "r\<^sub>1"}, matches the string @{term |
|
438 |
"[a,b,c]"}. The lexing algorithm by Sulzmann and Lu can be defined as: |
|
439 |
||
440 |
\begin{figure}[t] |
|
397
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
441 |
\begin{center} |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
442 |
\begin{tikzpicture}[scale=2,node distance=1.3cm, |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
443 |
every node/.style={minimum size=6mm}] |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
444 |
\node (r1) {@{term "r\<^sub>1"}}; |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
445 |
\node (r2) [right=of r1]{@{term "r\<^sub>2"}}; |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
446 |
\draw[->,line width=1mm](r1)--(r2) node[above,midway] {@{term "der a DUMMY"}}; |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
447 |
\node (r3) [right=of r2]{@{term "r\<^sub>3"}}; |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
448 |
\draw[->,line width=1mm](r2)--(r3) node[above,midway] {@{term "der b DUMMY"}}; |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
449 |
\node (r4) [right=of r3]{@{term "r\<^sub>4"}}; |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
450 |
\draw[->,line width=1mm](r3)--(r4) node[above,midway] {@{term "der c DUMMY"}}; |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
451 |
\draw (r4) node[anchor=west] {\;\raisebox{3mm}{@{term nullable}}}; |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
452 |
\node (v4) [below=of r4]{@{term "v\<^sub>4"}}; |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
453 |
\draw[->,line width=1mm](r4) -- (v4); |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
454 |
\node (v3) [left=of v4] {@{term "v\<^sub>3"}}; |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
455 |
\draw[->,line width=1mm](v4)--(v3) node[below,midway] {\<open>inj r\<^sub>3 c\<close>}; |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
456 |
\node (v2) [left=of v3]{@{term "v\<^sub>2"}}; |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
457 |
\draw[->,line width=1mm](v3)--(v2) node[below,midway] {\<open>inj r\<^sub>2 b\<close>}; |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
458 |
\node (v1) [left=of v2] {@{term "v\<^sub>1"}}; |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
459 |
\draw[->,line width=1mm](v2)--(v1) node[below,midway] {\<open>inj r\<^sub>1 a\<close>}; |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
460 |
\draw (r4) node[anchor=north west] {\;\raisebox{-8mm}{@{term "mkeps"}}}; |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
461 |
\end{tikzpicture} |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
462 |
\end{center} |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
463 |
\mbox{}\\[-13mm] |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
464 |
|
423 | 465 |
\caption{The two phases of the first algorithm by Sulzmann \& Lu \cite{Sulzmann2014}, |
397
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
466 |
matching the string @{term "[a,b,c]"}. The first phase (the arrows from |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
467 |
left to right) is \Brz's matcher building successive derivatives. If the |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
468 |
last regular expression is @{term nullable}, then the functions of the |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
469 |
second phase are called (the top-down and right-to-left arrows): first |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
470 |
@{term mkeps} calculates a value @{term "v\<^sub>4"} witnessing |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
471 |
how the empty string has been recognised by @{term "r\<^sub>4"}. After |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
472 |
that the function @{term inj} ``injects back'' the characters of the string into |
423 | 473 |
the values. The value @{term "v\<^sub>1"} is the result of the algorithm representing |
474 |
the POSIX value for this string and |
|
475 |
regular expression. |
|
397
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
476 |
\label{Sulz}} |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
477 |
\end{figure} |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
478 |
|
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
479 |
|
420 | 480 |
|
481 |
\begin{center} |
|
423 | 482 |
\begin{tabular}{lcl} |
483 |
@{thm (lhs) lexer.simps(1)} & $\dn$ & @{thm (rhs) lexer.simps(1)}\\ |
|
484 |
@{thm (lhs) lexer.simps(2)} & $\dn$ & @{text "case"} @{term "lexer (der c r) s"} @{text of}\\ |
|
485 |
& & \phantom{$|$} @{term "None"} @{text "\<Rightarrow>"} @{term None}\\ |
|
486 |
& & $|$ @{term "Some v"} @{text "\<Rightarrow>"} @{term "Some (injval r c v)"} |
|
420 | 487 |
\end{tabular} |
488 |
\end{center} |
|
489 |
||
423 | 490 |
|
491 |
We have shown in our earlier paper \cite{AusafDyckhoffUrban2016} that the algorithm by Sulzmann and Lu |
|
492 |
is correct. The cenral property we established relates the derivative operation to the injection function. |
|
420 | 493 |
|
423 | 494 |
\begin{proposition}\label{Posix2} |
495 |
\textit{If} $(s,\; r\backslash c) \rightarrow v$ \textit{then} $(c :: s,\; r) \rightarrow$ \textit{inj} $r\; c\; v$. |
|
496 |
\end{proposition} |
|
420 | 497 |
|
423 | 498 |
\noindent |
499 |
With this in place we were able to prove: |
|
397
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
500 |
|
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
501 |
|
423 | 502 |
\begin{proposition}\mbox{}\smallskip\\\label{lexercorrect} |
503 |
\begin{tabular}{ll} |
|
504 |
(1) & @{thm (lhs) lexer_correct_None} if and only if @{thm (rhs) lexer_correct_None}\\ |
|
505 |
(2) & @{thm (lhs) lexer_correct_Some} if and only if @{thm (rhs) lexer_correct_Some}\\ |
|
397
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
506 |
\end{tabular} |
423 | 507 |
\end{proposition} |
397
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
508 |
|
423 | 509 |
\noindent |
510 |
In fact we have shown that in the success case the generated POSIX value $v$ is |
|
511 |
unique and in the failure case that there is no POSIX value $v$ that satisfies |
|
512 |
$(s, r) \rightarrow v$. While the algorithm is correct, it is excrutiatingly |
|
513 |
slow in examples where the derivatives grow arbitrarily (see example from the |
|
514 |
Introduction). However it can be used as a conveninet reference point for the correctness |
|
515 |
proof of the second algorithm by Sulzmann and Lu, which we shall describe next. |
|
516 |
||
397
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
517 |
*} |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
518 |
|
402
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
519 |
section {* Bitcoded Regular Expressions and Derivatives *} |
397
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
520 |
|
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
521 |
text {* |
405
3cfea5bb5e23
updated some of the text and cardinality proof
Christian Urban <christian.urban@kcl.ac.uk>
parents:
402
diff
changeset
|
522 |
|
416
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
523 |
In the second part of their paper \cite{Sulzmann2014}, |
423 | 524 |
Sulzmann and Lu describe another algorithm that also generates POSIX |
405
3cfea5bb5e23
updated some of the text and cardinality proof
Christian Urban <christian.urban@kcl.ac.uk>
parents:
402
diff
changeset
|
525 |
values but dispences with the second phase where characters are |
3cfea5bb5e23
updated some of the text and cardinality proof
Christian Urban <christian.urban@kcl.ac.uk>
parents:
402
diff
changeset
|
526 |
injected ``back'' into values. For this they annotate bitcodes to |
3cfea5bb5e23
updated some of the text and cardinality proof
Christian Urban <christian.urban@kcl.ac.uk>
parents:
402
diff
changeset
|
527 |
regular expressions, which we define in Isabelle/HOL as the datatype |
402
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
528 |
|
405
3cfea5bb5e23
updated some of the text and cardinality proof
Christian Urban <christian.urban@kcl.ac.uk>
parents:
402
diff
changeset
|
529 |
\begin{center} |
3cfea5bb5e23
updated some of the text and cardinality proof
Christian Urban <christian.urban@kcl.ac.uk>
parents:
402
diff
changeset
|
530 |
\begin{tabular}{lcl} |
416
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
531 |
@{term breg} & $::=$ & @{term "AZERO"} $\quad\mid\quad$ @{term "AONE bs"}\\ |
405
3cfea5bb5e23
updated some of the text and cardinality proof
Christian Urban <christian.urban@kcl.ac.uk>
parents:
402
diff
changeset
|
532 |
& $\mid$ & @{term "ACHAR bs c"}\\ |
3cfea5bb5e23
updated some of the text and cardinality proof
Christian Urban <christian.urban@kcl.ac.uk>
parents:
402
diff
changeset
|
533 |
& $\mid$ & @{term "AALTs bs rs"}\\ |
3cfea5bb5e23
updated some of the text and cardinality proof
Christian Urban <christian.urban@kcl.ac.uk>
parents:
402
diff
changeset
|
534 |
& $\mid$ & @{term "ASEQ bs r\<^sub>1 r\<^sub>2"}\\ |
3cfea5bb5e23
updated some of the text and cardinality proof
Christian Urban <christian.urban@kcl.ac.uk>
parents:
402
diff
changeset
|
535 |
& $\mid$ & @{term "ASTAR bs r"} |
3cfea5bb5e23
updated some of the text and cardinality proof
Christian Urban <christian.urban@kcl.ac.uk>
parents:
402
diff
changeset
|
536 |
\end{tabular} |
3cfea5bb5e23
updated some of the text and cardinality proof
Christian Urban <christian.urban@kcl.ac.uk>
parents:
402
diff
changeset
|
537 |
\end{center} |
3cfea5bb5e23
updated some of the text and cardinality proof
Christian Urban <christian.urban@kcl.ac.uk>
parents:
402
diff
changeset
|
538 |
|
418
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
539 |
\noindent where @{text bs} stands for bitsequences; @{text r}, |
416
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
540 |
@{text "r\<^sub>1"} and @{text "r\<^sub>2"} for bitcoded regular |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
541 |
expressions; and @{text rs} for lists of bitcoded regular |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
542 |
expressions. The binary alternative @{text "ALT bs r\<^sub>1 r\<^sub>2"} |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
543 |
is just an abbreviation for @{text "ALTs bs [r\<^sub>1, r\<^sub>2]"}. |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
544 |
For bitsequences we just use lists made up of the |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
545 |
constants @{text Z} and @{text S}. The idea with bitcoded regular |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
546 |
expressions is to incrementally generate the value information (for |
418
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
547 |
example @{text Left} and @{text Right}) as bitsequences. For this |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
548 |
Sulzmann and Lu define a coding |
416
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
549 |
function for how values can be coded into bitsequences. |
402
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
550 |
|
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
551 |
\begin{center} |
416
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
552 |
\begin{tabular}{cc} |
402
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
553 |
\begin{tabular}{lcl} |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
554 |
@{thm (lhs) code.simps(1)} & $\dn$ & @{thm (rhs) code.simps(1)}\\ |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
555 |
@{thm (lhs) code.simps(2)} & $\dn$ & @{thm (rhs) code.simps(2)}\\ |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
556 |
@{thm (lhs) code.simps(3)} & $\dn$ & @{thm (rhs) code.simps(3)}\\ |
416
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
557 |
@{thm (lhs) code.simps(4)} & $\dn$ & @{thm (rhs) code.simps(4)} |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
558 |
\end{tabular} |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
559 |
& |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
560 |
\begin{tabular}{lcl} |
402
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
561 |
@{thm (lhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]} & $\dn$ & @{thm (rhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]}\\ |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
562 |
@{thm (lhs) code.simps(6)} & $\dn$ & @{thm (rhs) code.simps(6)}\\ |
416
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
563 |
@{thm (lhs) code.simps(7)} & $\dn$ & @{thm (rhs) code.simps(7)}\\ |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
564 |
\mbox{\phantom{XX}}\\ |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
565 |
\end{tabular} |
402
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
566 |
\end{tabular} |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
567 |
\end{center} |
416
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
568 |
|
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
569 |
\noindent |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
570 |
As can be seen, this coding is ``lossy'' in the sense that we do not |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
571 |
record explicitly character values and also not sequence values (for |
418
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
572 |
them we just append two bitsequences). However, the |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
573 |
different alternatives for @{text Left}, respectively @{text Right}, are recorded as @{text Z} and |
416
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
574 |
@{text S} followed by some bitsequence. Similarly, we use @{text Z} to indicate |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
575 |
if there is still a value coming in the list of @{text Stars}, whereas @{text S} |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
576 |
indicates the end of the list. The lossiness makes the process of |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
577 |
decoding a bit more involved, but the point is that if we have a |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
578 |
regular expression \emph{and} a bitsequence of a corresponding value, |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
579 |
then we can always decode the value accurately. The decoding can be |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
580 |
defined by using two functions called $\textit{decode}'$ and |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
581 |
\textit{decode}: |
402
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
582 |
|
416
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
583 |
\begin{center} |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
584 |
\begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}} |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
585 |
$\textit{decode}'\,bs\,(\ONE)$ & $\dn$ & $(\Empty, bs)$\\ |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
586 |
$\textit{decode}'\,bs\,(c)$ & $\dn$ & $(\Char\,c, bs)$\\ |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
587 |
$\textit{decode}'\,(\Z\!::\!bs)\;(r_1 + r_2)$ & $\dn$ & |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
588 |
$\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}\; |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
589 |
(\Left\,v, bs_1)$\\ |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
590 |
$\textit{decode}'\,(\S\!::\!bs)\;(r_1 + r_2)$ & $\dn$ & |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
591 |
$\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_2\;\textit{in}\; |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
592 |
(\Right\,v, bs_1)$\\ |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
593 |
$\textit{decode}'\,bs\;(r_1\cdot r_2)$ & $\dn$ & |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
594 |
$\textit{let}\,(v_1, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}$\\ |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
595 |
& & $\textit{let}\,(v_2, bs_2) = \textit{decode}'\,bs_1\,r_2$ |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
596 |
\hspace{2mm}$\textit{in}\;(\Seq\,v_1\,v_2, bs_2)$\\ |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
597 |
$\textit{decode}'\,(\Z\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\ |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
598 |
$\textit{decode}'\,(\S\!::\!bs)\,(r^*)$ & $\dn$ & |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
599 |
$\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r\;\textit{in}$\\ |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
600 |
& & $\textit{let}\,(\Stars\,vs, bs_2) = \textit{decode}'\,bs_1\,r^*$ |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
601 |
\hspace{2mm}$\textit{in}\;(\Stars\,v\!::\!vs, bs_2)$\bigskip\\ |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
602 |
$\textit{decode}\,bs\,r$ & $\dn$ & |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
603 |
$\textit{let}\,(v, bs') = \textit{decode}'\,bs\,r\;\textit{in}$\\ |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
604 |
& & \hspace{7mm}$\textit{if}\;bs' = []\;\textit{then}\;\textit{Some}\,v\; |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
605 |
\textit{else}\;\textit{None}$ |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
606 |
\end{tabular} |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
607 |
\end{center} |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
608 |
|
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
609 |
\noindent |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
610 |
The function \textit{decode} checks whether all of the bitsequence is |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
611 |
consumed and returns the corresponding value as @{term "Some v"}; otherwise |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
612 |
it fails with @{text "None"}. We can establish that for a value $v$ |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
613 |
inhabited by a regular expression $r$, the decoding of its |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
614 |
bitsequence never fails. |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
615 |
|
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
616 |
\begin{lemma}\label{codedecode}\it |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
617 |
If $\;\vdash v : r$ then |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
618 |
$\;\textit{decode}\,(\textit{code}\, v)\,r = \textit{Some}\, v$. |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
619 |
\end{lemma} |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
620 |
|
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
621 |
\begin{proof} |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
622 |
This follows from the property that |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
623 |
$\textit{decode}'\,((\textit{code}\,v) \,@\, bs)\,r = (v, bs)$ holds |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
624 |
for any bit-sequence $bs$ and $\vdash v : r$. This property can be |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
625 |
easily proved by induction on $\vdash v : r$. |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
626 |
\end{proof} |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
627 |
|
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
628 |
Sulzmann and Lu define the function \emph{internalise} |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
629 |
in order to transform standard regular expressions into annotated |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
630 |
regular expressions. We write this operation as $r^\uparrow$. |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
631 |
This internalisation uses the following |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
632 |
\emph{fuse} function. |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
633 |
|
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
634 |
\begin{center} |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
635 |
\begin{tabular}{lcl} |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
636 |
$\textit{fuse}\,bs\,(\textit{ZERO})$ & $\dn$ & $\textit{ZERO}$\\ |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
637 |
$\textit{fuse}\,bs\,(\textit{ONE}\,bs')$ & $\dn$ & |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
638 |
$\textit{ONE}\,(bs\,@\,bs')$\\ |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
639 |
$\textit{fuse}\,bs\,(\textit{CHAR}\,bs'\,c)$ & $\dn$ & |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
640 |
$\textit{CHAR}\,(bs\,@\,bs')\,c$\\ |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
641 |
$\textit{fuse}\,bs\,(\textit{ALTs}\,bs'\,rs)$ & $\dn$ & |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
642 |
$\textit{ALTs}\,(bs\,@\,bs')\,rs$\\ |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
643 |
$\textit{fuse}\,bs\,(\textit{SEQ}\,bs'\,r_1\,r_2)$ & $\dn$ & |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
644 |
$\textit{SEQ}\,(bs\,@\,bs')\,r_1\,r_2$\\ |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
645 |
$\textit{fuse}\,bs\,(\textit{STAR}\,bs'\,r)$ & $\dn$ & |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
646 |
$\textit{STAR}\,(bs\,@\,bs')\,r$ |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
647 |
\end{tabular} |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
648 |
\end{center} |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
649 |
|
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
650 |
\noindent |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
651 |
A regular expression can then be \emph{internalised} into a bitcoded |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
652 |
regular expression as follows. |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
653 |
|
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
654 |
\begin{center} |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
655 |
\begin{tabular}{lcl} |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
656 |
$(\ZERO)^\uparrow$ & $\dn$ & $\textit{ZERO}$\\ |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
657 |
$(\ONE)^\uparrow$ & $\dn$ & $\textit{ONE}\,[]$\\ |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
658 |
$(c)^\uparrow$ & $\dn$ & $\textit{CHAR}\,[]\,c$\\ |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
659 |
$(r_1 + r_2)^\uparrow$ & $\dn$ & |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
660 |
$\textit{ALT}\;[]\,(\textit{fuse}\,[\Z]\,r_1^\uparrow)\, |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
661 |
(\textit{fuse}\,[\S]\,r_2^\uparrow)$\\ |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
662 |
$(r_1\cdot r_2)^\uparrow$ & $\dn$ & |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
663 |
$\textit{SEQ}\;[]\,r_1^\uparrow\,r_2^\uparrow$\\ |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
664 |
$(r^*)^\uparrow$ & $\dn$ & |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
665 |
$\textit{STAR}\;[]\,r^\uparrow$\\ |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
666 |
\end{tabular} |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
667 |
\end{center} |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
668 |
|
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
669 |
\noindent |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
670 |
There is also an \emph{erase}-function, written $a^\downarrow$, which |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
671 |
transforms a bitcoded regular expression into a (standard) regular |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
672 |
expression by just erasing the annotated bitsequences. We omit the |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
673 |
straightforward definition. For defining the algorithm, we also need |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
674 |
the functions \textit{bnullable} and \textit{bmkeps}, which are the |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
675 |
``lifted'' versions of \textit{nullable} and \textit{mkeps} acting on |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
676 |
bitcoded regular expressions, instead of regular expressions. |
423 | 677 |
% |
416
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
678 |
\begin{center} |
418
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
679 |
\begin{tabular}{@ {}c@ {}c@ {}} |
423 | 680 |
\begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l} |
418
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
681 |
$\textit{bnullable}\,(\textit{ZERO})$ & $\dn$ & $\textit{false}$\\ |
416
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
682 |
$\textit{bnullable}\,(\textit{ONE}\,bs)$ & $\dn$ & $\textit{true}$\\ |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
683 |
$\textit{bnullable}\,(\textit{CHAR}\,bs\,c)$ & $\dn$ & $\textit{false}$\\ |
418
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
684 |
$\textit{bnullable}\,(\textit{ALTs}\,bs\,\rs)$ & $\dn$ & |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
685 |
$\exists\, r \in \rs. \,\textit{bnullable}\,r$\\ |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
686 |
$\textit{bnullable}\,(\textit{SEQ}\,bs\,r_1\,r_2)$ & $\dn$ & |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
687 |
$\textit{bnullable}\,r_1\wedge \textit{bnullable}\,r_2$\\ |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
688 |
$\textit{bnullable}\,(\textit{STAR}\,bs\,r)$ & $\dn$ & |
416
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
689 |
$\textit{true}$ |
418
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
690 |
\end{tabular} |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
691 |
& |
423 | 692 |
\begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}} |
418
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
693 |
$\textit{bmkeps}\,(\textit{ONE}\,bs)$ & $\dn$ & $bs$\\ |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
694 |
$\textit{bmkeps}\,(\textit{ALTs}\,bs\,r\!::\!\rs)$ & $\dn$ & |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
695 |
$\textit{if}\;\textit{bnullable}\,r$\\ |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
696 |
& &$\textit{then}\;bs\,@\,\textit{bmkeps}\,r$\\ |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
697 |
& &$\textit{else}\;bs\,@\,\textit{bmkeps}\,\rs$\\ |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
698 |
$\textit{bmkeps}\,(\textit{SEQ}\,bs\,r_1\,r_2)$ & $\dn$ &\\ |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
699 |
\multicolumn{3}{r}{$bs \,@\,\textit{bmkeps}\,r_1\,@\, \textit{bmkeps}\,r_2$}\\ |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
700 |
$\textit{bmkeps}\,(\textit{STAR}\,bs\,r)$ & $\dn$ & |
416
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
701 |
$bs \,@\, [\S]$ |
418
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
702 |
\end{tabular} |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
703 |
\end{tabular} |
416
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
704 |
\end{center} |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
705 |
|
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
706 |
|
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
707 |
\noindent |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
708 |
The key function in the bitcoded algorithm is the derivative of an |
418
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
709 |
bitcoded regular expression. This derivative calculates the |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
710 |
derivative but at the same time also the incremental part of bitsequences |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
711 |
that contribute to constructing a POSIX value. |
416
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
712 |
|
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
713 |
\begin{center} |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
714 |
\begin{tabular}{@ {}lcl@ {}} |
418
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
715 |
$(\textit{ZERO})\backslash c$ & $\dn$ & $\textit{ZERO}$ \\ |
416
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
716 |
$(\textit{ONE}\;bs)\backslash c$ & $\dn$ & $\textit{ZERO}$\\ |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
717 |
$(\textit{CHAR}\;bs\,d)\backslash c$ & $\dn$ & |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
718 |
$\textit{if}\;c=d\; \;\textit{then}\; |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
719 |
\textit{ONE}\;bs\;\textit{else}\;\textit{ZERO}$\\ |
418
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
720 |
$(\textit{ALTs}\;bs\,\rs)\backslash c$ & $\dn$ & |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
721 |
$\textit{ALTs}\,bs\,(\mathit{map}\,(\_\backslash c)\,\rs)$\\ |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
722 |
$(\textit{SEQ}\;bs\,r_1\,r_2)\backslash c$ & $\dn$ & |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
723 |
$\textit{if}\;\textit{bnullable}\,r_1$\\ |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
724 |
& &$\textit{then}\;\textit{ALT}\,bs\,(\textit{SEQ}\,[]\,(r_1\backslash c)\,r_2)$\\ |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
725 |
& &$\phantom{\textit{then}\;\textit{ALT}\,bs\,}(\textit{fuse}\,(\textit{bmkeps}\,r_1)\,(r_2\backslash c))$\\ |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
726 |
& &$\textit{else}\;\textit{SEQ}\,bs\,(r_1\backslash c)\,r_2$\\ |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
727 |
$(\textit{STAR}\,bs\,r)\backslash c$ & $\dn$ & |
416
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
728 |
$\textit{SEQ}\;bs\,(\textit{fuse}\, [\Z] (r\backslash c))\, |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
729 |
(\textit{STAR}\,[]\,r)$ |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
730 |
\end{tabular} |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
731 |
\end{center} |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
732 |
|
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
733 |
|
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
734 |
\noindent |
418
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
735 |
This function can also be extended to strings, written $r\backslash s$, |
416
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
736 |
just like the standard derivative. We omit the details. Finally we |
418
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
737 |
can define Sulzmann and Lu's bitcoded lexer, which we call \textit{blexer}: |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
738 |
|
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
739 |
\begin{center} |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
740 |
\begin{tabular}{lcl} |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
741 |
$\textit{blexer}\;r\,s$ & $\dn$ & |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
742 |
$\textit{let}\;r_{der} = (r^\uparrow)\backslash s\;\textit{in}$\\ |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
743 |
& & $\;\;\;\;\textit{if}\; \textit{bnullable}(r_{der}) \;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,r_{der})\,r |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
744 |
\;\;\textit{else}\;\textit{None}$ |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
745 |
\end{tabular} |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
746 |
\end{center} |
416
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
747 |
|
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
748 |
\noindent |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
749 |
This bitcoded lexer first internalises the regular expression $r$ and then |
418
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
750 |
builds the bitcoded derivative according to $s$. If the derivative is |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
751 |
(b)nullable the string is in the language of $r$ and it extracts the bitsequence using the |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
752 |
$\textit{bmkeps}$ function. Finally it decodes the bitsequence into a value. If |
416
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
753 |
the derivative is \emph{not} nullable, then $\textit{None}$ is |
418
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
754 |
returned. We can show that this way of calculating a value |
416
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
755 |
generates the same result as with \textit{lexer}. |
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
756 |
|
418
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
757 |
Before we can proceed we need to define a helper function, called |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
758 |
\textit{retrieve}, which Sulzmann and Lu introduced for the correctness proof. |
416
57182b36ec01
more with the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
410
diff
changeset
|
759 |
|
418
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
760 |
\begin{center} |
402
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
761 |
\begin{tabular}{lcl} |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
762 |
@{thm (lhs) retrieve.simps(1)} & $\dn$ & @{thm (rhs) retrieve.simps(1)}\\ |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
763 |
@{thm (lhs) retrieve.simps(2)} & $\dn$ & @{thm (rhs) retrieve.simps(2)}\\ |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
764 |
@{thm (lhs) retrieve.simps(3)} & $\dn$ & @{thm (rhs) retrieve.simps(3)}\\ |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
765 |
@{thm (lhs) better_retrieve(1)} & $\dn$ & @{thm (rhs) better_retrieve(1)}\\ |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
766 |
@{thm (lhs) better_retrieve(2)} & $\dn$ & @{thm (rhs) better_retrieve(2)}\\ |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
767 |
@{thm (lhs) retrieve.simps(6)[of _ "r\<^sub>1" "r\<^sub>2" "v\<^sub>1" "v\<^sub>2"]} |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
768 |
& $\dn$ & @{thm (rhs) retrieve.simps(6)[of _ "r\<^sub>1" "r\<^sub>2" "v\<^sub>1" "v\<^sub>2"]}\\ |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
769 |
@{thm (lhs) retrieve.simps(7)} & $\dn$ & @{thm (rhs) retrieve.simps(7)}\\ |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
770 |
@{thm (lhs) retrieve.simps(8)} & $\dn$ & @{thm (rhs) retrieve.simps(8)} |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
771 |
\end{tabular} |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
772 |
\end{center} |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
773 |
|
418
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
774 |
\noindent |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
775 |
The idea behind this function is to retrieve a possibly partial |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
776 |
bitcode from a bitcoded regular expression, where the retrieval is |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
777 |
guided by a value. For example if the value is $\Left$ then we |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
778 |
descend into the left-hand side of an alternative in order to |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
779 |
assemble the bitcode. Similarly for |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
780 |
$\Right$. The property we can show is that for a given $v$ and $r$ |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
781 |
with $\vdash v : r$, the retrieved bitsequence from the internalised |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
782 |
regular expression is equal to the bitcoded version of $v$. |
402
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
783 |
|
418
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
784 |
\begin{lemma}\label{retrievecode} |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
785 |
If $\vdash v : r$ then $\textit{code}\, v = \textit{retrieve}\,(r^\uparrow)\,v$. |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
786 |
\end{lemma} |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
787 |
|
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
788 |
\noindent |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
789 |
We also need some auxiliary facts about how the bitcoded operations |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
790 |
relate to the ``standard'' operations on regular expressions. For |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
791 |
example if we build a bitcoded derivative and erase the result, this |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
792 |
is the same as if we first erase the bitcoded regular expression and |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
793 |
then perform the ``standard'' derivative operation. |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
794 |
|
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
795 |
\begin{lemma}\label{bnullable}\mbox{}\smallskip\\ |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
796 |
\begin{tabular}{ll} |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
797 |
\textit{(1)} & $(a\backslash s)^\downarrow = (a^\downarrow)\backslash s$\\ |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
798 |
\textit{(2)} & $\textit{bnullable}(a)$ iff $\textit{nullable}(a^\downarrow)$\\ |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
799 |
\textit{(3)} & $\textit{bmkeps}(a) = \textit{retrieve}\,a\,(\textit{mkeps}\,(a^\downarrow))$ provided $\textit{nullable}(a^\downarrow)$. |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
800 |
\end{tabular} |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
801 |
\end{lemma} |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
802 |
|
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
803 |
\begin{proof} |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
804 |
All properties are by induction on annotated regular expressions. There are no |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
805 |
interesting cases. |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
806 |
\end{proof} |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
807 |
|
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
808 |
\noindent |
423 | 809 |
The only problem left for the correctness proof is that the bitcoded algorithm |
810 |
has only a ``forward phase'' where POSIX values are generated incrementally. |
|
811 |
We can achive the same effect with @{text lexer} by stacking up injection |
|
812 |
functions in the during forward phase. An auxiliary function, called $\textit{flex}$, |
|
813 |
allows us to recast the rules of $\lexer$ (with its two phases) in terms of a single |
|
814 |
phase. |
|
815 |
||
816 |
\begin{center} |
|
817 |
\begin{tabular}{lcl} |
|
818 |
$\textit{flex}\;r\,f\,[]$ & $\dn$ & $f$\\ |
|
819 |
$\textit{flex}\;r\,f\,(c\!::\!s)$ & $\dn$ & |
|
820 |
$\textit{flex}\,(r\backslash c)\,(\lambda v.\,f\,(\inj\,r\,c\,v))\,s$\\ |
|
821 |
\end{tabular} |
|
822 |
\end{center} |
|
823 |
||
824 |
\noindent |
|
825 |
The point of this function is that when |
|
826 |
reaching the end of the string, we just need to apply the stacked |
|
827 |
injection functions to the value generated by $\mkeps$. |
|
828 |
Using this function we can recast the success case in @{text lexer} |
|
829 |
as follows: |
|
830 |
||
831 |
\begin{proposition}\label{flex} |
|
832 |
If @{text "lexer r s = Some v"} \;then\; @{text "v = "}$\,\textit{flex}\,r\,id\,s\, |
|
833 |
(\mkeps (r\backslash s))$. |
|
834 |
\end{proposition} |
|
835 |
||
836 |
\noindent |
|
837 |
Note we did not redefine \textit{lexer}, we just established that the |
|
838 |
value generated by \textit{lexer} can also be obtained by a different |
|
839 |
method. While this different method is not efficient (we essentially |
|
840 |
need to traverse the string $s$ twice, once for building the |
|
841 |
derivative $r\backslash s$ and another time for stacking up injection |
|
842 |
functions using \textit{flex}), it helped us with proving |
|
843 |
that incrementally building up values. |
|
844 |
||
418
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
845 |
This brings us to our main lemma in this section: if we build a |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
846 |
derivative, say $r\backslash s$ and have a value, say $v$, inhabited |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
847 |
by this derivative, then we can produce the result $\lexer$ generates |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
848 |
by applying this value to the stacked-up injection functions |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
849 |
$\textit{flex}$ assembles. The lemma establishes that this is the same |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
850 |
value as if we build the annotated derivative $r^\uparrow\backslash s$ |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
851 |
and then retrieve the corresponding bitcoded version, followed by a |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
852 |
decoding step. |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
853 |
|
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
854 |
\begin{lemma}[Main Lemma]\label{mainlemma}\it |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
855 |
If $\vdash v : r\backslash s$ then |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
856 |
\[\textit{Some}\,(\textit{flex}\,r\,\textit{id}\,s\,v) = |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
857 |
\textit{decode}(\textit{retrieve}\,(r^\uparrow \backslash s)\,v)\,r\] |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
858 |
\end{lemma} |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
859 |
|
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
860 |
\begin{proof} |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
861 |
This can be proved by induction on $s$ and generalising over |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
862 |
$v$. The interesting point is that we need to prove this in the |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
863 |
reverse direction for $s$. This means instead of cases $[]$ and |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
864 |
$c\!::\!s$, we have cases $[]$ and $s\,@\,[c]$ where we unravel the |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
865 |
string from the back.\footnote{Isabelle/HOL provides an induction principle |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
866 |
for this way of performing the induction.} |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
867 |
|
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
868 |
The case for $[]$ is routine using Lemmas~\ref{codedecode} |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
869 |
and~\ref{retrievecode}. In the case $s\,@\,[c]$, we can infer from |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
870 |
the assumption that $\vdash v : (r\backslash s)\backslash c$ |
423 | 871 |
holds. Hence by Prop.~\ref{Posix2} we know that |
418
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
872 |
(*) $\vdash \inj\,(r\backslash s)\,c\,v : r\backslash s$ holds too. |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
873 |
By definition of $\textit{flex}$ we can unfold the left-hand side |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
874 |
to be |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
875 |
\[ |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
876 |
\textit{Some}\,(\textit{flex}\;r\,\textit{id}\,(s\,@\,[c])\,v) = |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
877 |
\textit{Some}\,(\textit{flex}\;r\,\textit{id}\,s\,(\inj\,(r\backslash s)\,c\,v)) |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
878 |
\] |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
879 |
|
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
880 |
\noindent |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
881 |
By induction hypothesis and (*) we can rewrite the right-hand side to |
423 | 882 |
% |
418
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
883 |
\[ |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
884 |
\textit{decode}\,(\textit{retrieve}\,(r^\uparrow\backslash s)\; |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
885 |
(\inj\,(r\backslash s)\,c\,\,v))\,r |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
886 |
\] |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
887 |
|
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
888 |
\noindent |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
889 |
which is equal to |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
890 |
$\textit{decode}\,(\textit{retrieve}\, (r^\uparrow\backslash |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
891 |
(s\,@\,[c]))\,v)\,r$ as required. The last rewrite step is possible |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
892 |
because we generalised over $v$ in our induction. |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
893 |
\end{proof} |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
894 |
|
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
895 |
\noindent |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
896 |
With this lemma in place, we can prove the correctness of \textit{blexer} such |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
897 |
that it produces the same result as \textit{lexer}. |
405
3cfea5bb5e23
updated some of the text and cardinality proof
Christian Urban <christian.urban@kcl.ac.uk>
parents:
402
diff
changeset
|
898 |
|
3cfea5bb5e23
updated some of the text and cardinality proof
Christian Urban <christian.urban@kcl.ac.uk>
parents:
402
diff
changeset
|
899 |
|
418
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
900 |
\begin{theorem} |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
901 |
$\textit{lexer}\,r\,s = \textit{blexer}\,r\,s$ |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
902 |
\end{theorem} |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
903 |
|
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
904 |
\begin{proof} |
423 | 905 |
We can first expand both sides using Prop.~\ref{flex} and the |
418
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
906 |
definition of \textit{blexer}. This gives us two |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
907 |
\textit{if}-statements, which we need to show to be equal. By |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
908 |
Lemma~\ref{bnullable}\textit{(2)} we know the \textit{if}-tests coincide: |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
909 |
\[ |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
910 |
\textit{bnullable}(r^\uparrow\backslash s) \;\textit{iff}\; |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
911 |
\nullable(r\backslash s) |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
912 |
\] |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
913 |
|
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
914 |
\noindent |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
915 |
For the \textit{if}-branch suppose $r_d \dn r^\uparrow\backslash s$ and |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
916 |
$d \dn r\backslash s$. We have (*) $\nullable\,d$. We can then show |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
917 |
by Lemma~\ref{bnullable}\textit{(3)} that |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
918 |
% |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
919 |
\[ |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
920 |
\textit{decode}(\textit{bmkeps}\,r_d)\,r = |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
921 |
\textit{decode}(\textit{retrieve}\,a\,(\textit{mkeps}\,d))\,r |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
922 |
\] |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
923 |
|
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
924 |
\noindent |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
925 |
where the right-hand side is equal to |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
926 |
$\textit{Some}\,(\textit{flex}\,r\,\textit{id}\,s\,(\textit{mkeps}\, |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
927 |
d))$ by Lemma~\ref{mainlemma} (we know |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
928 |
$\vdash \textit{mkeps}\,d : d$ by (*)). This shows the |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
929 |
\textit{if}-branches return the same value. In the |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
930 |
\textit{else}-branches both \textit{lexer} and \textit{blexer} return |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
931 |
\textit{None}. Therefore we can conclude the proof. |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
932 |
\end{proof} |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
933 |
|
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
934 |
\noindent |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
935 |
This establishes that the bitcoded algorithm by Sulzmann |
423 | 936 |
and Lu \emph{without} simplification produces correct results. This was |
418
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
937 |
only conjectured in their paper \cite{Sulzmann2014}. The next step |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
938 |
is to add simplifications. |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
939 |
|
397
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
940 |
*} |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
941 |
|
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
942 |
|
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
943 |
section {* Simplification *} |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
944 |
|
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
945 |
text {* |
418
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
946 |
|
420 | 947 |
Derivatives as calculated by Brzozowski’s method are usually more |
948 |
complex regular expressions than the initial one; the result is |
|
949 |
that the derivative-based matching and lexing algorithms are |
|
423 | 950 |
often abysmally slow. However, as Sulzmann and Lu wrote, various |
951 |
optimisations are possible, such as the simplifications |
|
952 |
$\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r \Rightarrow r$, |
|
953 |
$\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow r$. While these |
|
954 |
simplifications can speed up the algorithms considerably in many |
|
955 |
cases, they do not solve fundamentally the ``growth problem'' with |
|
956 |
derivatives. To see this let us return to the example |
|
420 | 957 |
|
958 |
||
418
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
959 |
\begin{lemma} |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
960 |
@{thm[mode=IfThen] bnullable0(1)[of "r\<^sub>1" "r\<^sub>2"]} |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
961 |
\end{lemma} |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
962 |
|
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
963 |
|
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
964 |
\begin{lemma} |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
965 |
@{thm[mode=IfThen] rewrite_bmkeps_aux(1)[of "r\<^sub>1" "r\<^sub>2"]} |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
966 |
\end{lemma} |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
967 |
|
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
968 |
\begin{lemma} |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
969 |
@{thm[mode=IfThen] rewrites_to_bsimp} |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
970 |
\end{lemma} |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
971 |
|
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
972 |
\begin{lemma} |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
973 |
@{thm[mode=IfThen] rewrite_preserves_bder(1)[of "r\<^sub>1" "r\<^sub>2"]} |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
974 |
\end{lemma} |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
975 |
|
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
976 |
\begin{lemma} |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
977 |
@{thm[mode=IfThen] central} |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
978 |
\end{lemma} |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
979 |
|
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
980 |
\begin{theorem} |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
981 |
@{thm[mode=IfThen] main_blexer_simp} |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
982 |
\end{theorem} |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
983 |
|
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
984 |
Sulzmann \& Lu apply simplification via a fixpoint operation |
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
985 |
|
41a2a3b63853
more of the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
416
diff
changeset
|
986 |
; also does not use erase to filter out duplicates. |
402
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
987 |
|
397
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
988 |
not direct correspondence with PDERs, because of example |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
989 |
problem with retrieve |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
990 |
|
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
991 |
correctness |
398 | 992 |
|
402
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
993 |
|
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
994 |
|
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
995 |
|
398 | 996 |
|
997 |
||
998 |
\begin{figure}[t] |
|
999 |
\begin{center} |
|
1000 |
\begin{tabular}{c} |
|
402
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
1001 |
@{thm[mode=Axiom] bs1[of _ "r\<^sub>2"]}\qquad |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
1002 |
@{thm[mode=Axiom] bs2[of _ "r\<^sub>1"]}\qquad |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
1003 |
@{thm[mode=Axiom] bs3[of "bs\<^sub>1" "bs\<^sub>2"]}\\ |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
1004 |
@{thm[mode=Rule] bs4[of "r\<^sub>1" "r\<^sub>2" _ "r\<^sub>3"]}\qquad |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
1005 |
@{thm[mode=Rule] bs5[of "r\<^sub>3" "r\<^sub>4" _ "r\<^sub>1"]}\\ |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
1006 |
@{thm[mode=Axiom] bs6}\qquad |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
1007 |
@{thm[mode=Axiom] bs7}\\ |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
1008 |
@{thm[mode=Rule] bs8[of "rs\<^sub>1" "rs\<^sub>2"]}\\ |
410 | 1009 |
%@ { t hm[mode=Axiom] ss1}\qquad |
402
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
1010 |
@{thm[mode=Rule] ss2[of "rs\<^sub>1" "rs\<^sub>2"]}\qquad |
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
1011 |
@{thm[mode=Rule] ss3[of "r\<^sub>1" "r\<^sub>2"]}\\ |
398 | 1012 |
@{thm[mode=Axiom] ss4}\qquad |
402
1612f2a77bf6
more definitions in the paper
Christian Urban <christian.urban@kcl.ac.uk>
parents:
400
diff
changeset
|
1013 |
@{thm[mode=Axiom] ss5[of "bs" "rs\<^sub>1" "rs\<^sub>2"]}\\ |
423 | 1014 |
@{thm[mode=Rule] ss6[of "r\<^sub>2" "r\<^sub>1" "rs\<^sub>1" "rs\<^sub>2" "rs\<^sub>3"]}\\ |
398 | 1015 |
\end{tabular} |
1016 |
\end{center} |
|
1017 |
\caption{???}\label{SimpRewrites} |
|
1018 |
\end{figure} |
|
397
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
1019 |
*} |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
1020 |
|
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
1021 |
section {* Bound - NO *} |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
1022 |
|
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
1023 |
section {* Bounded Regex / Not *} |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
1024 |
|
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
1025 |
section {* Conclusion *} |
e1b74d618f1b
updated Sizebound4
Christian Urban <christian.urban@kcl.ac.uk>
parents:
396
diff
changeset
|
1026 |
|
396 | 1027 |
text {* |
1028 |
||
1029 |
\cite{AusafDyckhoffUrban2016} |
|
1030 |
||
1031 |
%%\bibliographystyle{plain} |
|
1032 |
\bibliography{root} |
|
1033 |
*} |
|
1034 |
||
1035 |
(*<*) |
|
1036 |
end |
|
1037 |
(*>*) |