(*<*)
theory Paper
imports
"../Lexer"
"../Simplifying"
"../Positions"
"../SizeBound4"
"HOL-Library.LaTeXsugar"
begin
declare [[show_question_marks = false]]
(* Notation that applies only in "latex output" print mode:
   - If is printed with its if/then/else keywords wrapped in textrm;
   - list Cons is printed as an infix :: with a thin space on either side. *)
notation (latex output)
If ("(\<^latex>\<open>\\textrm{\<close>if\<^latex>\<open>}\<close> (_)/ \<^latex>\<open>\\textrm{\<close>then\<^latex>\<open>}\<close> (_)/ \<^latex>\<open>\\textrm{\<close>else\<^latex>\<open>}\<close> (_))" 10) and
Cons ("_\<^latex>\<open>\\mbox{$\\,$}\<close>::\<^latex>\<open>\\mbox{$\\,$}\<close>_" [75,73] 73)
(* Argument-swapped abbreviations for der and bder: the regular expression
   comes first and the character second, so that a mixfix notation with the
   regular expression on the left can be attached to them. *)
abbreviation
"der_syn r c \<equiv> der c r"
abbreviation
"bder_syn r c \<equiv> bder c r"
(* Printing syntax for "latex output" mode only. It covers, in order:
   the derivative abbreviations, the regular expression constructors,
   the value constructors, the Posix relation and the functions on
   values and lists, the bitcoded (A-prefixed) constructors, and the
   bitcoded functions and rewrite relations.
   Points worth knowing when reading the generated document:
   - der_syn and bder_syn share the same backslash notation;
   - flat and flats both print as |_|;
   - CH and set print just their argument, with no symbol of their own;
   - the bitcoded constructors are printed without the leading A
     (AZERO as ZERO, AONE as ONE, ACHAR as CHAR, AALTs as ALTs, ...). *)
notation (latex output)
der_syn ("_\\_" [79, 1000] 76) and
bder_syn ("_\\_" [79, 1000] 76) and
ZERO ("\<^bold>0" 81) and
ONE ("\<^bold>1" 81) and
CH ("_" [1000] 80) and
ALT ("_ + _" [77,77] 78) and
SEQ ("_ \<cdot> _" [77,77] 78) and
STAR ("_\<^sup>\<star>" [79] 78) and
val.Void ("Empty" 78) and
val.Char ("Char _" [1000] 78) and
val.Left ("Left _" [79] 78) and
val.Right ("Right _" [1000] 78) and
val.Seq ("Seq _ _" [79,79] 78) and
val.Stars ("Stars _" [79] 78) and
Posix ("'(_, _') \<rightarrow> _" [63,75,75] 75) and
flat ("|_|" [75] 74) and
flats ("|_|" [72] 74) and
injval ("inj _ _ _" [79,77,79] 76) and
mkeps ("mkeps _" [79] 76) and
length ("len _" [73] 73) and
set ("_" [73] 73) and
AZERO ("ZERO" 81) and
AONE ("ONE _" [79] 78) and
ACHAR ("CHAR _ _" [79, 79] 80) and
AALTs ("ALTs _ _" [77,77] 78) and
ASEQ ("SEQ _ _ _" [79, 79,79] 78) and
ASTAR ("STAR _ _" [79, 79] 78) and
code ("code _" [79] 74) and
intern ("_\<^latex>\<open>\\mbox{$^\\uparrow$}\<close>" [900] 80) and
erase ("_\<^latex>\<open>\\mbox{$^\\downarrow$}\<close>" [1000] 74) and
bnullable ("bnullable _" [1000] 80) and
bmkeps ("bmkeps _" [1000] 80) and
srewrite ("_\<^latex>\<open>\\mbox{$\\,\\stackrel{s}{\\leadsto}$}\<close> _" [71, 71] 80) and
rrewrites ("_ \<^latex>\<open>\\mbox{$\\,\\leadsto^*$}\<close> _" [71, 71] 80) and
blexer_simp ("blexer\<^sup>+" 1000)
(* Presentation variants of the Left and Right cases of retrieve on AALTs,
   stated under the explicit precondition that the tail list rs is non-empty.
   Each part is proved by metis from list.exhaust and the corresponding
   defining equation, retrieve.simps(4) for Left and retrieve.simps(5) for
   Right. The text displays these two statements in the table for retrieve
   instead of retrieve.simps(4) and retrieve.simps(5). *)
lemma better_retrieve:
shows "rs \<noteq> Nil ==> retrieve (AALTs bs (r#rs)) (Left v) = bs @ retrieve r v"
and "rs \<noteq> Nil ==> retrieve (AALTs bs (r#rs)) (Right v) = bs @ retrieve (AALTs [] rs) v"
apply (metis list.exhaust retrieve.simps(4))
by (metis list.exhaust retrieve.simps(5))
(*>*)
section {* Introduction *}
text {*
In the last fifteen or so years, Brzozowski's derivatives of regular
expressions have sparked quite a bit of interest in the functional
programming and theorem prover communities. The beauty of
Brzozowski's derivatives \cite{Brzozowski1964} is that they are neatly
expressible in any functional language, and easily definable and
reasoned about in theorem provers---the definitions just consist of
inductive datatypes and simple recursive functions. A mechanised
correctness proof of Brzozowski's matcher in for example HOL4 has been
mentioned by Owens and Slind~\cite{Owens2008}. Another one in
Isabelle/HOL is part of the work by Krauss and Nipkow
\cite{Krauss2011}. And another one in Coq is given by Coquand and
Siles \cite{Coquand2012}.
The notion of derivatives
\cite{Brzozowski1964}, written @{term "der c r"}, of a regular
expression gives a simple solution to the problem of matching a string
@{term s} with a regular expression @{term r}: if the derivative of
@{term r} w.r.t.\ (in succession) all the characters of the string
matches the empty string, then @{term r} matches @{term s} (and {\em
vice versa}). The derivative has the property (which may almost be
regarded as its specification) that, for every string @{term s} and
regular expression @{term r} and character @{term c}, one has @{term
"cs \<in> L(r)"} if and only if \mbox{@{term "s \<in> L(der c r)"}}.
If a regular expression matches a string, then in general there is more
than one way of how the string is matched. There are two commonly used
disambiguation strategies to generate a unique answer: one is called
GREEDY matching \cite{Frisch2004} and the other is POSIX
matching~\cite{POSIX,Kuklewicz,OkuiSuzuki2010,Sulzmann2014,Vansummeren2006}.
For example consider the string @{term xy} and the regular expression
\mbox{@{term "STAR (ALT (ALT x y) xy)"}}. Either the string can be
matched in two `iterations' by the single-letter regular expressions
@{term x} and @{term y}, or directly in one iteration by @{term xy}. The
first case corresponds to GREEDY matching, which first matches with the
left-most symbol and only matches the next symbol in case of a mismatch
(this is greedy in the sense of preferring instant gratification to
delayed repletion). The second case is POSIX matching, which prefers the
longest match.
\begin{center}
\begin{tabular}{cc}
\begin{tabular}{r@ {\hspace{2mm}}c@ {\hspace{2mm}}l}
@{thm (lhs) der.simps(1)} & $\dn$ & @{thm (rhs) der.simps(1)}\\
@{thm (lhs) der.simps(2)} & $\dn$ & @{thm (rhs) der.simps(2)}\\
@{thm (lhs) der.simps(3)} & $\dn$ & @{thm (rhs) der.simps(3)}\\
@{thm (lhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]}\\
@{thm (lhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{text "if"} @{term "nullable(r\<^sub>1)"}\\
& & @{text "then"} @{term "ALT (SEQ (der c r\<^sub>1) r\<^sub>2) (der c r\<^sub>2)"}\\
& & @{text "else"} @{term "SEQ (der c r\<^sub>1) r\<^sub>2"}\\
% & & @{thm (rhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]}\\
@{thm (lhs) der.simps(6)} & $\dn$ & @{thm (rhs) der.simps(6)}
\end{tabular}
&
\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
@{thm (lhs) nullable.simps(1)} & $\dn$ & @{thm (rhs) nullable.simps(1)}\\
@{thm (lhs) nullable.simps(2)} & $\dn$ & @{thm (rhs) nullable.simps(2)}\\
@{thm (lhs) nullable.simps(3)} & $\dn$ & @{thm (rhs) nullable.simps(3)}\\
@{thm (lhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
@{thm (lhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
@{thm (lhs) nullable.simps(6)} & $\dn$ & @{thm (rhs) nullable.simps(6)}\medskip\\
\end{tabular}
\end{tabular}
\end{center}
\begin{figure}[t]
\begin{center}
\begin{tikzpicture}[scale=2,node distance=1.3cm,
every node/.style={minimum size=6mm}]
\node (r1) {@{term "r\<^sub>1"}};
\node (r2) [right=of r1]{@{term "r\<^sub>2"}};
\draw[->,line width=1mm](r1)--(r2) node[above,midway] {@{term "der a DUMMY"}};
\node (r3) [right=of r2]{@{term "r\<^sub>3"}};
\draw[->,line width=1mm](r2)--(r3) node[above,midway] {@{term "der b DUMMY"}};
\node (r4) [right=of r3]{@{term "r\<^sub>4"}};
\draw[->,line width=1mm](r3)--(r4) node[above,midway] {@{term "der c DUMMY"}};
\draw (r4) node[anchor=west] {\;\raisebox{3mm}{@{term nullable}}};
\node (v4) [below=of r4]{@{term "v\<^sub>4"}};
\draw[->,line width=1mm](r4) -- (v4);
\node (v3) [left=of v4] {@{term "v\<^sub>3"}};
\draw[->,line width=1mm](v4)--(v3) node[below,midway] {\<open>inj r\<^sub>3 c\<close>};
\node (v2) [left=of v3]{@{term "v\<^sub>2"}};
\draw[->,line width=1mm](v3)--(v2) node[below,midway] {\<open>inj r\<^sub>2 b\<close>};
\node (v1) [left=of v2] {@{term "v\<^sub>1"}};
\draw[->,line width=1mm](v2)--(v1) node[below,midway] {\<open>inj r\<^sub>1 a\<close>};
\draw (r4) node[anchor=north west] {\;\raisebox{-8mm}{@{term "mkeps"}}};
\end{tikzpicture}
\end{center}
\mbox{}\\[-13mm]
\caption{The two phases of the algorithm by Sulzmann \& Lu \cite{Sulzmann2014},
matching the string @{term "[a,b,c]"}. The first phase (the arrows from
left to right) is \Brz's matcher building successive derivatives. If the
last regular expression is @{term nullable}, then the functions of the
second phase are called (the top-down and right-to-left arrows): first
@{term mkeps} calculates a value @{term "v\<^sub>4"} witnessing
how the empty string has been recognised by @{term "r\<^sub>4"}. After
that the function @{term inj} ``injects back'' the characters of the string into
the values.
\label{Sulz}}
\end{figure}
*}
section {* Background *}
text {*
Sulzmann-Lu algorithm with inj. State that POSIX rules.
mention slg is correct.
\begin{figure}[t]
\begin{center}
\begin{tabular}{c}
@{thm[mode=Axiom] Posix.intros(1)}\<open>P\<close>@{term "ONE"} \qquad
@{thm[mode=Axiom] Posix.intros(2)}\<open>P\<close>@{term "c"}\medskip\\
@{thm[mode=Rule] Posix.intros(3)[of "s" "r\<^sub>1" "v" "r\<^sub>2"]}\<open>P+L\<close>\qquad
@{thm[mode=Rule] Posix.intros(4)[of "s" "r\<^sub>2" "v" "r\<^sub>1"]}\<open>P+R\<close>\medskip\\
$\mprset{flushleft}
\inferrule
{@{thm (prem 1) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \qquad
@{thm (prem 2) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \\\\
@{thm (prem 3) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}
{@{thm (concl) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}$\<open>PS\<close>\\
@{thm[mode=Axiom] Posix.intros(7)}\<open>P[]\<close>\medskip\\
$\mprset{flushleft}
\inferrule
{@{thm (prem 1) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
@{thm (prem 2) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
@{thm (prem 3) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \\\\
@{thm (prem 4) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}
{@{thm (concl) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}$\<open>P\<star>\<close>
\end{tabular}
\end{center}
\caption{Our inductive definition of POSIX values.}\label{POSIXrules}
\end{figure}
\begin{center}
\begin{tabular}{lcl}
@{thm (lhs) mkeps.simps(1)} & $\dn$ & @{thm (rhs) mkeps.simps(1)}\\
@{thm (lhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]}\\
@{thm (lhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]}\\
@{thm (lhs) mkeps.simps(4)} & $\dn$ & @{thm (rhs) mkeps.simps(4)}\\
\end{tabular}
\end{center}
\begin{center}
\begin{tabular}{l@ {\hspace{5mm}}lcl}
\textit{(1)} & @{thm (lhs) injval.simps(1)} & $\dn$ & @{thm (rhs) injval.simps(1)}\\
\textit{(2)} & @{thm (lhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]} & $\dn$ &
@{thm (rhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]}\\
\textit{(3)} & @{thm (lhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$ &
@{thm (rhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\
\textit{(4)} & @{thm (lhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$
& @{thm (rhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\
\textit{(5)} & @{thm (lhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$
& @{thm (rhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\
\textit{(6)} & @{thm (lhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$
& @{thm (rhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\
\textit{(7)} & @{thm (lhs) injval.simps(7)[of "r" "c" "v" "vs"]} & $\dn$
& @{thm (rhs) injval.simps(7)[of "r" "c" "v" "vs"]}\\
\end{tabular}
\end{center}
*}
section {* Bitcoded Regular Expressions and Derivatives *}
text {*
In the second part of their paper \cite{Sulzmann2014},
Sulzmann and Lu describe another algorithm that generates POSIX
values but dispenses with the second phase where characters are
injected ``back'' into values. For this they annotate bitcodes to
regular expressions, which we define in Isabelle/HOL as the datatype
\begin{center}
\begin{tabular}{lcl}
@{term breg} & $::=$ & @{term "AZERO"} $\quad\mid\quad$ @{term "AONE bs"}\\
& $\mid$ & @{term "ACHAR bs c"}\\
& $\mid$ & @{term "AALTs bs rs"}\\
& $\mid$ & @{term "ASEQ bs r\<^sub>1 r\<^sub>2"}\\
& $\mid$ & @{term "ASTAR bs r"}
\end{tabular}
\end{center}
\noindent where @{text bs} stands for bitsequences; @{text r},
@{text "r\<^sub>1"} and @{text "r\<^sub>2"} for bitcoded regular
expressions; and @{text rs} for lists of bitcoded regular
expressions. The binary alternative @{text "ALT bs r\<^sub>1 r\<^sub>2"}
is just an abbreviation for @{text "ALTs bs [r\<^sub>1, r\<^sub>2]"}.
For bitsequences we just use lists made up of the
constants @{text Z} and @{text S}. The idea with bitcoded regular
expressions is to incrementally generate the value information (for
example @{text Left} and @{text Right}) as bitsequences. For this
Sulzmann and Lu define a coding
function for how values can be coded into bitsequences.
\begin{center}
\begin{tabular}{cc}
\begin{tabular}{lcl}
@{thm (lhs) code.simps(1)} & $\dn$ & @{thm (rhs) code.simps(1)}\\
@{thm (lhs) code.simps(2)} & $\dn$ & @{thm (rhs) code.simps(2)}\\
@{thm (lhs) code.simps(3)} & $\dn$ & @{thm (rhs) code.simps(3)}\\
@{thm (lhs) code.simps(4)} & $\dn$ & @{thm (rhs) code.simps(4)}
\end{tabular}
&
\begin{tabular}{lcl}
@{thm (lhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]} & $\dn$ & @{thm (rhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]}\\
@{thm (lhs) code.simps(6)} & $\dn$ & @{thm (rhs) code.simps(6)}\\
@{thm (lhs) code.simps(7)} & $\dn$ & @{thm (rhs) code.simps(7)}\\
\mbox{\phantom{XX}}\\
\end{tabular}
\end{tabular}
\end{center}
\noindent
As can be seen, this coding is ``lossy'' in the sense that we do not
record explicitly character values and also not sequence values (for
them we just append two bitsequences). However, the
different alternatives for @{text Left}, respectively @{text Right}, are recorded as @{text Z} and
@{text S} followed by some bitsequence. Similarly, we use @{text Z} to indicate
if there is still a value coming in the list of @{text Stars}, whereas @{text S}
indicates the end of the list. The lossiness makes the process of
decoding a bit more involved, but the point is that if we have a
regular expression \emph{and} a bitsequence of a corresponding value,
then we can always decode the value accurately. The decoding can be
defined by using two functions called $\textit{decode}'$ and
\textit{decode}:
\begin{center}
\begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
$\textit{decode}'\,bs\,(\ONE)$ & $\dn$ & $(\Empty, bs)$\\
$\textit{decode}'\,bs\,(c)$ & $\dn$ & $(\Char\,c, bs)$\\
$\textit{decode}'\,(\Z\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
$\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}\;
(\Left\,v, bs_1)$\\
$\textit{decode}'\,(\S\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
$\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_2\;\textit{in}\;
(\Right\,v, bs_1)$\\
$\textit{decode}'\,bs\;(r_1\cdot r_2)$ & $\dn$ &
$\textit{let}\,(v_1, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}$\\
& & $\textit{let}\,(v_2, bs_2) = \textit{decode}'\,bs_1\,r_2$
\hspace{2mm}$\textit{in}\;(\Seq\,v_1\,v_2, bs_2)$\\
$\textit{decode}'\,(\S\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\
$\textit{decode}'\,(\Z\!::\!bs)\,(r^*)$ & $\dn$ &
$\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r\;\textit{in}$\\
& & $\textit{let}\,(\Stars\,vs, bs_2) = \textit{decode}'\,bs_1\,r^*$
\hspace{2mm}$\textit{in}\;(\Stars\,v\!::\!vs, bs_2)$\bigskip\\
$\textit{decode}\,bs\,r$ & $\dn$ &
$\textit{let}\,(v, bs') = \textit{decode}'\,bs\,r\;\textit{in}$\\
& & \hspace{7mm}$\textit{if}\;bs' = []\;\textit{then}\;\textit{Some}\,v\;
\textit{else}\;\textit{None}$
\end{tabular}
\end{center}
\noindent
The function \textit{decode} checks whether all of the bitsequence is
consumed and returns the corresponding value as @{term "Some v"}; otherwise
it fails with @{text "None"}. We can establish that for a value $v$
inhabited by a regular expression $r$, the decoding of its
bitsequence never fails.
\begin{lemma}\label{codedecode}\it
If $\;\vdash v : r$ then
$\;\textit{decode}\,(\textit{code}\, v)\,r = \textit{Some}\, v$.
\end{lemma}
\begin{proof}
This follows from the property that
$\textit{decode}'\,((\textit{code}\,v) \,@\, bs)\,r = (v, bs)$ holds
for any bitsequence $bs$ and $\vdash v : r$. This property can be
easily proved by induction on $\vdash v : r$.
\end{proof}
Sulzmann and Lu define the function \emph{internalise}
in order to transform standard regular expressions into annotated
regular expressions. We write this operation as $r^\uparrow$.
This internalisation uses the following
\emph{fuse} function.
\begin{center}
\begin{tabular}{lcl}
$\textit{fuse}\,bs\,(\textit{ZERO})$ & $\dn$ & $\textit{ZERO}$\\
$\textit{fuse}\,bs\,(\textit{ONE}\,bs')$ & $\dn$ &
$\textit{ONE}\,(bs\,@\,bs')$\\
$\textit{fuse}\,bs\,(\textit{CHAR}\,bs'\,c)$ & $\dn$ &
$\textit{CHAR}\,(bs\,@\,bs')\,c$\\
$\textit{fuse}\,bs\,(\textit{ALTs}\,bs'\,rs)$ & $\dn$ &
$\textit{ALTs}\,(bs\,@\,bs')\,rs$\\
$\textit{fuse}\,bs\,(\textit{SEQ}\,bs'\,r_1\,r_2)$ & $\dn$ &
$\textit{SEQ}\,(bs\,@\,bs')\,r_1\,r_2$\\
$\textit{fuse}\,bs\,(\textit{STAR}\,bs'\,r)$ & $\dn$ &
$\textit{STAR}\,(bs\,@\,bs')\,r$
\end{tabular}
\end{center}
\noindent
A regular expression can then be \emph{internalised} into a bitcoded
regular expression as follows.
\begin{center}
\begin{tabular}{lcl}
$(\ZERO)^\uparrow$ & $\dn$ & $\textit{ZERO}$\\
$(\ONE)^\uparrow$ & $\dn$ & $\textit{ONE}\,[]$\\
$(c)^\uparrow$ & $\dn$ & $\textit{CHAR}\,[]\,c$\\
$(r_1 + r_2)^\uparrow$ & $\dn$ &
$\textit{ALT}\;[]\,(\textit{fuse}\,[\Z]\,r_1^\uparrow)\,
(\textit{fuse}\,[\S]\,r_2^\uparrow)$\\
$(r_1\cdot r_2)^\uparrow$ & $\dn$ &
$\textit{SEQ}\;[]\,r_1^\uparrow\,r_2^\uparrow$\\
$(r^*)^\uparrow$ & $\dn$ &
$\textit{STAR}\;[]\,r^\uparrow$\\
\end{tabular}
\end{center}
\noindent
There is also an \emph{erase}-function, written $a^\downarrow$, which
transforms a bitcoded regular expression into a (standard) regular
expression by just erasing the annotated bitsequences. We omit the
straightforward definition. For defining the algorithm, we also need
the functions \textit{bnullable} and \textit{bmkeps}, which are the
``lifted'' versions of \textit{nullable} and \textit{mkeps} acting on
bitcoded regular expressions, instead of regular expressions.
\begin{center}
\begin{tabular}{@ {}c@ {}c@ {}}
\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
$\textit{bnullable}\,(\textit{ZERO})$ & $\dn$ & $\textit{false}$\\
$\textit{bnullable}\,(\textit{ONE}\,bs)$ & $\dn$ & $\textit{true}$\\
$\textit{bnullable}\,(\textit{CHAR}\,bs\,c)$ & $\dn$ & $\textit{false}$\\
$\textit{bnullable}\,(\textit{ALTs}\,bs\,\rs)$ & $\dn$ &
$\exists\, r \in \rs. \,\textit{bnullable}\,r$\\
$\textit{bnullable}\,(\textit{SEQ}\,bs\,r_1\,r_2)$ & $\dn$ &
$\textit{bnullable}\,r_1\wedge \textit{bnullable}\,r_2$\\
$\textit{bnullable}\,(\textit{STAR}\,bs\,r)$ & $\dn$ &
$\textit{true}$
\end{tabular}
&
\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
$\textit{bmkeps}\,(\textit{ONE}\,bs)$ & $\dn$ & $bs$\\
$\textit{bmkeps}\,(\textit{ALTs}\,bs\,r\!::\!\rs)$ & $\dn$ &
$\textit{if}\;\textit{bnullable}\,r$\\
& &$\textit{then}\;bs\,@\,\textit{bmkeps}\,r$\\
& &$\textit{else}\;\textit{bmkeps}\,(\textit{ALTs}\;bs\,\rs)$\\
$\textit{bmkeps}\,(\textit{SEQ}\,bs\,r_1\,r_2)$ & $\dn$ &\\
\multicolumn{3}{r}{$bs \,@\,\textit{bmkeps}\,r_1\,@\, \textit{bmkeps}\,r_2$}\\
$\textit{bmkeps}\,(\textit{STAR}\,bs\,r)$ & $\dn$ &
$bs \,@\, [\S]$
\end{tabular}
\end{tabular}
\end{center}
\noindent
The key function in the bitcoded algorithm is the derivative of a
bitcoded regular expression. This derivative calculates the
derivative but at the same time also the incremental part of bitsequences
that contribute to constructing a POSIX value.
\begin{center}
\begin{tabular}{@ {}lcl@ {}}
$(\textit{ZERO})\backslash c$ & $\dn$ & $\textit{ZERO}$ \\
$(\textit{ONE}\;bs)\backslash c$ & $\dn$ & $\textit{ZERO}$\\
$(\textit{CHAR}\;bs\,d)\backslash c$ & $\dn$ &
$\textit{if}\;c=d\; \;\textit{then}\;
\textit{ONE}\;bs\;\textit{else}\;\textit{ZERO}$\\
$(\textit{ALTs}\;bs\,\rs)\backslash c$ & $\dn$ &
$\textit{ALTs}\,bs\,(\mathit{map}\,(\_\backslash c)\,\rs)$\\
$(\textit{SEQ}\;bs\,r_1\,r_2)\backslash c$ & $\dn$ &
$\textit{if}\;\textit{bnullable}\,r_1$\\
& &$\textit{then}\;\textit{ALT}\,bs\,(\textit{SEQ}\,[]\,(r_1\backslash c)\,r_2)$\\
& &$\phantom{\textit{then}\;\textit{ALT}\,bs\,}(\textit{fuse}\,(\textit{bmkeps}\,r_1)\,(r_2\backslash c))$\\
& &$\textit{else}\;\textit{SEQ}\,bs\,(r_1\backslash c)\,r_2$\\
$(\textit{STAR}\,bs\,r)\backslash c$ & $\dn$ &
$\textit{SEQ}\;bs\,(\textit{fuse}\, [\Z] (r\backslash c))\,
(\textit{STAR}\,[]\,r)$
\end{tabular}
\end{center}
\noindent
This function can also be extended to strings, written $r\backslash s$,
just like the standard derivative. We omit the details. Finally we
can define Sulzmann and Lu's bitcoded lexer, which we call \textit{blexer}:
\begin{center}
\begin{tabular}{lcl}
$\textit{blexer}\;r\,s$ & $\dn$ &
$\textit{let}\;r_{der} = (r^\uparrow)\backslash s\;\textit{in}$\\
& & $\;\;\;\;\textit{if}\; \textit{bnullable}(r_{der}) \;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,r_{der})\,r
\;\;\textit{else}\;\textit{None}$
\end{tabular}
\end{center}
\noindent
This bitcoded lexer first internalises the regular expression $r$ and then
builds the bitcoded derivative according to $s$. If the derivative is
(b)nullable the string is in the language of $r$ and it extracts the bitsequence using the
$\textit{bmkeps}$ function. Finally it decodes the bitsequence into a value. If
the derivative is \emph{not} nullable, then $\textit{None}$ is
returned. We can show that this way of calculating a value
generates the same result as with \textit{lexer}.
Before we can proceed we need to define a helper function, called
\textit{retrieve}, which Sulzmann and Lu introduced for the correctness proof.
\begin{center}
\begin{tabular}{lcl}
@{thm (lhs) retrieve.simps(1)} & $\dn$ & @{thm (rhs) retrieve.simps(1)}\\
@{thm (lhs) retrieve.simps(2)} & $\dn$ & @{thm (rhs) retrieve.simps(2)}\\
@{thm (lhs) retrieve.simps(3)} & $\dn$ & @{thm (rhs) retrieve.simps(3)}\\
@{thm (lhs) better_retrieve(1)} & $\dn$ & @{thm (rhs) better_retrieve(1)}\\
@{thm (lhs) better_retrieve(2)} & $\dn$ & @{thm (rhs) better_retrieve(2)}\\
@{thm (lhs) retrieve.simps(6)[of _ "r\<^sub>1" "r\<^sub>2" "v\<^sub>1" "v\<^sub>2"]}
& $\dn$ & @{thm (rhs) retrieve.simps(6)[of _ "r\<^sub>1" "r\<^sub>2" "v\<^sub>1" "v\<^sub>2"]}\\
@{thm (lhs) retrieve.simps(7)} & $\dn$ & @{thm (rhs) retrieve.simps(7)}\\
@{thm (lhs) retrieve.simps(8)} & $\dn$ & @{thm (rhs) retrieve.simps(8)}
\end{tabular}
\end{center}
\noindent
The idea behind this function is to retrieve a possibly partial
bitcode from a bitcoded regular expression, where the retrieval is
guided by a value. For example if the value is $\Left$ then we
descend into the left-hand side of an alternative in order to
assemble the bitcode. Similarly for
$\Right$. The property we can show is that for a given $v$ and $r$
with $\vdash v : r$, the retrieved bitsequence from the internalised
regular expression is equal to the bitcoded version of $v$.
\begin{lemma}\label{retrievecode}
If $\vdash v : r$ then $\textit{code}\, v = \textit{retrieve}\,(r^\uparrow)\,v$.
\end{lemma}
\noindent
We also need some auxiliary facts about how the bitcoded operations
relate to the ``standard'' operations on regular expressions. For
example if we build a bitcoded derivative and erase the result, this
is the same as if we first erase the bitcoded regular expression and
then perform the ``standard'' derivative operation.
\begin{lemma}\label{bnullable}\mbox{}\smallskip\\
\begin{tabular}{ll}
\textit{(1)} & $(a\backslash s)^\downarrow = (a^\downarrow)\backslash s$\\
\textit{(2)} & $\textit{bnullable}(a)$ iff $\textit{nullable}(a^\downarrow)$\\
\textit{(3)} & $\textit{bmkeps}(a) = \textit{retrieve}\,a\,(\textit{mkeps}\,(a^\downarrow))$ provided $\textit{nullable}(a^\downarrow)$.
\end{tabular}
\end{lemma}
\begin{proof}
All properties are by induction on annotated regular expressions. There are no
interesting cases.
\end{proof}
\noindent
This brings us to our main lemma in this section: if we build a
derivative, say $r\backslash s$ and have a value, say $v$, inhabited
by this derivative, then we can produce the result $\lexer$ generates
by applying this value to the stacked-up injection functions
$\textit{flex}$ assembles. The lemma establishes that this is the same
value as if we build the annotated derivative $r^\uparrow\backslash s$
and then retrieve the corresponding bitcoded version, followed by a
decoding step.
\begin{lemma}[Main Lemma]\label{mainlemma}\it
If $\vdash v : r\backslash s$ then
\[\textit{Some}\,(\textit{flex}\,r\,\textit{id}\,s\,v) =
\textit{decode}(\textit{retrieve}\,(r^\uparrow \backslash s)\,v)\,r\]
\end{lemma}
\begin{proof}
This can be proved by induction on $s$ and generalising over
$v$. The interesting point is that we need to prove this in the
reverse direction for $s$. This means instead of cases $[]$ and
$c\!::\!s$, we have cases $[]$ and $s\,@\,[c]$ where we unravel the
string from the back.\footnote{Isabelle/HOL provides an induction principle
for this way of performing the induction.}
The case for $[]$ is routine using Lemmas~\ref{codedecode}
and~\ref{retrievecode}. In the case $s\,@\,[c]$, we can infer from
the assumption that $\vdash v : (r\backslash s)\backslash c$
holds. Hence by Lemma~\ref{Posix2} we know that
(*) $\vdash \inj\,(r\backslash s)\,c\,v : r\backslash s$ holds too.
By definition of $\textit{flex}$ we can unfold the left-hand side
to be
\[
\textit{Some}\,(\textit{flex}\;r\,\textit{id}\,(s\,@\,[c])\,v) =
\textit{Some}\,(\textit{flex}\;r\,\textit{id}\,s\,(\inj\,(r\backslash s)\,c\,v))
\]
\noindent
By induction hypothesis and (*) we can rewrite the right-hand side to
\[
\textit{decode}\,(\textit{retrieve}\,(r^\uparrow\backslash s)\;
(\inj\,(r\backslash s)\,c\,\,v))\,r
\]
\noindent
which is equal to
$\textit{decode}\,(\textit{retrieve}\, (r^\uparrow\backslash
(s\,@\,[c]))\,v)\,r$ as required. The last rewrite step is possible
because we generalised over $v$ in our induction.
\end{proof}
\noindent
With this lemma in place, we can prove the correctness of \textit{blexer} such
that it produces the same result as \textit{lexer}.
\begin{theorem}
$\textit{lexer}\,r\,s = \textit{blexer}\,r\,s$
\end{theorem}
\begin{proof}
We can first expand both sides using Lemma~\ref{flex} and the
definition of \textit{blexer}. This gives us two
\textit{if}-statements, which we need to show to be equal. By
Lemma~\ref{bnullable}\textit{(2)} we know the \textit{if}-tests coincide:
\[
\textit{bnullable}(r^\uparrow\backslash s) \;\textit{iff}\;
\nullable(r\backslash s)
\]
\noindent
For the \textit{if}-branch suppose $r_d \dn r^\uparrow\backslash s$ and
$d \dn r\backslash s$. We have (*) $\nullable\,d$. We can then show
by Lemma~\ref{bnullable}\textit{(3)} that
%
\[
\textit{decode}(\textit{bmkeps}\,r_d)\,r =
\textit{decode}(\textit{retrieve}\,r_d\,(\textit{mkeps}\,d))\,r
\]
\noindent
where the right-hand side is equal to
$\textit{Some}\,(\textit{flex}\,r\,\textit{id}\,s\,(\textit{mkeps}\,
d))$ by Lemma~\ref{mainlemma} (we know
$\vdash \textit{mkeps}\,d : d$ by (*)). This shows the
\textit{if}-branches return the same value. In the
\textit{else}-branches both \textit{lexer} and \textit{blexer} return
\textit{None}. Therefore we can conclude the proof.
\end{proof}
\noindent
This establishes that the bitcoded algorithm by Sulzmann
and Lu without simplification produces correct results. This was
only conjectured in their paper \cite{Sulzmann2014}. The next step
is to add simplifications.
*}
section {* Simplification *}
text {*
\begin{lemma}
@{thm[mode=IfThen] bnullable0(1)[of "r\<^sub>1" "r\<^sub>2"]}
\end{lemma}
\begin{lemma}
@{thm[mode=IfThen] rewrite_bmkeps_aux(1)[of "r\<^sub>1" "r\<^sub>2"]}
\end{lemma}
\begin{lemma}
@{thm[mode=IfThen] rewrites_to_bsimp}
\end{lemma}
\begin{lemma}
@{thm[mode=IfThen] rewrite_preserves_bder(1)[of "r\<^sub>1" "r\<^sub>2"]}
\end{lemma}
\begin{lemma}
@{thm[mode=IfThen] central}
\end{lemma}
\begin{theorem}
@{thm[mode=IfThen] main_blexer_simp}
\end{theorem}
Sulzmann \& Lu apply simplification via a fixpoint operation;
also does not use erase to filter out duplicates.
not direct correspondence with PDERs, because of example
problem with retrieve
correctness
\begin{figure}[t]
\begin{center}
\begin{tabular}{c}
@{thm[mode=Axiom] bs1[of _ "r\<^sub>2"]}\qquad
@{thm[mode=Axiom] bs2[of _ "r\<^sub>1"]}\qquad
@{thm[mode=Axiom] bs3[of "bs\<^sub>1" "bs\<^sub>2"]}\\
@{thm[mode=Rule] bs4[of "r\<^sub>1" "r\<^sub>2" _ "r\<^sub>3"]}\qquad
@{thm[mode=Rule] bs5[of "r\<^sub>3" "r\<^sub>4" _ "r\<^sub>1"]}\\
@{thm[mode=Axiom] bs6}\qquad
@{thm[mode=Axiom] bs7}\\
@{thm[mode=Rule] bs8[of "rs\<^sub>1" "rs\<^sub>2"]}\\
%@ { t hm[mode=Axiom] ss1}\qquad
@{thm[mode=Rule] ss2[of "rs\<^sub>1" "rs\<^sub>2"]}\qquad
@{thm[mode=Rule] ss3[of "r\<^sub>1" "r\<^sub>2"]}\\
@{thm[mode=Axiom] ss4}\qquad
@{thm[mode=Axiom] ss5[of "bs" "rs\<^sub>1" "rs\<^sub>2"]}\\
@{thm[mode=Rule] ss6[of "r\<^sub>1" "r\<^sub>2" "rs\<^sub>1" "rs\<^sub>2" "rs\<^sub>3"]}\\
\end{tabular}
\end{center}
\caption{???}\label{SimpRewrites}
\end{figure}
*}
section {* Bound - NO *}
section {* Bounded Regex / Not *}
section {* Conclusion *}
text {*
\cite{AusafDyckhoffUrban2016}
%%\bibliographystyle{plain}
\bibliography{root}
*}
(*<*)
end
(*>*)