lexing: thys2/Paper/Paper.thy@2416fdec6396


(*<*)
theory Paper
imports 
   "../Lexer"
   "../Simplifying" 
   "../Positions"
   "../SizeBound4" 
   "HOL-Library.LaTeXsugar"
begin

declare [[show_question_marks = false]]

notation (latex output)
  If  ("(\<^latex>\<open>\\textrm{\<close>if\<^latex>\<open>}\<close> (_)/ \<^latex>\<open>\\textrm{\<close>then\<^latex>\<open>}\<close> (_)/ \<^latex>\<open>\\textrm{\<close>else\<^latex>\<open>}\<close> (_))" 10) and
  Cons ("_\<^latex>\<open>\\mbox{$\\,$}\<close>::\<^latex>\<open>\\mbox{$\\,$}\<close>_" [75,73] 73) 


abbreviation 
  "der_syn r c \<equiv> der c r"
abbreviation 
 "ders_syn r s \<equiv> ders s r"  
abbreviation 
  "bder_syn r c \<equiv> bder c r"  

notation (latex output)
  der_syn ("_\\_" [79, 1000] 76) and
  ders_syn ("_\\_" [79, 1000] 76) and
  bder_syn ("_\\_" [79, 1000] 76) and
  bders ("_\\_" [79, 1000] 76) and
  bders_simp ("_\\\<^sub>s\<^sub>i\<^sub>m\<^sub>p _" [79, 1000] 76) and

  ZERO ("\<^bold>0" 81) and 
  ONE ("\<^bold>1" 81) and 
  CH ("_" [1000] 80) and
  ALT ("_ + _" [77,77] 78) and
  SEQ ("_ \<cdot> _" [77,77] 78) and
  STAR ("_\<^sup>*" [79] 78) and

  val.Void ("Empty" 78) and
  val.Char ("Char _" [1000] 78) and
  val.Left ("Left _" [79] 78) and
  val.Right ("Right _" [1000] 78) and
  val.Seq ("Seq _ _" [79,79] 78) and
  val.Stars ("Stars _" [79] 78) and

  Prf ("\<turnstile> _ : _" [75,75] 75) and  
  Posix ("'(_, _') \<rightarrow> _" [63,75,75] 75) and

  flat ("|_|" [75] 74) and
  flats ("|_|" [72] 74) and
  injval ("inj _ _ _" [79,77,79] 76) and 
  mkeps ("mkeps _" [79] 76) and 
  length ("len _" [73] 73) and
  set ("_" [73] 73) and

  AZERO ("ZERO" 81) and 
  AONE ("ONE _" [79] 78) and 
  ACHAR ("CHAR _ _" [79, 79] 80) and
  AALTs ("ALTs _ _" [77,77] 78) and
  ASEQ ("SEQ _ _ _" [79, 79,79] 78) and
  ASTAR ("STAR _ _" [79, 79] 78) and

  code ("code _" [79] 74) and
  intern ("_\<^latex>\<open>\\mbox{$^\\uparrow$}\<close>" [900] 80) and
  erase ("_\<^latex>\<open>\\mbox{$^\\downarrow$}\<close>" [1000] 74) and
  bnullable ("bnullable _" [1000] 80) and
  bsimp_AALTs ("bsimpALT _ _" [10,1000] 80) and
  bsimp_ASEQ ("bsimpSEQ _ _ _" [10,1000,1000] 80) and
  bmkeps ("bmkeps _" [1000] 80) and

  srewrite ("_\<^latex>\<open>\\mbox{$\\,\\stackrel{s}{\\leadsto}$}\<close> _" [71, 71] 80) and
  rrewrites ("_ \<^latex>\<open>\\mbox{$\\,\\leadsto^*$}\<close> _" [71, 71] 80) and
  blexer_simp ("blexer\<^sup>+" 1000) 


lemma better_retrieve:
   shows "rs \<noteq> Nil ==> retrieve (AALTs bs (r#rs)) (Left v) = bs @ retrieve r v"
   and   "rs \<noteq> Nil ==> retrieve (AALTs bs (r#rs)) (Right v) = bs @ retrieve (AALTs [] rs) v"
  apply (metis list.exhaust retrieve.simps(4))
  by (metis list.exhaust retrieve.simps(5))

(*>*)

section {* Introduction *}

text {*

In the last fifteen or so years, Brzozowski's derivatives of regular
expressions have sparked quite a bit of interest in the functional
programming and theorem prover communities.  The beauty of
Brzozowski's derivatives \cite{Brzozowski1964} is that they are neatly
expressible in any functional language, and easily definable and
reasoned about in theorem provers---the definitions just consist of
inductive datatypes and simple recursive functions.  Derivatives of a
regular expression, written @{term "der c r"}, give a simple solution
to the problem of matching a string @{term s} with a regular
expression @{term r}: if the derivative of @{term r} w.r.t.\ (in
succession) all the characters of the string matches the empty string,
then @{term r} matches @{term s} (and {\em vice versa}).  We are aware
of a mechanised correctness proof of Brzozowski's derivative-based matcher in HOL4 by
Owens and Slind~\cite{Owens2008}. Another one in Isabelle/HOL is part
of the work by Krauss and Nipkow \cite{Krauss2011}.  And another one
in Coq is given by Coquand and Siles \cite{Coquand2012}.
Also Ribeiro and Du Bois give one in Agda \cite{RibeiroAgda2017}.


However, there are two difficulties with derivative-based matchers:
First, Brzozowski's original matcher only generates a yes/no answer
for whether a regular expression matches a string or not.  This is too
little information in the context of lexing where separate tokens must
be identified and also classified (for example as keywords
or identifiers).  Sulzmann and Lu~\cite{Sulzmann2014} overcome this
difficulty by cleverly extending Brzozowski's matching
algorithm. Their extended version generates additional information on
\emph{how} a regular expression matches a string following the POSIX
rules for regular expression matching. They achieve this by adding a
second ``phase'' to Brzozowski's algorithm involving an injection
function.  In our own earlier work we provided the formal
specification of what POSIX matching means and proved in Isabelle/HOL
the correctness
of Sulzmann and Lu's extended algorithm accordingly
\cite{AusafDyckhoffUrban2016}.

The second difficulty is that Brzozowski's derivatives can 
grow to arbitrarily big sizes. For example if we start with the
regular expression \mbox{@{text "(a + aa)\<^sup>*"}} and take
successive derivatives according to the character $a$, we end up with
a sequence of ever-growing derivatives like 

\def\ll{\stackrel{\_\backslash{} a}{\longrightarrow}}
\begin{center}
\begin{tabular}{rll}
$(a + aa)^*$ & $\ll$ & $(\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & $(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* \;+\; (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & $(\ZERO + \ZERO{}a + \ZERO) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^* \;+\; $\\
& & $\qquad(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & \ldots \hspace{15mm}(regular expressions of sizes 98, 169, 283, 468, 767, \ldots)
\end{tabular}
\end{center}
 
\noindent where after around 35 steps we run out of memory on a
typical computer (we shall define shortly the precise details of our
regular expressions and the derivative operation).  Clearly, the
notation involving $\ZERO$s and $\ONE$s already suggests
simplification rules that can be applied to regular regular
expressions, for example $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r
\Rightarrow r$, $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow
r$. While such simple-minded simplifications have been proved in our
earlier work to preserve the correctness of Sulzmann and Lu's
algorithm \cite{AusafDyckhoffUrban2016}, they unfortunately do
\emph{not} help with limiting the growth of the derivatives shown
above: the growth is slowed, but the derivatives can still grow rather
quickly beyond any finite bound.


Sulzmann and Lu overcome this ``growth problem'' in a second algorithm
\cite{Sulzmann2014} where they introduce bitcoded
regular expressions. In this version, POSIX values are
represented as bitsequences and such sequences are incrementally generated
when derivatives are calculated. The compact representation
of bitsequences and regular expressions allows them to define a more
``aggressive'' simplification method that keeps the size of the
derivatives finite no matter what the length of the string is.
They make some informal claims about the correctness and linear behaviour
of this version, but do not provide any supporting proof arguments, not
even ``pencil-and-paper'' arguments. They write about their bitcoded
\emph{incremental parsing method} (that is the algorithm to be formalised
in this paper):

\begin{quote}\it
  ``Correctness Claim: We further claim that the incremental parsing
  method [..] in combination with the simplification steps [..]
  yields POSIX parse trees. We have tested this claim
  extensively [..] but yet
  have to work out all proof details.''
\end{quote}  

\noindent{}\textbf{Contributions:} We have implemented in Isabelle/HOL
the derivative-based lexing algorithm of Sulzmann and Lu
\cite{Sulzmann2014} where regular expressions are annotated with
bitsequences. We define the crucial simplification function as a
recursive function, instead of a fix-point operation. One objective of
the simplification function is to remove duplicates of regular
expressions.  For this Sulzmann and Lu use in their paper the standard
@{text nub} function from Haskell's list library, but this function
does not achieve the intended objective with bitcoded regular expressions.  The
reason is that in the bitcoded setting, each copy generally has a
different bitcode annotation---so @{text nub} would never ``fire''.
Inspired by Scala's library for lists, we shall instead use a @{text
distinctBy} function that finds duplicates under an erasing function
that deletes bitcodes.
We shall also introduce our own argument and definitions for
establishing the correctness of the bitcoded algorithm when 
simplifications are included.\medskip

\noindent In this paper, we shall first briefly introduce the basic notions
of regular expressions and describe the basic definitions
of POSIX lexing from our earlier work \cite{AusafDyckhoffUrban2016}. This serves
as a reference point for what correctness means in our Isabelle/HOL proofs. We shall then prove
the correctness for the bitcoded algorithm without simplification, and
after that extend the proof to include simplification. 

*}

section {* Background *}

text {*
  In our Isabelle/HOL formalisation strings are lists of characters with
  the empty string being represented by the empty list, written $[]$,
  and list-cons being written as $\_\!::\!\_\,$; string
  concatenation is $\_ \,@\, \_\,$. We often use the usual
  bracket notation for lists also for strings; for example a string
  consisting of just a single character $c$ is written $[c]$.   
  Our regular expressions are defined as usual as the elements of the following inductive
  datatype:

  \begin{center}
  @{text "r ::="} \;
  @{const "ZERO"} $\mid$
  @{const "ONE"} $\mid$
  @{term "CH c"} $\mid$
  @{term "ALT r\<^sub>1 r\<^sub>2"} $\mid$
  @{term "SEQ r\<^sub>1 r\<^sub>2"} $\mid$
  @{term "STAR r"} 
  \end{center}

  \noindent where @{const ZERO} stands for the regular expression that does
  not match any string, @{const ONE} for the regular expression that matches
  only the empty string and @{term c} for matching a character literal.
  The constructors $+$ and $\cdot$ represent alternatives and sequences, respectively.
  The
  \emph{language} of a regular expression, written $L$, is defined as usual
  and we omit giving the definition here (see for example \cite{AusafDyckhoffUrban2016}).

  Central to Brzozowski's regular expression matcher are two functions
  called @{text nullable} and \emph{derivative}. The latter is written
  $r\backslash c$ for the derivative of the regular expression $r$
  w.r.t.~the character $c$. Both functions are defined by recursion over
  regular expressions.  

\begin{center}
\begin{tabular}{cc}
  \begin{tabular}{r@ {\hspace{2mm}}c@ {\hspace{2mm}}l}
  @{thm (lhs) der.simps(1)} & $\dn$ & @{thm (rhs) der.simps(1)}\\
  @{thm (lhs) der.simps(2)} & $\dn$ & @{thm (rhs) der.simps(2)}\\
  @{thm (lhs) der.simps(3)} & $\dn$ & @{thm (rhs) der.simps(3)}\\
  @{thm (lhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]}\\
  @{thm (lhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{text "if"} @{term "nullable(r\<^sub>1)"}\\
  & & @{text "then"} @{term "ALT (SEQ (der c r\<^sub>1) r\<^sub>2) (der c r\<^sub>2)"}\\
  & & @{text "else"} @{term "SEQ (der c r\<^sub>1) r\<^sub>2"}\\
  % & & @{thm (rhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]}\\
  @{thm (lhs) der.simps(6)} & $\dn$ & @{thm (rhs) der.simps(6)}
  \end{tabular}
  &
  \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
  @{thm (lhs) nullable.simps(1)} & $\dn$ & @{thm (rhs) nullable.simps(1)}\\
  @{thm (lhs) nullable.simps(2)} & $\dn$ & @{thm (rhs) nullable.simps(2)}\\
  @{thm (lhs) nullable.simps(3)} & $\dn$ & @{thm (rhs) nullable.simps(3)}\\
  @{thm (lhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
  @{thm (lhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
  @{thm (lhs) nullable.simps(6)} & $\dn$ & @{thm (rhs) nullable.simps(6)}\medskip\\
  \end{tabular}  
\end{tabular}  
\end{center}

  \noindent
  We can extend this definition to give derivatives w.r.t.~strings:

  \begin{center}
  \begin{tabular}{cc}
  \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
  @{thm (lhs) ders.simps(1)} & $\dn$ & @{thm (rhs) ders.simps(1)}
  \end{tabular}
  &
  \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
  @{thm (lhs) ders.simps(2)} & $\dn$ & @{thm (rhs) ders.simps(2)}
  \end{tabular}
  \end{tabular}
  \end{center}

\noindent
Using @{text nullable} and the derivative operation, we can
define the following simple regular expression matcher:
%
\[
@{text "match s r"} \;\dn\; @{term nullable}(r\backslash s)
\]

\noindent This is essentially Brzozowski's algorithm from 1964. Its
main virtue is that the algorithm can be easily implemented as a
functional program (either in a functional programming language or in
a theorem prover). The correctness proof for @{text match} amounts to
establishing the property
%
\begin{proposition}\label{matchcorr} 
@{text "match s r"} \;\;\text{if and only if}\;\; $s \in L(r)$
\end{proposition}

\noindent
It is a fun exercise to formaly prove this property in a theorem prover.

The novel idea of Sulzmann and Lu is to extend this algorithm for 
lexing, where it is important to find out which part of the string
is matched by which part of the regular expression.
For this Sulzmann and Lu presented two lexing algorithms in their paper
  \cite{Sulzmann2014}. The first algorithm consists of two phases: first a
  matching phase (which is Brzozowski's algorithm) and then a value
  construction phase. The values encode \emph{how} a regular expression
  matches a string. \emph{Values} are defined as the inductive datatype

  \begin{center}
  @{text "v :="}
  @{const "Void"} $\mid$
  @{term "val.Char c"} $\mid$
  @{term "Left v"} $\mid$
  @{term "Right v"} $\mid$
  @{term "Seq v\<^sub>1 v\<^sub>2"} $\mid$ 
  @{term "Stars vs"} 
  \end{center}  

  \noindent where we use @{term vs} to stand for a list of values. The
  string underlying a value can be calculated by a @{const flat}
  function, written @{term "flat DUMMY"}. It traverses a value and
  collects the characters contained in it. Sulzmann and Lu also define inductively an
  inhabitation relation that associates values to regular expressions:

  \begin{center}
  \begin{tabular}{c}
  \\[-8mm]
  @{thm[mode=Axiom] Prf.intros(4)} \qquad
  @{thm[mode=Axiom] Prf.intros(5)[of "c"]}\\[4mm]
  @{thm[mode=Rule] Prf.intros(2)[of "v\<^sub>1" "r\<^sub>1" "r\<^sub>2"]} \qquad 
  @{thm[mode=Rule] Prf.intros(3)[of "v\<^sub>2" "r\<^sub>1" "r\<^sub>2"]}\\[4mm]
  @{thm[mode=Rule] Prf.intros(1)[of "v\<^sub>1" "r\<^sub>1" "v\<^sub>2" "r\<^sub>2"]} \qquad
  @{thm[mode=Rule] Prf.intros(6)[of "vs" "r"]}
  \end{tabular}
  \end{center}

  \noindent Note that no values are associated with the regular expression
  @{term ZERO}, since it cannot match any string.
  It is routine to establish how values ``inhabiting'' a regular
  expression correspond to the language of a regular expression, namely

  \begin{proposition}
  @{thm L_flat_Prf}
  \end{proposition}

  In general there is more than one value inhabited by a regular
  expression (meaning regular expressions can typically match more
  than one string). But even when fixing a string from the language of the
  regular expression, there are generally more than one way of how the
  regular expression can match this string. POSIX lexing is about
  identifying the unique value for a given regular expression and a
  string that satisfies the informal POSIX rules (see
  \cite{POSIX,Kuklewicz,OkuiSuzuki2010,Sulzmann2014,Vansummeren2006}).\footnote{POSIX
	lexing acquired its name from the fact that the corresponding
	rules were described as part of the POSIX specification for
	Unix-like operating systems \cite{POSIX}.} Sometimes these
  informal rules are called \emph{maximal much rule} and \emph{rule priority}.
  One contribution of our earlier paper is to give a convenient
 specification for what a POSIX value is (the inductive rules are shown in
  Figure~\ref{POSIXrules}).

\begin{figure}[t]
  \begin{center}
  \begin{tabular}{c}
  @{thm[mode=Axiom] Posix.intros(1)}\<open>P\<close>@{term "ONE"} \qquad
  @{thm[mode=Axiom] Posix.intros(2)}\<open>P\<close>@{term "c"}\medskip\\
  @{thm[mode=Rule] Posix.intros(3)[of "s" "r\<^sub>1" "v" "r\<^sub>2"]}\<open>P+L\<close>\qquad
  @{thm[mode=Rule] Posix.intros(4)[of "s" "r\<^sub>2" "v" "r\<^sub>1"]}\<open>P+R\<close>\medskip\\
  $\mprset{flushleft}
   \inferrule
   {@{thm (prem 1) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \qquad
    @{thm (prem 2) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \\\\
    @{thm (prem 3) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}
   {@{thm (concl) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}$\<open>PS\<close>\medskip\smallskip\\
  @{thm[mode=Axiom] Posix.intros(7)}\<open>P[]\<close>\qquad
  $\mprset{flushleft}
   \inferrule
   {@{thm (prem 1) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
    @{thm (prem 2) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
    @{thm (prem 3) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \\\\
    @{thm (prem 4) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}
   {@{thm (concl) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}$\<open>P\<star>\<close>\\[-4mm]
  \end{tabular}
  \end{center}
  \caption{The inductive definition of POSIX values taken from our earlier paper \cite{AusafDyckhoffUrban2016}. The ternary relation, written $(s, r) \rightarrow v$, formalises the notion
  of given a string $s$ and a regular
  expression $r$ what is the unique value $v$ that satisfies the informal POSIX constraints for
  regular expression matching.}\label{POSIXrules}
  \end{figure}

  The clever idea by Sulzmann and Lu \cite{Sulzmann2014} in their first algorithm is to define
  an injection function on values that mirrors (but inverts) the
  construction of the derivative on regular expressions. Essentially it
  injects back a character into a value.
  For this they define two functions called @{text mkeps} and @{text inj}:
 
  \begin{center}
  \begin{tabular}{l}
  \begin{tabular}{lcl}
  @{thm (lhs) mkeps.simps(1)} & $\dn$ & @{thm (rhs) mkeps.simps(1)}\\
  @{thm (lhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]}\\
  @{thm (lhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]}\\
  @{thm (lhs) mkeps.simps(4)} & $\dn$ & @{thm (rhs) mkeps.simps(4)}\\
  \end{tabular}\smallskip\\

  \begin{tabular}{lcl}
  @{thm (lhs) injval.simps(1)} & $\dn$ & @{thm (rhs) injval.simps(1)}\\
  @{thm (lhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]} & $\dn$ & 
      @{thm (rhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]}\\
  @{thm (lhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$ & 
      @{thm (rhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\
  @{thm (lhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$ 
      & @{thm (rhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\
  @{thm (lhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$ 
      & @{thm (rhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\
  @{thm (lhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$ 
      & @{thm (rhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\
  @{thm (lhs) injval.simps(7)[of "r" "c" "v" "vs"]} & $\dn$ 
      & @{thm (rhs) injval.simps(7)[of "r" "c" "v" "vs"]}
  \end{tabular}
  \end{tabular}
  \end{center}

  \noindent
  The function @{text mkeps} is run when the last derivative is nullable, that is
  the string to be matched is in the language of the regular expression. It generates
  a value for how the last derivative can match the empty string. The injection function
  then calculates the corresponding value for each intermediate derivative until
  a value for the original regular expression is generated.
  Graphically the algorithm by
  Sulzmann and Lu can be illustrated by the picture in Figure~\ref{Sulz}
  where the path from the left to the right involving @{term derivatives}/@{const
  nullable} is the first phase of the algorithm (calculating successive
  \Brz's derivatives) and @{const mkeps}/@{text inj}, the path from right to
  left, the second phase. The picture above shows the steps required when a
  regular expression, say @{text "r\<^sub>1"}, matches the string @{term
  "[a,b,c]"}. The lexing algorithm by Sulzmann and Lu can be defined as:

  \begin{figure}[t]
\begin{center}
\begin{tikzpicture}[scale=2,node distance=1.3cm,
                    every node/.style={minimum size=6mm}]
\node (r1)  {@{term "r\<^sub>1"}};
\node (r2) [right=of r1]{@{term "r\<^sub>2"}};
\draw[->,line width=1mm](r1)--(r2) node[above,midway] {@{term "der a DUMMY"}};
\node (r3) [right=of r2]{@{term "r\<^sub>3"}};
\draw[->,line width=1mm](r2)--(r3) node[above,midway] {@{term "der b DUMMY"}};
\node (r4) [right=of r3]{@{term "r\<^sub>4"}};
\draw[->,line width=1mm](r3)--(r4) node[above,midway] {@{term "der c DUMMY"}};
\draw (r4) node[anchor=west] {\;\raisebox{3mm}{@{term nullable}}};
\node (v4) [below=of r4]{@{term "v\<^sub>4"}};
\draw[->,line width=1mm](r4) -- (v4);
\node (v3) [left=of v4] {@{term "v\<^sub>3"}};
\draw[->,line width=1mm](v4)--(v3) node[below,midway] {\<open>inj r\<^sub>3 c\<close>};
\node (v2) [left=of v3]{@{term "v\<^sub>2"}};
\draw[->,line width=1mm](v3)--(v2) node[below,midway] {\<open>inj r\<^sub>2 b\<close>};
\node (v1) [left=of v2] {@{term "v\<^sub>1"}};
\draw[->,line width=1mm](v2)--(v1) node[below,midway] {\<open>inj r\<^sub>1 a\<close>};
\draw (r4) node[anchor=north west] {\;\raisebox{-8mm}{@{term "mkeps"}}};
\end{tikzpicture}
\end{center}
\mbox{}\\[-13mm]

\caption{The two phases of the first algorithm by Sulzmann \& Lu \cite{Sulzmann2014},
matching the string @{term "[a,b,c]"}. The first phase (the arrows from 
left to right) is \Brz's matcher building successive derivatives. If the 
last regular expression is @{term nullable}, then the functions of the 
second phase are called (the top-down and right-to-left arrows): first 
@{term mkeps} calculates a value @{term "v\<^sub>4"} witnessing
how the empty string has been recognised by @{term "r\<^sub>4"}. After
that the function @{term inj} ``injects back'' the characters of the string into
the values. The value @{term "v\<^sub>1"} is the result of the algorithm representing
the POSIX value for this string and
regular expression.
\label{Sulz}}
\end{figure} 



  \begin{center}
  \begin{tabular}{lcl}
  @{thm (lhs) lexer.simps(1)} & $\dn$ & @{thm (rhs) lexer.simps(1)}\\
  @{thm (lhs) lexer.simps(2)} & $\dn$ & @{text "case"} @{term "lexer (der c r) s"} @{text of}\\
                     & & \phantom{$|$} @{term "None"}  @{text "\<Rightarrow>"} @{term None}\\
                     & & $|$ @{term "Some v"} @{text "\<Rightarrow>"} @{term "Some (injval r c v)"}                          
  \end{tabular}
  \end{center}


We have shown in our earlier paper \cite{AusafDyckhoffUrban2016} that
this algorithm is correct, that is it generates POSIX values. The
cenral property we established relates the derivative operation to the
injection function.

  \begin{proposition}\label{Posix2}
	\textit{If} $(s,\; r\backslash c) \rightarrow v$ \textit{then} $(c :: s,\; r) \rightarrow$ \textit{inj} $r\; c\; v$. 
\end{proposition}

  \noindent
  With this in place we were able to prove:


  \begin{proposition}\mbox{}\smallskip\\\label{lexercorrect}
  \begin{tabular}{ll}
  (1) & @{thm (lhs) lexer_correct_None} if and only if @{thm (rhs) lexer_correct_None}\\
  (2) & @{thm (lhs) lexer_correct_Some} if and only if @{thm (rhs) lexer_correct_Some}\\
  \end{tabular}
  \end{proposition}

  \noindent
  In fact we have shown that in the success case the generated POSIX value $v$ is
  unique and in the failure case that there is no POSIX value $v$ that satisfies
  $(s, r) \rightarrow v$. While the algorithm is correct, it is excrutiatingly
  slow in cases where the derivatives grow arbitrarily (see example from the
  Introduction). However it can be used as a convenient reference point for the correctness
  proof of the second algorithm by Sulzmann and Lu, which we shall describe next.
  
*}

section {* Bitcoded Regular Expressions and Derivatives *}

text {*

  In the second part of their paper \cite{Sulzmann2014},
  Sulzmann and Lu describe another algorithm that also generates POSIX
  values but dispences with the second phase where characters are
  injected ``back'' into values. For this they annotate bitcodes to
  regular expressions, which we define in Isabelle/HOL as the datatype

  \begin{center}
  \begin{tabular}{lcl}
  @{term breg} & $::=$ & @{term "AZERO"} $\quad\mid\quad$ @{term "AONE bs"}\\
               & $\mid$ & @{term "ACHAR bs c"}\\
               & $\mid$ & @{term "AALTs bs rs"}\\
               & $\mid$ & @{term "ASEQ bs r\<^sub>1 r\<^sub>2"}\\
               & $\mid$ & @{term "ASTAR bs r"}
  \end{tabular}
  \end{center}

  \noindent where @{text bs} stands for bitsequences; @{text r},
  @{text "r\<^sub>1"} and @{text "r\<^sub>2"} for bitcoded regular
  expressions; and @{text rs} for lists of bitcoded regular
  expressions. The binary alternative @{text "ALT bs r\<^sub>1 r\<^sub>2"}
  is just an abbreviation for \mbox{@{text "ALTs bs [r\<^sub>1, r\<^sub>2]"}}. 
  For bitsequences we just use lists made up of the
  constants @{text Z} and @{text S}.  The idea with bitcoded regular
  expressions is to incrementally generate the value information (for
  example @{text Left} and @{text Right}) as bitsequences. For this 
  Sulzmann and Lu define a coding
  function for how values can be coded into bitsequences.

  \begin{center}
  \begin{tabular}{cc}
  \begin{tabular}{lcl}
  @{thm (lhs) code.simps(1)} & $\dn$ & @{thm (rhs) code.simps(1)}\\
  @{thm (lhs) code.simps(2)} & $\dn$ & @{thm (rhs) code.simps(2)}\\
  @{thm (lhs) code.simps(3)} & $\dn$ & @{thm (rhs) code.simps(3)}\\
  @{thm (lhs) code.simps(4)} & $\dn$ & @{thm (rhs) code.simps(4)}
  \end{tabular}
  &
  \begin{tabular}{lcl}
  @{thm (lhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]} & $\dn$ & @{thm (rhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]}\\
  @{thm (lhs) code.simps(6)} & $\dn$ & @{thm (rhs) code.simps(6)}\\
  @{thm (lhs) code.simps(7)} & $\dn$ & @{thm (rhs) code.simps(7)}\\
  \mbox{\phantom{XX}}\\
  \end{tabular}
  \end{tabular}
  \end{center}
   
  \noindent
  As can be seen, this coding is ``lossy'' in the sense that we do not
  record explicitly character values and also not sequence values (for
  them we just append two bitsequences). However, the
  different alternatives for @{text Left}, respectively @{text Right}, are recorded as @{text Z} and
  @{text S} followed by some bitsequence. Similarly, we use @{text Z} to indicate
  if there is still a value coming in the list of @{text Stars}, whereas @{text S}
  indicates the end of the list. The lossiness makes the process of
  decoding a bit more involved, but the point is that if we have a
  regular expression \emph{and} a bitsequence of a corresponding value,
  then we can always decode the value accurately. The decoding can be
  defined by using two functions called $\textit{decode}'$ and
  \textit{decode}:

  \begin{center}
  \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
  $\textit{decode}'\,bs\,(\ONE)$ & $\dn$ & $(\Empty, bs)$\\
  $\textit{decode}'\,bs\,(c)$ & $\dn$ & $(\Char\,c, bs)$\\
  $\textit{decode}'\,(\Z\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
     $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}\;
       (\Left\,v, bs_1)$\\
  $\textit{decode}'\,(\S\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
     $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_2\;\textit{in}\;
       (\Right\,v, bs_1)$\\                           
  $\textit{decode}'\,bs\;(r_1\cdot r_2)$ & $\dn$ &
        $\textit{let}\,(v_1, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}$\\
  & &   $\textit{let}\,(v_2, bs_2) = \textit{decode}'\,bs_1\,r_2$
        \hspace{2mm}$\textit{in}\;(\Seq\,v_1\,v_2, bs_2)$\\
  $\textit{decode}'\,(\Z\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\
  $\textit{decode}'\,(\S\!::\!bs)\,(r^*)$ & $\dn$ & 
         $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r\;\textit{in}$\\
  & &   $\textit{let}\,(\Stars\,vs, bs_2) = \textit{decode}'\,bs_1\,r^*$
        \hspace{2mm}$\textit{in}\;(\Stars\,v\!::\!vs, bs_2)$\bigskip\\
  $\textit{decode}\,bs\,r$ & $\dn$ &
     $\textit{let}\,(v, bs') = \textit{decode}'\,bs\,r\;\textit{in}$\\
  & & \hspace{7mm}$\textit{if}\;bs' = []\;\textit{then}\;\textit{Some}\,v\;
       \textit{else}\;\textit{None}$   
  \end{tabular}    
  \end{center}

  \noindent
  The function \textit{decode} checks whether all of the bitsequence is
  consumed and returns the corresponding value as @{term "Some v"}; otherwise
  it fails with @{text "None"}. We can establish that for a value $v$
  inhabited by a regular expression $r$, the decoding of its
  bitsequence never fails.

\begin{lemma}\label{codedecode}\it
  If $\;\vdash v : r$ then
  $\;\textit{decode}\,(\textit{code}\, v)\,r = \textit{Some}\, v$.
\end{lemma}

\begin{proof}
  This follows from the property that
  $\textit{decode}'\,((\textit{code}\,v) \,@\, bs)\,r = (v, bs)$ holds
  for any bit-sequence $bs$ and $\vdash v : r$. This property can be
  easily proved by induction on $\vdash v : r$.
\end{proof}  

  Sulzmann and Lu define the function \emph{internalise}
  in order to transform standard regular expressions into annotated
  regular expressions. We write this operation as $r^\uparrow$.
  This internalisation uses the following
  \emph{fuse} function.	     

  \begin{center}
  \begin{tabular}{lcl}
  $\textit{fuse}\,bs\,(\textit{ZERO})$ & $\dn$ & $\textit{ZERO}$\\
  $\textit{fuse}\,bs\,(\textit{ONE}\,bs')$ & $\dn$ &
     $\textit{ONE}\,(bs\,@\,bs')$\\
  $\textit{fuse}\,bs\,(\textit{CHAR}\,bs'\,c)$ & $\dn$ &
     $\textit{CHAR}\,(bs\,@\,bs')\,c$\\
  $\textit{fuse}\,bs\,(\textit{ALTs}\,bs'\,rs)$ & $\dn$ &
     $\textit{ALTs}\,(bs\,@\,bs')\,rs$\\
  $\textit{fuse}\,bs\,(\textit{SEQ}\,bs'\,r_1\,r_2)$ & $\dn$ &
     $\textit{SEQ}\,(bs\,@\,bs')\,r_1\,r_2$\\
  $\textit{fuse}\,bs\,(\textit{STAR}\,bs'\,r)$ & $\dn$ &
     $\textit{STAR}\,(bs\,@\,bs')\,r$
  \end{tabular}    
  \end{center}    

  \noindent
  A regular expression can then be \emph{internalised} into a bitcoded
  regular expression as follows.

  \begin{center}
  \begin{tabular}{lcl}
  $(\ZERO)^\uparrow$ & $\dn$ & $\textit{ZERO}$\\
  $(\ONE)^\uparrow$ & $\dn$ & $\textit{ONE}\,[]$\\
  $(c)^\uparrow$ & $\dn$ & $\textit{CHAR}\,[]\,c$\\
  $(r_1 + r_2)^\uparrow$ & $\dn$ &
         $\textit{ALT}\;[]\,(\textit{fuse}\,[\Z]\,r_1^\uparrow)\,
                            (\textit{fuse}\,[\S]\,r_2^\uparrow)$\\
  $(r_1\cdot r_2)^\uparrow$ & $\dn$ &
         $\textit{SEQ}\;[]\,r_1^\uparrow\,r_2^\uparrow$\\
  $(r^*)^\uparrow$ & $\dn$ &
         $\textit{STAR}\;[]\,r^\uparrow$\\
  \end{tabular}    
  \end{center}    

  \noindent
  There is also an \emph{erase}-function, written $r^\downarrow$, which
  transforms a bitcoded regular expression into a (standard) regular
  expression by just erasing the annotated bitsequences. We omit the
  straightforward definition. For defining the algorithm, we also need
  the functions \textit{bnullable} and \textit{bmkeps}, which are the
  ``lifted'' versions of \textit{nullable} and \textit{mkeps} acting on
  bitcoded regular expressions, instead of regular expressions.
  %
  \begin{center}
  \begin{tabular}{@ {}c@ {}c@ {}}
  \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
  $\textit{bnullable}\,(\textit{ZERO})$ & $\dn$ & $\textit{false}$\\
  $\textit{bnullable}\,(\textit{ONE}\,bs)$ & $\dn$ & $\textit{true}$\\
  $\textit{bnullable}\,(\textit{CHAR}\,bs\,c)$ & $\dn$ & $\textit{false}$\\
  $\textit{bnullable}\,(\textit{ALTs}\,bs\,\rs)$ & $\dn$ &
     $\exists\, r \in \rs. \,\textit{bnullable}\,r$\\
  $\textit{bnullable}\,(\textit{SEQ}\,bs\,r_1\,r_2)$ & $\dn$ &
     $\textit{bnullable}\,r_1\wedge \textit{bnullable}\,r_2$\\
  $\textit{bnullable}\,(\textit{STAR}\,bs\,r)$ & $\dn$ &
     $\textit{true}$
  \end{tabular}
  &
  \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
  $\textit{bmkeps}\,(\textit{ONE}\,bs)$ & $\dn$ & $bs$\\
  $\textit{bmkeps}\,(\textit{ALTs}\,bs\,r\!::\!\rs)$ & $\dn$ &
     $\textit{if}\;\textit{bnullable}\,r$\\
  & &$\textit{then}\;bs\,@\,\textit{bmkeps}\,r$\\
  & &$\textit{else}\;bs\,@\,\textit{bmkeps}\,\rs$\\
  $\textit{bmkeps}\,(\textit{SEQ}\,bs\,r_1\,r_2)$ & $\dn$ &\\
  \multicolumn{3}{r}{$bs \,@\,\textit{bmkeps}\,r_1\,@\, \textit{bmkeps}\,r_2$}\\
  $\textit{bmkeps}\,(\textit{STAR}\,bs\,r)$ & $\dn$ &
     $bs \,@\, [\S]$
  \end{tabular}
  \end{tabular}
  \end{center}    
 

  \noindent
  The key function in the bitcoded algorithm is the derivative of a
  bitcoded regular expression. This derivative calculates the
  derivative but at the same time also the incremental part of bitsequences
  that contribute to constructing a POSIX value.	

  \begin{center}
  \begin{tabular}{@ {}lcl@ {}}
  $(\textit{ZERO})\backslash c$ & $\dn$ & $\textit{ZERO}$ \\  
  $(\textit{ONE}\;bs)\backslash c$ & $\dn$ & $\textit{ZERO}$\\  
  $(\textit{CHAR}\;bs\,d)\backslash c$ & $\dn$ &
        $\textit{if}\;c=d\; \;\textit{then}\;
         \textit{ONE}\;bs\;\textit{else}\;\textit{ZERO}$\\  
  $(\textit{ALTs}\;bs\,\rs)\backslash c$ & $\dn$ &
        $\textit{ALTs}\,bs\,(\mathit{map}\,(\_\backslash c)\,\rs)$\\
  $(\textit{SEQ}\;bs\,r_1\,r_2)\backslash c$ & $\dn$ &
     $\textit{if}\;\textit{bnullable}\,r_1$\\
  & &$\textit{then}\;\textit{ALT}\,bs\,(\textit{SEQ}\,[]\,(r_1\backslash c)\,r_2)$\\
  & &$\phantom{\textit{then}\;\textit{ALT}\,bs\,}(\textit{fuse}\,(\textit{bmkeps}\,r_1)\,(r_2\backslash c))$\\
  & &$\textit{else}\;\textit{SEQ}\,bs\,(r_1\backslash c)\,r_2$\\
  $(\textit{STAR}\,bs\,r)\backslash c$ & $\dn$ &
      $\textit{SEQ}\;bs\,(\textit{fuse}\, [\Z] (r\backslash c))\,
       (\textit{STAR}\,[]\,r)$
  \end{tabular}    
  \end{center}


  \noindent
  This function can also be extended to strings, written $r\backslash s$,
  just like the standard derivative.  We omit the details. Finally we
  can define Sulzmann and Lu's bitcoded lexer, which we call \textit{blexer}:

  \begin{center}
\begin{tabular}{lcl}
  $\textit{blexer}\;r\,s$ & $\dn$ &
      $\textit{let}\;r_{der} = (r^\uparrow)\backslash s\;\textit{in}$\\                
  & & $\;\;\;\;\textit{if}\; \textit{bnullable}(r_{der}) \;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,r_{der})\,r
       \;\;\textit{else}\;\textit{None}$
\end{tabular}
\end{center}

  \noindent
This bitcoded lexer first internalises the regular expression $r$ and then
builds the bitcoded derivative according to $s$. If the derivative is
(b)nullable the string is in the language of $r$ and it extracts the bitsequence using the
$\textit{bmkeps}$ function. Finally it decodes the bitsequence into a value.  If
the derivative is \emph{not} nullable, then $\textit{None}$ is
returned. We can show that this way of calculating a value
generates the same result as \textit{lexer}.

Before we can proceed we need to define a helper function, called
\textit{retrieve}, which Sulzmann and Lu introduced for the correctness proof.

\begin{center}
  \begin{tabular}{lcl}
  @{thm (lhs) retrieve.simps(1)} & $\dn$ & @{thm (rhs) retrieve.simps(1)}\\
  @{thm (lhs) retrieve.simps(2)} & $\dn$ & @{thm (rhs) retrieve.simps(2)}\\
  @{thm (lhs) retrieve.simps(3)} & $\dn$ & @{thm (rhs) retrieve.simps(3)}\\
  @{thm (lhs) better_retrieve(1)} & $\dn$ & @{thm (rhs) better_retrieve(1)}\\
  @{thm (lhs) better_retrieve(2)} & $\dn$ & @{thm (rhs) better_retrieve(2)}\\
  @{thm (lhs) retrieve.simps(6)[of _ "r\<^sub>1" "r\<^sub>2" "v\<^sub>1" "v\<^sub>2"]}
      & $\dn$ & @{thm (rhs) retrieve.simps(6)[of _ "r\<^sub>1" "r\<^sub>2" "v\<^sub>1" "v\<^sub>2"]}\\
  @{thm (lhs) retrieve.simps(7)} & $\dn$ & @{thm (rhs) retrieve.simps(7)}\\
  @{thm (lhs) retrieve.simps(8)} & $\dn$ & @{thm (rhs) retrieve.simps(8)}
  \end{tabular}
  \end{center}

\noindent
The idea behind this function is to retrieve a possibly partial
bitsequence from a bitcoded regular expression, where the retrieval is
guided by a value.  For example if the value is $\Left$ then we
descend into the left-hand side of an alternative in order to
assemble the bitcode. Similarly for
$\Right$. The property we can show is that for a given $v$ and $r$
with $\vdash v : r$, the retrieved bitsequence from the internalised
regular expression is equal to the bitcoded version of $v$.

\begin{lemma}\label{retrievecode}
If $\vdash v : r$ then $\textit{code}\, v = \textit{retrieve}\,(r^\uparrow)\,v$.
\end{lemma}

\noindent
We also need some auxiliary facts about how the bitcoded operations
relate to the ``standard'' operations on regular expressions. For
example if we build a bitcoded derivative and erase the result, this
is the same as if we first erase the bitcoded regular expression and
then perform the ``standard'' derivative operation.

\begin{lemma}\label{bnullable}\mbox{}\smallskip\\
  \begin{tabular}{ll}
\textit{(1)} & $(a\backslash s)^\downarrow = (a^\downarrow)\backslash s$\\    
\textit{(2)} & $\textit{bnullable}(a)$ iff $\textit{nullable}(a^\downarrow)$\\
\textit{(3)} & $\textit{bmkeps}(a) = \textit{retrieve}\,a\,(\textit{mkeps}\,(a^\downarrow))$ provided $\textit{nullable}(a^\downarrow)$.
\end{tabular}  
\end{lemma}

\begin{proof}
  All properties are by induction on annotated regular expressions. There are no
  interesting cases.
\end{proof}

\noindent
The only difficulty left for the correctness proof is that the bitcoded algorithm
has only a ``forward phase'' where POSIX values are generated incrementally.
We can achieve the same effect with @{text lexer} by stacking up injection
functions during the forward phase. An auxiliary function, called $\textit{flex}$,
allows us to recast the rules of $\lexer$ (with its two phases) in terms of a single
phase and stacked up injection functions.

\begin{center}
\begin{tabular}{lcl}
  $\textit{flex}\;r\,f\,[]$ & $\dn$ & $f$\\
  $\textit{flex}\;r\,f\,(c\!::\!s)$ & $\dn$ &
  $\textit{flex}\,(r\backslash c)\,(\lambda v.\,f\,(\inj\,r\,c\,v))\,s$\\
\end{tabular}    
\end{center}    

\noindent
The point of this function is that when
reaching the end of the string, we just need to apply the stacked
injection functions to the value generated by @{text mkeps}.
Using this function we can recast the success case in @{text lexer} 
as follows:

\begin{proposition}\label{flex}
If @{text "lexer r s = Some v"} \;then\; @{text "v = "}$\,\textit{flex}\,r\,id\,s\,
      (\mkeps (r\backslash s))$.
\end{proposition}

\noindent
Note we did not redefine \textit{lexer}, we just established that the
value generated by \textit{lexer} can also be obtained by a different
method. While this different method is not efficient (we essentially
need to traverse the string $s$ twice, once for building the
derivative $r\backslash s$ and another time for stacking up injection
functions using \textit{flex}), it helps us with proving
that incrementally building up values generates the same result.

This brings us to our main lemma in this section: if we calculate a
derivative, say $r\backslash s$ and have a value, say $v$, inhabited
by this derivative, then we can produce the result $\lexer$ generates
by applying this value to the stacked-up injection functions
that $\textit{flex}$ assembles. The lemma establishes that this is the same
value as if we build the annotated derivative $r^\uparrow\backslash s$
and then retrieve the corresponding bitcoded version, followed by a
decoding step.

\begin{lemma}[Main Lemma]\label{mainlemma}\it
If $\vdash v : r\backslash s$ then 
\[\textit{Some}\,(\textit{flex}\,r\,\textit{id}\,s\,v) =
  \textit{decode}(\textit{retrieve}\,(r^\uparrow \backslash s)\,v)\,r\]
\end{lemma}  

\begin{proof}
  This can be proved by induction on $s$ and generalising over
  $v$. The interesting point is that we need to prove this in the
  reverse direction for $s$. This means instead of cases $[]$ and
  $c\!::\!s$, we have cases $[]$ and $s\,@\,[c]$ where we unravel the
  string from the back.\footnote{Isabelle/HOL provides an induction principle
    for this way of performing the induction.}

  The case for $[]$ is routine using Lemmas~\ref{codedecode}
  and~\ref{retrievecode}. In the case $s\,@\,[c]$, we can infer from
  the assumption that $\vdash v : (r\backslash s)\backslash c$
  holds. Hence by Prop.~\ref{Posix2} we know that 
  (*) $\vdash \inj\,(r\backslash s)\,c\,v : r\backslash s$ holds too.
  By definition of $\textit{flex}$ we can unfold the left-hand side
  to be
  \[
    \textit{Some}\,(\textit{flex}\;r\,\textit{id}\,(s\,@\,[c])\,v) =
    \textit{Some}\,(\textit{flex}\;r\,\textit{id}\,s\,(\inj\,(r\backslash s)\,c\,v))  
  \]  

  \noindent
  By induction hypothesis and (*) we can rewrite the right-hand side to
  %
  \[
    \textit{decode}\,(\textit{retrieve}\,(r^\uparrow\backslash s)\;
    (\inj\,(r\backslash s)\,c\,\,v))\,r
  \]

  \noindent
  which is equal to
  $\textit{decode}\,(\textit{retrieve}\, (r^\uparrow\backslash
  (s\,@\,[c]))\,v)\,r$ as required. The last rewrite step is possible
  because we generalised over $v$ in our induction.
\end{proof}  

\noindent
With this lemma in place, we can prove the correctness of \textit{blexer}---it indeed
produces the same result as \textit{lexer}.


\begin{theorem}
$\textit{lexer}\,r\,s = \textit{blexer}\,r\,s$
\end{theorem}  

\begin{proof}
  We can first expand both sides using Prop.~\ref{flex} and the
  definition of \textit{blexer}. This gives us two
  \textit{if}-statements, which we need to show to be equal. By 
  Lemma~\ref{bnullable}\textit{(2)} we know the \textit{if}-tests coincide:
  \[
    \textit{bnullable}(r^\uparrow\backslash s) \;\textit{iff}\;
    \nullable(r\backslash s)
  \]

  \noindent
  For the \textit{if}-branch suppose $r_d \dn r^\uparrow\backslash s$ and
  $d \dn r\backslash s$. We have (*) @{text "nullable d"}. We can then show
  by Lemma~\ref{bnullable}\textit{(3)} that
  %
  \[
    \textit{decode}(\textit{bmkeps}\,r_d)\,r =
    \textit{decode}(\textit{retrieve}\,a\,(\textit{mkeps}\,d))\,r
  \]

  \noindent
  where the right-hand side is equal to
  $\textit{Some}\,(\textit{flex}\,r\,\textit{id}\,s\,(\textit{mkeps}\,
  d))$ by Lemma~\ref{mainlemma} (we know
  $\vdash \textit{mkeps}\,d : d$ by (*)).  This shows the
  \textit{if}-branches return the same value. In the
  \textit{else}-branches both \textit{lexer} and \textit{blexer} return
  \textit{None}. Therefore we can conclude the proof.
\end{proof}  

\noindent This establishes that the bitcoded algorithm by Sulzmann and
Lu \emph{without} simplification produces correct results. This was
only conjectured by Sulzmann and Lu in their paper
\cite{Sulzmann2014}. The next step is to add simplifications.

*}


section {* Simplification *}

text {*

     Derivatives as calculated by Brzozowski’s method are usually more
     complex regular expressions than the initial one; the result is
     that derivative-based matching and lexing algorithms are
     often abysmally slow if the ``growth problem'' is not addressed. As Sulzmann and Lu wrote, various
     optimisations are possible, such as the simplifications
     $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r \Rightarrow r$,
     $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow r$. While these
     simplifications can considerably speed up the two algorithms  in many
     cases, they do not solve fundamentally the growth problem with
     derivatives. To see this let us return to the example from the
     Introduction that shows the derivatives for \mbox{@{text "(a + aa)\<^sup>*"}}.
     If we delete in the 3rd step all $\ZERO{}s$ and $\ONE$s according to
     the simplification rules shown above we obtain
     %
     \def\xll{\xrightarrow{\_\backslash{} [a, a, a]}}%%
     %
     \begin{equation}\label{derivex}
     (a + aa)^* \quad\xll\quad
      \underbrace{\mbox{$(\ONE + \ONE{}a) \cdot (a + aa)^*$}}_{r} \;+\;
     ((a + aa)^* + \underbrace{\mbox{$(\ONE + \ONE{}a) \cdot (a + aa)^*$}}_{r})
     \end{equation}

     \noindent This is a simpler derivative, but unfortunately we
     cannot make further simplifications. This is a problem because
     the outermost alternatives contains two copies of the same
     regular expression (underlined with $r$). The copies will
     spawn new copies in later steps and they in turn further copies. This
     destroys an hope of taming the size of the derivatives.  But the
     second copy of $r$ in \eqref{derivex} will never contribute to a
     value, because POSIX lexing will always prefer matching a string
     with the first copy. So in principle it could be removed.
     The dilemma with the simple-minded
     simplification rules above is that the rule $r + r \Rightarrow r$
     will never be applicable because as can be seen in this example the
     regular expressions are separated by another sub-regular expression.

     But here is where Sulzmann and Lu's representation of generalised
     alternatives in the bitcoded algorithm shines: in @{term
     "ALTs bs rs"} we can define a more aggressive simplification by
     recursively simplifying all regular expressions in @{text rs} and
     then analyse the resulting list and remove any duplicates.
     Another advantage is that the bitsequences in  bitcoded regular
     expressions can be easily modified such that simplification does not
     interfere with the value constructions. For example we can ``flatten'', or
     de-nest, @{text ALTs} as follows
     %
     \[
     @{term "ALTs bs\<^sub>1 ((ALTs bs\<^sub>2 rs\<^sub>2) # rs\<^sub>1)"}
     \quad\xrightarrow{bsimp}\quad
     @{term "ALTs bs\<^sub>1 ((map (fuse bs\<^sub>2) rs\<^sub>2) # rs\<^sub>1)"}
     \]

     \noindent
     where we just need to fuse the bitsequence that has accumulated in @{text "bs\<^sub>2"}
     to the alternatives in @{text "rs\<^sub>2"}. As we shall show below this will
     ensure that the correct value corresponding to the original (unsimplified)
     regular expression can still be extracted. %In this way the value construction
     %is not affected by simplification. 

     However there is one problem with the definition for the more
     aggressive simlification rules by Sulzmann and Lu. Recasting
     their definition with our syntax they define the step of removing
     duplicates as
     %
     \[ @{text "bsimp (ALTs bs rs)"} \dn @{text "ALTs
     bs (nup (map bsimp rs))"}
     \]

     \noindent where they first recursively simplify the regular
     expressions in @{text rs} (using @{text map}) and then use
     Haskell's @{text nub}-function to remove potential
     duplicates. While this makes sense when considering the example
     shown in \eqref{derivex}, @{text nub} is the inappropriate
     function in the case of bitcoded regular expressions. The reason
     is that in general the n elements in @{text rs} will have a
     different bitsequence annotated to it and in this way @{text nub}
     will never find a duplicate to be removed. The correct way to
     handle this situation is to first \emph{erase} the regular
     expressions when comparing potential duplicates. This is inspired
     by Scala's list functions of the form \mbox{@{text "distinctBy rs f
     acc"}} where a function is applied first before two elements
     are compared. We define this function in Isabelle/HOL as

     \begin{center}
     \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
     @{thm (lhs) distinctBy.simps(1)} & $\dn$ & @{thm (rhs) distinctBy.simps(1)}\\
     @{thm (lhs) distinctBy.simps(2)} & $\dn$ & @{thm (rhs) distinctBy.simps(2)}
     \end{tabular}
     \end{center}

     \noindent where we scan the list from left to right (because we
     have to remove later copies). In this function, @{text f} is a
     function and @{text acc} is an accumulator for regular
     expressions---essentially a set of elements we have already seen
     while scanning the list. Therefore we delete an element, say @{text x},
     from the list provided @{text "f x"} is already in the accumulator;
     otherwise we keep @{text x} and scan the rest of the list but now
     add @{text "f x"} as another element to @{text acc}. We will use
     @{term distinctBy} where @{text f} is our erase functions, @{term "erase (DUMMY)"},
     that deletes bitsequences from bitcoded regular expressions.
     This is clearly a computationally more expensive operation, than @{text nub},
     but is needed in order to make the removal of unnecessary copies
     to work.

     Our simplification function depends on three helper functions, one is called
     @{text flts} and defined as follows:

     \begin{center}
     \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
     @{thm (lhs) flts.simps(1)} & $\dn$ & @{thm (rhs) flts.simps(1)}\\
     @{thm (lhs) flts.simps(2)} & $\dn$ & @{thm (rhs) flts.simps(2)}\\
     @{thm (lhs) flts.simps(3)[of "bs'" "rs'"]} & $\dn$ & @{thm (rhs) flts.simps(3)[of "bs'" "rs'"]}\\
     \end{tabular}
     \end{center}

     \noindent
     The second clause removes all instances of @{text ZERO} in alternatives and
     the second ``spills'' out nested alternatives (but retaining the
     bitsequence @{text "bs'"} accumulated in the inner alternative). There are
     some corner cases to be considered when the resulting list inside an alternative is
     empty or a singleton list. We take care of those cases in the
     @{text "bsimpALTs"} function; similarly we define a helper function that simplifies
     sequences according to the usual rules about @{text ZERO}s and @{text ONE}s:

     \begin{center}
     \begin{tabular}{c@ {\hspace{5mm}}c}
     \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
     @{text "bsimpALTs bs []"}  & $\dn$ & @{text "ZERO"}\\
     @{text "bsimpALTs bs [r]"} & $\dn$ & @{text "fuse bs r"}\\
     @{text "bsimpALTs bs rs"}  & $\dn$ & @{text "ALTs bs rs"}\\
     \mbox{}\\
     \end{tabular}
     &
     \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
     @{text "bsimpSEQ bs _ ZERO"}  & $\dn$ & @{text "ZERO"}\\
     @{text "bsimpSEQ bs ZERO _"} & $\dn$ & @{text "ZERO"}\\
     @{text "bsimpSEQ bs\<^sub>1 (ONE bs\<^sub>2) r\<^sub>2"}
         & $\dn$ & @{text "fuse (bs\<^sub>1 @ bs\<^sub>2) r\<^sub>2"}\\
     @{text "bsimpSEQ bs r\<^sub>1 r\<^sub>2"} & $\dn$ &  @{text "SEQ bs r\<^sub>1 r\<^sub>2"}
     \end{tabular}
     \end{tabular}
     \end{center}

     \noindent
     With this in place we can define our simlification function as

     \begin{center}
     \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
     @{thm (lhs) bsimp.simps(1)[of "bs" "r\<^sub>1" "r\<^sub>2"]} & $\dn$ &
         @{thm (rhs) bsimp.simps(1)[of "bs" "r\<^sub>1" "r\<^sub>2"]}\\
     @{thm (lhs) bsimp.simps(2)[of "bs" _]} & $\dn$ & @{thm (rhs) bsimp.simps(2)[of "bs" _]}\\
     @{text "bsimp r"} & $\dn$ & @{text r}
     \end{tabular}
     \end{center}

     \noindent
     As far as we can see, our recursive function @{term bsimp} simplifies regular
     expressions as intended by Sulzmann and Lu. There is no point to apply the
     @{text bsimp}
     function repeatedly (like the simplification in their paper which is applied
     until a fixpoint is reached), because we can show that it is idempotent, that is

     \begin{proposition}
     ???
     \end{proposition}


     \begin{lemma}
     @{thm[mode=IfThen] bnullable0(1)[of "r\<^sub>1" "r\<^sub>2"]}
     \end{lemma}


     \begin{lemma}
     @{thm[mode=IfThen] rewrite_bmkeps_aux(1)[of "r\<^sub>1" "r\<^sub>2"]}
     \end{lemma}

     \begin{lemma}
     @{thm[mode=IfThen] rewrites_to_bsimp}
     \end{lemma}

     \begin{lemma}
     @{thm[mode=IfThen] rewrite_preserves_bder(1)[of "r\<^sub>1" "r\<^sub>2"]}
     \end{lemma}

     \begin{lemma}
     @{thm[mode=IfThen] central}
     \end{lemma}

     \begin{theorem}
     @{thm[mode=IfThen] main_blexer_simp}
     \end{theorem}

     Sulzmann \& Lu apply simplification via a fixpoint operation

     ; also does not use erase to filter out duplicates.
  
   not direct correspondence with PDERs, because of example
   problem with retrieve 

   correctness

   
    



   \begin{figure}[t]
  \begin{center}
  \begin{tabular}{c}
  @{thm[mode=Axiom] bs1[of _ "r\<^sub>2"]}\qquad
  @{thm[mode=Axiom] bs2[of _ "r\<^sub>1"]}\qquad
  @{thm[mode=Axiom] bs3[of "bs\<^sub>1" "bs\<^sub>2"]}\\
  @{thm[mode=Rule] bs4[of "r\<^sub>1" "r\<^sub>2" _ "r\<^sub>3"]}\qquad
  @{thm[mode=Rule] bs5[of "r\<^sub>3" "r\<^sub>4" _ "r\<^sub>1"]}\\
  @{thm[mode=Axiom] bs6}\qquad
  @{thm[mode=Axiom] bs7}\\
  @{thm[mode=Rule] bs8[of "rs\<^sub>1" "rs\<^sub>2"]}\\
  %@ { t hm[mode=Axiom] ss1}\qquad
  @{thm[mode=Rule] ss2[of "rs\<^sub>1" "rs\<^sub>2"]}\qquad
  @{thm[mode=Rule] ss3[of "r\<^sub>1" "r\<^sub>2"]}\\
  @{thm[mode=Axiom] ss4}\qquad
  @{thm[mode=Axiom] ss5[of "bs" "rs\<^sub>1" "rs\<^sub>2"]}\\
  @{thm[mode=Rule] ss6[of "r\<^sub>2" "r\<^sub>1" "rs\<^sub>1" "rs\<^sub>2" "rs\<^sub>3"]}\\
  \end{tabular}
  \end{center}
  \caption{???}\label{SimpRewrites}
  \end{figure}
*}

section {* Bound - NO *}


section {* Conclusion *}

text {*



%%\bibliographystyle{plain}
\bibliography{root}
*}

(*<*)
end
(*>*)
author	Christian Urban <christian.urban@kcl.ac.uk>
	Wed, 09 Feb 2022 00:29:04 +0000
changeset 424	2416fdec6396
parent 423	b7199d6c672d
child 425	14c558ae0b09
permissions	-rw-r--r--