thys3/Paper.thy
changeset 496 f493a20feeb3
child 498 ab626b60ee64
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/thys3/Paper.thy	Sat Apr 30 00:50:08 2022 +0100
@@ -0,0 +1,1545 @@
+(*<*)
+theory Paper
+  imports
+   "Posix.FBound" 
+   "HOL-Library.LaTeXsugar"
+begin
+
+declare [[show_question_marks = false]]
+
+notation (latex output)
+  If  ("(\<^latex>\<open>\\textrm{\<close>if\<^latex>\<open>}\<close> (_)/ \<^latex>\<open>\\textrm{\<close>then\<^latex>\<open>}\<close> (_)/ \<^latex>\<open>\\textrm{\<close>else\<^latex>\<open>}\<close> (_))" 10) and
+  Cons ("_\<^latex>\<open>\\mbox{$\\,$}\<close>::\<^latex>\<open>\\mbox{$\\,$}\<close>_" [75,73] 73) 
+
+
+abbreviation 
+  "der_syn r c \<equiv> der c r"
+abbreviation 
+ "ders_syn r s \<equiv> ders s r"  
+abbreviation 
+  "bder_syn r c \<equiv> bder c r"  
+
+notation (latex output)
+  der_syn ("_\\_" [79, 1000] 76) and
+  ders_syn ("_\\_" [79, 1000] 76) and
+  bder_syn ("_\\_" [79, 1000] 76) and
+  bders ("_\\_" [79, 1000] 76) and
+  bders_simp ("_\\\<^sub>b\<^sub>s\<^sub>i\<^sub>m\<^sub>p _" [79, 1000] 76) and
+
+  ZERO ("\<^bold>0" 81) and 
+  ONE ("\<^bold>1" 81) and 
+  CH ("_" [1000] 80) and
+  ALT ("_ + _" [77,77] 78) and
+  SEQ ("_ \<cdot> _" [77,77] 78) and
+  STAR ("_\<^sup>*" [79] 78) and
+
+  val.Void ("Empty" 78) and
+  val.Char ("Char _" [1000] 78) and
+  val.Left ("Left _" [79] 78) and
+  val.Right ("Right _" [1000] 78) and
+  val.Seq ("Seq _ _" [79,79] 78) and
+  val.Stars ("Stars _" [79] 78) and
+
+  Prf ("\<turnstile> _ : _" [75,75] 75) and  
+  Posix ("'(_, _') \<rightarrow> _" [63,75,75] 75) and
+
+  flat ("|_|" [75] 74) and
+  flats ("|_|" [72] 74) and
+  injval ("inj _ _ _" [79,77,79] 76) and 
+  mkeps ("mkeps _" [79] 76) and 
+  length ("len _" [73] 73) and
+  set ("_" [73] 73) and
+
+  AZERO ("ZERO" 81) and 
+  AONE ("ONE _" [79] 78) and 
+  ACHAR ("CHAR _ _" [79, 79] 80) and
+  AALTs ("ALTs _ _" [77,77] 78) and
+  ASEQ ("SEQ _ _ _" [79, 79,79] 78) and
+  ASTAR ("STAR _ _" [79, 79] 78) and
+
+  code ("code _" [79] 74) and
+  intern ("_\<^latex>\<open>\\mbox{$^\\uparrow$}\<close>" [900] 80) and
+  erase ("_\<^latex>\<open>\\mbox{$^\\downarrow$}\<close>" [1000] 74) and
+  bnullable ("bnullable _" [1000] 80) and
+  bsimp_AALTs ("bsimpALT _ _" [10,1000] 80) and
+  bsimp_ASEQ ("bsimpSEQ _ _ _" [10,1000,1000] 80) and
+  bmkeps ("bmkeps _" [1000] 80) and
+
+  eq1 ("_ \<approx> _" [77,77] 76) and 
+
+  srewrite ("_\<^latex>\<open>\\mbox{$\\,\\stackrel{s}{\\leadsto}$}\<close> _" [71, 71] 80) and
+  rrewrites ("_ \<^latex>\<open>\\mbox{$\\,\\leadsto^*$}\<close> _" [71, 71] 80) and
+  srewrites ("_ \<^latex>\<open>\\mbox{$\\,\\stackrel{s}{\\leadsto}^*$}\<close> _" [71, 71] 80) and
+  blexer_simp ("blexer\<^sup>+" 1000) 
+
+
+lemma better_retrieve:
+   shows "rs \<noteq> Nil ==> retrieve (AALTs bs (r#rs)) (Left v) = bs @ retrieve r v"
+   and   "rs \<noteq> Nil ==> retrieve (AALTs bs (r#rs)) (Right v) = bs @ retrieve (AALTs [] rs) v"
+  apply (metis list.exhaust retrieve.simps(4))
+  by (metis list.exhaust retrieve.simps(5))
+
+
+
+
+(*>*)
+
+section {* Introduction *}
+
+text {*
+
+In the last fifteen or so years, Brzozowski's derivatives of regular
+expressions have sparked quite a bit of interest in the functional
+programming and theorem prover communities.  The beauty of
+Brzozowski's derivatives \cite{Brzozowski1964} is that they are neatly
+expressible in any functional language, and easily definable and
+reasoned about in theorem provers---the definitions just consist of
+inductive datatypes and simple recursive functions.  Derivatives of a
+regular expression, written @{term "der c r"}, give a simple solution
+to the problem of matching a string @{term s} with a regular
+expression @{term r}: if the derivative of @{term r} w.r.t.\ (in
+succession) all the characters of the string matches the empty string,
+then @{term r} matches @{term s} (and {\em vice versa}).  We are aware
+of a mechanised correctness proof of Brzozowski's derivative-based matcher in HOL4 by
+Owens and Slind~\cite{Owens2008}; another one in Isabelle/HOL is part
+of the work by Krauss and Nipkow~\cite{Krauss2011}; yet another one
+in Coq is given by Coquand and Siles \cite{Coquand2012};
+and Ribeiro and Du Bois give one in Agda~\cite{RibeiroAgda2017}.
+
+
+However, there are two difficulties with derivative-based matchers:
+First, Brzozowski's original matcher only generates a yes/no answer
+for whether a regular expression matches a string or not.  This is too
+little information in the context of lexing where separate tokens must
+be identified and also classified (for example as keywords
+or identifiers).  Sulzmann and Lu~\cite{Sulzmann2014} overcome this
+difficulty by cleverly extending Brzozowski's matching
+algorithm. Their extended version generates additional information on
+\emph{how} a regular expression matches a string following the POSIX
+rules for regular expression matching. They achieve this by adding a
+second ``phase'' to Brzozowski's algorithm involving an injection
+function.  In our own earlier work we provided the formal
+specification of what POSIX matching means and proved in Isabelle/HOL
+the correctness
+of Sulzmann and Lu's extended algorithm accordingly
+\cite{AusafDyckhoffUrban2016}.
+
+The second difficulty is that Brzozowski's derivatives can 
+grow to arbitrarily large sizes. For example, if we start with the
+regular expression \mbox{@{text "(a + aa)\<^sup>*"}} and take
+successive derivatives according to the character $a$, we end up with
+a sequence of ever-growing derivatives like 
+
+\def\ll{\stackrel{\_\backslash{} a}{\longrightarrow}}
+\begin{center}
+\begin{tabular}{rll}
+$(a + aa)^*$ & $\ll$ & $(\ONE + \ONE{}a) \cdot (a + aa)^*$\\
+& $\ll$ & $(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* \;+\; (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
+& $\ll$ & $(\ZERO + \ZERO{}a + \ZERO) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^* \;+\; $\\
+& & $\qquad(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
+& $\ll$ & \ldots \hspace{15mm}(regular expressions of sizes 98, 169, 283, 468, 767, \ldots)
+\end{tabular}
+\end{center}
+ 
+\noindent where after around 35 steps we run out of memory on a
+typical computer (we shall define shortly the precise details of our
+regular expressions and the derivative operation).  Clearly, the
+notation involving $\ZERO$s and $\ONE$s already suggests
+simplification rules that can be applied to regular
+expressions, for example $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r
+\Rightarrow r$, $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow
+r$. While such simple-minded simplifications have been proved in our
+earlier work to preserve the correctness of Sulzmann and Lu's
+algorithm \cite{AusafDyckhoffUrban2016}, they unfortunately do
+\emph{not} help with limiting the growth of the derivatives shown
+above: the growth is slowed, but the derivatives can still grow rather
+quickly beyond any finite bound.
+
+
+Sulzmann and Lu overcome this ``growth problem'' in a second algorithm
+\cite{Sulzmann2014} where they introduce bitcoded
+regular expressions. In this version, POSIX values are
+represented as bitsequences and such sequences are incrementally generated
+when derivatives are calculated. The compact representation
+of bitsequences and regular expressions allows them to define a more
+``aggressive'' simplification method that keeps the size of the
+derivatives finitely bounded no matter what the length of the string is.
+They make some informal claims about the correctness and linear behaviour
+of this version, but do not provide any supporting proof arguments, not
+even ``pencil-and-paper'' arguments. They write about their bitcoded
+\emph{incremental parsing method} (that is the algorithm to be formalised
+in this paper):
+%
+\begin{quote}\it
+  ``Correctness Claim: We further claim that the incremental parsing
+  method [..] in combination with the simplification steps [..]
+  yields POSIX parse trees. We have tested this claim
+  extensively [..] but yet
+  have to work out all proof details.'' \cite[Page 14]{Sulzmann2014}
+\end{quote}  
+
+\noindent{}\textbf{Contributions:} We have implemented in Isabelle/HOL
+the derivative-based lexing algorithm of Sulzmann and Lu
+\cite{Sulzmann2014} where regular expressions are annotated with
+bitsequences. We define the crucial simplification function as a
+recursive function, without the need for a fixpoint operation. One objective of
+the simplification function is to remove duplicates of regular
+expressions.  For this Sulzmann and Lu use in their paper the standard
+@{text nub} function from Haskell's list library, but this function
+does not achieve the intended objective with bitcoded regular expressions.  The
+reason is that in the bitcoded setting, each copy generally has a
+different bitcode annotation---so @{text nub} would never ``fire''.
+Inspired by Scala's library for lists, we shall instead use a @{text
+distinctWith} function that finds duplicates under an ``erasing'' function
+which deletes bitcodes before comparing regular expressions.
+We shall also introduce our own argument and definitions for
+establishing the correctness of the bitcoded algorithm when 
+simplifications are included. Finally we
+establish that the size of derivatives can be finitely bounded.\medskip%\footnote{ In this paper, we shall first briefly introduce the basic notions
+%of regular expressions and describe the definition
+%of POSIX lexing from our earlier work \cite{AusafDyckhoffUrban2016}. This serves
+%as a reference point for what correctness means in our Isabelle/HOL proofs. We shall then prove
+%the correctness for the bitcoded algorithm without simplification, and
+%after that extend the proof to include simplification.}\smallskip
+%\mbox{}\\[-10mm]
+
+\noindent In this paper, we shall first briefly introduce the basic notions
+of regular expressions and describe the definition
+of POSIX lexing from our earlier work \cite{AusafDyckhoffUrban2016}. This serves
+as a reference point for what correctness means in our Isabelle/HOL proofs. We shall then prove
+the correctness for the bitcoded algorithm without simplification, and
+after that extend the proof to include simplification.\mbox{}\\[-6mm]
+
+*}
+
+section {* Background *}
+
+text {*
+  In our Isabelle/HOL formalisation strings are lists of characters with
+  the empty string being represented by the empty list, written $[]$,
+  and list-cons being written as $\_\!::\!\_\,$; string
+  concatenation is $\_ \,@\, \_\,$. We often use the usual
+  bracket notation for lists also for strings; for example a string
+  consisting of just a single character $c$ is written $[c]$.   
+  Our regular expressions are defined as usual as the following inductive
+  datatype:\\[-4mm]
+  %
+  \begin{center}
+  @{text "r ::="} \;
+  @{const "ZERO"} $\mid$
+  @{const "ONE"} $\mid$
+  @{term "CH c"} $\mid$
+  @{term "ALT r\<^sub>1 r\<^sub>2"} $\mid$
+  @{term "SEQ r\<^sub>1 r\<^sub>2"} $\mid$
+  @{term "STAR r"} 
+  \end{center}
+
+  \noindent where @{const ZERO} stands for the regular expression that does
+  not match any string, @{const ONE} for the regular expression that matches
+  only the empty string and @{term c} for matching a character literal.
+  The constructors $+$ and $\cdot$ represent alternatives and sequences, respectively.
+  We sometimes omit the $\cdot$ in a sequence regular expression for brevity. 
+  The
+  \emph{language} of a regular expression, written $L(r)$, is defined as usual
+  and we omit giving the definition here (see for example \cite{AusafDyckhoffUrban2016}).
+
+  Central to Brzozowski's regular expression matcher are two functions
+  called @{text nullable} and \emph{derivative}. The latter is written
+  $r\backslash c$ for the derivative of the regular expression $r$
+  w.r.t.~the character $c$. Both functions are defined by recursion over
+  regular expressions.  
+%
+\begin{center}
+\begin{tabular}{cc}
+  \begin{tabular}{r@ {\hspace{2mm}}c@ {\hspace{2mm}}l}
+  @{thm (lhs) der.simps(1)} & $\dn$ & @{thm (rhs) der.simps(1)}\\
+  @{thm (lhs) der.simps(2)} & $\dn$ & @{thm (rhs) der.simps(2)}\\
+  @{thm (lhs) der.simps(3)} & $\dn$ & @{thm (rhs) der.simps(3)}\\
+  @{thm (lhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]}\\
+  @{thm (lhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{text "if"} @{term "nullable(r\<^sub>1)"}\\
+  & & @{text "then"} @{term "ALT (SEQ (der c r\<^sub>1) r\<^sub>2) (der c r\<^sub>2)"}\\
+  & & @{text "else"} @{term "SEQ (der c r\<^sub>1) r\<^sub>2"}\\
+  % & & @{thm (rhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]}\\
+  @{thm (lhs) der.simps(6)} & $\dn$ & @{thm (rhs) der.simps(6)}
+  \end{tabular}
+  &
+  \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
+  @{thm (lhs) nullable.simps(1)} & $\dn$ & @{thm (rhs) nullable.simps(1)}\\
+  @{thm (lhs) nullable.simps(2)} & $\dn$ & @{thm (rhs) nullable.simps(2)}\\
+  @{thm (lhs) nullable.simps(3)} & $\dn$ & @{thm (rhs) nullable.simps(3)}\\
+  @{thm (lhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
+  @{thm (lhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
+  @{thm (lhs) nullable.simps(6)} & $\dn$ & @{thm (rhs) nullable.simps(6)}%\medskip\\
+  \end{tabular}  
+\end{tabular}  
+\end{center}
+
+  \noindent
+  We can extend this definition to give derivatives w.r.t.~strings,
+  namely @{thm (lhs) ders.simps(1)}$\dn$@{thm (rhs) ders.simps(1)}
+  and @{thm (lhs) ders.simps(2)}$\dn$@{thm (rhs) ders.simps(2)}.
+  Using @{text nullable} and the derivative operation, we can
+define a simple regular expression matcher, namely
+$@{text "match s r"} \;\dn\; @{term nullable}(r\backslash s)$.
+This is essentially Brzozowski's algorithm from 1964. Its
+main virtue is that the algorithm can be easily implemented as a
+functional program (either in a functional programming language or in
+a theorem prover). The correctness of @{text match} amounts to
+establishing the property:%\footnote{It is a fun exercise to formally prove this property in a theorem prover.}
+%
+\begin{proposition}\label{matchcorr} 
+@{text "match s r"} \;\;\text{if and only if}\;\; $s \in L(r)$
+\end{proposition}
+
+\noindent
+It is a fun exercise to formally prove this property in a theorem prover.
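+
+\noindent
+For readers who prefer code, these definitions can be transcribed almost
+literally into a functional language. The following Scala sketch (ours, for
+illustration only; the definitive definitions are the Isabelle/HOL ones
+above) implements the matcher:
+
+\begin{verbatim}
+abstract class Rexp
+case object ZERO extends Rexp
+case object ONE extends Rexp
+case class CH(c: Char) extends Rexp
+case class ALT(r1: Rexp, r2: Rexp) extends Rexp
+case class SEQ(r1: Rexp, r2: Rexp) extends Rexp
+case class STAR(r: Rexp) extends Rexp
+
+def nullable(r: Rexp): Boolean = r match {
+  case ZERO => false
+  case ONE => true
+  case CH(_) => false
+  case ALT(r1, r2) => nullable(r1) || nullable(r2)
+  case SEQ(r1, r2) => nullable(r1) && nullable(r2)
+  case STAR(_) => true
+}
+
+def der(c: Char, r: Rexp): Rexp = r match {
+  case ZERO => ZERO
+  case ONE => ZERO
+  case CH(d) => if (c == d) ONE else ZERO
+  case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))
+  case SEQ(r1, r2) =>
+    if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
+    else SEQ(der(c, r1), r2)
+  case STAR(r1) => SEQ(der(c, r1), STAR(r1))
+}
+
+// derivative w.r.t. a string: fold der over the characters
+def ders(s: List[Char], r: Rexp): Rexp = s match {
+  case Nil => r
+  case c :: cs => ders(cs, der(c, r))
+}
+
+def matcher(s: String, r: Rexp): Boolean = nullable(ders(s.toList, r))
+\end{verbatim}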
+
+The novel idea of Sulzmann and Lu is to extend this algorithm for 
+lexing, where it is important to find out which part of the string
+is matched by which part of the regular expression.
+For this Sulzmann and Lu presented two lexing algorithms in their paper
+  \cite{Sulzmann2014}. The first algorithm consists of two phases: first a
+  matching phase (which is Brzozowski's algorithm) and then a value
+  construction phase. The values encode \emph{how} a regular expression
+  matches a string. \emph{Values} are defined as the inductive datatype
+  %
+  \begin{center}
+  @{text "v ::="}
+  @{const "Void"} $\mid$
+  @{term "val.Char c"} $\mid$
+  @{term "Left v"} $\mid$
+  @{term "Right v"} $\mid$
+  @{term "Seq v\<^sub>1 v\<^sub>2"} $\mid$ 
+  @{term "Stars vs"} 
+  \end{center}  
+
+  \noindent where we use @{term vs} to stand for a list of values. The
+  string underlying a value can be calculated by a @{const flat}
+  function, written @{term "flat DUMMY"}. It traverses a value and
+  collects the characters contained in it \cite{AusafDyckhoffUrban2016}.
+
+
+  Sulzmann and Lu also define inductively an
+  inhabitation relation that associates values to regular expressions,
+  given by the following six rules:
+  %
+  \begin{center}
+  \begin{tabular}{@ {\hspace{-12mm}}c@ {}}
+  \begin{tabular}{@ {}c@ {}}
+  \\[-8mm]
+  @{thm[mode=Axiom] Prf.intros(4)}\\
+  @{thm[mode=Axiom] Prf.intros(5)[of "c"]}
+  \end{tabular}
+  \quad
+  @{thm[mode=Rule] Prf.intros(2)[of "v\<^sub>1" "r\<^sub>1" "r\<^sub>2"]}\quad
+  @{thm[mode=Rule] Prf.intros(3)[of "v\<^sub>2" "r\<^sub>2" "r\<^sub>1"]}\quad
+  @{thm[mode=Rule] Prf.intros(1)[of "v\<^sub>1" "r\<^sub>1" "v\<^sub>2" "r\<^sub>2"]}\quad
+  @{thm[mode=Rule] Prf.intros(6)[of "vs" "r"]}\mbox{}\hspace{-10mm}\mbox{}
+  \end{tabular}
+  \end{center}
+
+  \noindent Note that no values are associated with the regular expression
+  @{term ZERO}, since it cannot match any string.
+  It is routine to establish how values ``inhabiting'' a regular
+  expression correspond to the language of a regular expression, namely
+  @{thm L_flat_Prf}.
+
+  In general there is more than one value inhabiting a regular
+  expression (meaning regular expressions can typically match more
+  than one string). But even when fixing a string from the language of the
+  regular expression, there is generally more than one way in which the
+  regular expression can match this string. POSIX lexing is about
+  identifying the unique value for a given regular expression and a
+  string that satisfies the informal POSIX rules (see
+  \cite{POSIX,Kuklewicz,OkuiSuzuki2010,Sulzmann2014,Vansummeren2006}).\footnote{POSIX
+	lexing acquired its name from the fact that the corresponding
+	rules were described as part of the POSIX specification for
+	Unix-like operating systems \cite{POSIX}.} Sometimes these
+  informal rules are called the \emph{maximal munch rule} and \emph{rule priority}.
+  One contribution of our earlier paper is to give a convenient
+ specification for what POSIX values are (the inductive rules are shown in
+  Figure~\ref{POSIXrules}).
+
+\begin{figure}[t]
+  \begin{center}
+  \begin{tabular}{@ {}c@ {}}
+  @{thm[mode=Axiom] Posix.intros(1)}\<open>P\<close>@{term "ONE"} \quad
+  @{thm[mode=Axiom] Posix.intros(2)}\<open>P\<close>@{term "c"}\quad
+  @{thm[mode=Rule] Posix.intros(3)[of "s" "r\<^sub>1" "v" "r\<^sub>2"]}\<open>P+L\<close>\quad
+  @{thm[mode=Rule] Posix.intros(4)[of "s" "r\<^sub>2" "v" "r\<^sub>1"]}\<open>P+R\<close>\medskip\\
+  $\mprset{flushleft}
+   \inferrule
+   {@{thm (prem 1) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \qquad
+    @{thm (prem 2) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \\\\
+    @{thm (prem 3) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}
+   {@{thm (concl) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}$\<open>PS\<close>\medskip\smallskip\\
+  @{thm[mode=Axiom] Posix.intros(7)}\<open>P[]\<close>\qquad
+  $\mprset{flushleft}
+   \inferrule
+   {@{thm (prem 1) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
+    @{thm (prem 2) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
+    @{thm (prem 3) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \\\\
+    @{thm (prem 4) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}
+   {@{thm (concl) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}$\<open>P\<star>\<close>\\[-4mm]
+  \end{tabular}
+  \end{center}
+  \caption{The inductive definition of POSIX values taken from our earlier paper \cite{AusafDyckhoffUrban2016}. The ternary relation, written $(s, r) \rightarrow v$, formalises the notion
+  that, given a string $s$ and a regular
+  expression $r$, the value $v$ is the unique one that satisfies the informal POSIX constraints for
+  regular expression matching.}\label{POSIXrules}
+  \end{figure}
+
+  The clever idea by Sulzmann and Lu \cite{Sulzmann2014} in their first algorithm is to define
+  an injection function on values that mirrors (but inverts) the
+  construction of the derivative on regular expressions. Essentially it
+  injects back a character into a value.
+  For this they define two functions called @{text mkeps} and @{text inj}:
+ %
+  \begin{center}
+  \begin{tabular}{@ {}l@ {}}
+  \begin{tabular}{@ {}lcl@ {}}
+  @{thm (lhs) mkeps.simps(1)} & $\dn$ & @{thm (rhs) mkeps.simps(1)}\\
+  @{thm (lhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]}\\
+  @{thm (lhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]}\\
+  @{thm (lhs) mkeps.simps(4)} & $\dn$ & @{thm (rhs) mkeps.simps(4)}\\
+  \end{tabular}\smallskip\\
+
+  \begin{tabular}{@ {}cc@ {}}
+  \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}@ {}}
+  @{thm (lhs) injval.simps(1)} & $\dn$ & @{thm (rhs) injval.simps(1)}\\
+  @{thm (lhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]} & $\dn$ & 
+      @{thm (rhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]}\\
+  @{thm (lhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$ & 
+      @{thm (rhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\
+  @{thm (lhs) injval.simps(7)[of "r" "c" "v" "vs"]} & $\dn$ 
+      & @{thm (rhs) injval.simps(7)[of "r" "c" "v" "vs"]}
+  \end{tabular}
+  &
+  \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}@ {}}
+  @{thm (lhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$ 
+      & @{thm (rhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\
+  @{thm (lhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$ 
+      & @{thm (rhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\
+  @{thm (lhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$\\ 
+  \multicolumn{3}{@ {}r@ {}}{@{thm (rhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}}
+  \end{tabular}
+  \end{tabular}
+  \end{tabular}\smallskip
+  \end{center}
+
+  \noindent
+  The function @{text mkeps} is run when the last derivative is nullable, that is, when
+  the string to be matched is in the language of the regular expression. It generates
+  a value for how the last derivative can match the empty string. The injection function
+  then calculates the corresponding value for each intermediate derivative until
+  a value for the original regular expression is generated.
+  Graphically the algorithm by
+  Sulzmann and Lu can be illustrated by the following picture %in Figure~\ref{Sulz}
+  where the path from the left to the right involving @{term derivatives}/@{const
+  nullable} is the first phase of the algorithm (calculating successive
+  \Brz's derivatives) and @{const mkeps}/@{text inj}, the path from right to
+  left, the second phase.
+%
+\begin{center}
+\begin{tikzpicture}[scale=0.99,node distance=9mm,
+                    every node/.style={minimum size=6mm}]
+\node (r1)  {@{term "r\<^sub>1"}};
+\node (r2) [right=of r1]{@{term "r\<^sub>2"}};
+\draw[->,line width=1mm](r1)--(r2) node[above,midway] {@{term "der a DUMMY"}};
+\node (r3) [right=of r2]{@{term "r\<^sub>3"}};
+\draw[->,line width=1mm](r2)--(r3) node[above,midway] {@{term "der b DUMMY"}};
+\node (r4) [right=of r3]{@{term "r\<^sub>4"}};
+\draw[->,line width=1mm](r3)--(r4) node[above,midway] {@{term "der c DUMMY"}};
+\draw (r4) node[anchor=west] {\;\raisebox{3mm}{@{term nullable}}};
+\node (v4) [below=of r4]{@{term "v\<^sub>4"}};
+\draw[->,line width=1mm](r4) -- (v4);
+\node (v3) [left=of v4] {@{term "v\<^sub>3"}};
+\draw[->,line width=1mm](v4)--(v3) node[below,midway] {\<open>inj r\<^sub>3 c\<close>};
+\node (v2) [left=of v3]{@{term "v\<^sub>2"}};
+\draw[->,line width=1mm](v3)--(v2) node[below,midway] {\<open>inj r\<^sub>2 b\<close>};
+\node (v1) [left=of v2] {@{term "v\<^sub>1"}};
+\draw[->,line width=1mm](v2)--(v1) node[below,midway] {\<open>inj r\<^sub>1 a\<close>};
+\draw (r4) node[anchor=north west] {\;\raisebox{-8mm}{@{term "mkeps"}}};
+\end{tikzpicture}
+\end{center}
+%
+\noindent
+  The picture shows the steps required when a
+  regular expression, say @{text "r\<^sub>1"}, matches the string @{term
+  "[a,b,c]"}. The first lexing algorithm by Sulzmann and Lu can be defined as:\\[-8mm]
+
+%  \begin{figure}[t]
+%\begin{center}
+%\begin{tikzpicture}[scale=1,node distance=1cm,
+%                    every node/.style={minimum size=6mm}]
+%\node (r1)  {@{term "r\<^sub>1"}};
+%\node (r2) [right=of r1]{@{term "r\<^sub>2"}};
+%\draw[->,line width=1mm](r1)--(r2) node[above,midway] {@{term "der a DUMMY"}};
+%\node (r3) [right=of r2]{@{term "r\<^sub>3"}};
+%\draw[->,line width=1mm](r2)--(r3) node[above,midway] {@{term "der b DUMMY"}};
+%\node (r4) [right=of r3]{@{term "r\<^sub>4"}};
+%\draw[->,line width=1mm](r3)--(r4) node[above,midway] {@{term "der c DUMMY"}};
+%\draw (r4) node[anchor=west] {\;\raisebox{3mm}{@{term nullable}}};
+%\node (v4) [below=of r4]{@{term "v\<^sub>4"}};
+%\draw[->,line width=1mm](r4) -- (v4);
+%\node (v3) [left=of v4] {@{term "v\<^sub>3"}};
+%\draw[->,line width=1mm](v4)--(v3) node[below,midway] {\<open>inj r\<^sub>3 c\<close>};
+%\node (v2) [left=of v3]{@{term "v\<^sub>2"}};
+%\draw[->,line width=1mm](v3)--(v2) node[below,midway] {\<open>inj r\<^sub>2 b\<close>};
+%\node (v1) [left=of v2] {@{term "v\<^sub>1"}};
+%\draw[->,line width=1mm](v2)--(v1) node[below,midway] {\<open>inj r\<^sub>1 a\<close>};
+%\draw (r4) node[anchor=north west] {\;\raisebox{-8mm}{@{term "mkeps"}}};
+%\end{tikzpicture}
+%\end{center}
+%\mbox{}\\[-13mm]
+%
+%\caption{The two phases of the first algorithm by Sulzmann \& Lu \cite{Sulzmann2014},
+%matching the string @{term "[a,b,c]"}. The first phase (the arrows from 
+%left to right) is \Brz's matcher building successive derivatives. If the 
+%last regular expression is @{term nullable}, then the functions of the 
+%second phase are called (the top-down and right-to-left arrows): first 
+%@{term mkeps} calculates a value @{term "v\<^sub>4"} witnessing
+%how the empty string has been recognised by @{term "r\<^sub>4"}. After
+%that the function @{term inj} ``injects back'' the characters of the string into
+%the values. The value @{term "v\<^sub>1"} is the result of the algorithm representing
+%the POSIX value for this string and
+%regular expression.
+%\label{Sulz}}
+%\end{figure} 
+
+
+
+  \begin{center}
+  \begin{tabular}{lcl}
+  @{thm (lhs) lexer.simps(1)} & $\dn$ & @{thm (rhs) lexer.simps(1)}\\
+  @{thm (lhs) lexer.simps(2)} & $\dn$ & @{text "case"} @{term "lexer (der c r) s"} @{text of}
+                     @{term "None"}  @{text "\<Rightarrow>"} @{term None}\\
+                     & & \hspace{24mm}$|\;$ @{term "Some v"} @{text "\<Rightarrow>"} @{term "Some (injval r c v)"}                          
+  \end{tabular}
+  \end{center}
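+
+  \noindent
+  For illustration, the two phases can also be transcribed into Scala,
+  building on the matcher sketch given earlier (again ours, not part of the
+  formalisation; \textit{Chr} and \textit{Sequ} merely avoid clashes with
+  Scala's built-in names):
+
+\begin{verbatim}
+abstract class Val
+case object Empty extends Val
+case class Chr(c: Char) extends Val
+case class Left(v: Val) extends Val
+case class Right(v: Val) extends Val
+case class Sequ(v1: Val, v2: Val) extends Val
+case class Stars(vs: List[Val]) extends Val
+
+// a value witnessing how a nullable rexp matches the empty string
+def mkeps(r: Rexp): Val = r match {
+  case ONE => Empty
+  case ALT(r1, r2) =>
+    if (nullable(r1)) Left(mkeps(r1)) else Right(mkeps(r2))
+  case SEQ(r1, r2) => Sequ(mkeps(r1), mkeps(r2))
+  case STAR(_) => Stars(Nil)
+}
+
+// inject the character c back into a value for der(c, r)
+def inj(r: Rexp, c: Char, v: Val): Val = (r, v) match {
+  case (CH(d), Empty) => Chr(d)
+  case (ALT(r1, _), Left(v1)) => Left(inj(r1, c, v1))
+  case (ALT(_, r2), Right(v2)) => Right(inj(r2, c, v2))
+  case (SEQ(r1, _), Sequ(v1, v2)) => Sequ(inj(r1, c, v1), v2)
+  case (SEQ(r1, _), Left(Sequ(v1, v2))) => Sequ(inj(r1, c, v1), v2)
+  case (SEQ(r1, r2), Right(v2)) => Sequ(mkeps(r1), inj(r2, c, v2))
+  case (STAR(r1), Sequ(v1, Stars(vs))) => Stars(inj(r1, c, v1) :: vs)
+}
+
+def lexer(r: Rexp, s: List[Char]): Option[Val] = s match {
+  case Nil => if (nullable(r)) Some(mkeps(r)) else None
+  case c :: cs => lexer(der(c, r), cs).map(v => inj(r, c, v))
+}
+\end{verbatim}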
+
+
+We have shown in our earlier paper \cite{AusafDyckhoffUrban2016} that
+this algorithm is correct, that is it generates POSIX values. The
+central property we established relates the derivative operation to the
+injection function.
+
+  \begin{proposition}\label{Posix2}
+	\textit{If} $(s,\; r\backslash c) \rightarrow v$ \textit{then} $(c :: s,\; r) \rightarrow$ \textit{inj} $r\; c\; v$. 
+\end{proposition}
+
+  \noindent
+  With this in place we were able to prove:
+
+
+  \begin{proposition}\mbox{}\smallskip\\\label{lexercorrect}
+  \begin{tabular}{ll}
+  (1) & @{thm (lhs) lexer_correct_None} if and only if @{thm (rhs) lexer_correct_None}\\
+  (2) & @{thm (lhs) lexer_correct_Some} if and only if @{thm (rhs) lexer_correct_Some}\\
+  \end{tabular}
+  \end{proposition}
+
+  \noindent
+  In fact we have shown that, in the success case, the generated POSIX value $v$ is
+  unique and in the failure case that there is no POSIX value $v$ that satisfies
+  $(s, r) \rightarrow v$. While the algorithm is correct, it is excruciatingly
+  slow in cases where the derivatives grow arbitrarily (recall the example from the
+  Introduction). However, it can be used as a convenient reference point for the correctness
+  proof of the second algorithm by Sulzmann and Lu, which we shall describe next.
+  
+*}
+
+section {* Bitcoded Regular Expressions and Derivatives *}
+
+text {*
+
+  In the second part of their paper \cite{Sulzmann2014},
+  Sulzmann and Lu describe another algorithm that also generates POSIX
+  values but dispenses with the second phase where characters are
+  injected ``back'' into values. For this they annotate regular expressions
+  with bitcodes, which we define in Isabelle/HOL as the datatype
+
+  \begin{center}
+  @{term breg} $\;::=\;$ @{term "AZERO"}
+               $\;\mid\;$ @{term "AONE bs"}
+               $\;\mid\;$ @{term "ACHAR bs c"}
+               $\;\mid\;$ @{term "AALTs bs rs"}
+               $\;\mid\;$ @{term "ASEQ bs r\<^sub>1 r\<^sub>2"}
+               $\;\mid\;$ @{term "ASTAR bs r"}
+  \end{center}
+
+  \noindent where @{text bs} stands for bitsequences; @{text r},
+  @{text "r\<^sub>1"} and @{text "r\<^sub>2"} for bitcoded regular
+  expressions; and @{text rs} for lists of bitcoded regular
+  expressions. The binary alternative @{text "ALT bs r\<^sub>1 r\<^sub>2"}
+  is just an abbreviation for \mbox{@{text "ALTs bs [r\<^sub>1, r\<^sub>2]"}}. 
+  For bitsequences we use lists made up of the
+  constants @{text Z} and @{text S}.  The idea with bitcoded regular
+  expressions is to incrementally generate the value information (for
+  example @{text Left} and @{text Right}) as bitsequences. For this 
+  Sulzmann and Lu define a coding
+  function for how values can be coded into bitsequences.
+
+  \begin{center}
+  \begin{tabular}{cc}
+  \begin{tabular}{lcl}
+  @{thm (lhs) code.simps(1)} & $\dn$ & @{thm (rhs) code.simps(1)}\\
+  @{thm (lhs) code.simps(2)} & $\dn$ & @{thm (rhs) code.simps(2)}\\
+  @{thm (lhs) code.simps(3)} & $\dn$ & @{thm (rhs) code.simps(3)}\\
+  @{thm (lhs) code.simps(4)} & $\dn$ & @{thm (rhs) code.simps(4)}
+  \end{tabular}
+  &
+  \begin{tabular}{lcl}
+  @{thm (lhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]} & $\dn$ & @{thm (rhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]}\\
+  @{thm (lhs) code.simps(6)} & $\dn$ & @{thm (rhs) code.simps(6)}\\
+  @{thm (lhs) code.simps(7)} & $\dn$ & @{thm (rhs) code.simps(7)}\\
+  \mbox{\phantom{XX}}\\
+  \end{tabular}
+  \end{tabular}
+  \end{center}
+   
+  \noindent
+  As can be seen, this coding is ``lossy'' in the sense that we do not
+  explicitly record character values, nor sequence values (for
+  the latter we just append two bitsequences). However, the
+  different alternatives for @{text Left}, respectively @{text Right}, are recorded as @{text Z} and
+  @{text S} followed by some bitsequence. Similarly, we use @{text Z} to indicate
+  if there is still a value coming in the list of @{text Stars}, whereas @{text S}
+  indicates the end of the list. The lossiness makes the process of
+  decoding a bit more involved, but the point is that if we have a
+  regular expression \emph{and} a bitsequence of a corresponding value,
+  then we can always decode the value accurately (see Fig.~\ref{decode}).
+  The function \textit{decode} checks whether all of the bitsequence is
+  consumed and returns the corresponding value as @{term "Some v"}; otherwise
+  it fails with @{text "None"}. We can establish that for a value $v$
+  inhabited by a regular expression $r$, the decoding of its
+  bitsequence never fails.
+
+  %The decoding can be
+  %defined by using two functions called $\textit{decode}'$ and
+  %\textit{decode}:
+
+  \begin{figure}
+  \begin{center}
+  \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
+  \multicolumn{3}{@ {}l}{$\textit{decode}'\,bs\,(\ONE)$ $\;\dn\;$ $(\Empty, bs)$ \quad\qquad
+  $\textit{decode}'\,bs\,(c)$ $\;\dn\;$ $(\Char\,c, bs)$}\\
+  $\textit{decode}'\,(\Z\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
+     $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}\;
+       (\Left\,v, bs_1)$\\
+  $\textit{decode}'\,(\S\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
+     $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_2\;\textit{in}\;
+       (\Right\,v, bs_1)$\\                           
+  $\textit{decode}'\,bs\;(r_1\cdot r_2)$ & $\dn$ &
+        $\textit{let}\,(v_1, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}$\\
+  & &   $\textit{let}\,(v_2, bs_2) = \textit{decode}'\,bs_1\,r_2$
+        \hspace{2mm}$\textit{in}\;(\Seq\,v_1\,v_2, bs_2)$\\
+  $\textit{decode}'\,(\S\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\
+  $\textit{decode}'\,(\Z\!::\!bs)\,(r^*)$ & $\dn$ & 
+         $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r\;\textit{in}$\\
+  & &   $\textit{let}\,(\Stars\,vs, bs_2) = \textit{decode}'\,bs_1\,r^*$
+        \hspace{2mm}$\textit{in}\;(\Stars\,v\!::\!vs, bs_2)$\medskip\\
+  \multicolumn{3}{l}{$\textit{decode}\,bs\,r$ $\dn$ 
+     $\textit{let}\,(v, bs') = \textit{decode}'\,bs\,r\;\textit{in}$
+     $\;\,\textit{if}\;bs' = []\;\textit{then}\;\textit{Some}\,v\;
+       \textit{else}\;\textit{None}$}\\[-4mm]   
+  \end{tabular}    
+  \end{center}
+  \caption{Two functions, called $\textit{decode}'$ and \textit{decode}, for decoding a value from a bitsequence with the help of a regular expression.\\[-5mm]}\label{decode}
+  \end{figure}
+
+  %\noindent
+  %The function \textit{decode} checks whether all of the bitsequence is
+  %consumed and returns the corresponding value as @{term "Some v"}; otherwise
+  %it fails with @{text "None"}. We can establish that for a value $v$
+  %inhabited by a regular expression $r$, the decoding of its
+  %bitsequence never fails.
+
+\begin{lemma}\label{codedecode}\it
+  If $\;\vdash v : r$ then
+  $\;\textit{decode}\,(\textit{code}\, v)\,r = \textit{Some}\, v$.
+\end{lemma}
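+
+  \noindent
+  To illustrate, \textit{code} and \textit{decode} might be transcribed
+  into Scala as follows (our sketch, building on the earlier ones; error
+  handling for ill-formed bitsequences is elided):
+
+\begin{verbatim}
+abstract class Bit
+case object Z extends Bit
+case object S extends Bit
+
+def code(v: Val): List[Bit] = v match {
+  case Empty => Nil                  // lossy: no bits for Empty ...
+  case Chr(_) => Nil                 // ... and none for characters
+  case Left(v1) => Z :: code(v1)
+  case Right(v1) => S :: code(v1)
+  case Sequ(v1, v2) => code(v1) ::: code(v2)
+  case Stars(Nil) => List(S)         // S marks the end of a Stars-list
+  case Stars(v1 :: vs) => Z :: (code(v1) ::: code(Stars(vs)))
+}
+
+def decode1(bs: List[Bit], r: Rexp): (Val, List[Bit]) = (bs, r) match {
+  case (_, ONE) => (Empty, bs)
+  case (_, CH(c)) => (Chr(c), bs)
+  case (Z :: bs1, ALT(r1, _)) =>
+    val (v, bs2) = decode1(bs1, r1); (Left(v), bs2)
+  case (S :: bs1, ALT(_, r2)) =>
+    val (v, bs2) = decode1(bs1, r2); (Right(v), bs2)
+  case (_, SEQ(r1, r2)) =>
+    val (v1, bs1) = decode1(bs, r1)
+    val (v2, bs2) = decode1(bs1, r2); (Sequ(v1, v2), bs2)
+  case (S :: bs1, STAR(_)) => (Stars(Nil), bs1)
+  case (Z :: bs1, STAR(r1)) =>
+    val (v, bs2) = decode1(bs1, r1)
+    decode1(bs2, STAR(r1)) match {
+      case (Stars(vs), bs3) => (Stars(v :: vs), bs3)
+    }
+}
+
+def decode(bs: List[Bit], r: Rexp): Option[Val] = {
+  val (v, rest) = decode1(bs, r)
+  if (rest.isEmpty) Some(v) else None
+}
+\end{verbatim}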
+
+
+
+  Sulzmann and Lu define the function \emph{internalise}
+  in order to transform (standard) regular expressions into annotated
+  regular expressions. We write this operation as $r^\uparrow$.
+  This internalisation uses the following
+  \emph{fuse} function.	     
+  %
+  \begin{center}
+  \begin{tabular}{@ {}cc@ {}}
+  \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
+  $\textit{fuse}\,bs\,(\textit{ZERO})$ & $\dn$ & $\textit{ZERO}$\\
+  $\textit{fuse}\,bs\,(\textit{ONE}\,bs')$ & $\dn$ &
+     $\textit{ONE}\,(bs\,@\,bs')$\\
+  $\textit{fuse}\,bs\,(\textit{CHAR}\,bs'\,c)$ & $\dn$ &
+     $\textit{CHAR}\,(bs\,@\,bs')\,c$
+  \end{tabular}
+  &
+  \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
+  $\textit{fuse}\,bs\,(\textit{ALTs}\,bs'\,rs)$ & $\dn$ &
+     $\textit{ALTs}\,(bs\,@\,bs')\,rs$\\
+  $\textit{fuse}\,bs\,(\textit{SEQ}\,bs'\,r_1\,r_2)$ & $\dn$ &
+     $\textit{SEQ}\,(bs\,@\,bs')\,r_1\,r_2$\\
+  $\textit{fuse}\,bs\,(\textit{STAR}\,bs'\,r)$ & $\dn$ &
+     $\textit{STAR}\,(bs\,@\,bs')\,r$
+  \end{tabular}
+  \end{tabular}
+  \end{center}    
+
+  \noindent
+  A regular expression can then be \emph{internalised} into a bitcoded
+  regular expression as follows:
+  %
+  \begin{center}
+  \begin{tabular}{@ {}ccc@ {}}
+  \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
+  $(\ZERO)^\uparrow$ & $\dn$ & $\textit{ZERO}$\\
+  $(\ONE)^\uparrow$ & $\dn$ & $\textit{ONE}\,[]$
+  \end{tabular}
+  &
+  \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
+  $(c)^\uparrow$ & $\dn$ & $\textit{CHAR}\,[]\,c$\\
+  $(r^*)^\uparrow$ & $\dn$ & $\textit{STAR}\;[]\,r^\uparrow$
+  \end{tabular}
+  &
+  \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
+  $(r_1 + r_2)^\uparrow$ & $\dn$ &
+         $\textit{ALT}\;[]\,(\textit{fuse}\,[\Z]\,r_1^\uparrow)\,
+                            (\textit{fuse}\,[\S]\,r_2^\uparrow)$\\
+  $(r_1\cdot r_2)^\uparrow$ & $\dn$ &
+         $\textit{SEQ}\;[]\,r_1^\uparrow\,r_2^\uparrow$
+  \end{tabular}
+  \end{tabular}
+  \end{center}    
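+
+  \noindent
+  In our running Scala sketch, \textit{fuse} and internalisation look as
+  follows:
+
+\begin{verbatim}
+abstract class ARexp
+case object AZERO extends ARexp
+case class AONE(bs: List[Bit]) extends ARexp
+case class ACHAR(bs: List[Bit], c: Char) extends ARexp
+case class AALTS(bs: List[Bit], rs: List[ARexp]) extends ARexp
+case class ASEQ(bs: List[Bit], r1: ARexp, r2: ARexp) extends ARexp
+case class ASTAR(bs: List[Bit], r: ARexp) extends ARexp
+
+// attach bits to the front of the top-most bitsequence
+def fuse(bs: List[Bit], r: ARexp): ARexp = r match {
+  case AZERO => AZERO
+  case AONE(bs1) => AONE(bs ::: bs1)
+  case ACHAR(bs1, c) => ACHAR(bs ::: bs1, c)
+  case AALTS(bs1, rs) => AALTS(bs ::: bs1, rs)
+  case ASEQ(bs1, r1, r2) => ASEQ(bs ::: bs1, r1, r2)
+  case ASTAR(bs1, r1) => ASTAR(bs ::: bs1, r1)
+}
+
+def internalise(r: Rexp): ARexp = r match {
+  case ZERO => AZERO
+  case ONE => AONE(Nil)
+  case CH(c) => ACHAR(Nil, c)
+  case ALT(r1, r2) =>
+    AALTS(Nil, List(fuse(List(Z), internalise(r1)),
+                    fuse(List(S), internalise(r2))))
+  case SEQ(r1, r2) => ASEQ(Nil, internalise(r1), internalise(r2))
+  case STAR(r1) => ASTAR(Nil, internalise(r1))
+}
+\end{verbatim}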
+
+  \noindent
+  There is also an \emph{erase}-function, written $r^\downarrow$, which
+  transforms a bitcoded regular expression into a (standard) regular
+  expression by just erasing the annotated bitsequences. We omit the
+  straightforward definition. For defining the algorithm, we also need
+  the functions \textit{bnullable} and \textit{bmkeps} (together with its
+  auxiliary \textit{bmkepss}), which are the
+  ``lifted'' versions of \textit{nullable} and \textit{mkeps} acting on
+  bitcoded regular expressions.
+  %
+  \begin{center}
+  \begin{tabular}{@ {}c@ {\hspace{2mm}}c@ {}}
+  \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
+  $\textit{bnullable}\,(\textit{ZERO})$ & $\dn$ & $\textit{False}$\\
+  $\textit{bnullable}\,(\textit{ONE}\,bs)$ & $\dn$ & $\textit{True}$\\
+  $\textit{bnullable}\,(\textit{CHAR}\,bs\,c)$ & $\dn$ & $\textit{False}$\\
+  $\textit{bnullable}\,(\textit{ALTs}\,bs\,\rs)$ & $\dn$ &
+     $\exists\, r \in \rs. \,\textit{bnullable}\,r$\\
+  $\textit{bnullable}\,(\textit{SEQ}\,bs\,r_1\,r_2)$ & $\dn$ &
+     $\textit{bnullable}\,r_1\wedge \textit{bnullable}\,r_2$\\
+  $\textit{bnullable}\,(\textit{STAR}\,bs\,r)$ & $\dn$ &
+     $\textit{True}$
+  \end{tabular}
+  &
+  \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
+  $\textit{bmkeps}\,(\textit{ONE}\,bs)$ & $\dn$ & $bs$\\
+  $\textit{bmkeps}\,(\textit{ALTs}\,bs\,\rs)$ & $\dn$ &
+  $bs\,@\,\textit{bmkepss}\,\rs$\\
+  $\textit{bmkeps}\,(\textit{SEQ}\,bs\,r_1\,r_2)$ & $\dn$ &\\
+  \multicolumn{3}{r}{$bs \,@\,\textit{bmkeps}\,r_1\,@\, \textit{bmkeps}\,r_2$}\\
+  $\textit{bmkeps}\,(\textit{STAR}\,bs\,r)$ & $\dn$ &
+     $bs \,@\, [\S]$\\
+  $\textit{bmkepss}\,(r\!::\!\rs)$ & $\dn$ &
+     $\textit{if}\;\textit{bnullable}\,r$\\
+  & &$\textit{then}\;\textit{bmkeps}\,r$\\
+  & &$\textit{else}\;\textit{bmkepss}\,\rs$
+  \end{tabular}
+  \end{tabular}
+  \end{center}    
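+
+  \noindent
+  In the Scala sketch these functions become (note that, like in the
+  definition above, \textit{bmkepss} presupposes that at least one regular
+  expression in the list is bnullable):
+
+\begin{verbatim}
+def bnullable(r: ARexp): Boolean = r match {
+  case AZERO => false
+  case AONE(_) => true
+  case ACHAR(_, _) => false
+  case AALTS(_, rs) => rs.exists(bnullable)
+  case ASEQ(_, r1, r2) => bnullable(r1) && bnullable(r2)
+  case ASTAR(_, _) => true
+}
+
+def bmkeps(r: ARexp): List[Bit] = r match {
+  case AONE(bs) => bs
+  case AALTS(bs, rs) => bs ::: bmkepss(rs)
+  case ASEQ(bs, r1, r2) => bs ::: bmkeps(r1) ::: bmkeps(r2)
+  case ASTAR(bs, _) => bs ::: List(S)   // S closes the Stars-list
+}
+
+// partial: assumes some r in rs is bnullable
+def bmkepss(rs: List[ARexp]): List[Bit] = rs match {
+  case r :: rest => if (bnullable(r)) bmkeps(r) else bmkepss(rest)
+}
+\end{verbatim}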
+ 
+
+  \noindent
+  The key function in the bitcoded algorithm is the derivative of a
+  bitcoded regular expression. This derivative function calculates the
+  derivative but at the same time also the incremental part of the bitsequences
+  that contribute to constructing a POSIX value.	
+  % 
+  \begin{center}
+  \begin{tabular}{@ {}lcl@ {}}
+  \multicolumn{3}{@ {}l}{$(\textit{ZERO})\backslash c$ $\;\dn\;$ $\textit{ZERO}$ \quad\qquad  
+  $(\textit{ONE}\;bs)\backslash c$ $\;\dn\;$ $\textit{ZERO}$}\\  
+  $(\textit{CHAR}\;bs\,d)\backslash c$ & $\dn$ &
+        $\textit{if}\;c=d\; \;\textit{then}\;
+         \textit{ONE}\;bs\;\textit{else}\;\textit{ZERO}$\\  
+  $(\textit{ALTs}\;bs\,\rs)\backslash c$ & $\dn$ &
+        $\textit{ALTs}\,bs\,(\mathit{map}\,(\_\backslash c)\,\rs)$\\
+  $(\textit{SEQ}\;bs\,r_1\,r_2)\backslash c$ & $\dn$ &
+     $\textit{if}\;\textit{bnullable}\,r_1$\\
+  & &$\textit{then}\;\textit{ALT}\,bs\;(\textit{SEQ}\,[]\,(r_1\backslash c)\,r_2)$
+  $\;(\textit{fuse}\,(\textit{bmkeps}\,r_1)\,(r_2\backslash c))$\\
+  & &$\textit{else}\;\textit{SEQ}\,bs\,(r_1\backslash c)\,r_2$\\
+  $(\textit{STAR}\,bs\,r)\backslash c$ & $\dn$ &
+      $\textit{SEQ}\;bs\,(\textit{fuse}\, [\Z] (r\backslash c))\,
+       (\textit{STAR}\,[]\,r)$
+  \end{tabular}    
+  \end{center}
+
+
+  \noindent
+  This function can also be extended to strings, written $r\backslash s$,
+  just like the standard derivative.  We omit the details. Finally we
+  can define Sulzmann and Lu's bitcoded lexer, which we call \textit{blexer}:
+  %
+  \begin{center}
+\begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
+  $\textit{blexer}\;r\,s$ & $\dn$ &
+      $\textit{let}\;r_{der} = (r^\uparrow)\backslash s\;\textit{in}$                
+  $\;\;\textit{if}\; \textit{bnullable}(r_{der}) \;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,r_{der})\,r
+       \;\;\textit{else}\;\textit{None}$
+\end{tabular}
+\end{center}
+
+  \noindent
+This bitcoded lexer first internalises the regular expression $r$ and then
+builds the bitcoded derivative according to $s$. If the derivative is
+(b)nullable the string is in the language of $r$ and it extracts the bitsequence using the
+$\textit{bmkeps}$ function. Finally it decodes the bitsequence into a value.  If
+the derivative is \emph{not} nullable, then $\textit{None}$ is
+returned. We can show that this way of calculating a value
+generates the same result as \textit{lexer}.
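+
+\noindent
+Continuing the Scala sketch, \textit{bder}, its string version and
+\textit{blexer} can be transcribed as below (with \textit{decode} as
+sketched earlier):
+
+\begin{verbatim}
+def bder(c: Char, r: ARexp): ARexp = r match {
+  case AZERO => AZERO
+  case AONE(_) => AZERO
+  case ACHAR(bs, d) => if (c == d) AONE(bs) else AZERO
+  case AALTS(bs, rs) => AALTS(bs, rs.map(bder(c, _)))
+  case ASEQ(bs, r1, r2) =>
+    if (bnullable(r1))
+      AALTS(bs, List(ASEQ(Nil, bder(c, r1), r2),
+                     fuse(bmkeps(r1), bder(c, r2))))
+    else ASEQ(bs, bder(c, r1), r2)
+  case ASTAR(bs, r1) =>
+    ASEQ(bs, fuse(List(Z), bder(c, r1)), ASTAR(Nil, r1))
+}
+
+def bders(s: List[Char], r: ARexp): ARexp = s match {
+  case Nil => r
+  case c :: cs => bders(cs, bder(c, r))
+}
+
+def blexer(r: Rexp, s: String): Option[Val] = {
+  val rder = bders(s.toList, internalise(r))
+  if (bnullable(rder)) decode(bmkeps(rder), r) else None
+}
+\end{verbatim}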
+
+Before we can proceed we need to define a helper function, called
+\textit{retrieve}, which Sulzmann and Lu introduced for the correctness proof.
+%
+\begin{center}
+  \begin{tabular}{lcl}
+  @{thm (lhs) retrieve.simps(1)} & $\dn$ & @{thm (rhs) retrieve.simps(1)}\\
+  @{thm (lhs) retrieve.simps(2)} & $\dn$ & @{thm (rhs) retrieve.simps(2)}\\
+  @{thm (lhs) retrieve.simps(3)} & $\dn$ & @{thm (rhs) retrieve.simps(3)}\\
+  @{thm (lhs) better_retrieve(1)} & $\dn$ & @{thm (rhs) better_retrieve(1)}\\
+  @{thm (lhs) better_retrieve(2)} & $\dn$ & @{thm (rhs) better_retrieve(2)}\\
+  @{thm (lhs) retrieve.simps(6)[of _ "r\<^sub>1" "r\<^sub>2" "v\<^sub>1" "v\<^sub>2"]}
+      & $\dn$ & @{thm (rhs) retrieve.simps(6)[of _ "r\<^sub>1" "r\<^sub>2" "v\<^sub>1" "v\<^sub>2"]}\\
+  @{thm (lhs) retrieve.simps(7)} & $\dn$ & @{thm (rhs) retrieve.simps(7)}\\
+  @{thm (lhs) retrieve.simps(8)} & $\dn$ & @{thm (rhs) retrieve.simps(8)}
+  \end{tabular}
+  \end{center}
+
+\noindent
+The idea behind this function is to retrieve a possibly partial
+bitsequence from a bitcoded regular expression, where the retrieval is
+guided by a value.  For example if the value is $\Left$ then we
+descend into the left-hand side of an alternative in order to
+assemble the bitcode. Similarly for
+$\Right$. The property we can show is that for a given $v$ and $r$
+with $\vdash v : r$, the retrieved bitsequence from the internalised
+regular expression is equal to the bitcoded version of $v$.
+
+\begin{lemma}\label{retrievecode}
+If $\vdash v : r$ then $\textit{code}\, v = \textit{retrieve}\,(r^\uparrow)\,v$.
+\end{lemma}
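+
+\noindent
+Our reading of the clauses above, as a Scala sketch, is:
+
+\begin{verbatim}
+def retrieve(r: ARexp, v: Val): List[Bit] = (r, v) match {
+  case (AONE(bs), Empty) => bs
+  case (ACHAR(bs, _), Chr(_)) => bs
+  case (AALTS(bs, List(r1)), v1) => bs ::: retrieve(r1, v1)
+  case (AALTS(bs, r1 :: rs), Left(v1)) => bs ::: retrieve(r1, v1)
+  case (AALTS(bs, r1 :: rs), Right(v1)) =>
+    bs ::: retrieve(AALTS(Nil, rs), v1)
+  case (ASEQ(bs, r1, r2), Sequ(v1, v2)) =>
+    bs ::: retrieve(r1, v1) ::: retrieve(r2, v2)
+  case (ASTAR(bs, _), Stars(Nil)) => bs ::: List(S)
+  case (ASTAR(bs, r1), Stars(v1 :: vs)) =>
+    bs ::: List(Z) ::: retrieve(r1, v1) :::
+    retrieve(ASTAR(Nil, r1), Stars(vs))
+}
+\end{verbatim}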
+
+\noindent
+We also need some auxiliary facts about how the bitcoded operations
+relate to the ``standard'' operations on regular expressions. For
+example if we build a bitcoded derivative and erase the result, this
+is the same as if we first erase the bitcoded regular expression and
+then perform the ``standard'' derivative operation.
+
+\begin{lemma}\label{bnullable}\mbox{}\smallskip\\
+  \begin{tabular}{ll}
+\textit{(1)} & $(r\backslash s)^\downarrow = (r^\downarrow)\backslash s$\\    
+\textit{(2)} & $\textit{bnullable}(r)$ iff $\textit{nullable}(r^\downarrow)$\\
+\textit{(3)} & $\textit{bmkeps}(r) = \textit{retrieve}\,r\,(\textit{mkeps}\,(r^\downarrow))$ provided $\textit{nullable}(r^\downarrow)$.
+\end{tabular}  
+\end{lemma}
+
+%\begin{proof}
+%  All properties are by induction on annotated regular expressions.
+%  %There are no interesting cases.
+%\end{proof}
+
+\noindent
+The only difficulty left for the correctness proof is that the bitcoded algorithm
+has only a ``forward phase'' where POSIX values are generated incrementally.
+We can achieve the same effect with @{text lexer} (which has two phases) by stacking up injection
+functions during the forward phase. An auxiliary function, called $\textit{flex}$,
+allows us to recast the rules of $\lexer$ in terms of a single
+phase and stacked up injection functions.
+%
+\begin{center}
+\begin{tabular}{@ {}l@ {}c@ {}l@ {\hspace{10mm}}l@ {}c@ {}l@ {}}
+  $\textit{flex}\;r\,f\,[]$ & $\dn$ & $f$ & 
+  $\textit{flex}\;r\,f\,(c\!::\!s)$ & $\dn$ &
+  $\textit{flex}\,(r\backslash c)\,(\lambda v.\,f\,(\inj\,r\,c\,v))\,s$
+\end{tabular}    
+\end{center}    
+
+\noindent
+The point of this function is that when
+reaching the end of the string, we just need to apply the stacked up
+injection functions to the value generated by @{text mkeps}.
+Using this function we can recast the success case in @{text lexer} 
+as follows:
+
+\begin{lemma}\label{flex}
+If @{text "lexer r s = Some v"} \;then\; @{text "v = "}$\,\textit{flex}\,r\,id\,s\,
+      (\mkeps (r\backslash s))$.
+\end{lemma}
+
+\noindent
+Note that we did not redefine \textit{lexer}; we just established that the
+value generated by \textit{lexer} can also be obtained by a different
+method. While this different method is not efficient (we essentially
+need to traverse the string $s$ twice, once for building the
+derivative $r\backslash s$ and another time for stacking up injection
+functions), it helps us with proving
+that incrementally building up values in @{text blexer} generates the same result.
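+
+\noindent
+As a Scala sketch, \textit{flex} and the content of the lemma above look as
+follows:
+
+\begin{verbatim}
+def flex(r: Rexp, f: Val => Val, s: List[Char]): Val => Val = s match {
+  case Nil => f
+  case c :: cs => flex(der(c, r), (v: Val) => f(inj(r, c, v)), cs)
+}
+
+// whenever lexer(r, s) == Some(v), then
+//   v == flex(r, (v: Val) => v, s)(mkeps(ders(s, r)))
+\end{verbatim}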
+
+This brings us to our main lemma in this section: if we calculate a
+derivative, say $r\backslash s$, and have a value, say $v$, inhabited
+by this derivative, then we can produce the result @{text lexer} generates
+by applying this value to the stacked-up injection functions
+that $\textit{flex}$ assembles. The lemma establishes that this is the same
+value as if we build the annotated derivative $r^\uparrow\backslash s$
+and then retrieve the corresponding bitcoded version, followed by a
+decoding step.
+
+\begin{lemma}[Main Lemma]\label{mainlemma}\it
+If $\vdash v : r\backslash s$ then 
+$\textit{Some}\,(\textit{flex}\,r\,\textit{id}\,s\,v) =
+  \textit{decode}(\textit{retrieve}\,(r^\uparrow \backslash s)\,v)\,r$
+\end{lemma}  
+
+
+
+\noindent
+%With this lemma in place,
+We can then prove the correctness of \textit{blexer}---it indeed
+produces the same result as \textit{lexer}.
+
+
+\begin{theorem}\label{thmone}
+$\textit{lexer}\,r\,s = \textit{blexer}\,r\,s$
+\end{theorem}  
+
+
+
+\noindent This establishes that the bitcoded algorithm \emph{without}
+simplification produces correct results. This was
+only conjectured by Sulzmann and Lu in their paper
+\cite{Sulzmann2014}. The next step is to add simplifications.
+
+*}
+
+
+section {* Simplification *}
+
+text {*
+
+     Derivatives as calculated by Brzozowski's method are usually more
+     complex regular expressions than the initial one; the result is
+     that derivative-based matching and lexing algorithms are
+     often abysmally slow if the ``growth problem'' is not addressed. As Sulzmann and Lu wrote, various
+     optimisations are possible, such as the simplifications
+     $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r \Rightarrow r$,
+     $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow r$. While these
+     simplifications can considerably speed up the two algorithms in many
+     cases, they do not fundamentally solve the growth problem with
+     derivatives. To see this let us return to the example from the
+     Introduction that shows the derivatives for \mbox{@{text "(a + aa)\<^sup>*"}}.
+     If we delete in the third step all $\ZERO$s and $\ONE$s according to
+     the simplification rules shown above we obtain
+     %
+     %%
+     %
+     \begin{equation}\def\xll{\xrightarrow{\_\backslash{} [a, a, a]}}\label{derivex}
+     (a + aa)^* \quad\xll\quad
+      \underbrace{\mbox{$(\ONE + a) \cdot (a + aa)^*$}}_{r} \;+\;
+     ((a + aa)^* + \underbrace{\mbox{$(\ONE + a) \cdot (a + aa)^*$}}_{r})
+     \end{equation}
+
+     \noindent This is a simpler derivative, but unfortunately we
+     cannot make any further simplifications. This is a problem because
+     the outermost alternative contains two copies of the same
+     regular expression (both marked with $r$). These copies will
+     spawn new copies in later derivative steps, and these in turn even more copies. This
+     destroys any hope of taming the size of the derivatives.  But the
+     second copy of $r$ in \eqref{derivex} will never contribute to a
+     value, because POSIX lexing will always prefer matching a string
+     with the first copy. So it could be safely removed without affecting the correctness of the algorithm.
+     The dilemma with the simple-minded
+     simplification rules above is that the rule $r + r \Rightarrow r$
+     will never be applicable because, as can be seen in this example, the
+     regular expressions are not next to each other but separated by another regular expression.
+
+     But here is where Sulzmann and Lu's representation of generalised
+     alternatives in the bitcoded algorithm shines: in @{term
+     "ALTs bs rs"} we can define a more aggressive simplification by
+     recursively simplifying all regular expressions in @{text rs} and
+     then analyse the resulting list and remove any duplicates.
+     Another advantage of the bitsequences in bitcoded regular
+     expressions is that they can easily be modified such that simplification does not
+     interfere with the value construction. For example we can ``flatten'', or
+     de-nest, or spill out, @{text ALTs} as follows
+     %
+     \[
+     @{term "ALTs bs\<^sub>1 ((ALTs bs\<^sub>2 rs\<^sub>2) # rs\<^sub>1)"}
+     \quad\xrightarrow{bsimp}\quad
+     @{term "ALTs bs\<^sub>1 ((map (fuse bs\<^sub>2) rs\<^sub>2) @ rs\<^sub>1)"}
+     \]
+
+     \noindent
+     where we just need to fuse the bitsequence that has accumulated in @{text "bs\<^sub>2"}
+     to the alternatives in @{text "rs\<^sub>2"}. As we shall show below this will
+     ensure that the correct value corresponding to the original (unsimplified)
+     regular expression can still be extracted. %In this way the value construction
+     %is not affected by simplification. 
+
+     However there is one problem with the definition for the more
+     aggressive simplification rules described by Sulzmann and Lu. Recasting
+     their definition with our syntax they define the step of removing
+     duplicates as
+     %
+     \[ @{text "bsimp (ALTs bs rs)"} \dn @{text "ALTs
+     bs (nub (map bsimp rs))"}
+     \]
+   
+     \noindent where they first recursively simplify the regular
+     expressions in @{text rs} (using @{text map}) and then use
+     Haskell's @{text nub}-function to remove potential
+     duplicates. While this makes sense when considering the example
+     shown in \eqref{derivex}, @{text nub} is not the appropriate
+     function in the case of bitcoded regular expressions. The reason
+     is that in general the elements in @{text rs} will each have a
+     different bitsequence annotation and in this way @{text nub}
+     will never find a duplicate to be removed. One correct way to
+     handle this situation is to first \emph{erase} the regular
+     expressions when comparing potential duplicates. This is inspired
+     by Scala's list functions of the form \mbox{@{text "distinctWith rs eq
+     acc"}} where @{text eq} is a user-defined equivalence relation that
+     compares two elements in @{text rs}. We define this function in
+     Isabelle/HOL as
+
+     \begin{center}
+     \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {\hspace{1mm}}l@ {}}
+     @{thm (lhs) distinctWith.simps(1)} & $\dn$ & @{thm (rhs) distinctWith.simps(1)}\\
+     @{thm (lhs) distinctWith.simps(2)} & $\dn$ &
+     @{text "if (\<exists> y \<in> acc. eq x y)"} & @{text "then distinctWith xs eq acc"}\\
+     & & & @{text "else x :: distinctWith xs eq ({x} \<union> acc)"}
+     \end{tabular}
+     \end{center}
+
+     \noindent where we scan the list from left to right (because we
+     have to remove later copies). In @{text distinctWith}, @{text eq} is intended to be an
+     equivalence relation for annotated regular expressions and @{text acc} is an accumulator for annotated regular
+     expressions---essentially a set of regular expressions that we have already seen
+     while scanning the list. Therefore we delete an element, say @{text x},
+     from the list provided some @{text "y"} equivalent to @{text x} is already in the accumulator;
+     otherwise we keep @{text x} and scan the rest of the list but 
+     add @{text "x"} as another ``seen'' element to @{text acc}. We will use
+     @{term distinctWith} where @{text eq} is an equivalence that ignores the bitsequences of bitcoded regular expressions
+     before comparing the components. One way to define this in Isabelle/HOL is by the following recursive function from
+     annotated regular expressions to @{text bool}:
+     %
+     \begin{center}
+     \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {\hspace{1mm}}l@ {}}
+     @{thm (lhs) eq1.simps(1)} & $\dn$ & @{thm (rhs) eq1.simps(1)}\\
+     @{thm (lhs) eq1.simps(2)[of DUMMY DUMMY]} & $\dn$ & @{thm (rhs) eq1.simps(2)[of DUMMY DUMMY]}\\
+     @{thm (lhs) eq1.simps(3)[of DUMMY "c" DUMMY "d"]} & $\dn$ & @{thm (rhs) eq1.simps(3)[of DUMMY "c" DUMMY "d"]}\\
+     @{thm (lhs) eq1.simps(4)[of DUMMY "r\<^sub>1\<^sub>1" "r\<^sub>1\<^sub>2" DUMMY "r\<^sub>2\<^sub>1" "r\<^sub>2\<^sub>2"]} & $\dn$ &
+     @{thm (rhs) eq1.simps(4)[of DUMMY "r\<^sub>1\<^sub>1" "r\<^sub>1\<^sub>2" DUMMY "r\<^sub>2\<^sub>1" "r\<^sub>2\<^sub>2"]}\\
+     @{thm (lhs) eq1.simps(5)[of DUMMY DUMMY]} & $\dn$ & @{thm (rhs) eq1.simps(5)[of DUMMY DUMMY]}\\
+     @{thm (lhs) eq1.simps(6)[of DUMMY "r\<^sub>1" "rs\<^sub>1" DUMMY "r\<^sub>2" "rs\<^sub>2"]} & $\dn$ &
+     @{thm (rhs) eq1.simps(6)[of DUMMY "r\<^sub>1" "rs\<^sub>1" DUMMY "r\<^sub>2" "rs\<^sub>2"]}\\
+     @{thm (lhs) eq1.simps(7)[of DUMMY "r\<^sub>1" DUMMY "r\<^sub>2"]} & $\dn$ & @{thm (rhs) eq1.simps(7)[of DUMMY "r\<^sub>1" DUMMY "r\<^sub>2"]}\\
+     \end{tabular}
+     \end{center}
+
+     \noindent
+     where all other cases are set to @{text False}.
+     This equivalence is clearly a computationally more expensive operation than @{text nub},
+     but it is needed in order to make the removal of unnecessary copies
+     work properly.
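+
+     \noindent
+     As a Scala sketch (ours), \textit{distinctWith} can be written as
+     below; instead of the $\approx$-relation above we use, as a convenient
+     stand-in, the comparison of erased regular expressions:
+
+\begin{verbatim}
+def distinctWith[A](xs: List[A], eq: (A, A) => Boolean,
+                    acc: Set[A]): List[A] = xs match {
+  case Nil => Nil
+  case x :: rest =>
+    if (acc.exists(y => eq(x, y))) distinctWith(rest, eq, acc)
+    else x :: distinctWith(rest, eq, acc + x)
+}
+
+// drop all bitsequence annotations (nested ALTs for AALTS)
+def erase(r: ARexp): Rexp = r match {
+  case AZERO => ZERO
+  case AONE(_) => ONE
+  case ACHAR(_, c) => CH(c)
+  case AALTS(_, Nil) => ZERO
+  case AALTS(_, rs) => rs.map(erase).reduceRight(ALT(_, _))
+  case ASEQ(_, r1, r2) => SEQ(erase(r1), erase(r2))
+  case ASTAR(_, r1) => STAR(erase(r1))
+}
+
+def eqErase(r1: ARexp, r2: ARexp): Boolean = erase(r1) == erase(r2)
+\end{verbatim}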
+
+     Our simplification function depends on three more helper functions; one is called
+     @{text flts} and analyses lists of regular expressions coming from alternatives.
+     It is defined as follows:
+
+     \begin{center}
+     \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
+     @{thm (lhs) flts.simps(1)} & $\dn$ & @{thm (rhs) flts.simps(1)}\\
+     @{thm (lhs) flts.simps(2)} & $\dn$ & @{thm (rhs) flts.simps(2)}\\
+     @{thm (lhs) flts.simps(3)[of "bs'" "rs'"]} & $\dn$ & @{thm (rhs) flts.simps(3)[of "bs'" "rs'"]}\\
+     \end{tabular}
+     \end{center}
+
+     \noindent
+     The second clause of @{text flts} removes all instances of @{text ZERO} in alternatives and
+     the third ``de-nests'' alternatives (but retains the
+     bitsequence @{text "bs'"} accumulated in the inner alternative). There are
+     some corner cases to be considered when the resulting list inside an alternative is
+     empty or a singleton list. We take care of those cases in the
+     @{text "bsimpALTs"} function; similarly we define a helper function that simplifies
+     sequences according to the usual rules about @{text ZERO}s and @{text ONE}s:
+
+     \begin{center}
+     \begin{tabular}{c@ {\hspace{5mm}}c}
+     \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
+     @{text "bsimpALTs bs []"}  & $\dn$ & @{text "ZERO"}\\
+     @{text "bsimpALTs bs [r]"} & $\dn$ & @{text "fuse bs r"}\\
+     @{text "bsimpALTs bs rs"}  & $\dn$ & @{text "ALTs bs rs"}\\
+     \mbox{}\\
+     \end{tabular}
+     &
+     \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
+     @{text "bsimpSEQ bs _ ZERO"}  & $\dn$ & @{text "ZERO"}\\
+     @{text "bsimpSEQ bs ZERO _"} & $\dn$ & @{text "ZERO"}\\
+     @{text "bsimpSEQ bs\<^sub>1 (ONE bs\<^sub>2) r\<^sub>2"}
+         & $\dn$ & @{text "fuse (bs\<^sub>1 @ bs\<^sub>2) r\<^sub>2"}\\
+     @{text "bsimpSEQ bs r\<^sub>1 r\<^sub>2"} & $\dn$ &  @{text "SEQ bs r\<^sub>1 r\<^sub>2"}
+     \end{tabular}
+     \end{tabular}
+     \end{center}
+
+     \noindent
+     With this in place we can define our simplification function as
+
+     \begin{center}
+     \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
+     @{thm (lhs) bsimp.simps(1)[of "bs" "r\<^sub>1" "r\<^sub>2"]} & $\dn$ &
+         @{thm (rhs) bsimp.simps(1)[of "bs" "r\<^sub>1" "r\<^sub>2"]}\\
+     @{thm (lhs) bsimp.simps(2)[of "bs" _]} & $\dn$ & @{text "bsimpALTs bs (distinctWith (flts (map bsimp rs)) \<approx> \<emptyset>)"}\\
+     @{text "bsimp r"} & $\dn$ & @{text r}
+     \end{tabular}
+     \end{center}
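+
+     \noindent
+     Transcribed into our Scala sketch, the simplification functions read:
+
+\begin{verbatim}
+def flts(rs: List[ARexp]): List[ARexp] = rs match {
+  case Nil => Nil
+  case AZERO :: rest => flts(rest)                       // drop ZEROs
+  case AALTS(bs1, rs1) :: rest =>                        // de-nest ALTs
+    rs1.map(fuse(bs1, _)) ::: flts(rest)
+  case r :: rest => r :: flts(rest)
+}
+
+def bsimpALTs(bs: List[Bit], rs: List[ARexp]): ARexp = rs match {
+  case Nil => AZERO
+  case List(r) => fuse(bs, r)
+  case _ => AALTS(bs, rs)
+}
+
+def bsimpSEQ(bs: List[Bit], r1: ARexp, r2: ARexp): ARexp = (r1, r2) match {
+  case (AZERO, _) => AZERO
+  case (_, AZERO) => AZERO
+  case (AONE(bs1), _) => fuse(bs ::: bs1, r2)
+  case _ => ASEQ(bs, r1, r2)
+}
+
+def bsimp(r: ARexp): ARexp = r match {
+  case ASEQ(bs, r1, r2) => bsimpSEQ(bs, bsimp(r1), bsimp(r2))
+  case AALTS(bs, rs) =>
+    bsimpALTs(bs, distinctWith(flts(rs.map(bsimp)), eqErase, Set()))
+  case _ => r
+}
+\end{verbatim}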
+
+     \noindent
+     We believe our recursive function @{term bsimp} simplifies regular
+     expressions as intended by Sulzmann and Lu. There is no point in applying the
+     @{text bsimp} function repeatedly (like the simplification in their paper which needs to be
+     applied until a fixpoint is reached) because we can show that @{term bsimp} is idempotent,
+     that is
+
+     \begin{proposition}
+     @{term "bsimp (bsimp r) = bsimp r"}
+     \end{proposition}
+
+     \noindent
+     This can be proved by induction on @{text r} but requires a detailed analysis
+     that the de-nesting of alternatives always results in a flat list of regular
+     expressions. We omit the details since it does not concern the correctness proof.
+     
+     Next we can include simplification after each derivative step leading to the
+     following notion of bitcoded derivatives:
+     
+     \begin{center}
+      \begin{tabular}{cc}
+      \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
+      @{thm (lhs) bders_simp.simps(1)} & $\dn$ & @{thm (rhs) bders_simp.simps(1)}
+      \end{tabular}
+      &
+      \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
+      @{thm (lhs) bders_simp.simps(2)} & $\dn$ & @{thm (rhs) bders_simp.simps(2)}
+      \end{tabular}
+      \end{tabular}
+      \end{center}
+
+      \noindent
+      and use it in the improved lexing algorithm defined as
+
+     \begin{center}
+\begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
+  $\textit{blexer}^+\;r\,s$ & $\dn$ &
+      $\textit{let}\;r_{der} = (r^\uparrow)\backslash_{bsimp}\, s\;\textit{in}$                
+      $\;\;\textit{if}\; \textit{bnullable}(r_{der}) \;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,r_{der})\,r
+       \;\;\textit{else}\;\textit{None}$
+\end{tabular}
+\end{center}
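+
+      \noindent
+      A Scala sketch of this algorithm, continuing the earlier sketches, is
+      given below. We omit @{text decode} and the internalisation step and
+      return the bitsequence directly; the functions @{text bder},
+      @{text bnullable} and @{text bmkeps} follow the definitions from the
+      previous section, with the concrete bitcode conventions (@{text Z} for
+      entering a star iteration, @{text S} for ending one) being assumptions
+      of the sketch:
+
+\begin{verbatim}
+def bnullable(r: ARexp): Boolean = r match {
+  case AZERO           => false
+  case AONE(_)         => true
+  case ACHAR(_, _)     => false
+  case AALTS(_, rs)    => rs.exists(bnullable)
+  case ASEQ(_, r1, r2) => bnullable(r1) && bnullable(r2)
+  case ASTAR(_, _)     => true
+}
+
+// only called on bnullable expressions
+def bmkeps(r: ARexp): List[Bit] = r match {
+  case AONE(bs)            => bs
+  case AALTS(bs, r1 :: rs) => if (bnullable(r1)) bs ++ bmkeps(r1)
+                              else bmkeps(AALTS(bs, rs))
+  case ASEQ(bs, r1, r2)    => bs ++ bmkeps(r1) ++ bmkeps(r2)
+  case ASTAR(bs, _)        => bs ++ List(S)
+}
+
+def bder(c: Char, r: ARexp): ARexp = r match {
+  case AZERO         => AZERO
+  case AONE(_)       => AZERO
+  case ACHAR(bs, d)  => if (c == d) AONE(bs) else AZERO
+  case AALTS(bs, rs) => AALTS(bs, rs.map(bder(c, _)))
+  case ASEQ(bs, r1, r2) =>
+    if (bnullable(r1))
+      AALTS(bs, List(ASEQ(Nil, bder(c, r1), r2),
+                     fuse(bmkeps(r1), bder(c, r2))))
+    else ASEQ(bs, bder(c, r1), r2)
+  case ASTAR(bs, r1) =>
+    ASEQ(bs, fuse(List(Z), bder(c, r1)), ASTAR(Nil, r1))
+}
+
+// derivative w.r.t. a string, simplifying after each step
+def bdersSimp(r: ARexp, s: List[Char]): ARexp = s match {
+  case Nil     => r
+  case c :: cs => bdersSimp(bsimp(bder(c, r)), cs)
+}
+
+def blexerSimp(r: ARexp, s: String): Option[List[Bit]] = {
+  val rder = bdersSimp(r, s.toList)
+  if (bnullable(rder)) Some(bmkeps(rder)) else None
+}
+\end{verbatim}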
+
+       \noindent The remaining task is to show that @{term blexer} and
+       @{term "blexer_simp"} generate the same answers.
+
+       When we first
+       attempted this proof we encountered a problem with the idea
+       in Sulzmann and Lu's paper, where the argument seems to appeal
+       again to the @{text retrieve}-function defined for the unsimplified version
+       of the algorithm. This, however, does not work, because desirable properties such as
+     %
+     \[
+     @{text "retrieve r v = retrieve (bsimp r) v"}
+     \]
+
+     \noindent do not hold under simplification---this property
+     essentially states that we can retrieve the same value from a
+     simplified version of the regular expression. To start with, @{text retrieve}
+     depends on the fact that the value @{text v} corresponds to the
+     structure of the regular expression @{text r}---but the whole point of simplification
+     is to ``destroy'' this structure by making the regular expression simpler.
+     To see this consider the regular expression @{term "r = ALT r' ZERO"} and a corresponding
+     value @{text "v = Left v'"}. If we annotate @{text "r"} with bitcodes, then
+     we can use @{text retrieve} with @{text r} and @{text v} in order to extract a corresponding
+     bitsequence. The reason that this works is that @{text r} is an alternative
+     regular expression and @{text v} a corresponding @{text "Left"}-value. However, if we simplify
+     @{text r}, then @{text v} does not correspond to the shape of the regular 
+     expression anymore. So unless one can somehow
+     synchronise the change in the simplified regular expressions with
+     the original POSIX value, there is no hope of appealing to @{text retrieve} in the
+     correctness argument for @{term blexer_simp}.
+
+     We found it more helpful to introduce the rewriting systems shown in
+     Figure~\ref{SimpRewrites}. The idea is to generate 
+     simplified regular expressions in small steps (unlike the @{text bsimp}-function which
+     does the same in a big step), and show that each of
+     the small steps preserves the bitcodes that lead to the final POSIX value.
+     The rewrite system is organised such that $\leadsto$ is for bitcoded regular
+     expressions and $\stackrel{s}{\leadsto}$ for lists of bitcoded regular
+     expressions. The former essentially implements the simplifications of
+     @{text "bsimpSEQ"} and @{text flts}; while the latter implements the
+     simplifications in @{text "bsimpALTs"}. We can show that any bitcoded
+     regular expression reduces in zero or more steps to the simplified
+     regular expression generated by @{text bsimp}:
+
+     \begin{lemma}\label{lemone}
+     @{thm[mode=IfThen] rewrites_to_bsimp}
+     \end{lemma}
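+
+     \noindent
+     To see the rewrite rules in action, consider the bitcoded regular
+     expression @{text "ALTs bs [ZERO, ALTs bs' [r]]"}: rule $L\ZERO$
+     (applied underneath rule $AL$) removes the @{text "ZERO"}; rule $LS$
+     (again underneath $AL$) de-nests the inner alternative, giving
+     @{text "ALTs bs [fuse bs' r]"}; finally rule $A1$ contracts the
+     singleton list to @{text "fuse bs (fuse bs' r)"}, which is exactly
+     what @{text bsimp} produces in one big step.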
+
+     
+
+     \noindent
+     We can show that this rewrite system preserves @{term bnullable}, that is,
+     simplification does not affect nullability:
+
+     \begin{lemma}\label{lembnull}
+     @{thm[mode=IfThen] bnullable0(1)[of "r\<^sub>1" "r\<^sub>2"]}
+     \end{lemma}
+
+     
+
+     \noindent
+     From this, we can show that @{text bmkeps} will produce the same bitsequence
+     for two bitcoded regular expressions related by $\leadsto$, as long as one of them is nullable (this lemma
+     establishes the missing fact we were not able to establish using @{text retrieve}, as suggested
+     in the paper by Sulzmann and Lu). 
+
+
+     \begin{lemma}\label{lemthree}
+     @{thm[mode=IfThen] rewrite_bmkeps_aux(1)[of "r\<^sub>1" "r\<^sub>2"]}
+     \end{lemma}
+
+     
+
+     \noindent
+     Crucially, derivative steps and simplification steps can be interleaved;
+     this follows from the fact that $\leadsto$ is preserved under derivatives.
+
+     \begin{lemma}\label{bderlem}
+     @{thm[mode=IfThen] rewrite_preserves_bder(1)[of "r\<^sub>1" "r\<^sub>2"]}
+     \end{lemma}
+
+     
+
+
+     \noindent
+     Using this fact together with Lemma~\ref{lemone} allows us to prove the central lemma
+     that the unsimplified
+     derivative (with a string @{term s}) reduces to the simplified derivative (with the same string).
+     
+     \begin{lemma}\label{lemtwo}
+     @{thm[mode=IfThen] central}
+     \end{lemma}
+
+     %\begin{proof}
+     %By reverse induction on @{term s} generalising over @{text r}.
+     %\end{proof}
+
+     \noindent
+     With these lemmas in place we can finally establish that @{term "blexer_simp"} and @{term "blexer"}
+     generate the same value, and, using Theorem~\ref{thmone} from the previous section, that this value
+     is indeed the POSIX value.
+     
+     \begin{theorem}
+     @{thm[mode=IfThen] main_blexer_simp}
+     \end{theorem}
+
+     %\begin{proof}
+     %By unfolding the definitions and using Lemmas~\ref{lemtwo} and \ref{lemthree}. 	
+     %\end{proof}
+     
+     \noindent
+     This means that if the algorithm is called with a regular expression @{term r} and a string
+     @{term s} with $@{term s} \in L(@{term r})$, it will return @{term "Some v"} for the
+     @{term v} determined by the POSIX relation $(@{term s}, @{term r}) \rightarrow @{term v}$;
+     if $s \not\in L(r)$, then no such @{text v} exists and the algorithm returns @{term "None"}.
+     This completes the correctness proof for the second POSIX lexing algorithm by Sulzmann and Lu.
+     The interesting point of this algorithm is that the sizes of derivatives do not grow arbitrarily big but
+     can be finitely bounded, which
+     we shall show next.
+
+   \begin{figure}[t]
+  \begin{center}
+  \begin{tabular}{@ {\hspace{-8mm}}c@ {}}
+  @{thm[mode=Axiom] bs1[of _ "r\<^sub>2"]}$S\ZERO{}_l$\quad
+  @{thm[mode=Axiom] bs2[of _ "r\<^sub>1"]}$S\ZERO{}_r$\quad
+  @{thm[mode=Axiom] bs3[of "bs\<^sub>1" "bs\<^sub>2"]}$S\ONE$\\
+  @{thm[mode=Rule] bs4[of "r\<^sub>1" "r\<^sub>2" _ "r\<^sub>3"]}SL\qquad
+  @{thm[mode=Rule] bs5[of "r\<^sub>3" "r\<^sub>4" _ "r\<^sub>1"]}SR\\
+  @{thm[mode=Axiom] bs6}$A0$\quad
+  @{thm[mode=Axiom] bs7}$A1$\quad
+  @{thm[mode=Rule] bs10[of "rs\<^sub>1" "rs\<^sub>2"]}$AL$\\
+  @{thm[mode=Rule] ss2[of "rs\<^sub>1" "rs\<^sub>2"]}$LT$\quad
+  @{thm[mode=Rule] ss3[of "r\<^sub>1" "r\<^sub>2"]}$LH$\quad
+  @{thm[mode=Axiom] ss4}$L\ZERO$\quad
+  @{thm[mode=Axiom] ss5[of "bs" "rs\<^sub>1" "rs\<^sub>2"]}$LS$\medskip\\
+  @{thm[mode=Rule] ss6[of "r\<^sub>2" "r\<^sub>1" "rs\<^sub>1" "rs\<^sub>2" "rs\<^sub>3"]}$LD$\\[-5mm]
+  \end{tabular}
+  \end{center}
+  \caption{The rewrite rules that generate simplified regular expressions
+  in small steps: @{term "rrewrite r\<^sub>1 r\<^sub>2"} is for bitcoded regular
+  expressions and @{term "srewrite rs\<^sub>1 rs\<^sub>2"} for \emph{lists} of bitcoded
+  expressions. Of particular interest is the $LD$ rule, which allows copies of regular
+  expressions to be removed provided a regular expression earlier in the list can
+  match the same strings.}\label{SimpRewrites}
+  \end{figure}
+*}
+
+section {* Finiteness of Derivatives *}
+
+text {*
+
+In this section let us sketch our argument for why the size of the simplified
+derivatives with the aggressive simplification function can be finitely bounded. Suppose
+we have a size function for bitcoded regular expressions, written
+$\llbracket r\rrbracket$, which counts the number of nodes if we regard $r$ as a tree
+(we omit the precise definition; ditto for lists $\llbracket r\!s\rrbracket$). For this we show that for every $r$
+there exists a bound $N$
+such that 
+
+\begin{center}
+$\forall s. \; \llbracket@{term "bders_simp r s"}\rrbracket \leq N$
+\end{center}
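+
+\noindent
+As an aside, one possible such size function, sketched in Scala and reusing the
+datatype from the sketches in the previous section, is given below (this is our
+own rendering, since the precise definition is omitted above):
+
+\begin{verbatim}
+// counts the nodes of a bitcoded regular expression viewed as a
+// tree; bitsequences do not contribute to the size
+def asize(r: ARexp): Int = r match {
+  case AZERO           => 1
+  case AONE(_)         => 1
+  case ACHAR(_, _)     => 1
+  case AALTS(_, rs)    => 1 + rs.map(asize).sum
+  case ASEQ(_, r1, r2) => 1 + asize(r1) + asize(r2)
+  case ASTAR(_, r1)    => 1 + asize(r1)
+}
+\end{verbatim}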
+
+\noindent
+We prove this by induction on $r$. The base cases for @{term AZERO},
+@{term "AONE bs"} and @{term "ACHAR bs c"} are straightforward. The interesting case is
+for sequences of the form @{term "ASEQ bs r\<^sub>1 r\<^sub>2"}. In this case our induction
+hypotheses state $\exists N_1. \forall s. \; \llbracket{}@{term "bders_simp r\<^sub>1 s"}\rrbracket \leq N_1$ and
+$\exists N_2. \forall s. \; \llbracket@{term "bders_simp r\<^sub>2 s"}\rrbracket \leq N_2$. We can reason as follows
+%
+\begin{center}
+\begin{tabular}{lcll}
+& & $ \llbracket@{term "bders_simp (ASEQ bs r\<^sub>1 r\<^sub>2) s"}\rrbracket$\\
+& $ = $ & $\llbracket bsimp\,(\textit{ALTs}\;bs\;((@{term "ASEQ [] (bders_simp r\<^sub>1 s) r\<^sub>2"}) ::
+    [@{term "bders_simp r\<^sub>2 s'"} \;|\; s' \in \textit{Suffix}(@{text r}_1, s)]))\rrbracket $ & (1) \\
+& $\leq$ &
+    $\llbracket\textit{distinctWith}\,((@{term "ASEQ [] (bders_simp r\<^sub>1 s) r\<^sub>2"}) ::
+    [@{term "bders_simp r\<^sub>2 s'"} \;|\; s' \in \textit{Suffix}(@{text r}_1, s)])\,\approx\;@{term "{}"}\rrbracket + 1 $ & (2) \\
+& $\leq$ & $\llbracket@{term "ASEQ [] (bders_simp r\<^sub>1 s) r\<^sub>2"}\rrbracket +
+             \llbracket\textit{distinctWith}\,[@{term "bders_simp r\<^sub>2 s'"} \;|\; s' \in \textit{Suffix}(@{text r}_1, s)]\,\approx\;@{term "{}"}\rrbracket + 1 $ & (3) \\
+& $\leq$ & $N_1 + \llbracket r_2\rrbracket + 2 +
+      \llbracket\textit{distinctWith}\,[@{term "bders_simp r\<^sub>2 s'"} \;|\; s' \in \textit{Suffix}(@{text r}_1, s)]\,\approx\;@{term "{}"}\rrbracket$ & (4)\\
+& $\leq$ & $N_1 + \llbracket r_2\rrbracket + 2 + l_{N_{2}} * N_{2}$ & (5)
+\end{tabular}
+\end{center}
+
+% tell Chengsong about Indian paper of closed forms of derivatives
+
+\noindent
+where in (1) $\textit{Suffix}(@{text "r"}_1, s)$ is the set of all suffixes $s'$ of $s$ whose corresponding prefix $s''$ (with $s = s''\,@\,s'$) yields a nullable derivative $@{text "r"}_1\backslash_{bsimp}\,s''$.
+In (3) we know that  $\llbracket@{term "ASEQ [] (bders_simp r\<^sub>1 s) r\<^sub>2"}\rrbracket$ is 
+bounded by $N_1 + \llbracket{}r_2\rrbracket + 1$. In (5) we know the list comprehension contains only regular expressions of size bounded
+by $N_2$. The length of the list after @{text distinctWith} is bounded by a number, which we call $l_{N_2}$. It stands
+for the number of distinct regular expressions of size at most $N_2$ (there can only be finitely many of them).
+We reason similarly for @{text STAR}.\medskip
+
+\noindent
+Clearly, the bound we give in this finiteness argument (Step (5)) is very loose and
+far from the actual bound one can expect. We can do better than this, but doing so would not strengthen
+the finiteness property we are proving. If one is interested in a polynomial bound,
+one would hope to obtain a similarly tight bound as for partial
+derivatives introduced by Antimirov \cite{Antimirov95}. After all, the idea behind
+@{text distinctWith} is to maintain a ``set'' of alternatives (like the sets in
+partial derivatives). Unfortunately, to obtain the exact same bound we would need
+to introduce simplifications, such as
+ $(r_1 + r_2) \cdot r_3 \longrightarrow (r_1 \cdot r_3) + (r_2 \cdot r_3)$,
+which exist for partial derivatives. However, if we introduced them in our
+setting, we would lose the POSIX property of the calculated values. We leave better
+bounds for future work.\\[-6.5mm]
+
+*}
+
+
+section {* Conclusion *}
+
+text {*
+
+   We set out in this work to prove in Isabelle/HOL the correctness of
+   the second POSIX lexing algorithm by Sulzmann and Lu
+   \cite{Sulzmann2014}. This follows earlier work where we established
+   the correctness of the first algorithm
+   \cite{AusafDyckhoffUrban2016}. In the earlier work we needed to
+   introduce our own specification about what POSIX values are,
+   because the informal definition given by Sulzmann and Lu did not
+   stand up to a formal proof. Also for the second algorithm we needed
+   to introduce our own definitions and proof ideas in order to establish the
+   correctness.  Our interest in the second algorithm 
+   lies in the fact that by using bitcoded regular expressions and an aggressive
+   simplification method there is a chance that the derivatives
+   can be kept universally small  (we established in this paper that
+   they can be kept finitely bounded for any string). This is important if one is after
+   an efficient POSIX lexing algorithm based on derivatives.
+
+   Having proved the correctness of the POSIX lexing algorithm, what
+   lessons have we learned? Well, we feel this is a very good example
+   of where formal proofs give further insight into the matter at
+   hand. For example, it is very hard to spot the problem with @{text nub}
+   versus @{text distinctWith} from experimental data alone---one would still
+   see the correct result, but find that simplification fails to simplify in well-chosen, yet not
+   obscure, examples. We found that from an implementation
+   point of view it is really important to have the formal proofs of
+   the corresponding properties at hand.
+
+   We have also developed a
+   healthy suspicion when experimental data is used to back up
+   efficiency claims. For example Sulzmann and Lu write about their
+   equivalent of @{term blexer_simp} \textit{``...we can incrementally compute
+   bitcoded parse trees in linear time in the size of the input''}
+   \cite[Page 14]{Sulzmann2014}. 
+   Given the growth of the
+   derivatives in some cases even after aggressive simplification, this
+   claim is hard to believe. A similar claim about a theoretical runtime
+   of @{text "O(n\<^sup>2)"} is made for the Verbatim lexer, which calculates
+   tokens according to POSIX rules~\cite{verbatim}. For this, Verbatim uses Brzozowski's
+   derivatives like in our work.
+   The authors write: \textit{``The results of our empirical tests [..] confirm that Verbatim has
+   @{text "O(n\<^sup>2)"} time complexity.''} \cite[Section~VII]{verbatim}.
+   While their correctness proof for Verbatim is formalised in Coq, the claim about
+   the runtime complexity is only supported by some empirical evidence obtained
+   by using the code extraction facilities of Coq.
+   Given our observation with the ``growth problem'' of derivatives, we
+   tried out their extracted OCaml code with the example
+   \mbox{@{text "(a + aa)\<^sup>*"}} as a single lexing rule: it took us around 5 minutes to tokenise a
+   string of 40 $a$'s, and that increased to approximately 19 minutes when the
+   string was 50 $a$'s long. Taking into account that derivatives are not simplified in the Verbatim
+   lexer, such numbers are not surprising.
+   Clearly, our result of having finite
+   derivatives might sound rather weak in this context, but we think such efficiency claims
+   really require further scrutiny.
+
+   The contribution of this paper is to make sure
+   derivatives do not grow arbitrarily big (universally). In the example \mbox{@{text "(a + aa)\<^sup>*"}},
+   \emph{all} derivatives have a size of 17 or less. The result is that
+   lexing a string of, say, 50\,000 $a$'s with the regular expression \mbox{@{text "(a + aa)\<^sup>*"}} takes approximately
+   10 seconds with our Scala implementation
+   of the presented algorithm.
+   \smallskip
+
+   \noindent
+   Our Isabelle code including the results from Sec.~5 is available from \url{https://github.com/urbanchr/posix}.\\[-10mm]
+
+
+%%\bibliographystyle{plain}
+\bibliography{root}
+
+\appendix
+
+\section{Some Proofs}
+
+While we have formalised \emph{all} results in Isabelle/HOL, below we shall give some
+rough details of our reasoning in ``English''.
+
+\begin{proof}[Proof of Lemma~\ref{codedecode}]
+  This follows from the property that
+  $\textit{decode}'\,((\textit{code}\,v) \,@\, bs)\,r = (v, bs)$ holds
+  for any bit-sequence $bs$ and $\vdash v : r$. This property can be
+  easily proved by induction on $\vdash v : r$.
+\end{proof}  
+
+\begin{proof}[Proof of Lemma~\ref{mainlemma}]
+  This can be proved by induction on $s$ and generalising over
+  $v$. The interesting point is that we need to prove this in the
+  reverse direction for $s$. This means instead of cases $[]$ and
+  $c\!::\!s$, we have cases $[]$ and $s\,@\,[c]$ where we unravel the
+  string from the back.\footnote{Isabelle/HOL provides an induction principle
+    for this way of performing the induction.}
+
+  The case for $[]$ is routine using Lemmas~\ref{codedecode}
+  and~\ref{retrievecode}. In the case $s\,@\,[c]$, we can infer from
+  the assumption that $\vdash v : (r\backslash s)\backslash c$
+  holds. Hence by Prop.~\ref{Posix2} we know that 
+  (*) $\vdash \inj\,(r\backslash s)\,c\,v : r\backslash s$ holds too.
+  By definition of $\textit{flex}$ we can unfold the left-hand side
+  to be
+  \[
+    \textit{Some}\,(\textit{flex}\;r\,\textit{id}\,(s\,@\,[c])\,v) =
+    \textit{Some}\,(\textit{flex}\;r\,\textit{id}\,s\,(\inj\,(r\backslash s)\,c\,v))  
+  \]  
+
+  \noindent
+  By IH and (*) we can rewrite the right-hand side to
+  $\textit{decode}\,(\textit{retrieve}\,(r^\uparrow\backslash s)\;
+    (\inj\,(r\backslash s)\,c\,\,v))\,r$ which is equal to
+  $\textit{decode}\,(\textit{retrieve}\, (r^\uparrow\backslash
+  (s\,@\,[c]))\,v)\,r$ as required. The last rewrite step is possible
+  because we generalised over $v$ in our induction.
+\end{proof}
+
+\begin{proof}[Proof of Theorem~\ref{thmone}]
+  We can first expand both sides using Lem.~\ref{flex} and the
+  definition of \textit{blexer}. This gives us two
+  \textit{if}-statements, which we need to show to be equal. By 
+  Lemma~\ref{bnullable}\textit{(2)} we know the \textit{if}-tests coincide:
+  $\textit{bnullable}(r^\uparrow\backslash s) \;\textit{iff}\;
+   \nullable(r\backslash s)$.
+  For the \textit{if}-branch suppose $r_d \dn r^\uparrow\backslash s$ and
+  $d \dn r\backslash s$. We have (*) @{text "nullable d"}. We can then show
+  by Lemma~\ref{bnullable}\textit{(3)} that
+  %
+  \[
+    \textit{decode}(\textit{bmkeps}\:r_d)\,r =
+    \textit{decode}(\textit{retrieve}\,r_d\,(\textit{mkeps}\,d))\,r
+  \]
+
+  \noindent
+  where the right-hand side is equal to
+  $\textit{Some}\,(\textit{flex}\,r\,\textit{id}\,s\,(\textit{mkeps}\,
+  d))$ by Lemma~\ref{mainlemma} (we know
+  $\vdash \textit{mkeps}\,d : d$ by (*)).  This shows the
+  \textit{if}-branches return the same value. In the
+  \textit{else}-branches both \textit{lexer} and \textit{blexer} return
+  \textit{None}. Therefore we can conclude the proof.
+\end{proof}  
+
+\begin{proof}[Proof of Lemma~\ref{lemone}]
+     By induction on @{text r}. For this we can use the properties
+     @{thm fltsfrewrites} and @{text "rs"}$\;\stackrel{s}{\leadsto}^*$@{text "distinctWith rs \<approx>"} @{term "{}"}. The latter uses
+     repeated applications of the $LD$ rule which allows the removal
+     of duplicates that can recognise the same strings. 
+     \end{proof}
+
+    \begin{proof}[Proof of Lemma~\ref{lembnull}]
+     Straightforward mutual induction on the definition of $\leadsto$ and $\stackrel{s}{\leadsto}$.
+     The only interesting case is the rule $LD$, where the property holds because by the side-conditions of that rule the empty string is
+     in @{text "L (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ [r\<^sub>2] @ rs\<^sub>c)"} if and only if it is in
+     @{text "L (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ rs\<^sub>c)"}.
+     \end{proof}
+
+ \begin{proof}[Proof of Lemma \ref{lemthree}]
+     By straightforward mutual induction on the definition of $\leadsto$ and $\stackrel{s}{\leadsto}$.
+     Again the only interesting case is the rule $LD$ where we need to ensure that
+     @{text "bmkeps (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ [r\<^sub>2] @ rs\<^sub>c) =
+        bmkeps (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ rs\<^sub>c)"} holds.
+     This is indeed the case because according to the POSIX rules the
+     generated bitsequence is determined by the first alternative that can match the
+     string (in this case, by the first alternative that is nullable).
+     \end{proof}
+
+     \begin{proof}[Proof of Lemma~\ref{bderlem}]
+     By straightforward mutual induction on the definition of $\leadsto$ and $\stackrel{s}{\leadsto}$.
+     The case for $LD$ holds because @{term "L (erase (bder c r\<^sub>2)) \<subseteq> L (erase (bder c r\<^sub>1))"}
+     holds whenever @{term "L (erase (r\<^sub>2)) \<subseteq> L (erase (r\<^sub>1))"} holds.
+     \end{proof}
+*}
+
+(*<*)
+end
+(*>*)
\ No newline at end of file