thys3/Paper.thy

(*<*)
theory Paper
  imports
   "Posix.LexerSimp" 
   "Posix.FBound" 
   "HOL-Library.LaTeXsugar"
begin

declare [[show_question_marks = false]]

notation (latex output)
  If  ("(\<^latex>\<open>\\textrm{\<close>if\<^latex>\<open>}\<close> (_)/ \<^latex>\<open>\\textrm{\<close>then\<^latex>\<open>}\<close> (_)/ \<^latex>\<open>\\textrm{\<close>else\<^latex>\<open>}\<close> (_))" 10) and
  Cons ("_\<^latex>\<open>\\mbox{$\\,$}\<close>::\<^latex>\<open>\\mbox{$\\,$}\<close>_" [75,73] 73) 


abbreviation 
  "der_syn r c \<equiv> der c r"
abbreviation 
 "ders_syn r s \<equiv> ders s r"  
abbreviation 
  "bder_syn r c \<equiv> bder c r"  

notation (latex output)
  der_syn ("_\\_" [79, 1000] 76) and
  ders_syn ("_\\_" [79, 1000] 76) and
  bder_syn ("_\\_" [79, 1000] 76) and
  bders ("_\\_" [79, 1000] 76) and
  bders_simp ("_\\\<^sub>b\<^sub>s\<^sub>i\<^sub>m\<^sub>p _" [79, 1000] 76) and

  ZERO ("\<^bold>0" 81) and 
  ONE ("\<^bold>1" 81) and 
  CH ("_" [1000] 80) and
  ALT ("_ + _" [76,76] 77) and
  SEQ ("_ \<cdot> _" [78,78] 78) and
  STAR ("_\<^sup>*" [79] 78) and
  NTIMES ("_\<^sup>{\<^bsup>_\<^esup>\<^sup>}" [79] 78) and

  val.Void ("Empty" 78) and
  val.Char ("Char _" [1000] 78) and
  val.Left ("Left _" [79] 78) and
  val.Right ("Right _" [1000] 78) and
  val.Seq ("Seq _ _" [79,79] 78) and
  val.Stars ("Stars _" [1000] 78) and

  Prf ("\<turnstile> _ : _" [75,75] 75) and  
  Posix ("'(_, _') \<rightarrow> _" [63,75,75] 75) and

  flat ("|_|" [75] 74) and
  flats ("|_|" [72] 74) and
  injval ("inj _ _ _" [79,77,79] 76) and 
  mkeps ("mkeps _" [79] 76) and 
  length ("len _" [73] 73) and
  set ("_" [73] 73) and

  AZERO ("ZERO" 81) and 
  AONE ("ONE _" [79] 78) and 
  ACHAR ("CHAR _ _" [79, 79] 80) and
  AALTs ("ALTs _ _" [77,77] 78) and
  ASEQ ("SEQ _ _ _" [79, 79,79] 78) and
  ASTAR ("STAR _ _" [79, 79] 78) and
  ANTIMES ("NT _ _ _" [79, 79, 79] 78) and

  code ("code _" [79] 74) and
  intern ("_\<^latex>\<open>\\mbox{$^\\uparrow$}\<close>" [900] 80) and
  erase ("_\<^latex>\<open>\\mbox{$^\\downarrow$}\<close>" [1000] 74) and
  bnullable ("bnullable _" [1000] 80) and
  bsimp_AALTs ("bsimpALT _ _" [10,1000] 80) and
  bsimp_ASEQ ("bsimpSEQ _ _ _" [10,1000,1000] 80) and
  bmkeps ("bmkeps _" [1000] 80) and

  eq1 ("_ \<approx> _" [77,77] 76) and 

  srewrite ("_\<^latex>\<open>\\mbox{$\\,\\stackrel{s}{\\leadsto}$}\<close> _" [71, 71] 80) and
  rrewrites ("_ \<^latex>\<open>\\mbox{$\\,\\leadsto^*$}\<close> _" [71, 71] 80) and
  srewrites ("_ \<^latex>\<open>\\mbox{$\\,\\stackrel{s}{\\leadsto}^*$}\<close> _" [71, 71] 80) and
  blexer_simp ("blexer\<^sup>+" 1000) 


lemma better_retrieve:
   shows "rs \<noteq> Nil ==> retrieve (AALTs bs (r#rs)) (Left v) = bs @ retrieve r v"
   and   "rs \<noteq> Nil ==> retrieve (AALTs bs (r#rs)) (Right v) = bs @ retrieve (AALTs [] rs) v"
  apply (metis list.exhaust retrieve.simps(4))
  by (metis list.exhaust retrieve.simps(5))


lemma better_retrieve2:
  shows "retrieve (ANTIMES bs r (n + 1)) (Stars (v#vs)) = 
     bs @ [Z] @ retrieve r v @ retrieve (ANTIMES [] r n) (Stars vs)"
by auto
(*>*)

section {* Introduction *}

text {*

In the last fifteen or so years, Brzozowski's derivatives of regular
expressions have sparked quite a bit of interest in the functional
programming and theorem prover communities.
Derivatives of
regular expressions, written @{term "der c r"}, give a simple solution
to the problem of matching a string @{term s} with a regular
expression @{term r}: if the derivative of @{term r} w.r.t.\ (in
succession) all the characters of the string matches the empty string,
then @{term r} matches @{term s} (and {\em vice versa}).  
The beauty of
Brzozowski's derivatives \cite{Brzozowski1964} is that they are neatly
expressible in any functional language, and easily definable and
reasoned about in theorem provers---the definitions just consist of
inductive datatypes and simple recursive functions. Another attractive
feature of derivatives is that they can be easily extended to \emph{bounded}
regular expressions, such as @{term "r"}$^{\{@{text n}\}}$ or @{term r}$^{\{..@{text n}\}}$, where numbers or
intervals of numbers specify how many times a regular expression should be used
during matching.




However, there are two difficulties with derivative-based matchers:
First, Brzozowski's original matcher only generates a yes/no answer
for whether a regular expression matches a string or not.  This is too
little information in the context of lexing where separate tokens must
be identified and also classified (for example as keywords
or identifiers).  Sulzmann and Lu~\cite{Sulzmann2014} overcome this
difficulty by cleverly extending Brzozowski's matching
algorithm. Their extended version generates additional information on
\emph{how} a regular expression matches a string following the POSIX
rules for regular expression matching. They achieve this by adding a
second ``phase'' to Brzozowski's algorithm involving an injection
function.  In our own earlier work we provided the formal
specification of what POSIX matching means and proved in Isabelle/HOL
the correctness
of Sulzmann and Lu's extended algorithm accordingly
\cite{AusafDyckhoffUrban2016}.

The second difficulty is that Brzozowski's derivatives can 
grow arbitrarily large. For example if we start with the
regular expression \mbox{@{text "(a + aa)\<^sup>*"}} and take
successive derivatives according to the character $a$, we end up with
a sequence of ever-growing derivatives like 

\def\ll{\stackrel{\_\backslash{} a}{\longrightarrow}}
\begin{center}
\begin{tabular}{rll}
$(a + aa)^*$ & $\ll$ & $(\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & $(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* \;+\; (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & $(\ZERO + \ZERO{}a + \ZERO) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^* \;+\; $\\
& & $\qquad(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & \ldots \hspace{15mm}(regular expressions of sizes 98, 169, 283, 468, 767, \ldots)
\end{tabular}
\end{center}
 
\noindent where after around 35 steps we run out of memory on a
typical computer (we shall define shortly the precise details of our
regular expressions and the derivative operation).  Clearly, the
notation involving $\ZERO$s and $\ONE$s already suggests
simplification rules that can be applied to regular
expressions, for example $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r
\Rightarrow r$, $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow
r$. While such simple-minded simplifications have been proved in our
earlier work to preserve the correctness of Sulzmann and Lu's
algorithm \cite{AusafDyckhoffUrban2016}, they unfortunately do
\emph{not} help with limiting the growth of the derivatives shown
above: the growth is slowed, but some derivatives can still grow rather
quickly beyond any finite bound.


Sulzmann and Lu address this ``growth problem'' in a second algorithm
\cite{Sulzmann2014} where they introduce bitcoded
regular expressions. In this version, POSIX values are
represented as bitsequences and such sequences are incrementally generated
when derivatives are calculated. The compact representation
of bitsequences and regular expressions allows them to define a more
``aggressive'' simplification method that keeps the size of the
derivatives finitely bounded no matter what the length of the string is.
They make some informal claims about the correctness and linear behaviour
of this version, but do not provide any supporting proof arguments, not
even ``pencil-and-paper'' arguments. They write about their bitcoded
\emph{incremental parsing method} (that is the algorithm to be fixed and formalised
in this paper):
%
\begin{quote}\it
  ``Correctness Claim: We further claim that the incremental parsing
  method [..] in combination with the simplification steps [..]
  yields POSIX parse trees. We have tested this claim
  extensively [..] but yet
  have to work out all proof details.'' \cite[Page 14]{Sulzmann2014}
\end{quote}  

\noindent{}\textbf{Contributions:} We fill this gap by implementing in Isabelle/HOL
our version of the derivative-based lexing algorithm of Sulzmann and Lu
\cite{Sulzmann2014} where regular expressions are annotated with
bitsequences. We define the crucial simplification function as a
recursive function, without the need for a fixpoint operation. One objective of
the simplification function is to remove duplicates of regular
expressions.  For this Sulzmann and Lu use in their paper the standard
@{text nub} function from Haskell's list library, but this function
does not achieve the intended objective with bitcoded regular expressions.  The
reason is that in the bitcoded setting, each copy generally has a
different bitcode annotation---so @{text nub} would never ``fire''.
Inspired by Scala's library for lists, we shall instead use a @{text
distinctWith} function that finds duplicates under an ``erasing'' function
that deletes bitcodes before comparing regular expressions.
We shall also introduce our \emph{own} arguments and definitions for
establishing the correctness of the bitcoded algorithm when 
simplifications are included. Finally we
establish that the size of derivatives can be finitely bounded.\medskip%\footnote{ In this paper, we shall first briefly introduce the basic notions
%of regular expressions and describe the definition
%of POSIX lexing from our earlier work \cite{AusafDyckhoffUrban2016}. This serves
%as a reference point for what correctness means in our Isabelle/HOL proofs. We shall then prove
%the correctness for the bitcoded algorithm without simplification, and
%after that extend the proof to include simplification.}\smallskip
%\mbox{}\\[-10mm]

\noindent In this paper, we shall first briefly introduce the basic notions
of regular expressions and describe the definition
of POSIX lexing from our earlier work \cite{AusafDyckhoffUrban2016}. This serves
as a reference point for what correctness means in our Isabelle/HOL proofs. We shall then prove
the correctness for the bitcoded algorithm without simplification, and
after that extend the proof to include simplification.
Our Isabelle code including the results from Sec.~5 is available
from \textcolor{darkblue}{\url{https://github.com/urbanchr/posix}}.
\mbox{}\\[-6mm]

*}

section {* Background *}

text {*
  In our Isabelle/HOL formalisation strings are lists of characters with
  the empty string being represented by the empty list, written $[]$,
  and list-cons being written as $\_\!::\!\_\,$; string
  concatenation is $\_ \,@\, \_\,$. We often use the usual
  bracket notation for lists also for strings; for example a string
  consisting of just a single character $c$ is written $[c]$.   
  Our regular expressions are defined as the following inductive
  datatype:\\[-4mm]
  %
  \begin{center}
  @{text "r ::="} \;
  @{const "ZERO"} $\mid$
  @{const "ONE"} $\mid$
  @{term "CH c"} $\mid$
  @{term "ALT r\<^sub>1 r\<^sub>2"} $\mid$
  @{term "SEQ r\<^sub>1 r\<^sub>2"} $\mid$
  @{term "STAR r"} $\mid$
  @{term "NTIMES r n"}
  \end{center}

  \noindent where @{const ZERO} stands for the regular expression that
  does not match any string, @{const ONE} for the regular expression
  that matches only the empty string and @{term c} for matching a
  character literal.  The constructors $+$ and $\cdot$ represent
  alternatives and sequences, respectively.  We sometimes omit the
  $\cdot$ in a sequence regular expression for brevity.  The
  \emph{language} of a regular expression, written $L(r)$, is defined
  as usual and we omit giving the definition here (see for example
  \cite{AusafDyckhoffUrban2016}).
  
  In our work here we also add to the usual ``basic'' regular
  expressions the \emph{bounded} regular expression @{term "NTIMES r
  n"} where the @{term n} specifies that @{term r} should match
  exactly @{term n} times (this constructor is not included in Sulzmann and Lu's original work). For brevity we omit the other bounded
  regular expressions @{text "r"}$^{\{..\textit{n}\}}$,
  @{text "r"}$^{\{\textit{n}..\}}$
  and @{text "r"}$^{\{\textit{n}..\textit{m}\}}$ which specify intervals for how many
  times @{text r} should match. The results presented in this paper
  extend straightforwardly to them too. The importance of the bounded
  regular expressions is that they are often used in practical
  applications, such as Snort (a system for detecting network
  intrusions) and also in XML Schema definitions. According to Bj\"{o}rklund et
  al.~\cite{BjorklundMartensTimm2015}, bounded regular expressions 
  occur frequently in the latter and can have counters of up to
  ten million.  The problem is that tools based on the classic notion
  of automata need to expand @{text "r"}$^{\{\textit{n}\}}$ into @{text n}
  connected copies of the automaton for @{text r}. This leads to very
  inefficient matching algorithms or algorithms that consume large
  amounts of memory.  A classic example is the regular expression
  \mbox{@{term "SEQ (SEQ (STAR (ALT a b)) a) (NTIMES (ALT a b) n)"}}
  where the minimal DFA requires at least $2^{n + 1}$ states (see
  \cite{CountingSet2020}). Therefore regular expression matching
  libraries that rely on the classic notion of DFAs often impose 
  ad hoc limits for bounded regular expressions: for example in the
  regular expression matching library in the Go language and also in Google's RE2 library the regular expression
  @{term "NTIMES a 1001"} is not permitted, because no counter can be
  above 1000; and in the regular expression library in Rust
  expressions such as @{text "a\<^bsup>{1000}{100}{5}\<^esup>"} give an error
  message for being too big. Up until recently,\footnote{up until version 1.5.4 of the regex
  library in Rust; see also CVE-2022-24713.} Rust,
  however, happily generated automata for regular expressions such as 
  @{text "a\<^bsup>{0}{4294967295}\<^esup>"}. This was due to a bug
  in the algorithm that decides when a regular expression is acceptable
  or too big according to Rust's classification (it did not account for the fact that @{text "a\<^bsup>{0}\<^esup>"} and similar examples can match the empty string). We shall come back to
  this example later in the paper. 
  These problems can of course be solved in matching
  algorithms where automata go beyond the classic notion and for
  instance include explicit counters (e.g.~\cite{CountingSet2020}).
  The point here is that Brzozowski derivatives and the algorithms by
  Sulzmann and Lu can be straightforwardly extended to deal with
  bounded regular expressions and moreover the resulting code
  still consists of only simple recursive functions and inductive
  datatypes. Finally, bounded regular expressions 
  do not destroy our finite boundedness property, which we shall
  prove later on.

  %, because during the lexing process counters will only be
  %decremented.
  

  Central to Brzozowski's regular expression matcher are two functions
  called @{text nullable} and \emph{derivative}. The latter is written
  $r\backslash c$ for the derivative of the regular expression $r$
  w.r.t.~the character $c$. Both functions are defined by recursion over
  regular expressions.  
%
\begin{center}
\begin{tabular}{c @ {\hspace{-9mm}}c}
  \begin{tabular}{r@ {\hspace{2mm}}c@ {\hspace{2mm}}l}
  @{thm (lhs) der.simps(1)} & $\dn$ & @{thm (rhs) der.simps(1)}\\
  @{thm (lhs) der.simps(2)} & $\dn$ & @{thm (rhs) der.simps(2)}\\
  @{thm (lhs) der.simps(3)} & $\dn$ & @{thm (rhs) der.simps(3)}\\
  @{thm (lhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]}\\
  @{thm (lhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{text "if"} @{term "nullable(r\<^sub>1)"}\\
  & & @{text "then"} @{term "ALT (SEQ (der c r\<^sub>1) r\<^sub>2) (der c r\<^sub>2)"}\\
  & & @{text "else"} @{term "SEQ (der c r\<^sub>1) r\<^sub>2"}\\
  % & & @{thm (rhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]}\\
  @{thm (lhs) der.simps(6)} & $\dn$ & @{thm (rhs) der.simps(6)}\\
  @{thm (lhs) der.simps(7)} & $\dn$ & @{thm (rhs) der.simps(7)}
  \end{tabular}
  &
  \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
  @{thm (lhs) nullable.simps(1)} & $\dn$ & @{thm (rhs) nullable.simps(1)}\\
  @{thm (lhs) nullable.simps(2)} & $\dn$ & @{thm (rhs) nullable.simps(2)}\\
  @{thm (lhs) nullable.simps(3)} & $\dn$ & @{thm (rhs) nullable.simps(3)}\\
  @{thm (lhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
  @{thm (lhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
  @{thm (lhs) nullable.simps(6)} & $\dn$ & @{thm (rhs) nullable.simps(6)}\\
  @{thm (lhs) nullable.simps(7)} & $\dn$ & @{text "if"} @{text "n = 0"}\\
  & & @{text "then"} @{const "True"}\\
  & & @{text "else"} @{term "nullable r"}
  \end{tabular}  
\end{tabular}  
\end{center}

  \noindent
  We can extend this definition to give derivatives w.r.t.~strings,
  namely as @{thm (lhs) ders.simps(1)}$\dn$@{thm (rhs) ders.simps(1)}
  and @{thm (lhs) ders.simps(2)}$\dn$@{thm (rhs) ders.simps(2)}.
  Using @{text nullable} and the derivative operation, we can
define a simple regular expression matcher, namely
$@{text "match s r"} \dn @{term nullable}(r\backslash s)$.
This is essentially Brzozowski's algorithm from 1964. Its
main virtue is that the algorithm can be easily implemented as a
functional program (either in a functional programming language or in
a theorem prover). The correctness of @{text match} amounts to
establishing the property:%\footnote{It is a fun exercise to formally prove this property in a theorem prover.}
%
\begin{proposition}\label{matchcorr} 
@{text "match s r"} \;\;\text{if and only if}\;\; $s \in L(r)$
\end{proposition}

\noindent
It is a fun exercise to formally prove this property in a theorem prover.
We are aware
of a mechanised correctness proof of Brzozowski's derivative-based matcher in HOL4 by
Owens and Slind~\cite{Owens2008}. Another one in Isabelle/HOL is part
of the work by Krauss and Nipkow~\cite{Krauss2011}.  And another one
in Coq is given by Coquand and Siles \cite{Coquand2012}.
Also Ribeiro and Du Bois give one in Agda~\cite{RibeiroAgda2017}.
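
\noindent
To give an impression of how little code this matcher requires, below
is a sketch of it in Haskell, transcribing the definitions above (the
names and the transcription are ours, not part of the formalisation):

\begin{verbatim}
-- Brzozowski's matcher, transcribed from the definitions above
-- (a sketch; names are ours). NTIMES r n models r^{n}.
data Rexp = ZERO | ONE | CH Char
          | ALT Rexp Rexp | SEQ Rexp Rexp
          | STAR Rexp | NTIMES Rexp Int

nullable :: Rexp -> Bool
nullable ZERO          = False
nullable ONE           = True
nullable (CH _)        = False
nullable (ALT r1 r2)   = nullable r1 || nullable r2
nullable (SEQ r1 r2)   = nullable r1 && nullable r2
nullable (STAR _)      = True
nullable (NTIMES r n)  = n == 0 || nullable r

der :: Char -> Rexp -> Rexp
der _ ZERO          = ZERO
der _ ONE           = ZERO
der c (CH d)        = if c == d then ONE else ZERO
der c (ALT r1 r2)   = ALT (der c r1) (der c r2)
der c (SEQ r1 r2)   = if nullable r1
                      then ALT (SEQ (der c r1) r2) (der c r2)
                      else SEQ (der c r1) r2
der c (STAR r)      = SEQ (der c r) (STAR r)
der c (NTIMES r n)  = if n == 0 then ZERO
                      else SEQ (der c r) (NTIMES r (n - 1))

ders :: String -> Rexp -> Rexp
ders s r = foldl (flip der) r s

-- e.g. match "aaa" (STAR (ALT (CH 'a') (SEQ (CH 'a') (CH 'a'))))
-- evaluates to True
match :: String -> Rexp -> Bool
match s r = nullable (ders s r)
\end{verbatim}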

The novel idea of Sulzmann and Lu is to extend this algorithm for 
lexing, where it is important to find out which part of the string
is matched by which part of the regular expression.
For this Sulzmann and Lu presented two lexing algorithms in their
paper~\cite{Sulzmann2014}. The first algorithm consists of two phases: first a
  matching phase (which is Brzozowski's algorithm) and then a value
  construction phase. The values encode \emph{how} a regular expression
  matches a string. \emph{Values} are defined as the inductive datatype
  %
  \begin{center}
  @{text "v ::="}
  @{const "Void"} $\mid$
  @{term "val.Char c"} $\mid$
  @{term "Left v"} $\mid$
  @{term "Right v"} $\mid$
  @{term "Seq v\<^sub>1 v\<^sub>2"} $\mid$ 
  @{term "Stars vs"} 
  \end{center}  

  \noindent where we use @{term vs} to stand for a list of values. The
  string underlying a value can be calculated by a @{const flat}
  function, written @{term "flat DUMMY"}. It traverses a value and
  collects the characters contained in it (see \cite{AusafDyckhoffUrban2016}).
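
  For illustration, the values and the @{const flat} function can be
  transcribed into Haskell as follows (a sketch; we prefix the
  constructors with @{text V} in order to avoid clashes with Haskell's
  Prelude):

\begin{verbatim}
-- Values and the flat function (a sketch; constructors carry a
-- V-prefix to avoid clashing with Haskell's Prelude).
data Val = VEmpty | VChar Char | VLeft Val | VRight Val
         | VSeq Val Val | VStars [Val]

-- flat recovers the string underlying a value
flat :: Val -> String
flat VEmpty        = []
flat (VChar c)     = [c]
flat (VLeft v)     = flat v
flat (VRight v)    = flat v
flat (VSeq v1 v2)  = flat v1 ++ flat v2
flat (VStars vs)   = concatMap flat vs
\end{verbatim}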


  Sulzmann and Lu also define inductively an
  inhabitation relation that associates values with regular expressions. Our
  version of this relation is defined by the following seven rules:
  %
  \begin{center}
  \begin{tabular}{@ {}l@ {}}
  @{thm[mode=Axiom] Prf.intros(4)}\qquad
  @{thm[mode=Rule] Prf.intros(2)[of "v\<^sub>1" "r\<^sub>1" "r\<^sub>2"]}\qquad
  @{thm[mode=Rule] Prf.intros(3)[of "v\<^sub>2" "r\<^sub>2" "r\<^sub>1"]}\qquad
  @{thm[mode=Rule] Prf.intros(1)[of "v\<^sub>1" "r\<^sub>1" "v\<^sub>2" "r\<^sub>2"]}\medskip\\
  @{thm[mode=Axiom] Prf.intros(5)[of "c"]}\qquad
  @{thm[mode=Rule] Prf.intros(6)[of "vs" "r"]}\qquad
  $\mprset{flushleft}\inferrule{
  @{thm (prem 1) Prf.intros(7)[of "vs\<^sub>1" "r"  "vs\<^sub>2" "n"]}\\\\
  @{thm (prem 2) Prf.intros(7)[of "vs\<^sub>1" "r"  "vs\<^sub>2" "n"]}\quad
  @{thm (prem 3) Prf.intros(7)[of "vs\<^sub>1" "r"  "vs\<^sub>2" "n"]}
  }
  {@{thm (concl) Prf.intros(7)[of "vs\<^sub>1" "r"  "vs\<^sub>2" "n"]}}
  $
  \end{tabular}
  \end{center}

  \noindent Note that no values are associated with the regular expression
  @{term ZERO}, since it cannot match any string. Of interest is our version
  of the rule for @{term "STAR r"}, where we require that each value
  in  @{term vs} flattens to a non-empty string. This means if @{term "STAR r"} ``fires''
  one or more times, then each copy needs to match a non-empty string.
  Similarly, in the rule
  for @{term "NTIMES r n"} we require that the length of the list
  @{term "vs\<^sub>1 @ vs\<^sub>2"} equals @{term n} (meaning the regular expression
  @{text r} matches @{text n} times) and that the first segment of this list
  contains values that flatten to non-empty strings followed by a segment that
  only contains values that flatten to the empty string. 
  It is routine to establish how values ``inhabiting'' a regular
  expression correspond to the language of a regular expression, namely
  @{thm L_flat_Prf}.

  In general there is more than one value inhabiting a regular
  expression (meaning regular expressions can typically match more
  than one string). But even when fixing a string from the language of the
  regular expression, there is generally more than one way in which the
  regular expression can match this string. POSIX lexing is about
  identifying the unique value for a given regular expression and a
  string that satisfies the informal POSIX rules (see
  \cite{POSIX,Kuklewicz,OkuiSuzuki2010,Sulzmann2014,Vansummeren2006}).
  %\footnote{POSIX
  %	lexing acquired its name from the fact that the corresponding
  %	rules were described as part of the POSIX specification for
  %	Unix-like operating systems \cite{POSIX}.}
  Sometimes these
  informal rules are called the \emph{maximal munch rule} and \emph{rule priority}.
  One contribution of our earlier paper is to give a convenient
 specification for what POSIX values are (the inductive rules are shown in
  Fig.~\ref{POSIXrules}).

\begin{figure}[t]
  \begin{center}\small%
  \begin{tabular}{@ {\hspace{-2mm}}c@ {}}
  \\[-8.5mm]
  @{thm[mode=Axiom] Posix.intros(1)}\<open>P\<close>@{term "ONE"} \quad
  @{thm[mode=Axiom] Posix.intros(2)}\<open>P\<close>@{term "c"}\quad
  @{thm[mode=Rule] Posix.intros(3)[of "s" "r\<^sub>1" "v" "r\<^sub>2"]}\<open>P+L\<close>\quad
  @{thm[mode=Rule] Posix.intros(4)[of "s" "r\<^sub>2" "v" "r\<^sub>1"]}\<open>P+R\<close>\medskip\\
  $\mprset{flushleft}
   \inferrule
   {@{thm (prem 1) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \qquad
    @{thm (prem 2) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \qquad
    @{thm (prem 3) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}
   {@{thm (concl) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}$\<open>PS\<close>\medskip\\
  @{thm[mode=Axiom] Posix.intros(7)}\<open>P[]\<close>\qquad
  $\mprset{flushleft}
   \inferrule
   {@{thm (prem 1) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
    @{thm (prem 2) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
    @{thm (prem 3) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \\\\
    @{thm (prem 4) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}
   {@{thm (concl) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}$\<open>P\<star>\<close>\medskip\\
  \mprset{sep=4mm} 
  @{thm[mode=Rule] Posix.intros(9)}\<open>Pn[]\<close>\quad\; 
  $\mprset{flushleft}
   \inferrule
   {@{thm (prem 1) Posix.intros(8)[of "s\<^sub>1" "r" "v" "s\<^sub>2" n "vs"]} \qquad
    @{thm (prem 2) Posix.intros(8)[of "s\<^sub>1" "r" "v" "s\<^sub>2" n "vs"]} \qquad
    @{thm (prem 3) Posix.intros(8)[of "s\<^sub>1" "r" "v" "s\<^sub>2" n "vs"]} \\\\
    @{thm (prem 4) Posix.intros(8)[of "s\<^sub>1" "r" "v" "s\<^sub>2" n "vs"]}}
   {@{thm (concl) Posix.intros(8)[of "s\<^sub>1" "r" "v" "s\<^sub>2" n "vs"]}}$\<open>Pn+\<close>
  \\[-4mm]   
  \end{tabular}
  \end{center}
  \caption{The inductive definition of POSIX values taken from our earlier paper \cite{AusafDyckhoffUrban2016}. The ternary relation, written $(s, r) \rightarrow v$, formalises the notion
  that, given a string $s$ and a regular
  expression $r$, the value $v$ is the unique one that satisfies the informal POSIX constraints for
  regular expression matching.\smallskip}\label{POSIXrules}
  \end{figure}

  The clever idea by Sulzmann and Lu \cite{Sulzmann2014} in their first algorithm is to define
  an injection function on values that mirrors (but inverts) the
  construction of the derivative on regular expressions. Essentially it
  injects back a character into a value.
  For this they define two functions called @{text mkeps} and @{text inj}:
 %
  \begin{center}
  \begin{tabular}{@ {}l@ {}}
  \begin{tabular}{@ {}lcl@ {}}
  @{thm (lhs) mkeps.simps(1)} & $\dn$ & @{thm (rhs) mkeps.simps(1)}\\
  @{thm (lhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]}\\
  @{thm (lhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]}\\
  @{thm (lhs) mkeps.simps(4)} & $\dn$ & @{thm (rhs) mkeps.simps(4)}\\
  @{thm (lhs) mkeps.simps(5)} & $\dn$ & @{thm (rhs) mkeps.simps(5)}
  \end{tabular}
  %\end{tabular}
  %\end{center}
  \smallskip\\

  %\begin{center}
  %\begin{tabular}{@ {}cc@ {}}
  \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}@ {}}
  @{thm (lhs) injval.simps(1)} & $\dn$ & @{thm (rhs) injval.simps(1)[of "c"]}\\
  @{thm (lhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]} & $\dn$ & 
      @{thm (rhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]}\\
  @{thm (lhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$ & 
      @{thm (rhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\
  %!
  @{thm (lhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$ 
      & @{thm (rhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\
  @{thm (lhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$ 
      & @{thm (rhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\
  @{thm (lhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$ &
  @{thm (rhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\
  %!
  @{thm (lhs) injval.simps(7)[of "r" "c" "v" "vs"]} & $\dn$ 
      & @{thm (rhs) injval.simps(7)[of "r" "c" "v" "vs"]}\\
  @{thm (lhs) injval.simps(8)[of "r" "n" "c" "v" "vs"]} & $\dn$ 
      & @{thm (rhs) injval.simps(8)[of "r" "n" "c" "v" "vs"]}
  \end{tabular}
  %!&
  %!\begin{tabular}{@ {\hspace{-3mm}}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}@ {}}
  
  %!\end{tabular}
  %!\end{tabular}
  \end{tabular}%!\smallskip
  \end{center}

  \noindent
  The function @{text mkeps} is run when the last derivative is nullable, that is,
  when the string to be matched is in the language of the regular expression. It generates
  a value for how the last derivative can match the empty string. In case
  of @{term "NTIMES r n"} we use the function @{term replicate} in order to generate
  a list of exactly @{term n} copies, which is the length of the list we expect in this
  case.  The injection function\footnote{While the character argument @{text c} is not
  strictly necessary in the @{text inj}-function for the fragment of regular expressions we
  use in this paper, it is necessary for extended regular expressions, for example for range regular expressions of the form @{text "[a-z]"}.
  We therefore keep this argument from the original formulation of @{text inj} by Sulzmann and Lu.}
  then calculates the corresponding value for each intermediate derivative until
  a value for the original regular expression is generated.
  Graphically the algorithm by
  Sulzmann and Lu can be illustrated by the following picture %in Figure~\ref{Sulz}
  where the path from the left to the right involving @{term derivatives}/@{const
  nullable} is the first phase of the algorithm (calculating successive
  \Brz's derivatives) and @{const mkeps}/@{text inj}, the path from right to
  left, the second phase.
%
\begin{center}
\begin{tikzpicture}[scale=0.85,node distance=8mm,
                    every node/.style={minimum size=6mm}]
\node (r1)  {@{term "r\<^sub>1"}};
\node (r2) [right=of r1]{@{term "r\<^sub>2"}};
\draw[->,line width=1mm](r1)--(r2) node[above,midway] {@{term "der a DUMMY"}};
\node (r3) [right=of r2]{@{term "r\<^sub>3"}};
\draw[->,line width=1mm](r2)--(r3) node[above,midway] {@{term "der b DUMMY"}};
\node (r4) [right=of r3]{@{term "r\<^sub>4"}};
\draw[->,line width=1mm](r3)--(r4) node[above,midway] {@{term "der c DUMMY"}};
\draw (r4) node[anchor=west] {\;\raisebox{3mm}{@{term nullable}}};
\node (v4) [below=of r4]{@{term "v\<^sub>4"}};
\draw[->,line width=1mm](r4) -- (v4);
\node (v3) [left=of v4] {@{term "v\<^sub>3"}};
\draw[->,line width=1mm](v4)--(v3) node[below,midway] {\<open>inj r\<^sub>3 c\<close>};
\node (v2) [left=of v3]{@{term "v\<^sub>2"}};
\draw[->,line width=1mm](v3)--(v2) node[below,midway] {\<open>inj r\<^sub>2 b\<close>};
\node (v1) [left=of v2] {@{term "v\<^sub>1"}};
\draw[->,line width=1mm](v2)--(v1) node[below,midway] {\<open>inj r\<^sub>1 a\<close>};
\draw (r4) node[anchor=north west] {\;\raisebox{-5mm}{@{term "mkeps"}}};
\end{tikzpicture}
\end{center}
%
\noindent
  The picture shows the steps required when a
  regular expression, say @{text "r\<^sub>1"}, matches the string @{term
  "[a,b,c]"}. The first lexing algorithm by Sulzmann and Lu can be defined as:%\\[-8mm]

%  \begin{figure}[t]
%\begin{center}
%\begin{tikzpicture}[scale=1,node distance=1cm,
%                    every node/.style={minimum size=6mm}]
%\node (r1)  {@{term "r\<^sub>1"}};
%\node (r2) [right=of r1]{@{term "r\<^sub>2"}};
%\draw[->,line width=1mm](r1)--(r2) node[above,midway] {@{term "der a DUMMY"}};
%\node (r3) [right=of r2]{@{term "r\<^sub>3"}};
%\draw[->,line width=1mm](r2)--(r3) node[above,midway] {@{term "der b DUMMY"}};
%\node (r4) [right=of r3]{@{term "r\<^sub>4"}};
%\draw[->,line width=1mm](r3)--(r4) node[above,midway] {@{term "der c DUMMY"}};
%\draw (r4) node[anchor=west] {\;\raisebox{3mm}{@{term nullable}}};
%\node (v4) [below=of r4]{@{term "v\<^sub>4"}};
%\draw[->,line width=1mm](r4) -- (v4);
%\node (v3) [left=of v4] {@{term "v\<^sub>3"}};
%\draw[->,line width=1mm](v4)--(v3) node[below,midway] {\<open>inj r\<^sub>3 c\<close>};
%\node (v2) [left=of v3]{@{term "v\<^sub>2"}};
%\draw[->,line width=1mm](v3)--(v2) node[below,midway] {\<open>inj r\<^sub>2 b\<close>};
%\node (v1) [left=of v2] {@{term "v\<^sub>1"}};
%\draw[->,line width=1mm](v2)--(v1) node[below,midway] {\<open>inj r\<^sub>1 a\<close>};
%\draw (r4) node[anchor=north west] {\;\raisebox{-8mm}{@{term "mkeps"}}};
%\end{tikzpicture}
%\end{center}
%\mbox{}\\[-13mm]
%
%\caption{The two phases of the first algorithm by Sulzmann \& Lu \cite{Sulzmann2014},
%matching the string @{term "[a,b,c]"}. The first phase (the arrows from 
%left to right) is \Brz's matcher building successive derivatives. If the 
%last regular expression is @{term nullable}, then the functions of the 
%second phase are called (the top-down and right-to-left arrows): first 
%@{term mkeps} calculates a value @{term "v\<^sub>4"} witnessing
%how the empty string has been recognised by @{term "r\<^sub>4"}. After
%that the function @{term inj} ``injects back'' the characters of the string into
%the values. The value @{term "v\<^sub>1"} is the result of the algorithm representing
%the POSIX value for this string and
%regular expression.
%\label{Sulz}}
%\end{figure} 



  \begin{center}
  \begin{tabular}{lcl}
  @{thm (lhs) lexer.simps(1)} & $\dn$ & @{thm (rhs) lexer.simps(1)}\\
  @{thm (lhs) lexer.simps(2)} & $\dn$ & @{text "case"} @{term "lexer (der c r) s"} @{text of}
                     @{term "None"}  @{text "\<Rightarrow>"} @{term None}
                     %%& & \hspace{27mm}
		     $|\;$ @{term "Some v"} @{text "\<Rightarrow>"} @{term "Some (injval r c v)"}                          
  \end{tabular}
  \end{center}
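
  \noindent
  Below is a Haskell sketch of this first algorithm, transcribing the
  tables for @{text mkeps} and @{text inj} shown above (our
  transcription; like the originals, the functions are partial and
  only ever applied to suitable inputs):

\begin{verbatim}
-- mkeps, inj and the lexer (a sketch; partial functions, just
-- like the definitions above)
mkeps :: Rexp -> Val
mkeps ONE           = VEmpty
mkeps (ALT r1 r2)   = if nullable r1 then VLeft (mkeps r1)
                      else VRight (mkeps r2)
mkeps (SEQ r1 r2)   = VSeq (mkeps r1) (mkeps r2)
mkeps (STAR _)      = VStars []
mkeps (NTIMES r n)  = VStars (replicate n (mkeps r))

inj :: Rexp -> Char -> Val -> Val
inj (CH d)        _ VEmpty                = VChar d
inj (ALT r1 _)    c (VLeft v1)            = VLeft (inj r1 c v1)
inj (ALT _ r2)    c (VRight v2)           = VRight (inj r2 c v2)
inj (SEQ r1 _)    c (VSeq v1 v2)          = VSeq (inj r1 c v1) v2
inj (SEQ r1 _)    c (VLeft (VSeq v1 v2))  = VSeq (inj r1 c v1) v2
inj (SEQ r1 r2)   c (VRight v2)           = VSeq (mkeps r1) (inj r2 c v2)
inj (STAR r)      c (VSeq v (VStars vs))  = VStars (inj r c v : vs)
inj (NTIMES r _)  c (VSeq v (VStars vs))  = VStars (inj r c v : vs)

lexer :: Rexp -> String -> Maybe Val
lexer r []      = if nullable r then Just (mkeps r) else Nothing
lexer r (c : s) = case lexer (der c r) s of
                    Nothing -> Nothing
                    Just v  -> Just (inj r c v)
\end{verbatim}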


We have shown in our earlier paper \cite{AusafDyckhoffUrban2016} that
this algorithm is correct, that is, it generates POSIX values. The
central property we established relates the derivative operation to the
injection function.

  \begin{proposition}\label{Posix2}
	\textit{If} $(s,\; r\backslash c) \rightarrow v$ \textit{then} $(c :: s,\; r) \rightarrow$ \textit{inj} $r\; c\; v$. 
\end{proposition}

  \noindent
  With this in place we were able to prove:

  \begin{proposition}\mbox{}\label{lexercorrect}
  \textrm{(1)}\; @{thm (lhs) lexer_correct_None} if and only if @{thm (rhs) lexer_correct_None}.\\
  \mbox{\hspace{29.5mm}}\textrm{(2)}\; @{thm (lhs) lexer_correct_Some} if and only if @{thm (rhs) lexer_correct_Some}.
  %
  % \smallskip\\
  %\begin{tabular}{ll}
  %(1) & @{thm (lhs) lexer_correct_None} \;if and only if\; @{thm (rhs) lexer_correct_None}\\
  %(2) & @{thm (lhs) lexer_correct_Some} \;if and only if\; @{thm (rhs) lexer_correct_Some}\\
  %\end{tabular}
  \end{proposition}

  \noindent
  In fact we have shown that, in the success case, the generated POSIX value $v$ is
  unique and in the failure case that there is no POSIX value $v$ that satisfies
  $(s, r) \rightarrow v$. While the algorithm is correct, it is excruciatingly
  slow in cases where the derivatives grow arbitrarily (recall the example from the
  Introduction). However it can be used as a convenient reference point for the correctness
  proof of the second algorithm by Sulzmann and Lu, which we shall describe next.
  
*}

section {* Bitcoded Regular Expressions and Derivatives *}

text {*

  In the second part of their paper \cite{Sulzmann2014},
  Sulzmann and Lu describe another algorithm that also generates POSIX
  values but dispenses with the second phase where characters are
  injected ``back'' into values. For this they annotate regular expressions
  with bitcodes, which we define in Isabelle/HOL as the datatype\medskip

  \begin{center}
  %\noindent
  %\begin{minipage}{0.9\textwidth}
  \,@{term breg} $\,::=\,$ @{term "AZERO"}
               $\,\mid$ @{term "AONE bs"}
               $\,\mid$ @{term "ACHAR bs c"}
               $\,\mid$ @{term "AALTs bs rs"}
               $\,\mid$ @{term "ASEQ bs r\<^sub>1 r\<^sub>2"}
               $\,\mid$ @{term "ASTAR bs r"}
	       $\,\mid$ \mbox{@{term "ANTIMES bs r n"}}%\hfill\mbox{}
  %\end{minipage}\medskip	       
  \end{center}

  \noindent where @{text bs} stands for bitsequences; @{text r},
  @{text "r\<^sub>1"} and @{text "r\<^sub>2"} for bitcoded regular
  expressions; and @{text rs} for lists of bitcoded regular
  expressions. The binary alternative @{text "ALT bs r\<^sub>1 r\<^sub>2"}
  is just an abbreviation for \mbox{@{text "ALTs bs [r\<^sub>1, r\<^sub>2]"}}. 
  For bitsequences we use lists made up of the
  constants @{text Z} and @{text S}.  The idea with bitcoded regular
  expressions is to incrementally generate the value information (for
  example @{text Left} and @{text Right}) as bitsequences. For this 
  Sulzmann and Lu follow Nielsen and Henglein \cite{NielsenHenglein2011}
  and define a coding
  function for how values can be coded into bitsequences.

  \begin{center}
  \begin{tabular}{c@ {\hspace{5mm}}c}
  \begin{tabular}{lcl}
  @{thm (lhs) code.simps(1)} & $\dn$ & @{thm (rhs) code.simps(1)}\\
  @{thm (lhs) code.simps(2)} & $\dn$ & @{thm (rhs) code.simps(2)}\\
  @{thm (lhs) code.simps(3)} & $\dn$ & @{thm (rhs) code.simps(3)}\\
  @{thm (lhs) code.simps(4)} & $\dn$ & @{thm (rhs) code.simps(4)}
  \end{tabular}
  &
  \begin{tabular}{lcl}
  @{thm (lhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]} & $\dn$ & @{thm (rhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]}\\
  @{thm (lhs) code.simps(6)} & $\dn$ & @{thm (rhs) code.simps(6)}\\
  @{thm (lhs) code.simps(7)} & $\dn$ & @{thm (rhs) code.simps(7)}\\
  \mbox{\phantom{XX}}\\
  \end{tabular}
  \end{tabular}
  \end{center}
   
  \noindent
  As can be seen, this coding is ``lossy'' in the sense that it does not
  explicitly record character values or sequence values (for
  the latter it just appends two bitsequences). However, the
  different alternatives for @{text Left}, respectively @{text Right}, are recorded as @{text Z} and
  @{text S} followed by some bitsequence. Similarly, we use @{text Z} to indicate
  if there is still a value coming in the list of @{text Stars}, whereas @{text S}
  indicates the end of the list. The lossiness makes the process of
  decoding a bit more involved, but the point is that if we have a
  regular expression \emph{and} a bitsequence of a corresponding value,
  then we can always decode the value accurately~(see Fig.~\ref{decode}).
  The function \textit{decode} checks whether all of the bitsequence is
  consumed and returns the corresponding value as @{term "Some v"}; otherwise
  it fails with @{text "None"}. We can establish that for a value $v$
  inhabiting a regular expression $r$, the decoding of its
  bitsequence never fails (see also \cite{NielsenHenglein2011}).
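
  For concreteness, the bitcoded regular expressions, the coding
  function and the decoding functions from Fig.~\ref{decode} can be
  transcribed into Haskell as follows (a sketch; @{text "decode'"} is
  partial exactly as in the figure):

\begin{verbatim}
-- Bitcoded regular expressions, the coding and the decoding
-- functions (a sketch; decode' is partial, as in the figure)
data Bit = Z | S deriving Eq
type Bits = [Bit]

data ARexp = AZERO | AONE Bits | ACHAR Bits Char
           | AALTs Bits [ARexp] | ASEQ Bits ARexp ARexp
           | ASTAR Bits ARexp | ANTIMES Bits ARexp Int

code :: Val -> Bits
code VEmpty            = []
code (VChar _)         = []
code (VLeft v)         = Z : code v
code (VRight v)        = S : code v
code (VSeq v1 v2)      = code v1 ++ code v2
code (VStars [])       = [S]
code (VStars (v : vs)) = Z : code v ++ code (VStars vs)

decode' :: Bits -> Rexp -> (Val, Bits)
decode' bs ONE              = (VEmpty, bs)
decode' bs (CH c)           = (VChar c, bs)
decode' (Z : bs) (ALT r1 _) = let (v, bs1) = decode' bs r1
                              in (VLeft v, bs1)
decode' (S : bs) (ALT _ r2) = let (v, bs1) = decode' bs r2
                              in (VRight v, bs1)
decode' bs (SEQ r1 r2)      = let (v1, bs1) = decode' bs r1
                                  (v2, bs2) = decode' bs1 r2
                              in (VSeq v1 v2, bs2)
decode' (S : bs) (STAR _)   = (VStars [], bs)
decode' (Z : bs) (STAR r)   = let (v, bs1)         = decode' bs r
                                  (VStars vs, bs2) = decode' bs1 (STAR r)
                              in (VStars (v : vs), bs2)
decode' bs (NTIMES r _)     = decode' bs (STAR r)

decode :: Bits -> Rexp -> Maybe Val
decode bs r = let (v, bs') = decode' bs r
              in if null bs' then Just v else Nothing
\end{verbatim}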

  %The decoding can be
  %defined by using two functions called $\textit{decode}'$ and
  %\textit{decode}:

  \begin{figure}[t]
  \begin{center}
  \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
  $\textit{decode}'\,bs\,(\ONE)$ & $\;\dn\;$ & $(\Empty, bs)$\\
  $\textit{decode}'\,bs\,(c)$ & $\;\dn\;$ & $(\Char\,c, bs)$\\
  $\textit{decode}'\,(\Z\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
     $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}\;
       (\Left\,v, bs_1)$\\
  $\textit{decode}'\,(\S\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
     $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_2\;\textit{in}\;
       (\Right\,v, bs_1)$\\                           
  $\textit{decode}'\,bs\;(r_1\cdot r_2)$ & $\dn$ &
        $\textit{let}\,(v_1, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}$\\
  & &   $\textit{let}\,(v_2, bs_2) = \textit{decode}'\,bs_1\,r_2$
        \hspace{2mm}$\textit{in}\;(\Seq\,v_1\,v_2, bs_2)$\\
  $\textit{decode}'\,(\S\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\
  $\textit{decode}'\,(\Z\!::\!bs)\,(r^*)$ & $\dn$ & 
         $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r\;\textit{in}$\\
  & &   $\textit{let}\,(\Stars\,vs, bs_2) = \textit{decode}'\,bs_1\,r^*$
        \hspace{2mm}$\textit{in}\;(\Stars\,v\!::\!vs, bs_2)$\\
  $\textit{decode}'\,bs\,(r^{\{n\}})$ & $\dn$ & $\textit{decode}'\,bs\,r^*$\smallskip\smallskip\\

  $\textit{decode}\,bs\,r$ & $\dn$ & 
     $\textit{let}\,(v, bs') = \textit{decode}'\,bs\,r\;\textit{in}$\\
  & & $\;\;\;\,\textit{if}\;bs' = []\;\textit{then}\;\textit{Some}\,v\;\textit{else}\;\textit{None}$\\[-5mm]   
  \end{tabular}    
  \end{center}
  \caption{Two functions, called $\textit{decode}'$ and \textit{decode}, for decoding a value from a bitsequence with the help of a regular expression.\\[-5mm]}\label{decode}
  \end{figure}

  %\noindent
  %The function \textit{decode} checks whether all of the bitsequence is
  %consumed and returns the corresponding value as @{term "Some v"}; otherwise
  %it fails with @{text "None"}. We can establish that for a value $v$
  %inhabited by a regular expression $r$, the decoding of its
  %bitsequence never fails.

\begin{lemma}\label{codedecode}\it
  If $\;\vdash v : r$ then
  $\;\textit{decode}\,(\textit{code}\, v)\,r = \textit{Some}\, v$.
\end{lemma}



  Sulzmann and Lu define the function \emph{internalise}
  in order to transform (standard) regular expressions into annotated
  regular expressions. We write this operation as $r^\uparrow$.
  This internalisation uses the following
  \emph{fuse} function.\medskip
  
  %
  %!\begin{center}
  \noindent\begin{minipage}{\textwidth}
  \begin{tabular}{@ {}c@ {\hspace{3mm}}c@ {}}
  \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
  $\textit{fuse}\,bs\,(\textit{ZERO})$ & $\dn$ & $\textit{ZERO}$\\
  $\textit{fuse}\,bs\,(\textit{ONE}\,bs')$ & $\dn$ &
     $\textit{ONE}\,(bs\,@\,bs')$\\
  $\textit{fuse}\,bs\,(\textit{CHAR}\,bs'\,c)$ & $\dn$ &
     $\textit{CHAR}\,(bs\,@\,bs')\,c$
  \end{tabular}
  &
  \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
  $\textit{fuse}\,bs\,(\textit{ALTs}\,bs'\,rs)$ & $\dn$ &
     $\textit{ALTs}\,(bs\,@\,bs')\,rs$\\
  $\textit{fuse}\,bs\,(\textit{SEQ}\,bs'\,r_1\,r_2)$ & $\dn$ &
     $\textit{SEQ}\,(bs\,@\,bs')\,r_1\,r_2$\\
  $\textit{fuse}\,bs\,(\textit{STAR}\,bs'\,r)$ & $\dn$ &
     $\textit{STAR}\,(bs\,@\,bs')\,r$\\
  $\textit{fuse}\,bs\,(\textit{NT}\,bs'\,r\,n)$ & $\dn$ &
     $\textit{NT}\,(bs\,@\,bs')\,r\,n$   
  \end{tabular}
  \end{tabular}
  \end{minipage}\medskip
  %!\end{center}    

  \noindent
  This function ``fuses'' a bitsequence onto the topmost constructor of a bitcoded regular expression. 
  A regular expression can then be \emph{internalised} into a bitcoded
  regular expression as follows:
  %
  %!\begin{center}
  %!\begin{tabular}{@ {}c@ {\hspace{2mm}}c@ {\hspace{2mm}}c@ {}}
  %!\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
  %!$(\ZERO)^\uparrow$ & $\dn$ & $\textit{ZERO}$\\
  %!$(\ONE)^\uparrow$ & $\dn$ & $\textit{ONE}\,[]$
  %!\end{tabular}
  %!&
  %!\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
  %!$(c)^\uparrow$ & $\dn$ & $\textit{CHAR}\,[]\,c$\\
  %!$(r^*)^\uparrow$ & $\dn$ & $\textit{STAR}\;[]\,r^\uparrow$\\
  %!$(r^{\{n\}})^\uparrow$ & $\dn$ &
  %!      $\textit{NT}\;[]\,r^\uparrow\,n$
  %!\end{tabular}
  %!&
  %!\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
  %!$(r_1 + r_2)^\uparrow$ & $\dn$ &
  %!       $\textit{ALT}\;[]\,(\textit{fuse}\,[\Z]\,r_1^\uparrow)\,
  %!                          (\textit{fuse}\,[\S]\,r_2^\uparrow)$\\
  %!$(r_1\cdot r_2)^\uparrow$ & $\dn$ &
  %!       $\textit{SEQ}\;[]\,r_1^\uparrow\,r_2^\uparrow$
  %!\end{tabular}
  %!\end{tabular}
  %!\end{center}
  %!
  \begin{center}
  \begin{tabular}{@ {}c@ {\hspace{7mm}}c@ {}}
  \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
  $(\ZERO)^\uparrow$ & $\dn$ & $\textit{ZERO}$\\
  $(\ONE)^\uparrow$ & $\dn$ & $\textit{ONE}\,[]$\\
  $(c)^\uparrow$ & $\dn$ & $\textit{CHAR}\,[]\,c$\\
  $(r^*)^\uparrow$ & $\dn$ & $\textit{STAR}\;[]\,r^\uparrow$\\
  \end{tabular}
  &
  \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
  $(r_1 + r_2)^\uparrow$ & $\dn$ &
         $\textit{ALT}\;[]\,(\textit{fuse}\,[\Z]\,r_1^\uparrow)\,
                            (\textit{fuse}\,[\S]\,r_2^\uparrow)$\\
  $(r_1\cdot r_2)^\uparrow$ & $\dn$ &
         $\textit{SEQ}\;[]\,r_1^\uparrow\,r_2^\uparrow$\\
  $(r^{\{n\}})^\uparrow$ & $\dn$ &
        $\textit{NT}\;[]\,r^\uparrow\,n$	 
  \end{tabular}
  \end{tabular}
  \end{center}    
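
  \noindent
  In Haskell the two functions can be sketched as follows (@{text
  intern} is our name for the operation written $r^\uparrow$ above):

\begin{verbatim}
-- fuse and internalise (a sketch based on the definitions above)
fuse :: Bits -> ARexp -> ARexp
fuse _  AZERO              = AZERO
fuse bs (AONE bs')         = AONE (bs ++ bs')
fuse bs (ACHAR bs' c)      = ACHAR (bs ++ bs') c
fuse bs (AALTs bs' rs)     = AALTs (bs ++ bs') rs
fuse bs (ASEQ bs' r1 r2)   = ASEQ (bs ++ bs') r1 r2
fuse bs (ASTAR bs' r)      = ASTAR (bs ++ bs') r
fuse bs (ANTIMES bs' r n)  = ANTIMES (bs ++ bs') r n

intern :: Rexp -> ARexp
intern ZERO          = AZERO
intern ONE           = AONE []
intern (CH c)        = ACHAR [] c
intern (ALT r1 r2)   = AALTs [] [fuse [Z] (intern r1),
                                 fuse [S] (intern r2)]
intern (SEQ r1 r2)   = ASEQ [] (intern r1) (intern r2)
intern (STAR r)      = ASTAR [] (intern r)
intern (NTIMES r n)  = ANTIMES [] (intern r) n
\end{verbatim}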

  \noindent
  There is also an \emph{erase}-function, written $r^\downarrow$, which
  transforms a bitcoded regular expression into a (standard) regular
  expression by just erasing the annotated bitsequences. We omit the
  straightforward definition. For defining the algorithm, we also need
  the functions \textit{bnullable} and \textit{bmkeps}(\textit{s}), which are the
  ``lifted'' versions of \textit{nullable} and \textit{mkeps} acting on
  bitcoded regular expressions.
  %
  \begin{center}
  \begin{tabular}{@ {\hspace{-1mm}}c@ {\hspace{10mm}}c@ {}}
  \begin{tabular}{@ {}l@ {\hspace{0.5mm}}c@ {\hspace{1mm}}l}
  $\textit{bnullable}\,(\textit{ZERO})$ & $\dn$ & $\textit{False}$\\
  $\textit{bnullable}\,(\textit{ONE}\,bs)$ & $\dn$ & $\textit{True}$\\
  $\textit{bnullable}\,(\textit{CHAR}\,bs\,c)$ & $\dn$ & $\textit{False}$\\
  $\textit{bnullable}\,(\textit{ALTs}\,bs\,\rs)$ & $\dn$ &\\
  \multicolumn{3}{l}{$\quad\exists\, r \in \rs. \,\textit{bnullable}\,r$}\\
  $\textit{bnullable}\,(\textit{SEQ}\,bs\,r_1\,r_2)$ & $\dn$ &\\
  \multicolumn{3}{l}{$\quad\textit{bnullable}\,r_1\wedge \textit{bnullable}\,r_2$}\\
  $\textit{bnullable}\,(\textit{STAR}\,bs\,r)$ & $\dn$ &
     $\textit{True}$\\
  $\textit{bnullable}\,(\textit{NT}\,bs\,r\,n)$ & $\dn$ &\\
  \multicolumn{3}{l}{$\quad\textit{if}\;n = 0\;\textit{then}\;\textit{True}\;
  \textit{else}\;\textit{bnullable}\,r$}\\
  \end{tabular}
  &
  \begin{tabular}{@ {}l@ {\hspace{0.5mm}}c@ {\hspace{1mm}}l@ {}}
  $\textit{bmkeps}\,(\textit{ONE}\,bs)$ & $\dn$ & $bs$\\
  $\textit{bmkeps}\,(\textit{ALTs}\,bs\,\rs)$ & $\dn$ &
  $bs\,@\,\textit{bmkepss}\,\rs$\\
  $\textit{bmkeps}\,(\textit{SEQ}\,bs\,r_1\,r_2)$ & $\dn$ &\\
  \multicolumn{3}{l}{$\quad{}bs \,@\,\textit{bmkeps}\,r_1\,@\, \textit{bmkeps}\,r_2$}\\
  $\textit{bmkeps}\,(\textit{STAR}\,bs\,r)$ & $\dn$ &
     $bs \,@\, [\S]$\\
   $\textit{bmkeps}\,(\textit{NT}\,bs\,r\,n)$ & $\dn$ &\\
   \multicolumn{3}{l@ {}}{$\quad\textit{if}\;n=0\;\textit{then}\;bs \,@\, [\S]$}\\
   \multicolumn{3}{l@ {}}{$\quad\textit{else}\,bs \,@\, [\Z] \,@\, \textit{bmkeps}\,r\,@\,
        \textit{bmkeps}\,(@{term "ANTIMES [] r (n - 1)"})$}\\   
  $\textit{bmkepss}\,(r\!::\!\rs)$ & $\dn$ &\\
  \multicolumn{3}{l}{$\quad\textit{if}\;\textit{bnullable}\,r\,\textit{then}\;\textit{bmkeps}\,r\;
  \textit{else}\;\textit{bmkepss}\,\rs$}
  \end{tabular}
  \end{tabular}
  \end{center}    
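
  \noindent
  Transcribed into Haskell, these definitions read (a sketch; @{text
  bmkeps} and @{text bmkepss} are partial and, as above, only ever
  applied to bitnullable arguments):

\begin{verbatim}
-- bnullable, bmkeps and bmkepss (a sketch; bmkeps is partial and
-- only ever applied to bnullable expressions)
bnullable :: ARexp -> Bool
bnullable AZERO           = False
bnullable (AONE _)        = True
bnullable (ACHAR _ _)     = False
bnullable (AALTs _ rs)    = any bnullable rs
bnullable (ASEQ _ r1 r2)  = bnullable r1 && bnullable r2
bnullable (ASTAR _ _)     = True
bnullable (ANTIMES _ r n) = n == 0 || bnullable r

bmkeps :: ARexp -> Bits
bmkeps (AONE bs)       = bs
bmkeps (AALTs bs rs)   = bs ++ bmkepss rs
bmkeps (ASEQ bs r1 r2) = bs ++ bmkeps r1 ++ bmkeps r2
bmkeps (ASTAR bs _)    = bs ++ [S]
bmkeps (ANTIMES bs r n)
  | n == 0    = bs ++ [S]
  | otherwise = bs ++ [Z] ++ bmkeps r ++ bmkeps (ANTIMES [] r (n - 1))

bmkepss :: [ARexp] -> Bits
bmkepss (r : rs) = if bnullable r then bmkeps r else bmkepss rs
\end{verbatim}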
 

  \noindent
  The key function in the bitcoded algorithm is the derivative of a
  bitcoded regular expression. This derivative function calculates the
  derivative but at the same time also the incremental part of the bitsequences
  that contribute to constructing a POSIX value.	
  % 
  \begin{center}
  \begin{tabular}{@ {}l@ {\hspace{-1mm}}cl@ {}}
  $(\textit{ZERO})\backslash c$ & $\;\dn\;$ & $\textit{ZERO}$\\ 
  $(\textit{ONE}\;bs)\backslash c$ & $\;\dn\;$ & $\textit{ZERO}$\\  
  $(\textit{CHAR}\;bs\,d)\backslash c$ & $\dn$ &
        $\textit{if}\;c=d\; \;\textit{then}\;
         \textit{ONE}\;bs\;\textit{else}\;\textit{ZERO}$\\  
  $(\textit{ALTs}\;bs\,\rs)\backslash c$ & $\dn$ &
        $\textit{ALTs}\,bs\,(\mathit{map}\,(\_\backslash c)\,\rs)$\\
  $(\textit{SEQ}\;bs\,r_1\,r_2)\backslash c$ & $\dn$ &
     $\textit{if}\;\textit{bnullable}\,r_1$\\
  & &$\textit{then}\;\textit{ALT}\,bs\;(\textit{SEQ}\,[]\,(r_1\backslash c)\,r_2)$
  $\;(\textit{fuse}\,(\textit{bmkeps}\,r_1)\,(r_2\backslash c))$\\
  & &$\textit{else}\;\textit{SEQ}\,bs\,(r_1\backslash c)\,r_2$\\
  $(\textit{STAR}\,bs\,r)\backslash c$ & $\dn$ &
      $\textit{SEQ}\;(bs\,@\,[\Z])\,(r\backslash c)\,
       (\textit{STAR}\,[]\,r)$\\
  $(\textit{NT}\,bs\,r\,n)\backslash c$ & $\dn$ &
      $\textit{if}\;n = 0\; \textit{then}\;\textit{ZERO}\;\textit{else}\;
      \textit{SEQ}\;(bs\,@\,[\Z])\,(r\backslash c)\,
       (\textit{NT}\,[]\,r\,(n - 1))$     
  \end{tabular}    
  \end{center}
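
  \noindent
  A Haskell sketch of this derivative, reusing the functions from the
  earlier sketches:

\begin{verbatim}
-- the bitcoded derivative (a sketch; note how for SEQ and NT the
-- accumulated bitsequences are fused into the derivative)
bder :: Char -> ARexp -> ARexp
bder _ AZERO         = AZERO
bder _ (AONE _)      = AZERO
bder c (ACHAR bs d)  = if c == d then AONE bs else AZERO
bder c (AALTs bs rs) = AALTs bs (map (bder c) rs)
bder c (ASEQ bs r1 r2)
  | bnullable r1 = AALTs bs [ASEQ [] (bder c r1) r2,
                             fuse (bmkeps r1) (bder c r2)]
  | otherwise    = ASEQ bs (bder c r1) r2
bder c (ASTAR bs r)  = ASEQ (bs ++ [Z]) (bder c r) (ASTAR [] r)
bder c (ANTIMES bs r n)
  | n == 0    = AZERO
  | otherwise = ASEQ (bs ++ [Z]) (bder c r) (ANTIMES [] r (n - 1))
\end{verbatim}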


  \noindent
  This function can also be extended to strings, written $r\backslash s$,
  just like the standard derivative.  We omit the details. Finally we
  can define Sulzmann and Lu's bitcoded lexer, which we call \textit{blexer}:
  %
  \begin{center}
\begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
  $\textit{blexer}\;r\,s$ & $\dn$ &
      $\textit{let}\;r_{der} = (r^\uparrow)\backslash s\;\textit{in}$\\               
  & & $\;\;\;\textit{if}\; \textit{bnullable}(r_{der})$
      $\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,r_{der})\,r$
      $\;\textit{else}\;\textit{None}$
\end{tabular}
\end{center}

  \noindent
This bitcoded lexer first internalises the regular expression $r$ and then
builds the bitcoded derivative according to $s$. If the derivative is
(b)nullable, the string is in the language of $r$ and the algorithm extracts the bitsequence using the
$\textit{bmkeps}$ function. Finally it decodes the bitsequence into a value.  If
the derivative is \emph{not} nullable, then $\textit{None}$ is
returned. We can show that this way of calculating a value
generates the same result as \textit{lexer}.
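
Below is a Haskell sketch of @{text blexer}, assuming the functions
from our previous sketches together with the @{text decode} from
Fig.~\ref{decode}:

\begin{verbatim}
-- the bitcoded lexer (a sketch)
bders :: String -> ARexp -> ARexp
bders s r = foldl (flip bder) r s

blexer :: Rexp -> String -> Maybe Val
blexer r s =
  let rder = bders s (intern r)
  in if bnullable rder then decode (bmkeps rder) r else Nothing
\end{verbatim}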

Before we can proceed we need to define a helper function, called
\textit{retrieve}, which Sulzmann and Lu introduced for the correctness proof.
%
\begin{center}
  \begin{tabular}{lcl}
  @{thm (lhs) retrieve.simps(1)} & $\dn$ & @{thm (rhs) retrieve.simps(1)}\\
  @{thm (lhs) retrieve.simps(2)} & $\dn$ & @{thm (rhs) retrieve.simps(2)}\\
  @{thm (lhs) retrieve.simps(3)} & $\dn$ & @{thm (rhs) retrieve.simps(3)}\\
  @{thm (lhs) better_retrieve(1)} & $\dn$ & @{thm (rhs) better_retrieve(1)}\\
  @{thm (lhs) better_retrieve(2)} & $\dn$ & @{thm (rhs) better_retrieve(2)}\\
  @{thm (lhs) retrieve.simps(6)[of _ "r\<^sub>1" "r\<^sub>2" "v\<^sub>1" "v\<^sub>2"]}
      & $\dn$ & @{thm (rhs) retrieve.simps(6)[of _ "r\<^sub>1" "r\<^sub>2" "v\<^sub>1" "v\<^sub>2"]}\\
  @{thm (lhs) retrieve.simps(7)} & $\dn$ & @{thm (rhs) retrieve.simps(7)}\\
  @{thm (lhs) retrieve.simps(8)} & $\dn$ & @{thm (rhs) retrieve.simps(8)}\\
  @{thm (lhs) retrieve.simps(9)} & $\dn$ & @{thm (rhs) retrieve.simps(9)}\\
  @{thm (lhs) better_retrieve2} & $\dn$ & @{thm (rhs) better_retrieve2}
  \end{tabular}
  \end{center}

\noindent
The idea behind this function is to retrieve a possibly partial
bitsequence from a bitcoded regular expression, where the retrieval is
guided by a value.  For example if the value is $\Left$ then we
descend into the left-hand side of an alternative in order to
assemble the bitcode. Similarly for
$\Right$. The property we can show is that for a given $v$ and $r$
with $\vdash v : r$, the retrieved bitsequence from the internalised
regular expression is equal to the bitcoded version of $v$.

\begin{lemma}\label{retrievecode}
If $\vdash v : r$ then $\textit{code}\, v = \textit{retrieve}\,(r^\uparrow)\,v$.
\end{lemma}
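
\noindent
A Haskell sketch of @{text retrieve} is given below. In this
transcription the clause for singleton @{text ALTs} must come first so
that it takes precedence over the @{text Left}/@{text Right} clauses,
mirroring the equations above; the base case for @{text NT} is our
reconstruction, following the structure of @{text bmkeps}:

\begin{verbatim}
-- retrieve (a sketch; clause order matters: the singleton-ALTs
-- clause takes precedence, and the NT base case is reconstructed
-- following bmkeps)
retrieve :: ARexp -> Val -> Bits
retrieve (AONE bs)    VEmpty    = bs
retrieve (ACHAR bs _) (VChar _) = bs
retrieve (AALTs bs [r])    v          = bs ++ retrieve r v
retrieve (AALTs bs (r:rs)) (VLeft v)  = bs ++ retrieve r v
retrieve (AALTs bs (r:rs)) (VRight v) = bs ++ retrieve (AALTs [] rs) v
retrieve (ASEQ bs r1 r2) (VSeq v1 v2)
  = bs ++ retrieve r1 v1 ++ retrieve r2 v2
retrieve (ASTAR bs _) (VStars [])     = bs ++ [S]
retrieve (ASTAR bs r) (VStars (v:vs))
  = bs ++ [Z] ++ retrieve r v ++ retrieve (ASTAR [] r) (VStars vs)
retrieve (ANTIMES bs _ 0) (VStars []) = bs ++ [S]
retrieve (ANTIMES bs r n) (VStars (v:vs))
  = bs ++ [Z] ++ retrieve r v
       ++ retrieve (ANTIMES [] r (n - 1)) (VStars vs)
\end{verbatim}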

\noindent
We also need some auxiliary facts about how the bitcoded operations
relate to the ``standard'' operations on regular expressions. For
example if we build a bitcoded derivative and erase the result, this
is the same as if we first erase the bitcoded regular expression and
then perform the ``standard'' derivative operation.

\begin{lemma}\label{bnullable}%\mbox{}\smallskip\\
\textit{(1)} $(r\backslash s)^\downarrow = (r^\downarrow)\backslash s$\\    
\mbox{\hspace{21.5mm}}\textit{(2)} $\textit{bnullable}(r)$ iff $\textit{nullable}(r^\downarrow)$\\
\mbox{\hspace{21.5mm}}\textit{(3)} $\textit{bmkeps}(r) = \textit{retrieve}\,r\,(\textit{mkeps}\,(r^\downarrow))$ provided $\textit{nullable}(r^\downarrow)$
%\begin{tabular}{ll}
%\textit{(1)} & $(r\backslash s)^\downarrow = (r^\downarrow)\backslash s$\\    
%\textit{(2)} & $\textit{bnullable}(r)$ iff $\textit{nullable}(r^\downarrow)$\\
%\textit{(3)} & $\textit{bmkeps}(r) = \textit{retrieve}\,r\,(\textit{mkeps}\,(r^\downarrow))$ \;provided\; $\textit{nullable}(r^\downarrow)$.
%\end{tabular}  
\end{lemma}

%\begin{proof}
%  All properties are by induction on annotated regular expressions.
%  %There are no interesting cases.
%\end{proof}

\noindent
The only difficulty left for the correctness proof is that the bitcoded algorithm
has only a ``forward phase'' where POSIX values are generated incrementally.
We can achieve the same effect with @{text lexer} (which has two phases) by stacking up injection
functions during the forward phase. An auxiliary function, called $\textit{flex}$,
allows us to recast the rules of $\lexer$ in terms of a single
phase and stacked up injection functions.
%
\begin{center}
\begin{tabular}{@ {}l@ {}c@ {}l@ {\hspace{10mm}}l@ {}c@ {}l@ {}}
  $\textit{flex}\;r\,f\,[]$ & $\dn$ & $f$ & 
  $\textit{flex}\;r\,f\,(c\!::\!s)$ & $\dn$ &
  $\textit{flex}\,(r\backslash c)\,(\lambda v.\,f\,(\inj\,r\,c\,v))\,s$
\end{tabular}    
\end{center}    
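
\noindent
In Haskell, $\textit{flex}$ is a two-liner (a sketch, reusing @{text
der} and @{text inj} from our earlier sketches):

\begin{verbatim}
-- flex stacks up the injection functions while following the
-- derivatives (a sketch)
flex :: Rexp -> (Val -> Val) -> String -> (Val -> Val)
flex _ f []      = f
flex r f (c : s) = flex (der c r) (f . inj r c) s
\end{verbatim}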

\noindent
The point of this function is that when
reaching the end of the string, we just need to apply the stacked up
injection functions to the value generated by @{text mkeps}.
Using this function we can recast the success case in @{text lexer} 
as follows:

\begin{lemma}\label{flex}
If @{text "lexer r s = Some v"} \;then\; @{text "v = "}$\,\textit{flex}\,r\,id\,s\,
      (\textit{mkeps}\,(r\backslash s))$.
\end{lemma}

\noindent
Note that we did not redefine \textit{lexer}; we just established that the
value generated by \textit{lexer} can also be obtained by a different
method. While this different method is not efficient (we essentially
need to traverse the string $s$ twice, once for building the
derivative $r\backslash s$ and another time for stacking up injection
functions), it helps us with proving
that incrementally building up values in @{text blexer} generates the same result.

This brings us to our main lemma in this section: if we calculate a
derivative, say $r\backslash s$, and have a value, say $v$, inhabiting
this derivative, then we can produce the result @{text lexer} generates
by applying this value to the stacked-up injection functions
that $\textit{flex}$ assembles. The lemma establishes that this is the same
value as if we build the annotated derivative $r^\uparrow\backslash s$
and then retrieve the bitcoded version of @{text v}, followed by a
decoding step.

\begin{lemma}[Main Lemma]\label{mainlemma}\mbox{}\\\it
If $\vdash v : r\backslash s$ then 
$\textit{Some}\,(\textit{flex}\,r\,\textit{id}\,s\,v) =
  \textit{decode}(\textit{retrieve}\,(r^\uparrow \backslash s)\,v)\,r$
\end{lemma}  



\noindent
%With this lemma in place,
We can then prove the correctness of \textit{blexer}---it indeed
produces the same result as \textit{lexer}.


\begin{theorem}\label{thmone}
$\textit{blexer}\,r\,s = \textit{lexer}\,r\,s$
\end{theorem}  



\noindent This establishes that the bitcoded algorithm \emph{without}
simplification produces correct results. This was
only conjectured by Sulzmann and Lu in their paper
\cite{Sulzmann2014}. The next step is to add simplifications.

*}


section {* Simplification *}

text {*

     Derivatives as calculated by Brzozowski's method are usually more
     complex regular expressions than the initial one; the result is
     that derivative-based matching and lexing algorithms are
     often abysmally slow if the ``growth problem'' is not addressed. As Sulzmann and Lu wrote, various
     optimisations are possible, such as the simplifications
     $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r \Rightarrow r$,
     $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow r$. While these
     simplifications can considerably speed up the two algorithms  in many
     cases, they do not solve fundamentally the growth problem with
     derivatives. To see this let us return to the example from the
     Introduction that shows the derivatives for \mbox{@{text "(a + aa)\<^sup>*"}}.
     If we delete in the third step all $\ZERO$s and $\ONE$s according to
     the simplification rules shown above, we obtain
     %
     %%
     %
     \begin{equation}\def\xll{\xrightarrow{\_\backslash{} [a, a, a]}}\label{derivex}
     (a + aa)^* \quad\xll\quad
      \underbrace{\mbox{$(\ONE + a) \cdot (a + aa)^*$}}_{r} \;+\;
     ((a + aa)^* + \underbrace{\mbox{$(\ONE + a) \cdot (a + aa)^*$}}_{r})
     \end{equation}

     \noindent This is a simpler derivative, but unfortunately we
     cannot make any further simplifications. This is a problem because
     the outermost alternative contains two copies of the same
     regular expression (underlined with $r$). These copies will
     spawn new copies in later derivative steps, and these in turn spawn even more copies. This
     destroys any hope of taming the size of the derivatives.  But the
     second copy of $r$ in~\eqref{derivex} will never contribute to a
     value, because POSIX lexing will always prefer matching a string
     with the first copy. So it could be safely removed without affecting the correctness of the algorithm.
     The issue with the simple-minded
     simplification rules above is that the rule $r + r \Rightarrow r$
     will never be applicable because, as can be seen in this example, the
     duplicate regular expressions are not next to each other but are separated by another regular expression.

     But here is where Sulzmann and Lu's representation of generalised
     alternatives in the bitcoded algorithm shines: in @{term
     "ALTs bs rs"} we can define a more aggressive simplification by
     recursively simplifying all regular expressions in @{text rs} and
     then analysing the resulting list and removing any duplicates.
     Another advantage of the bitsequences in bitcoded regular
     expressions is that they can be easily modified such that simplification does not
     interfere with the value constructions. For example we can ``flatten'', or
     de-nest, or spill out, @{text ALTs} as follows
     %
     \[
     @{text "ALTs bs\<^sub>1 ((ALTs bs\<^sub>2 rs\<^sub>2) :: rs\<^sub>1)"}
     \quad\xrightarrow{bsimp}\quad
     @{text "ALTs bs\<^sub>1 ((map (fuse bs\<^sub>2) rs\<^sub>2) @ rs\<^sub>1)"}
     \]

     \noindent
     where we just need to fuse the bitsequence that has accumulated in @{text "bs\<^sub>2"}
     to the alternatives in @{text "rs\<^sub>2"}. As we shall show below this will
     ensure that the correct value corresponding to the original (unsimplified)
     regular expression can still be extracted. %In this way the value construction
     %is not affected by simplification. 
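
     For readers who want to experiment with these operations, the following
     is a small sketch of bitcoded regular expressions in Scala, together
     with the @{text fuse} function and the de-nesting rule just shown. The
     datatype and all names are ours and purely illustrative (we omit the
     bounded constructor @{text NT} for brevity); this is neither the code of
     our formalisation nor of the implementation we refer to later.

     \begin{verbatim}
  enum Bit { case Z, S }

  enum ARexp {
    case AZERO
    case AONE(bs: List[Bit])
    case ACHAR(bs: List[Bit], c: Char)
    case AALTS(bs: List[Bit], rs: List[ARexp])
    case ASEQ(bs: List[Bit], r1: ARexp, r2: ARexp)
    case ASTAR(bs: List[Bit], r: ARexp)
  }
  import ARexp._

  // fuse prepends a bitsequence to the top-most constructor
  def fuse(bs: List[Bit], r: ARexp): ARexp = r match {
    case AZERO            => AZERO
    case AONE(cs)         => AONE(bs ++ cs)
    case ACHAR(cs, c)     => ACHAR(bs ++ cs, c)
    case AALTS(cs, rs)    => AALTS(bs ++ cs, rs)
    case ASEQ(cs, r1, r2) => ASEQ(bs ++ cs, r1, r2)
    case ASTAR(cs, r1)    => ASTAR(bs ++ cs, r1)
  }

  // the de-nesting rule shown above, applied at the head of an ALTs-list
  def denest(r: ARexp): ARexp = r match {
    case AALTS(bs1, AALTS(bs2, rs2) :: rs1) =>
      AALTS(bs1, rs2.map(fuse(bs2, _)) ++ rs1)
    case _ => r
  }
  \end{verbatim}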

     However, there is one problem with the definition of the more
     aggressive simplification rules described by Sulzmann and Lu. Recast
     in our syntax, they define the step of removing
     duplicates as
     %
     \[ @{text "bsimp (ALTs bs rs)"} \dn @{text "ALTs
     bs (nub (map bsimp rs))"}
     \]
   
     \noindent where they first recursively simplify the regular
     expressions in @{text rs} (using @{text map}) and then use
     Haskell's @{text nub}-function to remove potential
     duplicates. While this makes sense when considering the example
     shown in \eqref{derivex}, @{text nub} is inappropriate
     in the case of bitcoded regular expressions. The reason
     is that in general the elements in @{text rs} will carry
     different bitsequence annotations, and so @{text nub}
     will never find a duplicate to remove. One correct way to
     handle this situation is to first \emph{erase} the regular
     expressions when comparing potential duplicates. This is inspired
     by Scala's list functions of the form \mbox{@{text "distinctWith rs eq
     acc"}} where @{text eq} is a user-defined equivalence relation that
     compares two elements in @{text rs}. We define this function in
     Isabelle/HOL as

     \begin{center}
     \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
     @{thm (lhs) distinctWith.simps(1)} & $\dn$ & @{thm (rhs) distinctWith.simps(1)}\\
     @{thm (lhs) distinctWith.simps(2)} & $\dn$ &
     @{text "if (\<exists> y \<in> acc. eq x y)"}
     @{text "then distinctWith xs eq acc"}\\
     & & @{text "else x :: distinctWith xs eq ({x} \<union> acc)"}
     \end{tabular}
     \end{center}

     \noindent where we scan the list from left to right (because we
     have to remove later copies). In @{text distinctWith}, @{text eq} is intended to be an
     equivalence relation for bitcoded regular expressions and @{text acc} is an accumulator for bitcoded regular
     expressions---essentially a set of regular expressions that we have already seen
     while scanning the list. Therefore we delete an element, say @{text x},
     from the list provided some @{text "y"} equivalent to @{text x} is already in the accumulator;
     otherwise we keep @{text x}, add it as another ``seen'' element to @{text acc},
     and scan the rest of the list. We will use
     @{term distinctWith} with an @{text eq} that disregards the bitsequences of bitcoded regular expressions
     when comparing the components. One way to define this in Isabelle/HOL is by the following recursive function from
     bitcoded regular expressions to @{text bool}:
     %
     \begin{center}
     \begin{tabular}{@ {}cc@ {}}
     \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {\hspace{1mm}}l@ {\hspace{1mm}}}
     @{thm (lhs) eq1.simps(1)} & $\dn$ & @{thm (rhs) eq1.simps(1)}\\
     @{thm (lhs) eq1.simps(2)[of DUMMY DUMMY]} & $\dn$ & @{thm (rhs) eq1.simps(2)[of DUMMY DUMMY]}\\
     @{thm (lhs) eq1.simps(7)[of DUMMY "r\<^sub>1" DUMMY "r\<^sub>2"]} & $\dn$ & @{thm (rhs) eq1.simps(7)[of DUMMY "r\<^sub>1" DUMMY "r\<^sub>2"]}\\
      @{thm (lhs) eq1.simps(5)[of DUMMY DUMMY]} & $\dn$ & @{thm (rhs) eq1.simps(5)[of DUMMY DUMMY]}\\
     \mbox{}
     \end{tabular}
     &
     \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {\hspace{1mm}}l@ {}}
     @{thm (lhs) eq1.simps(3)[of DUMMY "c" DUMMY "d"]} & $\dn$ & @{thm (rhs) eq1.simps(3)[of DUMMY "c" DUMMY "d"]}\\
     @{thm (lhs) eq1.simps(4)[of DUMMY "r\<^sub>1\<^sub>1" "r\<^sub>1\<^sub>2" DUMMY "r\<^sub>2\<^sub>1" "r\<^sub>2\<^sub>2"]} & $\dn$ &
     @{thm (rhs) eq1.simps(4)[of DUMMY "r\<^sub>1\<^sub>1" "r\<^sub>1\<^sub>2" DUMMY "r\<^sub>2\<^sub>1" "r\<^sub>2\<^sub>2"]}\\
     @{thm (lhs) eq1.simps(8)[of DUMMY "r\<^sub>1" "n\<^sub>1" DUMMY "r\<^sub>2" "n\<^sub>2"]} & $\dn$ & @{thm (rhs) eq1.simps(8)[of DUMMY "r\<^sub>1" "n\<^sub>1" DUMMY "r\<^sub>2" "n\<^sub>2"]}\\
     @{thm (lhs) eq1.simps(6)[of DUMMY "r\<^sub>1" "rs\<^sub>1" DUMMY "r\<^sub>2" "rs\<^sub>2"]} & $\dn$ & \\
     \multicolumn{3}{r}{@{thm (rhs) eq1.simps(6)[of DUMMY "r\<^sub>1" "rs\<^sub>1" DUMMY "r\<^sub>2" "rs\<^sub>2"]}}
     \end{tabular}
     \end{tabular}
     \end{center}

     \noindent
     where all other cases are set to @{text False}.
     This equivalence is clearly computationally more expensive than @{text nub},
     but it is needed in order to make the removal of unnecessary copies
     work properly.
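
     For concreteness, we can continue the Scala sketch from above with a
     bit-ignoring equivalence in the spirit of $\approx$ and with the
     @{text distinctWith} function. Again, this is only an illustration
     under the simplifications made earlier (no @{text NT} constructor); the
     Isabelle definitions above are the authoritative ones.

     \begin{verbatim}
  // structural comparison that disregards all bitsequence annotations
  def eq1(r1: ARexp, r2: ARexp): Boolean = (r1, r2) match {
    case (AZERO, AZERO)                 => true
    case (AONE(_), AONE(_))             => true
    case (ACHAR(_, c), ACHAR(_, d))     => c == d
    case (ASEQ(_, p1, p2), ASEQ(_, q1, q2)) =>
      eq1(p1, q1) && eq1(p2, q2)
    case (AALTS(_, Nil), AALTS(_, Nil)) => true
    case (AALTS(_, p :: ps), AALTS(_, q :: qs)) =>
      eq1(p, q) && eq1(AALTS(Nil, ps), AALTS(Nil, qs))
    case (ASTAR(_, p), ASTAR(_, q))     => eq1(p, q)
    case _                              => false
  }

  // keep the first of any equivalent copies; acc records what was seen
  def distinctWith(rs: List[ARexp],
                   eq: (ARexp, ARexp) => Boolean,
                   acc: Set[ARexp]): List[ARexp] = rs match {
    case Nil => Nil
    case x :: xs =>
      if (acc.exists(y => eq(x, y))) distinctWith(xs, eq, acc)
      else x :: distinctWith(xs, eq, acc + x)
  }
  \end{verbatim}

     \noindent
     A typical call is then @{text "distinctWith(rs, eq1, Set())"}, which
     removes all later copies from @{text rs} that are structurally equal
     up to bitsequences.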

     Our simplification function depends on three more helper functions. The first is called
     @{text flts} and analyses lists of regular expressions coming from alternatives.
     It is defined by four clauses as follows:

     \begin{center}
     \begin{tabular}{c}
     @{thm (lhs) flts.simps(1)} $\dn$ @{thm (rhs) flts.simps(1)} \qquad
     @{thm (lhs) flts.simps(2)} $\dn$ @{thm (rhs) flts.simps(2)} \qquad
     @{text "flts ((ALTs bs' rs') :: rs"}
     %@{ thm (lhs) flts.simps(3)[of "bs'" "rs'"]}
     $\dn$ @{thm (rhs) flts.simps(3)[of "bs'" "rs'"]}\smallskip\\
     @{text "flts (r :: rs)"} $\dn$ @{text "r :: flts rs"}\hspace{5mm}(otherwise)
     \end{tabular}
     \end{center}

     \noindent
     The second clause of @{text flts} removes all instances of @{text ZERO} in alternatives and
     the third ``de-nests'' alternatives (but retains the
     bitsequence @{text "bs'"} accumulated in the inner alternative). There are
     some corner cases to be considered when the resulting list inside an alternative is
     empty or a singleton list. We take care of those cases in the
     @{text "bsimpALTs"} function; similarly we define a helper function that simplifies
     sequences according to the usual rules about @{text ZERO}s and @{text ONE}s:

     \begin{center}
     \begin{tabular}{@ {}c@ {\hspace{4mm}}c@ {}}
     \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
     @{text "bsimpALTs bs []"}  & $\dn$ & @{text "ZERO"}\\
     @{text "bsimpALTs bs [r]"} & $\dn$ & @{text "fuse bs r"}\\
     @{text "bsimpALTs bs rs"}  & $\dn$ & @{text "ALTs bs rs"}\\
     \mbox{}\\
     \end{tabular}
     &
     \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
     @{text "bsimpSEQ bs _ ZERO"}  & $\dn$ & @{text "ZERO"}\\
     @{text "bsimpSEQ bs ZERO _"} & $\dn$ & @{text "ZERO"}\\
     @{text "bsimpSEQ bs\<^sub>1 (ONE bs\<^sub>2) r\<^sub>2"}
         & $\dn$ & @{text "fuse (bs\<^sub>1 @ bs\<^sub>2) r\<^sub>2"}\\
     @{text "bsimpSEQ bs r\<^sub>1 r\<^sub>2"} & $\dn$ &  @{text "SEQ bs r\<^sub>1 r\<^sub>2"}
     \end{tabular}
     \end{tabular}
     \end{center}

     \noindent
     With this in place we can define our simplification function as

     \begin{center}
     \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
     @{thm (lhs) bsimp.simps(1)[of "bs" "r\<^sub>1" "r\<^sub>2"]} & $\dn$ &
         @{thm (rhs) bsimp.simps(1)[of "bs" "r\<^sub>1" "r\<^sub>2"]}\\
     @{thm (lhs) bsimp.simps(2)[of "bs" _]} & $\dn$ & @{text "bsimpALTs bs (distinctWith (flts (map bsimp rs)) \<approx> \<emptyset>)"}\\
     @{text "bsimp r"} & $\dn$ & @{text r}
     \end{tabular}
     \end{center}

     \noindent
     We believe our recursive function @{term bsimp} simplifies bitcoded regular
     expressions as intended by Sulzmann and Lu, with the small addition of removing ``useless'' @{text ONE}s
     in sequence regular
     expressions. There is no point in applying the
     @{text bsimp} function repeatedly (like the simplification in their paper, which needs to be
     applied until a fixpoint is reached), because we can show that @{term bsimp} is idempotent,
     that is

     \begin{proposition}
     @{term "bsimp (bsimp r) = bsimp r"}
     \end{proposition}

     \noindent
     This can be proved by induction on @{text r}, but requires a detailed analysis
     showing that the de-nesting of alternatives always results in a flat list of regular
     expressions. We omit the details since they do not concern the correctness proof.
     %It might be interesting to not that we do not simplify inside @{term STAR} and
     %@{text NT}: the reason is that we want to keep the
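
     To see how the pieces fit together, the Scala sketch can be extended
     with renderings of @{text flts}, the two smart constructors and
     @{text bsimp}. As before, this mirrors the definitions above only in
     spirit (no @{text NT} constructor) and is not our formalisation:

     \begin{verbatim}
  def flts(rs: List[ARexp]): List[ARexp] = rs match {
    case Nil                    => Nil
    case AZERO :: rest          => flts(rest)              // remove ZEROs
    case AALTS(bs, rs2) :: rest =>                         // de-nest ALTs
      rs2.map(fuse(bs, _)) ++ flts(rest)
    case r :: rest              => r :: flts(rest)         // otherwise
  }

  def bsimpALTs(bs: List[Bit], rs: List[ARexp]): ARexp = rs match {
    case Nil      => AZERO
    case r :: Nil => fuse(bs, r)
    case _        => AALTS(bs, rs)
  }

  def bsimpSEQ(bs: List[Bit], r1: ARexp, r2: ARexp): ARexp =
    (r1, r2) match {
      case (AZERO, _)     => AZERO
      case (_, AZERO)     => AZERO
      case (AONE(bs2), _) => fuse(bs ++ bs2, r2)
      case _              => ASEQ(bs, r1, r2)
    }

  def bsimp(r: ARexp): ARexp = r match {
    case ASEQ(bs, r1, r2) => bsimpSEQ(bs, bsimp(r1), bsimp(r2))
    case AALTS(bs, rs)    =>
      bsimpALTs(bs, distinctWith(flts(rs.map(bsimp)), eq1, Set()))
    case _                => r
  }
  \end{verbatim}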

     Next we can include simplification after each derivative step, leading to the
     following notion of bitcoded derivatives:
     
     \begin{center}
      \begin{tabular}{c@ {\hspace{10mm}}c}
      \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
      @{thm (lhs) bders_simp.simps(1)} & $\dn$ & @{thm (rhs) bders_simp.simps(1)}
      \end{tabular}
      &
      \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
      @{thm (lhs) bders_simp.simps(2)} & $\dn$ & @{thm (rhs) bders_simp.simps(2)}
      \end{tabular}
      \end{tabular}
      \end{center}

      \noindent
      and use it in the improved lexing algorithm defined as

     \begin{center}
\begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
  $\textit{blexer}^+\;r\,s$ & $\dn$ &
      $\textit{let}\;r_{der} = (r^\uparrow)\backslash_{bsimp}\, s\;\textit{in}$\\                
      & & $\;\;\;\textit{if}\; \textit{bnullable}(r_{der})$
      $\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,r_{der})\,r$ $\;\textit{else}\;\textit{None}$
\end{tabular}
\end{center}

       \noindent
       Note that in $\textit{blexer}^+$ the derivative $r_{der}$ is calculated
       using the simplifying derivative $\_\,\backslash_{bsimp}\,\_$.
       The remaining task is to show that @{term blexer} and
       @{term "blexer_simp"} generate the same answers.

       When we first
       attempted this proof we encountered a problem with the idea
       in Sulzmann and Lu's paper, where the argument seems to appeal
       again to the @{text retrieve}-function defined for the unsimplified version
       of the algorithm. But
      this does not work, because desirable properties such as
     %
     \[
     @{text "retrieve r v = retrieve (bsimp r) v"}
     \]

     \noindent do not hold under simplification---this property
     essentially purports that we can retrieve the same value from a
     simplified version of the regular expression. To start with, @{text retrieve}
     depends on the fact that the value @{text v} corresponds to the
     structure of the regular expression @{text r}---but the whole point of simplification
     is to ``destroy'' this structure by making the regular expression simpler.
     To see this, consider the regular expression @{term "r = ALT r' ZERO"} and a corresponding
     value @{text "v = Left v'"}. If we annotate @{text "r"} with bitcodes, then
     we can use @{text retrieve} with @{text r} and @{text v} in order to extract a corresponding
     bitsequence. The reason this works is that @{text r} is an alternative
     regular expression and @{text v} a corresponding @{text "Left"}-value. However, if we simplify
     @{text r}, then @{text v} does not correspond to the shape of the regular
     expression anymore. So unless one can somehow
     synchronise the change in the simplified regular expressions with
     the original POSIX value, there is no hope of appealing to @{text retrieve} in the
     correctness argument for @{term blexer_simp}.

     For our proof we found it more helpful to introduce the rewriting systems shown in
     Fig.~\ref{SimpRewrites}. The idea is to generate
     simplified regular expressions in small steps (unlike the @{text bsimp}-function, which
     does the same in one big step), and to show that each of
     the small steps preserves the bitcodes that lead to the POSIX value.
     The rewrite system is organised such that $\leadsto$ is for bitcoded regular
     expressions and $\stackrel{s}{\leadsto}$ for lists of bitcoded regular
     expressions. The former essentially implements the simplifications of
     @{text "bsimpSEQ"} and @{text flts}; while the latter implements the
     simplifications in @{text "bsimpALTs"}. We can show that any bitcoded
     regular expression reduces in zero or more steps to the simplified
     regular expression generated by @{text bsimp}:

     \begin{lemma}\label{lemone}
     @{thm[mode=IfThen] rewrites_to_bsimp}
     \end{lemma}

     

     \noindent
     We can also show that this rewrite system preserves @{term bnullable}, that 
     is simplification does not affect nullability:

     \begin{lemma}\label{lembnull}
     @{thm[mode=IfThen] bnullable0(1)[of "r\<^sub>1" "r\<^sub>2"]}
     \end{lemma}

     

     \noindent
     From this we can show that @{text bmkeps} will produce the same bitsequence
     for two regular expressions related by $\leadsto$, as long as one of them is nullable (this lemma
     establishes the missing fact we were not able to establish using @{text retrieve}, as suggested
     in the paper by Sulzmann and Lu).


     \begin{lemma}\label{lemthree}
     @{thm[mode=IfThen] rewrite_bmkeps_aux(1)[of "r\<^sub>1" "r\<^sub>2"]}
     \end{lemma}

     

     \noindent
     Also crucial is the fact that derivative steps and simplification steps can be interleaved,
     which is shown by the fact that $\leadsto$ is preserved under derivatives.

     \begin{lemma}\label{bderlem}
     @{thm[mode=IfThen] rewrite_preserves_bder(1)[of "r\<^sub>1" "r\<^sub>2"]}
     \end{lemma}

     


     \noindent
     This fact together with Lemma~\ref{lemone} allows us to prove the central lemma,
     namely that the unsimplified
     derivative (with respect to a string @{term s}) reduces to the simplified derivative (with respect to the same string).
     
     \begin{lemma}\label{lemtwo}
     @{thm[mode=IfThen] central}
     \end{lemma}

     %\begin{proof}
     %By reverse induction on @{term s} generalising over @{text r}.
     %\end{proof}

     \noindent
     With these lemmas in place we can finally establish that @{term "blexer_simp"} and @{term "blexer"}
     generate the same value, and, using Theorem~\ref{thmone} from the previous section, that this value
     is indeed the POSIX value as generated by \textit{lexer}.
     
     \begin{theorem}
     @{thm[mode=IfThen] main_blexer_simp[symmetric]} \; (@{text "= lexer r s"}\; by Thm.~\ref{thmone})
     \end{theorem}

     %\begin{proof}
     %By unfolding the definitions and using Lemmas~\ref{lemtwo} and \ref{lemthree}. 	
     %\end{proof}
     
     \noindent
     This means that if the algorithm is called with a regular expression @{term r} and a string
     @{term s} with $@{term s} \in L(@{term r})$, it will return @{term "Some v"} for the unique
     @{term v} defined by the POSIX relation $(@{term s}, @{term r}) \rightarrow @{term v}$; otherwise,
     when $s \not\in L(r)$ and no such @{text v} exists, the algorithm returns @{term "None"}.
     This completes the correctness proof for the second POSIX lexing algorithm by Sulzmann and Lu.
     The interesting point of this algorithm is that the sizes of derivatives do not grow arbitrarily but
     can be finitely bounded, which
     we shall show next.

   \begin{figure}[t]
  \begin{center}
  \begin{tabular}{@ {\hspace{-8mm}}c@ {}}
  \\[-8mm]
  @{thm[mode=Axiom] bs1[of _ "r\<^sub>2"]}$S\ZERO{}_l$\quad
  @{thm[mode=Axiom] bs2[of _ "r\<^sub>1"]}$S\ZERO{}_r$\quad
  @{thm[mode=Axiom] bs3[of "bs\<^sub>1" "bs\<^sub>2"]}$S\ONE$\\
  @{thm[mode=Rule] bs4[of "r\<^sub>1" "r\<^sub>2" _ "r\<^sub>3"]}SL\qquad
  @{thm[mode=Rule] bs5[of "r\<^sub>3" "r\<^sub>4" _ "r\<^sub>1"]}SR\\
  @{thm[mode=Axiom] bs6}$A0$\quad
  @{thm[mode=Axiom] bs7}$A1$\quad
  @{thm[mode=Rule] bs10[of "rs\<^sub>1" "rs\<^sub>2"]}$AL$\\
  @{thm[mode=Rule] rrewrite_srewrite.ss2[of "rs\<^sub>1" "rs\<^sub>2"]}$LT$\quad
  @{thm[mode=Rule] ss3[of "r\<^sub>1" "r\<^sub>2"]}$LH$\quad
  @{thm[mode=Axiom] ss4}$L\ZERO$\quad
  @{thm[mode=Axiom] ss5[of "bs" "rs\<^sub>1" "rs\<^sub>2"]}$LS$\medskip\\
  @{thm[mode=Rule] ss6[of "r\<^sub>2" "r\<^sub>1" "rs\<^sub>1" "rs\<^sub>2" "rs\<^sub>3"]}$LD$\\[-5mm]
  \end{tabular}
  \end{center}
  \caption{The rewrite rules that generate simplified regular expressions
  in small steps: @{term "rrewrite r\<^sub>1 r\<^sub>2"} is for bitcoded regular
  expressions and @{term "srewrite rs\<^sub>1 rs\<^sub>2"} for \emph{lists} of bitcoded
  expressions. Of particular interest is the $LD$ rule, which allows copies of regular
  expressions to be removed provided a regular expression earlier in the list can
  match the same strings.}\label{SimpRewrites}
  \end{figure}
*}

section {* Finite Bound for the Size of Derivatives *}

text {*

In this section we sketch our argument for why the size of the simplified
derivatives with the aggressive simplification function can be finitely bounded. Suppose
we have a size function for bitcoded regular expressions, written
$\llbracket r\rrbracket$, which counts the number of nodes if we regard $r$ as a tree
(we omit the precise definition; ditto for lists $\llbracket r\!s\rrbracket$). We then show that for every $r$
there exists a bound $N$
such that

\begin{center}
$\forall s. \; \llbracket@{term "bders_simp r s"}\rrbracket \leq N$
\end{center}

\noindent
Note that the bound $N$ is a bound for \emph{all} strings, no matter how long they are.
We establish this bound by induction on $r$. The base cases for @{term AZERO},
@{term "AONE bs"} and @{term "ACHAR bs c"} are straightforward. The interesting case is
for sequences of the form @{term "ASEQ bs r\<^sub>1 r\<^sub>2"}. In this case our induction
hypotheses state $\exists N_1. \forall s. \; \llbracket{}@{term "bders_simp r\<^sub>1 s"}\rrbracket \leq N_1$ and
$\exists N_2. \forall s. \; \llbracket@{term "bders_simp r\<^sub>2 s"}\rrbracket \leq N_2$. We can reason as follows\medskip
%
%!\begin{center}

\noindent\begin{minipage}{\textwidth}
\begin{tabular}{lcll}
& & $ \llbracket@{term "bders_simp (ASEQ bs r\<^sub>1 r\<^sub>2) s"}\rrbracket$\\
& $ = $ & $\llbracket bsimp\,(\textit{ALTs}\;bs\;((@{term "ASEQ [] (bders_simp r\<^sub>1 s) r\<^sub>2"}) ::
    [@{term "bders_simp r\<^sub>2 s'"} \;|\; s' \in \textit{Suf}(@{text r}_1, s)]))\rrbracket $ & (1) \\
& $\leq$ &
    $\llbracket\textit{distinctWith}\,((@{term "ASEQ [] (bders_simp r\<^sub>1 s) r\<^sub>2"}) ::
    [@{term "bders_simp r\<^sub>2 s'"} \;|\; s' \in \textit{Suf}(@{text r}_1, s)])\,\approx\;@{term "{}"}\rrbracket + 1 $ & (2) \\
& $\leq$ & $\llbracket@{term "ASEQ [] (bders_simp r\<^sub>1 s) r\<^sub>2"}\rrbracket +
             \llbracket\textit{distinctWith}\,[@{term "bders_simp r\<^sub>2 s'"} \;|\; s' \in \textit{Suf}(@{text r}_1, s)]\,\approx\;@{term "{}"}\rrbracket + 1 $ & (3) \\
& $\leq$ & $N_1 + \llbracket r_2\rrbracket + 2 +
      \llbracket\textit{distinctWith}\,[@{term "bders_simp r\<^sub>2 s'"} \;|\; s' \in \textit{Suf}(@{text r}_1, s)]\,\approx\;@{term "{}"}\rrbracket$ & (4)\\
& $\leq$ & $N_1 + \llbracket r_2\rrbracket + 2 + l_{N_{2}} * N_{2}$ & (5)
\end{tabular}
\end{minipage}\medskip
%!\end{center}

% tell Chengsong about Indian paper of closed forms of derivatives

\noindent
where in (1) the set $\textit{Suf}(@{text "r"}_1, s)$ contains all the suffixes $s'$ of $s$ for which
the derivative of @{text "r"}$_1$ by the corresponding prefix $s''$ (that is, $s''\,@\,s' = s$) is nullable.
In (3) we know that $\llbracket@{term "ASEQ [] (bders_simp r\<^sub>1 s) r\<^sub>2"}\rrbracket$ is
bounded by $N_1 + \llbracket{}r_2\rrbracket + 1$. In (5) we know the list comprehension contains only regular expressions of size at most
$N_2$. The length of the list after @{text distinctWith} is bounded by a number, which we call $l_{N_2}$. It stands
for the number of distinct regular expressions of size at most $N_2$ (there can only be finitely many of them).
We reason similarly for @{text STAR} and @{text NT}.\smallskip


Clearly, in this finiteness argument (step (5)) we give a very loose bound that is
far from the actual bound we can expect. We could do better than this, but a tighter bound does not improve
the finiteness property we are proving. If one is interested in a polynomial bound,
one would hope to obtain a similarly tight bound as for the partial
derivatives introduced by Antimirov \cite{Antimirov95}. After all, the idea with
@{text distinctWith} is to maintain a ``set'' of alternatives (like the sets in
partial derivatives). Unfortunately, to obtain the exact same bound we would need
to introduce simplifications, such as
 $(r_1 + r_2) \cdot r_3 \longrightarrow (r_1 \cdot r_3) + (r_2 \cdot r_3)$,
which exist for partial derivatives. However, if we introduced them in our
setting, we would lose the POSIX property of our calculated values. For example,
given the regular expression @{term "SEQ (ALT a ab) (ALT b ONE)"} and the string $[a, b]$, our
algorithm generates the following correct POSIX value
%
\[
@{term "Seq (Right (Seq (Char a) (Char b))) (Right Empty)"}
\]

\noindent
Essentially it matches the string with the longer @{text "Right"}-alternative in the
first sequence (and then the `rest' with the regular expression @{const ONE}, matching the empty string, from the second sequence).
If we add the simplification above, then we obtain instead the value
@{term "Seq (Left (Char a)) (Left (Char b))"}
where the @{text "Left"}-alternatives get priority. However, this violates
the POSIX rules and we have not been able to
reconcile this problem. Therefore we leave better bounds for future work.%!\\[-6.5mm]

Note also that while Antimirov was able to give a bound on the \emph{size}
of his partial derivatives~\cite{Antimirov95}, Brzozowski gave a bound
on the \emph{number} of derivatives, provided they are quotiented via the
ACI rules \cite{Brzozowski1964}. Brzozowski's result is crucial when one
uses his derivatives for obtaining a DFA (it essentially bounds
the number of states). However, this result does \emph{not}
transfer to our setting, where we are interested in the \emph{size} of the
derivatives. For example it is \emph{not} true for our derivatives that the
set of derivatives $r \backslash s$ for a given $r$ and all strings
$s$ is finite (even when simplified). This is because for our annotated regular expressions
the bitcode annotation depends on the number of iterations that
are needed for @{text STAR}-regular expressions. This is not a problem for us: since we intend to do lexing
by calculating (as fast as possible) derivatives, the bound on the size
of the derivatives is important, not their number. % of derivatives.


*}


section {* Conclusion *}

text {*

   We set out in this work to prove in Isabelle/HOL the correctness of
   the second POSIX lexing algorithm by Sulzmann and Lu
   \cite{Sulzmann2014}. This follows earlier work where we established
   the correctness of the first algorithm
   \cite{AusafDyckhoffUrban2016}. In the earlier work we needed to
   introduce our own specification for POSIX values,
   because the informal definition given by Sulzmann and Lu did not
   stand up to formal proof. Also for the second algorithm we needed
   to introduce our own definitions and proof ideas in order to
   establish its correctness.  Our interest in the second algorithm
   lies in the fact that by using bitcoded regular expressions and an
   aggressive simplification method there is a chance that the
   derivatives can be kept universally small (we established in this
   paper that for a given $r$ they can be kept finitely bounded for
   \emph{all} strings).  Our formalisation is approximately 7500 lines of
   Isabelle code. A little more than half of this code concerns the finiteness bound
   obtained in Section 5. This slight ``bloat'' in the latter part is because
   we had to introduce an intermediate datatype for annotated regular expressions and repeat many
   definitions for this intermediate datatype. But overall we think this made our
   formalisation work smoother. The code of our formalisation can be found at
   \textcolor{darkblue}{\url{https://github.com/urbanchr/posix}}.
   %This is important if one is after
   %an efficient POSIX lexing algorithm based on derivatives.

   Having proved the correctness of the POSIX lexing algorithm, what
   lessons have we learned? Well, we feel this is a very good example
   where formal proofs give further insight into the matter at
   hand. For example it is very hard to see the problem with @{text nub}
   versus @{text distinctWith} from experimental data alone---one would still
   obtain the correct result, but find that the simplification does not actually simplify in well-chosen, yet by no means
   obscure, examples.
   %We found that from an implementation
   %point-of-view it is really important to have the formal proofs of
   %the corresponding properties at hand.

   With the results reported here, we can of course only make a claim
   about the correctness of the algorithm and the sizes of the
   derivatives, not about the efficiency or runtime of our version of
   Sulzmann and Lu's algorithm. But we found that size is an important
   first indicator of efficiency: clearly if the derivatives can
   grow to arbitrarily big sizes and the algorithm needs to traverse
   the derivatives possibly several times, then the algorithm will be
   slow---excruciatingly slow, that is. Other works seem to make
   stronger claims, but during our formalisation work we have
   developed a healthy suspicion when, for example, experimental data is
   used to back up efficiency claims. For instance Sulzmann and Lu
   write about their equivalent of @{term blexer_simp} \textit{``...we
   can incrementally compute bitcoded parse trees in linear time in
   the size of the input''} \cite[Page 14]{Sulzmann2014}.  Given the
   growth of the derivatives in some cases even after aggressive
   simplification, this claim is hard to believe. A similar claim
   about a theoretical runtime of @{text "O(n\<^sup>2)"} for one
   specific list of regular expressions is made for the Verbatim
   lexer, which calculates tokens according to POSIX
   rules~\cite{verbatim}. For this, Verbatim uses Brzozowski's
   derivatives, as in our work.  About their empirical data, the authors write:
   \textit{``The results of our empirical tests [..] confirm that
   Verbatim has @{text "O(n\<^sup>2)"} time complexity.''}
   \cite[Section~VII]{verbatim}.  While their correctness proof for
   Verbatim is formalised in Coq, the claim about the runtime
   complexity is only supported by some empirical evidence obtained by
   using the code extraction facilities of Coq.  Given our observation
   with the ``growth problem'' of derivatives, this runtime bound
   is unlikely to hold universally: indeed we tried out their extracted OCaml
   code with the example \mbox{@{text "(a + aa)\<^sup>*"}} as a single
   lexing rule, and it took around 5 minutes to tokenise a
   string of 40 $a$'s, increasing to approximately 19 minutes
   for a string of 50 $a$'s. Taking into account that
   derivatives are not simplified in the Verbatim lexer, such numbers
   are not surprising.  Clearly our result of having finite
   derivatives might sound rather weak in this context but we think
   such efficiency claims really require further scrutiny.  The
   contribution of this paper is to make sure derivatives do not grow
   arbitrarily big (universally). In the example \mbox{@{text "(a +
   aa)\<^sup>*"}}, \emph{all} derivatives have a size of 17 or
   less. The result is that lexing a string of, say, 50\,000 a's with
   the regular expression \mbox{@{text "(a + aa)\<^sup>*"}} takes
   approximately 10 seconds with our Scala implementation of the
   presented algorithm.
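
   The size claim for \mbox{@{text "(a + aa)\<^sup>*"}} is easy to probe
   with the Scala sketch from the earlier sections. Below we add a
   node-counting function and take the maximum size over the first 50
   simplified derivatives with respect to $a$'s. The annotated regular
   expression is built by hand here, mimicking what internalisation would
   produce, and the exact maximum naturally depends on such representation
   details; the point is only that it stays small and constant, in line
   with the bound of 17 mentioned above.

   \begin{verbatim}
  def asize(r: ARexp): Int = r match {
    case AZERO | AONE(_) | ACHAR(_, _) => 1
    case AALTS(_, rs)    => 1 + rs.map(asize).sum
    case ASEQ(_, r1, r2) => 1 + asize(r1) + asize(r2)
    case ASTAR(_, r1)    => 1 + asize(r1)
  }

  // (a + aa)* with the bits that internalisation would add
  val aaa = ASTAR(Nil, AALTS(Nil, List(
    fuse(List(Bit.Z), ACHAR(Nil, 'a')),
    fuse(List(Bit.S), ASEQ(Nil, ACHAR(Nil, 'a'), ACHAR(Nil, 'a'))))))

  val ders = (1 to 50).scanLeft(aaa)((r, _) => bsimp(bder('a', r)))
  println(ders.map(asize).max)   // stays bounded for any input length
  \end{verbatim}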

   Finally, let us come back to the point about bounded regular
   expressions. We have in this paper only shown that @{term "NTIMES r
   n"} can be included, but all our results extend straightforwardly
   also to the other bounded regular expressions. We find bounded
   regular expressions fit naturally into the setting of Brzozowski
   derivatives and the bitcoded regular expressions by Sulzmann and Lu.
   In contrast, bounded regular expressions are often the Achilles'
   heel of regular expression matchers that use the traditional
   automata-based approach to lexing, primarily because they need to expand the
   counters of bounded regular expressions into $n$ connected copies
   of an automaton. This is not needed in Sulzmann and Lu's algorithm.
   To see the difference, consider for example the regular expression @{term "SEQ (NTIMES a
   1001) (STAR a)"}, which is not permitted in the Go language because
   the counter is too big. In contrast we have no problem with
   matching this regular expression against, say, 50\,000 $a$'s, because the
   counters can be kept compact. In fact, the overall size of the
   derivatives is never greater than 5 in this example. Even in the
   example from Section 2, where Rust raises an error message, namely
   \mbox{@{text "a\<^bsup>{1000}{100}{5}\<^esup>"}}, the maximum size for
   our derivatives is a moderate 14.

   Let us also return to the example @{text
   "a\<^bsup>{0}{4294967295}\<^esup>"} which until recently Rust
   deemed acceptable. But this was due to a bug. It turns out that it took Rust
   more than 11 minutes to generate an automaton for this regular
   expression and then to determine that a string of just one(!)
   @{text a} does \emph{not} match this regular expression.  Therefore
   it is probably a wise choice that in newer versions of Rust's
   regular expression library such regular expressions are declared as
   ``too big'' and raise an error message. While this is clearly
   a contrived example, the safety guarantees Rust wants to provide necessitate
   this conservative approach.
   However, with the derivatives and simplifications we have shown
   in this paper, the example can be solved with ease:
   it essentially only involves operations on
   integers and our Scala implementation takes only a few seconds to
   find out that this string, and even much longer strings, do not match.

   Let us also compare our work to the verified Verbatim++ lexer, where
   the authors of the Verbatim lexer introduced a number of
   improvements and optimisations, for example memoisation
   \cite{verbatimpp}. However, unlike Verbatim, which works with
   derivatives as in our work, Verbatim++ first compiles a regular
   expression into a DFA. While this makes lexing fast in many cases,
   with examples of bounded regular expressions like
   \mbox{@{text "a\<^bsup>{100}{5}\<^esup>"}}
   one needs to represent them as
   sequences of $a \cdot a \cdot \ldots \cdot a$ (500 a's in sequence). We have run
   their extracted code with such a regular expression as a
   single lexing rule and a string of 50\,000 a's---lexing in this case
   takes approximately 5~minutes. We are not aware of any better
   translation using the traditional notion of DFAs that would improve on this. Therefore we
   prefer to stick with calculating derivatives, but attempt to make
   this calculation (in the future) as fast as possible. What we can guarantee
   with the presented work is that the maximum size of the derivatives
   for \mbox{@{text "a\<^bsup>{100}{5}\<^esup>"}$\,\cdot\,$@{term "STAR a"}} is never bigger than 9. This means our Scala
   implementation again only needs a few seconds for this example when matching, say, 50\,000 $a$'s.
   %
   %
   %Possible ideas are
   %zippers which have been used in the context of parsing of
   %context-free grammars \cite{zipperparser}.
   %\\[-5mm]  %\smallskip
   
   %\noindent
   %Our Isabelle code including the results from Sec.~5 is available from \url{https://github.com/urbanchr/posix}.
   %%\\[-10mm]


%%\bibliographystyle{plain}
\bibliography{root}
*}

(*<*)
text {*
\newpage
\appendix

\section{Some Proofs}

While we have formalised \emph{all} results in Isabelle/HOL, below we shall give some
rough details of our reasoning in ``English'' if this is helpful for reviewing.

\begin{proof}[Proof of Lemma~\ref{codedecode}]
  This follows from the property that
  $\textit{decode}'\,((\textit{code}\,v) \,@\, bs)\,r = (v, bs)$ holds
  for any bit-sequence $bs$ and $\vdash v : r$. This property can be
  easily proved by induction on $\vdash v : r$.
\end{proof}  

\begin{proof}[Proof of Lemma~\ref{mainlemma}]
  This can be proved by induction on $s$ and generalising over
  $v$. The interesting point is that we need to prove this in the
  reverse direction for $s$. This means instead of cases $[]$ and
  $c\!::\!s$, we have cases $[]$ and $s\,@\,[c]$ where we unravel the
  string from the back.\footnote{Isabelle/HOL provides an induction principle
    for this way of performing the induction.}

  The case for $[]$ is routine using Lemmas~\ref{codedecode}
  and~\ref{retrievecode}. In the case $s\,@\,[c]$, we can infer from
  the assumption that $\vdash v : (r\backslash s)\backslash c$
  holds. Hence by Prop.~\ref{Posix2} we know that 
  (*) $\vdash \inj\,(r\backslash s)\,c\,v : r\backslash s$ holds too.
  By definition of $\textit{flex}$ we can unfold the left-hand side
  to be
  \[
    \textit{Some}\,(\textit{flex}\;r\,\textit{id}\,(s\,@\,[c])\,v) =
    \textit{Some}\,(\textit{flex}\;r\,\textit{id}\,s\,(\inj\,(r\backslash s)\,c\,v))  
  \]  

  \noindent
  By IH and (*) we can rewrite the right-hand side to
  $\textit{decode}\,(\textit{retrieve}\,(r^\uparrow\backslash s)\;
    (\inj\,(r\backslash s)\,c\,\,v))\,r$ which is equal to
  $\textit{decode}\,(\textit{retrieve}\, (r^\uparrow\backslash
  (s\,@\,[c]))\,v)\,r$ as required. The last rewrite step is possible
  because we generalised over $v$ in our induction.
\end{proof}

\begin{proof}[Proof of Theorem~\ref{thmone}]
  We can first expand both sides using Lem.~\ref{flex} and the
  definition of \textit{blexer}. This gives us two
  \textit{if}-statements, which we need to show to be equal. By 
  Lemma~\ref{bnullable}\textit{(2)} we know the \textit{if}-tests coincide:
  $\textit{bnullable}(r^\uparrow\backslash s) \;\textit{iff}\;
   \nullable(r\backslash s)$.
  For the \textit{if}-branch suppose $r_d \dn r^\uparrow\backslash s$ and
  $d \dn r\backslash s$. We have (*) @{text "nullable d"}. We can then show
  by Lemma~\ref{bnullable}\textit{(3)} that
  %
  \[
    \textit{decode}(\textit{bmkeps}\:r_d)\,r =
    \textit{decode}(\textit{retrieve}\,r_d\,(\textit{mkeps}\,d))\,r
  \]

  \noindent
  where the right-hand side is equal to
  $\textit{Some}\,(\textit{flex}\,r\,\textit{id}\,s\,(\textit{mkeps}\,
  d))$ by Lemma~\ref{mainlemma} (we know
  $\vdash \textit{mkeps}\,d : d$ by (*)).  This shows the
  \textit{if}-branches return the same value. In the
  \textit{else}-branches both \textit{lexer} and \textit{blexer} return
  \textit{None}. Therefore we can conclude the proof.
\end{proof}  

\begin{proof}[Proof of Lemma~\ref{lemone}]
     By induction on @{text r}. For this we can use the properties
     @{thm fltsfrewrites} and @{text "rs"}$\;\stackrel{s}{\leadsto}^*$@{text "distinctWith rs \<approx>"} @{term "{}"}. The latter uses
     repeated applications of the $LD$ rule which allows the removal
     of duplicates that can recognise the same strings. 
     \end{proof}

    \begin{proof}[Proof of Lemma~\ref{lembnull}]
     By straightforward mutual induction on the definition of $\leadsto$ and $\stackrel{s}{\leadsto}$.
     The only interesting case is the rule $LD$ where the property holds since by the side-conditions of that rule the empty string will
     be in both @{text "L (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ [r\<^sub>2] @ rs\<^sub>c)"} and
     @{text "L (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ rs\<^sub>c)"}.
     \end{proof}

 \begin{proof}[Proof of Lemma~\ref{lemthree}]
     By straightforward mutual induction on the definition of $\leadsto$ and $\stackrel{s}{\leadsto}$.
     Again the only interesting case is the rule $LD$ where we need to ensure that
     @{text "bmkeps (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ [r\<^sub>2] @ rs\<^sub>c) =
        bmkeps (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ rs\<^sub>c)"} holds.
     This is indeed the case because according to the POSIX rules the
     generated bitsequence is determined by the first alternative that can match the
     string (in this case being nullable).
     \end{proof}

     \begin{proof}[Proof of Lemma~\ref{bderlem}]
     By straightforward mutual induction on the definition of $\leadsto$ and $\stackrel{s}{\leadsto}$.
     The case for $LD$ holds because @{term "L (erase (bder c r\<^sub>2)) \<subseteq> L (erase (bder c r\<^sub>1))"}
     if and only if @{term "L (erase (r\<^sub>2)) \<subseteq> L (erase (r\<^sub>1))"}.
     \end{proof}
*}
(*>*)
(*<*)
end
(*>*)