(*<*)
theory Paper
  imports
   "Posix.FBound"
   "HOL-Library.LaTeXsugar"
begin

declare [[show_question_marks = false]]

notation (latex output)
  If  ("(\<^latex>\<open>\\textrm{\<close>if\<^latex>\<open>}\<close> (_)/ \<^latex>\<open>\\textrm{\<close>then\<^latex>\<open>}\<close> (_)/ \<^latex>\<open>\\textrm{\<close>else\<^latex>\<open>}\<close> (_))" 10) and
  Cons ("_\<^latex>\<open>\\mbox{$\\,$}\<close>::\<^latex>\<open>\\mbox{$\\,$}\<close>_" [75,73] 73)


abbreviation
  "der_syn r c \<equiv> der c r"
abbreviation
  "ders_syn r s \<equiv> ders s r"
abbreviation
  "bder_syn r c \<equiv> bder c r"

notation (latex output)
  der_syn ("_\\_" [79, 1000] 76) and
  ders_syn ("_\\_" [79, 1000] 76) and
  bder_syn ("_\\_" [79, 1000] 76) and
  bders ("_\\_" [79, 1000] 76) and
  bders_simp ("_\\\<^sub>b\<^sub>s\<^sub>i\<^sub>m\<^sub>p _" [79, 1000] 76) and

  ZERO ("\<^bold>0" 81) and
  ONE ("\<^bold>1" 81) and
  CH ("_" [1000] 80) and
  ALT ("_ + _" [77,77] 78) and
  SEQ ("_ \<cdot> _" [77,77] 78) and
  STAR ("_\<^sup>*" [79] 78) and

  val.Void ("Empty" 78) and
  val.Char ("Char _" [1000] 78) and
  val.Left ("Left _" [79] 78) and
  val.Right ("Right _" [1000] 78) and
  val.Seq ("Seq _ _" [79,79] 78) and
  val.Stars ("Stars _" [79] 78) and

  Prf ("\<turnstile> _ : _" [75,75] 75) and
  Posix ("'(_, _') \<rightarrow> _" [63,75,75] 75) and

  flat ("|_|" [75] 74) and
  flats ("|_|" [72] 74) and
  injval ("inj _ _ _" [79,77,79] 76) and
  mkeps ("mkeps _" [79] 76) and
  length ("len _" [73] 73) and
  set ("_" [73] 73) and

  AZERO ("ZERO" 81) and
  AONE ("ONE _" [79] 78) and
  ACHAR ("CHAR _ _" [79, 79] 80) and
  AALTs ("ALTs _ _" [77,77] 78) and
  ASEQ ("SEQ _ _ _" [79, 79,79] 78) and
  ASTAR ("STAR _ _" [79, 79] 78) and

  code ("code _" [79] 74) and
  intern ("_\<^latex>\<open>\\mbox{$^\\uparrow$}\<close>" [900] 80) and
  erase ("_\<^latex>\<open>\\mbox{$^\\downarrow$}\<close>" [1000] 74) and
  bnullable ("bnullable _" [1000] 80) and
  bsimp_AALTs ("bsimpALT _ _" [10,1000] 80) and
  bsimp_ASEQ ("bsimpSEQ _ _ _" [10,1000,1000] 80) and
  bmkeps ("bmkeps _" [1000] 80) and

  eq1 ("_ \<approx> _" [77,77] 76) and

  srewrite ("_\<^latex>\<open>\\mbox{$\\,\\stackrel{s}{\\leadsto}$}\<close> _" [71, 71] 80) and
  rrewrites ("_ \<^latex>\<open>\\mbox{$\\,\\leadsto^*$}\<close> _" [71, 71] 80) and
  srewrites ("_ \<^latex>\<open>\\mbox{$\\,\\stackrel{s}{\\leadsto}^*$}\<close> _" [71, 71] 80) and
  blexer_simp ("blexer\<^sup>+" 1000)


lemma better_retrieve:
   shows "rs \<noteq> Nil ==> retrieve (AALTs bs (r#rs)) (Left v) = bs @ retrieve r v"
   and   "rs \<noteq> Nil ==> retrieve (AALTs bs (r#rs)) (Right v) = bs @ retrieve (AALTs [] rs) v"
  apply (metis list.exhaust retrieve.simps(4))
  by (metis list.exhaust retrieve.simps(5))


(*>*)
       
section {* Introduction *}

text {*

In the last fifteen or so years, Brzozowski's derivatives of regular
expressions have sparked quite a bit of interest in the functional
programming and theorem prover communities.  The beauty of
Brzozowski's derivatives \cite{Brzozowski1964} is that they are neatly
expressible in any functional language, and easily definable and
reasoned about in theorem provers---the definitions just consist of
inductive datatypes and simple recursive functions.  Derivatives of a
regular expression, written @{term "der c r"}, give a simple solution
to the problem of matching a string @{term s} with a regular
expression @{term r}: if the derivative of @{term r} w.r.t.\ (in
succession) all the characters of the string matches the empty string,
then @{term r} matches @{term s} (and {\em vice versa}).  We are aware
of a mechanised correctness proof of Brzozowski's derivative-based matcher in HOL4 by
Owens and Slind~\cite{Owens2008}. Another one in Isabelle/HOL is part
of the work by Krauss and Nipkow~\cite{Krauss2011}.  And another one
in Coq is given by Coquand and Siles \cite{Coquand2012}.
Also Ribeiro and Du Bois give one in Agda~\cite{RibeiroAgda2017}.


However, there are two difficulties with derivative-based matchers:
First, Brzozowski's original matcher only generates a yes/no answer
for whether a regular expression matches a string or not.  This is too
little information in the context of lexing, where separate tokens must
be identified and also classified (for example as keywords
or identifiers).  Sulzmann and Lu~\cite{Sulzmann2014} overcome this
difficulty by cleverly extending Brzozowski's matching
algorithm. Their extended version generates additional information on
\emph{how} a regular expression matches a string following the POSIX
rules for regular expression matching. They achieve this by adding a
second ``phase'' to Brzozowski's algorithm involving an injection
function.  In our own earlier work we provided the formal
specification of what POSIX matching means and proved in Isabelle/HOL
the correctness
of Sulzmann and Lu's extended algorithm accordingly
\cite{AusafDyckhoffUrban2016}.

The second difficulty is that Brzozowski's derivatives can
grow to arbitrarily large sizes. For example if we start with the
regular expression \mbox{@{text "(a + aa)\<^sup>*"}} and take
successive derivatives according to the character $a$, we end up with
a sequence of ever-growing derivatives like

\def\ll{\stackrel{\_\backslash{} a}{\longrightarrow}}
\begin{center}
\begin{tabular}{rll}
$(a + aa)^*$ & $\ll$ & $(\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & $(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* \;+\; (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & $(\ZERO + \ZERO{}a + \ZERO) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^* \;+\; $\\
& & $\qquad(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & \ldots \hspace{15mm}(regular expressions of sizes 98, 169, 283, 468, 767, \ldots)
\end{tabular}
\end{center}

\noindent where after around 35 steps we run out of memory on a
typical computer (we shall define shortly the precise details of our
regular expressions and the derivative operation).  Clearly, the
notation involving $\ZERO$s and $\ONE$s already suggests
simplification rules that can be applied to regular
expressions, for example $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r
\Rightarrow r$, $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow
r$. While such simple-minded simplifications have been proved in our
earlier work to preserve the correctness of Sulzmann and Lu's
algorithm \cite{AusafDyckhoffUrban2016}, they unfortunately do
\emph{not} help with limiting the growth of the derivatives shown
above: the growth is slowed, but the derivatives can still grow rather
quickly beyond any finite bound.
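
\noindent
The growth can be observed directly with a small functional program.
The following Haskell sketch is ours and not part of the
formalisation; it uses the textbook definitions of \textit{der} and
\textit{nullable} given in the next section, together with an
illustrative \textit{size} function that counts constructors.

{\small
\begin{verbatim}
data Rexp = ZERO | ONE | CH Char
          | ALT Rexp Rexp | SEQ Rexp Rexp | STAR Rexp

nullable :: Rexp -> Bool
nullable ZERO        = False
nullable ONE         = True
nullable (CH _)      = False
nullable (ALT r1 r2) = nullable r1 || nullable r2
nullable (SEQ r1 r2) = nullable r1 && nullable r2
nullable (STAR _)    = True

der :: Char -> Rexp -> Rexp
der _ ZERO        = ZERO
der _ ONE         = ZERO
der c (CH d)      = if c == d then ONE else ZERO
der c (ALT r1 r2) = ALT (der c r1) (der c r2)
der c (SEQ r1 r2) = if nullable r1
                    then ALT (SEQ (der c r1) r2) (der c r2)
                    else SEQ (der c r1) r2
der c (STAR r)    = SEQ (der c r) (STAR r)

-- number of constructors in a regular expression
size :: Rexp -> Int
size (ALT r1 r2) = 1 + size r1 + size r2
size (SEQ r1 r2) = 1 + size r1 + size r2
size (STAR r)    = 1 + size r
size _           = 1

-- sizes of the first ten derivatives of (a + aa)* w.r.t. a
main :: IO ()
main = print (map size (take 10 (iterate (der 'a') r)))
  where r = STAR (ALT (CH 'a') (SEQ (CH 'a') (CH 'a')))
\end{verbatim}}

\noindent
Running this sketch prints a rapidly growing sequence of sizes (the
exact numbers depend on how one counts nodes).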
       
Sulzmann and Lu overcome this ``growth problem'' in a second algorithm
\cite{Sulzmann2014} where they introduce bitcoded
regular expressions. In this version, POSIX values are
represented as bitsequences and such sequences are incrementally generated
when derivatives are calculated. The compact representation
of bitsequences and regular expressions allows them to define a more
``aggressive'' simplification method that keeps the size of the
derivatives finitely bounded no matter what the length of the string is.
They make some informal claims about the correctness and linear behaviour
of this version, but do not provide any supporting proof arguments, not
even ``pencil-and-paper'' arguments. They write about their bitcoded
\emph{incremental parsing method} (that is the algorithm to be formalised
in this paper):
%
\begin{quote}\it
  ``Correctness Claim: We further claim that the incremental parsing
  method [..] in combination with the simplification steps [..]
  yields POSIX parse trees. We have tested this claim
  extensively [..] but yet
  have to work out all proof details.'' \cite[Page 14]{Sulzmann2014}
\end{quote}

\noindent{}\textbf{Contributions:} We have implemented in Isabelle/HOL
the derivative-based lexing algorithm of Sulzmann and Lu
\cite{Sulzmann2014} where regular expressions are annotated with
bitsequences. We define the crucial simplification function as a
recursive function, without the need for a fix-point operation. One objective of
the simplification function is to remove duplicates of regular
expressions.  For this Sulzmann and Lu use in their paper the standard
@{text nub} function from Haskell's list library, but this function
does not achieve the intended objective with bitcoded regular expressions.  The
reason is that in the bitcoded setting, each copy generally has a
different bitcode annotation---so @{text nub} would never ``fire''.
Inspired by Scala's library for lists, we shall instead use a @{text
distinctWith} function that finds duplicates under an ``erasing'' function
which deletes bitcodes before comparing regular expressions.
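
\noindent
For illustration, the essence of such a \textit{distinctWith} function
can be sketched in a few lines of Haskell (this sketch is ours; the
name \textit{distinctWith} and the projection parameter are
illustrative only):

{\small
\begin{verbatim}
-- remove duplicates up to a projection function f;
-- nub corresponds to the special case distinctWith id
distinctWith :: Eq b => (a -> b) -> [a] -> [a]
distinctWith f = go []
  where go _    []     = []
        go seen (x:xs)
          | f x `elem` seen = go seen xs
          | otherwise       = x : go (f x : seen) xs
\end{verbatim}}

\noindent
Instantiating \textit{f} with the erase function means two bitcoded
regular expressions count as duplicates precisely when they erase to
the same (unannotated) regular expression, even if their bitcode
annotations differ.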
       
We shall also introduce our own argument and definitions for
establishing the correctness of the bitcoded algorithm when
simplifications are included. Finally we
establish that the size of derivatives can be finitely bounded.\medskip%\footnote{ In this paper, we shall first briefly introduce the basic notions
%of regular expressions and describe the definition
%of POSIX lexing from our earlier work \cite{AusafDyckhoffUrban2016}. This serves
%as a reference point for what correctness means in our Isabelle/HOL proofs. We shall then prove
%the correctness for the bitcoded algorithm without simplification, and
%after that extend the proof to include simplification.}\smallskip
%\mbox{}\\[-10mm]

\noindent In this paper, we shall first briefly introduce the basic notions
of regular expressions and describe the definition
of POSIX lexing from our earlier work \cite{AusafDyckhoffUrban2016}. This serves
as a reference point for what correctness means in our Isabelle/HOL proofs. We shall then prove
the correctness for the bitcoded algorithm without simplification, and
after that extend the proof to include simplification.\mbox{}\\[-6mm]

*}

section {* Background *}

text {*
  In our Isabelle/HOL formalisation strings are lists of characters with
  the empty string being represented by the empty list, written $[]$,
  and list-cons being written as $\_\!::\!\_\,$; string
  concatenation is $\_ \,@\, \_\,$. We often use the usual
  bracket notation for lists also for strings; for example a string
  consisting of just a single character $c$ is written $[c]$.
  Our regular expressions are defined as usual as the following inductive
  datatype:\\[-4mm]
  %
  \begin{center}
  @{text "r ::="} \;
  @{const "ZERO"} $\mid$
  @{const "ONE"} $\mid$
  @{term "CH c"} $\mid$
  @{term "ALT r\<^sub>1 r\<^sub>2"} $\mid$
  @{term "SEQ r\<^sub>1 r\<^sub>2"} $\mid$
  @{term "STAR r"}
  \end{center}

  \noindent where @{const ZERO} stands for the regular expression that does
  not match any string, @{const ONE} for the regular expression that matches
  only the empty string and @{term c} for matching a character literal.
  The constructors $+$ and $\cdot$ represent alternatives and sequences, respectively.
  We sometimes omit the $\cdot$ in a sequence regular expression for brevity.
  The
  \emph{language} of a regular expression, written $L(r)$, is defined as usual
  and we omit giving the definition here (see for example \cite{AusafDyckhoffUrban2016}).

  Central to Brzozowski's regular expression matcher are two functions
  called @{text nullable} and \emph{derivative}. The latter is written
  $r\backslash c$ for the derivative of the regular expression $r$
  w.r.t.~the character $c$. Both functions are defined by recursion over
  regular expressions.
%
\begin{center}
\begin{tabular}{cc}
  \begin{tabular}{r@ {\hspace{2mm}}c@ {\hspace{2mm}}l}
  @{thm (lhs) der.simps(1)} & $\dn$ & @{thm (rhs) der.simps(1)}\\
  @{thm (lhs) der.simps(2)} & $\dn$ & @{thm (rhs) der.simps(2)}\\
  @{thm (lhs) der.simps(3)} & $\dn$ & @{thm (rhs) der.simps(3)}\\
  @{thm (lhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]}\\
  @{thm (lhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{text "if"} @{term "nullable(r\<^sub>1)"}\\
  & & @{text "then"} @{term "ALT (SEQ (der c r\<^sub>1) r\<^sub>2) (der c r\<^sub>2)"}\\
  & & @{text "else"} @{term "SEQ (der c r\<^sub>1) r\<^sub>2"}\\
  % & & @{thm (rhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]}\\
  @{thm (lhs) der.simps(6)} & $\dn$ & @{thm (rhs) der.simps(6)}
  \end{tabular}
  &
  \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
  @{thm (lhs) nullable.simps(1)} & $\dn$ & @{thm (rhs) nullable.simps(1)}\\
  @{thm (lhs) nullable.simps(2)} & $\dn$ & @{thm (rhs) nullable.simps(2)}\\
  @{thm (lhs) nullable.simps(3)} & $\dn$ & @{thm (rhs) nullable.simps(3)}\\
  @{thm (lhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
  @{thm (lhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
  @{thm (lhs) nullable.simps(6)} & $\dn$ & @{thm (rhs) nullable.simps(6)}%\medskip\\
  \end{tabular}
\end{tabular}
\end{center}

  \noindent
  We can extend this definition to give derivatives w.r.t.~strings,
  namely @{thm (lhs) ders.simps(1)}$\dn$@{thm (rhs) ders.simps(1)}
  and @{thm (lhs) ders.simps(2)}$\dn$@{thm (rhs) ders.simps(2)}.
  Using @{text nullable} and the derivative operation, we can
define a simple regular expression matcher, namely
$@{text "match s r"} \;\dn\; @{term nullable}(r\backslash s)$.
This is essentially Brzozowski's algorithm from 1964. Its
main virtue is that the algorithm can be easily implemented as a
functional program (either in a functional programming language or in
a theorem prover). The correctness of @{text match} amounts to
establishing the property:%\footnote{It is a fun exercise to formally prove this property in a theorem prover.}
%
\begin{proposition}\label{matchcorr}
@{text "match s r"} \;\;\text{if and only if}\;\; $s \in L(r)$
\end{proposition}

\noindent
It is a fun exercise to formally prove this property in a theorem prover.
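
\noindent
In our running Haskell sketch (with \textit{Rexp}, \textit{nullable}
and \textit{der} as in the Introduction), the matcher amounts to no
more than:

{\small
\begin{verbatim}
-- derivative w.r.t. a string, then test for the empty string
ders :: String -> Rexp -> Rexp
ders []     r = r
ders (c:cs) r = ders cs (der c r)

match :: String -> Rexp -> Bool
match s r = nullable (ders s r)
\end{verbatim}}

\noindent
For example \texttt{match "aaa" r} evaluates to \texttt{True} for the
regular expression \mbox{@{text "(a + aa)\<^sup>*"}} from the
Introduction.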
       
The novel idea of Sulzmann and Lu is to extend this algorithm for
lexing, where it is important to find out which part of the string
is matched by which part of the regular expression.
For this Sulzmann and Lu presented two lexing algorithms in their paper
  \cite{Sulzmann2014}. The first algorithm consists of two phases: first a
  matching phase (which is Brzozowski's algorithm) and then a value
  construction phase. The values encode \emph{how} a regular expression
  matches a string. \emph{Values} are defined as the inductive datatype
  %
  \begin{center}
  @{text "v ::="}
  @{const "Void"} $\mid$
  @{term "val.Char c"} $\mid$
  @{term "Left v"} $\mid$
  @{term "Right v"} $\mid$
  @{term "Seq v\<^sub>1 v\<^sub>2"} $\mid$
  @{term "Stars vs"}
  \end{center}

  \noindent where we use @{term vs} to stand for a list of values. The
  string underlying a value can be calculated by a @{const flat}
  function, written @{term "flat DUMMY"}. It traverses a value and
  collects the characters contained in it \cite{AusafDyckhoffUrban2016}.
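
\noindent
As a sketch (ours, with primed constructor names only to avoid a clash
with Haskell's Prelude), values and \textit{flat} can be rendered as:

{\small
\begin{verbatim}
data Val = Void | Chr Char | Left' Val | Right' Val
         | Sq Val Val | Stars [Val]

-- the string underlying a value
flat :: Val -> String
flat Void        = []
flat (Chr c)     = [c]
flat (Left' v)   = flat v
flat (Right' v)  = flat v
flat (Sq v1 v2)  = flat v1 ++ flat v2
flat (Stars vs)  = concatMap flat vs
\end{verbatim}}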
       
  Sulzmann and Lu also define inductively an
  inhabitation relation that associates values to regular expressions. This
  is defined by the following six rules for the values:
  %
  \begin{center}
  \begin{tabular}{@ {\hspace{-12mm}}c@ {}}
  \begin{tabular}{@ {}c@ {}}
  \\[-8mm]
  @{thm[mode=Axiom] Prf.intros(4)}\\
  @{thm[mode=Axiom] Prf.intros(5)[of "c"]}
  \end{tabular}
  \quad
  @{thm[mode=Rule] Prf.intros(2)[of "v\<^sub>1" "r\<^sub>1" "r\<^sub>2"]}\quad
  @{thm[mode=Rule] Prf.intros(3)[of "v\<^sub>2" "r\<^sub>2" "r\<^sub>1"]}\quad
  @{thm[mode=Rule] Prf.intros(1)[of "v\<^sub>1" "r\<^sub>1" "v\<^sub>2" "r\<^sub>2"]}\quad
  @{thm[mode=Rule] Prf.intros(6)[of "vs" "r"]}\mbox{}\hspace{-10mm}\mbox{}
  \end{tabular}
  \end{center}

  \noindent Note that no values are associated with the regular expression
  @{term ZERO}, since it cannot match any string.
  It is routine to establish how values ``inhabiting'' a regular
  expression correspond to the language of a regular expression, namely
  @{thm L_flat_Prf}.

  In general a regular expression is inhabited by more than one value
  (meaning regular expressions can typically match more
  than one string). But even when fixing a string from the language of the
  regular expression, there is generally more than one way in which the
  regular expression can match this string. POSIX lexing is about
  identifying the unique value for a given regular expression and a
  string that satisfies the informal POSIX rules (see
  \cite{POSIX,Kuklewicz,OkuiSuzuki2010,Sulzmann2014,Vansummeren2006}).\footnote{POSIX
	lexing acquired its name from the fact that the corresponding
	rules were described as part of the POSIX specification for
	Unix-like operating systems \cite{POSIX}.} Sometimes these
  informal rules are referred to as the \emph{maximal munch rule} and \emph{rule priority}.
  One contribution of our earlier paper is to give a convenient
  specification for what POSIX values are (the inductive rules are shown in
  Figure~\ref{POSIXrules}).

\begin{figure}[t]
  \begin{center}
  \begin{tabular}{@ {}c@ {}}
  @{thm[mode=Axiom] Posix.intros(1)}\<open>P\<close>@{term "ONE"} \quad
  @{thm[mode=Axiom] Posix.intros(2)}\<open>P\<close>@{term "c"}\quad
  @{thm[mode=Rule] Posix.intros(3)[of "s" "r\<^sub>1" "v" "r\<^sub>2"]}\<open>P+L\<close>\quad
  @{thm[mode=Rule] Posix.intros(4)[of "s" "r\<^sub>2" "v" "r\<^sub>1"]}\<open>P+R\<close>\medskip\\
  $\mprset{flushleft}
   \inferrule
   {@{thm (prem 1) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \qquad
    @{thm (prem 2) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \\\\
    @{thm (prem 3) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}
   {@{thm (concl) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}$\<open>PS\<close>\medskip\smallskip\\
  @{thm[mode=Axiom] Posix.intros(7)}\<open>P[]\<close>\qquad
  $\mprset{flushleft}
   \inferrule
   {@{thm (prem 1) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
    @{thm (prem 2) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
    @{thm (prem 3) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \\\\
    @{thm (prem 4) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}
   {@{thm (concl) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}$\<open>P\<star>\<close>\\[-4mm]
  \end{tabular}
  \end{center}
  \caption{The inductive definition of POSIX values taken from our earlier paper \cite{AusafDyckhoffUrban2016}. The ternary relation, written $(s, r) \rightarrow v$, formalises the notion
  that, given a string $s$ and a regular
  expression $r$, the value $v$ is the unique one that satisfies the informal POSIX constraints for
  regular expression matching.}\label{POSIXrules}
  \end{figure}

  The clever idea by Sulzmann and Lu \cite{Sulzmann2014} in their first algorithm is to define
  an injection function on values that mirrors (but inverts) the
  construction of the derivative on regular expressions. Essentially it
  injects back a character into a value.
  For this they define two functions called @{text mkeps} and @{text inj}:
 %
  \begin{center}
  \begin{tabular}{@ {}l@ {}}
  \begin{tabular}{@ {}lcl@ {}}
  @{thm (lhs) mkeps.simps(1)} & $\dn$ & @{thm (rhs) mkeps.simps(1)}\\
  @{thm (lhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]}\\
  @{thm (lhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]}\\
  @{thm (lhs) mkeps.simps(4)} & $\dn$ & @{thm (rhs) mkeps.simps(4)}\\
  \end{tabular}\smallskip\\

  \begin{tabular}{@ {}cc@ {}}
  \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}@ {}}
  @{thm (lhs) injval.simps(1)} & $\dn$ & @{thm (rhs) injval.simps(1)}\\
  @{thm (lhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]} & $\dn$ &
      @{thm (rhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]}\\
  @{thm (lhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$ &
      @{thm (rhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\
  @{thm (lhs) injval.simps(7)[of "r" "c" "v" "vs"]} & $\dn$
      & @{thm (rhs) injval.simps(7)[of "r" "c" "v" "vs"]}
  \end{tabular}
  &
  \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}@ {}}
  @{thm (lhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$
      & @{thm (rhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\
  @{thm (lhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$
      & @{thm (rhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\
  @{thm (lhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$\\
  \multicolumn{3}{@ {}r@ {}}{@{thm (rhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}}
  \end{tabular}
  \end{tabular}
  \end{tabular}\smallskip
  \end{center}

  \noindent
  The function @{text mkeps} is run when the last derivative is nullable, that is,
  when the string to be matched is in the language of the regular expression. It generates
  a value for how the last derivative can match the empty string. The injection function
  then calculates the corresponding value for each intermediate derivative until
  a value for the original regular expression is generated.
  Graphically the algorithm by
  Sulzmann and Lu can be illustrated by the following picture %in Figure~\ref{Sulz}
  where the path from the left to the right involving @{term derivatives}/@{const
  nullable} is the first phase of the algorithm (calculating successive
  \Brz's derivatives) and @{const mkeps}/@{text inj}, the path from right to
  left, the second phase.
%
\begin{center}
\begin{tikzpicture}[scale=0.99,node distance=9mm,
                    every node/.style={minimum size=6mm}]
\node (r1)  {@{term "r\<^sub>1"}};
\node (r2) [right=of r1]{@{term "r\<^sub>2"}};
\draw[->,line width=1mm](r1)--(r2) node[above,midway] {@{term "der a DUMMY"}};
\node (r3) [right=of r2]{@{term "r\<^sub>3"}};
\draw[->,line width=1mm](r2)--(r3) node[above,midway] {@{term "der b DUMMY"}};
\node (r4) [right=of r3]{@{term "r\<^sub>4"}};
\draw[->,line width=1mm](r3)--(r4) node[above,midway] {@{term "der c DUMMY"}};
\draw (r4) node[anchor=west] {\;\raisebox{3mm}{@{term nullable}}};
\node (v4) [below=of r4]{@{term "v\<^sub>4"}};
\draw[->,line width=1mm](r4) -- (v4);
\node (v3) [left=of v4] {@{term "v\<^sub>3"}};
\draw[->,line width=1mm](v4)--(v3) node[below,midway] {\<open>inj r\<^sub>3 c\<close>};
\node (v2) [left=of v3]{@{term "v\<^sub>2"}};
\draw[->,line width=1mm](v3)--(v2) node[below,midway] {\<open>inj r\<^sub>2 b\<close>};
\node (v1) [left=of v2] {@{term "v\<^sub>1"}};
\draw[->,line width=1mm](v2)--(v1) node[below,midway] {\<open>inj r\<^sub>1 a\<close>};
\draw (r4) node[anchor=north west] {\;\raisebox{-8mm}{@{term "mkeps"}}};
\end{tikzpicture}
\end{center}
%
\noindent
  The picture shows the steps required when a
  regular expression, say @{text "r\<^sub>1"}, matches the string @{term
  "[a,b,c]"}. The first lexing algorithm by Sulzmann and Lu can be defined as:\\[-8mm]

%  \begin{figure}[t]
%\begin{center}
%\begin{tikzpicture}[scale=1,node distance=1cm,
%                    every node/.style={minimum size=6mm}]
%\node (r1)  {@{term "r\<^sub>1"}};
%\node (r2) [right=of r1]{@{term "r\<^sub>2"}};
%\draw[->,line width=1mm](r1)--(r2) node[above,midway] {@{term "der a DUMMY"}};
%\node (r3) [right=of r2]{@{term "r\<^sub>3"}};
%\draw[->,line width=1mm](r2)--(r3) node[above,midway] {@{term "der b DUMMY"}};
%\node (r4) [right=of r3]{@{term "r\<^sub>4"}};
%\draw[->,line width=1mm](r3)--(r4) node[above,midway] {@{term "der c DUMMY"}};
%\draw (r4) node[anchor=west] {\;\raisebox{3mm}{@{term nullable}}};
%\node (v4) [below=of r4]{@{term "v\<^sub>4"}};
%\draw[->,line width=1mm](r4) -- (v4);
%\node (v3) [left=of v4] {@{term "v\<^sub>3"}};
%\draw[->,line width=1mm](v4)--(v3) node[below,midway] {\<open>inj r\<^sub>3 c\<close>};
%\node (v2) [left=of v3]{@{term "v\<^sub>2"}};
%\draw[->,line width=1mm](v3)--(v2) node[below,midway] {\<open>inj r\<^sub>2 b\<close>};
%\node (v1) [left=of v2] {@{term "v\<^sub>1"}};
%\draw[->,line width=1mm](v2)--(v1) node[below,midway] {\<open>inj r\<^sub>1 a\<close>};
%\draw (r4) node[anchor=north west] {\;\raisebox{-8mm}{@{term "mkeps"}}};
%\end{tikzpicture}
%\end{center}
%\mbox{}\\[-13mm]
%
%\caption{The two phases of the first algorithm by Sulzmann \& Lu \cite{Sulzmann2014},
%matching the string @{term "[a,b,c]"}. The first phase (the arrows from
%left to right) is \Brz's matcher building successive derivatives. If the
%last regular expression is @{term nullable}, then the functions of the
%second phase are called (the top-down and right-to-left arrows): first
%@{term mkeps} calculates a value @{term "v\<^sub>4"} witnessing
%how the empty string has been recognised by @{term "r\<^sub>4"}. After
%that the function @{term inj} ``injects back'' the characters of the string into
%the values. The value @{term "v\<^sub>1"} is the result of the algorithm representing
%the POSIX value for this string and
%regular expression.
%\label{Sulz}}
%\end{figure}

  \begin{center}
  \begin{tabular}{lcl}
  @{thm (lhs) lexer.simps(1)} & $\dn$ & @{thm (rhs) lexer.simps(1)}\\
  @{thm (lhs) lexer.simps(2)} & $\dn$ & @{text "case"} @{term "lexer (der c r) s"} @{text of}
                     @{term "None"}  @{text "\<Rightarrow>"} @{term None}\\
                     & & \hspace{24mm}$|\;$ @{term "Some v"} @{text "\<Rightarrow>"} @{term "Some (injval r c v)"}
  \end{tabular}
  \end{center}
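
\noindent
Continuing our Haskell sketch (with \textit{Rexp}, \textit{Val},
\textit{nullable} and \textit{der} from above), \textit{mkeps}, the
injection function and the lexer can be rendered as follows; as
before, this is an illustration of ours, not the formalisation itself:

{\small
\begin{verbatim}
mkeps :: Rexp -> Val
mkeps ONE         = Void
mkeps (ALT r1 r2) = if nullable r1 then Left' (mkeps r1)
                                   else Right' (mkeps r2)
mkeps (SEQ r1 r2) = Sq (mkeps r1) (mkeps r2)
mkeps (STAR _)    = Stars []

inj :: Rexp -> Char -> Val -> Val
inj (CH d)      _ Void                = Chr d
inj (ALT r1 _)  c (Left' v)           = Left' (inj r1 c v)
inj (ALT _ r2)  c (Right' v)          = Right' (inj r2 c v)
inj (SEQ r1 _)  c (Sq v1 v2)          = Sq (inj r1 c v1) v2
inj (SEQ r1 _)  c (Left' (Sq v1 v2))  = Sq (inj r1 c v1) v2
inj (SEQ r1 r2) c (Right' v)          = Sq (mkeps r1) (inj r2 c v)
inj (STAR r)    c (Sq v (Stars vs))   = Stars (inj r c v : vs)

lexer :: Rexp -> String -> Maybe Val
lexer r []    = if nullable r then Just (mkeps r) else Nothing
lexer r (c:s) = case lexer (der c r) s of
                  Nothing -> Nothing
                  Just v  -> Just (inj r c v)
\end{verbatim}}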
       
We have shown in our earlier paper \cite{AusafDyckhoffUrban2016} that
this algorithm is correct, that is, it generates POSIX values. The
central property we established relates the derivative operation to the
injection function.

  \begin{proposition}\label{Posix2}
	\textit{If} $(s,\; r\backslash c) \rightarrow v$ \textit{then} $(c :: s,\; r) \rightarrow$ \textit{inj} $r\; c\; v$.
\end{proposition}

  \noindent
  With this in place we were able to prove:


  \begin{proposition}\mbox{}\smallskip\\\label{lexercorrect}
  \begin{tabular}{ll}
  (1) & @{thm (lhs) lexer_correct_None} if and only if @{thm (rhs) lexer_correct_None}\\
  (2) & @{thm (lhs) lexer_correct_Some} if and only if @{thm (rhs) lexer_correct_Some}\\
  \end{tabular}
  \end{proposition}

  \noindent
  In fact we have shown that, in the success case, the generated POSIX value $v$ is
  unique and in the failure case that there is no POSIX value $v$ that satisfies
  $(s, r) \rightarrow v$. While the algorithm is correct, it is excruciatingly
  slow in cases where the derivatives grow arbitrarily (recall the example from the
  Introduction). However it can be used as a convenient reference point for the correctness
  proof of the second algorithm by Sulzmann and Lu, which we shall describe next.

*}

section {* Bitcoded Regular Expressions and Derivatives *}

text {*

  In the second part of their paper \cite{Sulzmann2014},
  Sulzmann and Lu describe another algorithm that also generates POSIX
  values but dispenses with the second phase where characters are
  injected ``back'' into values. For this they annotate regular
  expressions with bitcodes; the resulting bitcoded regular expressions
  are defined in Isabelle/HOL as the datatype

  \begin{center}
  @{term breg} $\;::=\;$ @{term "AZERO"}
               $\;\mid\;$ @{term "AONE bs"}
               $\;\mid\;$ @{term "ACHAR bs c"}
               $\;\mid\;$ @{term "AALTs bs rs"}
               $\;\mid\;$ @{term "ASEQ bs r\<^sub>1 r\<^sub>2"}
               $\;\mid\;$ @{term "ASTAR bs r"}
  \end{center}

  \noindent where @{text bs} stands for bitsequences; @{text r},
  @{text "r\<^sub>1"} and @{text "r\<^sub>2"} for bitcoded regular
  expressions; and @{text rs} for lists of bitcoded regular
  expressions. The binary alternative @{text "ALT bs r\<^sub>1 r\<^sub>2"}
  is just an abbreviation for \mbox{@{text "ALTs bs [r\<^sub>1, r\<^sub>2]"}}.
  For bitsequences we use lists made up of the
  constants @{text Z} and @{text S}.  The idea with bitcoded regular
  expressions is to incrementally generate the value information (for
  example @{text Left} and @{text Right}) as bitsequences. For this
  Sulzmann and Lu define a coding
  function for how values can be coded into bitsequences.

  \begin{center}
  \begin{tabular}{cc}
  \begin{tabular}{lcl}
  @{thm (lhs) code.simps(1)} & $\dn$ & @{thm (rhs) code.simps(1)}\\
  @{thm (lhs) code.simps(2)} & $\dn$ & @{thm (rhs) code.simps(2)}\\
  @{thm (lhs) code.simps(3)} & $\dn$ & @{thm (rhs) code.simps(3)}\\
  @{thm (lhs) code.simps(4)} & $\dn$ & @{thm (rhs) code.simps(4)}
  \end{tabular}
  &
  \begin{tabular}{lcl}
  @{thm (lhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]} & $\dn$ & @{thm (rhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]}\\
  @{thm (lhs) code.simps(6)} & $\dn$ & @{thm (rhs) code.simps(6)}\\
  @{thm (lhs) code.simps(7)} & $\dn$ & @{thm (rhs) code.simps(7)}\\
  \mbox{\phantom{XX}}\\
  \end{tabular}
  \end{tabular}
  \end{center}

  \noindent
  As can be seen, this coding is ``lossy'' in the sense that we do not
  explicitly record character values, nor sequence values (for
  the latter we just append the two bitsequences). However, the
  different alternatives for @{text Left}, respectively @{text Right}, are recorded as @{text Z} and
  @{text S} followed by some bitsequence. Similarly, we use @{text Z} to indicate
  if there is still a value coming in the list of @{text Stars}, whereas @{text S}
  indicates the end of the list. The lossiness makes the process of
  decoding a bit more involved, but the point is that if we have a
  regular expression \emph{and} a bitsequence of a corresponding value,
  then we can always decode the value accurately (see Fig.~\ref{decode}).
  The function \textit{decode} checks whether all of the bitsequence is
  consumed and returns the corresponding value as @{term "Some v"}; otherwise
  it fails with @{text "None"}. We can establish that for a value $v$
  inhabiting a regular expression $r$, the decoding of its
  bitsequence never fails.

  %The decoding can be
  %defined by using two functions called $\textit{decode}'$ and
  %\textit{decode}:

  \begin{figure}
  \begin{center}
  \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
  \multicolumn{3}{@ {}l}{$\textit{decode}'\,bs\,(\ONE)$ $\;\dn\;$ $(\Empty, bs)$ \quad\qquad
  $\textit{decode}'\,bs\,(c)$ $\;\dn\;$ $(\Char\,c, bs)$}\\
  $\textit{decode}'\,(\Z\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
     $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}\;
       (\Left\,v, bs_1)$\\
  $\textit{decode}'\,(\S\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
     $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_2\;\textit{in}\;
       (\Right\,v, bs_1)$\\
  $\textit{decode}'\,bs\;(r_1\cdot r_2)$ & $\dn$ &
        $\textit{let}\,(v_1, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}$\\
  & &   $\textit{let}\,(v_2, bs_2) = \textit{decode}'\,bs_1\,r_2$
        \hspace{2mm}$\textit{in}\;(\Seq\,v_1\,v_2, bs_2)$\\
  $\textit{decode}'\,(\S\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\
  $\textit{decode}'\,(\Z\!::\!bs)\,(r^*)$ & $\dn$ &
         $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r\;\textit{in}$\\
  & &   $\textit{let}\,(\Stars\,vs, bs_2) = \textit{decode}'\,bs_1\,r^*$
        \hspace{2mm}$\textit{in}\;(\Stars\,v\!::\!vs, bs_2)$\medskip\\
  \multicolumn{3}{l}{$\textit{decode}\,bs\,r$ $\dn$
     $\textit{let}\,(v, bs') = \textit{decode}'\,bs\,r\;\textit{in}$
     $\;\,\textit{if}\;bs' = []\;\textit{then}\;\textit{Some}\,v\;
       \textit{else}\;\textit{None}$}\\[-4mm]
  \end{tabular}
  \end{center}
  \caption{Two functions, called $\textit{decode}'$ and \textit{decode}, for decoding a value from a bitsequence with the help of a regular expression.\\[-5mm]}\label{decode}
  \end{figure}

  %\noindent
  %The function \textit{decode} checks whether all of the bitsequence is
  %consumed and returns the corresponding value as @{term "Some v"}; otherwise
  %it fails with @{text "None"}. We can establish that for a value $v$
  %inhabited by a regular expression $r$, the decoding of its
  %bitsequence never fails.

\begin{lemma}\label{codedecode}\it
  If $\;\vdash v : r$ then
  $\;\textit{decode}\,(\textit{code}\, v)\,r = \textit{Some}\, v$.
\end{lemma}
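
\noindent
A rough Haskell rendering of the coding and decoding functions (ours,
building on the \textit{Val} sketch from the previous section) may
help to see how the bits drive the reconstruction of a value:

{\small
\begin{verbatim}
data Bit = Z | S deriving (Eq, Show)

code :: Val -> [Bit]
code Void           = []
code (Chr _)        = []
code (Left' v)      = Z : code v
code (Right' v)     = S : code v
code (Sq v1 v2)     = code v1 ++ code v2
code (Stars [])     = [S]
code (Stars (v:vs)) = Z : code v ++ code (Stars vs)

-- decode' returns the value together with the unconsumed bits
decode' :: [Bit] -> Rexp -> (Val, [Bit])
decode' bs ONE            = (Void, bs)
decode' bs (CH c)         = (Chr c, bs)
decode' (Z:bs) (ALT r1 _) = let (v, bs1) = decode' bs r1 in (Left' v, bs1)
decode' (S:bs) (ALT _ r2) = let (v, bs1) = decode' bs r2 in (Right' v, bs1)
decode' bs (SEQ r1 r2)    = let (v1, bs1) = decode' bs r1
                                (v2, bs2) = decode' bs1 r2
                            in (Sq v1 v2, bs2)
decode' (S:bs) (STAR _)   = (Stars [], bs)
decode' (Z:bs) (STAR r)   = let (v, bs1)        = decode' bs r
                                (Stars vs, bs2) = decode' bs1 (STAR r)
                            in (Stars (v:vs), bs2)

decode :: [Bit] -> Rexp -> Maybe Val
decode bs r = case decode' bs r of
                (v, []) -> Just v
                _       -> Nothing
\end{verbatim}}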
       
  Sulzmann and Lu define the function \emph{internalise}
  in order to transform (standard) regular expressions into annotated
  regular expressions. We write this operation as $r^\uparrow$.
  This internalisation uses the following
  \emph{fuse} function.
  %
  \begin{center}
  \begin{tabular}{@ {}cc@ {}}
  \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
  $\textit{fuse}\,bs\,(\textit{ZERO})$ & $\dn$ & $\textit{ZERO}$\\
  $\textit{fuse}\,bs\,(\textit{ONE}\,bs')$ & $\dn$ &
     $\textit{ONE}\,(bs\,@\,bs')$\\
  $\textit{fuse}\,bs\,(\textit{CHAR}\,bs'\,c)$ & $\dn$ &
     $\textit{CHAR}\,(bs\,@\,bs')\,c$
  \end{tabular}
  &
  \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
  $\textit{fuse}\,bs\,(\textit{ALTs}\,bs'\,rs)$ & $\dn$ &
     $\textit{ALTs}\,(bs\,@\,bs')\,rs$\\
  $\textit{fuse}\,bs\,(\textit{SEQ}\,bs'\,r_1\,r_2)$ & $\dn$ &
     $\textit{SEQ}\,(bs\,@\,bs')\,r_1\,r_2$\\
  $\textit{fuse}\,bs\,(\textit{STAR}\,bs'\,r)$ & $\dn$ &
     $\textit{STAR}\,(bs\,@\,bs')\,r$
  \end{tabular}
  \end{tabular}
  \end{center}

  \noindent
  A regular expression can then be \emph{internalised} into a bitcoded
  regular expression as follows:
  %
  \begin{center}
  \begin{tabular}{@ {}ccc@ {}}
  \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
  $(\ZERO)^\uparrow$ & $\dn$ & $\textit{ZERO}$\\
  $(\ONE)^\uparrow$ & $\dn$ & $\textit{ONE}\,[]$
  \end{tabular}
  &
  \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
  $(c)^\uparrow$ & $\dn$ & $\textit{CHAR}\,[]\,c$\\
  $(r^*)^\uparrow$ & $\dn$ & $\textit{STAR}\;[]\,r^\uparrow$
  \end{tabular}
  &
  \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
  $(r_1 + r_2)^\uparrow$ & $\dn$ &
         $\textit{ALT}\;[]\,(\textit{fuse}\,[\Z]\,r_1^\uparrow)\,
                            (\textit{fuse}\,[\S]\,r_2^\uparrow)$\\
  $(r_1\cdot r_2)^\uparrow$ & $\dn$ &
         $\textit{SEQ}\;[]\,r_1^\uparrow\,r_2^\uparrow$
  \end{tabular}
  \end{tabular}
  \end{center}
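
\noindent
In our Haskell sketch the bitcoded regular expressions together with
\textit{fuse} and the internalisation come out as follows (again ours,
for illustration only):

{\small
\begin{verbatim}
data ARexp = AZERO | AONE [Bit] | ACHAR [Bit] Char
           | AALTs [Bit] [ARexp] | ASEQ [Bit] ARexp ARexp
           | ASTAR [Bit] ARexp

-- attach a bitsequence to the topmost annotation
fuse :: [Bit] -> ARexp -> ARexp
fuse _  AZERO            = AZERO
fuse bs (AONE bs')       = AONE (bs ++ bs')
fuse bs (ACHAR bs' c)    = ACHAR (bs ++ bs') c
fuse bs (AALTs bs' rs)   = AALTs (bs ++ bs') rs
fuse bs (ASEQ bs' r1 r2) = ASEQ (bs ++ bs') r1 r2
fuse bs (ASTAR bs' r)    = ASTAR (bs ++ bs') r

intern :: Rexp -> ARexp
intern ZERO        = AZERO
intern ONE         = AONE []
intern (CH c)      = ACHAR [] c
intern (ALT r1 r2) = AALTs [] [fuse [Z] (intern r1),
                               fuse [S] (intern r2)]
intern (SEQ r1 r2) = ASEQ [] (intern r1) (intern r2)
intern (STAR r)    = ASTAR [] (intern r)
\end{verbatim}}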
       
  \noindent
  There is also an \emph{erase}-function, written $r^\downarrow$, which
  transforms a bitcoded regular expression into a (standard) regular
  expression by just erasing the annotated bitsequences. We omit the
  straightforward definition. For defining the algorithm, we also need
  the functions \textit{bnullable} and \textit{bmkeps} (together with its
  list version \textit{bmkepss}), which are the
  ``lifted'' versions of \textit{nullable} and \textit{mkeps} acting on
  bitcoded regular expressions.
  %
  \begin{center}
  \begin{tabular}{@ {}c@ {\hspace{2mm}}c@ {}}
  \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
  $\textit{bnullable}\,(\textit{ZERO})$ & $\dn$ & $\textit{False}$\\
  $\textit{bnullable}\,(\textit{ONE}\,bs)$ & $\dn$ & $\textit{True}$\\
  $\textit{bnullable}\,(\textit{CHAR}\,bs\,c)$ & $\dn$ & $\textit{False}$\\
  $\textit{bnullable}\,(\textit{ALTs}\,bs\,\rs)$ & $\dn$ &
     $\exists\, r \in \rs. \,\textit{bnullable}\,r$\\
  $\textit{bnullable}\,(\textit{SEQ}\,bs\,r_1\,r_2)$ & $\dn$ &
     $\textit{bnullable}\,r_1\wedge \textit{bnullable}\,r_2$\\
  $\textit{bnullable}\,(\textit{STAR}\,bs\,r)$ & $\dn$ &
     $\textit{True}$
  \end{tabular}
  &
  \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
  $\textit{bmkeps}\,(\textit{ONE}\,bs)$ & $\dn$ & $bs$\\
  $\textit{bmkeps}\,(\textit{ALTs}\,bs\,\rs)$ & $\dn$ &
  $bs\,@\,\textit{bmkepss}\,\rs$\\
  $\textit{bmkeps}\,(\textit{SEQ}\,bs\,r_1\,r_2)$ & $\dn$ &\\
  \multicolumn{3}{r}{$bs \,@\,\textit{bmkeps}\,r_1\,@\, \textit{bmkeps}\,r_2$}\\
  $\textit{bmkeps}\,(\textit{STAR}\,bs\,r)$ & $\dn$ &
     $bs \,@\, [\S]$\\
  $\textit{bmkepss}\,(r\!::\!\rs)$ & $\dn$ &
     $\textit{if}\;\textit{bnullable}\,r$\\
  & &$\textit{then}\;\textit{bmkeps}\,r$\\
  & &$\textit{else}\;\textit{bmkepss}\,\rs$
  \end{tabular}
  \end{tabular}
  \end{center}


  \noindent
  The key function in the bitcoded algorithm is the derivative of a
  bitcoded regular expression. This derivative function calculates the
  derivative and, at the same time, the incremental parts of the bitsequences
  that contribute to constructing a POSIX value.
  %
  \begin{center}
  \begin{tabular}{@ {}lcl@ {}}
  \multicolumn{3}{@ {}l}{$(\textit{ZERO})\backslash c$ $\;\dn\;$ $\textit{ZERO}$ \quad\qquad
  $(\textit{ONE}\;bs)\backslash c$ $\;\dn\;$ $\textit{ZERO}$}\\
  $(\textit{CHAR}\;bs\,d)\backslash c$ & $\dn$ &
        $\textit{if}\;c=d\; \;\textit{then}\;
         \textit{ONE}\;bs\;\textit{else}\;\textit{ZERO}$\\
  $(\textit{ALTs}\;bs\,\rs)\backslash c$ & $\dn$ &
        $\textit{ALTs}\,bs\,(\mathit{map}\,(\_\backslash c)\,\rs)$\\
  $(\textit{SEQ}\;bs\,r_1\,r_2)\backslash c$ & $\dn$ &
     $\textit{if}\;\textit{bnullable}\,r_1$\\
  & &$\textit{then}\;\textit{ALT}\,bs\;(\textit{SEQ}\,[]\,(r_1\backslash c)\,r_2)$
  $\;(\textit{fuse}\,(\textit{bmkeps}\,r_1)\,(r_2\backslash c))$\\
  & &$\textit{else}\;\textit{SEQ}\,bs\,(r_1\backslash c)\,r_2$\\
  $(\textit{STAR}\,bs\,r)\backslash c$ & $\dn$ &
      $\textit{SEQ}\;bs\,(\textit{fuse}\, [\Z] (r\backslash c))\,
       (\textit{STAR}\,[]\,r)$
  \end{tabular}
  \end{center}


  \noindent
  This function can also be extended to strings, written $r\backslash s$,
  just like the standard derivative.  We omit the details. Finally we
  can define Sulzmann and Lu's bitcoded lexer, which we call \textit{blexer}:
  %
  \begin{center}
\begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
  $\textit{blexer}\;r\,s$ & $\dn$ &
      $\textit{let}\;r_{der} = (r^\uparrow)\backslash s\;\textit{in}$
  $\;\;\textit{if}\; \textit{bnullable}(r_{der}) \;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,r_{der})\,r
       \;\;\textit{else}\;\textit{None}$
\end{tabular}
\end{center}

  \noindent
This bitcoded lexer first internalises the regular expression $r$ and then
builds the bitcoded derivative according to $s$. If the derivative is
(b)nullable, the string is in the language of $r$ and the bitsequence is extracted using the
$\textit{bmkeps}$ function. Finally the bitsequence is decoded into a value.  If
the derivative is \emph{not} nullable, then $\textit{None}$ is
returned. We can show that this way of calculating a value
generates the same result as \textit{lexer}.
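
\noindent
Assembling the pieces of our Haskell sketch (using \textit{ARexp},
\textit{fuse} and \textit{intern} from above, and \textit{decode} from
the previous section), the bitcoded lexer reads as follows; as before
this is our illustration, not the formalised definition:

{\small
\begin{verbatim}
bnullable :: ARexp -> Bool
bnullable AZERO          = False
bnullable (AONE _)       = True
bnullable (ACHAR _ _)    = False
bnullable (AALTs _ rs)   = any bnullable rs
bnullable (ASEQ _ r1 r2) = bnullable r1 && bnullable r2
bnullable (ASTAR _ _)    = True

bmkeps :: ARexp -> [Bit]
bmkeps (AONE bs)       = bs
bmkeps (AALTs bs rs)   = bs ++ bmkepss rs
bmkeps (ASEQ bs r1 r2) = bs ++ bmkeps r1 ++ bmkeps r2
bmkeps (ASTAR bs _)    = bs ++ [S]

bmkepss :: [ARexp] -> [Bit]
bmkepss (r:rs) = if bnullable r then bmkeps r else bmkepss rs

bder :: Char -> ARexp -> ARexp
bder _ AZERO         = AZERO
bder _ (AONE _)      = AZERO
bder c (ACHAR bs d)  = if c == d then AONE bs else AZERO
bder c (AALTs bs rs) = AALTs bs (map (bder c) rs)
bder c (ASEQ bs r1 r2)
  | bnullable r1 = AALTs bs [ASEQ [] (bder c r1) r2,
                             fuse (bmkeps r1) (bder c r2)]
  | otherwise    = ASEQ bs (bder c r1) r2
bder c (ASTAR bs r)  = ASEQ bs (fuse [Z] (bder c r)) (ASTAR [] r)

bders :: ARexp -> String -> ARexp
bders r []    = r
bders r (c:s) = bders (bder c r) s

blexer :: Rexp -> String -> Maybe Val
blexer r s = let rder = bders (intern r) s in
             if bnullable rder then decode (bmkeps rder) r
                               else Nothing
\end{verbatim}}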
       
Before we can proceed we need to define a helper function, called
\textit{retrieve}, which Sulzmann and Lu introduced for the correctness proof.
%
\begin{center}
  \begin{tabular}{lcl}
  @{thm (lhs) retrieve.simps(1)} & $\dn$ & @{thm (rhs) retrieve.simps(1)}\\
  @{thm (lhs) retrieve.simps(2)} & $\dn$ & @{thm (rhs) retrieve.simps(2)}\\
  @{thm (lhs) retrieve.simps(3)} & $\dn$ & @{thm (rhs) retrieve.simps(3)}\\
  @{thm (lhs) better_retrieve(1)} & $\dn$ & @{thm (rhs) better_retrieve(1)}\\
  @{thm (lhs) better_retrieve(2)} & $\dn$ & @{thm (rhs) better_retrieve(2)}\\
  @{thm (lhs) retrieve.simps(6)[of _ "r\<^sub>1" "r\<^sub>2" "v\<^sub>1" "v\<^sub>2"]}
      & $\dn$ & @{thm (rhs) retrieve.simps(6)[of _ "r\<^sub>1" "r\<^sub>2" "v\<^sub>1" "v\<^sub>2"]}\\
  @{thm (lhs) retrieve.simps(7)} & $\dn$ & @{thm (rhs) retrieve.simps(7)}\\
  @{thm (lhs) retrieve.simps(8)} & $\dn$ & @{thm (rhs) retrieve.simps(8)}
  \end{tabular}
  \end{center}

\noindent
The idea behind this function is to retrieve a possibly partial
bitsequence from a bitcoded regular expression, where the retrieval is
guided by a value.  For example, if the value is $\Left$ then we
descend into the left-hand side of an alternative in order to
assemble the bitcode. Similarly for
$\Right$. The property we can show is that for a given $v$ and $r$
with $\vdash v : r$, the retrieved bitsequence from the internalised
regular expression is equal to the bitcoded version of $v$.

\begin{lemma}\label{retrievecode}
If $\vdash v : r$ then $\textit{code}\, v = \textit{retrieve}\,(r^\uparrow)\,v$.
\end{lemma}
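
\noindent
For orientation, a rough Haskell rendering of \textit{retrieve} (ours;
the two middle \textit{AALTs} clauses correspond to the variants shown
above) is:

{\small
\begin{verbatim}
retrieve :: ARexp -> Val -> [Bit]
retrieve (AONE bs)    Void    = bs
retrieve (ACHAR bs _) (Chr _) = bs
retrieve (AALTs bs [r])    v          = bs ++ retrieve r v
retrieve (AALTs bs (r:_))  (Left' v)  = bs ++ retrieve r v
retrieve (AALTs bs (_:rs)) (Right' v) = bs ++ retrieve (AALTs [] rs) v
retrieve (ASEQ bs r1 r2) (Sq v1 v2)
    = bs ++ retrieve r1 v1 ++ retrieve r2 v2
retrieve (ASTAR bs _) (Stars [])     = bs ++ [S]
retrieve (ASTAR bs r) (Stars (v:vs))
    = bs ++ [Z] ++ retrieve r v ++ retrieve (ASTAR [] r) (Stars vs)
\end{verbatim}}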
       
\noindent
We also need some auxiliary facts about how the bitcoded operations
relate to the ``standard'' operations on regular expressions. For
example if we build a bitcoded derivative and erase the result, this
is the same as if we first erase the bitcoded regular expression and
then perform the ``standard'' derivative operation.

\begin{lemma}\label{bnullable}\mbox{}\smallskip\\
  \begin{tabular}{ll}
\textit{(1)} & $(r\backslash s)^\downarrow = (r^\downarrow)\backslash s$\\
\textit{(2)} & $\textit{bnullable}(r)$ iff $\textit{nullable}(r^\downarrow)$\\
\textit{(3)} & $\textit{bmkeps}(r) = \textit{retrieve}\,r\,(\textit{mkeps}\,(r^\downarrow))$ provided $\textit{nullable}(r^\downarrow)$.
\end{tabular}
\end{lemma}

%\begin{proof}
%  All properties are by induction on annotated regular expressions.
%  %There are no interesting cases.
%\end{proof}

\noindent
The only difficulty left for the correctness proof is that the bitcoded algorithm
has only a ``forward phase'' where POSIX values are generated incrementally.
We can achieve the same effect with @{text lexer} (which has two phases) by stacking up injection
functions during the forward phase. An auxiliary function, called $\textit{flex}$,
allows us to recast the rules of $\lexer$ in terms of a single
phase and stacked up injection functions.
%
\begin{center}
\begin{tabular}{@ {}l@ {}c@ {}l@ {\hspace{10mm}}l@ {}c@ {}l@ {}}
  $\textit{flex}\;r\,f\,[]$ & $\dn$ & $f$ &
  $\textit{flex}\;r\,f\,(c\!::\!s)$ & $\dn$ &
  $\textit{flex}\,(r\backslash c)\,(\lambda v.\,f\,(\inj\,r\,c\,v))\,s$
\end{tabular}
\end{center}

\noindent
The point of this function is that when
reaching the end of the string, we just need to apply the stacked up
injection functions to the value generated by @{text mkeps}.
Using this function we can recast the success case in @{text lexer}
as follows:

\begin{lemma}\label{flex}
If @{text "lexer r s = Some v"} \;then\; @{text "v = "}$\,\textit{flex}\,r\,id\,s\,
      (\mkeps (r\backslash s))$.
\end{lemma}

\noindent
Note that we did not redefine \textit{lexer}; we just established that the
value generated by \textit{lexer} can also be obtained by a different
method. While this different method is not efficient (we essentially
need to traverse the string $s$ twice, once for building the
derivative $r\backslash s$ and another time for stacking up injection
functions), it helps us with proving
that incrementally building up values in @{text blexer} generates the same result.
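
\noindent
In Haskell the \textit{flex} function is literally function
composition (our sketch, reusing \textit{der} and \textit{inj} from
above):

{\small
\begin{verbatim}
flex :: Rexp -> (Val -> Val) -> String -> (Val -> Val)
flex _ f []    = f
flex r f (c:s) = flex (der c r) (f . inj r c) s
\end{verbatim}}

\noindent
Lemma~\ref{flex} then says that in the success case the result of
\textit{lexer} equals \texttt{flex r id s (mkeps (ders s r))}.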
       
   895 
       
   896 This brings us to our main lemma in this section: if we calculate a
       
   897 derivative, say $r\backslash s$, and have a value, say $v$, inhabited
       
   898 by this derivative, then we can produce the result @{text lexer} generates
       
   899 by applying this value to the stacked-up injection functions
       
   900 that $\textit{flex}$ assembles. The lemma establishes that this is the same
       
   901 value as if we build the annotated derivative $r^\uparrow\backslash s$
       
   902 and then retrieve the corresponding bitcoded version, followed by a
       
   903 decoding step.
       
   904 
       
   905 \begin{lemma}[Main Lemma]\label{mainlemma}\it
       
   906 If $\vdash v : r\backslash s$ then 
       
   907 $\textit{Some}\,(\textit{flex}\,r\,\textit{id}\,s\,v) =
       
   908   \textit{decode}(\textit{retrieve}\,(r^\uparrow \backslash s)\,v)\,r$
       
   909 \end{lemma}  
       
   910 
       
   911 
       
   912 
       
   913 \noindent
       
   914 %With this lemma in place,
       
   915 We can then prove the correctness of \textit{blexer}---it indeed
       
   916 produces the same result as \textit{lexer}.
       
   917 
       
   918 
       
   919 \begin{theorem}\label{thmone}
       
   920 $\textit{lexer}\,r\,s = \textit{blexer}\,r\,s$
       
   921 \end{theorem}  
       
   922 
       
   923 
       
   924 
       
   925 \noindent This establishes that the bitcoded algorithm \emph{without}
       
   926 simplification produces correct results. This was
       
   927 only conjectured by Sulzmann and Lu in their paper
       
   928 \cite{Sulzmann2014}. The next step is to add simplifications.
       
   929 
       
   930 *}
       
   931 
       
   932 
       
   933 section {* Simplification *}
       
   934 
       
   935 text {*
       
   936 
       
   937      Derivatives as calculated by Brzozowski’s method are usually more
       
   938      complex regular expressions than the initial one; the result is
       
   939      that derivative-based matching and lexing algorithms are
       
   940      often abysmally slow if the ``growth problem'' is not addressed. As Sulzmann and Lu wrote, various
       
   941      optimisations are possible, such as the simplifications
       
   942      $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r \Rightarrow r$,
       
   943      $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow r$. While these
       
   944      simplifications can considerably speed up the two algorithms  in many
       
   945      cases, they do not solve fundamentally the growth problem with
       
   946      derivatives. To see this let us return to the example from the
       
   947      Introduction that shows the derivatives for \mbox{@{text "(a + aa)\<^sup>*"}}.
       
   948      If we delete in the third step all $\ZERO$s and $\ONE$s according to

   949      the simplification rules shown above, we obtain
       
   950      %
       
   951      %%
       
   952      %
       
   953      \begin{equation}\def\xll{\xrightarrow{\_\backslash{} [a, a, a]}}\label{derivex}
       
   954      (a + aa)^* \quad\xll\quad
       
   955       \underbrace{\mbox{$(\ONE + a) \cdot (a + aa)^*$}}_{r} \;+\;
       
   956      ((a + aa)^* + \underbrace{\mbox{$(\ONE + a) \cdot (a + aa)^*$}}_{r})
       
   957      \end{equation}
       
   958 
       
   959      \noindent This is a simpler derivative, but unfortunately we
       
   960      cannot make any further simplifications. This is a problem because
       
   961      the outermost alternative contains two copies of the same

   962      regular expression (marked with $r$). These copies will

   963      spawn new copies in later derivative steps and these, in turn, spawn even more copies. This
       
   964      destroys any hope of taming the size of the derivatives.  But the
       
   965      second copy of $r$ in \eqref{derivex} will never contribute to a
       
   966      value, because POSIX lexing will always prefer matching a string
       
   967      with the first copy. So it could be safely removed without affecting the correctness of the algorithm.
       
   968      The dilemma with the simple-minded
       
   969      simplification rules above is that the rule $r + r \Rightarrow r$
       
   970      will never be applicable because, as can be seen in this example, the
       
   971      regular expressions are not next to each other but separated by another regular expression.
       
   972 
       
   973      But here is where Sulzmann and Lu's representation of generalised
       
   974      alternatives in the bitcoded algorithm shines: in @{term
       
   975      "ALTs bs rs"} we can define a more aggressive simplification by
       
   976      recursively simplifying all regular expressions in @{text rs} and

   977      then analysing the resulting list and removing any duplicates.
       
   978      Another advantage of the bitsequences in bitcoded regular

   979      expressions is that they can easily be modified such that simplification does not

   980      interfere with the value construction. For example, we can ``flatten'', or
       
   981      de-nest, or spill out, @{text ALTs} as follows
       
   982      %
       
   983      \[
       
   984      @{term "ALTs bs\<^sub>1 ((ALTs bs\<^sub>2 rs\<^sub>2) # rs\<^sub>1)"}
       
   985      \quad\xrightarrow{bsimp}\quad
       
   986      @{term "ALTs bs\<^sub>1 ((map (fuse bs\<^sub>2) rs\<^sub>2) # rs\<^sub>1)"}
       
   987      \]
       
   988 
       
   989      \noindent
       
   990      where we just need to fuse the bitsequence that has accumulated in @{text "bs\<^sub>2"}
       
   991      to the alternatives in @{text "rs\<^sub>2"}. As we shall show below this will
       
   992      ensure that the correct value corresponding to the original (unsimplified)
       
   993      regular expression can still be extracted. %In this way the value construction
       
   994      %is not affected by simplification. 
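     \noindent
     To make the flattening step concrete, the following Scala sketch sets up
     annotated regular expressions together with \textit{fuse}, which prepends
     a bitsequence to the top-most constructor (datatype and function are our
     own rendering of the definitions in this paper, not the formalisation
     itself):

\begin{verbatim}
// Bitsequences are lists of booleans; one case class per constructor.
sealed trait ARexp
case object AZERO extends ARexp
case class AONE(bs: List[Boolean]) extends ARexp
case class ACHAR(bs: List[Boolean], c: Char) extends ARexp
case class AALTS(bs: List[Boolean], rs: List[ARexp]) extends ARexp
case class ASEQ(bs: List[Boolean], r1: ARexp, r2: ARexp) extends ARexp
case class ASTAR(bs: List[Boolean], r: ARexp) extends ARexp

// fuse attaches an accumulated bitsequence to a regular expression.
def fuse(bs: List[Boolean], r: ARexp): ARexp = r match {
  case AZERO            => AZERO
  case AONE(cs)         => AONE(bs ++ cs)
  case ACHAR(cs, c)     => ACHAR(bs ++ cs, c)
  case AALTS(cs, rs)    => AALTS(bs ++ cs, rs)
  case ASEQ(cs, r1, r2) => ASEQ(bs ++ cs, r1, r2)
  case ASTAR(cs, r1)    => ASTAR(bs ++ cs, r1)
}
\end{verbatim}

     \noindent
     In this rendering the flattening rule above amounts to mapping
     \textit{fuse} with the accumulated bitsequence over the inner list of
     alternatives.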
       
   995 
       
   996      However, there is one problem with the definition of the more

   997      aggressive simplification rules described by Sulzmann and Lu. Recasting

   998      their definition in our syntax, they define the step of removing
       
   999      duplicates as
       
  1000      %
       
  1001      \[ @{text "bsimp (ALTs bs rs)"} \dn @{text "ALTs
       
  1002      bs (nub (map bsimp rs))"}
       
  1003      \]
       
  1004    
       
  1005      \noindent where they first recursively simplify the regular
       
  1006      expressions in @{text rs} (using @{text map}) and then use
       
  1007      Haskell's @{text nub}-function to remove potential
       
  1008      duplicates. While this makes sense when considering the example
       
  1009      shown in \eqref{derivex}, @{text nub} is an inappropriate

  1010      function in the case of bitcoded regular expressions. The reason

  1011      is that in general the elements in @{text rs} will have

  1012      different annotated bitsequences and therefore @{text nub}

  1013      will never find a duplicate to remove. One correct way to
       
  1014      handle this situation is to first \emph{erase} the regular
       
  1015      expressions when comparing potential duplicates. This is inspired
       
  1016      by Scala's list functions of the form \mbox{@{text "distinctWith rs eq
       
  1017      acc"}} where @{text eq} is a user-defined equivalence relation that
       
  1018      compares two elements in @{text rs}. We define this function in
       
  1019      Isabelle/HOL as
       
  1020 
       
  1021      \begin{center}
       
  1022      \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {\hspace{1mm}}l@ {}}
       
  1023      @{thm (lhs) distinctWith.simps(1)} & $\dn$ & @{thm (rhs) distinctWith.simps(1)}\\
       
  1024      @{thm (lhs) distinctWith.simps(2)} & $\dn$ &
       
  1025      @{text "if (\<exists> y \<in> acc. eq x y)"} & @{text "then distinctWith xs eq acc"}\\
       
  1026      & & & @{text "else x :: distinctWith xs eq ({x} \<union> acc)"}
       
  1027      \end{tabular}
       
  1028      \end{center}
       
  1029 
       
  1030      \noindent where we scan the list from left to right (because we
       
  1031      have to remove later copies). In @{text distinctWith}, @{text eq} is intended to be an
       
  1032      equivalence relation for annotated regular expressions and @{text acc} is an accumulator for annotated regular
       
  1033      expressions---essentially a set of regular expressions that we have already seen
       
  1034      while scanning the list. Therefore we delete an element, say @{text x},
       
  1035      from the list provided some @{text "y"} that is equivalent to @{text x} is already in the accumulator;
       
  1036      otherwise we keep @{text x} and scan the rest of the list but 
       
  1037      add @{text "x"} as another ``seen'' element to @{text acc}. We will use
       
  1038      @{term distinctWith} where @{text eq} is an equivalence that deletes bitsequences from bitcoded regular expressions
       
  1039      before comparing the components. One way to define this in Isabelle/HOL is by the following recursive function from
       
  1040      annotated regular expressions to @{text bool}:
       
  1041      %
       
  1042      \begin{center}
       
  1043      \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {\hspace{1mm}}l@ {}}
       
  1044      @{thm (lhs) eq1.simps(1)} & $\dn$ & @{thm (rhs) eq1.simps(1)}\\
       
  1045      @{thm (lhs) eq1.simps(2)[of DUMMY DUMMY]} & $\dn$ & @{thm (rhs) eq1.simps(2)[of DUMMY DUMMY]}\\
       
  1046      @{thm (lhs) eq1.simps(3)[of DUMMY "c" DUMMY "d"]} & $\dn$ & @{thm (rhs) eq1.simps(3)[of DUMMY "c" DUMMY "d"]}\\
       
  1047      @{thm (lhs) eq1.simps(4)[of DUMMY "r\<^sub>1\<^sub>1" "r\<^sub>1\<^sub>2" DUMMY "r\<^sub>2\<^sub>1" "r\<^sub>2\<^sub>2"]} & $\dn$ &
       
  1048      @{thm (rhs) eq1.simps(4)[of DUMMY "r\<^sub>1\<^sub>1" "r\<^sub>1\<^sub>2" DUMMY "r\<^sub>2\<^sub>1" "r\<^sub>2\<^sub>2"]}\\
       
  1049      @{thm (lhs) eq1.simps(5)[of DUMMY DUMMY]} & $\dn$ & @{thm (rhs) eq1.simps(5)[of DUMMY DUMMY]}\\
       
  1050      @{thm (lhs) eq1.simps(6)[of DUMMY "r\<^sub>1" "rs\<^sub>1" DUMMY "r\<^sub>2" "rs\<^sub>2"]} & $\dn$ &
       
  1051      @{thm (rhs) eq1.simps(6)[of DUMMY "r\<^sub>1" "rs\<^sub>1" DUMMY "r\<^sub>2" "rs\<^sub>2"]}\\
       
  1052      @{thm (lhs) eq1.simps(7)[of DUMMY "r\<^sub>1" DUMMY "r\<^sub>2"]} & $\dn$ & @{thm (rhs) eq1.simps(7)[of DUMMY "r\<^sub>1" DUMMY "r\<^sub>2"]}\\
       
  1053      \end{tabular}
       
  1054      \end{center}
       
  1055 
       
  1056      \noindent
       
  1057      where all other cases are set to @{text False}.
       
  1058      This equivalence is clearly a computationally more expensive operation than @{text nub},
       
  1059      but it is needed in order to make the removal of unnecessary copies

  1060      work properly.
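     \noindent
     The following Scala sketch shows one way to implement @{text distinctWith}
     and the equivalence (reusing the datatype from the earlier sketch; again
     our own rendering, in which the bitsequences are simply ignored during the
     comparison):

\begin{verbatim}
// Keep the first copy of each equivalence class, scanning left to right.
def distinctWith[A](xs: List[A], eq: (A, A) => Boolean,
                    acc: Set[A] = Set()): List[A] = xs match {
  case Nil => Nil
  case x :: rest =>
    if (acc.exists(y => eq(x, y))) distinctWith(rest, eq, acc)
    else x :: distinctWith(rest, eq, acc + x)
}

// eq1 compares annotated regular expressions up to their bitsequences.
def eq1(r1: ARexp, r2: ARexp): Boolean = (r1, r2) match {
  case (AZERO, AZERO)                         => true
  case (AONE(_), AONE(_))                     => true
  case (ACHAR(_, c), ACHAR(_, d))             => c == d
  case (ASEQ(_, p1, p2), ASEQ(_, q1, q2))     => eq1(p1, q1) && eq1(p2, q2)
  case (AALTS(_, Nil), AALTS(_, Nil))         => true
  case (AALTS(_, p :: ps), AALTS(_, q :: qs)) =>
    eq1(p, q) && eq1(AALTS(Nil, ps), AALTS(Nil, qs))
  case (ASTAR(_, p), ASTAR(_, q))             => eq1(p, q)
  case _                                      => false
}
\end{verbatim}

     \noindent
     For example, evaluating \texttt{distinctWith(List(1, 1, 2, 3, 2), (a: Int, b: Int) => a == b)}
     yields \texttt{List(1, 2, 3)}: later copies are removed, earlier ones are kept.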
       
  1061 
       
  1062      Our simplification function depends on three more helper functions; one is called
       
  1063      @{text flts} and analyses lists of regular expressions coming from alternatives.
       
  1064      It is defined as follows:
       
  1065 
       
  1066      \begin{center}
       
  1067      \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
       
  1068      @{thm (lhs) flts.simps(1)} & $\dn$ & @{thm (rhs) flts.simps(1)}\\
       
  1069      @{thm (lhs) flts.simps(2)} & $\dn$ & @{thm (rhs) flts.simps(2)}\\
       
  1070      @{thm (lhs) flts.simps(3)[of "bs'" "rs'"]} & $\dn$ & @{thm (rhs) flts.simps(3)[of "bs'" "rs'"]}\\
       
  1071      \end{tabular}
       
  1072      \end{center}
       
  1073 
       
  1074      \noindent
       
  1075      The second clause of @{text flts} removes all instances of @{text ZERO} in alternatives and
       
  1076      the third ``de-nests'' alternatives (while retaining the
       
  1077      bitsequence @{text "bs'"} accumulated in the inner alternative). There are
       
  1078      some corner cases to be considered when the resulting list inside an alternative is
       
  1079      empty or a singleton list. We take care of those cases in the
       
  1080      @{text "bsimpALTs"} function; similarly we define a helper function that simplifies
       
  1081      sequences according to the usual rules about @{text ZERO}s and @{text ONE}s:
       
  1082 
       
  1083      \begin{center}
       
  1084      \begin{tabular}{c@ {\hspace{5mm}}c}
       
  1085      \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
       
  1086      @{text "bsimpALTs bs []"}  & $\dn$ & @{text "ZERO"}\\
       
  1087      @{text "bsimpALTs bs [r]"} & $\dn$ & @{text "fuse bs r"}\\
       
  1088      @{text "bsimpALTs bs rs"}  & $\dn$ & @{text "ALTs bs rs"}\\
       
  1089      \mbox{}\\
       
  1090      \end{tabular}
       
  1091      &
       
  1092      \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
       
  1093      @{text "bsimpSEQ bs _ ZERO"}  & $\dn$ & @{text "ZERO"}\\
       
  1094      @{text "bsimpSEQ bs ZERO _"} & $\dn$ & @{text "ZERO"}\\
       
  1095      @{text "bsimpSEQ bs\<^sub>1 (ONE bs\<^sub>2) r\<^sub>2"}
       
  1096          & $\dn$ & @{text "fuse (bs\<^sub>1 @ bs\<^sub>2) r\<^sub>2"}\\
       
  1097      @{text "bsimpSEQ bs r\<^sub>1 r\<^sub>2"} & $\dn$ &  @{text "SEQ bs r\<^sub>1 r\<^sub>2"}
       
  1098      \end{tabular}
       
  1099      \end{tabular}
       
  1100      \end{center}
       
  1101 
       
  1102      \noindent
       
  1103      With this in place we can define our simplification function as
       
  1104 
       
  1105      \begin{center}
       
  1106      \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
       
  1107      @{thm (lhs) bsimp.simps(1)[of "bs" "r\<^sub>1" "r\<^sub>2"]} & $\dn$ &
       
  1108          @{thm (rhs) bsimp.simps(1)[of "bs" "r\<^sub>1" "r\<^sub>2"]}\\
       
  1109      @{thm (lhs) bsimp.simps(2)[of "bs" _]} & $\dn$ & @{text "bsimpALTs bs (distinctWith (flts (map bsimp rs)) \<approx> \<emptyset>)"}\\
       
  1110      @{text "bsimp r"} & $\dn$ & @{text r}
       
  1111      \end{tabular}
       
  1112      \end{center}
       
  1113 
       
  1114      \noindent
       
  1115      We believe our recursive function @{term bsimp} simplifies regular
       
  1116      expressions as intended by Sulzmann and Lu. There is no point in applying the
       
  1117      @{text bsimp} function repeatedly (like the simplification in their paper which needs to be
       
  1118      applied until a fixpoint is reached) because we can show that @{term bsimp} is idempotent,
       
  1119      that is
       
  1120 
       
  1121      \begin{proposition}
       
  1122      @{term "bsimp (bsimp r) = bsimp r"}
       
  1123      \end{proposition}
       
  1124 
       
  1125      \noindent
       
  1126      This can be proved by induction on @{text r} but requires a detailed analysis showing

  1127      that the de-nesting of alternatives always results in a flat list of regular

  1128      expressions. We omit the details since they do not concern the correctness proof.
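     \noindent
     Putting the pieces together, a Scala sketch of the whole simplification
     phase looks as follows (reusing the datatype, \textit{fuse},
     \textit{distinctWith} and \textit{eq1} from the sketches above; as before
     this is our own rendering, not the formalisation):

\begin{verbatim}
// flts removes ZEROs and de-nests alternatives, fusing their bitsequences.
def flts(rs: List[ARexp]): List[ARexp] = rs match {
  case Nil                    => Nil
  case AZERO :: rest          => flts(rest)
  case AALTS(bs, rs1) :: rest => rs1.map(fuse(bs, _)) ++ flts(rest)
  case r :: rest              => r :: flts(rest)
}

// Smart constructors: empty and singleton alternatives, and the usual
// ZERO/ONE rules for sequences.
def bsimpALTs(bs: List[Boolean], rs: List[ARexp]): ARexp = rs match {
  case Nil     => AZERO
  case List(r) => fuse(bs, r)
  case _       => AALTS(bs, rs)
}

def bsimpSEQ(bs: List[Boolean], r1: ARexp, r2: ARexp): ARexp =
  (r1, r2) match {
    case (AZERO, _)     => AZERO
    case (_, AZERO)     => AZERO
    case (AONE(bs2), _) => fuse(bs ++ bs2, r2)
    case _              => ASEQ(bs, r1, r2)
  }

// bsimp simplifies recursively and removes duplicates up to bitsequences.
def bsimp(r: ARexp): ARexp = r match {
  case ASEQ(bs, r1, r2) => bsimpSEQ(bs, bsimp(r1), bsimp(r2))
  case AALTS(bs, rs)    => bsimpALTs(bs, distinctWith(flts(rs.map(bsimp)), eq1))
  case _                => r
}
\end{verbatim}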
       
  1129      
       
  1130      Next we can include simplification after each derivative step, leading to the
       
  1131      following notion of bitcoded derivatives:
       
  1132      
       
  1133      \begin{center}
       
  1134       \begin{tabular}{cc}
       
  1135       \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
       
  1136       @{thm (lhs) bders_simp.simps(1)} & $\dn$ & @{thm (rhs) bders_simp.simps(1)}
       
  1137       \end{tabular}
       
  1138       &
       
  1139       \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
       
  1140       @{thm (lhs) bders_simp.simps(2)} & $\dn$ & @{thm (rhs) bders_simp.simps(2)}
       
  1141       \end{tabular}
       
  1142       \end{tabular}
       
  1143       \end{center}
       
  1144 
       
  1145       \noindent
       
  1146       and use it in the improved lexing algorithm defined as
       
  1147 
       
  1148      \begin{center}
       
  1149 \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
       
  1150   $\textit{blexer}^+\;r\,s$ & $\dn$ &
       
  1151       $\textit{let}\;r_{der} = (r^\uparrow)\backslash_{bsimp}\, s\;\textit{in}$                
       
  1152       $\;\;\textit{if}\; \textit{bnullable}(r_{der}) \;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,r_{der})\,r
       
  1153        \;\;\textit{else}\;\textit{None}$
       
  1154 \end{tabular}
       
  1155 \end{center}
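       \noindent
       A Scala sketch of these two definitions, with the single-character
       derivative \textit{bder} assumed to be given and therefore passed as a
       parameter (again our own rendering, reusing \textit{bsimp} from above):

\begin{verbatim}
// Derivative w.r.t. a string, simplifying after every character.
def bdersSimp(bder: (Char, ARexp) => ARexp)
             (r: ARexp, s: List[Char]): ARexp = s match {
  case Nil     => r
  case c :: cs => bdersSimp(bder)(bsimp(bder(c, r)), cs)
}
\end{verbatim}

       \noindent
       The lexer $\textit{blexer}^+$ then differs from \textit{blexer} only in
       calling this simplifying derivative instead of the plain one.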
       
  1156 
       
  1157        \noindent The remaining task is to show that @{term blexer} and
       
  1158        @{term "blexer_simp"} generate the same answers.
       
  1159 
       
  1160        When we first
       
  1161        attempted this proof we encountered a problem with the idea
       
  1162        in Sulzmann and Lu's paper, where the argument seems to appeal
       
  1163        again to the @{text retrieve}-function defined for the unsimplified version
       
  1164        of the algorithm. But
       
  1165        this does not work, because desirable properties such as
       
  1166      %
       
  1167      \[
       
  1168      @{text "retrieve r v = retrieve (bsimp r) v"}
       
  1169      \]
       
  1170 
       
  1171      \noindent do not hold under simplification---this property
       
  1172      essentially purports that we can retrieve the same value from a
       
  1173      simplified version of the regular expression. To start with, @{text retrieve}
       
  1174      depends on the fact that the value @{text v} corresponds to the
       
  1175      structure of the regular expression @{text r}---but the whole point of simplification
       
  1176      is to ``destroy'' this structure by making the regular expression simpler.
       
  1177      To see this, consider the regular expression @{term "r = ALT r' ZERO"} and a corresponding
       
  1178      value @{text "v = Left v'"}. If we annotate bitcodes to @{text "r"}, then 
       
  1179      we can use @{text retrieve} with @{text r} and @{text v} in order to extract a corresponding
       
  1180      bitsequence. The reason that this works is that @{text r} is an alternative
       
  1181      regular expression and @{text v} a corresponding @{text "Left"}-value. However, if we simplify
       
  1182      @{text r}, then @{text v} does not correspond to the shape of the regular 
       
  1183      expression anymore. So unless one can somehow
       
  1184      synchronise the change in the simplified regular expressions with
       
  1185      the original POSIX value, there is no hope of appealing to @{text retrieve} in the
       
  1186      correctness argument for @{term blexer_simp}.
       
  1187 
       
  1188      We found it more helpful to introduce the rewriting systems shown in
       
  1189      Figure~\ref{SimpRewrites}. The idea is to generate 
       
  1190      simplified regular expressions in small steps (unlike the @{text bsimp}-function which
       
  1191      does the same in a big step), and show that each of
       
  1192      the small steps preserves the bitcodes that lead to the final POSIX value.
       
  1193      The rewrite system is organised such that $\leadsto$ is for bitcoded regular
       
  1194      expressions and $\stackrel{s}{\leadsto}$ for lists of bitcoded regular
       
  1195      expressions. The former essentially implements the simplifications of
       
  1196      @{text "bsimpSEQ"} and @{text flts}, while the latter implements the
       
  1197      simplifications in @{text "bsimpALTs"}. We can show that any bitcoded
       
  1198      regular expression reduces in zero or more steps to the simplified
       
  1199      regular expression generated by @{text bsimp}:
       
  1200 
       
  1201      \begin{lemma}\label{lemone}
       
  1202      @{thm[mode=IfThen] rewrites_to_bsimp}
       
  1203      \end{lemma}
       
  1204 
       
  1205      
       
  1206 
       
  1207      \noindent
       
  1208      We can show that this rewrite system preserves @{term bnullable}, that 
       
  1209      is, simplification does not affect nullability:
       
  1210 
       
  1211      \begin{lemma}\label{lembnull}
       
  1212      @{thm[mode=IfThen] bnullable0(1)[of "r\<^sub>1" "r\<^sub>2"]}
       
  1213      \end{lemma}
       
  1214 
       
  1215      
       
  1216 
       
  1217      \noindent
       
  1218      From this, we can show that @{text bmkeps} will produce the same bitsequence
       
  1219      as long as one of the bitcoded regular expressions in $\leadsto$ is nullable (this lemma
       
  1220      establishes the missing fact we were not able to establish using @{text retrieve}, as suggested
       
  1221      in the paper by Sulzmann and Lu).
       
  1222 
       
  1223 
       
  1224      \begin{lemma}\label{lemthree}
       
  1225      @{thm[mode=IfThen] rewrite_bmkeps_aux(1)[of "r\<^sub>1" "r\<^sub>2"]}
       
  1226      \end{lemma}
       
  1227 
       
  1228      
       
  1229 
       
  1230      \noindent
       
  1231      Also crucial is the fact that derivative steps and simplification steps can be interleaved,
       
  1232      which is shown by the fact that $\leadsto$ is preserved under derivatives.
       
  1233 
       
  1234      \begin{lemma}\label{bderlem}
       
  1235      @{thm[mode=IfThen] rewrite_preserves_bder(1)[of "r\<^sub>1" "r\<^sub>2"]}
       
  1236      \end{lemma}
       
  1237 
       
  1238      
       
  1239 
       
  1240 
       
  1241      \noindent
       
  1242      Using this fact together with Lemma~\ref{lemone} allows us to prove the central lemma
       
  1243      that the unsimplified
       
  1244      derivative (with a string @{term s}) reduces to the simplified derivative (with the same string).
       
  1245      
       
  1246      \begin{lemma}\label{lemtwo}
       
  1247      @{thm[mode=IfThen] central}
       
  1248      \end{lemma}
       
  1249 
       
  1250      %\begin{proof}
       
  1251      %By reverse induction on @{term s} generalising over @{text r}.
       
  1252      %\end{proof}
       
  1253 
       
  1254      \noindent
       
  1255      With these lemmas in place we can finally establish that @{term "blexer_simp"} and @{term "blexer"}
       
  1256      generate the same value and, using Theorem~\ref{thmone} from the previous section, that this value
       
  1257      is indeed the POSIX value.
       
  1258      
       
  1259      \begin{theorem}
       
  1260      @{thm[mode=IfThen] main_blexer_simp}
       
  1261      \end{theorem}
       
  1262 
       
  1263      %\begin{proof}
       
  1264      %By unfolding the definitions and using Lemmas~\ref{lemtwo} and \ref{lemthree}. 	
       
  1265      %\end{proof}
       
  1266      
       
  1267      \noindent
       
  1268      This means that if the algorithm is called with a regular expression @{term r} and a string
       
  1269      @{term s} with $@{term s} \in L(@{term r})$, it will return @{term "Some v"} for the
       
  1270      @{term v} we defined by the POSIX relation $(@{term s}, @{term r}) \rightarrow @{term v}$; otherwise
       
  1271      the algorithm returns @{term "None"} when $s \not\in L(r)$ and no such @{text v} exists.
       
  1272      This completes the correctness proof for the second POSIX lexing algorithm by Sulzmann and Lu.
       
  1273      The interesting point of this algorithm is that the sizes of derivatives do not grow arbitrarily big but
       
  1274      can be finitely bounded, which
       
  1275      we shall show next.
       
  1276 
       
  1277    \begin{figure}[t]
       
  1278   \begin{center}
       
  1279   \begin{tabular}{@ {\hspace{-8mm}}c@ {}}
       
  1280   @{thm[mode=Axiom] bs1[of _ "r\<^sub>2"]}$S\ZERO{}_l$\quad
       
  1281   @{thm[mode=Axiom] bs2[of _ "r\<^sub>1"]}$S\ZERO{}_r$\quad
       
  1282   @{thm[mode=Axiom] bs3[of "bs\<^sub>1" "bs\<^sub>2"]}$S\ONE$\\
       
  1283   @{thm[mode=Rule] bs4[of "r\<^sub>1" "r\<^sub>2" _ "r\<^sub>3"]}SL\qquad
       
  1284   @{thm[mode=Rule] bs5[of "r\<^sub>3" "r\<^sub>4" _ "r\<^sub>1"]}SR\\
       
  1285   @{thm[mode=Axiom] bs6}$A0$\quad
       
  1286   @{thm[mode=Axiom] bs7}$A1$\quad
       
  1287   @{thm[mode=Rule] bs10[of "rs\<^sub>1" "rs\<^sub>2"]}$AL$\\
       
  1288   @{thm[mode=Rule] ss2[of "rs\<^sub>1" "rs\<^sub>2"]}$LT$\quad
       
  1289   @{thm[mode=Rule] ss3[of "r\<^sub>1" "r\<^sub>2"]}$LH$\quad
       
  1290   @{thm[mode=Axiom] ss4}$L\ZERO$\quad
       
  1291   @{thm[mode=Axiom] ss5[of "bs" "rs\<^sub>1" "rs\<^sub>2"]}$LS$\medskip\\
       
  1292   @{thm[mode=Rule] ss6[of "r\<^sub>2" "r\<^sub>1" "rs\<^sub>1" "rs\<^sub>2" "rs\<^sub>3"]}$LD$\\[-5mm]
       
  1293   \end{tabular}
       
  1294   \end{center}
       
  1295   \caption{The rewrite rules that generate simplified regular expressions
       
  1296   in small steps: @{term "rrewrite r\<^sub>1 r\<^sub>2"} is for bitcoded regular
       
  1297   expressions and @{term "srewrite rs\<^sub>1 rs\<^sub>2"} for \emph{lists} of bitcoded
       
  1298   expressions. Of interest is the $LD$ rule, which allows copies of regular
       
  1299   expressions to be removed provided a regular expression earlier in the list can
       
  1300   match the same strings.}\label{SimpRewrites}
       
  1301   \end{figure}
       
  1302 *}
       
  1303 
       
  1304 section {* Finiteness of Derivatives *}
       
  1305 
       
  1306 text {*
       
  1307 
       
  1308 In this section let us sketch our argument for why the size of the simplified
       
  1309 derivatives with the aggressive simplification function can be finitely bounded. Suppose
       
  1310 we have a size function for bitcoded regular expressions, written
       
  1311 $\llbracket r\rrbracket$, which counts the number of nodes if we regard $r$ as a tree
       
  1312 (we omit the precise definition; ditto for lists $\llbracket r\!s\rrbracket$). For this we show that for every $r$
       
  1313 there exists a bound $N$
       
  1314 such that 
       
  1315 
       
  1316 \begin{center}
       
  1317 $\forall s. \; \llbracket@{term "bders_simp r s"}\rrbracket \leq N$
       
  1318 \end{center}
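\noindent
Since we omit the precise definition of the size function, here is one
plausible node-counting measure as a Scala sketch (an assumption on our
part, reusing the datatype from the earlier sketches; bitsequences do not
contribute to the size):

\begin{verbatim}
// Count the nodes of an annotated regular expression seen as a tree.
def asize(r: ARexp): Int = r match {
  case AZERO           => 1
  case AONE(_)         => 1
  case ACHAR(_, _)     => 1
  case AALTS(_, rs)    => 1 + rs.map(asize).sum
  case ASEQ(_, r1, r2) => 1 + asize(r1) + asize(r2)
  case ASTAR(_, r1)    => 1 + asize(r1)
}
\end{verbatim}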
       
  1319 
       
  1320 \noindent
       
  1321 We prove this by induction on $r$. The base cases for @{term AZERO},
       
  1322 @{term "AONE bs"} and @{term "ACHAR bs c"} are straightforward. The interesting case is
       
  1323 for sequences of the form @{term "ASEQ bs r\<^sub>1 r\<^sub>2"}. In this case our induction
       
  1324 hypotheses state $\exists N_1. \forall s. \; \llbracket{}@{term "bders_simp r\<^sub>1 s"}\rrbracket \leq N_1$ and
       
  1325 $\exists N_2. \forall s. \; \llbracket@{term "bders_simp r\<^sub>2 s"}\rrbracket \leq N_2$. We can reason as follows
       
  1326 %
       
  1327 \begin{center}
       
  1328 \begin{tabular}{lcll}
       
  1329 & & $ \llbracket@{term "bders_simp (ASEQ bs r\<^sub>1 r\<^sub>2) s"}\rrbracket$\\
       
  1330 & $ = $ & $\llbracket bsimp\,(\textit{ALTs}\;bs\;((@{term "ASEQ [] (bders_simp r\<^sub>1 s) r\<^sub>2"}) ::
       
  1331     [@{term "bders_simp r\<^sub>2 s'"} \;|\; s' \in \textit{Suffix}(@{text r}_1, s)]))\rrbracket $ & (1) \\
       
  1332 & $\leq$ &
       
  1333     $\llbracket\textit{distinctWith}\,((@{term "ASEQ [] (bders_simp r\<^sub>1 s) r\<^sub>2"}) ::
       
  1334     [@{term "bders_simp r\<^sub>2 s'"} \;|\; s' \in \textit{Suffix}(@{text r}_1, s)])\,\approx\;@{term "{}"}\rrbracket + 1 $ & (2) \\
       
  1335 & $\leq$ & $\llbracket@{term "ASEQ [] (bders_simp r\<^sub>1 s) r\<^sub>2"}\rrbracket +
       
  1336              \llbracket\textit{distinctWith}\,[@{term "bders_simp r\<^sub>2 s'"} \;|\; s' \in \textit{Suffix}(@{text r}_1, s)]\,\approx\;@{term "{}"}\rrbracket + 1 $ & (3) \\
       
  1337 & $\leq$ & $N_1 + \llbracket r_2\rrbracket + 2 +
       
  1338       \llbracket\textit{distinctWith}\,[@{term "bders_simp r\<^sub>2 s'"} \;|\; s' \in \textit{Suffix}(@{text r}_1, s)]\,\approx\;@{term "{}"}\rrbracket$ & (4)\\
       
  1339 & $\leq$ & $N_1 + \llbracket r_2\rrbracket + 2 + l_{N_{2}} * N_{2}$ & (5)
       
  1340 \end{tabular}
       
  1341 \end{center}
       
  1342 
       
  1343 % tell Chengsong about Indian paper of closed forms of derivatives
       
  1344 
       
  1345 \noindent
       
  1346 where in (1) $\textit{Suffix}(@{text "r"}_1, s)$ are all the suffixes $s'$ of $s$ for which @{term "bders_simp r\<^sub>1 s'"} is nullable.
       
  1347 In (3) we know that  $\llbracket@{term "ASEQ [] (bders_simp r\<^sub>1 s) r\<^sub>2"}\rrbracket$ is 
       
  1348 bounded by $N_1 + \llbracket{}r_2\rrbracket + 1$. In (5) we know the list comprehension contains only regular expressions of size smaller
       
  1349 than $N_2$. The list length after @{text distinctWith} is bounded by a number, which we call $l_{N_2}$. It stands
       
  1350 for the number of distinct regular expressions smaller than $N_2$ (there can only be finitely many of them).
       
  1351 We reason similarly for @{text STAR}.\medskip
       
  1352 
       
  1353 \noindent
       
  1354 Clearly, the bound we give in this finiteness argument (step (5)) is very loose and

  1355 far from the actual bound we can expect. We can do better than this, but this does not improve
       
  1356 the finiteness property we are proving. If we are interested in a polynomial bound,
       
  1357 one would hope to obtain a similar tight bound as for partial
       
  1358 one would hope to obtain a similarly tight bound as for the partial

  1359 derivatives introduced by Antimirov \cite{Antimirov95}. After all, the idea behind
       
  1360 partial derivatives). Unfortunately to obtain the exact same bound would mean
       
  1361 partial derivatives). Unfortunately, to obtain the exact same bound would mean
       
  1362  $(r_1 + r_2) \cdot r_3 \longrightarrow (r_1 \cdot r_3) + (r_2 \cdot r_3)$,
       
  1363 which exist for partial derivatives. However, if we introduce them in our
       
  1364 setting we would lose the POSIX property of our calculated values. We leave better
       
  1365 setting, we would lose the POSIX property of our calculated values. We leave better
       
  1366 
       
  1367 *}
       
  1368 
       
  1369 
       
  1370 section {* Conclusion *}
       
  1371 
       
  1372 text {*
       
  1373 
       
  1374    We set out in this work to prove in Isabelle/HOL the correctness of
       
  1375    the second POSIX lexing algorithm by Sulzmann and Lu
       
  1376    \cite{Sulzmann2014}. This follows earlier work where we established
       
  1377    the correctness of the first algorithm
       
  1378    \cite{AusafDyckhoffUrban2016}. In the earlier work we needed to
       
  1379    introduce our own specification about what POSIX values are,
       
  1380    because the informal definition given by Sulzmann and Lu did not
       
  1381    stand up to a formal proof. Also for the second algorithm we needed
       
  1382    to introduce our own definitions and proof ideas in order to establish the
       
  1383    correctness.  Our interest in the second algorithm 
       
  1384    lies in the fact that by using bitcoded regular expressions and an aggressive
       
  1385    simplification method there is a chance that the derivatives
       
  1386    can be kept universally small  (we established in this paper that
       
  1387    they can be kept finitely bounded for any string). This is important if one is after
       
  1388    an efficient POSIX lexing algorithm based on derivatives.
       
  1389 
       
  1390    Having proved the correctness of the POSIX lexing algorithm, what
       
  1391    lessons have we learned? Well, we feel this is a very good example
       
  1392    where formal proofs give further insight into the matter at
       
  1393    hand. For example, it is very hard to see a problem with @{text nub}
       
  1394    vs @{text distinctWith} with only experimental data---one would still
       
  1395    see the correct result but find that simplification does not simplify in well-chosen, but not
       
  1396    obscure, examples. We found that from an implementation
       
  1397    point-of-view it is really important to have the formal proofs of
       
  1398    the corresponding properties at hand.
       
  1399 
       
  1400    We have also developed a
       
  1401    healthy suspicion when experimental data is used to back up
       
  1402    efficiency claims. For example Sulzmann and Lu write about their
       
  1403    equivalent of @{term blexer_simp} \textit{``...we can incrementally compute
       
  1404    bitcoded parse trees in linear time in the size of the input''}
       
  1405    \cite[Page 14]{Sulzmann2014}. 
       
  1406    Given the growth of the
       
  1407    derivatives in some cases even after aggressive simplification, this
       
  1408    is a claim that is hard to believe. A similar claim about a theoretical runtime
       
  1409    of @{text "O(n\<^sup>2)"} is made for the Verbatim lexer, which calculates
       
  1410    tokens according to POSIX rules~\cite{verbatim}. For this Verbatim uses Brzozowski's
       
  1411    derivatives like in our work. 
       
  1412    The authors write: \textit{``The results of our empirical tests [..] confirm that Verbatim has
       
  1413    @{text "O(n\<^sup>2)"} time complexity.''} \cite[Section~VII]{verbatim}.
       
  1414    While their correctness proof for Verbatim is formalised in Coq, the claim about
       
  1415    the runtime complexity is only supported by some empirical evidence obtained
       
  1416    by using the code extraction facilities of Coq.
       
  1417    Given our observation with the ``growth problem'' of derivatives,
       
  1418    we
       
  1419    tried out their extracted OCaml code with the example
       
  1420    \mbox{@{text "(a + aa)\<^sup>*"}} as a single lexing rule, and it took us around 5 minutes to tokenise a

  1421    string of 40 $a$'s; this increased to approximately 19 minutes when the

  1422    string was 50 $a$'s long. Taking into account that derivatives are not simplified in the Verbatim
       
  1423    lexer, such numbers are not surprising. 
       
  1424    Clearly our result of having finite
       
  1425    derivatives might sound rather weak in this context but we think such efficiency claims
       
  1426    really require further scrutiny.
       
  1427 
       
  1428    The contribution of this paper is to make sure
       
  1429    derivatives do not grow arbitrarily big (universally). In the example \mbox{@{text "(a + aa)\<^sup>*"}},
       
  1430    \emph{all} derivatives have a size of 17 or less. The result is that
       
  1431    lexing a string of, say, 50\,000 $a$'s with the regular expression \mbox{@{text "(a + aa)\<^sup>*"}} takes approximately
       
  1432    10 seconds with our Scala implementation
       
  1433    of the presented algorithm. 
       
  1434    \smallskip
       
  1435 
       
  1436    \noindent
       
  1437    Our Isabelle code including the results from Sec.~5 is available from \url{https://github.com/urbanchr/posix}.\\[-10mm]
       
  1438 
       
  1439 
       
  1440 %%\bibliographystyle{plain}
       
  1441 \bibliography{root}
       
  1442 
       
  1443 \appendix
       
  1444 
       
  1445 \section{Some Proofs}
       
  1446 
       
  1447 While we have formalised \emph{all} results in Isabelle/HOL, below we shall give some
       
  1448 rough details of our reasoning in ``English''.
       
  1449 
       
  1450 \begin{proof}[Proof of Lemma~\ref{codedecode}]
       
  1451   This follows from the property that
       
  1452   $\textit{decode}'\,((\textit{code}\,v) \,@\, bs)\,r = (v, bs)$ holds
       
  1453   for any bit-sequence $bs$ and $\vdash v : r$. This property can be
       
  1454   easily proved by induction on $\vdash v : r$.
       
  1455 \end{proof}  
       
  1456 
       
  1457 \begin{proof}[Proof of Lemma~\ref{mainlemma}]
       
  1458   This can be proved by induction on $s$, generalising over
       
  1459   $v$. The interesting point is that we need to prove this in the
       
  1460   reverse direction for $s$. This means instead of cases $[]$ and
       
  1461   $c\!::\!s$, we have cases $[]$ and $s\,@\,[c]$ where we unravel the
       
  1462   string from the back.\footnote{Isabelle/HOL provides an induction principle
       
  1463     for this way of performing the induction.}
       
  1464 
       
  1465   The case for $[]$ is routine using Lemmas~\ref{codedecode}
       
  1466   and~\ref{retrievecode}. In the case $s\,@\,[c]$, we can infer from
       
  1467   the assumption that $\vdash v : (r\backslash s)\backslash c$
       
  1468   holds. Hence by Prop.~\ref{Posix2} we know that 
       
  1469   (*) $\vdash \inj\,(r\backslash s)\,c\,v : r\backslash s$ holds too.
       
  1470   By definition of $\textit{flex}$ we can unfold the left-hand side
       
  1471   to be
       
  1472   \[
       
  1473     \textit{Some}\,(\textit{flex}\;r\,\textit{id}\,(s\,@\,[c])\,v) =
       
  1474     \textit{Some}\,(\textit{flex}\;r\,\textit{id}\,s\,(\inj\,(r\backslash s)\,c\,v))  
       
  1475   \]  
       
  1476 
       
  1477   \noindent
       
  1478   By IH and (*) we can rewrite the right-hand side to
       
  1479   $\textit{decode}\,(\textit{retrieve}\,(r^\uparrow\backslash s)\;
       
  1480     (\inj\,(r\backslash s)\,c\,\,v))\,r$ which is equal to
       
  1481   $\textit{decode}\,(\textit{retrieve}\, (r^\uparrow\backslash
       
  1482   (s\,@\,[c]))\,v)\,r$ as required. The last rewrite step is possible
       
  1483   because we generalised over $v$ in our induction.
       
  1484 \end{proof}
       
  1485 
       
  1486 \begin{proof}[Proof of Theorem~\ref{thmone}]
       
  1487   We can first expand both sides using Lemma~\ref{flex} and the
       
  1488   definition of \textit{blexer}. This gives us two
       
  1489   \textit{if}-statements, which we need to show to be equal. By 
       
  1490   Lemma~\ref{bnullable}\textit{(2)} we know the \textit{if}-tests coincide:
       
  1491   $\textit{bnullable}(r^\uparrow\backslash s) \;\textit{iff}\;
       
  1492    \nullable(r\backslash s)$.
       
  1493   For the \textit{if}-branch suppose $r_d \dn r^\uparrow\backslash s$ and
       
  1494   $d \dn r\backslash s$. We have (*) @{text "nullable d"}. We can then show
       
  1495   by Lemma~\ref{bnullable}\textit{(3)} that
       
  1496   %
       
  1497   \[
       
  1498     \textit{decode}(\textit{bmkeps}\:r_d)\,r =
       
  1499     \textit{decode}(\textit{retrieve}\,r_d\,(\textit{mkeps}\,d))\,r
       
  1500   \]
       
  1501 
       
  1502   \noindent
       
  1503   where the right-hand side is equal to
       
  1504   $\textit{Some}\,(\textit{flex}\,r\,\textit{id}\,s\,(\textit{mkeps}\,
       
  1505   d))$ by Lemma~\ref{mainlemma} (we know
       
  1506   $\vdash \textit{mkeps}\,d : d$ by (*)).  This shows the
       
  1507   \textit{if}-branches return the same value. In the
       
  1508   \textit{else}-branches both \textit{lexer} and \textit{blexer} return
       
  1509   \textit{None}. Therefore we can conclude the proof.
       
  1510 \end{proof}  
       
  1511 
       
  1512 \begin{proof}[Proof of Lemma~\ref{lemone}]
       
  1513      By induction on @{text r}. For this we can use the properties
       
  1514      @{thm fltsfrewrites} and @{text "rs"}$\;\stackrel{s}{\leadsto}^*$@{text "distinctWith rs \<approx>"} @{term "{}"}. The latter uses
       
  1515      repeated applications of the $LD$ rule which allows the removal
       
  1516      of duplicates that can recognise the same strings. 
       
  1517      \end{proof}
       
  1518 
       
  1519     \begin{proof}[Proof of Lemma~\ref{lembnull}]
       
  1520      Straightforward mutual induction on the definition of $\leadsto$ and $\stackrel{s}{\leadsto}$.
       
  1521      The only interesting case is the rule $LD$, where the property holds since, by the side-conditions of that rule, the empty string will
       
  1522      be in both @{text "L (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ [r\<^sub>2] @ rs\<^sub>c)"} and
       
  1523      @{text "L (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ rs\<^sub>c)"}.
       
  1524      \end{proof}
       
  1525 
       
  1526  \begin{proof}[Proof of Lemma \ref{lemthree}]
       
  1527      By straightforward mutual induction on the definition of $\leadsto$ and $\stackrel{s}{\leadsto}$.
       
  1528      Again, the only interesting case is the rule $LD$, where we need to ensure that
       
  1529      @{text "bmkeps (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ [r\<^sub>2] @ rs\<^sub>c) =
       
  1530         bmkeps (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ rs\<^sub>c)"} holds.
       
  1531      This is indeed the case because, according to the POSIX rules, the

  1532      generated bitsequence is determined by the first alternative that can match the

  1533      string (in this case, the first alternative that is nullable).
       
  1534      \end{proof}
       
  1535 
       
  1536      \begin{proof}[Proof of Lemma~\ref{bderlem}]
       
  1537      By straightforward mutual induction on the definition of $\leadsto$ and $\stackrel{s}{\leadsto}$.
       
  1538      The case for $LD$ holds because @{term "L (erase (bder c r\<^sub>2)) \<subseteq> L (erase (bder c r\<^sub>1))"}
       
  1539      if and only if @{term "L (erase (r\<^sub>2)) \<subseteq> L (erase (r\<^sub>1))"}.
       
  1540      \end{proof}
       
  1541 *}
       
  1542 
       
  1543 (*<*)
       
  1544 end
       
  1545 (*>*)