thys3/Paper.thy
changeset 569 5af61c89f51e
parent 563 c92a41d9c4da
child 571 a76458dd9e4c
     1 (*<*)
     1 (*<*)
     2 theory Paper
     2 theory Paper
     3   imports
     3   imports
       
     4    "Posix.LexerSimp" 
     4    "Posix.FBound" 
     5    "Posix.FBound" 
     5    "HOL-Library.LaTeXsugar"
     6    "HOL-Library.LaTeXsugar"
     6 begin
     7 begin
     7 
     8 
     8 declare [[show_question_marks = false]]
     9 declare [[show_question_marks = false]]
    27   bders_simp ("_\\\<^sub>b\<^sub>s\<^sub>i\<^sub>m\<^sub>p _" [79, 1000] 76) and
    28   bders_simp ("_\\\<^sub>b\<^sub>s\<^sub>i\<^sub>m\<^sub>p _" [79, 1000] 76) and
    28 
    29 
    29   ZERO ("\<^bold>0" 81) and 
    30   ZERO ("\<^bold>0" 81) and 
    30   ONE ("\<^bold>1" 81) and 
    31   ONE ("\<^bold>1" 81) and 
    31   CH ("_" [1000] 80) and
    32   CH ("_" [1000] 80) and
    32   ALT ("_ + _" [77,77] 78) and
    33   ALT ("_ + _" [76,76] 77) and
    33   SEQ ("_ \<cdot> _" [77,77] 78) and
    34   SEQ ("_ \<cdot> _" [78,78] 78) and
    34   STAR ("_\<^sup>*" [79] 78) and
    35   STAR ("_\<^sup>*" [79] 78) and
    35   NTIMES ("_\<^sup>{\<^sup>_\<^sup>}" [79] 78) and
    36   NTIMES ("_\<^sup>{\<^bsup>_\<^esup>\<^sup>}" [79] 78) and
    36 
    37 
    37   val.Void ("Empty" 78) and
    38   val.Void ("Empty" 78) and
    38   val.Char ("Char _" [1000] 78) and
    39   val.Char ("Char _" [1000] 78) and
    39   val.Left ("Left _" [79] 78) and
    40   val.Left ("Left _" [79] 78) and
    40   val.Right ("Right _" [1000] 78) and
    41   val.Right ("Right _" [1000] 78) and
    41   val.Seq ("Seq _ _" [79,79] 78) and
    42   val.Seq ("Seq _ _" [79,79] 78) and
    42   val.Stars ("Stars _" [79] 78) and
    43   val.Stars ("Stars _" [1000] 78) and
    43 
    44 
    44   Prf ("\<turnstile> _ : _" [75,75] 75) and  
    45   Prf ("\<turnstile> _ : _" [75,75] 75) and  
    45   Posix ("'(_, _') \<rightarrow> _" [63,75,75] 75) and
    46   Posix ("'(_, _') \<rightarrow> _" [63,75,75] 75) and
    46 
    47 
    47   flat ("|_|" [75] 74) and
    48   flat ("|_|" [75] 74) and
    55   AONE ("ONE _" [79] 78) and 
    56   AONE ("ONE _" [79] 78) and 
    56   ACHAR ("CHAR _ _" [79, 79] 80) and
    57   ACHAR ("CHAR _ _" [79, 79] 80) and
    57   AALTs ("ALTs _ _" [77,77] 78) and
    58   AALTs ("ALTs _ _" [77,77] 78) and
    58   ASEQ ("SEQ _ _ _" [79, 79,79] 78) and
    59   ASEQ ("SEQ _ _ _" [79, 79,79] 78) and
    59   ASTAR ("STAR _ _" [79, 79] 78) and
    60   ASTAR ("STAR _ _" [79, 79] 78) and
       
    61   ANTIMES ("NT _ _ _" [79, 79, 79] 78) and
    60 
    62 
    61   code ("code _" [79] 74) and
    63   code ("code _" [79] 74) and
    62   intern ("_\<^latex>\<open>\\mbox{$^\\uparrow$}\<close>" [900] 80) and
    64   intern ("_\<^latex>\<open>\\mbox{$^\\uparrow$}\<close>" [900] 80) and
    63   erase ("_\<^latex>\<open>\\mbox{$^\\downarrow$}\<close>" [1000] 74) and
    65   erase ("_\<^latex>\<open>\\mbox{$^\\downarrow$}\<close>" [1000] 74) and
    64   bnullable ("bnullable _" [1000] 80) and
    66   bnullable ("bnullable _" [1000] 80) and
    79    and   "rs \<noteq> Nil ==> retrieve (AALTs bs (r#rs)) (Right v) = bs @ retrieve (AALTs [] rs) v"
    81    and   "rs \<noteq> Nil ==> retrieve (AALTs bs (r#rs)) (Right v) = bs @ retrieve (AALTs [] rs) v"
    80   apply (metis list.exhaust retrieve.simps(4))
    82   apply (metis list.exhaust retrieve.simps(4))
    81   by (metis list.exhaust retrieve.simps(5))
    83   by (metis list.exhaust retrieve.simps(5))
    82 
    84 
    83 
    85 
    84 
    86 lemma better_retrieve2:
    85 
    87   shows "retrieve (ANTIMES bs r (n + 1)) (Stars (v#vs)) = 
       
    88      bs @ [Z] @ retrieve r v @ retrieve (ANTIMES [] r n) (Stars vs)"
       
    89 by auto
    86 (*>*)
    90 (*>*)
    87 
    91 
    88 section {* Introduction *}
    92 section {* Introduction *}
    89 
    93 
    90 text {*
    94 text {*
    91 
    95 
    92 In the last fifteen or so years, Brzozowski's derivatives of regular
    96 In the last fifteen or so years, Brzozowski's derivatives of regular
    93 expressions have sparked quite a bit of interest in the functional
    97 expressions have sparked quite a bit of interest in the functional
    94 programming and theorem prover communities.  The beauty of
    98 programming and theorem prover communities.
    95 Brzozowski's derivatives \cite{Brzozowski1964} is that they are neatly
       
    96 expressible in any functional language, and easily definable and
       
    97 reasoned about in theorem provers---the definitions just consist of
       
    98 inductive datatypes and simple recursive functions. Another neat
       
    99 feature of derivatives is that they can be easily extended to bounded
       
   100 regular expressions, such as @{term "NTIMES r n"}, where numbers or
       
   101 intervals specify how many times a regular expression should be used
       
   102 during matching.
       
   103 
       
   104 Derivatives of a
    99 Derivatives of a
   105 regular expression, written @{term "der c r"}, give a simple solution
   100 regular expression, written @{term "der c r"}, give a simple solution
   106 to the problem of matching a string @{term s} with a regular
   101 to the problem of matching a string @{term s} with a regular
   107 expression @{term r}: if the derivative of @{term r} w.r.t.\ (in
   102 expression @{term r}: if the derivative of @{term r} w.r.t.\ (in
   108 succession) all the characters of the string matches the empty string,
   103 succession) all the characters of the string matches the empty string,
   109 then @{term r} matches @{term s} (and {\em vice versa}).  We are aware
   104 then @{term r} matches @{term s} (and {\em vice versa}).  
   110 of a mechanised correctness proof of Brzozowski's derivative-based matcher in HOL4 by
   105 The beauty of
   111 Owens and Slind~\cite{Owens2008}. Another one in Isabelle/HOL is part
   106 Brzozowski's derivatives \cite{Brzozowski1964} is that they are neatly
   112 of the work by Krauss and Nipkow~\cite{Krauss2011}.  And another one
   107 expressible in any functional language, and easily definable and
   113 in Coq is given by Coquand and Siles \cite{Coquand2012}.
   108 reasoned about in theorem provers---the definitions just consist of
   114 Also Ribeiro and Du Bois give one in Agda~\cite{RibeiroAgda2017}.
   109 inductive datatypes and simple recursive functions. Another attractive
       
   110 feature of derivatives is that they can be easily extended to \emph{bounded}
       
   111 regular expressions, such as @{term "r"}$^{\{@{text n}\}}$ or @{term r}$^{\{..@{text n}\}}$, where numbers or
       
   112 intervals of numbers specify how many times a regular expression should be used
       
   113 during matching.
       
   114 
       
   115 
   115 
   116 
   116 
   117 
   117 However, there are two difficulties with derivative-based matchers:
   118 However, there are two difficulties with derivative-based matchers:
   118 First, Brzozowski's original matcher only generates a yes/no answer
   119 First, Brzozowski's original matcher only generates a yes/no answer
   119 for whether a regular expression matches a string or not.  This is too
   120 for whether a regular expression matches a string or not.  This is too
   157 \Rightarrow r$, $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow
   158 \Rightarrow r$, $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow
   158 r$. While such simple-minded simplifications have been proved in our
   159 r$. While such simple-minded simplifications have been proved in our
   159 earlier work to preserve the correctness of Sulzmann and Lu's
   160 earlier work to preserve the correctness of Sulzmann and Lu's
   160 algorithm \cite{AusafDyckhoffUrban2016}, they unfortunately do
   161 algorithm \cite{AusafDyckhoffUrban2016}, they unfortunately do
   161 \emph{not} help with limiting the growth of the derivatives shown
   162 \emph{not} help with limiting the growth of the derivatives shown
   162 above: the growth is slowed, but the derivatives can still grow rather
   163 above: the growth is slowed, but some derivatives can still grow rather
   163 quickly beyond any finite bound.
   164 quickly beyond any finite bound.
   164 
   165 
   165 
   166 
   166 Sulzmann and Lu overcome this ``growth problem'' in a second algorithm
   167 Sulzmann and Lu address this ``growth problem'' in a second algorithm
   167 \cite{Sulzmann2014} where they introduce bitcoded
   168 \cite{Sulzmann2014} where they introduce bitcoded
   168 regular expressions. In this version, POSIX values are
   169 regular expressions. In this version, POSIX values are
   169 represented as bitsequences and such sequences are incrementally generated
   170 represented as bitsequences and such sequences are incrementally generated
   170 when derivatives are calculated. The compact representation
   171 when derivatives are calculated. The compact representation
   171 of bitsequences and regular expressions allows them to define a more
   172 of bitsequences and regular expressions allows them to define a more
   187 
   188 
   188 \noindent{}\textbf{Contributions:} We have implemented in Isabelle/HOL
   189 \noindent{}\textbf{Contributions:} We have implemented in Isabelle/HOL
   189 the derivative-based lexing algorithm of Sulzmann and Lu
   190 the derivative-based lexing algorithm of Sulzmann and Lu
   190 \cite{Sulzmann2014} where regular expressions are annotated with
   191 \cite{Sulzmann2014} where regular expressions are annotated with
   191 bitsequences. We define the crucial simplification function as a
   192 bitsequences. We define the crucial simplification function as a
   192 recursive function, without the need of a fix-point operation. One objective of
    193 recursive function, without the need for a fixpoint operation. One objective of
   193 the simplification function is to remove duplicates of regular
   194 the simplification function is to remove duplicates of regular
   194 expressions.  For this Sulzmann and Lu use in their paper the standard
   195 expressions.  For this Sulzmann and Lu use in their paper the standard
   195 @{text nub} function from Haskell's list library, but this function
   196 @{text nub} function from Haskell's list library, but this function
   196 does not achieve the intended objective with bitcoded regular expressions.  The
   197 does not achieve the intended objective with bitcoded regular expressions.  The
   197 reason is that in the bitcoded setting, each copy generally has a
   198 reason is that in the bitcoded setting, each copy generally has a
   198 different bitcode annotation---so @{text nub} would never ``fire''.
   199 different bitcode annotation---so @{text nub} would never ``fire''.
   199 Inspired by Scala's library for lists, we shall instead use a @{text
   200 Inspired by Scala's library for lists, we shall instead use a @{text
   200 distinctWith} function that finds duplicates under an ``erasing'' function
   201 distinctWith} function that finds duplicates under an ``erasing'' function
   201 which deletes bitcodes before comparing regular expressions.
   202 which deletes bitcodes before comparing regular expressions.
   202 We shall also introduce our own argument and definitions for
   203 We shall also introduce our \emph{own} argument and definitions for
   203 establishing the correctness of the bitcoded algorithm when 
   204 establishing the correctness of the bitcoded algorithm when 
   204 simplifications are included. Finally we
   205 simplifications are included. Finally we
   205 establish that the size of derivatives can be finitely bounded.\medskip%\footnote{ In this paper, we shall first briefly introduce the basic notions
   206 establish that the size of derivatives can be finitely bounded.\medskip%\footnote{ In this paper, we shall first briefly introduce the basic notions
   206 %of regular expressions and describe the definition
   207 %of regular expressions and describe the definition
   207 %of POSIX lexing from our earlier work \cite{AusafDyckhoffUrban2016}. This serves
   208 %of POSIX lexing from our earlier work \cite{AusafDyckhoffUrban2016}. This serves
   226   the empty string being represented by the empty list, written $[]$,
   227   the empty string being represented by the empty list, written $[]$,
   227   and list-cons being written as $\_\!::\!\_\,$; string
   228   and list-cons being written as $\_\!::\!\_\,$; string
   228   concatenation is $\_ \,@\, \_\,$. We often use the usual
   229   concatenation is $\_ \,@\, \_\,$. We often use the usual
   229   bracket notation for lists also for strings; for example a string
   230   bracket notation for lists also for strings; for example a string
   230   consisting of just a single character $c$ is written $[c]$.   
   231   consisting of just a single character $c$ is written $[c]$.   
   231   Our regular expressions are defined as usual as the following inductive
   232   Our regular expressions are defined as the following inductive
   232   datatype:\\[-4mm]
   233   datatype:\\[-4mm]
   233   %
   234   %
   234   \begin{center}
   235   \begin{center}
   235   @{text "r ::="} \;
   236   @{text "r ::="} \;
   236   @{const "ZERO"} $\mid$
   237   @{const "ZERO"} $\mid$
   240   @{term "SEQ r\<^sub>1 r\<^sub>2"} $\mid$
   241   @{term "SEQ r\<^sub>1 r\<^sub>2"} $\mid$
   241   @{term "STAR r"} $\mid$
   242   @{term "STAR r"} $\mid$
   242   @{term "NTIMES r n"}
   243   @{term "NTIMES r n"}
   243   \end{center}
   244   \end{center}
   244 
   245 
   245   \noindent where @{const ZERO} stands for the regular expression that does
   246   \noindent where @{const ZERO} stands for the regular expression that
   246   not match any string, @{const ONE} for the regular expression that matches
   247   does not match any string, @{const ONE} for the regular expression
   247   only the empty string and @{term c} for matching a character literal.
   248   that matches only the empty string and @{term c} for matching a
   248   The constructors $+$ and $\cdot$ represent alternatives and sequences, respectively.
   249   character literal.  The constructors $+$ and $\cdot$ represent
   249   We sometimes omit the $\cdot$ in a sequence regular expression for brevity. 
   250   alternatives and sequences, respectively.  We sometimes omit the
   250   In our work here we also add to the usual ``basic'' regular expressions 
   251   $\cdot$ in a sequence regular expression for brevity.  The
   251   the bounded regular expression @{term "NTIMES r n"} where the @{term n}
   252   \emph{language} of a regular expression, written $L(r)$, is defined
   252   specifies that @{term r} should match exactly @{term n}-times. For
   253   as usual and we omit giving the definition here (see for example
   253   brevity we omit the other bounded regular expressions
   254   \cite{AusafDyckhoffUrban2016}).
   254   @{text "r"}$^{\{..n\}}$, @{text "r"}$^{\{n..\}}$ and @{text "r"}$^{\{n..m\}}$
   255   
   255   which specify an interval for how many times @{text r} should match. Our
   256   In our work here we also add to the usual ``basic'' regular
   256   results extend straightforwardly also to them. The
   257   expressions the \emph{bounded} regular expression @{term "NTIMES r
   257   \emph{language} of a regular expression, written $L(r)$, is defined as usual
   258   n"} where the @{term n} specifies that @{term r} should match
   258   and we omit giving the definition here (see for example \cite{AusafDyckhoffUrban2016}).
   259   exactly @{term n}-times. Again for brevity we omit the other bounded
       
   260   regular expressions @{text "r"}$^{\{..n\}}$, @{text "r"}$^{\{n..\}}$
       
   261   and @{text "r"}$^{\{n..m\}}$ which specify intervals for how many
       
   262   times @{text r} should match. The results presented in this paper
       
   263   extend straightforwardly to them too. The importance of the bounded
       
   264   regular expressions is that they are often used in practical
       
   265   applications, such as Snort (a system for detecting network
       
   266   intrusion) and also in XML Schema definitions. According to Bj\"{o}rklund et
       
    267   al.~\cite{BjorklundMartensTimm2015}, bounded regular expressions 
       
   268   occur frequently in the latter and can have counters of up to
       
   269   ten million.  The problem is that tools based on the classic notion
       
   270   of automata need to expand @{text "r"}$^{\{n\}}$ into @{text n}
       
   271   connected copies of the automaton for @{text r}. This leads to very
       
   272   inefficient matching algorithms or algorithms that consume large
       
   273   amounts of memory.  A classic example is the regular expression
       
   274   \mbox{@{term "SEQ (SEQ (STAR (ALT a b)) a) (NTIMES (ALT a b) n)"}}
       
   275   where the minimal DFA requires at least $2^{n + 1}$ states (see
       
   276   \cite{CountingSet2020}). Therefore regular expression matching
       
   277   libraries that rely on the classic notion of DFAs often impose 
       
    278   ad hoc limits for bounded regular expressions: For example, in the
       
    279   regular expression matching library in the Go language, the regular expression
       
   280   @{term "NTIMES a 1001"} is not permitted, because no counter can be
       
   281   above 1000, and in the built-in Rust regular expression library
       
   282   expressions such as @{text "a\<^bsup>{1000}{100}{5}\<^esup>"} give an error
       
   283   message for being too big.  These problems can of course be solved in matching
       
   284   algorithms where automata go beyond the classic notion and for
       
   285   instance include explicit counters (see~\cite{CountingSet2020}).
       
   286   The point here is that Brzozowski derivatives and the algorithms by
       
   287   Sulzmann and Lu can be straightforwardly extended to deal with
       
   288   bounded regular expressions and moreover the resulting code
       
   289   still consists of only simple recursive functions and inductive
       
   290   datatypes. Finally, bounded regular expressions 
       
   291   do not destroy our finite boundedness property, which we shall
       
   292   prove later on.%, because during the lexing process counters will only be
       
   293   %decremented.
       
   294   
   259 
   295 
   260   Central to Brzozowski's regular expression matcher are two functions
   296   Central to Brzozowski's regular expression matcher are two functions
   261   called @{text nullable} and \emph{derivative}. The latter is written
   297   called @{text nullable} and \emph{derivative}. The latter is written
   262   $r\backslash c$ for the derivative of the regular expression $r$
   298   $r\backslash c$ for the derivative of the regular expression $r$
   263   w.r.t.~the character $c$. Both functions are defined by recursion over
   299   w.r.t.~the character $c$. Both functions are defined by recursion over
   264   regular expressions.  
   300   regular expressions.  
   265 %
   301 %
   266 \begin{center}
   302 \begin{center}
   267 \begin{tabular}{cc}
   303 \begin{tabular}{c @ {\hspace{-1mm}}c}
   268   \begin{tabular}{r@ {\hspace{2mm}}c@ {\hspace{2mm}}l}
   304   \begin{tabular}{r@ {\hspace{2mm}}c@ {\hspace{2mm}}l}
   269   @{thm (lhs) der.simps(1)} & $\dn$ & @{thm (rhs) der.simps(1)}\\
   305   @{thm (lhs) der.simps(1)} & $\dn$ & @{thm (rhs) der.simps(1)}\\
   270   @{thm (lhs) der.simps(2)} & $\dn$ & @{thm (rhs) der.simps(2)}\\
   306   @{thm (lhs) der.simps(2)} & $\dn$ & @{thm (rhs) der.simps(2)}\\
   271   @{thm (lhs) der.simps(3)} & $\dn$ & @{thm (rhs) der.simps(3)}\\
   307   @{thm (lhs) der.simps(3)} & $\dn$ & @{thm (rhs) der.simps(3)}\\
   272   @{thm (lhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]}\\
   308   @{thm (lhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]}\\
   273   @{thm (lhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{text "if"} @{term "nullable(r\<^sub>1)"}\\
   309   @{thm (lhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{text "if"} @{term "nullable(r\<^sub>1)"}\\
   274   & & @{text "then"} @{term "ALT (SEQ (der c r\<^sub>1) r\<^sub>2) (der c r\<^sub>2)"}\\
   310   & & @{text "then"} @{term "ALT (SEQ (der c r\<^sub>1) r\<^sub>2) (der c r\<^sub>2)"}\\
   275   & & @{text "else"} @{term "SEQ (der c r\<^sub>1) r\<^sub>2"}\\
   311   & & @{text "else"} @{term "SEQ (der c r\<^sub>1) r\<^sub>2"}\\
   276   % & & @{thm (rhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]}\\
   312   % & & @{thm (rhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]}\\
   277   @{thm (lhs) der.simps(6)} & $\dn$ & @{thm (rhs) der.simps(6)}
   313   @{thm (lhs) der.simps(6)} & $\dn$ & @{thm (rhs) der.simps(6)}\\
       
   314   @{thm (lhs) der.simps(7)} & $\dn$ & @{thm (rhs) der.simps(7)}
   278   \end{tabular}
   315   \end{tabular}
   279   &
   316   &
   280   \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
   317   \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
   281   @{thm (lhs) nullable.simps(1)} & $\dn$ & @{thm (rhs) nullable.simps(1)}\\
   318   @{thm (lhs) nullable.simps(1)} & $\dn$ & @{thm (rhs) nullable.simps(1)}\\
   282   @{thm (lhs) nullable.simps(2)} & $\dn$ & @{thm (rhs) nullable.simps(2)}\\
   319   @{thm (lhs) nullable.simps(2)} & $\dn$ & @{thm (rhs) nullable.simps(2)}\\
   283   @{thm (lhs) nullable.simps(3)} & $\dn$ & @{thm (rhs) nullable.simps(3)}\\
   320   @{thm (lhs) nullable.simps(3)} & $\dn$ & @{thm (rhs) nullable.simps(3)}\\
   284   @{thm (lhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
   321   @{thm (lhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
   285   @{thm (lhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
   322   @{thm (lhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
   286   @{thm (lhs) nullable.simps(6)} & $\dn$ & @{thm (rhs) nullable.simps(6)}%\medskip\\
   323   @{thm (lhs) nullable.simps(6)} & $\dn$ & @{thm (rhs) nullable.simps(6)}\\
       
   324   @{thm (lhs) nullable.simps(7)} & $\dn$ & @{text "if"} @{text "n = 0"}\\
       
   325   & & @{text "then"} @{const "True"}\\
       
   326   & & @{text "else"} @{term "nullable r"}
   287   \end{tabular}  
   327   \end{tabular}  
   288 \end{tabular}  
   328 \end{tabular}  
   289 \end{center}
   329 \end{center}
   290 
   330 
   291   \noindent
   331   \noindent
   305 @{text "match s r"} \;\;\text{if and only if}\;\; $s \in L(r)$
   345 @{text "match s r"} \;\;\text{if and only if}\;\; $s \in L(r)$
   306 \end{proposition}
   346 \end{proposition}
   307 
   347 
   308 \noindent
   348 \noindent
   309 It is a fun exercise to formally prove this property in a theorem prover.
   349 It is a fun exercise to formally prove this property in a theorem prover.
       
   350 We are aware
       
   351 of a mechanised correctness proof of Brzozowski's derivative-based matcher in HOL4 by
       
   352 Owens and Slind~\cite{Owens2008}. Another one in Isabelle/HOL is part
       
   353 of the work by Krauss and Nipkow~\cite{Krauss2011}.  And another one
       
   354 in Coq is given by Coquand and Siles \cite{Coquand2012}.
       
   355 Also Ribeiro and Du Bois give one in Agda~\cite{RibeiroAgda2017}.
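
As a concrete illustration of the definitions above (and of how directly they
carry over to a functional language), the following Scala sketch implements the
regular expressions including @{text NTIMES}, together with @{text nullable},
@{text der} and the resulting matcher. It is our own illustration, not part of
the formalisation, and all Scala names are our own choices.

\begin{verbatim}
// regular expressions, including the bounded constructor NTIMES
abstract class Rexp
case object ZERO extends Rexp
case object ONE extends Rexp
case class CHAR(c: Char) extends Rexp
case class ALT(r1: Rexp, r2: Rexp) extends Rexp
case class SEQ(r1: Rexp, r2: Rexp) extends Rexp
case class STAR(r: Rexp) extends Rexp
case class NTIMES(r: Rexp, n: Int) extends Rexp

// can the regular expression match the empty string?
def nullable(r: Rexp): Boolean = r match {
  case ZERO => false
  case ONE => true
  case CHAR(_) => false
  case ALT(r1, r2) => nullable(r1) || nullable(r2)
  case SEQ(r1, r2) => nullable(r1) && nullable(r2)
  case STAR(_) => true
  case NTIMES(r1, n) => if (n == 0) true else nullable(r1)
}

// derivative of r with respect to the character c
def der(c: Char, r: Rexp): Rexp = r match {
  case ZERO => ZERO
  case ONE => ZERO
  case CHAR(d) => if (c == d) ONE else ZERO
  case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))
  case SEQ(r1, r2) =>
    if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
    else SEQ(der(c, r1), r2)
  case STAR(r1) => SEQ(der(c, r1), STAR(r1))
  case NTIMES(r1, n) =>
    if (n == 0) ZERO else SEQ(der(c, r1), NTIMES(r1, n - 1))
}

// derivative with respect to a whole string, and the matcher
def ders(s: List[Char], r: Rexp): Rexp = s match {
  case Nil => r
  case c :: cs => ders(cs, der(c, r))
}
def matches(r: Rexp, s: String): Boolean = nullable(ders(s.toList, r))
\end{verbatim}

\noindent
For example, matching the string \texttt{aaa} against
\texttt{NTIMES(CHAR('a'), 3)} yields \texttt{true}, while \texttt{aa} and
\texttt{aaaa} are rejected.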
   310 
   356 
   311 The novel idea of Sulzmann and Lu is to extend this algorithm for 
   357 The novel idea of Sulzmann and Lu is to extend this algorithm for 
   312 lexing, where it is important to find out which part of the string
   358 lexing, where it is important to find out which part of the string
   313 is matched by which part of the regular expression.
   359 is matched by which part of the regular expression.
   314 For this Sulzmann and Lu presented two lexing algorithms in their paper
   360 For this Sulzmann and Lu presented two lexing algorithms in their paper
   332   function, written @{term "flat DUMMY"}. It traverses a value and
   378   function, written @{term "flat DUMMY"}. It traverses a value and
   333   collects the characters contained in it \cite{AusafDyckhoffUrban2016}.
   379   collects the characters contained in it \cite{AusafDyckhoffUrban2016}.
   334 
   380 
   335 
   381 
   336   Sulzmann and Lu also define inductively an
   382   Sulzmann and Lu also define inductively an
   337   inhabitation relation that associates values to regular expressions. This
   383   inhabitation relation that associates values to regular expressions. Our
   338   is defined by the following six rules for the values:
   384   version of this relation is defined the following six rules for the values:
   339   %
   385   %
   340   \begin{center}
   386   \begin{center}
   341   \begin{tabular}{@ {\hspace{-12mm}}c@ {}}
       
   342   \begin{tabular}{@ {}c@ {}}
   387   \begin{tabular}{@ {}c@ {}}
   343   \\[-8mm]
   388   @{thm[mode=Axiom] Prf.intros(4)}\qquad
   344   @{thm[mode=Axiom] Prf.intros(4)}\\
   389   @{thm[mode=Axiom] Prf.intros(5)[of "c"]}\qquad
   345   @{thm[mode=Axiom] Prf.intros(5)[of "c"]}
   390   @{thm[mode=Rule] Prf.intros(2)[of "v\<^sub>1" "r\<^sub>1" "r\<^sub>2"]}\qquad
   346   \end{tabular}
   391   @{thm[mode=Rule] Prf.intros(3)[of "v\<^sub>2" "r\<^sub>2" "r\<^sub>1"]}\qquad
   347   \quad
   392   @{thm[mode=Rule] Prf.intros(1)[of "v\<^sub>1" "r\<^sub>1" "v\<^sub>2" "r\<^sub>2"]}\medskip\\
   348   @{thm[mode=Rule] Prf.intros(2)[of "v\<^sub>1" "r\<^sub>1" "r\<^sub>2"]}\quad
   393   @{thm[mode=Rule] Prf.intros(6)[of "vs" "r"]}\qquad
   349   @{thm[mode=Rule] Prf.intros(3)[of "v\<^sub>2" "r\<^sub>2" "r\<^sub>1"]}\quad
   394   $\mprset{flushleft}\inferrule{
   350   @{thm[mode=Rule] Prf.intros(1)[of "v\<^sub>1" "r\<^sub>1" "v\<^sub>2" "r\<^sub>2"]}\quad
   395   @{thm (prem 1) Prf.intros(7)[of "vs\<^sub>1" "r"  "vs\<^sub>2" "n"]}\\\\
   351   @{thm[mode=Rule] Prf.intros(6)[of "vs" "r"]}\mbox{}\hspace{-10mm}\mbox{}
   396   @{thm (prem 2) Prf.intros(7)[of "vs\<^sub>1" "r"  "vs\<^sub>2" "n"]}\quad
       
   397   @{thm (prem 3) Prf.intros(7)[of "vs\<^sub>1" "r"  "vs\<^sub>2" "n"]}
       
   398   }
       
   399   {@{thm (concl) Prf.intros(7)[of "vs\<^sub>1" "r"  "vs\<^sub>2" "n"]}}
       
   400   $
   352   \end{tabular}
   401   \end{tabular}
   353   \end{center}
   402   \end{center}
   354 
   403 
   355   \noindent Note that no values are associated with the regular expression
   404   \noindent Note that no values are associated with the regular expression
   356   @{term ZERO}, since it cannot match any string.
    405   @{term ZERO}, since it cannot match any string. An interesting feature is our version
       
   406   of the rule for @{term "STAR r"} where we require that each value
       
    407   in @{term vs} flattens to a non-empty string. This means that if @{term "STAR r"} ``fires''
       
   408   one or more times, then each copy needs to match a non-empty string.
       
   409   Similarly, in the rule
       
   410   for @{term "NTIMES r n"} we require that the length of the list
       
   411   @{term "vs\<^sub>1 @ vs\<^sub>2"} equals @{term n} (meaning the regular expression
       
   412   @{text r} matches @{text n}-times) and that the first segment of this list
       
   413   contains values that flatten to non-empty strings followed by a segment that
       
   414   only contains values that flatten to the empty string. 
   357   It is routine to establish how values ``inhabiting'' a regular
   415   It is routine to establish how values ``inhabiting'' a regular
   358   expression correspond to the language of a regular expression, namely
   416   expression correspond to the language of a regular expression, namely
   359   @{thm L_flat_Prf}.
   417   @{thm L_flat_Prf}.
   360 
   418 
   361   In general there is more than one value inhabited by a regular
   419   In general there is more than one value inhabited by a regular
   376  specification for what POSIX values are (the inductive rules are shown in
   434  specification for what POSIX values are (the inductive rules are shown in
   377   Figure~\ref{POSIXrules}).
   435   Figure~\ref{POSIXrules}).
   378 
   436 
   379 \begin{figure}[t]
   437 \begin{figure}[t]
   380   \begin{center}\small%
   438   \begin{center}\small%
   381   \begin{tabular}{@ {}c@ {}}
   439   \begin{tabular}{@ {\hspace{-2mm}}c@ {}}
   382   \\[-9mm]
   440   \\[-9mm]
   383   @{thm[mode=Axiom] Posix.intros(1)}\<open>P\<close>@{term "ONE"} \quad
   441   @{thm[mode=Axiom] Posix.intros(1)}\<open>P\<close>@{term "ONE"} \quad
   384   @{thm[mode=Axiom] Posix.intros(2)}\<open>P\<close>@{term "c"}\quad
   442   @{thm[mode=Axiom] Posix.intros(2)}\<open>P\<close>@{term "c"}\quad
   385   @{thm[mode=Rule] Posix.intros(3)[of "s" "r\<^sub>1" "v" "r\<^sub>2"]}\<open>P+L\<close>\quad
   443   @{thm[mode=Rule] Posix.intros(3)[of "s" "r\<^sub>1" "v" "r\<^sub>2"]}\<open>P+L\<close>\quad
   386   @{thm[mode=Rule] Posix.intros(4)[of "s" "r\<^sub>2" "v" "r\<^sub>1"]}\<open>P+R\<close>\medskip\\
   444   @{thm[mode=Rule] Posix.intros(4)[of "s" "r\<^sub>2" "v" "r\<^sub>1"]}\<open>P+R\<close>\medskip\\
   395    \inferrule
   453    \inferrule
   396    {@{thm (prem 1) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
   454    {@{thm (prem 1) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
   397     @{thm (prem 2) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
   455     @{thm (prem 2) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
   398     @{thm (prem 3) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \\\\
   456     @{thm (prem 3) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \\\\
   399     @{thm (prem 4) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}
   457     @{thm (prem 4) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}
   400    {@{thm (concl) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}$\<open>P\<star>\<close>\\[-4mm]
   458    {@{thm (concl) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}$\<open>P\<star>\<close>\medskip\smallskip\\
       
   459   \mprset{sep=4mm} 
       
   460   @{thm[mode=Rule] Posix.intros(9)}\<open>Pn[]\<close>\quad 
       
   461   $\mprset{flushleft}
       
   462    \inferrule
       
   463    {@{thm (prem 1) Posix.intros(8)[of "s\<^sub>1" "r" "v" "s\<^sub>2" n "vs"]} \qquad
       
   464     @{thm (prem 2) Posix.intros(8)[of "s\<^sub>1" "r" "v" "s\<^sub>2" n "vs"]} \qquad
       
   465     @{thm (prem 3) Posix.intros(8)[of "s\<^sub>1" "r" "v" "s\<^sub>2" n "vs"]} \\\\
       
   466     @{thm (prem 4) Posix.intros(8)[of "s\<^sub>1" "r" "v" "s\<^sub>2" n "vs"]}}
       
   467    {@{thm (concl) Posix.intros(8)[of "s\<^sub>1" "r" "v" "s\<^sub>2" n "vs"]}}$\<open>Pn+\<close>
       
   468   \\[-4mm]   
   401   \end{tabular}
   469   \end{tabular}
   402   \end{center}
   470   \end{center}
   403   \caption{The inductive definition of POSIX values taken from our earlier paper \cite{AusafDyckhoffUrban2016}. The ternary relation, written $(s, r) \rightarrow v$, formalises the notion
   471   \caption{The inductive definition of POSIX values taken from our earlier paper \cite{AusafDyckhoffUrban2016}. The ternary relation, written $(s, r) \rightarrow v$, formalises the notion
    404   that, given a string $s$ and a regular
    472   that, given a string $s$ and a regular
    405   expression $r$, $v$ is the unique value that satisfies the informal POSIX constraints for
    473   expression $r$, $v$ is the unique value that satisfies the informal POSIX constraints for
   406   regular expression matching.}\label{POSIXrules}
   474   regular expression matching.\smallskip}\label{POSIXrules}
   407   \end{figure}
   475   \end{figure}
   408 
   476 
   409   The clever idea by Sulzmann and Lu \cite{Sulzmann2014} in their first algorithm is to define
   477   The clever idea by Sulzmann and Lu \cite{Sulzmann2014} in their first algorithm is to define
   410   an injection function on values that mirrors (but inverts) the
   478   an injection function on values that mirrors (but inverts) the
   411   construction of the derivative on regular expressions. Essentially it
   479   construction of the derivative on regular expressions. Essentially it
   417   \begin{tabular}{@ {}lcl@ {}}
   485   \begin{tabular}{@ {}lcl@ {}}
   418   @{thm (lhs) mkeps.simps(1)} & $\dn$ & @{thm (rhs) mkeps.simps(1)}\\
   486   @{thm (lhs) mkeps.simps(1)} & $\dn$ & @{thm (rhs) mkeps.simps(1)}\\
   419   @{thm (lhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]}\\
   487   @{thm (lhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]}\\
   420   @{thm (lhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]}\\
   488   @{thm (lhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]}\\
   421   @{thm (lhs) mkeps.simps(4)} & $\dn$ & @{thm (rhs) mkeps.simps(4)}\\
   489   @{thm (lhs) mkeps.simps(4)} & $\dn$ & @{thm (rhs) mkeps.simps(4)}\\
   422   \end{tabular}\smallskip\\
   490   @{thm (lhs) mkeps.simps(5)} & $\dn$ & @{thm (rhs) mkeps.simps(5)}\\
       
   491   \end{tabular}\medskip\\
   423 
   492 
   424   \begin{tabular}{@ {}cc@ {}}
   493   \begin{tabular}{@ {}cc@ {}}
   425   \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}@ {}}
   494   \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}@ {}}
   426   @{thm (lhs) injval.simps(1)} & $\dn$ & @{thm (rhs) injval.simps(1)}\\
   495   @{thm (lhs) injval.simps(1)} & $\dn$ & @{thm (rhs) injval.simps(1)}\\
   427   @{thm (lhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]} & $\dn$ & 
   496   @{thm (lhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]} & $\dn$ & 
   428       @{thm (rhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]}\\
   497       @{thm (rhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]}\\
   429   @{thm (lhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$ & 
   498   @{thm (lhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$ & 
   430       @{thm (rhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\
   499       @{thm (rhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\
   431   @{thm (lhs) injval.simps(7)[of "r" "c" "v" "vs"]} & $\dn$ 
   500   @{thm (lhs) injval.simps(7)[of "r" "c" "v" "vs"]} & $\dn$ 
   432       & @{thm (rhs) injval.simps(7)[of "r" "c" "v" "vs"]}
   501       & @{thm (rhs) injval.simps(7)[of "r" "c" "v" "vs"]}\\
       
   502   @{thm (lhs) injval.simps(8)[of "r" "n" "c" "v" "vs"]} & $\dn$ 
       
   503       & @{thm (rhs) injval.simps(8)[of "r" "n" "c" "v" "vs"]}
   433   \end{tabular}
   504   \end{tabular}
   434   &
   505   &
   435   \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}@ {}}
   506   \begin{tabular}{@ {\hspace{-3mm}}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}@ {}}
   436   @{thm (lhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$ 
   507   @{thm (lhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$ 
   437       & @{thm (rhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\
   508       & @{thm (rhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\
   438   @{thm (lhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$ 
   509   @{thm (lhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$ 
   439       & @{thm (rhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\
   510       & @{thm (rhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\
   440   @{thm (lhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$\\ 
   511   @{thm (lhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$\\ 
   441   \multicolumn{3}{@ {}r@ {}}{@{thm (rhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}}
   512   \multicolumn{3}{@ {}r@ {}}{@{thm (rhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}}\\ \mbox{}\\ \mbox{}
   442   \end{tabular}
   513   \end{tabular}
   443   \end{tabular}
   514   \end{tabular}
   444   \end{tabular}\smallskip
   515   \end{tabular}\smallskip
   445   \end{center}
   516   \end{center}
   446 
   517 
   447   \noindent
   518   \noindent
    448   The function @{text mkeps} is run when the last derivative is nullable, that is,
    519   The function @{text mkeps} is run when the last derivative is nullable, that is,
   449   the string to be matched is in the language of the regular expression. It generates
   520   the string to be matched is in the language of the regular expression. It generates
   450   a value for how the last derivative can match the empty string. The injection function
   521   a value for how the last derivative can match the empty string. In case
       
   522   of @{term "NTIMES r n"} we use the function @{term replicate} in order to generate
       
   523   a list of exactly @{term n} copies, which is the length of the list we expect in this
       
   524   case.  The injection function
   451   then calculates the corresponding value for each intermediate derivative until
   525   then calculates the corresponding value for each intermediate derivative until
   452   a value for the original regular expression is generated.
   526   a value for the original regular expression is generated.
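
To make @{text mkeps} and the injection function concrete, here is a Scala
sketch of the values and of both functions, including the clauses for
@{text NTIMES}, together with the top-level lexer of Sulzmann and Lu's first
algorithm. It assumes the @{text Rexp} datatype and functions from the earlier
sketch; the clauses are our reading of the definitions above, not the
formalised ones.

\begin{verbatim}
// values, mirroring the inhabitation relation
abstract class Val
case object Empty extends Val
case class Chr(c: Char) extends Val
case class Sequ(v1: Val, v2: Val) extends Val
case class Left(v: Val) extends Val
case class Right(v: Val) extends Val
case class Stars(vs: List[Val]) extends Val

// mkeps: how a nullable regular expression matches the empty string
def mkeps(r: Rexp): Val = r match {
  case ONE => Empty
  case ALT(r1, r2) =>
    if (nullable(r1)) Left(mkeps(r1)) else Right(mkeps(r2))
  case SEQ(r1, r2) => Sequ(mkeps(r1), mkeps(r2))
  case STAR(_) => Stars(Nil)
  case NTIMES(r1, n) => Stars(List.fill(n)(mkeps(r1)))  // 'replicate'
}

// inj: transform a value for der(c, r) into a value for r
def inj(r: Rexp, c: Char, v: Val): Val = (r, v) match {
  case (CHAR(d), Empty) => Chr(d)
  case (ALT(r1, _), Left(v1)) => Left(inj(r1, c, v1))
  case (ALT(_, r2), Right(v2)) => Right(inj(r2, c, v2))
  case (SEQ(r1, _), Sequ(v1, v2)) => Sequ(inj(r1, c, v1), v2)
  case (SEQ(r1, _), Left(Sequ(v1, v2))) => Sequ(inj(r1, c, v1), v2)
  case (SEQ(r1, r2), Right(v2)) => Sequ(mkeps(r1), inj(r2, c, v2))
  case (STAR(r1), Sequ(v1, Stars(vs))) => Stars(inj(r1, c, v1) :: vs)
  case (NTIMES(r1, _), Sequ(v1, Stars(vs))) => Stars(inj(r1, c, v1) :: vs)
}

// the lexer: None if r does not match s, otherwise the computed value
def lexer(r: Rexp, s: List[Char]): Option[Val] = s match {
  case Nil => if (nullable(r)) Some(mkeps(r)) else None
  case c :: cs => lexer(der(c, r), cs).map(v => inj(r, c, v))
}
\end{verbatim}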
   453   Graphically the algorithm by
   527   Graphically the algorithm by
   454   Sulzmann and Lu can be illustrated by the following picture %in Figure~\ref{Sulz}
   528   Sulzmann and Lu can be illustrated by the following picture %in Figure~\ref{Sulz}
   455   where the path from the left to the right involving @{term derivatives}/@{const
   529   where the path from the left to the right involving @{term derivatives}/@{const
   584                $\;\mid\;$ @{term "AONE bs"}
   658                $\;\mid\;$ @{term "AONE bs"}
   585                $\;\mid\;$ @{term "ACHAR bs c"}
   659                $\;\mid\;$ @{term "ACHAR bs c"}
   586                $\;\mid\;$ @{term "AALTs bs rs"}
   660                $\;\mid\;$ @{term "AALTs bs rs"}
   587                $\;\mid\;$ @{term "ASEQ bs r\<^sub>1 r\<^sub>2"}
   661                $\;\mid\;$ @{term "ASEQ bs r\<^sub>1 r\<^sub>2"}
   588                $\;\mid\;$ @{term "ASTAR bs r"}
   662                $\;\mid\;$ @{term "ASTAR bs r"}
       
   663 	       $\;\mid\;$ @{term "ANTIMES bs r n"}
   589   \end{center}
   664   \end{center}
   590 
   665 
   591   \noindent where @{text bs} stands for bitsequences; @{text r},
   666   \noindent where @{text bs} stands for bitsequences; @{text r},
   592   @{text "r\<^sub>1"} and @{text "r\<^sub>2"} for bitcoded regular
   667   @{text "r\<^sub>1"} and @{text "r\<^sub>2"} for bitcoded regular
   593   expressions; and @{text rs} for lists of bitcoded regular
   668   expressions; and @{text rs} for lists of bitcoded regular
   652        (\Right\,v, bs_1)$\\                           
   727        (\Right\,v, bs_1)$\\                           
   653   $\textit{decode}'\,bs\;(r_1\cdot r_2)$ & $\dn$ &
   728   $\textit{decode}'\,bs\;(r_1\cdot r_2)$ & $\dn$ &
   654         $\textit{let}\,(v_1, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}$\\
   729         $\textit{let}\,(v_1, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}$\\
   655   & &   $\textit{let}\,(v_2, bs_2) = \textit{decode}'\,bs_1\,r_2$
   730   & &   $\textit{let}\,(v_2, bs_2) = \textit{decode}'\,bs_1\,r_2$
   656         \hspace{2mm}$\textit{in}\;(\Seq\,v_1\,v_2, bs_2)$\\
   731         \hspace{2mm}$\textit{in}\;(\Seq\,v_1\,v_2, bs_2)$\\
   657   $\textit{decode}'\,(\Z\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\
   732   $\textit{decode}'\,(\S\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\
   658   $\textit{decode}'\,(\S\!::\!bs)\,(r^*)$ & $\dn$ & 
   733   $\textit{decode}'\,(\Z\!::\!bs)\,(r^*)$ & $\dn$ & 
   659          $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r\;\textit{in}$\\
   734          $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r\;\textit{in}$\\
   660   & &   $\textit{let}\,(\Stars\,vs, bs_2) = \textit{decode}'\,bs_1\,r^*$
   735   & &   $\textit{let}\,(\Stars\,vs, bs_2) = \textit{decode}'\,bs_1\,r^*$
   661         \hspace{2mm}$\textit{in}\;(\Stars\,v\!::\!vs, bs_2)$\medskip\\
   736         \hspace{2mm}$\textit{in}\;(\Stars\,v\!::\!vs, bs_2)$\\
   662   \multicolumn{3}{l}{$\textit{decode}\,bs\,r$ $\dn$ 
   737   $\textit{decode}'\,(\S\!::\!bs)\,(r^{\{n\}})$ & $\dn$ & $(\Stars\,[], bs)$\\
       
   738   $\textit{decode}'\,(\Z\!::\!bs)\,(r^{\{n\}})$ & $\dn$ & 
       
   739          $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r\;\textit{in}$\\
       
   740   & &   $\textit{let}\,(\Stars\,vs, bs_2) = \textit{decode}'\,bs_1\,r^{\{n - 1\}}$
       
   741         \hspace{2mm}$\textit{in}\;(\Stars\,v\!::\!vs, bs_2)$\medskip\\	
       
   742   \multicolumn{3}{@ {}l}{$\textit{decode}\,bs\,r$ $\dn$ 
   663      $\textit{let}\,(v, bs') = \textit{decode}'\,bs\,r\;\textit{in}$
   743      $\textit{let}\,(v, bs') = \textit{decode}'\,bs\,r\;\textit{in}$
   664      $\;\,\textit{if}\;bs' = []\;\textit{then}\;\textit{Some}\,v\;
   744      $\;\,\textit{if}\;bs' = []\;\textit{then}\;\textit{Some}\,v\;
   665        \textit{else}\;\textit{None}$}\\[-4mm]   
   745        \textit{else}\;\textit{None}$}\\[-4mm]   
   666   \end{tabular}    
   746   \end{tabular}    
   667   \end{center}
   747   \end{center}
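
A rough Scala rendering of the bitsequences and of @{text decode} is given
below (assuming the @{text Rexp} and value datatypes from the earlier
sketches). We write @{text decode1} for $\textit{decode}'$ and use one helper
for decoding the list of values of both @{text STAR} and @{text NTIMES}; the
clause of @{text code} for alternatives follows the convention, consistent
with the definitions above, that @{text Z} codes a left alternative and one
more iteration, while @{text S} codes a right alternative and the end of the
iterations.

\begin{verbatim}
// bits
abstract class Bit
case object Z extends Bit
case object S extends Bit

// code: the bitsequence of a value
def code(v: Val): List[Bit] = v match {
  case Empty => Nil
  case Chr(_) => Nil
  case Left(v1) => Z :: code(v1)
  case Right(v1) => S :: code(v1)
  case Sequ(v1, v2) => code(v1) ::: code(v2)
  case Stars(Nil) => List(S)
  case Stars(v1 :: vs) => Z :: (code(v1) ::: code(Stars(vs)))
}

// decode1 (decode') consumes bits guided by the regular expression and
// returns the value together with the remaining bits
def decode1(bs: List[Bit], r: Rexp): (Val, List[Bit]) = (bs, r) match {
  case (_, ONE) => (Empty, bs)
  case (_, CHAR(c)) => (Chr(c), bs)
  case (Z :: bs1, ALT(r1, _)) =>
    val (v, bs2) = decode1(bs1, r1); (Left(v), bs2)
  case (S :: bs1, ALT(_, r2)) =>
    val (v, bs2) = decode1(bs1, r2); (Right(v), bs2)
  case (_, SEQ(r1, r2)) =>
    val (v1, bs1) = decode1(bs, r1)
    val (v2, bs2) = decode1(bs1, r2)
    (Sequ(v1, v2), bs2)
  case (_, STAR(r1)) =>
    val (vs, bs1) = decodeStars(bs, r1); (Stars(vs), bs1)
  case (_, NTIMES(r1, _)) =>
    val (vs, bs1) = decodeStars(bs, r1); (Stars(vs), bs1)
  case _ => sys.error("decode: ill-formed bitsequence")
}

// Z means one more copy follows, S terminates the list of copies
def decodeStars(bs: List[Bit], r: Rexp): (List[Val], List[Bit]) = bs match {
  case S :: rest => (Nil, rest)
  case Z :: rest =>
    val (v, bs1) = decode1(rest, r)
    val (vs, bs2) = decodeStars(bs1, r)
    (v :: vs, bs2)
  case Nil => sys.error("decode: ill-formed bitsequence")
}

def decode(bs: List[Bit], r: Rexp): Option[Val] = decode1(bs, r) match {
  case (v, Nil) => Some(v)
  case _ => None
}
\end{verbatim}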
   702   $\textit{fuse}\,bs\,(\textit{ALTs}\,bs'\,rs)$ & $\dn$ &
   782   $\textit{fuse}\,bs\,(\textit{ALTs}\,bs'\,rs)$ & $\dn$ &
   703      $\textit{ALTs}\,(bs\,@\,bs')\,rs$\\
   783      $\textit{ALTs}\,(bs\,@\,bs')\,rs$\\
   704   $\textit{fuse}\,bs\,(\textit{SEQ}\,bs'\,r_1\,r_2)$ & $\dn$ &
   784   $\textit{fuse}\,bs\,(\textit{SEQ}\,bs'\,r_1\,r_2)$ & $\dn$ &
   705      $\textit{SEQ}\,(bs\,@\,bs')\,r_1\,r_2$\\
   785      $\textit{SEQ}\,(bs\,@\,bs')\,r_1\,r_2$\\
   706   $\textit{fuse}\,bs\,(\textit{STAR}\,bs'\,r)$ & $\dn$ &
   786   $\textit{fuse}\,bs\,(\textit{STAR}\,bs'\,r)$ & $\dn$ &
   707      $\textit{STAR}\,(bs\,@\,bs')\,r$
   787      $\textit{STAR}\,(bs\,@\,bs')\,r$\\
       
   788   $\textit{fuse}\,bs\,(\textit{NT}\,bs'\,r\,n)$ & $\dn$ &
       
   789      $\textit{NT}\,(bs\,@\,bs')\,r\,n$   
   708   \end{tabular}
   790   \end{tabular}
   709   \end{tabular}
   791   \end{tabular}
   710   \end{center}    
   792   \end{center}    
   711 
   793 
   712   \noindent
   794   \noindent
   720   $(\ONE)^\uparrow$ & $\dn$ & $\textit{ONE}\,[]$
   802   $(\ONE)^\uparrow$ & $\dn$ & $\textit{ONE}\,[]$
   721   \end{tabular}
   803   \end{tabular}
   722   &
   804   &
   723   \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
   805   \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
   724   $(c)^\uparrow$ & $\dn$ & $\textit{CHAR}\,[]\,c$\\
   806   $(c)^\uparrow$ & $\dn$ & $\textit{CHAR}\,[]\,c$\\
   725   $(r^*)^\uparrow$ & $\dn$ & $\textit{STAR}\;[]\,r^\uparrow$
   807   $(r^*)^\uparrow$ & $\dn$ & $\textit{STAR}\;[]\,r^\uparrow$\\
       
   808   $(r^{\{n\}})^\uparrow$ & $\dn$ &
       
   809         $\textit{NT}\;[]\,r^\uparrow\,n$
   726   \end{tabular}
   810   \end{tabular}
   727   &
   811   &
   728   \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
   812   \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
   729   $(r_1 + r_2)^\uparrow$ & $\dn$ &
   813   $(r_1 + r_2)^\uparrow$ & $\dn$ &
   730          $\textit{ALT}\;[]\,(\textit{fuse}\,[\Z]\,r_1^\uparrow)\,
   814          $\textit{ALT}\;[]\,(\textit{fuse}\,[\Z]\,r_1^\uparrow)\,
   743   the functions \textit{bnullable} and \textit{bmkeps}(\textit{s}), which are the
   827   the functions \textit{bnullable} and \textit{bmkeps}(\textit{s}), which are the
   744   ``lifted'' versions of \textit{nullable} and \textit{mkeps} acting on
   828   ``lifted'' versions of \textit{nullable} and \textit{mkeps} acting on
   745   bitcoded regular expressions.
   829   bitcoded regular expressions.
   746   %
   830   %
   747   \begin{center}
   831   \begin{center}
   748   \begin{tabular}{@ {}c@ {\hspace{2mm}}c@ {}}
   832   \begin{tabular}{@ {\hspace{-1mm}}c@ {\hspace{0mm}}c@ {}}
   749   \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
   833   \begin{tabular}{@ {}l@ {\hspace{0.5mm}}c@ {\hspace{1mm}}l}
   750   $\textit{bnullable}\,(\textit{ZERO})$ & $\dn$ & $\textit{False}$\\
   834   $\textit{bnullable}\,(\textit{ZERO})$ & $\dn$ & $\textit{False}$\\
   751   $\textit{bnullable}\,(\textit{ONE}\,bs)$ & $\dn$ & $\textit{True}$\\
   835   $\textit{bnullable}\,(\textit{ONE}\,bs)$ & $\dn$ & $\textit{True}$\\
   752   $\textit{bnullable}\,(\textit{CHAR}\,bs\,c)$ & $\dn$ & $\textit{False}$\\
   836   $\textit{bnullable}\,(\textit{CHAR}\,bs\,c)$ & $\dn$ & $\textit{False}$\\
   753   $\textit{bnullable}\,(\textit{ALTs}\,bs\,\rs)$ & $\dn$ &
   837   $\textit{bnullable}\,(\textit{ALTs}\,bs\,\rs)$ & $\dn$ &
   754      $\exists\, r \in \rs. \,\textit{bnullable}\,r$\\
   838      $\exists\, r \in \rs. \,\textit{bnullable}\,r$\\
   755   $\textit{bnullable}\,(\textit{SEQ}\,bs\,r_1\,r_2)$ & $\dn$ &
   839   $\textit{bnullable}\,(\textit{SEQ}\,bs\,r_1\,r_2)$ & $\dn$ &
   756      $\textit{bnullable}\,r_1\wedge \textit{bnullable}\,r_2$\\
   840      $\textit{bnullable}\,r_1\wedge \textit{bnullable}\,r_2$\\
   757   $\textit{bnullable}\,(\textit{STAR}\,bs\,r)$ & $\dn$ &
   841   $\textit{bnullable}\,(\textit{STAR}\,bs\,r)$ & $\dn$ &
   758      $\textit{True}$
   842      $\textit{True}$\\
       
   843   $\textit{bnullable}\,(\textit{NT}\,bs\,r\,n)$ & $\dn$ &\\
       
   844   \multicolumn{3}{l}{$\textit{if}\;n = 0\;\textit{then}\;\textit{True}\;
       
   845   \textit{else}\;\textit{bnullable}\,r$}\\
   759   \end{tabular}
   846   \end{tabular}
   760   &
   847   &
   761   \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
   848   \begin{tabular}{@ {}l@ {\hspace{0.5mm}}c@ {\hspace{1mm}}l@ {}}
   762   $\textit{bmkeps}\,(\textit{ONE}\,bs)$ & $\dn$ & $bs$\\
   849   $\textit{bmkeps}\,(\textit{ONE}\,bs)$ & $\dn$ & $bs$\\
   763   $\textit{bmkeps}\,(\textit{ALTs}\,bs\,\rs)$ & $\dn$ &
   850   $\textit{bmkeps}\,(\textit{ALTs}\,bs\,\rs)$ & $\dn$ &
   764   $bs\,@\,\textit{bmkepss}\,\rs$\\
   851   $bs\,@\,\textit{bmkepss}\,\rs$\\
   765   $\textit{bmkeps}\,(\textit{SEQ}\,bs\,r_1\,r_2)$ & $\dn$ &\\
   852   $\textit{bmkeps}\,(\textit{SEQ}\,bs\,r_1\,r_2)$ & $\dn$ &\\
   766   \multicolumn{3}{r}{$bs \,@\,\textit{bmkeps}\,r_1\,@\, \textit{bmkeps}\,r_2$}\\
   853   \multicolumn{3}{l}{$bs \,@\,\textit{bmkeps}\,r_1\,@\, \textit{bmkeps}\,r_2$}\\
   767   $\textit{bmkeps}\,(\textit{STAR}\,bs\,r)$ & $\dn$ &
   854   $\textit{bmkeps}\,(\textit{STAR}\,bs\,r)$ & $\dn$ &
   768      $bs \,@\, [\S]$\\
   855      $bs \,@\, [\S]$\\
   769   $\textit{bmkepss}\,(r\!::\!\rs)$ & $\dn$ &
   856    $\textit{bmkeps}\,(\textit{NT}\,bs\,r\,n)$ & $\dn$ &\\
   770      $\textit{if}\;\textit{bnullable}\,r$\\
   857    \multicolumn{3}{l@ {}}{$\textit{if}\;n=0\;\textit{then}\;bs \,@\, [\S]$}\\
   771   & &$\textit{then}\;\textit{bmkeps}\,r$\\
   858    \multicolumn{3}{l@ {}}{$\textit{else}\,bs \,@\, [\Z] \,@\, \textit{bmkeps}\,r\,@\,
   772   & &$\textit{else}\;\textit{bmkepss}\,\rs$
   859         \textit{bmkeps}\,(@{term "ANTIMES [] r (n - 1)"})$}\\   
       
   860   $\textit{bmkepss}\,(r\!::\!\rs)$ & $\dn$ &\\
       
   861   \multicolumn{3}{l}{$\textit{if}\;\textit{bnullable}\,r\,\textit{then}\;\textit{bmkeps}\,r\;
       
   862   \textit{else}\;\textit{bmkepss}\,\rs$}
   773   \end{tabular}
   863   \end{tabular}
   774   \end{tabular}
   864   \end{tabular}
   775   \end{center}    
   865   \end{center}    
   776  
   866  
   777 
   867 
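
For concreteness, a Scala sketch of the bitcoded regular expressions and of
@{text fuse}, @{text intern} (written $r^\uparrow$ above), @{text bnullable}
and @{text bmkeps} might look as follows. It assumes the datatypes from the
earlier sketches; the clauses of @{text intern} for @{text ZERO} and sequences,
which are not shown above, are the standard ones.

\begin{verbatim}
// bitcoded (annotated) regular expressions
abstract class ARexp
case object AZERO extends ARexp
case class AONE(bs: List[Bit]) extends ARexp
case class ACHAR(bs: List[Bit], c: Char) extends ARexp
case class AALTS(bs: List[Bit], rs: List[ARexp]) extends ARexp
case class ASEQ(bs: List[Bit], r1: ARexp, r2: ARexp) extends ARexp
case class ASTAR(bs: List[Bit], r: ARexp) extends ARexp
case class ANTIMES(bs: List[Bit], r: ARexp, n: Int) extends ARexp

// fuse attaches a bitsequence to the front of the annotation
def fuse(bs: List[Bit], r: ARexp): ARexp = r match {
  case AZERO => AZERO
  case AONE(bs1) => AONE(bs ::: bs1)
  case ACHAR(bs1, c) => ACHAR(bs ::: bs1, c)
  case AALTS(bs1, rs) => AALTS(bs ::: bs1, rs)
  case ASEQ(bs1, r1, r2) => ASEQ(bs ::: bs1, r1, r2)
  case ASTAR(bs1, r1) => ASTAR(bs ::: bs1, r1)
  case ANTIMES(bs1, r1, n) => ANTIMES(bs ::: bs1, r1, n)
}

// intern: translate an ordinary regular expression into a bitcoded one
def intern(r: Rexp): ARexp = r match {
  case ZERO => AZERO
  case ONE => AONE(Nil)
  case CHAR(c) => ACHAR(Nil, c)
  case ALT(r1, r2) =>
    AALTS(Nil, List(fuse(List(Z), intern(r1)), fuse(List(S), intern(r2))))
  case SEQ(r1, r2) => ASEQ(Nil, intern(r1), intern(r2))
  case STAR(r1) => ASTAR(Nil, intern(r1))
  case NTIMES(r1, n) => ANTIMES(Nil, intern(r1), n)
}

def bnullable(r: ARexp): Boolean = r match {
  case AZERO => false
  case AONE(_) => true
  case ACHAR(_, _) => false
  case AALTS(_, rs) => rs.exists(bnullable)
  case ASEQ(_, r1, r2) => bnullable(r1) && bnullable(r2)
  case ASTAR(_, _) => true
  case ANTIMES(_, r1, n) => if (n == 0) true else bnullable(r1)
}

// bmkeps: collect the bits of how the empty string is matched
// (only called on bnullable regular expressions)
def bmkeps(r: ARexp): List[Bit] = r match {
  case AONE(bs) => bs
  case AALTS(bs, rs) => bs ::: bmkeps(rs.find(bnullable).get)
  case ASEQ(bs, r1, r2) => bs ::: bmkeps(r1) ::: bmkeps(r2)
  case ASTAR(bs, _) => bs ::: List(S)
  case ANTIMES(bs, r1, n) =>
    if (n == 0) bs ::: List(S)
    else bs ::: List(Z) ::: bmkeps(r1) ::: bmkeps(ANTIMES(Nil, r1, n - 1))
  case _ => sys.error("bmkeps: not nullable")
}
\end{verbatim}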
   781   derivative but at the same time also the incremental part of the bitsequences
   871   derivative but at the same time also the incremental part of the bitsequences
   782   that contribute to constructing a POSIX value.	
   872   that contribute to constructing a POSIX value.	
   783   % 
   873   % 
   784   \begin{center}
   874   \begin{center}
   785   \begin{tabular}{@ {}lcl@ {}}
   875   \begin{tabular}{@ {}lcl@ {}}
   786   \multicolumn{3}{@ {}l}{$(\textit{ZERO})\backslash c$ $\;\dn\;$ $\textit{ZERO}$ \quad\qquad  
   876   $(\textit{ZERO})\backslash c$ & $\;\dn\;$ & $\textit{ZERO}$\\ 
   787   $(\textit{ONE}\;bs)\backslash c$ $\;\dn\;$ $\textit{ZERO}$}\\  
   877   $(\textit{ONE}\;bs)\backslash c$ & $\;\dn\;$ & $\textit{ZERO}$\\  
   788   $(\textit{CHAR}\;bs\,d)\backslash c$ & $\dn$ &
   878   $(\textit{CHAR}\;bs\,d)\backslash c$ & $\dn$ &
   789         $\textit{if}\;c=d\; \;\textit{then}\;
   879         $\textit{if}\;c=d\; \;\textit{then}\;
   790          \textit{ONE}\;bs\;\textit{else}\;\textit{ZERO}$\\  
   880          \textit{ONE}\;bs\;\textit{else}\;\textit{ZERO}$\\  
   791   $(\textit{ALTs}\;bs\,\rs)\backslash c$ & $\dn$ &
   881   $(\textit{ALTs}\;bs\,\rs)\backslash c$ & $\dn$ &
   792         $\textit{ALTs}\,bs\,(\mathit{map}\,(\_\backslash c)\,\rs)$\\
   882         $\textit{ALTs}\,bs\,(\mathit{map}\,(\_\backslash c)\,\rs)$\\
   794      $\textit{if}\;\textit{bnullable}\,r_1$\\
   884      $\textit{if}\;\textit{bnullable}\,r_1$\\
   795   & &$\textit{then}\;\textit{ALT}\,bs\;(\textit{SEQ}\,[]\,(r_1\backslash c)\,r_2)$
   885   & &$\textit{then}\;\textit{ALT}\,bs\;(\textit{SEQ}\,[]\,(r_1\backslash c)\,r_2)$
   796   $\;(\textit{fuse}\,(\textit{bmkeps}\,r_1)\,(r_2\backslash c))$\\
   886   $\;(\textit{fuse}\,(\textit{bmkeps}\,r_1)\,(r_2\backslash c))$\\
   797   & &$\textit{else}\;\textit{SEQ}\,bs\,(r_1\backslash c)\,r_2$\\
   887   & &$\textit{else}\;\textit{SEQ}\,bs\,(r_1\backslash c)\,r_2$\\
   798   $(\textit{STAR}\,bs\,r)\backslash c$ & $\dn$ &
   888   $(\textit{STAR}\,bs\,r)\backslash c$ & $\dn$ &
   799       $\textit{SEQ}\;bs\,(\textit{fuse}\, [\Z] (r\backslash c))\,
   889       $\textit{SEQ}\;(bs\,@\,[\Z])\,(r\backslash c)\,
   800        (\textit{STAR}\,[]\,r)$
   890        (\textit{STAR}\,[]\,r)$\\
       
   891   $(\textit{NT}\,bs\,r\,n)\backslash c$ & $\dn$ &
       
   892       $\textit{if}\;n = 0\; \textit{then}\;\textit{ZERO}\;\textit{else}\;
       
   893       \textit{SEQ}\;(bs\,@\,[\Z])\,(r\backslash c)\,
       
   894        (\textit{NT}\,[]\,r\,(n - 1))$     
   801   \end{tabular}    
   895   \end{tabular}    
   802   \end{center}
   896   \end{center}
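
The corresponding Scala sketch of the bitcoded derivative and of the lexer
without simplification (assuming the functions from the previous sketches;
@{text blexer} composes @{text intern}, the bitcoded derivatives,
@{text bmkeps} and @{text decode}) is:

\begin{verbatim}
// bitcoded derivative
def bder(c: Char, r: ARexp): ARexp = r match {
  case AZERO => AZERO
  case AONE(_) => AZERO
  case ACHAR(bs, d) => if (c == d) AONE(bs) else AZERO
  case AALTS(bs, rs) => AALTS(bs, rs.map(bder(c, _)))
  case ASEQ(bs, r1, r2) =>
    if (bnullable(r1))
      AALTS(bs, List(ASEQ(Nil, bder(c, r1), r2),
                     fuse(bmkeps(r1), bder(c, r2))))
    else ASEQ(bs, bder(c, r1), r2)
  case ASTAR(bs, r1) =>
    ASEQ(bs ::: List(Z), bder(c, r1), ASTAR(Nil, r1))
  case ANTIMES(bs, r1, n) =>
    if (n == 0) AZERO
    else ASEQ(bs ::: List(Z), bder(c, r1), ANTIMES(Nil, r1, n - 1))
}

def bders(s: List[Char], r: ARexp): ARexp = s match {
  case Nil => r
  case c :: cs => bders(cs, bder(c, r))
}

// the bitcoded lexer without simplification
def blexer(r: Rexp, s: String): Option[Val] = {
  val a = bders(s.toList, intern(r))
  if (bnullable(a)) decode(bmkeps(a), r) else None
}
\end{verbatim}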
   803 
   897 
   804 
   898 
   805   \noindent
   899   \noindent
   836   @{thm (lhs) better_retrieve(1)} & $\dn$ & @{thm (rhs) better_retrieve(1)}\\
   930   @{thm (lhs) better_retrieve(1)} & $\dn$ & @{thm (rhs) better_retrieve(1)}\\
   837   @{thm (lhs) better_retrieve(2)} & $\dn$ & @{thm (rhs) better_retrieve(2)}\\
   931   @{thm (lhs) better_retrieve(2)} & $\dn$ & @{thm (rhs) better_retrieve(2)}\\
   838   @{thm (lhs) retrieve.simps(6)[of _ "r\<^sub>1" "r\<^sub>2" "v\<^sub>1" "v\<^sub>2"]}
   932   @{thm (lhs) retrieve.simps(6)[of _ "r\<^sub>1" "r\<^sub>2" "v\<^sub>1" "v\<^sub>2"]}
   839       & $\dn$ & @{thm (rhs) retrieve.simps(6)[of _ "r\<^sub>1" "r\<^sub>2" "v\<^sub>1" "v\<^sub>2"]}\\
   933       & $\dn$ & @{thm (rhs) retrieve.simps(6)[of _ "r\<^sub>1" "r\<^sub>2" "v\<^sub>1" "v\<^sub>2"]}\\
   840   @{thm (lhs) retrieve.simps(7)} & $\dn$ & @{thm (rhs) retrieve.simps(7)}\\
   934   @{thm (lhs) retrieve.simps(7)} & $\dn$ & @{thm (rhs) retrieve.simps(7)}\\
   841   @{thm (lhs) retrieve.simps(8)} & $\dn$ & @{thm (rhs) retrieve.simps(8)}
   935   @{thm (lhs) retrieve.simps(8)} & $\dn$ & @{thm (rhs) retrieve.simps(8)}\\
       
   936   @{thm (lhs) retrieve.simps(9)} & $\dn$ & @{thm (rhs) retrieve.simps(9)}\\
       
   937   @{thm (lhs) better_retrieve2} & $\dn$ & @{thm (rhs) better_retrieve2}
   842   \end{tabular}
   938   \end{tabular}
   843   \end{center}
   939   \end{center}
   844 
   940 
   845 \noindent
   941 \noindent
   846 The idea behind this function is to retrieve a possibly partial
   942 The idea behind this function is to retrieve a possibly partial
   955 
  1051 
   956 section {* Simplification *}
  1052 section {* Simplification *}
   957 
  1053 
   958 text {*
  1054 text {*
   959 
  1055 
   960      Derivatives as calculated by Brzozowski’s method are usually more
  1056      Derivatives as calculated by Brzozowski's method are usually more
   961      complex regular expressions than the initial one; the result is
  1057      complex regular expressions than the initial one; the result is
   962      that derivative-based matching and lexing algorithms are
  1058      that derivative-based matching and lexing algorithms are
   963      often abysmally slow if the ``growth problem'' is not addressed. As Sulzmann and Lu wrote, various
  1059      often abysmally slow if the ``growth problem'' is not addressed. As Sulzmann and Lu wrote, various
   964      optimisations are possible, such as the simplifications
  1060      optimisations are possible, such as the simplifications
   965      $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r \Rightarrow r$,
  1061      $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r \Rightarrow r$,
  1004      de-nest, or spill out, @{text ALTs} as follows
  1100      de-nest, or spill out, @{text ALTs} as follows
  1005      %
  1101      %
  1006      \[
  1102      \[
  1007      @{term "ALTs bs\<^sub>1 ((ALTs bs\<^sub>2 rs\<^sub>2) # rs\<^sub>1)"}
  1103      @{term "ALTs bs\<^sub>1 ((ALTs bs\<^sub>2 rs\<^sub>2) # rs\<^sub>1)"}
  1008      \quad\xrightarrow{bsimp}\quad
  1104      \quad\xrightarrow{bsimp}\quad
  1009      @{term "ALTs bs\<^sub>1 ((map (fuse bs\<^sub>2) rs\<^sub>2) # rs\<^sub>1)"}
  1105      @{term "ALTs bs\<^sub>1 ((map (fuse bs\<^sub>2) rs\<^sub>2) @ rs\<^sub>1)"}
  1010      \]
  1106      \]
  1011 
  1107 
  1012      \noindent
  1108      \noindent
  1013      where we just need to fuse the bitsequence that has accumulated in @{text "bs\<^sub>2"}
  1109      where we just need to fuse the bitsequence that has accumulated in @{text "bs\<^sub>2"}
  1014      to the alternatives in @{text "rs\<^sub>2"}. As we shall show below this will
  1110      to the alternatives in @{text "rs\<^sub>2"}. As we shall show below this will
  1071      @{thm (rhs) eq1.simps(4)[of DUMMY "r\<^sub>1\<^sub>1" "r\<^sub>1\<^sub>2" DUMMY "r\<^sub>2\<^sub>1" "r\<^sub>2\<^sub>2"]}\\
  1167      @{thm (rhs) eq1.simps(4)[of DUMMY "r\<^sub>1\<^sub>1" "r\<^sub>1\<^sub>2" DUMMY "r\<^sub>2\<^sub>1" "r\<^sub>2\<^sub>2"]}\\
  1072      @{thm (lhs) eq1.simps(5)[of DUMMY DUMMY]} & $\dn$ & @{thm (rhs) eq1.simps(5)[of DUMMY DUMMY]}\\
  1168      @{thm (lhs) eq1.simps(5)[of DUMMY DUMMY]} & $\dn$ & @{thm (rhs) eq1.simps(5)[of DUMMY DUMMY]}\\
  1073      @{thm (lhs) eq1.simps(6)[of DUMMY "r\<^sub>1" "rs\<^sub>1" DUMMY "r\<^sub>2" "rs\<^sub>2"]} & $\dn$ &
  1169      @{thm (lhs) eq1.simps(6)[of DUMMY "r\<^sub>1" "rs\<^sub>1" DUMMY "r\<^sub>2" "rs\<^sub>2"]} & $\dn$ &
  1074      @{thm (rhs) eq1.simps(6)[of DUMMY "r\<^sub>1" "rs\<^sub>1" DUMMY "r\<^sub>2" "rs\<^sub>2"]}\\
  1170      @{thm (rhs) eq1.simps(6)[of DUMMY "r\<^sub>1" "rs\<^sub>1" DUMMY "r\<^sub>2" "rs\<^sub>2"]}\\
  1075      @{thm (lhs) eq1.simps(7)[of DUMMY "r\<^sub>1" DUMMY "r\<^sub>2"]} & $\dn$ & @{thm (rhs) eq1.simps(7)[of DUMMY "r\<^sub>1" DUMMY "r\<^sub>2"]}\\
  1171      @{thm (lhs) eq1.simps(7)[of DUMMY "r\<^sub>1" DUMMY "r\<^sub>2"]} & $\dn$ & @{thm (rhs) eq1.simps(7)[of DUMMY "r\<^sub>1" DUMMY "r\<^sub>2"]}\\
       
  1172      @{thm (lhs) eq1.simps(8)[of DUMMY "r\<^sub>1" "n\<^sub>1" DUMMY "r\<^sub>2" "n\<^sub>2"]} & $\dn$ & @{thm (rhs) eq1.simps(8)[of DUMMY "r\<^sub>1" "n\<^sub>1" DUMMY "r\<^sub>2" "n\<^sub>2"]}\\
  1076      \end{tabular}
  1173      \end{tabular}
  1077      \end{center}
  1174      \end{center}
  1078 
  1175 
  1079      \noindent
  1176      \noindent
  1080      where all other cases are set to @{text False}.
  1177      where all other cases are set to @{text False}.
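
In Scala such a comparison and the @{text distinctWith} function can be
sketched as follows; the clauses below are our own approximation of comparing
bitcoded regular expressions while ignoring their bit-annotations, using the
datatype from the earlier sketch.

\begin{verbatim}
// structural equality that ignores all bit-annotations
def eq1(r1: ARexp, r2: ARexp): Boolean = (r1, r2) match {
  case (AZERO, AZERO) => true
  case (AONE(_), AONE(_)) => true
  case (ACHAR(_, c), ACHAR(_, d)) => c == d
  case (AALTS(_, Nil), AALTS(_, Nil)) => true
  case (AALTS(_, r3 :: rs1), AALTS(_, r4 :: rs2)) =>
    eq1(r3, r4) && eq1(AALTS(Nil, rs1), AALTS(Nil, rs2))
  case (ASEQ(_, r11, r12), ASEQ(_, r21, r22)) =>
    eq1(r11, r21) && eq1(r12, r22)
  case (ASTAR(_, r3), ASTAR(_, r4)) => eq1(r3, r4)
  case (ANTIMES(_, r3, n1), ANTIMES(_, r4, n2)) => n1 == n2 && eq1(r3, r4)
  case _ => false
}

// keep the first copy of every regular expression; "copy" is judged by eq,
// acc accumulates the representatives seen so far
def distinctWith(rs: List[ARexp], eq: (ARexp, ARexp) => Boolean,
                 acc: List[ARexp] = Nil): List[ARexp] = rs match {
  case Nil => Nil
  case r :: rest =>
    if (acc.exists(eq(r, _))) distinctWith(rest, eq, acc)
    else r :: distinctWith(rest, eq, r :: acc)
}
\end{verbatim}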
  1087      It is defined as follows:
  1184      It is defined as follows:
  1088 
  1185 
  1089      \begin{center}
  1186      \begin{center}
  1090      \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
  1187      \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
  1091      \multicolumn{3}{@ {}c}{@{thm (lhs) flts.simps(1)} $\dn$ @{thm (rhs) flts.simps(1)} \qquad\qquad\qquad\qquad
  1188      \multicolumn{3}{@ {}c}{@{thm (lhs) flts.simps(1)} $\dn$ @{thm (rhs) flts.simps(1)} \qquad\qquad\qquad\qquad
  1092      @{thm (lhs) flts.simps(2)} $\dn$ @{thm (rhs) flts.simps(2)}}\\
  1189      @{thm (lhs) flts.simps(2)} $\dn$ @{thm (rhs) flts.simps(2)}}\smallskip\\
  1093      @{thm (lhs) flts.simps(3)[of "bs'" "rs'"]} & $\dn$ & @{thm (rhs) flts.simps(3)[of "bs'" "rs'"]}\\
  1190      @{thm (lhs) flts.simps(3)[of "bs'" "rs'"]} & $\dn$ & @{thm (rhs) flts.simps(3)[of "bs'" "rs'"]}\\
  1094      \end{tabular}
  1191      \end{tabular}
  1095      \end{center}
  1192      \end{center}
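
     For readers who prefer functional code over equations, the following
     Scala fragment is a rough rendering of this flattening operation,
     together with a version of @{text fuse}; the constructor names are
     illustrative stand-ins for our bitcoded regular expressions and are not
     meant to match the Isabelle definitions verbatim.

     \begin{verbatim}
     // Illustrative stand-ins for bitcoded regular expressions.
     sealed trait ARexp
     case object AZERO extends ARexp
     case class AONE(bs: List[Boolean]) extends ARexp
     case class ACHAR(bs: List[Boolean], c: Char) extends ARexp
     case class AALTS(bs: List[Boolean], rs: List[ARexp]) extends ARexp
     case class ASEQ(bs: List[Boolean], r1: ARexp, r2: ARexp) extends ARexp
     case class ASTAR(bs: List[Boolean], r: ARexp) extends ARexp

     // fuse prefixes a bitsequence onto the top-most bitsequence of a
     // bitcoded regular expression (AZERO carries no bitsequence).
     def fuse(bs: List[Boolean], r: ARexp): ARexp = r match {
       case AZERO             => AZERO
       case AONE(bs1)         => AONE(bs ++ bs1)
       case ACHAR(bs1, c)     => ACHAR(bs ++ bs1, c)
       case AALTS(bs1, rs)    => AALTS(bs ++ bs1, rs)
       case ASEQ(bs1, r1, r2) => ASEQ(bs ++ bs1, r1, r2)
       case ASTAR(bs1, r1)    => ASTAR(bs ++ bs1, r1)
     }

     // flts drops ZEROs and spills out nested alternatives, fusing their
     // accumulated bitsequence onto the spilled-out elements.
     def flts(rs: List[ARexp]): List[ARexp] = rs match {
       case Nil                    => Nil
       case AZERO :: rest          => flts(rest)
       case AALTS(bs, rs1) :: rest => rs1.map(fuse(bs, _)) ::: flts(rest)
       case r :: rest              => r :: flts(rest)
     }
     \end{verbatim}
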
  1096 
  1193 
  1097      \noindent
  1194      \noindent
  1133      @{text "bsimp r"} & $\dn$ & @{text r}
  1230      @{text "bsimp r"} & $\dn$ & @{text r}
  1134      \end{tabular}
  1231      \end{tabular}
  1135      \end{center}
  1232      \end{center}
  1136 
  1233 
  1137      \noindent
  1234      \noindent
  1138      We believe our recursive function @{term bsimp} simplifies regular
  1235      We believe our recursive function @{term bsimp} simplifies bitcoded regular
  1139      expressions as intended by Sulzmann and Lu. There is no point in applying the
  1236      expressions as intended by Sulzmann and Lu. There is no point in applying the
  1140      @{text bsimp} function repeatedly (like the simplification in their paper which needs to be
  1237      @{text bsimp} function repeatedly (like the simplification in their paper which needs to be
  1141      applied until a fixpoint is reached) because we can show that @{term bsimp} is idempotent,
  1238      applied until a fixpoint is reached) because we can show that @{term bsimp} is idempotent,
  1142      that is
  1239      that is
  1143 
  1240 
  1147 
  1244 
  1148      \noindent
  1245      \noindent
   1149      This can be proved by induction on @{text r} but requires a detailed analysis
  1246      This can be proved by induction on @{text r} but requires a detailed analysis
   1150      showing that the de-nesting of alternatives always results in a flat list of regular
  1247      showing that the de-nesting of alternatives always results in a flat list of regular
   1151      expressions. We omit the details since they do not concern the correctness proof.
  1248      expressions. We omit the details since they do not concern the correctness proof.
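
     As a small, slightly informal illustration of why a single pass suffices,
     assume $r_1$ and $r_2$ are already-simplified regular expressions that
     are neither $\ZERO$ nor alternatives, and that are distinct with respect
     to the equivalence used by @{text distinctWith}. Then
     %
     \[
     \textit{ALTs}\;bs_1\,[\ZERO,\;\textit{ALTs}\;bs_2\,[r_1,\,r_2]]
     \quad\xrightarrow{bsimp}\quad
     \textit{ALTs}\;bs_1\,[\textit{fuse}\;bs_2\;r_1,\;\textit{fuse}\;bs_2\;r_2]
     \]
     %
     \noindent
     and a second application of @{text bsimp} leaves the right-hand side
     unchanged, because the $\ZERO$ has already been removed and the nested
     alternative has already been spilled out into a flat list.
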
  1152      
  1249      %It might be interesting to note that we do not simplify inside @{term STAR} and
       
  1250      %@{text NT}: the reason is that we want to keep the
       
  1251 
  1153      Next we can include simplification after each derivative step leading to the
  1252      Next we can include simplification after each derivative step leading to the
  1154      following notion of bitcoded derivatives:
  1253      following notion of bitcoded derivatives:
  1155      
  1254      
  1156      \begin{center}
  1255      \begin{center}
  1157       \begin{tabular}{cc}
  1256       \begin{tabular}{cc}
  1226      \end{lemma}
  1325      \end{lemma}
  1227 
  1326 
  1228      
  1327      
  1229 
  1328 
  1230      \noindent
  1329      \noindent
  1231      We can show that this rewrite system preserves @{term bnullable}, that 
  1330      We can also show that this rewrite system preserves @{term bnullable}, that 
   1232      is, simplification does not affect nullability:
  1331      is, simplification does not affect nullability:
  1233 
  1332 
  1234      \begin{lemma}\label{lembnull}
  1333      \begin{lemma}\label{lembnull}
  1235      @{thm[mode=IfThen] bnullable0(1)[of "r\<^sub>1" "r\<^sub>2"]}
  1334      @{thm[mode=IfThen] bnullable0(1)[of "r\<^sub>1" "r\<^sub>2"]}
  1236      \end{lemma}
  1335      \end{lemma}
  1307   @{thm[mode=Rule] bs4[of "r\<^sub>1" "r\<^sub>2" _ "r\<^sub>3"]}SL\qquad
  1406   @{thm[mode=Rule] bs4[of "r\<^sub>1" "r\<^sub>2" _ "r\<^sub>3"]}SL\qquad
  1308   @{thm[mode=Rule] bs5[of "r\<^sub>3" "r\<^sub>4" _ "r\<^sub>1"]}SR\\
  1407   @{thm[mode=Rule] bs5[of "r\<^sub>3" "r\<^sub>4" _ "r\<^sub>1"]}SR\\
  1309   @{thm[mode=Axiom] bs6}$A0$\quad
  1408   @{thm[mode=Axiom] bs6}$A0$\quad
  1310   @{thm[mode=Axiom] bs7}$A1$\quad
  1409   @{thm[mode=Axiom] bs7}$A1$\quad
  1311   @{thm[mode=Rule] bs10[of "rs\<^sub>1" "rs\<^sub>2"]}$AL$\\
  1410   @{thm[mode=Rule] bs10[of "rs\<^sub>1" "rs\<^sub>2"]}$AL$\\
  1312   @{thm[mode=Rule] ss2[of "rs\<^sub>1" "rs\<^sub>2"]}$LT$\quad
  1411   @{thm[mode=Rule] rrewrite_srewrite.ss2[of "rs\<^sub>1" "rs\<^sub>2"]}$LT$\quad
  1313   @{thm[mode=Rule] ss3[of "r\<^sub>1" "r\<^sub>2"]}$LH$\quad
  1412   @{thm[mode=Rule] ss3[of "r\<^sub>1" "r\<^sub>2"]}$LH$\quad
  1314   @{thm[mode=Axiom] ss4}$L\ZERO$\quad
  1413   @{thm[mode=Axiom] ss4}$L\ZERO$\quad
  1315   @{thm[mode=Axiom] ss5[of "bs" "rs\<^sub>1" "rs\<^sub>2"]}$LS$\medskip\\
  1414   @{thm[mode=Axiom] ss5[of "bs" "rs\<^sub>1" "rs\<^sub>2"]}$LS$\medskip\\
  1316   @{thm[mode=Rule] ss6[of "r\<^sub>2" "r\<^sub>1" "rs\<^sub>1" "rs\<^sub>2" "rs\<^sub>3"]}$LD$\\[-5mm]
  1415   @{thm[mode=Rule] ss6[of "r\<^sub>2" "r\<^sub>1" "rs\<^sub>1" "rs\<^sub>2" "rs\<^sub>3"]}$LD$\\[-5mm]
  1317   \end{tabular}
  1416   \end{tabular}
  1340 \begin{center}
  1439 \begin{center}
  1341 $\forall s. \; \llbracket@{term "bders_simp r s"}\rrbracket \leq N$
  1440 $\forall s. \; \llbracket@{term "bders_simp r s"}\rrbracket \leq N$
  1342 \end{center}
  1441 \end{center}
  1343 
  1442 
  1344 \noindent
  1443 \noindent
  1345 We prove this by induction on $r$. The base cases for @{term AZERO},
  1444 Note that the bound $N$ is a bound for \emph{all} strings, no matter how long they are.
       
  1445 We establish this bound by induction on $r$. The base cases for @{term AZERO},
  1346 @{term "AONE bs"} and @{term "ACHAR bs c"} are straightforward. The interesting case is
  1446 @{term "AONE bs"} and @{term "ACHAR bs c"} are straightforward. The interesting case is
  1347 for sequences of the form @{term "ASEQ bs r\<^sub>1 r\<^sub>2"}. In this case our induction
  1447 for sequences of the form @{term "ASEQ bs r\<^sub>1 r\<^sub>2"}. In this case our induction
  1348 hypotheses state $\exists N_1. \forall s. \; \llbracket{}@{term "bders_simp r\<^sub>1 s"}\rrbracket \leq N_1$ and
  1448 hypotheses state $\exists N_1. \forall s. \; \llbracket{}@{term "bders_simp r\<^sub>1 s"}\rrbracket \leq N_1$ and
  1349 $\exists N_2. \forall s. \; \llbracket@{term "bders_simp r\<^sub>2 s"}\rrbracket \leq N_2$. We can reason as follows
  1449 $\exists N_2. \forall s. \; \llbracket@{term "bders_simp r\<^sub>2 s"}\rrbracket \leq N_2$. We can reason as follows
  1350 %
  1450 %
  1383 @{text distinctWith} is to maintain a ``set'' of alternatives (like the sets in
  1483 @{text distinctWith} is to maintain a ``set'' of alternatives (like the sets in
   1384 partial derivatives). Unfortunately, to obtain the exact same bound would mean
  1484 partial derivatives). Unfortunately, to obtain the exact same bound would mean
  1385 we need to introduce simplifications, such as
  1485 we need to introduce simplifications, such as
  1386  $(r_1 + r_2) \cdot r_3 \longrightarrow (r_1 \cdot r_3) + (r_2 \cdot r_3)$,
  1486  $(r_1 + r_2) \cdot r_3 \longrightarrow (r_1 \cdot r_3) + (r_2 \cdot r_3)$,
  1387 which exist for partial derivatives. However, if we introduce them in our
  1487 which exist for partial derivatives. However, if we introduce them in our
  1388 setting we would lose the POSIX property of our calculated values. We leave better
  1488 setting we would lose the POSIX property of our calculated values. For example,
  1389 bounds for future work.\\[-6.5mm]
  1489 given the regular expression @{term "SEQ (ALT a ab) (ALT b ONE)"} and the string $[a, b]$, our
       
  1490 algorithm generates the following correct POSIX value
       
  1491 %
       
  1492 \[
       
  1493 @{term "Seq (Right (Seq (Char a) (Char b))) (Right Empty)"}
       
  1494 \]
       
  1495 
       
  1496 \noindent
       
  1497 Essentially it matches the string with the longer @{text "Right"}-alternative in the
       
  1498 first sequence (and then the `rest' with the empty regular expression @{const ONE} from the second sequence). 
       
  1499 If we add the simplification above, then we obtain the following value
       
  1500 %
       
  1501 \[
       
  1502 @{term "Seq (Left (Char a)) (Left (Char b))"}
       
  1503 \]
       
  1504 
       
  1505 \noindent
       
  1506 where the @{text "Left"}-alternatives get priority. However, this violates
       
  1507 the POSIX rules and we have not been able to
       
  1508 resolve this problem. Therefore we leave better bounds for future work.\\[-6.5mm]
  1390 
  1509 
  1391 *}
  1510 *}
  1392 
  1511 
  1393 
  1512 
  1394 section {* Conclusion *}
  1513 section {* Conclusion *}
  1421    obscure, examples.
  1540    obscure, examples.
  1422    %We found that from an implementation
  1541    %We found that from an implementation
  1423    %point-of-view it is really important to have the formal proofs of
  1542    %point-of-view it is really important to have the formal proofs of
  1424    %the corresponding properties at hand.
  1543    %the corresponding properties at hand.
  1425 
  1544 
  1426    We have also developed a
  1545    We can of course only make a claim about the correctness and the sizes of the
  1427    healthy suspicion when experimental data is used to back up
  1546    derivatives, not about the efficiency or runtime of our version of
       
  1547    Sulzmann and Lu's algorithm. But we found that the size is an important
       
  1548    first indicator of efficiency: clearly, if the derivatives can
       
  1549    grow to arbitrarily big sizes and the algorithm needs to traverse
       
  1550    the derivatives possibly several times, then the algorithm will be
       
  1551    slow---excruciatingly slow, that is. Other works seem to make
       
  1552    stronger claims, but during our work we have developed a healthy
       
  1553    suspicion when, for example, experimental data is used to back up
  1428    efficiency claims. For example Sulzmann and Lu write about their
  1554    efficiency claims. For example Sulzmann and Lu write about their
  1429    equivalent of @{term blexer_simp} \textit{``...we can incrementally compute
  1555    equivalent of @{term blexer_simp} \textit{``...we can incrementally
  1430    bitcoded parse trees in linear time in the size of the input''}
  1556    compute bitcoded parse trees in linear time in the size of the
  1431    \cite[Page 14]{Sulzmann2014}. 
  1557    input''} \cite[Page 14]{Sulzmann2014}.  Given the growth of the
  1432    Given the growth of the
  1558    derivatives in some cases even after aggressive simplification,
  1433    derivatives in some cases even after aggressive simplification, this
  1559    this is a hard-to-believe claim. A similar claim about a
   1434    is a hard-to-believe claim. A similar claim about a theoretical runtime
  1560    theoretical runtime of @{text "O(n\<^sup>2)"} is made for the
  1435    of @{text "O(n\<^sup>2)"} is made for the Verbatim lexer, which calculates
  1561    Verbatim lexer, which calculates tokens according to POSIX
  1436    tokens according to POSIX rules~\cite{verbatim}. For this Verbatim uses Brzozowski's
  1562    rules~\cite{verbatim}. For this, Verbatim uses Brzozowski's
  1437    derivatives like in our work. 
  1563    derivatives like in our work.  The authors write: \textit{``The
  1438    The authors write: \textit{``The results of our empirical tests [..] confirm that Verbatim has
  1564    results of our empirical tests [..] confirm that Verbatim has
  1439    @{text "O(n\<^sup>2)"} time complexity.''} \cite[Section~VII]{verbatim}.
  1565    @{text "O(n\<^sup>2)"} time complexity.''}
  1440    While their correctness proof for Verbatim is formalised in Coq, the claim about
  1566    \cite[Section~VII]{verbatim}.  While their correctness proof for
   1441    the runtime complexity is only supported by some empirical evidence obtained
  1567    Verbatim is formalised in Coq, the claim about the runtime
  1442    by using the code extraction facilities of Coq.
  1568    complexity is only supported by some empirical evidence obtained by
  1443    Given our observation with the ``growth problem'' of derivatives,
  1569    using the code extraction facilities of Coq.  Given our observation
  1444    we
  1570    with the ``growth problem'' of derivatives, we tried out their
  1445    tried out their extracted OCaml code with the example
  1571    extracted OCaml code with the example \mbox{@{text "(a +
   1446    \mbox{@{text "(a + aa)\<^sup>*"}} as a single lexing rule, and it took us around 5 minutes to tokenise a
  1572    aa)\<^sup>*"}} as a single lexing rule, and it took us around 5
  1447    string of 40 $a$'s and that increased to approximately 19 minutes when the
  1573    minutes to tokenise a string of 40 $a$'s and that increased to
  1448    string is 50 $a$'s long. Taking into account that derivatives are not simplified in the Verbatim
  1574    approximately 19 minutes when the string is 50 $a$'s long. Taking
  1449    lexer, such numbers are not surprising. 
  1575    into account that derivatives are not simplified in the Verbatim
  1450    Clearly our result of having finite
  1576    lexer, such numbers are not surprising.  Clearly our result of
   1451    derivatives might sound rather weak in this context but we think such efficiency claims
  1577    having finite derivatives might sound rather weak in this context
  1452    really require further scrutiny.
  1578    but we think such efficiency claims really require further
  1453 
  1579    scrutiny.
  1454    The contribution of this paper is to make sure
  1580 
   1455    derivatives do not grow arbitrarily big (universally). In the example \mbox{@{text "(a + aa)\<^sup>*"}},
  1581    The contribution of this paper is to make sure derivatives do not
  1456    \emph{all} derivatives have a size of 17 or less. The result is that
  1582    grow arbitrarily big (universally). In the example \mbox{@{text "(a
  1457    lexing a string of, say, 50\,000 a's with the regular expression \mbox{@{text "(a + aa)\<^sup>*"}} takes approximately
  1583    + aa)\<^sup>*"}}, \emph{all} derivatives have a size of 17 or
  1458    10 seconds with our Scala implementation
  1584    less. The result is that lexing a string of, say, 50\,000 a's with
  1459    of the presented algorithm. Our Isabelle code including the results from Sec.~5 is available from \url{https://github.com/urbanchr/posix}.
  1585    the regular expression \mbox{@{text "(a + aa)\<^sup>*"}} takes
       
  1586    approximately 10 seconds with our Scala implementation of the
       
  1587    presented algorithm.
       
  1588 
       
  1589    Finally, let us come back to the point about bounded regular
       
  1590    expressions. We have in this paper only shown that @{term "NTIMES r
       
  1591    n"} can be included, but all our results extend straightforwardly
       
  1592    also to the other bounded regular expressions. We find bounded
       
  1593    regular expressions fit naturally into the setting of Brzozowski
       
  1594    derivatives and the bitcoded regular expressions by Sulzmann and Lu.
       
  1595    In contrast bounded regular expressions are often the Achilles'
       
  1596    heel in regular expression matchers that use the traditional
       
  1597    automata-based approach to lexing, primarily because they need to expand the
       
  1598    counters of bounded regular expressions into $n$ connected copies
       
  1599    of an automaton. This is not needed in Sulzmann and Lu's algorithm.
       
  1600    To see the difference consider for example the regular expression @{term "SEQ (NTIMES a
       
  1601    1001) (STAR a)"}, which is not permitted in the Go language because
       
  1602    the counter is too big. In contrast we have no problem with
       
  1603    matching this regular expression with, say 50\,000 a's, because the
       
  1604    counters can be kept compact. In fact, the overall size of the
       
  1605    derivatives is never greater than 5 in this example. Even in the
       
  1606    example from Section 2, where Rust raises an error message, namely
       
  1607    \mbox{@{text "a\<^bsup>{1000}{100}{5}\<^esup>"}}, the maximum size for
       
  1608    our derivatives is a moderate 14.
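
   The reason the counters stay compact is that, roughly speaking (ignoring
   the bitcodes and glossing over the precise formulation in our Isabelle
   development), the derivative of a bounded regular expression only ever
   decrements the counter rather than unfolding it:
   %
   \[
   (r^{\{n\}})\backslash c \;\dn\;
   \begin{cases}
   \ZERO & \text{if $n = 0$}\\[1mm]
   (r\backslash c)\cdot r^{\{n - 1\}} & \text{otherwise}
   \end{cases}
   \]
   %
   \noindent
   Consequently the derivatives of @{term "SEQ (NTIMES a 1001) (STAR a)"}
   carry along counters that are decremented step by step, instead of an
   upfront expansion into more than a thousand sequenced copies of @{text a}.
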
       
  1609 
       
  1610    Let us also compare our work to the verified Verbatim++ lexer where
       
  1611    the authors of the Verbatim lexer introduced a number of
       
  1612    improvements and optimisations, for example memoisation
       
  1613    \cite{verbatimpp}. However, unlike Verbatim, which works with
       
  1614    derivatives like in our work, Verbatim++ first compiles a regular
       
  1615    expression into a DFA. While this makes lexing fast in many cases,
       
  1616    with examples of bounded regular expressions like
       
  1617    \mbox{@{text "a\<^bsup>{100}{5}\<^esup>"}}
       
  1618    one needs to represent them as
       
  1619    sequences of $a \cdot a \cdot \ldots \cdot a$ (500 a's in sequence). We have run
       
  1620    their extracted code with such a regular expression as a
       
  1621    single lexing rule and a string of 50\,000 a's---lexing in this case
       
  1622    takes approximately 5~minutes. We are not aware of any better
       
  1623    translation using the traditional notion of DFAs. Therefore we
       
  1624    prefer to stick with calculating derivatives, but attempt to make
       
  1625    this calculation (in the future) as fast as possible. What we can guarantee
       
  1626    with the presented work is that the maximum size of the derivatives
       
  1627    for this example is not bigger than 9. This means our Scala
       
  1628    implementation only needs a few seconds for this example.
       
  1629    %
       
  1630    %
       
  1631    %Possible ideas are
       
  1632    %zippers which have been used in the context of parsing of
       
  1633    %context-free grammars \cite{zipperparser}.
       
  1634    \medskip
       
  1635 
       
  1636    \noindent
       
  1637    Our Isabelle code including the results from Sec.~5 is available from \url{https://github.com/urbanchr/posix}.
  1460    %\\[-10mm]
  1638    %\\[-10mm]
  1461 
  1639 
  1462 
  1640 
  1463 %%\bibliographystyle{plain}
  1641 %%\bibliographystyle{plain}
  1464 \bibliography{root}
  1642 \bibliography{root}
  1465 
  1643 
       
  1644 \newpage
  1466 \appendix
  1645 \appendix
  1467 
  1646 
  1468 \section{Some Proofs}
  1647 \section{Some Proofs}
  1469 
  1648 
  1470 While we have formalised \emph{all} results in Isabelle/HOL, below we shall give some
  1649 While we have formalised \emph{all} results in Isabelle/HOL, below we shall give some
  1471 rough details of our reasoning in ``English''.
  1650 rough details of our reasoning in ``English'' if this is helpful for reviewing.
  1472 
  1651 
  1473 \begin{proof}[Proof of Lemma~\ref{codedecode}]
  1652 \begin{proof}[Proof of Lemma~\ref{codedecode}]
  1474   This follows from the property that
  1653   This follows from the property that
  1475   $\textit{decode}'\,((\textit{code}\,v) \,@\, bs)\,r = (v, bs)$ holds
  1654   $\textit{decode}'\,((\textit{code}\,v) \,@\, bs)\,r = (v, bs)$ holds
  1476   for any bit-sequence $bs$ and $\vdash v : r$. This property can be
  1655   for any bit-sequence $bs$ and $\vdash v : r$. This property can be