diff -r f9cdc295ccf7 -r f493a20feeb3 thys3/Paper.thy --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/thys3/Paper.thy Sat Apr 30 00:50:08 2022 +0100 @@ -0,0 +1,1545 @@ +(*<*) +theory Paper + imports + "Posix.FBound" + "HOL-Library.LaTeXsugar" +begin + +declare [[show_question_marks = false]] + +notation (latex output) + If ("(\<^latex>\\\textrm{\if\<^latex>\}\ (_)/ \<^latex>\\\textrm{\then\<^latex>\}\ (_)/ \<^latex>\\\textrm{\else\<^latex>\}\ (_))" 10) and + Cons ("_\<^latex>\\\mbox{$\\,$}\::\<^latex>\\\mbox{$\\,$}\_" [75,73] 73) + + +abbreviation + "der_syn r c \ der c r" +abbreviation + "ders_syn r s \ ders s r" +abbreviation + "bder_syn r c \ bder c r" + +notation (latex output) + der_syn ("_\\_" [79, 1000] 76) and + ders_syn ("_\\_" [79, 1000] 76) and + bder_syn ("_\\_" [79, 1000] 76) and + bders ("_\\_" [79, 1000] 76) and + bders_simp ("_\\\<^sub>b\<^sub>s\<^sub>i\<^sub>m\<^sub>p _" [79, 1000] 76) and + + ZERO ("\<^bold>0" 81) and + ONE ("\<^bold>1" 81) and + CH ("_" [1000] 80) and + ALT ("_ + _" [77,77] 78) and + SEQ ("_ \ _" [77,77] 78) and + STAR ("_\<^sup>*" [79] 78) and + + val.Void ("Empty" 78) and + val.Char ("Char _" [1000] 78) and + val.Left ("Left _" [79] 78) and + val.Right ("Right _" [1000] 78) and + val.Seq ("Seq _ _" [79,79] 78) and + val.Stars ("Stars _" [79] 78) and + + Prf ("\ _ : _" [75,75] 75) and + Posix ("'(_, _') \ _" [63,75,75] 75) and + + flat ("|_|" [75] 74) and + flats ("|_|" [72] 74) and + injval ("inj _ _ _" [79,77,79] 76) and + mkeps ("mkeps _" [79] 76) and + length ("len _" [73] 73) and + set ("_" [73] 73) and + + AZERO ("ZERO" 81) and + AONE ("ONE _" [79] 78) and + ACHAR ("CHAR _ _" [79, 79] 80) and + AALTs ("ALTs _ _" [77,77] 78) and + ASEQ ("SEQ _ _ _" [79, 79,79] 78) and + ASTAR ("STAR _ _" [79, 79] 78) and + + code ("code _" [79] 74) and + intern ("_\<^latex>\\\mbox{$^\\uparrow$}\" [900] 80) and + erase ("_\<^latex>\\\mbox{$^\\downarrow$}\" [1000] 74) and + bnullable ("bnullable _" [1000] 80) and + bsimp_AALTs ("bsimpALT _ _" [10,1000] 80) and + bsimp_ASEQ ("bsimpSEQ _ _ _" [10,1000,1000] 80) and + bmkeps ("bmkeps _" [1000] 80) and + + eq1 ("_ \ _" [77,77] 76) and + + srewrite ("_\<^latex>\\\mbox{$\\,\\stackrel{s}{\\leadsto}$}\ _" [71, 71] 80) and + rrewrites ("_ \<^latex>\\\mbox{$\\,\\leadsto^*$}\ _" [71, 71] 80) and + srewrites ("_ \<^latex>\\\mbox{$\\,\\stackrel{s}{\\leadsto}^*$}\ _" [71, 71] 80) and + blexer_simp ("blexer\<^sup>+" 1000) + + +lemma better_retrieve: + shows "rs \ Nil ==> retrieve (AALTs bs (r#rs)) (Left v) = bs @ retrieve r v" + and "rs \ Nil ==> retrieve (AALTs bs (r#rs)) (Right v) = bs @ retrieve (AALTs [] rs) v" + apply (metis list.exhaust retrieve.simps(4)) + by (metis list.exhaust retrieve.simps(5)) + + + + +(*>*) + +section {* Introduction *} + +text {* + +In the last fifteen or so years, Brzozowski's derivatives of regular +expressions have sparked quite a bit of interest in the functional +programming and theorem prover communities. The beauty of +Brzozowski's derivatives \cite{Brzozowski1964} is that they are neatly +expressible in any functional language, and easily definable and +reasoned about in theorem provers---the definitions just consist of +inductive datatypes and simple recursive functions. 
Derivatives of a
+regular expression, written @{term "der c r"}, give a simple solution
+to the problem of matching a string @{term s} with a regular
+expression @{term r}: if the derivative of @{term r} w.r.t.\ (in
+succession) all the characters of the string matches the empty string,
+then @{term r} matches @{term s} (and {\em vice versa}). We are aware
+of a mechanised correctness proof of Brzozowski's derivative-based matcher in HOL4 by
+Owens and Slind~\cite{Owens2008}. Another one in Isabelle/HOL is part
+of the work by Krauss and Nipkow~\cite{Krauss2011}. And another one
+in Coq is given by Coquand and Siles \cite{Coquand2012}.
+Also Ribeiro and Du Bois give one in Agda~\cite{RibeiroAgda2017}.
+
+
+However, there are two difficulties with derivative-based matchers:
+First, Brzozowski's original matcher only generates a yes/no answer
+for whether a regular expression matches a string or not. This is too
+little information in the context of lexing where separate tokens must
+be identified and also classified (for example as keywords
+or identifiers). Sulzmann and Lu~\cite{Sulzmann2014} overcome this
+difficulty by cleverly extending Brzozowski's matching
+algorithm. Their extended version generates additional information on
+\emph{how} a regular expression matches a string following the POSIX
+rules for regular expression matching. They achieve this by adding a
+second ``phase'' to Brzozowski's algorithm involving an injection
+function. In our own earlier work we provided the formal
+specification of what POSIX matching means and proved in Isabelle/HOL
+the correctness
+of Sulzmann and Lu's extended algorithm accordingly
+\cite{AusafDyckhoffUrban2016}.
+
+The second difficulty is that Brzozowski's derivatives can
+grow to arbitrarily big sizes. For example, if we start with the
+regular expression \mbox{@{text "(a + aa)\<^sup>*"}} and take
+successive derivatives according to the character $a$, we end up with
+a sequence of ever-growing derivatives like
+
+\def\ll{\stackrel{\_\backslash{} a}{\longrightarrow}}
+\begin{center}
+\begin{tabular}{rll}
+$(a + aa)^*$ & $\ll$ & $(\ONE + \ONE{}a) \cdot (a + aa)^*$\\
+& $\ll$ & $(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* \;+\; (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
+& $\ll$ & $(\ZERO + \ZERO{}a + \ZERO) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^* \;+\; $\\
+& & $\qquad(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
+& $\ll$ & \ldots \hspace{15mm}(regular expressions of sizes 98, 169, 283, 468, 767, \ldots)
+\end{tabular}
+\end{center}
+
+\noindent where after around 35 steps we run out of memory on a
+typical computer (we shall define shortly the precise details of our
+regular expressions and the derivative operation). Clearly, the
+notation involving $\ZERO$s and $\ONE$s already suggests
+simplification rules that can be applied to regular
+expressions, for example $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r
+\Rightarrow r$, $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow
+r$. While such simple-minded simplifications have been proved in our
+earlier work to preserve the correctness of Sulzmann and Lu's
+algorithm \cite{AusafDyckhoffUrban2016}, they unfortunately do
+\emph{not} help with limiting the growth of the derivatives shown
+above: the growth is slowed, but the derivatives can still grow rather
+quickly beyond any finite bound.
+
+
+Sulzmann and Lu overcome this ``growth problem'' in a second algorithm
+\cite{Sulzmann2014} where they introduce bitcoded
+regular expressions.
In this version, POSIX values are +represented as bitsequences and such sequences are incrementally generated +when derivatives are calculated. The compact representation +of bitsequences and regular expressions allows them to define a more +``aggressive'' simplification method that keeps the size of the +derivatives finitely bounded no matter what the length of the string is. +They make some informal claims about the correctness and linear behaviour +of this version, but do not provide any supporting proof arguments, not +even ``pencil-and-paper'' arguments. They write about their bitcoded +\emph{incremental parsing method} (that is the algorithm to be formalised +in this paper): +% +\begin{quote}\it + ``Correctness Claim: We further claim that the incremental parsing + method [..] in combination with the simplification steps [..] + yields POSIX parse trees. We have tested this claim + extensively [..] but yet + have to work out all proof details.'' \cite[Page 14]{Sulzmann2014} +\end{quote} + +\noindent{}\textbf{Contributions:} We have implemented in Isabelle/HOL +the derivative-based lexing algorithm of Sulzmann and Lu +\cite{Sulzmann2014} where regular expressions are annotated with +bitsequences. We define the crucial simplification function as a +recursive function, without the need of a fix-point operation. One objective of +the simplification function is to remove duplicates of regular +expressions. For this Sulzmann and Lu use in their paper the standard +@{text nub} function from Haskell's list library, but this function +does not achieve the intended objective with bitcoded regular expressions. The +reason is that in the bitcoded setting, each copy generally has a +different bitcode annotation---so @{text nub} would never ``fire''. +Inspired by Scala's library for lists, we shall instead use a @{text +distinctWith} function that finds duplicates under an ``erasing'' function +which deletes bitcodes before comparing regular expressions. +We shall also introduce our own argument and definitions for +establishing the correctness of the bitcoded algorithm when +simplifications are included. Finally we +establish that the size of derivatives can be finitely bounded.\medskip%\footnote{ In this paper, we shall first briefly introduce the basic notions +%of regular expressions and describe the definition +%of POSIX lexing from our earlier work \cite{AusafDyckhoffUrban2016}. This serves +%as a reference point for what correctness means in our Isabelle/HOL proofs. We shall then prove +%the correctness for the bitcoded algorithm without simplification, and +%after that extend the proof to include simplification.}\smallskip +%\mbox{}\\[-10mm] + +\noindent In this paper, we shall first briefly introduce the basic notions +of regular expressions and describe the definition +of POSIX lexing from our earlier work \cite{AusafDyckhoffUrban2016}. This serves +as a reference point for what correctness means in our Isabelle/HOL proofs. We shall then prove +the correctness for the bitcoded algorithm without simplification, and +after that extend the proof to include simplification.\mbox{}\\[-6mm] + +*} + +section {* Background *} + +text {* + In our Isabelle/HOL formalisation strings are lists of characters with + the empty string being represented by the empty list, written $[]$, + and list-cons being written as $\_\!::\!\_\,$; string + concatenation is $\_ \,@\, \_\,$. 
We often use the usual + bracket notation for lists also for strings; for example a string + consisting of just a single character $c$ is written $[c]$. + Our regular expressions are defined as usual as the following inductive + datatype:\\[-4mm] + % + \begin{center} + @{text "r ::="} \; + @{const "ZERO"} $\mid$ + @{const "ONE"} $\mid$ + @{term "CH c"} $\mid$ + @{term "ALT r\<^sub>1 r\<^sub>2"} $\mid$ + @{term "SEQ r\<^sub>1 r\<^sub>2"} $\mid$ + @{term "STAR r"} + \end{center} + + \noindent where @{const ZERO} stands for the regular expression that does + not match any string, @{const ONE} for the regular expression that matches + only the empty string and @{term c} for matching a character literal. + The constructors $+$ and $\cdot$ represent alternatives and sequences, respectively. + We sometimes omit the $\cdot$ in a sequence regular expression for brevity. + The + \emph{language} of a regular expression, written $L(r)$, is defined as usual + and we omit giving the definition here (see for example \cite{AusafDyckhoffUrban2016}). + + Central to Brzozowski's regular expression matcher are two functions + called @{text nullable} and \emph{derivative}. The latter is written + $r\backslash c$ for the derivative of the regular expression $r$ + w.r.t.~the character $c$. Both functions are defined by recursion over + regular expressions. +% +\begin{center} +\begin{tabular}{cc} + \begin{tabular}{r@ {\hspace{2mm}}c@ {\hspace{2mm}}l} + @{thm (lhs) der.simps(1)} & $\dn$ & @{thm (rhs) der.simps(1)}\\ + @{thm (lhs) der.simps(2)} & $\dn$ & @{thm (rhs) der.simps(2)}\\ + @{thm (lhs) der.simps(3)} & $\dn$ & @{thm (rhs) der.simps(3)}\\ + @{thm (lhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]}\\ + @{thm (lhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{text "if"} @{term "nullable(r\<^sub>1)"}\\ + & & @{text "then"} @{term "ALT (SEQ (der c r\<^sub>1) r\<^sub>2) (der c r\<^sub>2)"}\\ + & & @{text "else"} @{term "SEQ (der c r\<^sub>1) r\<^sub>2"}\\ + % & & @{thm (rhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]}\\ + @{thm (lhs) der.simps(6)} & $\dn$ & @{thm (rhs) der.simps(6)} + \end{tabular} + & + \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l} + @{thm (lhs) nullable.simps(1)} & $\dn$ & @{thm (rhs) nullable.simps(1)}\\ + @{thm (lhs) nullable.simps(2)} & $\dn$ & @{thm (rhs) nullable.simps(2)}\\ + @{thm (lhs) nullable.simps(3)} & $\dn$ & @{thm (rhs) nullable.simps(3)}\\ + @{thm (lhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\ + @{thm (lhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\ + @{thm (lhs) nullable.simps(6)} & $\dn$ & @{thm (rhs) nullable.simps(6)}%\medskip\\ + \end{tabular} +\end{tabular} +\end{center} + + \noindent + We can extend this definition to give derivatives w.r.t.~strings, + namely @{thm (lhs) ders.simps(1)}$\dn$@{thm (rhs) ders.simps(1)} + and @{thm (lhs) ders.simps(2)}$\dn$@{thm (rhs) ders.simps(2)}. + Using @{text nullable} and the derivative operation, we can +define a simple regular expression matcher, namely +$@{text "match s r"} \;\dn\; @{term nullable}(r\backslash s)$. +This is essentially Brzozowski's algorithm from 1964. Its +main virtue is that the algorithm can be easily implemented as a +functional program (either in a functional programming language or in +a theorem prover). 
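+
+\noindent
+To illustrate how compact this matcher is, below is one possible
+transliteration of the definitions above into Haskell (a sketch for
+experimentation only; the constructor and function names mirror our
+notation but are otherwise our own choice and not part of the
+formalisation):
+
+\begin{verbatim}
+data Rexp = ZERO | ONE | CH Char
+          | ALT Rexp Rexp | SEQ Rexp Rexp | STAR Rexp
+          deriving (Eq, Show)
+
+nullable :: Rexp -> Bool
+nullable ZERO        = False
+nullable ONE         = True
+nullable (CH _)      = False
+nullable (ALT r1 r2) = nullable r1 || nullable r2
+nullable (SEQ r1 r2) = nullable r1 && nullable r2
+nullable (STAR _)    = True
+
+der :: Char -> Rexp -> Rexp
+der _ ZERO        = ZERO
+der _ ONE         = ZERO
+der c (CH d)      = if c == d then ONE else ZERO
+der c (ALT r1 r2) = ALT (der c r1) (der c r2)
+der c (SEQ r1 r2) = if nullable r1
+                    then ALT (SEQ (der c r1) r2) (der c r2)
+                    else SEQ (der c r1) r2
+der c (STAR r)    = SEQ (der c r) (STAR r)
+
+-- derivative w.r.t. a string, and the matcher itself
+ders :: String -> Rexp -> Rexp
+ders s r = foldl (flip der) r s
+
+match :: String -> Rexp -> Bool
+match s r = nullable (ders s r)
+\end{verbatim}
+
+\noindent
+Evaluating, for instance, \texttt{match "aa" (SEQ (CH 'a') (STAR (CH 'a')))}
+yields \texttt{True}.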
The correctness of @{text match} amounts to
+establishing the property:%\footnote{It is a fun exercise to formally prove this property in a theorem prover.}
+%
+\begin{proposition}\label{matchcorr}
+@{text "match s r"} \;\;\text{if and only if}\;\; $s \in L(r)$
+\end{proposition}
+
+\noindent
+It is a fun exercise to formally prove this property in a theorem prover.
+
+The novel idea of Sulzmann and Lu is to extend this algorithm for
+lexing, where it is important to find out which part of the string
+is matched by which part of the regular expression.
+For this Sulzmann and Lu presented two lexing algorithms in their paper
+ \cite{Sulzmann2014}. The first algorithm consists of two phases: first a
+ matching phase (which is Brzozowski's algorithm) and then a value
+ construction phase. The values encode \emph{how} a regular expression
+ matches a string. \emph{Values} are defined as the inductive datatype
+ %
+ \begin{center}
+ @{text "v ::="}
+ @{const "Void"} $\mid$
+ @{term "val.Char c"} $\mid$
+ @{term "Left v"} $\mid$
+ @{term "Right v"} $\mid$
+ @{term "Seq v\<^sub>1 v\<^sub>2"} $\mid$
+ @{term "Stars vs"}
+ \end{center}
+
+ \noindent where we use @{term vs} to stand for a list of values. The
+ string underlying a value can be calculated by a @{const flat}
+ function, written @{term "flat DUMMY"}. It traverses a value and
+ collects the characters contained in it \cite{AusafDyckhoffUrban2016}.
+
+
+ Sulzmann and Lu also define inductively an
+ inhabitation relation that associates values to regular expressions. This
+ is defined by the following six rules for the values:
+ %
+ \begin{center}
+ \begin{tabular}{@ {\hspace{-12mm}}c@ {}}
+ \begin{tabular}{@ {}c@ {}}
+ \\[-8mm]
+ @{thm[mode=Axiom] Prf.intros(4)}\\
+ @{thm[mode=Axiom] Prf.intros(5)[of "c"]}
+ \end{tabular}
+ \quad
+ @{thm[mode=Rule] Prf.intros(2)[of "v\<^sub>1" "r\<^sub>1" "r\<^sub>2"]}\quad
+ @{thm[mode=Rule] Prf.intros(3)[of "v\<^sub>2" "r\<^sub>2" "r\<^sub>1"]}\quad
+ @{thm[mode=Rule] Prf.intros(1)[of "v\<^sub>1" "r\<^sub>1" "v\<^sub>2" "r\<^sub>2"]}\quad
+ @{thm[mode=Rule] Prf.intros(6)[of "vs" "r"]}\mbox{}\hspace{-10mm}\mbox{}
+ \end{tabular}
+ \end{center}
+
+ \noindent Note that no values are associated with the regular expression
+ @{term ZERO}, since it cannot match any string.
+ It is routine to establish how values ``inhabiting'' a regular
+ expression correspond to the language of a regular expression, namely
+ @{thm L_flat_Prf}.
+
+ In general there is more than one value inhabiting a regular
+ expression (meaning regular expressions can typically match more
+ than one string). But even when fixing a string from the language of the
+ regular expression, there is generally more than one way in which the
+ regular expression can match this string. POSIX lexing is about
+ identifying the unique value for a given regular expression and a
+ string that satisfies the informal POSIX rules (see
+ \cite{POSIX,Kuklewicz,OkuiSuzuki2010,Sulzmann2014,Vansummeren2006}).\footnote{POSIX
+ lexing acquired its name from the fact that the corresponding
+ rules were described as part of the POSIX specification for
+ Unix-like operating systems \cite{POSIX}.} Sometimes these
+ informal rules are called \emph{maximal munch rule} and \emph{rule priority}.
+ One contribution of our earlier paper is to give a convenient
+ specification for what POSIX values are (the inductive rules are shown in
+ Figure~\ref{POSIXrules}).
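+
+\noindent
+The values and the inhabitation relation introduced above can be
+transliterated into Haskell in the same manner (again only a sketch;
+@{text Left} and @{text Right} are primed in order to avoid a clash with
+the constructors of Haskell's \texttt{Either} type, and the inhabitation
+relation becomes a recursive check):
+
+\begin{verbatim}
+data Val = Empty | Chr Char | Left' Val | Right' Val
+         | Sq Val Val | Stars [Val]
+         deriving (Eq, Show)
+
+-- the string underlying a value
+flat :: Val -> String
+flat Empty      = ""
+flat (Chr c)    = [c]
+flat (Left' v)  = flat v
+flat (Right' v) = flat v
+flat (Sq v1 v2) = flat v1 ++ flat v2
+flat (Stars vs) = concatMap flat vs
+
+-- one clause per rule of the inhabitation relation
+inhabits :: Val -> Rexp -> Bool
+inhabits Empty      ONE         = True
+inhabits (Chr c)    (CH d)      = c == d
+inhabits (Left' v)  (ALT r1 _)  = inhabits v r1
+inhabits (Right' v) (ALT _ r2)  = inhabits v r2
+inhabits (Sq v1 v2) (SEQ r1 r2) = inhabits v1 r1 && inhabits v2 r2
+inhabits (Stars vs) (STAR r)    = all (\v -> inhabits v r) vs
+inhabits _          _           = False
+\end{verbatim}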
+
+\begin{figure}[t]
+ \begin{center}
+ \begin{tabular}{@ {}c@ {}}
+ @{thm[mode=Axiom] Posix.intros(1)}\P\@{term "ONE"} \quad
+ @{thm[mode=Axiom] Posix.intros(2)}\P\@{term "c"}\quad
+ @{thm[mode=Rule] Posix.intros(3)[of "s" "r\<^sub>1" "v" "r\<^sub>2"]}\P+L\\quad
+ @{thm[mode=Rule] Posix.intros(4)[of "s" "r\<^sub>2" "v" "r\<^sub>1"]}\P+R\\medskip\\
+ $\mprset{flushleft}
+ \inferrule
+ {@{thm (prem 1) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \qquad
+ @{thm (prem 2) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \\\\
+ @{thm (prem 3) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}
+ {@{thm (concl) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}$\PS\\medskip\smallskip\\
+ @{thm[mode=Axiom] Posix.intros(7)}\P[]\\qquad
+ $\mprset{flushleft}
+ \inferrule
+ {@{thm (prem 1) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
+ @{thm (prem 2) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
+ @{thm (prem 3) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \\\\
+ @{thm (prem 4) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}
+ {@{thm (concl) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}$\P\\\\[-4mm]
+ \end{tabular}
+ \end{center}
+ \caption{The inductive definition of POSIX values taken from our earlier paper \cite{AusafDyckhoffUrban2016}. The ternary relation, written $(s, r) \rightarrow v$, formalises,
+ for a given string $s$ and a regular
+ expression $r$, which unique value $v$ satisfies the informal POSIX constraints for
+ regular expression matching.}\label{POSIXrules}
+ \end{figure}
+
+ The clever idea by Sulzmann and Lu \cite{Sulzmann2014} in their first algorithm is to define
+ an injection function on values that mirrors (but inverts) the
+ construction of the derivative on regular expressions. Essentially it
+ injects back a character into a value.
+ For this they define two functions called @{text mkeps} and @{text inj}: + % + \begin{center} + \begin{tabular}{@ {}l@ {}} + \begin{tabular}{@ {}lcl@ {}} + @{thm (lhs) mkeps.simps(1)} & $\dn$ & @{thm (rhs) mkeps.simps(1)}\\ + @{thm (lhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]}\\ + @{thm (lhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]}\\ + @{thm (lhs) mkeps.simps(4)} & $\dn$ & @{thm (rhs) mkeps.simps(4)}\\ + \end{tabular}\smallskip\\ + + \begin{tabular}{@ {}cc@ {}} + \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}@ {}} + @{thm (lhs) injval.simps(1)} & $\dn$ & @{thm (rhs) injval.simps(1)}\\ + @{thm (lhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]} & $\dn$ & + @{thm (rhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]}\\ + @{thm (lhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$ & + @{thm (rhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\ + @{thm (lhs) injval.simps(7)[of "r" "c" "v" "vs"]} & $\dn$ + & @{thm (rhs) injval.simps(7)[of "r" "c" "v" "vs"]} + \end{tabular} + & + \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}@ {}} + @{thm (lhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$ + & @{thm (rhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\ + @{thm (lhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$ + & @{thm (rhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\ + @{thm (lhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$\\ + \multicolumn{3}{@ {}r@ {}}{@{thm (rhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}} + \end{tabular} + \end{tabular} + \end{tabular}\smallskip + \end{center} + + \noindent + The function @{text mkeps} is run when the last derivative is nullable, that is + the string to be matched is in the language of the regular expression. It generates + a value for how the last derivative can match the empty string. The injection function + then calculates the corresponding value for each intermediate derivative until + a value for the original regular expression is generated. + Graphically the algorithm by + Sulzmann and Lu can be illustrated by the following picture %in Figure~\ref{Sulz} + where the path from the left to the right involving @{term derivatives}/@{const + nullable} is the first phase of the algorithm (calculating successive + \Brz's derivatives) and @{const mkeps}/@{text inj}, the path from right to + left, the second phase. 
+% +\begin{center} +\begin{tikzpicture}[scale=0.99,node distance=9mm, + every node/.style={minimum size=6mm}] +\node (r1) {@{term "r\<^sub>1"}}; +\node (r2) [right=of r1]{@{term "r\<^sub>2"}}; +\draw[->,line width=1mm](r1)--(r2) node[above,midway] {@{term "der a DUMMY"}}; +\node (r3) [right=of r2]{@{term "r\<^sub>3"}}; +\draw[->,line width=1mm](r2)--(r3) node[above,midway] {@{term "der b DUMMY"}}; +\node (r4) [right=of r3]{@{term "r\<^sub>4"}}; +\draw[->,line width=1mm](r3)--(r4) node[above,midway] {@{term "der c DUMMY"}}; +\draw (r4) node[anchor=west] {\;\raisebox{3mm}{@{term nullable}}}; +\node (v4) [below=of r4]{@{term "v\<^sub>4"}}; +\draw[->,line width=1mm](r4) -- (v4); +\node (v3) [left=of v4] {@{term "v\<^sub>3"}}; +\draw[->,line width=1mm](v4)--(v3) node[below,midway] {\inj r\<^sub>3 c\}; +\node (v2) [left=of v3]{@{term "v\<^sub>2"}}; +\draw[->,line width=1mm](v3)--(v2) node[below,midway] {\inj r\<^sub>2 b\}; +\node (v1) [left=of v2] {@{term "v\<^sub>1"}}; +\draw[->,line width=1mm](v2)--(v1) node[below,midway] {\inj r\<^sub>1 a\}; +\draw (r4) node[anchor=north west] {\;\raisebox{-8mm}{@{term "mkeps"}}}; +\end{tikzpicture} +\end{center} +% +\noindent + The picture shows the steps required when a + regular expression, say @{text "r\<^sub>1"}, matches the string @{term + "[a,b,c]"}. The first lexing algorithm by Sulzmann and Lu can be defined as:\\[-8mm] + +% \begin{figure}[t] +%\begin{center} +%\begin{tikzpicture}[scale=1,node distance=1cm, +% every node/.style={minimum size=6mm}] +%\node (r1) {@{term "r\<^sub>1"}}; +%\node (r2) [right=of r1]{@{term "r\<^sub>2"}}; +%\draw[->,line width=1mm](r1)--(r2) node[above,midway] {@{term "der a DUMMY"}}; +%\node (r3) [right=of r2]{@{term "r\<^sub>3"}}; +%\draw[->,line width=1mm](r2)--(r3) node[above,midway] {@{term "der b DUMMY"}}; +%\node (r4) [right=of r3]{@{term "r\<^sub>4"}}; +%\draw[->,line width=1mm](r3)--(r4) node[above,midway] {@{term "der c DUMMY"}}; +%\draw (r4) node[anchor=west] {\;\raisebox{3mm}{@{term nullable}}}; +%\node (v4) [below=of r4]{@{term "v\<^sub>4"}}; +%\draw[->,line width=1mm](r4) -- (v4); +%\node (v3) [left=of v4] {@{term "v\<^sub>3"}}; +%\draw[->,line width=1mm](v4)--(v3) node[below,midway] {\inj r\<^sub>3 c\}; +%\node (v2) [left=of v3]{@{term "v\<^sub>2"}}; +%\draw[->,line width=1mm](v3)--(v2) node[below,midway] {\inj r\<^sub>2 b\}; +%\node (v1) [left=of v2] {@{term "v\<^sub>1"}}; +%\draw[->,line width=1mm](v2)--(v1) node[below,midway] {\inj r\<^sub>1 a\}; +%\draw (r4) node[anchor=north west] {\;\raisebox{-8mm}{@{term "mkeps"}}}; +%\end{tikzpicture} +%\end{center} +%\mbox{}\\[-13mm] +% +%\caption{The two phases of the first algorithm by Sulzmann \& Lu \cite{Sulzmann2014}, +%matching the string @{term "[a,b,c]"}. The first phase (the arrows from +%left to right) is \Brz's matcher building successive derivatives. If the +%last regular expression is @{term nullable}, then the functions of the +%second phase are called (the top-down and right-to-left arrows): first +%@{term mkeps} calculates a value @{term "v\<^sub>4"} witnessing +%how the empty string has been recognised by @{term "r\<^sub>4"}. After +%that the function @{term inj} ``injects back'' the characters of the string into +%the values. The value @{term "v\<^sub>1"} is the result of the algorithm representing +%the POSIX value for this string and +%regular expression. 
+%\label{Sulz}} +%\end{figure} + + + + \begin{center} + \begin{tabular}{lcl} + @{thm (lhs) lexer.simps(1)} & $\dn$ & @{thm (rhs) lexer.simps(1)}\\ + @{thm (lhs) lexer.simps(2)} & $\dn$ & @{text "case"} @{term "lexer (der c r) s"} @{text of} + @{term "None"} @{text "\"} @{term None}\\ + & & \hspace{24mm}$|\;$ @{term "Some v"} @{text "\"} @{term "Some (injval r c v)"} + \end{tabular} + \end{center} + + +We have shown in our earlier paper \cite{AusafDyckhoffUrban2016} that +this algorithm is correct, that is it generates POSIX values. The +central property we established relates the derivative operation to the +injection function. + + \begin{proposition}\label{Posix2} + \textit{If} $(s,\; r\backslash c) \rightarrow v$ \textit{then} $(c :: s,\; r) \rightarrow$ \textit{inj} $r\; c\; v$. +\end{proposition} + + \noindent + With this in place we were able to prove: + + + \begin{proposition}\mbox{}\smallskip\\\label{lexercorrect} + \begin{tabular}{ll} + (1) & @{thm (lhs) lexer_correct_None} if and only if @{thm (rhs) lexer_correct_None}\\ + (2) & @{thm (lhs) lexer_correct_Some} if and only if @{thm (rhs) lexer_correct_Some}\\ + \end{tabular} + \end{proposition} + + \noindent + In fact we have shown that, in the success case, the generated POSIX value $v$ is + unique and in the failure case that there is no POSIX value $v$ that satisfies + $(s, r) \rightarrow v$. While the algorithm is correct, it is excruciatingly + slow in cases where the derivatives grow arbitrarily (recall the example from the + Introduction). However it can be used as a convenient reference point for the correctness + proof of the second algorithm by Sulzmann and Lu, which we shall describe next. + +*} + +section {* Bitcoded Regular Expressions and Derivatives *} + +text {* + + In the second part of their paper \cite{Sulzmann2014}, + Sulzmann and Lu describe another algorithm that also generates POSIX + values but dispenses with the second phase where characters are + injected ``back'' into values. For this they annotate bitcodes to + regular expressions, which we define in Isabelle/HOL as the datatype + + \begin{center} + @{term breg} $\;::=\;$ @{term "AZERO"} + $\;\mid\;$ @{term "AONE bs"} + $\;\mid\;$ @{term "ACHAR bs c"} + $\;\mid\;$ @{term "AALTs bs rs"} + $\;\mid\;$ @{term "ASEQ bs r\<^sub>1 r\<^sub>2"} + $\;\mid\;$ @{term "ASTAR bs r"} + \end{center} + + \noindent where @{text bs} stands for bitsequences; @{text r}, + @{text "r\<^sub>1"} and @{text "r\<^sub>2"} for bitcoded regular + expressions; and @{text rs} for lists of bitcoded regular + expressions. The binary alternative @{text "ALT bs r\<^sub>1 r\<^sub>2"} + is just an abbreviation for \mbox{@{text "ALTs bs [r\<^sub>1, r\<^sub>2]"}}. + For bitsequences we use lists made up of the + constants @{text Z} and @{text S}. The idea with bitcoded regular + expressions is to incrementally generate the value information (for + example @{text Left} and @{text Right}) as bitsequences. For this + Sulzmann and Lu define a coding + function for how values can be coded into bitsequences. 
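+
+\noindent
+Rendered in Haskell, the bitcoded regular expressions and this coding
+function might look as follows (a sketch continuing the earlier ones; the
+bits @{text Z} and @{text S} become constructors of a hypothetical
+\texttt{Bit} type):
+
+\begin{verbatim}
+data Bit = Z | S deriving (Eq, Show)
+
+data ARexp = AZERO | AONE [Bit] | ACHAR [Bit] Char
+           | AALTS [Bit] [ARexp] | ASEQ [Bit] ARexp ARexp
+           | ASTAR [Bit] ARexp
+           deriving (Eq, Show)
+
+-- coding values into bitsequences; characters and
+-- sequences are not recorded explicitly ("lossy")
+code :: Val -> [Bit]
+code Empty          = []
+code (Chr _)        = []
+code (Left' v)      = Z : code v
+code (Right' v)     = S : code v
+code (Sq v1 v2)     = code v1 ++ code v2
+code (Stars [])     = [S]
+code (Stars (v:vs)) = Z : code v ++ code (Stars vs)
+\end{verbatim}
+
+\noindent
+The Isabelle/HOL definition of the coding function is: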
+ + \begin{center} + \begin{tabular}{cc} + \begin{tabular}{lcl} + @{thm (lhs) code.simps(1)} & $\dn$ & @{thm (rhs) code.simps(1)}\\ + @{thm (lhs) code.simps(2)} & $\dn$ & @{thm (rhs) code.simps(2)}\\ + @{thm (lhs) code.simps(3)} & $\dn$ & @{thm (rhs) code.simps(3)}\\ + @{thm (lhs) code.simps(4)} & $\dn$ & @{thm (rhs) code.simps(4)} + \end{tabular} + & + \begin{tabular}{lcl} + @{thm (lhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]} & $\dn$ & @{thm (rhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]}\\ + @{thm (lhs) code.simps(6)} & $\dn$ & @{thm (rhs) code.simps(6)}\\ + @{thm (lhs) code.simps(7)} & $\dn$ & @{thm (rhs) code.simps(7)}\\ + \mbox{\phantom{XX}}\\ + \end{tabular} + \end{tabular} + \end{center} + + \noindent + As can be seen, this coding is ``lossy'' in the sense that we do not + record explicitly character values and also not sequence values (for + them we just append two bitsequences). However, the + different alternatives for @{text Left}, respectively @{text Right}, are recorded as @{text Z} and + @{text S} followed by some bitsequence. Similarly, we use @{text Z} to indicate + if there is still a value coming in the list of @{text Stars}, whereas @{text S} + indicates the end of the list. The lossiness makes the process of + decoding a bit more involved, but the point is that if we have a + regular expression \emph{and} a bitsequence of a corresponding value, + then we can always decode the value accurately (see Fig.~\ref{decode}). + The function \textit{decode} checks whether all of the bitsequence is + consumed and returns the corresponding value as @{term "Some v"}; otherwise + it fails with @{text "None"}. We can establish that for a value $v$ + inhabited by a regular expression $r$, the decoding of its + bitsequence never fails. + + %The decoding can be + %defined by using two functions called $\textit{decode}'$ and + %\textit{decode}: + + \begin{figure} + \begin{center} + \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}} + \multicolumn{3}{@ {}l}{$\textit{decode}'\,bs\,(\ONE)$ $\;\dn\;$ $(\Empty, bs)$ \quad\qquad + $\textit{decode}'\,bs\,(c)$ $\;\dn\;$ $(\Char\,c, bs)$}\\ + $\textit{decode}'\,(\Z\!::\!bs)\;(r_1 + r_2)$ & $\dn$ & + $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}\; + (\Left\,v, bs_1)$\\ + $\textit{decode}'\,(\S\!::\!bs)\;(r_1 + r_2)$ & $\dn$ & + $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_2\;\textit{in}\; + (\Right\,v, bs_1)$\\ + $\textit{decode}'\,bs\;(r_1\cdot r_2)$ & $\dn$ & + $\textit{let}\,(v_1, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}$\\ + & & $\textit{let}\,(v_2, bs_2) = \textit{decode}'\,bs_1\,r_2$ + \hspace{2mm}$\textit{in}\;(\Seq\,v_1\,v_2, bs_2)$\\ + $\textit{decode}'\,(\Z\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\ + $\textit{decode}'\,(\S\!::\!bs)\,(r^*)$ & $\dn$ & + $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r\;\textit{in}$\\ + & & $\textit{let}\,(\Stars\,vs, bs_2) = \textit{decode}'\,bs_1\,r^*$ + \hspace{2mm}$\textit{in}\;(\Stars\,v\!::\!vs, bs_2)$\medskip\\ + \multicolumn{3}{l}{$\textit{decode}\,bs\,r$ $\dn$ + $\textit{let}\,(v, bs') = \textit{decode}'\,bs\,r\;\textit{in}$ + $\;\,\textit{if}\;bs' = []\;\textit{then}\;\textit{Some}\,v\; + \textit{else}\;\textit{None}$}\\[-4mm] + \end{tabular} + \end{center} + \caption{Two functions, called $\textit{decode}'$ and \textit{decode}, for decoding a value from a bitsequence with the help of a regular expression.\\[-5mm]}\label{decode} + \end{figure} + + %\noindent + %The function \textit{decode} checks whether all of the bitsequence is + %consumed and 
returns the corresponding value as @{term "Some v"}; otherwise + %it fails with @{text "None"}. We can establish that for a value $v$ + %inhabited by a regular expression $r$, the decoding of its + %bitsequence never fails. + +\begin{lemma}\label{codedecode}\it + If $\;\vdash v : r$ then + $\;\textit{decode}\,(\textit{code}\, v)\,r = \textit{Some}\, v$. +\end{lemma} + + + + Sulzmann and Lu define the function \emph{internalise} + in order to transform (standard) regular expressions into annotated + regular expressions. We write this operation as $r^\uparrow$. + This internalisation uses the following + \emph{fuse} function. + % + \begin{center} + \begin{tabular}{@ {}cc@ {}} + \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l} + $\textit{fuse}\,bs\,(\textit{ZERO})$ & $\dn$ & $\textit{ZERO}$\\ + $\textit{fuse}\,bs\,(\textit{ONE}\,bs')$ & $\dn$ & + $\textit{ONE}\,(bs\,@\,bs')$\\ + $\textit{fuse}\,bs\,(\textit{CHAR}\,bs'\,c)$ & $\dn$ & + $\textit{CHAR}\,(bs\,@\,bs')\,c$ + \end{tabular} + & + \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l} + $\textit{fuse}\,bs\,(\textit{ALTs}\,bs'\,rs)$ & $\dn$ & + $\textit{ALTs}\,(bs\,@\,bs')\,rs$\\ + $\textit{fuse}\,bs\,(\textit{SEQ}\,bs'\,r_1\,r_2)$ & $\dn$ & + $\textit{SEQ}\,(bs\,@\,bs')\,r_1\,r_2$\\ + $\textit{fuse}\,bs\,(\textit{STAR}\,bs'\,r)$ & $\dn$ & + $\textit{STAR}\,(bs\,@\,bs')\,r$ + \end{tabular} + \end{tabular} + \end{center} + + \noindent + A regular expression can then be \emph{internalised} into a bitcoded + regular expression as follows: + % + \begin{center} + \begin{tabular}{@ {}ccc@ {}} + \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l} + $(\ZERO)^\uparrow$ & $\dn$ & $\textit{ZERO}$\\ + $(\ONE)^\uparrow$ & $\dn$ & $\textit{ONE}\,[]$ + \end{tabular} + & + \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l} + $(c)^\uparrow$ & $\dn$ & $\textit{CHAR}\,[]\,c$\\ + $(r^*)^\uparrow$ & $\dn$ & $\textit{STAR}\;[]\,r^\uparrow$ + \end{tabular} + & + \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l} + $(r_1 + r_2)^\uparrow$ & $\dn$ & + $\textit{ALT}\;[]\,(\textit{fuse}\,[\Z]\,r_1^\uparrow)\, + (\textit{fuse}\,[\S]\,r_2^\uparrow)$\\ + $(r_1\cdot r_2)^\uparrow$ & $\dn$ & + $\textit{SEQ}\;[]\,r_1^\uparrow\,r_2^\uparrow$ + \end{tabular} + \end{tabular} + \end{center} + + \noindent + There is also an \emph{erase}-function, written $r^\downarrow$, which + transforms a bitcoded regular expression into a (standard) regular + expression by just erasing the annotated bitsequences. We omit the + straightforward definition. For defining the algorithm, we also need + the functions \textit{bnullable} and \textit{bmkeps}(\textit{s}), which are the + ``lifted'' versions of \textit{nullable} and \textit{mkeps} acting on + bitcoded regular expressions. + % + \begin{center} + \begin{tabular}{@ {}c@ {\hspace{2mm}}c@ {}} + \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l} + $\textit{bnullable}\,(\textit{ZERO})$ & $\dn$ & $\textit{False}$\\ + $\textit{bnullable}\,(\textit{ONE}\,bs)$ & $\dn$ & $\textit{True}$\\ + $\textit{bnullable}\,(\textit{CHAR}\,bs\,c)$ & $\dn$ & $\textit{False}$\\ + $\textit{bnullable}\,(\textit{ALTs}\,bs\,\rs)$ & $\dn$ & + $\exists\, r \in \rs. 
\,\textit{bnullable}\,r$\\ + $\textit{bnullable}\,(\textit{SEQ}\,bs\,r_1\,r_2)$ & $\dn$ & + $\textit{bnullable}\,r_1\wedge \textit{bnullable}\,r_2$\\ + $\textit{bnullable}\,(\textit{STAR}\,bs\,r)$ & $\dn$ & + $\textit{True}$ + \end{tabular} + & + \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}} + $\textit{bmkeps}\,(\textit{ONE}\,bs)$ & $\dn$ & $bs$\\ + $\textit{bmkeps}\,(\textit{ALTs}\,bs\,\rs)$ & $\dn$ & + $bs\,@\,\textit{bmkepss}\,\rs$\\ + $\textit{bmkeps}\,(\textit{SEQ}\,bs\,r_1\,r_2)$ & $\dn$ &\\ + \multicolumn{3}{r}{$bs \,@\,\textit{bmkeps}\,r_1\,@\, \textit{bmkeps}\,r_2$}\\ + $\textit{bmkeps}\,(\textit{STAR}\,bs\,r)$ & $\dn$ & + $bs \,@\, [\S]$\\ + $\textit{bmkepss}\,(r\!::\!\rs)$ & $\dn$ & + $\textit{if}\;\textit{bnullable}\,r$\\ + & &$\textit{then}\;\textit{bmkeps}\,r$\\ + & &$\textit{else}\;\textit{bmkepss}\,\rs$ + \end{tabular} + \end{tabular} + \end{center} + + + \noindent + The key function in the bitcoded algorithm is the derivative of a + bitcoded regular expression. This derivative function calculates the + derivative but at the same time also the incremental part of the bitsequences + that contribute to constructing a POSIX value. + % + \begin{center} + \begin{tabular}{@ {}lcl@ {}} + \multicolumn{3}{@ {}l}{$(\textit{ZERO})\backslash c$ $\;\dn\;$ $\textit{ZERO}$ \quad\qquad + $(\textit{ONE}\;bs)\backslash c$ $\;\dn\;$ $\textit{ZERO}$}\\ + $(\textit{CHAR}\;bs\,d)\backslash c$ & $\dn$ & + $\textit{if}\;c=d\; \;\textit{then}\; + \textit{ONE}\;bs\;\textit{else}\;\textit{ZERO}$\\ + $(\textit{ALTs}\;bs\,\rs)\backslash c$ & $\dn$ & + $\textit{ALTs}\,bs\,(\mathit{map}\,(\_\backslash c)\,\rs)$\\ + $(\textit{SEQ}\;bs\,r_1\,r_2)\backslash c$ & $\dn$ & + $\textit{if}\;\textit{bnullable}\,r_1$\\ + & &$\textit{then}\;\textit{ALT}\,bs\;(\textit{SEQ}\,[]\,(r_1\backslash c)\,r_2)$ + $\;(\textit{fuse}\,(\textit{bmkeps}\,r_1)\,(r_2\backslash c))$\\ + & &$\textit{else}\;\textit{SEQ}\,bs\,(r_1\backslash c)\,r_2$\\ + $(\textit{STAR}\,bs\,r)\backslash c$ & $\dn$ & + $\textit{SEQ}\;bs\,(\textit{fuse}\, [\Z] (r\backslash c))\, + (\textit{STAR}\,[]\,r)$ + \end{tabular} + \end{center} + + + \noindent + This function can also be extended to strings, written $r\backslash s$, + just like the standard derivative. We omit the details. Finally we + can define Sulzmann and Lu's bitcoded lexer, which we call \textit{blexer}: + % + \begin{center} +\begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}} + $\textit{blexer}\;r\,s$ & $\dn$ & + $\textit{let}\;r_{der} = (r^\uparrow)\backslash s\;\textit{in}$ + $\;\;\textit{if}\; \textit{bnullable}(r_{der}) \;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,r_{der})\,r + \;\;\textit{else}\;\textit{None}$ +\end{tabular} +\end{center} + + \noindent +This bitcoded lexer first internalises the regular expression $r$ and then +builds the bitcoded derivative according to $s$. If the derivative is +(b)nullable the string is in the language of $r$ and it extracts the bitsequence using the +$\textit{bmkeps}$ function. Finally it decodes the bitsequence into a value. If +the derivative is \emph{not} nullable, then $\textit{None}$ is +returned. We can show that this way of calculating a value +generates the same result as \textit{lexer}. + +Before we can proceed we need to define a helper function, called +\textit{retrieve}, which Sulzmann and Lu introduced for the correctness proof. 
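+
+\noindent
+Before doing so, we note that the bitcoded part of the algorithm is again
+directly executable. A minimal Haskell sketch of the operations defined in
+this section is given below (it assumes the \texttt{ARexp} and \texttt{Bit}
+declarations from the earlier sketch; \texttt{intern} and \texttt{decode}
+are omitted):
+
+\begin{verbatim}
+fuse :: [Bit] -> ARexp -> ARexp
+fuse _  AZERO            = AZERO
+fuse bs (AONE bs')       = AONE (bs ++ bs')
+fuse bs (ACHAR bs' c)    = ACHAR (bs ++ bs') c
+fuse bs (AALTS bs' rs)   = AALTS (bs ++ bs') rs
+fuse bs (ASEQ bs' r1 r2) = ASEQ (bs ++ bs') r1 r2
+fuse bs (ASTAR bs' r)    = ASTAR (bs ++ bs') r
+
+bnullable :: ARexp -> Bool
+bnullable AZERO          = False
+bnullable (AONE _)       = True
+bnullable (ACHAR _ _)    = False
+bnullable (AALTS _ rs)   = any bnullable rs
+bnullable (ASEQ _ r1 r2) = bnullable r1 && bnullable r2
+bnullable (ASTAR _ _)    = True
+
+-- both functions are partial: they are only ever
+-- applied to bnullable expressions
+bmkeps :: ARexp -> [Bit]
+bmkeps (AONE bs)       = bs
+bmkeps (AALTS bs rs)   = bs ++ bmkepss rs
+bmkeps (ASEQ bs r1 r2) = bs ++ bmkeps r1 ++ bmkeps r2
+bmkeps (ASTAR bs _)    = bs ++ [S]
+
+bmkepss :: [ARexp] -> [Bit]
+bmkepss (r:rs) = if bnullable r then bmkeps r else bmkepss rs
+
+bder :: Char -> ARexp -> ARexp
+bder _ AZERO           = AZERO
+bder _ (AONE _)        = AZERO
+bder c (ACHAR bs d)    = if c == d then AONE bs else AZERO
+bder c (AALTS bs rs)   = AALTS bs (map (bder c) rs)
+bder c (ASEQ bs r1 r2) =
+  if bnullable r1
+  then AALTS bs [ASEQ [] (bder c r1) r2,
+                 fuse (bmkeps r1) (bder c r2)]
+  else ASEQ bs (bder c r1) r2
+bder c (ASTAR bs r) = ASEQ bs (fuse [Z] (bder c r)) (ASTAR [] r)
+\end{verbatim}
+
+\noindent
+The definition of \textit{retrieve} is as follows: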
+% +\begin{center} + \begin{tabular}{lcl} + @{thm (lhs) retrieve.simps(1)} & $\dn$ & @{thm (rhs) retrieve.simps(1)}\\ + @{thm (lhs) retrieve.simps(2)} & $\dn$ & @{thm (rhs) retrieve.simps(2)}\\ + @{thm (lhs) retrieve.simps(3)} & $\dn$ & @{thm (rhs) retrieve.simps(3)}\\ + @{thm (lhs) better_retrieve(1)} & $\dn$ & @{thm (rhs) better_retrieve(1)}\\ + @{thm (lhs) better_retrieve(2)} & $\dn$ & @{thm (rhs) better_retrieve(2)}\\ + @{thm (lhs) retrieve.simps(6)[of _ "r\<^sub>1" "r\<^sub>2" "v\<^sub>1" "v\<^sub>2"]} + & $\dn$ & @{thm (rhs) retrieve.simps(6)[of _ "r\<^sub>1" "r\<^sub>2" "v\<^sub>1" "v\<^sub>2"]}\\ + @{thm (lhs) retrieve.simps(7)} & $\dn$ & @{thm (rhs) retrieve.simps(7)}\\ + @{thm (lhs) retrieve.simps(8)} & $\dn$ & @{thm (rhs) retrieve.simps(8)} + \end{tabular} + \end{center} + +\noindent +The idea behind this function is to retrieve a possibly partial +bitsequence from a bitcoded regular expression, where the retrieval is +guided by a value. For example if the value is $\Left$ then we +descend into the left-hand side of an alternative in order to +assemble the bitcode. Similarly for +$\Right$. The property we can show is that for a given $v$ and $r$ +with $\vdash v : r$, the retrieved bitsequence from the internalised +regular expression is equal to the bitcoded version of $v$. + +\begin{lemma}\label{retrievecode} +If $\vdash v : r$ then $\textit{code}\, v = \textit{retrieve}\,(r^\uparrow)\,v$. +\end{lemma} + +\noindent +We also need some auxiliary facts about how the bitcoded operations +relate to the ``standard'' operations on regular expressions. For +example if we build a bitcoded derivative and erase the result, this +is the same as if we first erase the bitcoded regular expression and +then perform the ``standard'' derivative operation. + +\begin{lemma}\label{bnullable}\mbox{}\smallskip\\ + \begin{tabular}{ll} +\textit{(1)} & $(r\backslash s)^\downarrow = (r^\downarrow)\backslash s$\\ +\textit{(2)} & $\textit{bnullable}(r)$ iff $\textit{nullable}(r^\downarrow)$\\ +\textit{(3)} & $\textit{bmkeps}(r) = \textit{retrieve}\,r\,(\textit{mkeps}\,(r^\downarrow))$ provided $\textit{nullable}(r^\downarrow)$. +\end{tabular} +\end{lemma} + +%\begin{proof} +% All properties are by induction on annotated regular expressions. +% %There are no interesting cases. +%\end{proof} + +\noindent +The only difficulty left for the correctness proof is that the bitcoded algorithm +has only a ``forward phase'' where POSIX values are generated incrementally. +We can achieve the same effect with @{text lexer} (which has two phases) by stacking up injection +functions during the forward phase. An auxiliary function, called $\textit{flex}$, +allows us to recast the rules of $\lexer$ in terms of a single +phase and stacked up injection functions. +% +\begin{center} +\begin{tabular}{@ {}l@ {}c@ {}l@ {\hspace{10mm}}l@ {}c@ {}l@ {}} + $\textit{flex}\;r\,f\,[]$ & $\dn$ & $f$ & + $\textit{flex}\;r\,f\,(c\!::\!s)$ & $\dn$ & + $\textit{flex}\,(r\backslash c)\,(\lambda v.\,f\,(\inj\,r\,c\,v))\,s$ +\end{tabular} +\end{center} + +\noindent +The point of this function is that when +reaching the end of the string, we just need to apply the stacked up +injection functions to the value generated by @{text mkeps}. +Using this function we can recast the success case in @{text lexer} +as follows: + +\begin{lemma}\label{flex} +If @{text "lexer r s = Some v"} \;then\; @{text "v = "}$\,\textit{flex}\,r\,id\,s\, + (\mkeps (r\backslash s))$. 
+\end{lemma}
+
+\noindent
+Note that we did not redefine \textit{lexer}; we just established that the
+value generated by \textit{lexer} can also be obtained by a different
+method. While this different method is not efficient (we essentially
+need to traverse the string $s$ twice, once for building the
+derivative $r\backslash s$ and another time for stacking up injection
+functions), it helps us to prove
+that incrementally building up values in @{text blexer} generates the same result.
+
+This brings us to our main lemma in this section: if we calculate a
+derivative, say $r\backslash s$, and have a value, say $v$, inhabiting
+this derivative, then we can produce the result @{text lexer} generates
+by applying this value to the stacked-up injection functions
+that $\textit{flex}$ assembles. The lemma establishes that this is the same
+value as if we build the annotated derivative $r^\uparrow\backslash s$
+and then retrieve the corresponding bitcoded version, followed by a
+decoding step.
+
+\begin{lemma}[Main Lemma]\label{mainlemma}\it
+If $\vdash v : r\backslash s$ then
+$\textit{Some}\,(\textit{flex}\,r\,\textit{id}\,s\,v) =
+ \textit{decode}(\textit{retrieve}\,(r^\uparrow \backslash s)\,v)\,r$
+\end{lemma}
+
+
+
+\noindent
+%With this lemma in place,
+We can then prove the correctness of \textit{blexer}---it indeed
+produces the same result as \textit{lexer}.
+
+
+\begin{theorem}\label{thmone}
+$\textit{lexer}\,r\,s = \textit{blexer}\,r\,s$
+\end{theorem}
+
+
+
+\noindent This establishes that the bitcoded algorithm \emph{without}
+simplification produces correct results. This was
+only conjectured by Sulzmann and Lu in their paper
+\cite{Sulzmann2014}. The next step is to add simplifications.
+
+*}
+
+
+section {* Simplification *}
+
+text {*
+
+ Derivatives as calculated by Brzozowski's method are usually more
+ complex regular expressions than the initial one; the result is
+ that derivative-based matching and lexing algorithms are
+ often abysmally slow if the ``growth problem'' is not addressed. As Sulzmann and Lu wrote, various
+ optimisations are possible, such as the simplifications
+ $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r \Rightarrow r$,
+ $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow r$. While these
+ simplifications can considerably speed up the two algorithms in many
+ cases, they do not fundamentally solve the growth problem with
+ derivatives. To see this let us return to the example from the
+ Introduction that shows the derivatives for \mbox{@{text "(a + aa)\<^sup>*"}}.
+ If we delete in the 3rd step all $\ZERO{}s$ and $\ONE$s according to
+ the simplification rules shown above we obtain
+ %
+ %%
+ %
+ \begin{equation}\def\xll{\xrightarrow{\_\backslash{} [a, a, a]}}\label{derivex}
+ (a + aa)^* \quad\xll\quad
+ \underbrace{\mbox{$(\ONE + a) \cdot (a + aa)^*$}}_{r} \;+\;
+ ((a + aa)^* + \underbrace{\mbox{$(\ONE + a) \cdot (a + aa)^*$}}_{r})
+ \end{equation}
+
+ \noindent This is a simpler derivative, but unfortunately we
+ cannot make any further simplifications. This is a problem because
+ the outermost alternative contains two copies of the same
+ regular expression (underlined with $r$). These copies will
+ spawn new copies in later derivative steps and they in turn
+ spawn even more copies. This
+ destroys any hope of taming the size of the derivatives. But the
+ second copy of $r$ in \eqref{derivex} will never contribute to a
+ value, because POSIX lexing will always prefer matching a string
+ with the first copy.
So it could be safely removed without affecting the correctness of the algorithm.
+ The dilemma with the simple-minded
+ simplification rules above is that the rule $r + r \Rightarrow r$
+ will never be applicable because, as can be seen in this example, the
+ regular expressions are not next to each other but separated by another regular expression.
+
+ But here is where Sulzmann and Lu's representation of generalised
+ alternatives in the bitcoded algorithm shines: in @{term
+ "ALTs bs rs"} we can define a more aggressive simplification by
+ recursively simplifying all regular expressions in @{text rs} and
+ then analysing the resulting list and removing any duplicates.
+ Another advantage of the bitsequences in bitcoded regular
+ expressions is that they can be easily modified such that simplification does not
+ interfere with the value constructions. For example we can ``flatten'', or
+ de-nest, or spill out, @{text ALTs} as follows
+ %
+ \[
+ @{term "ALTs bs\<^sub>1 ((ALTs bs\<^sub>2 rs\<^sub>2) # rs\<^sub>1)"}
+ \quad\xrightarrow{bsimp}\quad
+ @{term "ALTs bs\<^sub>1 ((map (fuse bs\<^sub>2) rs\<^sub>2) # rs\<^sub>1)"}
+ \]
+
+ \noindent
+ where we just need to fuse the bitsequence that has accumulated in @{text "bs\<^sub>2"}
+ to the alternatives in @{text "rs\<^sub>2"}. As we shall show below, this will
+ ensure that the correct value corresponding to the original (unsimplified)
+ regular expression can still be extracted. %In this way the value construction
+ %is not affected by simplification.
+
+ However there is one problem with the definition of the more
+ aggressive simplification rules described by Sulzmann and Lu. Recasting
+ their definition with our syntax they define the step of removing
+ duplicates as
+ %
+ \[ @{text "bsimp (ALTs bs rs)"} \dn @{text "ALTs
+ bs (nub (map bsimp rs))"}
+ \]
+
+ \noindent where they first recursively simplify the regular
+ expressions in @{text rs} (using @{text map}) and then use
+ Haskell's @{text nub}-function to remove potential
+ duplicates. While this makes sense when considering the example
+ shown in \eqref{derivex}, @{text nub} is inappropriate
+ in the case of bitcoded regular expressions. The reason
+ is that in general the elements in @{text rs} will have a
+ different annotated bitsequence and in this way @{text nub}
+ will never find a duplicate to be removed. One correct way to
+ handle this situation is to first \emph{erase} the regular
+ expressions when comparing potential duplicates. This is inspired
+ by Scala's list functions of the form \mbox{@{text "distinctWith rs eq
+ acc"}} where @{text eq} is a user-defined equivalence relation that
+ compares two elements in @{text rs}. We define this function in
+ Isabelle/HOL as
+
+ \begin{center}
+ \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {\hspace{1mm}}l@ {}}
+ @{thm (lhs) distinctWith.simps(1)} & $\dn$ & @{thm (rhs) distinctWith.simps(1)}\\
+ @{thm (lhs) distinctWith.simps(2)} & $\dn$ &
+ @{text "if (\<exists> y \<in> acc. eq x y)"} & @{text "then distinctWith xs eq acc"}\\
+ & & & @{text "else x :: distinctWith xs eq ({x} \<union> acc)"}
+ \end{tabular}
+ \end{center}
+
+ \noindent where we scan the list from left to right (because we
+ have to remove later copies). In @{text distinctWith}, @{text eq} is intended to be an
+ equivalence relation for annotated regular expressions and @{text acc} is an accumulator for annotated regular
+ expressions---essentially a set of regular expressions that we have already seen
+ while scanning the list.
Therefore we delete an element, say @{text x},
+ from the list provided an element @{text "y"} that is equivalent to @{text x} is already in the accumulator;
+ otherwise we keep @{text x} and scan the rest of the list but
+ add @{text "x"} as another ``seen'' element to @{text acc}. We will use
+ @{term distinctWith} where @{text eq} is an equivalence that deletes bitsequences from bitcoded regular expressions
+ before comparing the components. One way to define this in Isabelle/HOL is by the following recursive function from
+ annotated regular expressions to @{text bool}:
+ %
+ \begin{center}
+ \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {\hspace{1mm}}l@ {}}
+ @{thm (lhs) eq1.simps(1)} & $\dn$ & @{thm (rhs) eq1.simps(1)}\\
+ @{thm (lhs) eq1.simps(2)[of DUMMY DUMMY]} & $\dn$ & @{thm (rhs) eq1.simps(2)[of DUMMY DUMMY]}\\
+ @{thm (lhs) eq1.simps(3)[of DUMMY "c" DUMMY "d"]} & $\dn$ & @{thm (rhs) eq1.simps(3)[of DUMMY "c" DUMMY "d"]}\\
+ @{thm (lhs) eq1.simps(4)[of DUMMY "r\<^sub>1\<^sub>1" "r\<^sub>1\<^sub>2" DUMMY "r\<^sub>2\<^sub>1" "r\<^sub>2\<^sub>2"]} & $\dn$ &
+ @{thm (rhs) eq1.simps(4)[of DUMMY "r\<^sub>1\<^sub>1" "r\<^sub>1\<^sub>2" DUMMY "r\<^sub>2\<^sub>1" "r\<^sub>2\<^sub>2"]}\\
+ @{thm (lhs) eq1.simps(5)[of DUMMY DUMMY]} & $\dn$ & @{thm (rhs) eq1.simps(5)[of DUMMY DUMMY]}\\
+ @{thm (lhs) eq1.simps(6)[of DUMMY "r\<^sub>1" "rs\<^sub>1" DUMMY "r\<^sub>2" "rs\<^sub>2"]} & $\dn$ &
+ @{thm (rhs) eq1.simps(6)[of DUMMY "r\<^sub>1" "rs\<^sub>1" DUMMY "r\<^sub>2" "rs\<^sub>2"]}\\
+ @{thm (lhs) eq1.simps(7)[of DUMMY "r\<^sub>1" DUMMY "r\<^sub>2"]} & $\dn$ & @{thm (rhs) eq1.simps(7)[of DUMMY "r\<^sub>1" DUMMY "r\<^sub>2"]}\\
+ \end{tabular}
+ \end{center}
+
+ \noindent
+ where all other cases are set to @{text False}.
+ This equivalence is clearly a computationally more expensive operation than @{text nub},
+ but is needed in order to make the removal of unnecessary copies
+ work properly.
+
+ Our simplification function depends on three more helper functions, one of which is called
+ @{text flts}. It analyses lists of regular expressions coming from alternatives
+ and is defined as follows:
+
+ \begin{center}
+ \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
+ @{thm (lhs) flts.simps(1)} & $\dn$ & @{thm (rhs) flts.simps(1)}\\
+ @{thm (lhs) flts.simps(2)} & $\dn$ & @{thm (rhs) flts.simps(2)}\\
+ @{thm (lhs) flts.simps(3)[of "bs'" "rs'"]} & $\dn$ & @{thm (rhs) flts.simps(3)[of "bs'" "rs'"]}\\
+ \end{tabular}
+ \end{center}
+
+ \noindent
+ The second clause of @{text flts} removes all instances of @{text ZERO} in alternatives and
+ the third ``de-nests'' alternatives (while retaining the
+ bitsequence @{text "bs'"} accumulated in the inner alternative). There are
+ some corner cases to be considered when the resulting list inside an alternative is
+ empty or a singleton list.
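+
+\noindent
+In Haskell, @{text distinctWith} and @{text flts} can be sketched as
+below (continuing the earlier sketches; we use a list rather than a set as
+accumulator, and the equivalence is passed in as a parameter):
+
+\begin{verbatim}
+distinctWith :: [ARexp] -> (ARexp -> ARexp -> Bool)
+                -> [ARexp] -> [ARexp]
+distinctWith [] _ _ = []
+distinctWith (x:xs) eq acc
+  | any (eq x) acc = distinctWith xs eq acc
+  | otherwise      = x : distinctWith xs eq (x : acc)
+
+-- remove ZEROs and de-nest nested alternatives
+flts :: [ARexp] -> [ARexp]
+flts []                   = []
+flts (AZERO : rs)         = flts rs
+flts (AALTS bs' rs' : rs) = map (fuse bs') rs' ++ flts rs
+flts (r : rs)             = r : flts rs
+\end{verbatim}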
We take care of these corner cases in the
+ @{text "bsimpALTs"} function; similarly we define a helper function that simplifies
+ sequences according to the usual rules about @{text ZERO}s and @{text ONE}s:
+
+ \begin{center}
+ \begin{tabular}{c@ {\hspace{5mm}}c}
+ \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
+ @{text "bsimpALTs bs []"} & $\dn$ & @{text "ZERO"}\\
+ @{text "bsimpALTs bs [r]"} & $\dn$ & @{text "fuse bs r"}\\
+ @{text "bsimpALTs bs rs"} & $\dn$ & @{text "ALTs bs rs"}\\
+ \mbox{}\\
+ \end{tabular}
+ &
+ \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
+ @{text "bsimpSEQ bs _ ZERO"} & $\dn$ & @{text "ZERO"}\\
+ @{text "bsimpSEQ bs ZERO _"} & $\dn$ & @{text "ZERO"}\\
+ @{text "bsimpSEQ bs\<^sub>1 (ONE bs\<^sub>2) r\<^sub>2"}
+ & $\dn$ & @{text "fuse (bs\<^sub>1 @ bs\<^sub>2) r\<^sub>2"}\\
+ @{text "bsimpSEQ bs r\<^sub>1 r\<^sub>2"} & $\dn$ & @{text "SEQ bs r\<^sub>1 r\<^sub>2"}
+ \end{tabular}
+ \end{tabular}
+ \end{center}
+
+ \noindent
+ With this in place we can define our simplification function as
+
+ \begin{center}
+ \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
+ @{thm (lhs) bsimp.simps(1)[of "bs" "r\<^sub>1" "r\<^sub>2"]} & $\dn$ &
+ @{thm (rhs) bsimp.simps(1)[of "bs" "r\<^sub>1" "r\<^sub>2"]}\\
+ @{thm (lhs) bsimp.simps(2)[of "bs" _]} & $\dn$ & @{text "bsimpALTs bs (distinctWith (flts (map bsimp rs)) \<approx> \<emptyset>)"}\\
+ @{text "bsimp r"} & $\dn$ & @{text r}
+ \end{tabular}
+ \end{center}
+
+ \noindent
+ We believe our recursive function @{term bsimp} simplifies regular
+ expressions as intended by Sulzmann and Lu. There is no point in applying the
+ @{text bsimp} function repeatedly (like the simplification in their paper, which needs to be
+ applied until a fixpoint is reached) because we can show that @{term bsimp} is idempotent,
+ that is
+
+ \begin{proposition}
+ @{term "bsimp (bsimp r) = bsimp r"}
+ \end{proposition}
+
+ \noindent
+ This can be proved by induction on @{text r} but requires a detailed analysis
+ showing that the de-nesting of alternatives always results in a flat list of regular
+ expressions. We omit the details since they do not concern the correctness proof.
+
+ Next we can include simplification after each derivative step leading to the
+ following notion of bitcoded derivatives:
+
+ \begin{center}
+ \begin{tabular}{cc}
+ \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
+ @{thm (lhs) bders_simp.simps(1)} & $\dn$ & @{thm (rhs) bders_simp.simps(1)}
+ \end{tabular}
+ &
+ \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
+ @{thm (lhs) bders_simp.simps(2)} & $\dn$ & @{thm (rhs) bders_simp.simps(2)}
+ \end{tabular}
+ \end{tabular}
+ \end{center}
+
+ \noindent
+ and use it in the improved lexing algorithm defined as
+
+ \begin{center}
+\begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
+ $\textit{blexer}^+\;r\,s$ & $\dn$ &
+ $\textit{let}\;r_{der} = (r^\uparrow)\backslash_{bsimp}\, s\;\textit{in}$
+ $\;\;\textit{if}\; \textit{bnullable}(r_{der}) \;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,r_{der})\,r
+ \;\;\textit{else}\;\textit{None}$
+\end{tabular}
+\end{center}
+
+ \noindent The remaining task is to show that @{term blexer} and
+ @{term "blexer_simp"} generate the same answers.
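+
+\noindent
+To round off the Haskell sketches, the simplification functions and the
+simplifying derivative can be rendered as follows. The formalisation's
+equivalence $\approx$ is more refined; for this sketch a structural
+equality after erasing the bitcodes suffices, and $\textit{blexer}^+$ is
+then \texttt{blexer} with \texttt{bdersSimp} in place of the plain
+derivative:
+
+\begin{verbatim}
+erase :: ARexp -> Rexp
+erase AZERO             = ZERO
+erase (AONE _)          = ONE
+erase (ACHAR _ c)       = CH c
+erase (AALTS _ [])      = ZERO
+erase (AALTS _ [r])     = erase r
+erase (AALTS bs (r:rs)) = ALT (erase r) (erase (AALTS bs rs))
+erase (ASEQ _ r1 r2)    = SEQ (erase r1) (erase r2)
+erase (ASTAR _ r)       = STAR (erase r)
+
+-- compare bitcoded regular expressions up to their bitcodes
+eq :: ARexp -> ARexp -> Bool
+eq r1 r2 = erase r1 == erase r2
+
+bsimpALTs :: [Bit] -> [ARexp] -> ARexp
+bsimpALTs _  []  = AZERO
+bsimpALTs bs [r] = fuse bs r
+bsimpALTs bs rs  = AALTS bs rs
+
+bsimpSEQ :: [Bit] -> ARexp -> ARexp -> ARexp
+bsimpSEQ _ _ AZERO         = AZERO
+bsimpSEQ _ AZERO _         = AZERO
+bsimpSEQ bs1 (AONE bs2) r2 = fuse (bs1 ++ bs2) r2
+bsimpSEQ bs r1 r2          = ASEQ bs r1 r2
+
+bsimp :: ARexp -> ARexp
+bsimp (ASEQ bs r1 r2) = bsimpSEQ bs (bsimp r1) (bsimp r2)
+bsimp (AALTS bs rs)   = bsimpALTs bs
+                          (distinctWith (flts (map bsimp rs)) eq [])
+bsimp r               = r
+
+-- simplification after each derivative step
+bdersSimp :: ARexp -> String -> ARexp
+bdersSimp r s = foldl (\r' c -> bsimp (bder c r')) r s
+\end{verbatim}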
When we first
+ attempted this proof we encountered a problem with the idea
+ in Sulzmann and Lu's paper where the argument seems to be to appeal
+ again to the @{text retrieve}-function defined for the unsimplified version
+ of the algorithm. But
+ this does not work, because desirable properties such as
+ %
+ \[
+ @{text "retrieve r v = retrieve (bsimp r) v"}
+ \]
+
+ \noindent do not hold under simplification---this property
+ essentially purports that we can retrieve the same value from a
+ simplified version of the regular expression. To start with, @{text retrieve}
+ depends on the fact that the value @{text v} corresponds to the
+ structure of the regular expression @{text r}---but the whole point of simplification
+ is to ``destroy'' this structure by making the regular expression simpler.
+ To see this consider the regular expression @{term "r = ALT r' ZERO"} and a corresponding
+ value @{text "v = Left v'"}. If we annotate bitcodes to @{text "r"}, then
+ we can use @{text retrieve} with @{text r} and @{text v} in order to extract a corresponding
+ bitsequence. The reason that this works is that @{text r} is an alternative
+ regular expression and @{text v} a corresponding @{text "Left"}-value. However, if we simplify
+ @{text r}, then @{text v} does not correspond to the shape of the regular
+ expression anymore. So unless one can somehow
+ synchronise the change in the simplified regular expressions with
+ the original POSIX value, there is no hope of appealing to @{text retrieve} in the
+ correctness argument for @{term blexer_simp}.
+
+ We found it more helpful to introduce the rewriting systems shown in
+ Figure~\ref{SimpRewrites}. The idea is to generate
+ simplified regular expressions in small steps (unlike the @{text bsimp}-function which
+ does the same in a big step), and show that each of
+ the small steps preserves the bitcodes that lead to the final POSIX value.
+ The rewrite system is organised such that $\leadsto$ is for bitcoded regular
+ expressions and $\stackrel{s}{\leadsto}$ for lists of bitcoded regular
+ expressions. The former essentially implements the simplifications of
+ @{text "bsimpSEQ"} and @{text flts}, while the latter implements the
+ simplifications in @{text "bsimpALTs"}. We can show that any bitcoded
+ regular expression reduces in zero or more steps to the simplified
+ regular expression generated by @{text bsimp}:
+
+ \begin{lemma}\label{lemone}
+ @{thm[mode=IfThen] rewrites_to_bsimp}
+ \end{lemma}
+
+
+
+ \noindent
+ We can show that this rewrite system preserves @{term bnullable}, that
+ is, simplification does not affect nullability:
+
+ \begin{lemma}\label{lembnull}
+ @{thm[mode=IfThen] bnullable0(1)[of "r\<^sub>1" "r\<^sub>2"]}
+ \end{lemma}
+
+
+
+ \noindent
+ From this, we can show that @{text bmkeps} will produce the same bitsequence
+ as long as one of the bitcoded regular expressions in $\leadsto$ is nullable (this lemma
+ establishes the missing fact we were not able to establish using @{text retrieve}, as suggested
+ in the paper by Sulzmann and Lu).
+
+
+ \begin{lemma}\label{lemthree}
+ @{thm[mode=IfThen] rewrite_bmkeps_aux(1)[of "r\<^sub>1" "r\<^sub>2"]}
+ \end{lemma}
+
+
+
+ \noindent
+ Also crucial is the fact that derivative steps and simplification steps can be interleaved,
+ which is shown by the fact that $\leadsto$ is preserved under derivatives.
+
+ \begin{lemma}\label{bderlem}
+ @{thm[mode=IfThen] rewrite_preserves_bder(1)[of "r\<^sub>1" "r\<^sub>2"]}
+ \end{lemma}
+
+
+
+
+ \noindent
+ Using this fact together with Lemma~\ref{lemone} allows us to prove the central lemma
+ that the unsimplified
+ derivative (with a string @{term s}) reduces to the simplified derivative (with the same string).
+
+  \noindent
+  Using Lemma~\ref{bderlem} together with Lemma~\ref{lemone} allows us to prove the
+  central lemma, namely that the unsimplified
+  derivative (with respect to a string @{term s}) reduces to the simplified derivative
+  (with respect to the same string):
+
+  \begin{lemma}\label{lemtwo}
+  @{thm[mode=IfThen] central}
+  \end{lemma}
+
+  %\begin{proof}
+  %By reverse induction on @{term s} generalising over @{text r}.
+  %\end{proof}
+
+  \noindent
+  With these lemmas in place we can finally establish that @{term "blexer_simp"} and @{term "blexer"}
+  generate the same value, and using Theorem~\ref{thmone} from the previous section that this value
+  is indeed the POSIX value.
+
+  \begin{theorem}
+  @{thm[mode=IfThen] main_blexer_simp}
+  \end{theorem}
+
+  %\begin{proof}
+  %By unfolding the definitions and using Lemmas~\ref{lemtwo} and \ref{lemthree}.
+  %\end{proof}
+
+  \noindent
+  This means that if the algorithm is called with a regular expression @{term r} and a string
+  @{term s} with $@{term s} \in L(@{term r})$, it returns @{term "Some v"} for the
+  @{term v} defined by the POSIX relation $(@{term s}, @{term r}) \rightarrow @{term v}$;
+  otherwise, when $@{term s} \not\in L(@{term r})$ and no such @{text v} exists, it
+  returns @{term "None"}.
+  This completes the correctness proof for the second POSIX lexing algorithm by Sulzmann and Lu.
+  The interesting point of this algorithm is that the sizes of derivatives do not grow
+  arbitrarily big but can be finitely bounded, which
+  we shall show next.
+
+  \begin{figure}[t]
+  \begin{center}
+  \begin{tabular}{@ {\hspace{-8mm}}c@ {}}
+  @{thm[mode=Axiom] bs1[of _ "r\<^sub>2"]}$S\ZERO{}_l$\quad
+  @{thm[mode=Axiom] bs2[of _ "r\<^sub>1"]}$S\ZERO{}_r$\quad
+  @{thm[mode=Axiom] bs3[of "bs\<^sub>1" "bs\<^sub>2"]}$S\ONE$\\
+  @{thm[mode=Rule] bs4[of "r\<^sub>1" "r\<^sub>2" _ "r\<^sub>3"]}$SL$\qquad
+  @{thm[mode=Rule] bs5[of "r\<^sub>3" "r\<^sub>4" _ "r\<^sub>1"]}$SR$\\
+  @{thm[mode=Axiom] bs6}$A0$\quad
+  @{thm[mode=Axiom] bs7}$A1$\quad
+  @{thm[mode=Rule] bs10[of "rs\<^sub>1" "rs\<^sub>2"]}$AL$\\
+  @{thm[mode=Rule] ss2[of "rs\<^sub>1" "rs\<^sub>2"]}$LT$\quad
+  @{thm[mode=Rule] ss3[of "r\<^sub>1" "r\<^sub>2"]}$LH$\quad
+  @{thm[mode=Axiom] ss4}$L\ZERO$\quad
+  @{thm[mode=Axiom] ss5[of "bs" "rs\<^sub>1" "rs\<^sub>2"]}$LS$\medskip\\
+  @{thm[mode=Rule] ss6[of "r\<^sub>2" "r\<^sub>1" "rs\<^sub>1" "rs\<^sub>2" "rs\<^sub>3"]}$LD$\\[-5mm]
+  \end{tabular}
+  \end{center}
+  \caption{The rewrite rules that generate simplified regular expressions
+  in small steps: @{term "rrewrite r\<^sub>1 r\<^sub>2"} is for bitcoded regular
+  expressions and @{term "srewrite rs\<^sub>1 rs\<^sub>2"} for \emph{lists} of bitcoded
+  expressions. The interesting rule is $LD$, which allows copies of regular
+  expressions to be removed provided a regular expression earlier in the list can
+  match the same strings.}\label{SimpRewrites}
+  \end{figure}
+*}
+
+section {* Finiteness of Derivatives *}
+
+text {*
+
+In this section we sketch our argument for why the sizes of the simplified
+derivatives with the aggressive simplification function can be finitely bounded. Suppose
+we have a size function for bitcoded regular expressions, written
+$\llbracket r\rrbracket$, which counts the number of nodes if we regard $r$ as a tree
+(we omit the precise definition; similarly for lists $\llbracket r\!s\rrbracket$). We show that for every $r$
+there exists a bound $N$
+such that
+
+\begin{center}
+$\forall s. \; \llbracket@{term "bders_simp r s"}\rrbracket \leq N$
+\end{center}
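+
+\noindent
+Since we omitted the precise definition of the size function, the following Scala
+sketch records one plausible reading (reusing the @{text ARexp} datatype from the
+sketch in the previous section; note that the bitcode annotations deliberately do
+not count towards the size):
+
+{\small
+\begin{verbatim}
+// number of nodes of an annotated regular expression, ignoring bitcodes;
+// one possible reading of the size function [[_]] omitted above
+def size(r: ARexp): Int = r match {
+  case AZERO           => 1
+  case AONE(_)         => 1
+  case ACHAR(_, _)     => 1
+  case AALTS(_, rs)    => 1 + rs.map(size).sum
+  case ASEQ(_, r1, r2) => 1 + size(r1) + size(r2)
+  case ASTAR(_, r1)    => 1 + size(r1)
+}
+\end{verbatim}}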
+\noindent
+We prove this by induction on $r$. The base cases for @{term AZERO},
+@{term "AONE bs"} and @{term "ACHAR bs c"} are straightforward. The interesting case is
+for sequences of the form @{term "ASEQ bs r\<^sub>1 r\<^sub>2"}. In this case our induction
+hypotheses state $\exists N_1. \forall s. \; \llbracket{}@{term "bders_simp r\<^sub>1 s"}\rrbracket \leq N_1$ and
+$\exists N_2. \forall s. \; \llbracket@{term "bders_simp r\<^sub>2 s"}\rrbracket \leq N_2$. We can reason as follows
+%
+\begin{center}
+\begin{tabular}{lcll}
+& & $ \llbracket@{term "bders_simp (ASEQ bs r\<^sub>1 r\<^sub>2) s"}\rrbracket$\\
+& $ = $ & $\llbracket bsimp\,(\textit{ALTs}\;bs\;((@{term "ASEQ [] (bders_simp r\<^sub>1 s) r\<^sub>2"}) ::
+    [@{term "bders_simp r\<^sub>2 s'"} \;|\; s' \in \textit{Suffix}(@{text r}_1, s)]))\rrbracket $ & (1) \\
+& $\leq$ &
+    $\llbracket\textit{distinctWith}\,((@{term "ASEQ [] (bders_simp r\<^sub>1 s) r\<^sub>2"}) ::
+    [@{term "bders_simp r\<^sub>2 s'"} \;|\; s' \in \textit{Suffix}(@{text r}_1, s)])\,\approx\;@{term "{}"}\rrbracket + 1 $ & (2) \\
+& $\leq$ & $\llbracket@{term "ASEQ [] (bders_simp r\<^sub>1 s) r\<^sub>2"}\rrbracket +
+    \llbracket\textit{distinctWith}\,[@{term "bders_simp r\<^sub>2 s'"} \;|\; s' \in \textit{Suffix}(@{text r}_1, s)]\,\approx\;@{term "{}"}\rrbracket + 1 $ & (3) \\
+& $\leq$ & $N_1 + \llbracket r_2\rrbracket + 2 +
+      \llbracket\textit{distinctWith}\,[@{term "bders_simp r\<^sub>2 s'"} \;|\; s' \in \textit{Suffix}(@{text r}_1, s)]\,\approx\;@{term "{}"}\rrbracket$ & (4)\\
+& $\leq$ & $N_1 + \llbracket r_2\rrbracket + 2 + l_{N_{2}} * N_{2}$ & (5)
+\end{tabular}
+\end{center}
+
+\noindent
+where in (1) the set $\textit{Suffix}(@{text "r"}_1, s)$ contains all the suffixes $s'$ of $s$
+for which the derivative of $r_1$ with respect to the corresponding prefix is nullable
+(these are exactly the cases in which @{term "bders_simp r\<^sub>2 s'"} contributes an alternative).
+In (3) we know that $\llbracket@{term "ASEQ [] (bders_simp r\<^sub>1 s) r\<^sub>2"}\rrbracket$ is
+bounded by $N_1 + \llbracket{}r_2\rrbracket + 1$. In (5) we know the list comprehension contains only
+regular expressions of size bounded by $N_2$. The length of the list after @{text distinctWith} is
+bounded by a number, which we call $l_{N_2}$: it stands for the number of distinct regular expressions
+of size up to $N_2$ (there can only be finitely many of them).
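+
+\noindent
+To spell out the step from (4) to (5): the list returned by @{text distinctWith}
+contains at most $l_{N_2}$ elements (one per $\approx$-equivalence class), each of
+size at most $N_2$ by the induction hypothesis for $r_2$. Hence, reading the size of
+a list as (essentially) the sum of the sizes of its elements, we obtain for the
+de-duplicated list $r\!s$
+
+\begin{center}
+$\llbracket r\!s\rrbracket \;\leq\; \textit{len}\,r\!s * \max_{r \in r\!s}\llbracket r\rrbracket \;\leq\; l_{N_2} * N_2$
+\end{center}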
+
+\noindent
+We reason similarly for @{text STAR}.\medskip
+
+\noindent
+Clearly, the bound we give in this finiteness argument (step (5)) is very loose and
+far from the actual bound one can expect. We can do better than this, but a tighter
+bound does not improve the finiteness property we are proving. If one is interested
+in a polynomial bound, one would hope to obtain a similarly tight bound as for the partial
+derivatives introduced by Antimirov \cite{Antimirov95}. After all, the idea behind
+@{text distinctWith} is to maintain a ``set'' of alternatives (like the sets in
+partial derivatives). Unfortunately, obtaining the exact same bound would mean
+introducing simplifications, such as
+  $(r_1 + r_2) \cdot r_3 \longrightarrow (r_1 \cdot r_3) + (r_2 \cdot r_3)$,
+which exist for partial derivatives. However, if we introduced them in our
+setting, we would lose the POSIX property of our calculated values. We leave better
+bounds for future work.\\[-6.5mm]
+
+*}
+
+
+section {* Conclusion *}
+
+text {*
+
+   We set out in this work to prove in Isabelle/HOL the correctness of
+   the second POSIX lexing algorithm by Sulzmann and Lu
+   \cite{Sulzmann2014}. This follows earlier work where we established
+   the correctness of the first algorithm
+   \cite{AusafDyckhoffUrban2016}. In the earlier work we needed to
+   introduce our own specification of what POSIX values are,
+   because the informal definition given by Sulzmann and Lu did not
+   stand up to formal proof. Also for the second algorithm we needed
+   to introduce our own definitions and proof ideas in order to establish its
+   correctness. Our interest in the second algorithm
+   lies in the fact that, by using bitcoded regular expressions and an aggressive
+   simplification method, there is a chance that the derivatives
+   can be kept universally small (we established in this paper that
+   they can be kept finitely bounded for any string). This is important if one is after
+   an efficient POSIX lexing algorithm based on derivatives.
+
+   Having proved the correctness of the POSIX lexing algorithm, what lessons
+   have we learned? Well, we feel this is a very good example
+   where formal proofs give further insight into the matter at
+   hand. For example, it is very hard to see a problem with @{text nub}
+   versus @{text distinctWith} with only experimental data---one would still
+   see the correct result, but find that simplification fails to simplify in well-chosen,
+   yet not obscure, examples. We found that from an implementation
+   point of view it is really important to have the formal proofs of
+   the corresponding properties at hand.
+
+   We have also developed a
+   healthy suspicion when experimental data is used to back up
+   efficiency claims. For example Sulzmann and Lu write about their
+   equivalent of @{term blexer_simp}: \textit{``...we can incrementally compute
+   bitcoded parse trees in linear time in the size of the input''}
+   \cite[Page 14]{Sulzmann2014}.
+   Given the growth of the
+   derivatives in some cases even after aggressive simplification, this
+   is a claim that is hard to believe. A similar claim about a theoretical runtime
+   of @{text "O(n\<^sup>2)"} is made for the Verbatim lexer, which calculates
+   tokens according to POSIX rules~\cite{verbatim}. For this, Verbatim uses Brzozowski's
+   derivatives as in our work.
+   The authors write: \textit{``The results of our empirical tests [..] confirm that Verbatim has
+   @{text "O(n\<^sup>2)"} time complexity.''} \cite[Section~VII]{verbatim}.
+   While their correctness proof for Verbatim is formalised in Coq, the claim about
+   the runtime complexity is only supported by some empirical evidence obtained
+   by using the code extraction facilities of Coq.
+   Given our observation with the ``growth problem'' of derivatives, we
+   tried out their extracted OCaml code with the example
+   \mbox{@{text "(a + aa)\<^sup>*"}} as a single lexing rule: it took around 5 minutes to tokenise a
+   string of 40 $a$'s, and approximately 19 minutes for a
+   string of 50 $a$'s. Taking into account that derivatives are not simplified in the Verbatim
+   lexer, such numbers are not surprising.
+   Clearly, our result of having finite
+   derivatives might sound rather weak in this context, but we think such efficiency claims
+   really require further scrutiny.
+
+   The contribution of this paper is to make sure that derivatives do not grow
+   arbitrarily big (universally). In the example \mbox{@{text "(a + aa)\<^sup>*"}},
+   \emph{all} derivatives have a size of 17 or less. The result is that
+   lexing a string of, say, 50\,000 $a$'s with the regular expression \mbox{@{text "(a + aa)\<^sup>*"}} takes approximately
+   10 seconds with our Scala implementation
+   of the presented algorithm.
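+
+   For readers who wish to reproduce such size measurements, the following
+   Scala sketch combines the @{text bsimp} and @{text size} sketches from the
+   previous sections with a transcription of the derivative operations on
+   bitcoded regular expressions described earlier in the paper. Again this is
+   our own illustration, not our actual implementation: it starts from empty
+   initial bitcodes instead of using the @{text intern} function and relies on
+   the simplified equivalence @{text eqv} from the earlier sketch, so the
+   measured sizes need not coincide exactly with the number 17 quoted above.
+
+{\small
+\begin{verbatim}
+def bnullable(r: ARexp): Boolean = r match {
+  case AZERO           => false
+  case AONE(_)         => true
+  case ACHAR(_, _)     => false
+  case AALTS(_, rs)    => rs.exists(bnullable)
+  case ASEQ(_, r1, r2) => bnullable(r1) && bnullable(r2)
+  case ASTAR(_, _)     => true
+}
+
+def bmkeps(r: ARexp): Bits = r match {
+  case AONE(bs)         => bs
+  case AALTS(bs, rs)    => bs ++ bmkeps(rs.find(bnullable).get)
+  case ASEQ(bs, r1, r2) => bs ++ bmkeps(r1) ++ bmkeps(r2)
+  case ASTAR(bs, _)     => bs :+ true    // S: end of the star iterations
+  case _                => sys.error("bmkeps: not nullable")
+}
+
+def bder(c: Char, r: ARexp): ARexp = r match {
+  case AZERO         => AZERO
+  case AONE(_)       => AZERO
+  case ACHAR(bs, d)  => if (c == d) AONE(bs) else AZERO
+  case AALTS(bs, rs) => AALTS(bs, rs.map(bder(c, _)))
+  case ASEQ(bs, r1, r2) =>
+    if (bnullable(r1))
+      AALTS(bs, List(ASEQ(Nil, bder(c, r1), r2),
+                     fuse(bmkeps(r1), bder(c, r2))))
+    else ASEQ(bs, bder(c, r1), r2)
+  case ASTAR(bs, r1) =>
+    ASEQ(bs, fuse(List(false), bder(c, r1)), ASTAR(Nil, r1))  // Z: one more iteration
+}
+
+// derivative w.r.t. a string, simplifying after each step
+def bdersSimp(r: ARexp, s: List[Char]): ARexp =
+  s.foldLeft(r)((acc, c) => bsimp(bder(c, acc)))
+
+// (a + aa)* with empty initial bitcodes
+val aaa: ARexp =
+  ASTAR(Nil, AALTS(Nil, List(ACHAR(Nil, 'a'),
+                             ASEQ(Nil, ACHAR(Nil, 'a'), ACHAR(Nil, 'a')))))
+
+// maximum size of the simplified derivatives over all prefixes of a^1000
+val maxSize =
+  (0 to 1000).map(n => size(bdersSimp(aaa, List.fill(n)('a')))).max
+\end{verbatim}}
+
+   \noindent
+   With this setup one can observe directly whether @{text maxSize} stays
+   constant as the strings grow, which is the behaviour our finiteness
+   result predicts.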
+ \smallskip
+
+ \noindent
+ Our Isabelle code, including the results from Sec.~5, is available from \url{https://github.com/urbanchr/posix}.\\[-10mm]
+
+
+%%\bibliographystyle{plain}
+\bibliography{root}
+
+\appendix
+
+\section{Some Proofs}
+
+While we have formalised \emph{all} results in Isabelle/HOL, below we give some
+rough details of our reasoning in ``English''.
+
+\begin{proof}[Proof of Lemma~\ref{codedecode}]
+  This follows from the property that
+  $\textit{decode}'\,((\textit{code}\,v) \,@\, bs)\,r = (v, bs)$ holds
+  for any bit-sequence $bs$ and $\vdash v : r$. This property can be
+  easily proved by induction on $\vdash v : r$.
+\end{proof}
+
+\begin{proof}[Proof of Lemma~\ref{mainlemma}]
+  This can be proved by induction on $s$ and generalising over
+  $v$. The interesting point is that we need to prove this in the
+  reverse direction for $s$. This means instead of cases $[]$ and
+  $c\!::\!s$, we have cases $[]$ and $s\,@\,[c]$ where we unravel the
+  string from the back.\footnote{Isabelle/HOL provides an induction principle
+  for this way of performing the induction.}
+
+  The case for $[]$ is routine using Lemmas~\ref{codedecode}
+  and~\ref{retrievecode}. In the case $s\,@\,[c]$, we can infer from
+  the assumption that $\vdash v : (r\backslash s)\backslash c$
+  holds. Hence by Prop.~\ref{Posix2} we know that
+  (*) $\vdash \inj\,(r\backslash s)\,c\,v : r\backslash s$ holds too.
+  By definition of $\textit{flex}$ we can unfold the left-hand side
+  to be
+  \[
+    \textit{Some}\,(\textit{flex}\;r\,\textit{id}\,(s\,@\,[c])\,v) =
+    \textit{Some}\,(\textit{flex}\;r\,\textit{id}\,s\,(\inj\,(r\backslash s)\,c\,v))
+  \]
+
+  \noindent
+  By IH and (*) we can rewrite the right-hand side to
+  $\textit{decode}\,(\textit{retrieve}\,(r^\uparrow\backslash s)\;
+  (\inj\,(r\backslash s)\,c\,\,v))\,r$, which is equal to
+  $\textit{decode}\,(\textit{retrieve}\, (r^\uparrow\backslash
+  (s\,@\,[c]))\,v)\,r$, as required. The last rewrite step is possible
+  because we generalised over $v$ in our induction.
+\end{proof}
+
+\begin{proof}[Proof of Theorem~\ref{thmone}]
+  We can first expand both sides using Lemma~\ref{flex} and the
+  definition of \textit{blexer}. This gives us two
+  \textit{if}-statements, which we need to show to be equal. By
+  Lemma~\ref{bnullable}\textit{(2)} we know the \textit{if}-tests coincide:
+  $\textit{bnullable}(r^\uparrow\backslash s) \;\textit{iff}\;
+  \nullable(r\backslash s)$.
+  For the \textit{if}-branch suppose $r_d \dn r^\uparrow\backslash s$ and
+  $d \dn r\backslash s$. We have (*) @{text "nullable d"}. We can then show
+  by Lemma~\ref{bnullable}\textit{(3)} that
+  %
+  \[
+    \textit{decode}(\textit{bmkeps}\:r_d)\,r =
+    \textit{decode}(\textit{retrieve}\,r_d\,(\textit{mkeps}\,d))\,r
+  \]
+
+  \noindent
+  where the right-hand side is equal to
+  $\textit{Some}\,(\textit{flex}\,r\,\textit{id}\,s\,(\textit{mkeps}\,
+  d))$ by Lemma~\ref{mainlemma} (we know
+  $\vdash \textit{mkeps}\,d : d$ by (*)). This shows the
+  \textit{if}-branches return the same value. In the
+  \textit{else}-branches both \textit{lexer} and \textit{blexer} return
+  \textit{None}. Therefore we can conclude the proof.
+\end{proof}
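+
+To complement these proofs, here is a small Scala sketch of @{text code} and
+@{text "decode'"} with which the property behind Lemma~\ref{codedecode} can be
+tested on concrete values. As with the earlier sketches, this is our own
+transcription: the datatypes, the constructor names @{text Lf}/@{text Rg} for
+@{text Left}/@{text Right}-values, and the bit conventions $Z$ = @{text false},
+$S$ = @{text true} are choices made for this sketch (@{text Bits} is as before).
+
+{\small
+\begin{verbatim}
+sealed trait Rexp
+case object ZERO                    extends Rexp
+case object ONE                     extends Rexp
+case class  CH(c: Char)             extends Rexp
+case class  ALT(r1: Rexp, r2: Rexp) extends Rexp
+case class  SEQ(r1: Rexp, r2: Rexp) extends Rexp
+case class  STAR(r: Rexp)           extends Rexp
+
+sealed trait Val
+case object Void                    extends Val
+case class  Chr(c: Char)            extends Val
+case class  Lf(v: Val)              extends Val
+case class  Rg(v: Val)              extends Val
+case class  Sequ(v1: Val, v2: Val)  extends Val
+case class  Stars(vs: List[Val])    extends Val
+
+// code a value as a bitsequence
+def code(v: Val): Bits = v match {
+  case Void            => Nil
+  case Chr(_)          => Nil
+  case Lf(v1)          => false :: code(v1)     // Z for Left
+  case Rg(v1)          => true :: code(v1)      // S for Right
+  case Sequ(v1, v2)    => code(v1) ++ code(v2)
+  case Stars(Nil)      => List(true)            // S: no more iterations
+  case Stars(v1 :: vs) => false :: (code(v1) ++ code(Stars(vs)))
+}
+
+// decode': returns the decoded value and the leftover bits
+def decodeAux(bs: Bits, r: Rexp): (Val, Bits) = (bs, r) match {
+  case (_, ONE)   => (Void, bs)
+  case (_, CH(c)) => (Chr(c), bs)
+  case (false :: bs1, ALT(r1, _)) =>
+    val (v, bs2) = decodeAux(bs1, r1); (Lf(v), bs2)
+  case (true :: bs1, ALT(_, r2)) =>
+    val (v, bs2) = decodeAux(bs1, r2); (Rg(v), bs2)
+  case (_, SEQ(r1, r2)) =>
+    val (v1, bs1) = decodeAux(bs, r1)
+    val (v2, bs2) = decodeAux(bs1, r2)
+    (Sequ(v1, v2), bs2)
+  case (false :: bs1, STAR(r1)) =>
+    val (v, bs2) = decodeAux(bs1, r1)
+    decodeAux(bs2, STAR(r1)) match {
+      case (Stars(vs), bs3) => (Stars(v :: vs), bs3)
+      case _                => sys.error("decode: malformed bits")
+    }
+  case (true :: bs1, STAR(_)) => (Stars(Nil), bs1)
+  case _ => sys.error("decode: malformed bits")
+}
+
+// the testable property behind the proof of Lemma codedecode:
+// decodeAux(code(v) ++ bs, r) == (v, bs)  whenever v is a value for r
+\end{verbatim}}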
+\begin{proof}[Proof of Lemma~\ref{lemone}]
+  By induction on @{text r}. For this we can use the properties
+  @{thm fltsfrewrites} and @{text "rs"}$\;\stackrel{s}{\leadsto}^*$@{text "distinctWith rs \<approx>"} @{term "{}"}. The latter uses
+  repeated applications of the $LD$ rule, which allows the removal
+  of duplicates that can recognise the same strings.
+  \end{proof}
+
+  \begin{proof}[Proof of Lemma~\ref{lembnull}]
+  Straightforward mutual induction on the definitions of $\leadsto$ and $\stackrel{s}{\leadsto}$.
+  The only interesting case is the rule $LD$, where the property holds because, by the
+  side-conditions of that rule, the empty string is either in both
+  @{text "L (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ [r\<^sub>2] @ rs\<^sub>c)"} and
+  @{text "L (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ rs\<^sub>c)"} or in neither.
+  \end{proof}
+
+  \begin{proof}[Proof of Lemma~\ref{lemthree}]
+  By straightforward mutual induction on the definitions of $\leadsto$ and $\stackrel{s}{\leadsto}$.
+  Again the only interesting case is the rule $LD$, where we need to ensure that
+  @{text "bmkeps (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ [r\<^sub>2] @ rs\<^sub>c) =
+  bmkeps (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ rs\<^sub>c)"} holds.
+  This is indeed the case because, according to the POSIX rules, the
+  generated bitsequence is determined by the first alternative that can match the
+  string (here: the first alternative that is nullable).
+  \end{proof}
+
+  \begin{proof}[Proof of Lemma~\ref{bderlem}]
+  By straightforward mutual induction on the definitions of $\leadsto$ and $\stackrel{s}{\leadsto}$.
+  The case for $LD$ holds because @{term "L (erase (r\<^sub>2)) \<subseteq> L (erase (r\<^sub>1))"}
+  implies @{term "L (erase (bder c r\<^sub>2)) \<subseteq> L (erase (bder c r\<^sub>1))"}.
+  \end{proof}
+*}
+
+(*<*)
+end
+(*>*)
\ No newline at end of file