# HG changeset patch # User Christian Urban # Date 1644196285 0 # Node ID 41a2a3b63853c887e79389ae4d0bfdf9ccabe564 # Parent 57182b36ec0193d8aa91f696e6169836bf971043 more of the paper diff -r 57182b36ec01 -r 41a2a3b63853 thys2/Paper/Paper.thy --- a/thys2/Paper/Paper.thy Sun Feb 06 00:02:04 2022 +0000 +++ b/thys2/Paper/Paper.thy Mon Feb 07 01:11:25 2022 +0000 @@ -17,9 +17,12 @@ abbreviation "der_syn r c \ der c r" +abbreviation + "bder_syn r c \ bder c r" notation (latex output) der_syn ("_\\_" [79, 1000] 76) and + bder_syn ("_\\_" [79, 1000] 76) and ZERO ("\<^bold>0" 81) and ONE ("\<^bold>1" 81) and @@ -54,10 +57,12 @@ code ("code _" [79] 74) and intern ("_\<^latex>\\\mbox{$^\\uparrow$}\" [900] 80) and erase ("_\<^latex>\\\mbox{$^\\downarrow$}\" [1000] 74) and - bnullable ("nullable\<^latex>\\\mbox{$_b$}\ _" [1000] 80) and - bmkeps ("mkeps\<^latex>\\\mbox{$_b$}\ _" [1000] 80) and + bnullable ("bnullable _" [1000] 80) and + bmkeps ("bmkeps _" [1000] 80) and - srewrite ("_\<^latex>\\\mbox{$\\,\\stackrel{s}{\\leadsto}$}\ _" [71, 71] 80) + srewrite ("_\<^latex>\\\mbox{$\\,\\stackrel{s}{\\leadsto}$}\ _" [71, 71] 80) and + rrewrites ("_ \<^latex>\\\mbox{$\\,\\leadsto^*$}\ _" [71, 71] 80) and + blexer_simp ("blexer\<^sup>+" 1000) lemma better_retrieve: @@ -263,7 +268,7 @@ \end{tabular} \end{center} - \noindent where @{text bs} stands for a bitsequences; @{text r}, + \noindent where @{text bs} stands for bitsequences; @{text r}, @{text "r\<^sub>1"} and @{text "r\<^sub>2"} for bitcoded regular expressions; and @{text rs} for lists of bitcoded regular expressions. The binary alternative @{text "ALT bs r\<^sub>1 r\<^sub>2"} @@ -271,9 +276,8 @@ For bitsequences we just use lists made up of the constants @{text Z} and @{text S}. The idea with bitcoded regular expressions is to incrementally generate the value information (for - example @{text Left} and @{text Right}) as bitsequences - as part of the regular expression constructors. - Sulzmann and Lu then define a coding + example @{text Left} and @{text Right}) as bitsequences. For this + Sulzmann and Lu define a coding function for how values can be coded into bitsequences. \begin{center} @@ -297,8 +301,8 @@ \noindent As can be seen, this coding is ``lossy'' in the sense that we do not record explicitly character values and also not sequence values (for - them we just append two bitsequences). We do, however, record the - different alternatives for @{text Left}, respectively @{text Right}, as @{text Z} and + them we just append two bitsequences). However, the + different alternatives for @{text Left}, respectively @{text Right}, are recorded as @{text Z} and @{text S} followed by some bitsequence. Similarly, we use @{text Z} to indicate if there is still a value coming in the list of @{text Stars}, whereas @{text S} indicates the end of the list. The lossiness makes the process of @@ -404,55 +408,55 @@ bitcoded regular expressions, instead of regular expressions. \begin{center} - \begin{tabular}{lcl} - $\textit{bnullable}\,(\textit{ZERO})$ & $\dn$ & $\textit{false}$ \textbf{fix}\\ + \begin{tabular}{@ {}c@ {}c@ {}} + \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l} + $\textit{bnullable}\,(\textit{ZERO})$ & $\dn$ & $\textit{false}$\\ $\textit{bnullable}\,(\textit{ONE}\,bs)$ & $\dn$ & $\textit{true}$\\ $\textit{bnullable}\,(\textit{CHAR}\,bs\,c)$ & $\dn$ & $\textit{false}$\\ - $\textit{bnullable}\,(\textit{ALT}\,bs\,a_1\,a_2)$ & $\dn$ & - $\textit{bnullable}\,a_1\vee \textit{bnullable}\,a_2$\\ - $\textit{bnullable}\,(\textit{SEQ}\,bs\,a_1\,a_2)$ & $\dn$ & - $\textit{bnullable}\,a_1\wedge \textit{bnullable}\,a_2$\\ - $\textit{bnullable}\,(\textit{STAR}\,bs\,a)$ & $\dn$ & + $\textit{bnullable}\,(\textit{ALTs}\,bs\,\rs)$ & $\dn$ & + $\exists\, r \in \rs. \,\textit{bnullable}\,r$\\ + $\textit{bnullable}\,(\textit{SEQ}\,bs\,r_1\,r_2)$ & $\dn$ & + $\textit{bnullable}\,r_1\wedge \textit{bnullable}\,r_2$\\ + $\textit{bnullable}\,(\textit{STAR}\,bs\,r)$ & $\dn$ & $\textit{true}$ - \end{tabular} - \end{center} - - \begin{center} - \begin{tabular}{lcl} - $\textit{bmkeps}\,(\textit{ONE}\,bs)$ & $\dn$ & $bs$ \textbf{fix}\\ - $\textit{bmkeps}\,(\textit{ALT}\,bs\,a_1\,a_2)$ & $\dn$ & - $\textit{if}\;\textit{bnullable}\,a_1$\\ - & &$\textit{then}\;bs\,@\,\textit{bmkeps}\,a_1$\\ - & &$\textit{else}\;bs\,@\,\textit{bmkeps}\,a_2$\\ - $\textit{bmkeps}\,(\textit{SEQ}\,bs\,a_1\,a_2)$ & $\dn$ & - $bs \,@\,\textit{bmkeps}\,a_1\,@\, \textit{bmkeps}\,a_2$\\ - $\textit{bmkeps}\,(\textit{STAR}\,bs\,a)$ & $\dn$ & + \end{tabular} + & + \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l} + $\textit{bmkeps}\,(\textit{ONE}\,bs)$ & $\dn$ & $bs$\\ + $\textit{bmkeps}\,(\textit{ALTs}\,bs\,r\!::\!\rs)$ & $\dn$ & + $\textit{if}\;\textit{bnullable}\,r$\\ + & &$\textit{then}\;bs\,@\,\textit{bmkeps}\,r$\\ + & &$\textit{else}\;bs\,@\,\textit{bmkeps}\,\rs$\\ + $\textit{bmkeps}\,(\textit{SEQ}\,bs\,r_1\,r_2)$ & $\dn$ &\\ + \multicolumn{3}{r}{$bs \,@\,\textit{bmkeps}\,r_1\,@\, \textit{bmkeps}\,r_2$}\\ + $\textit{bmkeps}\,(\textit{STAR}\,bs\,r)$ & $\dn$ & $bs \,@\, [\S]$ - \end{tabular} + \end{tabular} + \end{tabular} \end{center} \noindent The key function in the bitcoded algorithm is the derivative of an - annotated regular expression. This derivative calculates the - derivative but at the same time also the incremental part that - contributes to constructing a value. + bitcoded regular expression. This derivative calculates the + derivative but at the same time also the incremental part of bitsequences + that contribute to constructing a POSIX value. \begin{center} \begin{tabular}{@ {}lcl@ {}} - $(\textit{ZERO})\backslash c$ & $\dn$ & $\textit{ZERO}$ \textbf{fix}\\ + $(\textit{ZERO})\backslash c$ & $\dn$ & $\textit{ZERO}$ \\ $(\textit{ONE}\;bs)\backslash c$ & $\dn$ & $\textit{ZERO}$\\ $(\textit{CHAR}\;bs\,d)\backslash c$ & $\dn$ & $\textit{if}\;c=d\; \;\textit{then}\; \textit{ONE}\;bs\;\textit{else}\;\textit{ZERO}$\\ - $(\textit{ALT}\;bs\,a_1\,a_2)\backslash c$ & $\dn$ & - $\textit{ALT}\,bs\,(a_1\backslash c)\,(a_2\backslash c)$\\ - $(\textit{SEQ}\;bs\,a_1\,a_2)\backslash c$ & $\dn$ & - $\textit{if}\;\textit{bnullable}\,a_1$\\ - & &$\textit{then}\;\textit{ALT}\,bs\,(\textit{SEQ}\,[]\,(a_1\backslash c)\,a_2)$\\ - & &$\phantom{\textit{then}\;\textit{ALT}\,bs\,}(\textit{fuse}\,(\textit{bmkeps}\,a_1)\,(a_2\backslash c))$\\ - & &$\textit{else}\;\textit{SEQ}\,bs\,(a_1\backslash c)\,a_2$\\ - $(\textit{STAR}\,bs\,a)\backslash c$ & $\dn$ & + $(\textit{ALTs}\;bs\,\rs)\backslash c$ & $\dn$ & + $\textit{ALTs}\,bs\,(\mathit{map}\,(\_\backslash c)\,\rs)$\\ + $(\textit{SEQ}\;bs\,r_1\,r_2)\backslash c$ & $\dn$ & + $\textit{if}\;\textit{bnullable}\,r_1$\\ + & &$\textit{then}\;\textit{ALT}\,bs\,(\textit{SEQ}\,[]\,(r_1\backslash c)\,r_2)$\\ + & &$\phantom{\textit{then}\;\textit{ALT}\,bs\,}(\textit{fuse}\,(\textit{bmkeps}\,r_1)\,(r_2\backslash c))$\\ + & &$\textit{else}\;\textit{SEQ}\,bs\,(r_1\backslash c)\,r_2$\\ + $(\textit{STAR}\,bs\,r)\backslash c$ & $\dn$ & $\textit{SEQ}\;bs\,(\textit{fuse}\, [\Z] (r\backslash c))\, (\textit{STAR}\,[]\,r)$ \end{tabular} @@ -460,66 +464,32 @@ \noindent - This function can also be extended to strings, written $a\backslash s$, + This function can also be extended to strings, written $r\backslash s$, just like the standard derivative. We omit the details. Finally we - can define Sulzmann and Lu's bitcoded lexer, which we call \textit{blexer}: + can define Sulzmann and Lu's bitcoded lexer, which we call \textit{blexer}: + + \begin{center} +\begin{tabular}{lcl} + $\textit{blexer}\;r\,s$ & $\dn$ & + $\textit{let}\;r_{der} = (r^\uparrow)\backslash s\;\textit{in}$\\ + & & $\;\;\;\;\textit{if}\; \textit{bnullable}(r_{der}) \;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,r_{der})\,r + \;\;\textit{else}\;\textit{None}$ +\end{tabular} +\end{center} \noindent This bitcoded lexer first internalises the regular expression $r$ and then -builds the annotated derivative according to $s$. If the derivative is -nullable, then it extracts the bitcoded value using the -$\textit{bmkeps}$ function. Finally it decodes the bitcoded value. If +builds the bitcoded derivative according to $s$. If the derivative is +(b)nullable the string is in the language of $r$ and it extracts the bitsequence using the +$\textit{bmkeps}$ function. Finally it decodes the bitsequence into a value. If the derivative is \emph{not} nullable, then $\textit{None}$ is -returned. The task is to show that this way of calculating a value +returned. We can show that this way of calculating a value generates the same result as with \textit{lexer}. -Before we can proceed we need to define a function, called -\textit{retrieve}, which Sulzmann and Lu introduced for the proof. - -\textbf{fix} - -\noindent -The idea behind this function is to retrieve a possibly partial -bitcode from an annotated regular expression, where the retrieval is -guided by a value. For example if the value is $\Left$ then we -descend into the left-hand side of an alternative (annotated) regular -expression in order to assemble the bitcode. Similarly for -$\Right$. The property we can show is that for a given $v$ and $r$ -with $\vdash v : r$, the retrieved bitsequence from the internalised -regular expression is equal to the bitcoded version of $v$. - -\begin{lemma}\label{retrievecode} -If $\vdash v : r$ then $\textit{code}\, v = \textit{retrieve}\,(r^\uparrow)\,v$. -\end{lemma} - -*} +Before we can proceed we need to define a helper function, called +\textit{retrieve}, which Sulzmann and Lu introduced for the correctness proof. -text {* - There is also a corresponding decoding function that takes a bitsequence - and generates back a value. However, since the bitsequences are a ``lossy'' - coding (@{term Seq}s are not coded) the decoding function depends also - on a regular expression in order to decode values. - - - - - The idea of the bitcodes is to annotate them to regular expressions and generate values - incrementally. The bitcodes can be read off from the @{text breg} and then decoded into a value. - - \begin{center} - \begin{tabular}{lcl} - @{term breg} & $::=$ & @{term "AZERO"}\\ - & $\mid$ & @{term "AONE bs"}\\ - & $\mid$ & @{term "ACHAR bs c"}\\ - & $\mid$ & @{term "AALTs bs rs"}\\ - & $\mid$ & @{term "ASEQ bs r\<^sub>1 r\<^sub>2"}\\ - & $\mid$ & @{term "ASTAR bs r"} - \end{tabular} - \end{center} - - - - \begin{center} +\begin{center} \begin{tabular}{lcl} @{thm (lhs) retrieve.simps(1)} & $\dn$ & @{thm (rhs) retrieve.simps(1)}\\ @{thm (lhs) retrieve.simps(2)} & $\dn$ & @{thm (rhs) retrieve.simps(2)}\\ @@ -533,22 +503,171 @@ \end{tabular} \end{center} +\noindent +The idea behind this function is to retrieve a possibly partial +bitcode from a bitcoded regular expression, where the retrieval is +guided by a value. For example if the value is $\Left$ then we +descend into the left-hand side of an alternative in order to +assemble the bitcode. Similarly for +$\Right$. The property we can show is that for a given $v$ and $r$ +with $\vdash v : r$, the retrieved bitsequence from the internalised +regular expression is equal to the bitcoded version of $v$. - \begin{theorem} - @{thm blexer_correctness} - \end{theorem} +\begin{lemma}\label{retrievecode} +If $\vdash v : r$ then $\textit{code}\, v = \textit{retrieve}\,(r^\uparrow)\,v$. +\end{lemma} + +\noindent +We also need some auxiliary facts about how the bitcoded operations +relate to the ``standard'' operations on regular expressions. For +example if we build a bitcoded derivative and erase the result, this +is the same as if we first erase the bitcoded regular expression and +then perform the ``standard'' derivative operation. + +\begin{lemma}\label{bnullable}\mbox{}\smallskip\\ + \begin{tabular}{ll} +\textit{(1)} & $(a\backslash s)^\downarrow = (a^\downarrow)\backslash s$\\ +\textit{(2)} & $\textit{bnullable}(a)$ iff $\textit{nullable}(a^\downarrow)$\\ +\textit{(3)} & $\textit{bmkeps}(a) = \textit{retrieve}\,a\,(\textit{mkeps}\,(a^\downarrow))$ provided $\textit{nullable}(a^\downarrow)$. +\end{tabular} +\end{lemma} + +\begin{proof} + All properties are by induction on annotated regular expressions. There are no + interesting cases. +\end{proof} + +\noindent +This brings us to our main lemma in this section: if we build a +derivative, say $r\backslash s$ and have a value, say $v$, inhabited +by this derivative, then we can produce the result $\lexer$ generates +by applying this value to the stacked-up injection functions +$\textit{flex}$ assembles. The lemma establishes that this is the same +value as if we build the annotated derivative $r^\uparrow\backslash s$ +and then retrieve the corresponding bitcoded version, followed by a +decoding step. + +\begin{lemma}[Main Lemma]\label{mainlemma}\it +If $\vdash v : r\backslash s$ then +\[\textit{Some}\,(\textit{flex}\,r\,\textit{id}\,s\,v) = + \textit{decode}(\textit{retrieve}\,(r^\uparrow \backslash s)\,v)\,r\] +\end{lemma} + +\begin{proof} + This can be proved by induction on $s$ and generalising over + $v$. The interesting point is that we need to prove this in the + reverse direction for $s$. This means instead of cases $[]$ and + $c\!::\!s$, we have cases $[]$ and $s\,@\,[c]$ where we unravel the + string from the back.\footnote{Isabelle/HOL provides an induction principle + for this way of performing the induction.} + + The case for $[]$ is routine using Lemmas~\ref{codedecode} + and~\ref{retrievecode}. In the case $s\,@\,[c]$, we can infer from + the assumption that $\vdash v : (r\backslash s)\backslash c$ + holds. Hence by Lemma~\ref{Posix2} we know that + (*) $\vdash \inj\,(r\backslash s)\,c\,v : r\backslash s$ holds too. + By definition of $\textit{flex}$ we can unfold the left-hand side + to be + \[ + \textit{Some}\,(\textit{flex}\;r\,\textit{id}\,(s\,@\,[c])\,v) = + \textit{Some}\,(\textit{flex}\;r\,\textit{id}\,s\,(\inj\,(r\backslash s)\,c\,v)) + \] + + \noindent + By induction hypothesis and (*) we can rewrite the right-hand side to + + \[ + \textit{decode}\,(\textit{retrieve}\,(r^\uparrow\backslash s)\; + (\inj\,(r\backslash s)\,c\,\,v))\,r + \] + + \noindent + which is equal to + $\textit{decode}\,(\textit{retrieve}\, (r^\uparrow\backslash + (s\,@\,[c]))\,v)\,r$ as required. The last rewrite step is possible + because we generalised over $v$ in our induction. +\end{proof} + +\noindent +With this lemma in place, we can prove the correctness of \textit{blexer} such +that it produces the same result as \textit{lexer}. - bitcoded regexes / decoding / bmkeps - gets rid of the second phase (only single phase) - correctness +\begin{theorem} +$\textit{lexer}\,r\,s = \textit{blexer}\,r\,s$ +\end{theorem} + +\begin{proof} + We can first expand both sides using Lemma~\ref{flex} and the + definition of \textit{blexer}. This gives us two + \textit{if}-statements, which we need to show to be equal. By + Lemma~\ref{bnullable}\textit{(2)} we know the \textit{if}-tests coincide: + \[ + \textit{bnullable}(r^\uparrow\backslash s) \;\textit{iff}\; + \nullable(r\backslash s) + \] + + \noindent + For the \textit{if}-branch suppose $r_d \dn r^\uparrow\backslash s$ and + $d \dn r\backslash s$. We have (*) $\nullable\,d$. We can then show + by Lemma~\ref{bnullable}\textit{(3)} that + % + \[ + \textit{decode}(\textit{bmkeps}\,r_d)\,r = + \textit{decode}(\textit{retrieve}\,a\,(\textit{mkeps}\,d))\,r + \] + + \noindent + where the right-hand side is equal to + $\textit{Some}\,(\textit{flex}\,r\,\textit{id}\,s\,(\textit{mkeps}\, + d))$ by Lemma~\ref{mainlemma} (we know + $\vdash \textit{mkeps}\,d : d$ by (*)). This shows the + \textit{if}-branches return the same value. In the + \textit{else}-branches both \textit{lexer} and \textit{blexer} return + \textit{None}. Therefore we can conclude the proof. +\end{proof} + +\noindent +This establishes that the bitcoded algorithm by Sulzmann +and Lu without simplification produces correct results. This was +only conjectured in their paper \cite{Sulzmann2014}. The next step +is to add simplifications. + *} section {* Simplification *} text {* - Sulzmann \& Lu apply simplification via a fixpoint operation; also does not use erase to filter out duplicates. + + \begin{lemma} + @{thm[mode=IfThen] bnullable0(1)[of "r\<^sub>1" "r\<^sub>2"]} + \end{lemma} + + + \begin{lemma} + @{thm[mode=IfThen] rewrite_bmkeps_aux(1)[of "r\<^sub>1" "r\<^sub>2"]} + \end{lemma} + + \begin{lemma} + @{thm[mode=IfThen] rewrites_to_bsimp} + \end{lemma} + + \begin{lemma} + @{thm[mode=IfThen] rewrite_preserves_bder(1)[of "r\<^sub>1" "r\<^sub>2"]} + \end{lemma} + + \begin{lemma} + @{thm[mode=IfThen] central} + \end{lemma} + + \begin{theorem} + @{thm[mode=IfThen] main_blexer_simp} + \end{theorem} + + Sulzmann \& Lu apply simplification via a fixpoint operation + + ; also does not use erase to filter out duplicates. not direct correspondence with PDERs, because of example problem with retrieve diff -r 57182b36ec01 -r 41a2a3b63853 thys2/Paper/document/root.tex --- a/thys2/Paper/document/root.tex Sun Feb 06 00:02:04 2022 +0000 +++ b/thys2/Paper/document/root.tex Mon Feb 07 01:11:25 2022 +0000 @@ -49,7 +49,7 @@ \def\S{\mathit{S}} \newcommand{\ZERO}{\mbox{\bf 0}} \newcommand{\ONE}{\mbox{\bf 1}} - +\def\rs{\mathit{rs}} \def\Brz{Brzozowski} \def\der{\backslash} @@ -83,17 +83,19 @@ of the regular expression. The purpose of the bitcodes is to generate POSIX values incrementally while derivatives are calculated. They also help with designing an ``aggressive'' simplification function that keeps the size of - derivatives small. Without simplification the size derivatives can grow - exponentially resulting in an extremely slow lexing algorithm. In this + derivatives finite. Without simplification the size derivatives can grow + arbitrarily big resulting in an extremely slow lexing algorithm. In this paper we describe a variant of Sulzmann and Lu's algorithm: Our algorithm is a recursive functional program, whereas Sulzmann and Lu's version involves a fixpoint construction. We \textit{(i)} prove in Isabelle/HOL that our program is correct and generates - unique POSIX values; we also \textit{(ii)} establish a polynomial - bound for the size of the derivatives. The size can be seen as a - proxy measure for the efficiency of the lexing algorithm: because of - the polynomial bound our algorithm does not suffer from - the exponential blowup in earlier works. + unique POSIX values; we also \textit{(ii)} establish a finite + bound for the size of the derivatives. + + %The size can be seen as a + %proxy measure for the efficiency of the lexing algorithm: because of + %the polynomial bound our algorithm does not suffer from + %the exponential blowup in earlier works. % Brzozowski introduced the notion of derivatives for regular % expressions. They can be used for a very simple regular expression diff -r 57182b36ec01 -r 41a2a3b63853 thys2/SizeBound4.thy --- a/thys2/SizeBound4.thy Sun Feb 06 00:02:04 2022 +0000 +++ b/thys2/SizeBound4.thy Mon Feb 07 01:11:25 2022 +0000 @@ -179,7 +179,7 @@ | "bmkeps(ASEQ bs r1 r2) = bs @ (bmkeps r1) @ (bmkeps r2)" | "bmkeps(AALTs bs rs) = bs @ (bmkepss rs)" | "bmkeps(ASTAR bs r) = bs @ [S]" -| "bmkepss [] = []" +(*| "bmkepss [] = []"*) | "bmkepss (r # rs) = (if bnullable(r) then (bmkeps r) else (bmkepss rs))" lemma bmkepss1: @@ -849,7 +849,7 @@ have as1: "L(erase a2) \ L(erase a1)" by fact have as3: "bnullables (rsa @ [a1] @ rsb @ [a2] @ rsc)" by fact show "bmkepss (rsa @ [a1] @ rsb @ [a2] @ rsc) = bmkepss (rsa @ [a1] @ rsb @ rsc)" using as1 as3 - by (smt (verit, ccfv_SIG) append_Cons bmkepss.simps(2) bmkepss1 bmkepss2 bnullable_correctness nullable_correctness subset_eq) + by (smt (verit, ccfv_SIG) append_Cons bmkepss.simps(1) bmkepss1 bmkepss2 bnullable_correctness nullable_correctness subset_eq) (*next case (extra bs0 bs1 r1 bs2 r2 bs4 bs3) then show ?case diff -r 57182b36ec01 -r 41a2a3b63853 thys2/paper.pdf Binary file thys2/paper.pdf has changed