# HG changeset patch # User Chengsong # Date 1657697248 -3600 # Node ID 3cbcd7cda0a97b001314eacf70dbd1e69d00b4c0 # Parent 57e33978e55d5e8863f6b9d9611ca0755e4aa796 more diff -r 57e33978e55d -r 3cbcd7cda0a9 ChengsongTanPhdThesis/Chapters/Bitcoded1.tex --- a/ChengsongTanPhdThesis/Chapters/Bitcoded1.tex Tue Jul 05 00:42:06 2022 +0100 +++ b/ChengsongTanPhdThesis/Chapters/Bitcoded1.tex Wed Jul 13 08:27:28 2022 +0100 @@ -8,96 +8,105 @@ %simplifications and therefore introduce our version of the bitcoded algorithm and %its correctness proof in %Chapter 3\ref{Chapter3}. -In this chapter, we are going to introduce the bit-coded algorithm -introduced by Sulzmann and Lu to address the problem of -under-simplified regular expressions. +In this chapter, we are going to describe the bit-coded algorithm +introduced by Sulzmann and Lu \parencite{Sulzmann2014} to address the growth problem of +regular expressions. \section{Bit-coded Algorithm} The lexer algorithm in Chapter \ref{Inj}, as shown in \ref{InjFigure}, stores information of previous lexing steps on a stack, in the form of regular expressions and characters: $r_0$, $c_0$, $r_1$, $c_1$, etc. -\begin{envForCaption} \begin{ceqn} \begin{equation}%\label{graph:injLexer} -\begin{tikzcd} -r_0 \arrow[r, "\backslash c_0"] \arrow[d] & r_1 \arrow[r, "\backslash c_1"] \arrow[d] & r_2 \arrow[r, dashed] \arrow[d] & r_n \arrow[d, "mkeps" description] \\ -v_0 & v_1 \arrow[l,"inj_{r_0} c_0"] & v_2 \arrow[l, "inj_{r_1} c_1"] & v_n \arrow[l, dashed] + \begin{tikzcd}[ampersand replacement=\&, execute at end picture={ + \begin{scope}[on background layer] + \node[rectangle, fill={red!30}, + pattern=north east lines, pattern color=red, + fit={(-3,-1) (-3, 1) (1, -1) + (1, 1)} + ] + {}; , + \node[rectangle, fill={blue!20}, + pattern=north east lines, pattern color=blue, + fit= {(1, -1) (1, 1) (3, -1) (3, 1)} + ] + {}; + \end{scope}} + ] +r_0 \arrow[r, "\backslash c_0"] \arrow[d] \& r_1 \arrow[r, "\backslash c_1"] \arrow[d] \& r_2 \arrow[r, dashed] \arrow[d] \& r_n \arrow[d, "mkeps" description] \\ +v_0 \& v_1 \arrow[l,"inj_{r_0} c_0"] \& v_2 \arrow[l, "inj_{r_1} c_1"] \& v_n \arrow[l, dashed] \\ \end{tikzcd} \end{equation} \end{ceqn} -\caption{Injection-based Lexing from Chapter\ref{Inj}}\label{InjFigure} -\end{envForCaption} \noindent +The red part represents what we already know during the first +derivative phase, +and the blue part represents the unknown part of input. +The red area expands as we move towards $r_n$, +indicating an increasing stack size during lexing. +Despite having some partial lexing information during +the forward derivative phase, we choose to store them +temporarily, only to convert the information to lexical +values at a later stage. In essence we are repeating work we +have already done. This is both inefficient and prone to stack overflow. A natural question arises as to whether we can store lexing information on the fly, while still using regular expression -derivatives? +derivatives. -In a lexing algorithm's run, split by the current input position, -we have a sub-string that has been consumed, -and the sub-string that has yet to come. -We already know what was before, and this should be reflected in the value -and the regular expression at that step as well. But this is not the -case for injection-based regular expression derivatives. -Take the regex $(aa)^* \cdot bc$ matching the string $aabc$ -as an example, if we have just read the two former characters $aa$: - +If we remove the details of the individual +lexing steps, and use red and blue areas as before +to indicate consumed (seen) input and constructed +partial value (before recovering the rest of the stack), +one could see that the seen part's lexical information +is stored in the form of a regular expression. +Consider the regular expression $(aa)^* \cdot bc$ matching the string $aabc$ +and assume we have just read the two characters $aa$: \begin{center} -\begin{envForCaption} -\begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}] - \node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},] - {Consumed: $aa$ - \nodepart{two} Not Yet Reached: $bc$ }; -%\caption{term 1 \ref{term:1}'s matching configuration} -\end{tikzpicture} -\caption{Partially matched String} -\end{envForCaption} -\end{center} -%\caption{Input String}\label{StringPartial} -%\end{figure} - -\noindent -We have the value that has already been partially calculated, -and the part that has yet to come: -\begin{center} -\begin{envForCaption} \begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}] \node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},] - {$\Seq(\Stars[\Char(a), \Char(a)], ???)$ + {Partial lexing info: $\ONE \cdot a \cdot (aa)^* \cdot bc$ etc. \nodepart{two} $\Seq(\ldots, \Seq(\Char(b), \Char(c)))$}; -%\caption{term 1 \ref{term:1}'s matching configuration} \end{tikzpicture} -\caption{Partially constructed Value} -\end{envForCaption} \end{center} - -In the regex derivative part , (after simplification) -all we have is just what is about to come: +\noindent +In the injection-based lexing algorithm, we ``neglect" the red area +by putting all the characters we have consumed and +intermediate regular expressions on the stack when +we go from left to right in the derivative phase. +The red area grows till the string is exhausted. +During the injection phase, the value in the blue area +is built up incrementally, while the red area shrinks. +Before we have recovered all characters and intermediate +derivative regular expressions from the stack, +what values these characters and regular expressions correspond +to are unknown: \begin{center} -\begin{envForCaption} \begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}] \node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={white!30,blue!20},] - {$???$ - \nodepart{two} To Come: $b c$}; + {$(\ONE \cdot \ONE) \cdot (aa)^* \cdot bc $ correspond to:$???$ + \nodepart{two} $b c$ corresponds to $\Seq(\ldots, \Seq(\Char(b), \Char(c)))$}; %\caption{term 1 \ref{term:1}'s matching configuration} \end{tikzpicture} -\caption{Derivative} -\end{envForCaption} \end{center} \noindent -The previous part is missing. -How about keeping the partially constructed value -attached to the front of the regular expression? +However, they should be calculable, +as characters and regular expression shapes +after taking derivative w.r.t those characters +have already been known, therefore in our example, +we know that the value starts with two $a$s, +and makes up to an iteration in a Kleene star: +(We have put the injection-based lexing's partial +result in the right part of the split rectangle +to contrast it with the partial valued produced +in a forward manner) \begin{center} -\begin{envForCaption} \begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}] \node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},] - {$\Seq(\Stars[\Char(a), \Char(a)], \ldots)$ - \nodepart{two} To Come: $b c$}; + {$\stackrel{Bitcoded}{\longrightarrow} \Seq(\Stars[\Char(a), \Char(a)], ???)$ + \nodepart{two} $\Seq(\ldots, \Seq(\Char(b), \Char(c)))$ $\stackrel{Inj}{\longleftarrow}$}; %\caption{term 1 \ref{term:1}'s matching configuration} \end{tikzpicture} -\caption{Derivative} -\end{envForCaption} \end{center} \noindent If we do this kind of "attachment" @@ -105,19 +114,23 @@ constructed value when taking off a character: \begin{center} -\begin{envForCaption} +\begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}] + \node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},] (spPoint) + {$\Seq(\Stars[\Char(a), \Char(a)], \ldots)$ + \nodepart{two} Remaining: $b c$}; +\end{tikzpicture}\\ +$\downarrow$\\ \begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}] \node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},] {$\Seq(\Stars[\Char(a), \Char(a)], \Seq(\Char(b), \ldots))$ - \nodepart{two} To Come: $c$}; + \nodepart{two} Remaining: $c$}; \end{tikzpicture}\\ +$\downarrow$\\ \begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}] \node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},] {$\Seq(\Stars[\Char(a), \Char(a)], \Seq(\Char(b), \Char(c)))$ \nodepart{two} EOF}; \end{tikzpicture} -\caption{After $\backslash b$ and $\backslash c$} -\end{envForCaption} \end{center} \noindent In the end we could recover the value without a backward phase. @@ -125,14 +138,11 @@ we instead use bit-codes to encode them. Bits and bitcodes (lists of bits) are defined as: -\begin{envForCaption} \begin{center} $b ::= S \mid Z \qquad bs ::= [] \mid b::bs $ \end{center} -\caption{Bit-codes datatype} -\end{envForCaption} \noindent Using $S$ and $Z$ rather than $1$ and $0$ is to avoid @@ -141,7 +151,6 @@ bit-lists) can be used to encode values (or potentially incomplete values) in a compact form. This can be straightforwardly seen in the following coding function from values to bitcodes: -\begin{envForCaption} \begin{center} \begin{tabular}{lcl} $\textit{code}(\Empty)$ & $\dn$ & $[]$\\ @@ -154,9 +163,6 @@ code(\Stars\,vs)$ \end{tabular} \end{center} -\caption{Coding Function for Values} -\end{envForCaption} - \noindent Here $\textit{code}$ encodes a value into a bit-code by converting $\Left$ into $Z$, $\Right$ into $S$, and marks the start of any non-empty @@ -168,9 +174,10 @@ whether the $S$s and $Z$s are for $\Left/\Right$ or $\Stars$. The reason for choosing this compact way of storing information is that the relatively small size of bits can be easily manipulated and ``moved -around'' in a regular expression. +around" in a regular expression. - +Because of the lossiness, the process of decoding a bitlist requires additionally +a regular expression. The function $\decode$ is defined as: We define the reverse operation of $\code$, which is $\decode$. As expected, $\decode$ not only requires the bit-codes, but also a regular expression to guide the decoding and @@ -178,7 +185,6 @@ %\begin{definition}[Bitdecoding of Values]\mbox{} -\begin{envForCaption} \begin{center} \begin{tabular}{@{}l@{\hspace{1mm}}c@{\hspace{1mm}}l@{}} $\textit{decode}'\,bs\,(\ONE)$ & $\dn$ & $(\Empty, bs)$\\ @@ -205,10 +211,10 @@ \textit{else}\;\textit{None}$ \end{tabular} \end{center} -\end{envForCaption} %\end{definition} \noindent +The function $\decode'$ returns a pair consisting of a partially decoded value and some leftover: $\decode'$ does most of the job while $\decode$ throws away leftover bit-codes and returns the value only. $\decode$ is terminating as $\decode'$ is terminating. @@ -251,7 +257,7 @@ The first thing we define related to bit-coded regular expressions -is how we move bits, for instance pasting it at the front of an annotated regex. +is how we move bits, for instance pasting it at the front of an annotated regular expression. The operation $\fuse$ is just to attach bit-codes to the front of an annotated regular expression: \begin{center} @@ -364,7 +370,7 @@ This is done by adding bitcodes to the derivatives, for example when one more star iteratoin is taken (we call the operation of derivatives on annotated regular expressions $\bder$ -because it is derivatives on regexes with \emph{b}itcodes), +because it is derivatives on regular expressiones with \emph{b}itcodes), we need to unfold it into a sequence, and attach an additional bit $Z$ to the front of $r \backslash c$ to indicate one more star iteration. @@ -652,7 +658,7 @@ $\blexer \; r \; s = \decode \; (\retrieve \; (\internalise \; r) \; (\mkeps \; (r \backslash s) )) \; r$ \end{lemma} \noindent -$\retrieve$ allows free navigation on the diagram \ref{InjFigure} for annotated regexes of $\blexer$. +$\retrieve$ allows free navigation on the diagram \ref{InjFigure} for annotated regular expressiones of $\blexer$. For plain regular expressions something similar is required as well. \subsection{$\flex$} diff -r 57e33978e55d -r 3cbcd7cda0a9 ChengsongTanPhdThesis/Chapters/Finite.tex --- a/ChengsongTanPhdThesis/Chapters/Finite.tex Tue Jul 05 00:42:06 2022 +0100 +++ b/ChengsongTanPhdThesis/Chapters/Finite.tex Wed Jul 13 08:27:28 2022 +0100 @@ -209,7 +209,7 @@ With $\rrexp$ the size caclulation of annotated regular expressions' simplification and derivatives can be done by the size of their unlifted counterpart with the unlifted version of simplification and derivatives applied. -\begin{lemma} +\begin{lemma}\label{sizeRelations} The following equalities hold: \begin{itemize} \item @@ -1244,8 +1244,35 @@ \end{corollary} \noindent \subsection{Closed Forms for Star Regular Expressions} -We use a similar technique as $r_1 \cdot r_2$ case, -generating +We have shown how to control the size of the sequence regular expression $r_1\cdot r_2$ using +the "closed form" of $(r_1 \cdot r_2) \backslash s$ and then +the property of the $\distinct$ function. +Now we try to get a bound on $r^* \backslash s$ as well. +Again, we first look at how a star's derivatives evolve, if they grow maximally: +\begin{center} + +$r^* \quad \longrightarrow_{\backslash c} \quad (r\backslash c) \cdot r^* \quad \longrightarrow_{\backslash c'} \quad +r \backslash cc' \cdot r^* + r \backslash c' \cdot r^* \quad \longrightarrow_{\backslash c''} \quad +(r_1 \backslash cc'c'' \cdot r^* + r \backslash c'') + (r \backslash c'c'' \cdot r^* + r \backslash c'' \cdot r^*) \quad \longrightarrow_{\backslash c'''} +\quad \ldots$ + +\end{center} +When we have a string $s = c :: c' :: c'' \ldots$ such that $r \backslash c$, $r \backslash cc'$, $r \backslash c'$, +$r \backslash cc'c''$, $r \backslash c'c''$, $r\backslash c''$ etc. are all nullable, +the number of terms in $r^* \backslash s$ will grow exponentially, causing the size +of the derivatives $r^* \backslash s$ to grow exponentially, even if we do not +count the possible size explosions of $r \backslash c$ themselves. + +Thanks to $\rflts$ and $\rDistinct$, we are able to open up regexes like +$(r_1 \backslash cc'c'' \cdot r^* + r \backslash c'') + +(r \backslash c'c'' \cdot r^* + r \backslash c'' \cdot r^*) $ +into $\RALTS{[r_1 \backslash cc'c'' \cdot r^*, r \backslash c'', +r \backslash c'c'' \cdot r^*, r \backslash c'' \cdot r^*]}$ +and then de-duplicate terms of the form $r\backslash s' \cdot r^*$ ($s'$ being a substring of $s$). +This allows us to use a similar technique as $r_1 \cdot r_2$ case, +where the crux is to get an equivalent form of +$\rderssimp{r^*}{s}$ with shape $\rsimp{\sum rs}$. +This requires generating all possible sub-strings $s'$ of $s$ such that $r\backslash s' \cdot r^*$ will appear as a term in $(r^*) \backslash s$. @@ -1304,6 +1331,68 @@ \end{center} \noindent %MAYBE TODO: introduce createdByStar +Again these definitions are tailor-made for dealing with alternatives that have +originated from a star's derivatives, so we do not attempt to open up all possible +regexes of the form $\RALTS{rs}$, where $\textit{rs}$ might not contain precisely 2 +elements. +We give a predicate for such "star-created" regular expressions: +\begin{center} +\begin{tabular}{lcr} + & & $\createdByStar{(\RSEQ{ra}{\RSTAR{rb}}) }$\\ + $\createdByStar{r_1} \land \createdByStar{r_2} $ & $ \Longrightarrow$ & $\createdByStar{(r_1 + r_2)}$ + \end{tabular} + \end{center} + + These definitions allows us the flexibility to talk about + regular expressions in their most convenient format, + for example, flattened out $\RALTS{[r_1, r_2, \ldots, r_n]} $ + instead of binary-nested: $((r_1 + r_2) + (r_3 + r_4)) + \ldots$. + These definitions help express that certain classes of syntatically + distinct regular expressions are actually the same under simplification. + This is not entirely true for annotated regular expressions: + %TODO: bsimp bders \neq bderssimp + \begin{center} + $(1+ (c\cdot \ASEQ{bs}{c^*}{c} ))$ + \end{center} + For bit-codes, the order in which simplification is applied + might cause a difference in the location they are placed. + If we want something like + \begin{center} + $\bderssimp{r}{s} \myequiv \bsimp{\bders{r}{s}}$ + \end{center} + Some "canonicalization" procedure is required, + which either pushes all the common bitcodes to nodes + as senior as possible: + \begin{center} + $_{bs}(_{bs_1 @ bs'}r_1 + _{bs_1 @ bs''}r_2) \rightarrow _{bs @ bs_1}(_{bs'}r_1 + _{bs''}r_2) $ + \end{center} + or does the reverse. However bitcodes are not of interest if we are talking about + the $\llbracket r \rrbracket$ size of a regex. + Therefore for the ease and simplicity of producing a + proof for a size bound, we are happy to restrict ourselves to + unannotated regular expressions, and obtain such equalities as + \begin{lemma} + $\rsimp{r_1 + r_2} = \rsimp{\RALTS{\hflataux{r_1} @ \hflataux{r_2}}}$ + \end{lemma} + + \begin{proof} + By using the rewriting relation $\rightsquigarrow$ + \end{proof} + %TODO: rsimp sflat +And from this we obtain a proof that a star's derivative will be the same +as if it had all its nested alternatives created during deriving being flattened out: + For example, + \begin{lemma} + $\createdByStar{r} \implies \rsimp{r} = \rsimp{\RALTS{\hflataux{r}}}$ + \end{lemma} + \begin{proof} + By structural induction on $r$, where the induction rules are these of $\createdByStar{_}$. + \end{proof} +% The simplification of a flattened out regular expression, provided it comes +%from the derivative of a star, is the same as the one nested. + + + We first introduce an inductive property for $\starupdate$ and $\hflataux{\_}$, it says if we do derivatives of $r^*$ @@ -1516,7 +1605,7 @@ We reason similarly for $\STAR$. The inductive hypothesis is $\exists N. \forall s. \; \llbracket \rderssimp{r}{s} \rrbracket \leq N$. -Let $\n_r = \llbracket r^* \rrbracket_r$. +Let $n_r = \llbracket r^* \rrbracket_r$. When $s = c :: cs$ is not empty, \begin{center} \begin{tabular}{lcll} @@ -1572,6 +1661,13 @@ (4), (8), and (12) are all the inductive cases proven. \end{proof} + +\begin{corollary} +For any regex $a$, $\exists N_r. \forall s. \; \rsize{\bderssimp{a}{s}} \leq N_r$ +\end{corollary} +\begin{proof} + By \ref{sizeRelations}. +\end{proof} \noindent %----------------------------------- @@ -1891,105 +1987,6 @@ %---------------------------------------------------------------------------------------- % SECTION 4 %---------------------------------------------------------------------------------------- -\section{A Bound for the Star Regular Expression} -We have shown how to control the size of the sequence regular expression $r_1\cdot r_2$ using -the "closed form" of $(r_1 \cdot r_2) \backslash s$ and then -the property of the $\distinct$ function. -Now we try to get a bound on $r^* \backslash s$ as well. -Again, we first look at how a star's derivatives evolve, if they grow maximally: -\begin{center} - -$r^* \quad \longrightarrow_{\backslash c} \quad (r\backslash c) \cdot r^* \quad \longrightarrow_{\backslash c'} \quad -r \backslash cc' \cdot r^* + r \backslash c' \cdot r^* \quad \longrightarrow_{\backslash c''} \quad -(r_1 \backslash cc'c'' \cdot r^* + r \backslash c'') + (r \backslash c'c'' \cdot r^* + r \backslash c'' \cdot r^*) \quad \longrightarrow_{\backslash c'''} -\quad \ldots$ - -\end{center} -When we have a string $s = c :: c' :: c'' \ldots$ such that $r \backslash c$, $r \backslash cc'$, $r \backslash c'$, -$r \backslash cc'c''$, $r \backslash c'c''$, $r\backslash c''$ etc. are all nullable, -the number of terms in $r^* \backslash s$ will grow exponentially, causing the size -of the derivatives $r^* \backslash s$ to grow exponentially, even if we do not -count the possible size explosions of $r \backslash c$ themselves. - -Thanks to $\flts$ and $\distinctWith$, we are able to open up regexes like -$(r_1 \backslash cc'c'' \cdot r^* + r \backslash c'') + (r \backslash c'c'' \cdot r^* + r \backslash c'' \cdot r^*) $ -into $\RALTS{[r_1 \backslash cc'c'' \cdot r^*, r \backslash c'', r \backslash c'c'' \cdot r^*, r \backslash c'' \cdot r^*]}$ -and then de-duplicate terms of the form $r\backslash s' \cdot r^*$ ($s'$ being a substring of $s$). -For this we define $\hflataux{\_}$ and $\hflat{\_}$, similar to $\sflataux{\_}$ and $\sflat{\_}$: -%TODO: definitions of and \hflataux \hflat - \begin{center} - \begin{tabular}{ccc} - $\hflataux{r_1 + r_2}$ & $=$ & $\hflataux{r_1} @ \hflataux{r_2}$\\ -$\hflataux r$ & $=$ & $ [r]$ -\end{tabular} -\end{center} - - \begin{center} - \begin{tabular}{ccc} - $\hflat{r_1 + r_2}$ & $=$ & $\RALTS{\hflataux{r_1} @ \hflataux{r_2}}$\\ -$\hflat r$ & $=$ & $ r$ -\end{tabular} -\end{center}s -Again these definitions are tailor-made for dealing with alternatives that have -originated from a star's derivatives, so we don't attempt to open up all possible -regexes of the form $\RALTS{rs}$, where $\textit{rs}$ might not contain precisely 2 -elements. -We give a predicate for such "star-created" regular expressions: -\begin{center} -\begin{tabular}{lcr} - & & $\createdByStar{(\RSEQ{ra}{\RSTAR{rb}}) }$\\ - $\createdByStar{r_1} \land \createdByStar{r_2} $ & $ \Longrightarrow$ & $\createdByStar{(r_1 + r_2)}$ - \end{tabular} - \end{center} - - These definitions allows us the flexibility to talk about - regular expressions in their most convenient format, - for example, flattened out $\RALTS{[r_1, r_2, \ldots, r_n]} $ - instead of binary-nested: $((r_1 + r_2) + (r_3 + r_4)) + \ldots$. - These definitions help express that certain classes of syntatically - distinct regular expressions are actually the same under simplification. - This is not entirely true for annotated regular expressions: - %TODO: bsimp bders \neq bderssimp - \begin{center} - $(1+ (c\cdot \ASEQ{bs}{c^*}{c} ))$ - \end{center} - For bit-codes, the order in which simplification is applied - might cause a difference in the location they are placed. - If we want something like - \begin{center} - $\bderssimp{r}{s} \myequiv \bsimp{\bders{r}{s}}$ - \end{center} - Some "canonicalization" procedure is required, - which either pushes all the common bitcodes to nodes - as senior as possible: - \begin{center} - $_{bs}(_{bs_1 @ bs'}r_1 + _{bs_1 @ bs''}r_2) \rightarrow _{bs @ bs_1}(_{bs'}r_1 + _{bs''}r_2) $ - \end{center} - or does the reverse. However bitcodes are not of interest if we are talking about - the $\llbracket r \rrbracket$ size of a regex. - Therefore for the ease and simplicity of producing a - proof for a size bound, we are happy to restrict ourselves to - unannotated regular expressions, and obtain such equalities as - \begin{lemma} - $\rsimp{r_1 + r_2} = \rsimp{\RALTS{\hflataux{r_1} @ \hflataux{r_2}}}$ - \end{lemma} - - \begin{proof} - By using the rewriting relation $\rightsquigarrow$ - \end{proof} - %TODO: rsimp sflat -And from this we obtain a proof that a star's derivative will be the same -as if it had all its nested alternatives created during deriving being flattened out: - For example, - \begin{lemma} - $\createdByStar{r} \implies \rsimp{r} = \rsimp{\RALTS{\hflataux{r}}}$ - \end{lemma} - \begin{proof} - By structural induction on $r$, where the induction rules are these of $\createdByStar{_}$. - \end{proof} -% The simplification of a flattened out regular expression, provided it comes -%from the derivative of a star, is the same as the one nested. - diff -r 57e33978e55d -r 3cbcd7cda0a9 ChengsongTanPhdThesis/Chapters/Inj.tex --- a/ChengsongTanPhdThesis/Chapters/Inj.tex Tue Jul 05 00:42:06 2022 +0100 +++ b/ChengsongTanPhdThesis/Chapters/Inj.tex Wed Jul 13 08:27:28 2022 +0100 @@ -4,22 +4,32 @@ \label{Inj} % In chapter 2 \ref{Chapter2} we will introduce the concepts %and notations we -%use for describing the lexing algorithm by Sulzmann and Lu, -%and then give the algorithm and its variant, and discuss +% used for describing the lexing algorithm by Sulzmann and Lu, +%and then give the algorithm and its variant and discuss %why more aggressive simplifications are needed. In this chapter, we define the basic notions for regular languages and regular expressions. -This is essentially a description in "English" -of your formalisation in Isabelle/HOL. -We also give the definition of what $\POSIX$ lexing means. +This is essentially a description in ``English" +of our formalisation in Isabelle/HOL. +We also give the definition of what $\POSIX$ lexing means, +followed by an algorithm by Sulzmanna and Lu\parencite{Sulzmann2014} +that produces the output conforming +to the $\POSIX$ standard. +It is also worth mentioning that +we choose to use the ML-style notation +for function applications, where +the parameters of a function is not enclosed +inside a pair of parentheses (e.g. $f \;x \;y$ +instead of $f(x,\;y)$). This is mainly +to make the text visually more concise. \section{Basic Concepts} -Usually formal language theory starts with an alphabet +Usually, formal language theory starts with an alphabet denoting a set of characters. Here we just use the datatype of characters from Isabelle, which roughly corresponds to the ASCII characters. -In what follows we shall leave the information about the alphabet +In what follows, we shall leave the information about the alphabet implicit. Then using the usual bracket notation for lists, we can define strings made up of characters: @@ -30,7 +40,7 @@ \end{center} Where $c$ is a variable ranging over characters. Strings can be concatenated to form longer strings in the same -way as we concatenate two lists, which we write as @. +way as we concatenate two lists, which we shall write as $s_1 @ s_2$. We omit the precise recursive definition here. We overload this concatenation operator for two sets of strings: @@ -48,8 +58,8 @@ $A^{n+1}$ & $\dn$ & $A @ A^n$ \end{tabular} \end{center} -The union of all the natural number powers of a language -is usually defined as the Kleene star operator: +The union of all powers of a language +can be used to define the Kleene star operator: \begin{center} \begin{tabular}{lcl} $A*$ & $\dn$ & $\bigcup_{i \geq 0} A^i$ \\ @@ -57,19 +67,19 @@ \end{center} \noindent -However, to obtain a convenient induction principle +However, to obtain a more convenient induction principle in Isabelle/HOL, we instead define the Kleene star as an inductive set: \begin{center} \begin{mathpar} -\inferrule{}{[] \in A*\\} + \inferrule{\mbox{}}{[] \in A*\\} -\inferrule{\\s_1 \in A \land \; s_2 \in A*}{s_1 @ s_2 \in A*} +\inferrule{s_1 \in A \;\; s_2 \in A*}{s_1 @ s_2 \in A*} \end{mathpar} \end{center} -\ChristianComment{Yes, used the inferrule command in mathpar} +\noindent We also define an operation of "chopping off" a character from a language, which we call $\Der$, meaning \emph{Derivative} (for a language): \begin{center} @@ -89,10 +99,10 @@ which is essentially the left quotient $A \backslash L$ of $A$ against the singleton language with $L = \{w\}$ in formal language theory. -However for the purposes here, the $\textit{Ders}$ definition with +However, for the purposes here, the $\textit{Ders}$ definition with a single string is sufficient. -With the sequencing, Kleene star, and $\textit{Der}$ operator on languages, +With the sequencing, Kleene star, and $\textit{Der}$ operator on languages, we have a few properties of how the language derivative can be defined using sub-languages. \begin{lemma} @@ -113,26 +123,28 @@ $\textit{Der} \;c \;(A*) = (\textit{Der}\; c A) @ (A*)$\\ \end{lemma} \begin{proof} +There are too inclusions to prove: \begin{itemize} -\item{$\subseteq$} -\noindent +\item{$\subseteq$}:\\ The set \[ \{s \mid c :: s \in A*\} \] is enclosed in the set -\[ \{s_1 @ s_2 \mid s_1 \, s_2. s_1 \in \{s \mid c :: s \in A\} \land s_2 \in A* \} \] +\[ \{s_1 @ s_2 \mid s_1 \, s_2.\; s_1 \in \{s \mid c :: s \in A\} \land s_2 \in A* \} \] because whenever you have a string starting with a character in the language of a Kleene star $A*$, then that character together with some sub-string immediately after it will form the first iteration, and the rest of the string will be still in $A*$. -\item{$\supseteq$} +\item{$\supseteq$}:\\ Note that \[ \Der \; c \; (A*) = \Der \; c \; (\{ [] \} \cup (A @ A*) ) \] -and +hold. +Also this holds: \[ \Der \; c \; (\{ [] \} \cup (A @ A*) ) = \Der\; c \; (A @ A*) \] -where the $\textit{RHS}$ of the above equatioin can be rewritten -as \[ (\Der \; c\; A) @ A* \cup A' \], $A'$ being a possibly empty set. +where the $\textit{RHS}$ can be rewritten +as \[ (\Der \; c\; A) @ A* \cup (\Der \; c \; (A*)) \] +which of course contains $\Der \; c \; A @ A*$. \end{itemize} \end{proof} @@ -141,7 +153,7 @@ for regular languages, we need to first give definitions for regular expressions. \subsection{Regular Expressions and Their Meaning} -The basic regular expressions are defined inductively +The \emph{basic regular expressions} are defined inductively by the following grammar: \[ r ::= \ZERO \mid \ONE \mid c @@ -150,81 +162,127 @@ \mid r^* \] \noindent -We call them basic because we might introduce -more constructs later such as negation +We call them basic because we will introduce +additional constructors in later chapters such as negation and bounded repetitions. -We defined the regular expression containing -nothing as $\ZERO$, note that some authors -also use $\phi$ for that. -Similarly, the regular expression denoting the -singleton set with only $[]$ is sometimes -denoted by $\epsilon$, but we use $\ONE$ here. - -The language or set of strings denoted -by regular expressions are defined as +We use $\ZERO$ for the regular expression that +matches no string, and $\ONE$ for the regular +expression that matches only the empty string\footnote{ +some authors +also use $\phi$ and $\epsilon$ for $\ZERO$ and $\ONE$ +but we prefer our notation}. +The sequence regular expression is written $r_1\cdot r_2$ +and sometimes we omit the dot if it is clear which +regular expression is meant; the alternative +is written $r_1 + r_2$. +The \emph{language} or meaning of +a regular expression is defined recursively as +a set of strings: %TODO: FILL in the other defs \begin{center} \begin{tabular}{lcl} -$L \; (\ZERO)$ & $\dn$ & $\phi$\\ -$L \; (\ONE)$ & $\dn$ & $\{[]\}$\\ -$L \; (c)$ & $\dn$ & $\{[c]\}$\\ -$L \; (r_1 + r_2)$ & $\dn$ & $ L \; (r_1) \cup L \; ( r_2)$\\ -$L \; (r_1 \cdot r_2)$ & $\dn$ & $ L \; (r_1) @ L \; (r_2)$\\ -$L \; (r^*)$ & $\dn$ & $ (L(r))^*$ +$L \; \ZERO$ & $\dn$ & $\phi$\\ +$L \; \ONE$ & $\dn$ & $\{[]\}$\\ +$L \; c$ & $\dn$ & $\{[c]\}$\\ +$L \; r_1 + r_2$ & $\dn$ & $ L \; r_1 \cup L \; r_2$\\ +$L \; r_1 \cdot r_2$ & $\dn$ & $ L \; r_1 @ L \; r_2$\\ +$L \; r^*$ & $\dn$ & $ (L\;r)*$ \end{tabular} \end{center} \noindent -Which is also called the "language interpretation" of -a regular expression. - Now with semantic derivatives of a language and regular expressions and their language interpretations in place, we are ready to define derivatives on regexes. \subsection{Brzozowski Derivatives and a Regular Expression Matcher} - -\ChristianComment{Hi this part I want to keep the ordering as is, so that it keeps the -readers engaged with a story how we got to the definition of $\backslash$, rather -than first "overwhelming" them with the definition of $\nullable$.} - -The language derivative acts on a string set and chops off a character from -all strings in that set, we want to define a derivative operation on regular expressions -so that after derivative $L(r\backslash c)$ -will look as if it was obtained by doing a language derivative on $L(r)$: +%Recall, the language derivative acts on a set of strings +%and essentially chops off a particular character from +%all strings in that set, Brzozowski defined a derivative operation on regular expressions +%so that after derivative $L(r\backslash c)$ +%will look as if it was obtained by doing a language derivative on $L(r)$: +Recall that the semantic derivative acts on a set of +strings. Brzozowski noticed that this operation +can be ``mirrored" on regular expressions which +he calls the derivative of a regular expression $r$ +with respect to a character $c$, written +$r \backslash c$. +He defined this operation such that the following property holds: \begin{center} -\[ -r\backslash c \dn ? -\] -so that -\[ -L(r \backslash c) = \Der \; c \; L(r) ? -\] -\end{center} -So we mimic the equalities we have for $\Der$ on language concatenation \[ -\Der \; c \; (A @ B) = \textit{if} \; [] \in A \; \textit{then} ((\Der \; c \; A) @ B ) \cup \Der \; c\; B \quad \textit{else}\; (\Der \; c \; A) @ B\\ +L(r \backslash c) = \Der \; c \; L(r) \] -to get the derivative for sequence regular expressions: -\[ -(r_1 \cdot r_2 ) \backslash c = \textit{if}\,([] \in L(r_1)) r_1 \backslash c \cdot r_2 + r_2 \backslash c \textit{else} (r_1 \backslash c) \cdot r_2 -\] +\end{center} +\noindent +For example in the sequence case we have +\begin{center} + \begin{tabular}{lcl} + $\Der \; c \; (A @ B)$ & $\dn$ & + $ \textit{if} \;\, [] \in A \; + \textit{then} \;\, ((\Der \; c \; A) @ B ) \cup + \Der \; c\; B$\\ + & & $\textit{else}\; (\Der \; c \; A) @ B$\\ + \end{tabular} +\end{center} +\noindent +This can be translated to +regular expressions in the following +manner: +\begin{center} + \begin{tabular}{lcl} + $(r_1 \cdot r_2 ) \backslash c$ & $\dn$ & $\textit{if}\;\,([] \in L(r_1)) r_1 \backslash c \cdot r_2 + r_2 \backslash c$ \\ + & & $\textit{else} \; (r_1 \backslash c) \cdot r_2$ + \end{tabular} +\end{center} \noindent -and language Kleene star: +And similarly, the Kleene star's semantic derivative +can be expressed as \[ -\textit{Der} \;c \;A* = (\textit{Der}\; c A) @ (A*) + \textit{Der} \;c \;(A*) \dn (\textit{Der}\; c A) @ (A*) \] -to get derivative of the Kleene star regular expression: +which translates to \[ -r^* \backslash c = (r \backslash c)\cdot r^* + (r^*) \backslash c \dn (r \backslash c)\cdot r^*. \] -Note that although we can formalise the boolean predicate -$[] \in L(r_1)$ without problems, if we want a function that works -computationally, then we would have to define a function that tests -whether an empty string is in the language of a regular expression. -We call such a function $\nullable$: - - - +In the above definition of $(r_1\cdot r_2) \backslash c$, +the $\textit{if}$ clause's +boolean condition +$[] \in L(r_1)$ needs to be +somehow recursively computed. +We call such a function that checks +whether the empty string $[]$ is +in the language of a regular expression $\nullable$: +\begin{center} + \begin{tabular}{lcl} + $\nullable(\ZERO)$ & $\dn$ & $\mathit{false}$ \\ + $\nullable(\ONE)$ & $\dn$ & $\mathit{true}$ \\ + $\nullable(c)$ & $\dn$ & $\mathit{false}$ \\ + $\nullable(r_1 + r_2)$ & $\dn$ & $\nullable(r_1) \vee \nullable(r_2)$ \\ + $\nullable(r_1\cdot r_2)$ & $\dn$ & $\nullable(r_1) \wedge \nullable(r_2)$ \\ + $\nullable(r^*)$ & $\dn$ & $\mathit{true}$ \\ + \end{tabular} +\end{center} +\noindent +The $\ZERO$ regular expression +does not contain any string and +therefore is not \emph{nullable}. +$\ONE$ is \emph{nullable} +by definition. +The character regular expression $c$ +corresponds to the singleton set $\{c\}$, +and therefore does not contain the empty string. +The alternative regular expression is nullable +if at least one of its children is nullable. +The sequence regular expression +would require both children to have the empty string +to compose an empty string, and the Kleene star +is always nullable because it naturally +contains the empty string. + +The derivative function, written $r\backslash c$, +defines how a regular expression evolves into +a new one after all the string it contains is acted on: +if it starts with $c$, then the character is chopped of, +if not, that string is removed. \begin{center} \begin{tabular}{lcl} $\ZERO \backslash c$ & $\dn$ & $\ZERO$\\ @@ -239,67 +297,32 @@ \end{tabular} \end{center} \noindent -The function derivative, written $r\backslash c$, -defines how a regular expression evolves into -a new regular expression after all the string it contains -is chopped off a certain head character $c$. -The most involved cases are the sequence -and star case. +The most involved cases are the sequence case +and the star case. The sequence case says that if the first regular expression -contains an empty string then the second component of the sequence -might be chosen as the target regular expression to be chopped -off its head character. -The star regular expression's derivative unwraps the iteration of -regular expression and attaches the star regular expression -to the sequence's second element to make sure a copy is retained -for possible more iterations in later phases of lexing. - - -To test whether $[] \in L(r_1)$, we need the $\nullable$ function, -which tests whether the empty string $""$ -is in the language of $r$: - - -\begin{center} - \begin{tabular}{lcl} - $\nullable(\ZERO)$ & $\dn$ & $\mathit{false}$ \\ - $\nullable(\ONE)$ & $\dn$ & $\mathit{true}$ \\ - $\nullable(c)$ & $\dn$ & $\mathit{false}$ \\ - $\nullable(r_1 + r_2)$ & $\dn$ & $\nullable(r_1) \vee \nullable(r_2)$ \\ - $\nullable(r_1\cdot r_2)$ & $\dn$ & $\nullable(r_1) \wedge \nullable(r_2)$ \\ - $\nullable(r^*)$ & $\dn$ & $\mathit{true}$ \\ - \end{tabular} -\end{center} -\noindent -The empty set does not contain any string and -therefore not the empty string, the empty string -regular expression contains the empty string -by definition, the character regular expression -is the singleton that contains character only, -and therefore does not contain the empty string, -the alternative regular expression (or "or" expression) -might have one of its children regular expressions -being nullable and any one of its children being nullable -would suffice. The sequence regular expression -would require both children to have the empty string -to compose an empty string and the Kleene star -operation naturally introduced the empty string. - -We have the following property where the derivative on regular -expressions coincides with the derivative on a set of strings: - -\begin{lemma} +contains an empty string, then the second component of the sequence +needs to be considered, as its derivative will contribute to the +result of this derivative. +The star regular expression $r^*$'s derivative +unwraps one iteration of $r$, turns it into $r\backslash c$, +and attaches the original $r^*$ +after $r\backslash c$, so that +we can further unfold it as many times as needed. +We have the following correspondence between +derivatives on regular expressions and +derivatives on a set of strings: +\begin{lemma}\label{derDer} $\textit{Der} \; c \; L(r) = L (r\backslash c)$ \end{lemma} \noindent The main property of the derivative operation -that enables us to reason about the correctness of -an algorithm using derivatives is +(that enables us to reason about the correctness of +derivative-based matching) +is \begin{lemma}\label{derStepwise} -$c\!::\!s \in L(r)$ holds -if and only if $s \in L(r\backslash c)$. + $c\!::\!s \in L(r)$ \textit{iff} $s \in L(r\backslash c)$. \end{lemma} \noindent @@ -314,12 +337,14 @@ \end{center} \noindent -When there is no ambiguity we will use $\backslash$ to denote +When there is no ambiguity, we will +omit the subscript and use $\backslash$ instead +of $\backslash_r$ to denote string derivatives for brevity. Brzozowski's regular-expression matcher algorithm can then be described as: \begin{definition} -$\textit{match}\;s\;r \;\dn\; \nullable(r\backslash s)$ +$\textit{match}\;s\;r \;\dn\; \nullable \; (r\backslash s)$ \end{definition} \noindent @@ -336,17 +361,15 @@ It can be relatively easily shown that this matcher is correct: \begin{lemma} -$\textit{match} \; s\; r = \textit{true} \Longleftrightarrow s \in L(r)$ + $\textit{match} \; s\; r = \textit{true} \; \textit{iff} \; s \in L(r)$ \end{lemma} \begin{proof} -By the stepwise property of $\backslash$ (\ref{derStepwise}) + By the stepwise property of derivatives (lemma \ref{derStepwise}) + and lemma \ref{derDer}. \end{proof} \noindent -If we implement the above algorithm naively, however, -the algorithm can be excruciatingly slow. - - -\begin{figure} +\begin{center} + \begin{figure} \begin{tikzpicture} \begin{axis}[ xlabel={$n$}, @@ -360,84 +383,93 @@ \end{tikzpicture} \caption{Matching $(a^*)^*b$ against $\protect\underbrace{aa\ldots a}_\text{n \textit{a}s}$}\label{NaiveMatcher} \end{figure} - +\end{center} \noindent -For this we need to introduce certain +If we implement the above algorithm naively, however, +the algorithm can be excruciatingly slow, as shown in +\ref{NaiveMatcher}. +Note that both axes are in logarithmic scale. +Around two dozens characters +would already explode the matcher on regular expression +$(a^*)^*b$. +For this, we need to introduce certain rewrite rules for the intermediate results, such as $r + r \rightarrow r$, and make sure those rules do not change the language of the regular expression. -We have a simplification function (that is as simple as possible -while having much power on making a regex simpler): -\begin{verbatim} -def simp(r: Rexp) : Rexp = r match { - case SEQ(r1, r2) => - (simp(r1), simp(r2)) match { - case (ZERO, _) => ZERO - case (_, ZERO) => ZERO - case (ONE, r2s) => r2s - case (r1s, ONE) => r1s - case (r1s, r2s) => SEQ(r1s, r2s) - } - case ALTS(r1, r2) => { - (simp(r1), simp(r2)) match { - case (ZERO, r2s) => r2s - case (r1s, ZERO) => r1s - case (r1s, r2s) => - if(r1s == r2s) r1s else ALTS(r1s, r2s) - } - } - case r => r -} -\end{verbatim} -If we repeatedly incorporate these -rules during the matching algorithm, -we have a lexer with simplification: -\begin{verbatim} -def ders_simp(s: List[Char], r: Rexp) : Rexp = s match { - case Nil => simp(r) - case c :: cs => ders_simp(cs, simp(der(c, r))) -} - -def simp_matcher(s: String, r: Rexp) : Boolean = - nullable(ders_simp(s.toList, r)) - -\end{verbatim} -\noindent -After putting in those rules, the example of \ref{NaiveMatcher} -is now very tame in the length of inputs: - - +One simpled-minded simplification function +that achieves these requirements is given below: +\begin{center} + \begin{tabular}{lcl} + $\simp \; r_1 \cdot r_2 $ & $ \dn$ & + $(\simp \; r_1, \simp \; r_2) \; \textit{match}$\\ + & & $\quad \case \; (\ZERO, \_) \Rightarrow \ZERO$\\ + & & $\quad \case \; (\_, \ZERO) \Rightarrow \ZERO$\\ + & & $\quad \case \; (\ONE, r_2') \Rightarrow r_2'$\\ + & & $\quad \case \; (r_1', \ONE) \Rightarrow r_1'$\\ + & & $\quad \case \; (r_1', r_2') \Rightarrow r_1'\cdot r_2'$\\ + $\simp \; r_1 + r_2$ & $\dn$ & $(\simp \; r_1, \simp \; r_2) \textit{match}$\\ + & & $\quad \; \case \; (\ZERO, r_2') \Rightarrow r_2'$\\ + & & $\quad \; \case \; (r_1', \ZERO) \Rightarrow r_1'$\\ + & & $\quad \; \case \; (r_1', r_2') \Rightarrow r_1' + r_2'$\\ + $\simp \; r$ & $\dn$ & $r$ + + \end{tabular} +\end{center} +If we repeatedly apply this simplification +function during the matching algorithm, +we have a matcher with simplification: +\begin{center} + \begin{tabular}{lcl} + $\derssimp \; [] \; r$ & $\dn$ & $r$\\ + $\derssimp \; c :: cs \; r$ & $\dn$ & $\derssimp \; cs \; (\simp \; (r \backslash c))$\\ + $\textit{matcher}_{simp}\; s \; r $ & $\dn$ & $\nullable \; (\derssimp \; s\;r)$ + \end{tabular} +\end{center} +\begin{figure} \begin{tikzpicture} \begin{axis}[ xlabel={$n$}, ylabel={time in secs}, ymode = log, xmode = log, + grid = both, legend entries={Matcher With Simp}, legend pos=north west, legend cell align=left] \addplot[red,mark=*, mark options={fill=white}] table {BetterMatcher.data}; \end{axis} -\end{tikzpicture} \label{fig:BetterMatcher} - +\end{tikzpicture} +\caption{$(a^*)^*b$ +against +$\protect\underbrace{aa\ldots a}_\text{n \textit{a}s}$ Using $\textit{matcher}_{simp}$}\label{BetterMatcher} +\end{figure} +\noindent +The running time of $\textit{ders}\_\textit{simp}$ +on the same example of \ref{NaiveMatcher} +is now very tame in terms of the length of inputs, +as shown in \ref{BetterMatcher}. -\noindent -Note how the x-axis is in logarithmic scale. Building derivatives and then testing the existence of empty string in the resulting regular expression's language, -and add simplification rules when necessary. +adding simplifications when necessary. So far, so good. But what if we want to do lexing instead of just getting a YES/NO answer? -\citeauthor{Sulzmann2014} first came up with a nice and +Sulzmanna and Lu \cite{Sulzmann2014} first came up with a nice and elegant (arguably as beautiful as the definition of the original derivative) solution for this. \section{Values and the Lexing Algorithm by Sulzmann and Lu} -Here we present the hybrid phases of a regular expression lexing -algorithm using the function $\inj$, as given by Sulzmann and Lu. -They first defined the datatypes for storing the -lexing information called a \emph{value} or -sometimes also \emph{lexical value}. These values and regular +In this section, we present a two-phase regular expression lexing +algorithm. +The first phase takes successive derivatives with +respect to the input string, +and the second phase does the reverse, \emph{injecting} back +characters, in the meantime constructing a lexing result. +We will introduce the injection phase in detail slightly +later, but as a preliminary we have to first define +the datatype for lexing results, +called \emph{value} or +sometimes also \emph{lexical value}. Values and regular expressions correspond to each other as illustrated in the following table: @@ -466,56 +498,116 @@ \end{tabular} \end{tabular} \end{center} - \noindent -We have a formal binary relation for telling whether the structure -of a regular expression agrees with the value. +A value has an underlying string, which +can be calculated by the ``flatten" function $|\_|$: +\begin{center} + \begin{tabular}{lcl} + $|\Empty|$ & $\dn$ & $[]$\\ + $|\Char \; c|$ & $ \dn$ & $ [c]$\\ + $|\Seq(v_1, v_2)|$ & $ \dn$ & $ v_1| @ |v_2|$\\ + $|\Left(v)|$ & $ \dn$ & $ |v|$\\ + $|\Right(v)|$ & $ \dn$ & $ |v|$\\ + $|\Stars([])|$ & $\dn$ & $[]$\\ + $|\Stars(v::vs)|$ & $\dn$ & $ |v| @ |\Stars(vs)|$ + \end{tabular} +\end{center} +Sulzmann and Lu used a binary predicate, written $\vdash v:r $, +to indicate that a value $v$ could be generated from a lexing algorithm +with input $r$. They call it the value inhabitation relation. \begin{mathpar} -\inferrule{}{\vdash \Char(c) : \mathbf{c}} \hspace{2em} -\inferrule{}{\vdash \Empty : \ONE} \hspace{2em} -\inferrule{\vdash v_1 : r_1 \\ \vdash v_2 : r_2 }{\vdash \Seq(v_1, v_2) : (r_1 \cdot r_2)} -\end{mathpar} + \inferrule{\mbox{}}{\vdash \Char(c) : \mathbf{c}} \hspace{2em} + + \inferrule{\mbox{}}{\vdash \Empty : \ONE} \hspace{2em} + +\inferrule{\vdash v_1 : r_1 \;\; \vdash v_2 : r_2 }{\vdash \Seq(v_1, v_2) : (r_1 \cdot r_2)} + +\inferrule{\vdash v_1 : r_1}{\vdash \Left(v_1):r_1+r_2} + +\inferrule{\vdash v_2 : r_2}{\vdash \Right(v_2):r_1 + r_2} -Building on top of Sulzmann and Lu's attempt to formalise the -notion of POSIX lexing rules \parencite{Sulzmann2014}, -Ausaf and Urban\parencite{AusafDyckhoffUrban2016} modelled -POSIX matching as a ternary relation recursively defined in a -natural deduction style. -The formal definition of a $\POSIX$ value $v$ for a regular expression +\inferrule{\forall v \in vs. \vdash v:r \land |v| \neq []}{\vdash \Stars(vs):r^*} +\end{mathpar} +\noindent +The condition $|v| \neq []$ in the premise of star's rule +is to make sure that for a given pair of regular +expression $r$ and string $s$, the number of values +satisfying $|v| = s$ and $\vdash v:r$ is finite. +Given the same string and regular expression, there can be +multiple values for it. For example, both +$\vdash \Seq(\Left \; ab)(\Right \; c):(ab+a)(bc+c)$ and +$\vdash \Seq(\Right\; a)(\Left \; bc ):(ab+a)(bc+c)$ hold +and the values both flatten to $abc$. +Lexers therefore have to disambiguate and choose only +one of the values to output. $\POSIX$ is one of the +disambiguation strategies that is widely adopted. + +Ausaf and Urban\parencite{AusafDyckhoffUrban2016} +formalised the property +as a ternary relation. +The $\POSIX$ value $v$ for a regular expression $r$ and string $s$, denoted as $(s, r) \rightarrow v$, can be specified -in the following set of rules: -\ChristianComment{Will complete later} -\newcommand*{\inference}[3][t]{% - \begingroup - \def\and{\\}% - \begin{tabular}[#1]{@{\enspace}c@{\enspace}} - #2 \\ - \hline - #3 - \end{tabular}% - \endgroup -} -\begin{center} -\inference{$s_1 @ s_2 = s$ \and $(\nexists s_3 s_4 s_5. s_1 @ s_5 = s_3 \land s_5 \neq [] \land s_3 @ s_4 = s \land (s_3, r_1) \rightarrow v_3 \land (s_4, r_2) \rightarrow v_4)$ \and $(s_1, r_1) \rightarrow v_1$ \and $(s_2, r_2) \rightarrow v_2$ }{$(s, r_1 \cdot r_2) \rightarrow \Seq(v_1, v_2)$ } -\end{center} +in the following set of rules\footnote{The names of the rules are used +as they were originally given in \cite{AusafDyckhoffUrban2016}}: +\noindent +\begin{figure} +\begin{mathpar} + \inferrule[P1]{\mbox{}}{([], \ONE) \rightarrow \Empty} + + \inferrule[PC]{\mbox{}}{([c], c) \rightarrow \Char \; c} + + \inferrule[P+L]{(s,r_1)\rightarrow v_1}{(s, r_1+r_2)\rightarrow \Left \; v_1} + + \inferrule[P+R]{(s,r_2)\rightarrow v_2\\ s \notin L \; r_1}{(s, r_1+r_2)\rightarrow \Right \; v_2} + + \inferrule[PS]{(s_1, v_1) \rightarrow r_1 \\ (s_2, v_2)\rightarrow r_2\\ + \nexists s_3 \; s_4. s_3 \neq [] \land s_3 @ s_4 = s_2 \land + s_1@ s_3 \in L \; r_1 \land s_4 \in L \; r_2}{(s_1 @ s_2, r_1\cdot r_2) \rightarrow + \Seq \; v_1 \; v_2} + + \inferrule[P{[]}]{\mbox{}}{([], r^*) \rightarrow \Stars([])} + + \inferrule[P*]{(s_1, v) \rightarrow v \\ (s_2, r^*) \rightarrow \Stars \; vs \\ + |v| \neq []\\ \nexists s_3 \; s_4. s_3 \neq [] \land s_3@s_4 = s_2 \land + s_1@s_3 \in L \; r \land s_4 \in L \; r^*}{(s_1@s_2, r^*)\rightarrow \Stars \; + (v::vs)} +\end{mathpar} +\caption{POSIX Lexing Rules} +\end{figure} \noindent -The above $\POSIX$ rules could be explained intuitionally as +The above $\POSIX$ rules follows the intuition described below: \begin{itemize} -\item -match the leftmost regular expression when multiple options of matching -are available -\item -always match a subpart as much as possible before proceeding -to the next token. + \item (Left Priority)\\ + Match the leftmost regular expression when multiple options of matching + are available. + \item (Maximum munch)\\ + Always match a subpart as much as possible before proceeding + to the next token. \end{itemize} - -The reason why we are interested in $\POSIX$ values is that they can -be practically used in the lexing phase of a compiler front end. +\noindent +These disambiguation strategies can be +quite practical. For instance, when lexing a code snippet -$\textit{iffoo} = 3$ with the regular expression $\textit{keyword} + \textit{identifier}$, we want $\textit{iffoo}$ to be recognized -as an identifier rather than a keyword. -\ChristianComment{Do I also introduce lexical values $LV$ here?} -We know that $\POSIX$ values are also part of the normal values: +\[ + \textit{iffoo} = 3 +\] +using the regular expression (with +keyword and identifier having their +usualy definitions on any formal +language textbook, for instance +keyword is a nonempty string starting with letters +followed by alphanumeric characters or underscores): +\[ + \textit{keyword} + \textit{identifier}, +\] +we want $\textit{iffoo}$ to be recognized +as an identifier rather than a keyword (if) +followed by +an identifier (foo). +POSIX lexing achieves this. + +We know that a $\POSIX$ value is also a normal underlying +value: \begin{lemma} $(r, s) \rightarrow v \implies \vdash v: r$ \end{lemma} @@ -527,10 +619,13 @@ $\textit{if} \,(s, r) \rightarrow v_1 \land (s, r) \rightarrow v_2\quad \textit{then} \; v_1 = v_2$ \end{lemma} \begin{proof} -By induction on $s$, $r$ and $v_1$. The induction principle is -the \POSIX rules. Each case is proven by a combination of -the induction rules for $\POSIX$ values and the inductive hypothesis. -Probably the most cumbersome cases are the sequence and star with non-empty iterations. +By induction on $s$, $r$ and $v_1$. The inductive cases +are all the POSIX rules. +Each case is proven by a combination of +the induction rules for $\POSIX$ +values and the inductive hypothesis. +Probably the most cumbersome cases are +the sequence and star with non-empty iterations. We give the reasoning about the sequence case as follows: When we have $(s_1, r_1) \rightarrow v_1$ and $(s_2, r_2) \rightarrow v_2$, @@ -544,15 +639,14 @@ which means this "different" $\POSIX$ value $\Seq(v_1', v_2')$ is the same as $\Seq(v_1, v_2)$. \end{proof} -Now we know what a $\POSIX$ value is, the problem is how do we achieve +Now we know what a $\POSIX$ value is; the problem is how do we achieve such a value in a lexing algorithm, using derivatives? \subsection{Sulzmann and Lu's Injection-based Lexing Algorithm} The contribution of Sulzmann and Lu is an extension of Brzozowski's algorithm by a second phase (the first phase being building successive -derivatives---see \ref{graph:successive_ders}). In this second phase, a POSIX value -is generated if the regular expression matches the string. +derivatives---see \ref{graph:successive_ders}). This second phase generates a POSIX value if the regular expression matches the string. Two functions are involved: $\inj$ and $\mkeps$. The function $\mkeps$ constructs a value from the last one of all the successive derivatives: @@ -570,13 +664,13 @@ \begin{center} \begin{tabular}{lcl} - $\mkeps(\ONE)$ & $\dn$ & $\Empty$ \\ - $\mkeps(r_{1}+r_{2})$ & $\dn$ - & \textit{if} $\nullable(r_{1})$\\ - & & \textit{then} $\Left(\mkeps(r_{1}))$\\ - & & \textit{else} $\Right(\mkeps(r_{2}))$\\ - $\mkeps(r_1\cdot r_2)$ & $\dn$ & $\Seq\,(\mkeps\,r_1)\,(\mkeps\,r_2)$\\ - $mkeps(r^*)$ & $\dn$ & $\Stars\,[]$ + $\mkeps \; \ONE$ & $\dn$ & $\Empty$ \\ + $\mkeps \; r_{1}+r_{2}$ & $\dn$ + & \textit{if} ($\nullable \; r_{1})$\\ + & & \textit{then} $\Left (\mkeps \; r_{1})$\\ + & & \textit{else} $\Right(\mkeps \; r_{2})$\\ + $\mkeps \; r_1\cdot r_2$ & $\dn$ & $\Seq\;(\mkeps\;r_1)\;(\mkeps \; r_2)$\\ + $\mkeps \; r^* $ & $\dn$ & $\Stars\;[]$ \end{tabular} \end{center} @@ -669,9 +763,10 @@ The central property of the $\lexer$ is that it gives the correct result by $\POSIX$ standards: \begin{theorem} - \begin{tabular}{l} - $\lexer \; r \; s = \Some(v) \Longleftrightarrow (r, \; s) \rightarrow v$\\ - $\lexer \;r \; s = \None \Longleftrightarrow \neg(\exists v. (r, s) \rightarrow v)$ + The $\lexer$ based on derivatives and injections is both sound and complete: + \begin{tabular}{lcl} + $\lexer \; r \; s = \Some(v)$ & $ \Longleftrightarrow$ & $ (r, \; s) \rightarrow v$\\ + $\lexer \;r \; s = \None $ & $\Longleftrightarrow$ & $ \neg(\exists v. (r, s) \rightarrow v)$ \end{tabular} \end{theorem} @@ -776,7 +871,7 @@ of a match. This means Sulzmann and Lu's injection-based algorithm exponential by nature. Somehow one has to make sure which - lexical values are $\POSIX$ and need to be kept in a lexing algorithm. + lexical values are $\POSIX$ and must be kept in a lexing algorithm. For example, the above $r= (a^*\cdot a^*)^*$ and @@ -788,7 +883,7 @@ Can we not create those intermediate values $v_1,\ldots v_n$, and get the lexing information that should be already there while doing derivatives in one pass, without a second injection phase? -In the meantime, can we make sure that simplifications +In the meantime, can we ensure that simplifications are easily handled without breaking the correctness of the algorithm? Sulzmann and Lu solved this problem by diff -r 57e33978e55d -r 3cbcd7cda0a9 ChengsongTanPhdThesis/Chapters/Introduction.tex --- a/ChengsongTanPhdThesis/Chapters/Introduction.tex Tue Jul 05 00:42:06 2022 +0100 +++ b/ChengsongTanPhdThesis/Chapters/Introduction.tex Wed Jul 13 08:27:28 2022 +0100 @@ -21,6 +21,7 @@ \newcommand{\ASEQ}[3]{\textit{ASEQ}_{#1} \, #2 \, #3} \newcommand{\bderssimp}[2]{#1 \backslash_{bsimps} #2} \newcommand{\rderssimp}[2]{#1 \backslash_{rsimp} #2} +\def\derssimp{\textit{ders}\_\textit{simp}} \def\rders{\textit{rders}} \newcommand{\bders}[2]{#1 \backslash #2} \newcommand{\bsimp}[1]{\textit{bsimp}(#1)} @@ -39,6 +40,7 @@ \newcommand\myequiv{\mathrel{\stackrel{\makebox[0pt]{\mbox{\normalfont\tiny equiv}}}{=}}} +\def\case{\textit{case}} \def\sequal{\stackrel{\mbox{\scriptsize rsimp}}{=}} \def\rsimpalts{\textit{rsimp}_{ALTS}} \def\good{\textit{good}} @@ -82,7 +84,7 @@ \def\blexer{\textit{blexer}} \def\flex{\textit{flex}} \def\inj{\mathit{inj}} -\def\Empty{\mathit{Empty}} +\def\Empty{\textit{Empty}} \def\Left{\mathit{Left}} \def\Right{\mathit{Right}} \def\Stars{\mathit{Stars}} diff -r 57e33978e55d -r 3cbcd7cda0a9 ChengsongTanPhdThesis/main.tex --- a/ChengsongTanPhdThesis/main.tex Tue Jul 05 00:42:06 2022 +0100 +++ b/ChengsongTanPhdThesis/main.tex Wed Jul 13 08:27:28 2022 +0100 @@ -46,7 +46,8 @@ \usepackage{mathpazo} % Use the Palatino font by default \usepackage{hyperref} \usepackage{lipsum} -\usepackage[backend=bibtex,style=authoryear,natbib=true]{biblatex} % Use the bibtex backend with the authoryear citation style (which resembles APA) +\usepackage[backend=bibtex]{biblatex} % Use the bibtex backend with the authoryear citation style (which resembles APA) +%style=authoryear, natbib=true \usepackage{stmaryrd} \usepackage{caption} @@ -71,7 +72,11 @@ \usepackage{tikz} \usetikzlibrary{automata, positioning, calc} \usetikzlibrary{fit, - shapes.geometric} + shapes.geometric, + patterns, + backgrounds, + graphs} +\usetikzlibrary{babel} \usepackage{mathpartir} \usepackage{stackrel} diff -r 57e33978e55d -r 3cbcd7cda0a9 thys2/blexer2.sc --- a/thys2/blexer2.sc Tue Jul 05 00:42:06 2022 +0100 +++ b/thys2/blexer2.sc Wed Jul 13 08:27:28 2022 +0100 @@ -781,7 +781,7 @@ // fun pAKC_aux :: "arexp list ⇒ arexp ⇒ rexp ⇒ arexp" // where -// "pAKC_aux rsa r ctx = (if (regConcat (SEQ (erase r) ( ctx) )) ⊆ (regConcat (erase (AALTs [] rsa))) then AZERO else +// "pAKC_aux rsa r ctx = (if (seqFold (SEQ (erase r) ( ctx) )) ⊆ (seqFold (erase (AALTs [] rsa))) then AZERO else // case r of (ASEQ bs r1 r2) ⇒ // bsimp_ASEQ bs (pAKC_aux rsa r1 (SEQ (erase r2) ( ctx) )) r2 | // (AALTs bs rs) ⇒ @@ -796,13 +796,13 @@ //the "fake" Language interpretation: just concatenates! -def regConcat(acc: Rexp, rs: List[Rexp]) : Rexp = rs match { +def seqFold(acc: Rexp, rs: List[Rexp]) : Rexp = rs match { case Nil => acc case r :: rs1 => // if(acc == ONE) - // regConcat(r, rs1) + // seqFold(r, rs1) // else - regConcat(SEQ(acc, r), rs1) + seqFold(SEQ(acc, r), rs1) } def rprint(r: Rexp) : Unit = println(shortRexpOutput(r)) @@ -819,9 +819,9 @@ // println("ctx---------") // rsprint(ctx) // println("ctx---------end") - // rsprint(breakIntoTerms(regConcat(erase(r), ctx)).map(oneSimp)) + // rsprint(breakIntoTerms(seqFold(erase(r), ctx)).map(oneSimp)) - if (breakIntoTerms(regConcat(erase(r), ctx)).map(oneSimp).forall(acc.contains)) {//acc.flatMap(breakIntoTerms + if (breakIntoTerms(seqFold(erase(r), ctx)).map(oneSimp).forall(acc.contains)) {//acc.flatMap(breakIntoTerms AZERO } else{ @@ -928,12 +928,12 @@ } def attachCtx(r: ARexp, ctx: List[Rexp]) : Set[Rexp] = { - turnIntoTerms((regConcat(erase(r), ctx))) + turnIntoTerms((seqFold(erase(r), ctx))) .toSet } def attachCtxcc(r: Rexp, ctx: List[Rexp]) : Set[Rexp] = - turnIntoTerms(regConcat(r, ctx)).toSet + turnIntoTerms(seqFold(r, ctx)).toSet def ABIncludedByC[A, B, C](a: A, b: B, c: C, f: (A, B) => C, subseteqPred: (C, C) => Boolean) : Boolean = { diff -r 57e33978e55d -r 3cbcd7cda0a9 thys3/BlexerSimp.thy --- a/thys3/BlexerSimp.thy Tue Jul 05 00:42:06 2022 +0100 +++ b/thys3/BlexerSimp.thy Wed Jul 13 08:27:28 2022 +0100 @@ -77,23 +77,27 @@ | "turnIntoTerms (SEQ r1 r2) = concat (map (\r11. furtherSEQ r11 r2) (turnIntoTerms r1))" | "turnIntoTerms r = [r]" -fun regConcat :: "rexp \ rexp list \ rexp" where - "regConcat acc [] = acc" -| "regConcat ONE (r # rs1) = regConcat r rs1" -| "regConcat acc (r # rs1) = regConcat (SEQ acc r) rs1" +abbreviation "tint \ turnIntoTerms" + +fun seqFold :: "rexp \ rexp list \ rexp" where + "seqFold acc [] = acc" +| "seqFold ONE (r # rs1) = seqFold r rs1" +| "seqFold acc (r # rs1) = seqFold (SEQ acc r) rs1" + fun attachCtx :: "arexp \ rexp list \ rexp set" where - "attachCtx r ctx = set (turnIntoTerms (regConcat (erase r) ctx))" - + "attachCtx r ctx = set (turnIntoTerms (seqFold (erase r) ctx))" fun rs1_subseteq_rs2 :: "rexp set \ rexp set \ bool" where "rs1_subseteq_rs2 rs1 rs2 = (rs1 \ rs2)" + fun notZero :: "arexp \ bool" where "notZero AZERO = True" | "notZero _ = False" + fun tset :: "arexp list \ rexp set" where "tset rs = set (concat (map turnIntoTerms (map erase rs)))" @@ -101,10 +105,14 @@ fun prune6 :: "rexp set \ arexp \ rexp list \ arexp" where "prune6 acc a ctx = (if (ABIncludedByC a ctx acc attachCtx rs1_subseteq_rs2) then AZERO else (case a of (ASEQ bs r1 r2) \ bsimp_ASEQ bs (prune6 acc r1 (erase r2 # ctx)) r2 - | AALTs bs rs0 \ bsimp_AALTs bs (filter notZero (map (\r.(prune6 acc r ctx)) rs0))) )" + | AALTs bs rs0 \ bsimp_AALTs bs (filter notZero (map (\r.(prune6 acc r ctx)) rs0)) + | r \ r + ) + ) + " abbreviation - "p acc r \ prune6 (set (concat (map turnIntoTerms (map erase acc)) ) ) r Nil" + "p6 acc r \ prune6 (tset acc) r Nil" fun dB6 :: "arexp list \ rexp set \ arexp list" where @@ -154,8 +162,9 @@ | ss4: "(AZERO # rs) s\ rs" | ss5: "(AALTs bs1 rs1 # rsb) s\ ((map (fuse bs1) rs1) @ rsb)" | ss6: "L (erase a2) \ L (erase a1) \ (rsa@[a1]@rsb@[a2]@rsc) s\ (rsa@[a1]@rsb@rsc)" -| ss7: " (as @ [a] @ as1) s\ (as @ [prune6 (set (concat (map (\r. turnIntoTerms (erase r)) as))) a Nil] @ as1)" +| ss7: " (as @ [a] @ as1) s\ (as @ [p6 as a] @ as1)" +thm tset.simps inductive rrewrites:: "arexp \ arexp \ bool" ("_ \* _" [100, 100] 100) @@ -291,10 +300,10 @@ apply(subgoal_tac "(cc @ a # cc') s\* (cc @ a # dB6 cc' (tset (cc @ [a])))") prefer 2 apply (metis append.assoc append.left_neutral append_Cons) - apply(subgoal_tac "(cc @ a # dB6 cc' (tset (cc @ [a]))) s\* (cc @ (p cc a) # dB6 cc' (tset (cc @ [a])))") + apply(subgoal_tac "(cc @ a # dB6 cc' (tset (cc @ [a]))) s\* (cc @ (p6 cc a) # dB6 cc' (tset (cc @ [a])))") sorry - +(* lemma ss6_stronger_aux: shows "(rs1 @ rs2) s\* (rs1 @ dB6 rs2 (set (map erase rs1)))" apply(induct rs2 arbitrary: rs1) @@ -313,22 +322,83 @@ prefer 2 apply(drule_tac x="rs1 @ [a]" in meta_spec) apply(simp) - apply(drule_tac x="rs1" in meta_spec) - apply(subgoal_tac "(rs1 @ a # rs2) s\* (rs1 @ rs2)") - using srewrites_trans apply blast - apply(subgoal_tac "\rs1a rs1b. rs1 = rs1a @ [x] @ rs1b") - prefer 2 - apply (simp add: split_list) - apply(erule exE)+ - apply(simp) - using eq1_L rs_in_rstar ss sorry +*) lemma ss6_stronger: shows "rs1 s\* dB6 rs1 {}" + using ss6 + by (metis append_Nil concat.simps(1) list.set(1) list.simps(8) ss6_realistic tset.simps) + + +lemma tint_fuse: + shows "tint (erase a) = tint (erase (fuse bs a))" + by (simp add: erase_fuse) + +lemma tint_fuse2: + shows " set (tint (erase a)) = + set (tint (erase (fuse bs a)))" + using tint_fuse by auto + +lemma fused_bits_at_head: + shows "fuse bs a = ASEQ bs1 a1 a2 \ \bs2. bs1 = bs @ bs2" + +(* by (smt (verit) arexp.distinct(13) arexp.distinct(19) arexp.distinct(25) arexp.distinct(27) arexp.distinct(5) arexp.inject(3) fuse.elims) +harmless sorry +*) + sorry +thm seqFold.simps + +lemma seqFold_concats: + shows "r \ ONE \ seqFold r [r1] = SEQ r r1" + apply(case_tac r) + apply simp+ +done + + +lemma "set (tint (seqFold (erase x42) [erase x43])) = + set (tint (SEQ (erase x42) (erase x43)))" + apply(case_tac "erase x42 = ONE") + apply simp + apply simp + +lemma prune6_preserves_fuse: + shows "fuse bs (p6 as a) = p6 as (fuse bs a)" + using tint_fuse2 + apply simp + apply(case_tac a) + apply simp + apply simp + apply simp + + using fused_bits_at_head + + apply simp + apply(case_tac "erase x42 = ONE") + apply simp + + sorry + +corollary prune6_preserves_fuse_srewrite: + shows "(as @ map (fuse bs) [a] @ as2) s\* (as @ map (fuse bs) [p6 as a] @ as2)" + apply(subgoal_tac "map (fuse bs) [a] = [fuse bs a]") + apply(subgoal_tac "(as @ [fuse bs a] @ as2) s\* (as @ [ (p6 as (fuse bs a))] @ as2)") + using prune6_preserves_fuse apply auto[1] + using rs_in_rstar ss7 apply presburger + by simp + +lemma prune6_invariant_fuse: + shows "p6 as a = p6 (map (fuse bs) as) a" + apply simp + using erase_fuse by presburger + + +lemma p6pfs_cor: + shows "(map (fuse bs) as @ map (fuse bs) [a] @ map (fuse bs) as1) s\* (map (fuse bs) as @ map (fuse bs) [p6 as a] @ map (fuse bs) as1)" + by (metis prune6_invariant_fuse prune6_preserves_fuse_srewrite) lemma rewrite_preserves_fuse: shows "r2 \ r3 \ fuse bs r2 \ fuse bs r3" @@ -358,7 +428,13 @@ then show ?case apply(simp only: map_append) by (smt (verit, best) erase_fuse list.simps(8) list.simps(9) rrewrite_srewrite.ss6 srewrites.simps) +next + case (ss7 as a as1) + then show ?case + apply(simp only: map_append) + using p6pfs_cor by presburger qed (auto intro: rrewrite_srewrite.intros) + lemma rewrites_fuse: @@ -424,11 +500,14 @@ lemma bnullable0: shows "r1 \ r2 \ bnullable r1 = bnullable r2" and "rs1 s\ rs2 \ bnullables rs1 = bnullables rs2" - apply(induct rule: rrewrite_srewrite.inducts) - apply(auto simp add: bnullable_fuse) - apply (meson UnCI bnullable_fuse imageI) - using bnullable_correctness nullable_correctness by blast - + apply(induct rule: rrewrite_srewrite.inducts) + apply simp + apply simp + apply (simp add: bnullable_fuse) + using bnullable.simps(5) apply presburger + apply simp + apply simp + sorry @@ -438,7 +517,7 @@ using assms apply(induction r1 r2 rule: rrewrites.induct) apply simp - using bnullable0(1) by auto + using bnullable0(1) by presburger lemma rewrite_bmkeps_aux: shows "r1 \ r2 \ (bnullable r1 \ bnullable r2 \ bmkeps r1 = bmkeps r2)"