more
authorChengsong
Wed, 13 Jul 2022 08:27:28 +0100
changeset 564 3cbcd7cda0a9
parent 562 57e33978e55d
child 565 0497408a3598
more
ChengsongTanPhdThesis/Chapters/Bitcoded1.tex
ChengsongTanPhdThesis/Chapters/Finite.tex
ChengsongTanPhdThesis/Chapters/Inj.tex
ChengsongTanPhdThesis/Chapters/Introduction.tex
ChengsongTanPhdThesis/main.tex
thys2/blexer2.sc
thys3/BlexerSimp.thy
--- a/ChengsongTanPhdThesis/Chapters/Bitcoded1.tex	Tue Jul 05 00:42:06 2022 +0100
+++ b/ChengsongTanPhdThesis/Chapters/Bitcoded1.tex	Wed Jul 13 08:27:28 2022 +0100
@@ -8,96 +8,105 @@
 %simplifications and therefore introduce our version of the bitcoded algorithm and 
 %its correctness proof in 
 %Chapter 3\ref{Chapter3}. 
-In this chapter, we are going to introduce the bit-coded algorithm
-introduced by Sulzmann and Lu to address the problem of 
-under-simplified regular expressions. 
+In this chapter, we are going to describe the bit-coded algorithm
+introduced by Sulzmann and Lu \parencite{Sulzmann2014} to address the growth problem of 
+regular expressions. 
 \section{Bit-coded Algorithm}
 The lexer algorithm in Chapter \ref{Inj}, as shown in \ref{InjFigure},
 stores information of previous lexing steps
 on a stack, in the form of regular expressions
 and characters: $r_0$, $c_0$, $r_1$, $c_1$, etc.
-\begin{envForCaption}
 \begin{ceqn}
 \begin{equation}%\label{graph:injLexer}
-\begin{tikzcd}
-r_0 \arrow[r, "\backslash c_0"]  \arrow[d] & r_1 \arrow[r, "\backslash c_1"] \arrow[d] & r_2 \arrow[r, dashed] \arrow[d] & r_n \arrow[d, "mkeps" description] \\
-v_0           & v_1 \arrow[l,"inj_{r_0} c_0"]                & v_2 \arrow[l, "inj_{r_1} c_1"]              & v_n \arrow[l, dashed]         
+	\begin{tikzcd}[ampersand replacement=\&, execute at end picture={
+			\begin{scope}[on background layer]
+				\node[rectangle, fill={red!30},
+					pattern=north east lines, pattern color=red,
+					fit={(-3,-1) (-3, 1) (1, -1) 
+						(1, 1)}
+				     ] 
+				     {}; ,
+				\node[rectangle, fill={blue!20},
+					pattern=north east lines, pattern color=blue,
+					fit= {(1, -1) (1, 1) (3, -1) (3, 1)}
+					]
+					{};
+				\end{scope}}
+					]
+r_0 \arrow[r, "\backslash c_0"]  \arrow[d] \& r_1 \arrow[r, "\backslash c_1"] \arrow[d] \& r_2 \arrow[r, dashed] \arrow[d] \& r_n \arrow[d, "mkeps" description] \\
+v_0           \& v_1 \arrow[l,"inj_{r_0} c_0"]                \& v_2 \arrow[l, "inj_{r_1} c_1"]              \& v_n \arrow[l, dashed]         \\
 \end{tikzcd}
 \end{equation}
 \end{ceqn}
-\caption{Injection-based Lexing from Chapter\ref{Inj}}\label{InjFigure}
-\end{envForCaption}
 \noindent
+The red part represents what we already know during the first
+derivative phase,
+and the blue part represents the unknown part of input.
+The red area expands as we move towards $r_n$, 
+indicating an increasing stack size during lexing.
+Despite having some partial lexing information during
+the forward derivative phase, we choose to store them
+temporarily, only to convert the information to lexical
+values at a later stage. In essence we are repeating work we
+have already done.
 This is both inefficient and prone to stack overflow.
 A natural question arises as to whether we can store lexing
 information on the fly, while still using regular expression 
-derivatives?
+derivatives.
 
-In a lexing algorithm's run, split by the current input position,
-we have a sub-string that has been consumed,
-and the sub-string that has yet to come.
-We already know what was before, and this should be reflected in the value 
-and the regular expression at that step as well. But this is not the 
-case for injection-based regular expression derivatives.
-Take the regex $(aa)^* \cdot bc$ matching the string $aabc$
-as an example, if we have just read the two former characters $aa$:
-
+If we remove the details of the individual 
+lexing steps, and use red and blue areas as before
+to indicate consumed (seen) input and constructed
+partial value (before recovering the rest of the stack),
+one could see that the seen part's lexical information
+is stored in the form of a regular expression.
+Consider the regular expression $(aa)^* \cdot bc$ matching the string $aabc$
+and assume we have just read the two characters $aa$:
 \begin{center}
-\begin{envForCaption}
-\begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}]
-    \node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
-        {Consumed: $aa$
-         \nodepart{two} Not Yet Reached: $bc$ };
-%\caption{term 1 \ref{term:1}'s matching configuration}
-\end{tikzpicture}
-\caption{Partially matched String} 
-\end{envForCaption}
-\end{center}
-%\caption{Input String}\label{StringPartial}
-%\end{figure}
-
-\noindent
-We have the value that has already been partially calculated,
-and the part that has yet to come:
-\begin{center}
-\begin{envForCaption}
 \begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}]
     \node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
-        {$\Seq(\Stars[\Char(a), \Char(a)], ???)$
+	    {Partial lexing info: $\ONE \cdot a \cdot (aa)^* \cdot bc$ etc.
          \nodepart{two} $\Seq(\ldots, \Seq(\Char(b), \Char(c)))$};
-%\caption{term 1 \ref{term:1}'s matching configuration}
 \end{tikzpicture}
-\caption{Partially constructed Value} 
-\end{envForCaption}
 \end{center}
-
-In the regex derivative part , (after simplification)
-all we have is just what is about to come:
+\noindent
+In the injection-based lexing algorithm, we ``neglect" the red area
+by putting all the characters we have consumed and
+intermediate regular expressions on the stack when 
+we go from left to right in the derivative phase.
+The red area grows till the string is exhausted.
+During the injection phase, the value in the blue area
+is built up incrementally, while the red area shrinks.
+Before we have recovered all characters and intermediate
+derivative regular expressions from the stack,
+what values these characters and regular expressions correspond 
+to are unknown: 
 \begin{center}
-\begin{envForCaption}
 \begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}]
     \node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={white!30,blue!20},]
-        {$???$
-         \nodepart{two} To Come: $b c$};
+	    {$(\ONE \cdot \ONE) \cdot (aa)^* \cdot bc $ correspond to:$???$
+         \nodepart{two}  $b c$ corresponds to  $\Seq(\ldots, \Seq(\Char(b), \Char(c)))$};
 %\caption{term 1 \ref{term:1}'s matching configuration}
 \end{tikzpicture}
-\caption{Derivative} 
-\end{envForCaption}
 \end{center}
 \noindent
-The previous part is missing.
-How about keeping the partially constructed value 
-attached to the front of the regular expression?
+However, they should be calculable,
+as characters and regular expression shapes
+after taking derivative w.r.t those characters
+have already been known, therefore in our example,
+we know that the value starts with two $a$s,
+and makes up to an iteration in a Kleene star:
+(We have put the injection-based lexing's partial 
+result in the right part of the split rectangle
+to contrast it with the partial valued produced
+in a forward manner)
 \begin{center}
-\begin{envForCaption}
 \begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}]
     \node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
-        {$\Seq(\Stars[\Char(a), \Char(a)], \ldots)$
-         \nodepart{two} To Come: $b c$};
+	    {$\stackrel{Bitcoded}{\longrightarrow} \Seq(\Stars[\Char(a), \Char(a)], ???)$
+	\nodepart{two} $\Seq(\ldots, \Seq(\Char(b), \Char(c)))$  $\stackrel{Inj}{\longleftarrow}$};
 %\caption{term 1 \ref{term:1}'s matching configuration}
 \end{tikzpicture}
-\caption{Derivative} 
-\end{envForCaption}
 \end{center}
 \noindent
 If we do this kind of "attachment"
@@ -105,19 +114,23 @@
 constructed value when taking off a 
 character:
 \begin{center}
-\begin{envForCaption}
+\begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}]
+	\node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},] (spPoint)
+        {$\Seq(\Stars[\Char(a), \Char(a)], \ldots)$
+         \nodepart{two} Remaining: $b c$};
+\end{tikzpicture}\\
+$\downarrow$\\
 \begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}]
     \node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
         {$\Seq(\Stars[\Char(a), \Char(a)], \Seq(\Char(b), \ldots))$
-         \nodepart{two} To Come: $c$};
+         \nodepart{two} Remaining: $c$};
 \end{tikzpicture}\\
+$\downarrow$\\
 \begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}]
     \node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
         {$\Seq(\Stars[\Char(a), \Char(a)], \Seq(\Char(b), \Char(c)))$
          \nodepart{two} EOF};
 \end{tikzpicture}
-\caption{After $\backslash b$ and $\backslash c$} 
-\end{envForCaption}
 \end{center}
 \noindent
 In the end we could recover the value without a backward phase.
@@ -125,14 +138,11 @@
 we instead use bit-codes to encode them.
 
 Bits and bitcodes (lists of bits) are defined as:
-\begin{envForCaption}
 \begin{center}
 		$b ::=   S \mid  Z \qquad
 bs ::= [] \mid b::bs    
 $
 \end{center}
-\caption{Bit-codes datatype}
-\end{envForCaption}
 
 \noindent
 Using $S$ and $Z$ rather than $1$ and $0$ is to avoid
@@ -141,7 +151,6 @@
 bit-lists) can be used to encode values (or potentially incomplete values) in a
 compact form. This can be straightforwardly seen in the following
 coding function from values to bitcodes: 
-\begin{envForCaption}
 \begin{center}
 \begin{tabular}{lcl}
   $\textit{code}(\Empty)$ & $\dn$ & $[]$\\
@@ -154,9 +163,6 @@
                                                  code(\Stars\,vs)$
 \end{tabular}    
 \end{center} 
-\caption{Coding Function for Values}
-\end{envForCaption}
-
 \noindent
 Here $\textit{code}$ encodes a value into a bit-code by converting
 $\Left$ into $Z$, $\Right$ into $S$, and marks the start of any non-empty
@@ -168,9 +174,10 @@
 whether the $S$s and $Z$s are for $\Left/\Right$ or $\Stars$. The
 reason for choosing this compact way of storing information is that the
 relatively small size of bits can be easily manipulated and ``moved
-around'' in a regular expression. 
+around" in a regular expression. 
 
-
+Because of the lossiness, the process of decoding a bitlist requires additionally 
+a regular expression. The function $\decode$ is defined as:
 We define the reverse operation of $\code$, which is $\decode$.
 As expected, $\decode$ not only requires the bit-codes,
 but also a regular expression to guide the decoding and 
@@ -178,7 +185,6 @@
 
 
 %\begin{definition}[Bitdecoding of Values]\mbox{}
-\begin{envForCaption}
 \begin{center}
 \begin{tabular}{@{}l@{\hspace{1mm}}c@{\hspace{1mm}}l@{}}
   $\textit{decode}'\,bs\,(\ONE)$ & $\dn$ & $(\Empty, bs)$\\
@@ -205,10 +211,10 @@
        \textit{else}\;\textit{None}$                       
 \end{tabular}    
 \end{center} 
-\end{envForCaption}
 %\end{definition}
 
 \noindent
+The function $\decode'$ returns a pair consisting of a partially decoded value and some leftover:
 $\decode'$ does most of the job while $\decode$ throws
 away leftover bit-codes and returns the value only.
 $\decode$ is terminating as $\decode'$ is terminating.
@@ -251,7 +257,7 @@
 
 
 The first thing we define related to bit-coded regular expressions
-is how we move bits, for instance pasting it at the front of an annotated regex.
+is how we move bits, for instance pasting it at the front of an annotated regular expression.
 The operation $\fuse$ is just to attach bit-codes 
 to the front of an annotated regular expression:
 \begin{center}
@@ -364,7 +370,7 @@
 This is done by adding bitcodes to the 
 derivatives, for example when one more star iteratoin is taken (we
 call the operation of derivatives on annotated regular expressions $\bder$
-because it is derivatives on regexes with \emph{b}itcodes),
+because it is derivatives on regular expressiones with \emph{b}itcodes),
 we need to unfold it into a sequence,
 and attach an additional bit $Z$ to the front of $r \backslash c$
 to indicate one more star iteration. 
@@ -652,7 +658,7 @@
 $\blexer \; r \; s = \decode  \; (\retrieve \; (\internalise \; r) \; (\mkeps \; (r \backslash s) )) \; r$
 \end{lemma}
 \noindent
-$\retrieve$ allows free navigation on the diagram \ref{InjFigure} for annotated regexes of $\blexer$.
+$\retrieve$ allows free navigation on the diagram \ref{InjFigure} for annotated regular expressiones of $\blexer$.
 For plain regular expressions something similar is required as well.
 
 \subsection{$\flex$}
--- a/ChengsongTanPhdThesis/Chapters/Finite.tex	Tue Jul 05 00:42:06 2022 +0100
+++ b/ChengsongTanPhdThesis/Chapters/Finite.tex	Wed Jul 13 08:27:28 2022 +0100
@@ -209,7 +209,7 @@
 With $\rrexp$ the size caclulation of annotated regular expressions'
 simplification and derivatives can be done by the size of their unlifted 
 counterpart with the unlifted version of simplification and derivatives applied.
-\begin{lemma}
+\begin{lemma}\label{sizeRelations}
 	The following equalities hold:
 	\begin{itemize}
 		\item
@@ -1244,8 +1244,35 @@
 \end{corollary}
 \noindent
 \subsection{Closed Forms for Star Regular Expressions}
-We use a similar technique as $r_1 \cdot r_2$ case,
-generating 
+We have shown how to control the size of the sequence regular expression $r_1\cdot r_2$ using
+the "closed form" of $(r_1 \cdot r_2) \backslash s$ and then 
+the property of the $\distinct$ function.
+Now we try to get a bound on $r^* \backslash s$ as well.
+Again, we first look at how a star's derivatives evolve, if they grow maximally: 
+\begin{center}
+
+$r^* \quad \longrightarrow_{\backslash c}  \quad   (r\backslash c)  \cdot  r^* \quad \longrightarrow_{\backslash c'}  \quad
+r \backslash cc'  \cdot r^* + r \backslash c' \cdot r^*  \quad \longrightarrow_{\backslash c''} \quad 
+(r_1 \backslash cc'c'' \cdot r^* + r \backslash c'') + (r \backslash c'c'' \cdot r^* + r \backslash c'' \cdot r^*)   \quad \longrightarrow_{\backslash c'''}
+\quad \ldots$
+
+\end{center}
+When we have a string $s = c :: c' :: c'' \ldots$  such that $r \backslash c$, $r \backslash cc'$, $r \backslash c'$, 
+$r \backslash cc'c''$, $r \backslash c'c''$, $r\backslash c''$ etc. are all nullable,
+the number of terms in $r^* \backslash s$ will grow exponentially, causing the size
+of the derivatives $r^* \backslash s$ to grow exponentially, even if we do not 
+count the possible size explosions of $r \backslash c$ themselves.
+
+Thanks to $\rflts$ and $\rDistinct$, we are able to open up regexes like
+$(r_1 \backslash cc'c'' \cdot r^* + r \backslash c'') + 
+(r \backslash c'c'' \cdot r^* + r \backslash c'' \cdot r^*) $ 
+into $\RALTS{[r_1 \backslash cc'c'' \cdot r^*, r \backslash c'', 
+r \backslash c'c'' \cdot r^*, r \backslash c'' \cdot r^*]}$
+and then de-duplicate terms of the form $r\backslash s' \cdot r^*$ ($s'$ being a substring of $s$).
+This allows us to use a similar technique as $r_1 \cdot r_2$ case,
+where the crux is to get an equivalent form of 
+$\rderssimp{r^*}{s}$ with shape $\rsimp{\sum rs}$.
+This requires generating 
 all possible sub-strings $s'$ of $s$
 such that $r\backslash s' \cdot r^*$ will appear 
 as a term in $(r^*) \backslash s$.
@@ -1304,6 +1331,68 @@
 \end{center}
 \noindent
 %MAYBE TODO: introduce createdByStar
+Again these definitions are tailor-made for dealing with alternatives that have
+originated from a star's derivatives, so we do not attempt to open up all possible 
+regexes of the form $\RALTS{rs}$, where $\textit{rs}$ might not contain precisely 2
+elements.
+We give a predicate for such "star-created" regular expressions:
+\begin{center}
+\begin{tabular}{lcr}
+         &    &       $\createdByStar{(\RSEQ{ra}{\RSTAR{rb}}) }$\\
+ $\createdByStar{r_1} \land \createdByStar{r_2} $ & $ \Longrightarrow$ & $\createdByStar{(r_1 + r_2)}$
+ \end{tabular}
+ \end{center}
+ 
+ These definitions allows us the flexibility to talk about 
+ regular expressions in their most convenient format,
+ for example, flattened out $\RALTS{[r_1, r_2, \ldots, r_n]} $
+ instead of binary-nested: $((r_1 + r_2) + (r_3 + r_4)) + \ldots$.
+ These definitions help express that certain classes of syntatically 
+ distinct regular expressions are actually the same under simplification.
+ This is not entirely true for annotated regular expressions: 
+ %TODO: bsimp bders \neq bderssimp
+ \begin{center}
+ $(1+ (c\cdot \ASEQ{bs}{c^*}{c} ))$
+ \end{center}
+ For bit-codes, the order in which simplification is applied
+ might cause a difference in the location they are placed.
+ If we want something like
+ \begin{center}
+ $\bderssimp{r}{s} \myequiv \bsimp{\bders{r}{s}}$
+ \end{center}
+ Some "canonicalization" procedure is required,
+ which either pushes all the common bitcodes to nodes
+  as senior as possible:
+  \begin{center}
+  $_{bs}(_{bs_1 @ bs'}r_1 + _{bs_1 @ bs''}r_2) \rightarrow _{bs @ bs_1}(_{bs'}r_1 + _{bs''}r_2) $
+  \end{center}
+ or does the reverse. However bitcodes are not of interest if we are talking about
+ the $\llbracket r \rrbracket$ size of a regex.
+ Therefore for the ease and simplicity of producing a
+ proof for a size bound, we are happy to restrict ourselves to 
+ unannotated regular expressions, and obtain such equalities as
+ \begin{lemma}
+ $\rsimp{r_1 + r_2} = \rsimp{\RALTS{\hflataux{r_1} @ \hflataux{r_2}}}$
+ \end{lemma}
+ 
+ \begin{proof}
+ By using the rewriting relation $\rightsquigarrow$
+ \end{proof}
+ %TODO: rsimp sflat
+And from this we obtain a proof that a star's derivative will be the same
+as if it had all its nested alternatives created during deriving being flattened out:
+ For example,
+ \begin{lemma}
+ $\createdByStar{r} \implies \rsimp{r} = \rsimp{\RALTS{\hflataux{r}}}$
+ \end{lemma}
+ \begin{proof}
+ By structural induction on $r$, where the induction rules are these of $\createdByStar{_}$.
+ \end{proof}
+% The simplification of a flattened out regular expression, provided it comes
+%from the derivative of a star, is the same as the one nested.
+ 
+
+
 We first introduce an inductive property
 for $\starupdate$ and $\hflataux{\_}$, 
 it says if we do derivatives of $r^*$
@@ -1516,7 +1605,7 @@
 We reason similarly for  $\STAR$.
 The inductive hypothesis is
 $\exists N. \forall s. \; \llbracket \rderssimp{r}{s} \rrbracket \leq N$.
-Let $\n_r = \llbracket r^* \rrbracket_r$.
+Let $n_r = \llbracket r^* \rrbracket_r$.
 When $s = c :: cs$ is not empty,
 \begin{center}
 \begin{tabular}{lcll}
@@ -1572,6 +1661,13 @@
 (4), (8), and (12) are all the inductive cases proven.
 \end{proof}
 
+
+\begin{corollary}
+For any regex $a$, $\exists N_r. \forall s. \; \rsize{\bderssimp{a}{s}} \leq N_r$
+\end{corollary}
+\begin{proof}
+	By \ref{sizeRelations}.
+\end{proof}
 \noindent
 
 %-----------------------------------
@@ -1891,105 +1987,6 @@
 %----------------------------------------------------------------------------------------
 %	SECTION 4
 %----------------------------------------------------------------------------------------
-\section{A Bound for the Star Regular Expression}
-We have shown how to control the size of the sequence regular expression $r_1\cdot r_2$ using
-the "closed form" of $(r_1 \cdot r_2) \backslash s$ and then 
-the property of the $\distinct$ function.
-Now we try to get a bound on $r^* \backslash s$ as well.
-Again, we first look at how a star's derivatives evolve, if they grow maximally: 
-\begin{center}
-
-$r^* \quad \longrightarrow_{\backslash c}  \quad   (r\backslash c)  \cdot  r^* \quad \longrightarrow_{\backslash c'}  \quad
-r \backslash cc'  \cdot r^* + r \backslash c' \cdot r^*  \quad \longrightarrow_{\backslash c''} \quad 
-(r_1 \backslash cc'c'' \cdot r^* + r \backslash c'') + (r \backslash c'c'' \cdot r^* + r \backslash c'' \cdot r^*)   \quad \longrightarrow_{\backslash c'''}
-\quad \ldots$
-
-\end{center}
-When we have a string $s = c :: c' :: c'' \ldots$  such that $r \backslash c$, $r \backslash cc'$, $r \backslash c'$, 
-$r \backslash cc'c''$, $r \backslash c'c''$, $r\backslash c''$ etc. are all nullable,
-the number of terms in $r^* \backslash s$ will grow exponentially, causing the size
-of the derivatives $r^* \backslash s$ to grow exponentially, even if we do not 
-count the possible size explosions of $r \backslash c$ themselves.
-
-Thanks to $\flts$ and $\distinctWith$, we are able to open up regexes like
-$(r_1 \backslash cc'c'' \cdot r^* + r \backslash c'') + (r \backslash c'c'' \cdot r^* + r \backslash c'' \cdot r^*) $ 
-into $\RALTS{[r_1 \backslash cc'c'' \cdot r^*, r \backslash c'', r \backslash c'c'' \cdot r^*, r \backslash c'' \cdot r^*]}$
-and then de-duplicate terms of the form $r\backslash s' \cdot r^*$ ($s'$ being a substring of $s$).
-For this we define $\hflataux{\_}$ and $\hflat{\_}$, similar to $\sflataux{\_}$ and $\sflat{\_}$:
-%TODO: definitions of  and \hflataux \hflat
- \begin{center}  
- \begin{tabular}{ccc}
- $\hflataux{r_1 + r_2}$ & $=$ & $\hflataux{r_1} @ \hflataux{r_2}$\\
-$\hflataux r$ & $=$ & $ [r]$
-\end{tabular}
-\end{center}
-
- \begin{center}  
- \begin{tabular}{ccc}
- $\hflat{r_1 + r_2}$ & $=$ & $\RALTS{\hflataux{r_1} @ \hflataux{r_2}}$\\
-$\hflat r$ & $=$ & $ r$
-\end{tabular}
-\end{center}s
-Again these definitions are tailor-made for dealing with alternatives that have
-originated from a star's derivatives, so we don't attempt to open up all possible 
-regexes of the form $\RALTS{rs}$, where $\textit{rs}$ might not contain precisely 2
-elements.
-We give a predicate for such "star-created" regular expressions:
-\begin{center}
-\begin{tabular}{lcr}
-         &    &       $\createdByStar{(\RSEQ{ra}{\RSTAR{rb}}) }$\\
- $\createdByStar{r_1} \land \createdByStar{r_2} $ & $ \Longrightarrow$ & $\createdByStar{(r_1 + r_2)}$
- \end{tabular}
- \end{center}
- 
- These definitions allows us the flexibility to talk about 
- regular expressions in their most convenient format,
- for example, flattened out $\RALTS{[r_1, r_2, \ldots, r_n]} $
- instead of binary-nested: $((r_1 + r_2) + (r_3 + r_4)) + \ldots$.
- These definitions help express that certain classes of syntatically 
- distinct regular expressions are actually the same under simplification.
- This is not entirely true for annotated regular expressions: 
- %TODO: bsimp bders \neq bderssimp
- \begin{center}
- $(1+ (c\cdot \ASEQ{bs}{c^*}{c} ))$
- \end{center}
- For bit-codes, the order in which simplification is applied
- might cause a difference in the location they are placed.
- If we want something like
- \begin{center}
- $\bderssimp{r}{s} \myequiv \bsimp{\bders{r}{s}}$
- \end{center}
- Some "canonicalization" procedure is required,
- which either pushes all the common bitcodes to nodes
-  as senior as possible:
-  \begin{center}
-  $_{bs}(_{bs_1 @ bs'}r_1 + _{bs_1 @ bs''}r_2) \rightarrow _{bs @ bs_1}(_{bs'}r_1 + _{bs''}r_2) $
-  \end{center}
- or does the reverse. However bitcodes are not of interest if we are talking about
- the $\llbracket r \rrbracket$ size of a regex.
- Therefore for the ease and simplicity of producing a
- proof for a size bound, we are happy to restrict ourselves to 
- unannotated regular expressions, and obtain such equalities as
- \begin{lemma}
- $\rsimp{r_1 + r_2} = \rsimp{\RALTS{\hflataux{r_1} @ \hflataux{r_2}}}$
- \end{lemma}
- 
- \begin{proof}
- By using the rewriting relation $\rightsquigarrow$
- \end{proof}
- %TODO: rsimp sflat
-And from this we obtain a proof that a star's derivative will be the same
-as if it had all its nested alternatives created during deriving being flattened out:
- For example,
- \begin{lemma}
- $\createdByStar{r} \implies \rsimp{r} = \rsimp{\RALTS{\hflataux{r}}}$
- \end{lemma}
- \begin{proof}
- By structural induction on $r$, where the induction rules are these of $\createdByStar{_}$.
- \end{proof}
-% The simplification of a flattened out regular expression, provided it comes
-%from the derivative of a star, is the same as the one nested.
- 
  
  
  
--- a/ChengsongTanPhdThesis/Chapters/Inj.tex	Tue Jul 05 00:42:06 2022 +0100
+++ b/ChengsongTanPhdThesis/Chapters/Inj.tex	Wed Jul 13 08:27:28 2022 +0100
@@ -4,22 +4,32 @@
 
 \label{Inj} % In chapter 2 \ref{Chapter2} we will introduce the concepts
 %and notations we 
-%use for describing the lexing algorithm by Sulzmann and Lu,
-%and then give the algorithm and its variant, and discuss
+% used for describing the lexing algorithm by Sulzmann and Lu,
+%and then give the algorithm and its variant and discuss
 %why more aggressive simplifications are needed. 
 
 In this chapter, we define the basic notions 
 for regular languages and regular expressions.
-This is essentially a description in "English"
-of your formalisation in Isabelle/HOL.
-We also give the definition of what $\POSIX$ lexing means.
+This is essentially a description in ``English"
+of our formalisation in Isabelle/HOL.
+We also give the definition of what $\POSIX$ lexing means, 
+followed by an algorithm by Sulzmanna and Lu\parencite{Sulzmann2014} 
+that produces the output conforming
+to the $\POSIX$ standard.
+It is also worth mentioning that
+we choose to use the ML-style notation
+for function applications, where
+the parameters of a function is not enclosed
+inside a pair of parentheses (e.g. $f \;x \;y$
+instead of $f(x,\;y)$). This is mainly
+to make the text visually more concise.
 
 \section{Basic Concepts}
-Usually formal language theory starts with an alphabet 
+Usually, formal language theory starts with an alphabet 
 denoting a set of characters.
 Here we just use the datatype of characters from Isabelle,
 which roughly corresponds to the ASCII characters.
-In what follows we shall leave the information about the alphabet
+In what follows, we shall leave the information about the alphabet
 implicit.
 Then using the usual bracket notation for lists,
 we can define strings made up of characters: 
@@ -30,7 +40,7 @@
 \end{center}
 Where $c$ is a variable ranging over characters.
 Strings can be concatenated to form longer strings in the same
-way as we concatenate two lists, which we write as @.
+way as we concatenate two lists, which we shall write as $s_1 @ s_2$.
 We omit the precise 
 recursive definition here.
 We overload this concatenation operator for two sets of strings:
@@ -48,8 +58,8 @@
 $A^{n+1}$ & $\dn$ & $A @ A^n$
 \end{tabular}
 \end{center}
-The union of all the natural number powers of a language   
-is usually defined as the Kleene star operator:
+The union of all powers of a language   
+can be used to define the Kleene star operator:
 \begin{center}
 \begin{tabular}{lcl}
  $A*$ & $\dn$ & $\bigcup_{i \geq 0} A^i$ \\
@@ -57,19 +67,19 @@
 \end{center}
 
 \noindent
-However, to obtain a convenient induction principle 
+However, to obtain a more convenient induction principle 
 in Isabelle/HOL, 
 we instead define the Kleene star
 as an inductive set: 
 
 \begin{center}
 \begin{mathpar}
-\inferrule{}{[] \in A*\\}
+	\inferrule{\mbox{}}{[] \in A*\\}
 
-\inferrule{\\s_1 \in A \land \; s_2 \in A*}{s_1 @ s_2 \in A*}
+\inferrule{s_1 \in A \;\; s_2 \in A*}{s_1 @ s_2 \in A*}
 \end{mathpar}
 \end{center}
-\ChristianComment{Yes, used the inferrule command in mathpar}
+\noindent
 We also define an operation of "chopping off" a character from
 a language, which we call $\Der$, meaning \emph{Derivative} (for a language):
 \begin{center}
@@ -89,10 +99,10 @@
 which is essentially the left quotient $A \backslash L$ of $A$ against 
 the singleton language with $L = \{w\}$
 in formal language theory.
-However for the purposes here, the $\textit{Ders}$ definition with 
+However, for the purposes here, the $\textit{Ders}$ definition with 
 a single string is sufficient.
 
-With the  sequencing, Kleene star, and $\textit{Der}$ operator on languages,
+With the sequencing, Kleene star, and $\textit{Der}$ operator on languages,
 we have a  few properties of how the language derivative can be defined using 
 sub-languages.
 \begin{lemma}
@@ -113,26 +123,28 @@
 $\textit{Der} \;c \;(A*) = (\textit{Der}\; c A) @ (A*)$\\
 \end{lemma}
 \begin{proof}
+There are too inclusions to prove:
 \begin{itemize}
-\item{$\subseteq$}
-\noindent
+\item{$\subseteq$}:\\
 The set 
 \[ \{s \mid c :: s \in A*\} \]
 is enclosed in the set
-\[ \{s_1 @ s_2 \mid s_1 \, s_2. s_1 \in \{s \mid c :: s \in A\} \land s_2 \in A* \} \]
+\[ \{s_1 @ s_2 \mid s_1 \, s_2.\;  s_1 \in \{s \mid c :: s \in A\} \land s_2 \in A* \} \]
 because whenever you have a string starting with a character 
 in the language of a Kleene star $A*$, 
 then that character together with some sub-string
 immediately after it will form the first iteration, 
 and the rest of the string will 
 be still in $A*$.
-\item{$\supseteq$}
+\item{$\supseteq$}:\\
 Note that
 \[ \Der \; c \; (A*) = \Der \; c \;  (\{ [] \} \cup (A @ A*) ) \]
-and 
+hold.
+Also this holds:
 \[ \Der \; c \;  (\{ [] \} \cup (A @ A*) ) = \Der\; c \; (A @ A*) \]
-where the $\textit{RHS}$ of the above equatioin can be rewritten
-as \[ (\Der \; c\; A) @ A* \cup A' \], $A'$ being a possibly empty set.
+where the $\textit{RHS}$ can be rewritten
+as \[ (\Der \; c\; A) @ A* \cup (\Der \; c \; (A*)) \]
+which of course contains $\Der \; c \; A @ A*$.
 \end{itemize}
 \end{proof}
 
@@ -141,7 +153,7 @@
 for regular languages, we need to first give definitions for regular expressions.
 
 \subsection{Regular Expressions and Their Meaning}
-The basic regular expressions  are defined inductively
+The \emph{basic regular expressions} are defined inductively
  by the following grammar:
 \[			r ::=   \ZERO \mid  \ONE
 			 \mid  c  
@@ -150,81 +162,127 @@
 			 \mid r^*         
 \]
 \noindent
-We call them basic because we might introduce
-more constructs later such as negation
+We call them basic because we will introduce
+additional constructors in later chapters such as negation
 and bounded repetitions.
-We defined the regular expression containing
-nothing as $\ZERO$, note that some authors
-also use $\phi$ for that.
-Similarly, the regular expression denoting the 
-singleton set with only $[]$ is sometimes 
-denoted by $\epsilon$, but we use $\ONE$ here.
-
-The language or set of strings denoted 
-by regular expressions are defined as
+We use $\ZERO$ for the regular expression that
+matches no string, and $\ONE$ for the regular
+expression that matches only the empty string\footnote{
+some authors
+also use $\phi$ and $\epsilon$ for $\ZERO$ and $\ONE$
+but we prefer our notation}. 
+The sequence regular expression is written $r_1\cdot r_2$
+and sometimes we omit the dot if it is clear which
+regular expression is meant; the alternative
+is written $r_1 + r_2$.
+The \emph{language} or meaning of 
+a regular expression is defined recursively as
+a set of strings:
 %TODO: FILL in the other defs
 \begin{center}
 \begin{tabular}{lcl}
-$L \; (\ZERO)$ & $\dn$ & $\phi$\\
-$L \; (\ONE)$ & $\dn$ & $\{[]\}$\\
-$L \; (c)$ & $\dn$ & $\{[c]\}$\\
-$L \; (r_1 + r_2)$ & $\dn$ & $ L \; (r_1) \cup L \; ( r_2)$\\
-$L \; (r_1 \cdot r_2)$ & $\dn$ & $ L \; (r_1) @ L \; (r_2)$\\
-$L \; (r^*)$ & $\dn$ & $ (L(r))^*$
+$L \; \ZERO$ & $\dn$ & $\phi$\\
+$L \; \ONE$ & $\dn$ & $\{[]\}$\\
+$L \; c$ & $\dn$ & $\{[c]\}$\\
+$L \; r_1 + r_2$ & $\dn$ & $ L \; r_1 \cup L \; r_2$\\
+$L \; r_1 \cdot r_2$ & $\dn$ & $ L \; r_1 @ L \; r_2$\\
+$L \; r^*$ & $\dn$ & $ (L\;r)*$
 \end{tabular}
 \end{center}
 \noindent
-Which is also called the "language interpretation" of
-a regular expression.
-
 Now with semantic derivatives of a language and regular expressions and
 their language interpretations in place, we are ready to define derivatives on regexes.
 \subsection{Brzozowski Derivatives and a Regular Expression Matcher}
-
-\ChristianComment{Hi this part I want to keep the ordering as is, so that it keeps the 
-readers engaged with a story how we got to the definition of $\backslash$, rather 
-than first "overwhelming" them with the definition of $\nullable$.}
-
-The language derivative acts on a string set and chops off a character from
-all strings in that set, we want to define a derivative operation on regular expressions
-so that after derivative $L(r\backslash c)$ 
-will look as if it was obtained by doing a language derivative on $L(r)$:
+%Recall, the language derivative acts on a set of strings
+%and essentially chops off a particular character from
+%all strings in that set, Brzozowski defined a derivative operation on regular expressions
+%so that after derivative $L(r\backslash c)$ 
+%will look as if it was obtained by doing a language derivative on $L(r)$:
+Recall that the semantic derivative acts on a set of 
+strings. Brzozowski noticed that this operation
+can be ``mirrored" on regular expressions which
+he calls the derivative of a regular expression $r$
+with respect to a character $c$, written
+$r \backslash c$.
+He defined this operation such that the following property holds:
 \begin{center}
-\[
-r\backslash c \dn ?
-\]
-so that
-\[
-L(r \backslash c) = \Der \; c \; L(r) ?
-\]
-\end{center}
-So we mimic the equalities we have for $\Der$ on language concatenation
 
 \[
-\Der \; c \; (A @ B) = \textit{if} \;  [] \in A \; \textit{then} ((\Der \; c \; A) @ B ) \cup \Der \; c\; B \quad \textit{else}\; (\Der \; c \; A) @ B\\
+L(r \backslash c) = \Der \; c \; L(r)
 \]
-to get the derivative for sequence regular expressions:
-\[
-(r_1 \cdot r_2 ) \backslash c = \textit{if}\,([] \in L(r_1)) r_1 \backslash c \cdot r_2 + r_2 \backslash c \textit{else} (r_1 \backslash c) \cdot r_2
-\]
+\end{center}
+\noindent
+For example in the sequence case we have 
+\begin{center}
+	\begin{tabular}{lcl}
+		$\Der \; c \; (A @ B)$ & $\dn$ & 
+		$ \textit{if} \;\,  [] \in A \; 
+		\textit{then} \;\, ((\Der \; c \; A) @ B ) \cup 
+		\Der \; c\; B$\\
+		& & $\textit{else}\; (\Der \; c \; A) @ B$\\
+	\end{tabular}
+\end{center}
+\noindent
+This can be translated to  
+regular expressions in the following 
+manner:
+\begin{center}
+	\begin{tabular}{lcl}
+		$(r_1 \cdot r_2 ) \backslash c$ & $\dn$ & $\textit{if}\;\,([] \in L(r_1)) r_1 \backslash c \cdot r_2 + r_2 \backslash c$ \\
+		& & $\textit{else} \; (r_1 \backslash c) \cdot r_2$
+	\end{tabular}
+\end{center}
 
 \noindent
-and language Kleene star:
+And similarly, the Kleene star's semantic derivative
+can be expressed as
 \[
-\textit{Der} \;c \;A* = (\textit{Der}\; c A) @ (A*)
+	\textit{Der} \;c \;(A*) \dn (\textit{Der}\; c A) @ (A*)
 \]
-to get derivative of the Kleene star regular expression:
+which translates to
 \[
-r^* \backslash c = (r \backslash c)\cdot r^*
+	(r^*) \backslash c \dn (r \backslash c)\cdot r^*.
 \]
-Note that although we can formalise the boolean predicate
-$[] \in L(r_1)$ without problems, if we want a function that works
-computationally, then we would have to define a function that tests
-whether an empty string is in the language of a regular expression.
-We call such a function $\nullable$:
-
-
-
+In the above definition of $(r_1\cdot r_2) \backslash c$,
+the $\textit{if}$ clause's
+boolean condition 
+$[] \in L(r_1)$ needs to be 
+somehow recursively computed.
+We call such a function that checks
+whether the empty string $[]$ is 
+in the language of a regular expression $\nullable$:
+\begin{center}
+		\begin{tabular}{lcl}
+			$\nullable(\ZERO)$     & $\dn$ & $\mathit{false}$ \\  
+			$\nullable(\ONE)$      & $\dn$ & $\mathit{true}$ \\
+			$\nullable(c)$ 	       & $\dn$ & $\mathit{false}$ \\
+			$\nullable(r_1 + r_2)$ & $\dn$ & $\nullable(r_1) \vee \nullable(r_2)$ \\
+			$\nullable(r_1\cdot r_2)$  & $\dn$ & $\nullable(r_1) \wedge \nullable(r_2)$ \\
+			$\nullable(r^*)$       & $\dn$ & $\mathit{true}$ \\
+		\end{tabular}
+\end{center}
+\noindent
+The $\ZERO$ regular expression
+does not contain any string and
+therefore is not \emph{nullable}.
+$\ONE$ is \emph{nullable} 
+by definition. 
+The character regular expression $c$
+corresponds to the singleton set $\{c\}$, 
+and therefore does not contain the empty string.
+The alternative regular expression is nullable
+if at least one of its children is nullable.
+The sequence regular expression
+would require both children to have the empty string
+to compose an empty string, and the Kleene star
+is always nullable because it naturally
+contains the empty string. 
+  
+The derivative function, written $r\backslash c$, 
+defines how a regular expression evolves into
+a new one after all the string it contains is acted on: 
+if it starts with $c$, then the character is chopped of,
+if not, that string is removed.
 \begin{center}
 \begin{tabular}{lcl}
 		$\ZERO \backslash c$ & $\dn$ & $\ZERO$\\  
@@ -239,67 +297,32 @@
 \end{tabular}
 \end{center}
 \noindent
-The function derivative, written $r\backslash c$, 
-defines how a regular expression evolves into
-a new regular expression after all the string it contains
-is chopped off a certain head character $c$.
-The most involved cases are the sequence 
-and star case.
+The most involved cases are the sequence case
+and the star case.
 The sequence case says that if the first regular expression
-contains an empty string then the second component of the sequence
-might be chosen as the target regular expression to be chopped
-off its head character.
-The star regular expression's derivative unwraps the iteration of
-regular expression and attaches the star regular expression
-to the sequence's second element to make sure a copy is retained
-for possible more iterations in later phases of lexing.
-
-
-To test whether $[] \in L(r_1)$, we need the $\nullable$ function,
-which tests whether the empty string $""$ 
-is in the language of $r$:
-
-
-\begin{center}
-		\begin{tabular}{lcl}
-			$\nullable(\ZERO)$     & $\dn$ & $\mathit{false}$ \\  
-			$\nullable(\ONE)$      & $\dn$ & $\mathit{true}$ \\
-			$\nullable(c)$ 	       & $\dn$ & $\mathit{false}$ \\
-			$\nullable(r_1 + r_2)$ & $\dn$ & $\nullable(r_1) \vee \nullable(r_2)$ \\
-			$\nullable(r_1\cdot r_2)$  & $\dn$ & $\nullable(r_1) \wedge \nullable(r_2)$ \\
-			$\nullable(r^*)$       & $\dn$ & $\mathit{true}$ \\
-		\end{tabular}
-\end{center}
-\noindent
-The empty set does not contain any string and
-therefore not the empty string, the empty string 
-regular expression contains the empty string
-by definition, the character regular expression
-is the singleton that contains character only,
-and therefore does not contain the empty string,
-the alternative regular expression (or "or" expression)
-might have one of its children regular expressions
-being nullable and any one of its children being nullable
-would suffice. The sequence regular expression
-would require both children to have the empty string
-to compose an empty string and the Kleene star
-operation naturally introduced the empty string. 
-  
-We have the following property where the derivative on regular 
-expressions coincides with the derivative on a set of strings:
-
-\begin{lemma}
+contains an empty string, then the second component of the sequence
+needs to be considered, as its derivative will contribute to the
+result of this derivative.
+The star regular expression $r^*$'s derivative 
+unwraps one iteration of $r$, turns it into $r\backslash c$,
+and attaches the original $r^*$
+after $r\backslash c$, so that 
+we can further unfold it as many times as needed.
+We have the following correspondence between 
+derivatives on regular expressions and
+derivatives on a set of strings:
+\begin{lemma}\label{derDer}
 $\textit{Der} \; c \; L(r) = L (r\backslash c)$
 \end{lemma}
 
 \noindent
 The main property of the derivative operation
-that enables us to reason about the correctness of
-an algorithm using derivatives is 
+(that enables us to reason about the correctness of
+derivative-based matching)
+is 
 
 \begin{lemma}\label{derStepwise}
-$c\!::\!s \in L(r)$ holds
-if and only if $s \in L(r\backslash c)$.
+	$c\!::\!s \in L(r)$ \textit{iff} $s \in L(r\backslash c)$.
 \end{lemma}
 
 \noindent
@@ -314,12 +337,14 @@
 \end{center}
 
 \noindent
-When there is no ambiguity we will use  $\backslash$ to denote
+When there is no ambiguity, we will 
+omit the subscript and use $\backslash$ instead
+of $\backslash_r$ to denote
 string derivatives for brevity.
 Brzozowski's  regular-expression matcher algorithm can then be described as:
 
 \begin{definition}
-$\textit{match}\;s\;r \;\dn\; \nullable(r\backslash s)$
+$\textit{match}\;s\;r \;\dn\; \nullable \; (r\backslash s)$
 \end{definition}
 
 \noindent
@@ -336,17 +361,15 @@
  It can  be
 relatively  easily shown that this matcher is correct:
 \begin{lemma}
-$\textit{match} \; s\; r  = \textit{true} \Longleftrightarrow s \in L(r)$
+	$\textit{match} \; s\; r  = \textit{true} \; \textit{iff} \; s \in L(r)$
 \end{lemma}
 \begin{proof}
-By the stepwise property of $\backslash$ (\ref{derStepwise})
+	By the stepwise property of derivatives (lemma \ref{derStepwise})
+	and lemma \ref{derDer}. 
 \end{proof}
 \noindent
-If we implement the above algorithm naively, however,
-the algorithm can be excruciatingly slow.
-
-
-\begin{figure}
+\begin{center}
+	\begin{figure}
 \begin{tikzpicture}
 \begin{axis}[
     xlabel={$n$},
@@ -360,84 +383,93 @@
 \end{tikzpicture} 
 \caption{Matching $(a^*)^*b$ against $\protect\underbrace{aa\ldots a}_\text{n \textit{a}s}$}\label{NaiveMatcher}
 \end{figure}
-   
+\end{center} 
 \noindent
-For this we need to introduce certain 
+If we implement the above algorithm naively, however,
+the algorithm can be excruciatingly slow, as shown in 
+\ref{NaiveMatcher}.
+Note that both axes are in logarithmic scale.
+Around two dozens characters
+would already explode the matcher on regular expression 
+$(a^*)^*b$.
+For this, we need to introduce certain 
 rewrite rules for the intermediate results,
 such as $r + r \rightarrow r$,
 and make sure those rules do not change the 
 language of the regular expression.
-We have a simplification function (that is as simple as possible
-while having much power on making a regex simpler):
-\begin{verbatim}
-def simp(r: Rexp) : Rexp = r match {
-  case SEQ(r1, r2) => 
-    (simp(r1), simp(r2)) match {
-      case (ZERO, _) => ZERO
-      case (_, ZERO) => ZERO
-      case (ONE, r2s) => r2s
-      case (r1s, ONE) => r1s
-      case (r1s, r2s) => SEQ(r1s, r2s)
-    }
-  case ALTS(r1, r2) => {
-    (simp(r1), simp(r2)) match {
-      case (ZERO, r2s) => r2s
-      case (r1s, ZERO) => r1s
-      case (r1s, r2s) =>
-        if(r1s == r2s) r1s else ALTS(r1s, r2s)
-    }
-  }
-  case r => r
-}
-\end{verbatim}
-If we repeatedly incorporate these 
-rules during the matching algorithm, 
-we have a lexer with simplification:
-\begin{verbatim}
-def ders_simp(s: List[Char], r: Rexp) : Rexp = s match {
-  case Nil => simp(r)
-  case c :: cs => ders_simp(cs, simp(der(c, r)))
-}
-
-def simp_matcher(s: String, r: Rexp) : Boolean = 
-  nullable(ders_simp(s.toList, r))
-
-\end{verbatim}
-\noindent
-After putting in those rules, the example of \ref{NaiveMatcher}
-is now very tame in the length of inputs:
-
-
+One simpled-minded simplification function
+that achieves these requirements is given below:
+\begin{center}
+	\begin{tabular}{lcl}
+		$\simp \; r_1 \cdot r_2 $ & $ \dn$ & 
+		$(\simp \; r_1,  \simp \; r_2) \; \textit{match}$\\
+					  & & $\quad \case \; (\ZERO, \_) \Rightarrow \ZERO$\\
+					  & & $\quad \case \; (\_, \ZERO) \Rightarrow \ZERO$\\
+					  & & $\quad \case \; (\ONE, r_2') \Rightarrow r_2'$\\
+					  & & $\quad \case \; (r_1', \ONE) \Rightarrow r_1'$\\
+					  & & $\quad \case \; (r_1', r_2') \Rightarrow r_1'\cdot r_2'$\\
+		$\simp \; r_1 + r_2$ & $\dn$ & $(\simp \; r_1, \simp \; r_2) \textit{match}$\\
+				     & & $\quad \; \case \; (\ZERO, r_2') \Rightarrow r_2'$\\
+				     & & $\quad \; \case \; (r_1', \ZERO) \Rightarrow r_1'$\\
+				     & & $\quad \; \case \; (r_1', r_2') \Rightarrow r_1' + r_2'$\\
+		$\simp \; r$ & $\dn$ & $r$
+				   
+	\end{tabular}
+\end{center}
+If we repeatedly apply this simplification  
+function during the matching algorithm, 
+we have a matcher with simplification:
+\begin{center}
+	\begin{tabular}{lcl}
+		$\derssimp \; [] \; r$ & $\dn$ & $r$\\
+		$\derssimp \; c :: cs \; r$ & $\dn$ & $\derssimp \; cs \; (\simp \; (r \backslash c))$\\
+		$\textit{matcher}_{simp}\; s \; r $ & $\dn$ & $\nullable \; (\derssimp \; s\;r)$
+	\end{tabular}
+\end{center}
+\begin{figure}
 \begin{tikzpicture}
 \begin{axis}[
     xlabel={$n$},
     ylabel={time in secs},
     ymode = log,
     xmode = log,
+    grid = both,
     legend entries={Matcher With Simp},  
     legend pos=north west,
     legend cell align=left]
 \addplot[red,mark=*, mark options={fill=white}] table {BetterMatcher.data};
 \end{axis}
-\end{tikzpicture} \label{fig:BetterMatcher}
-
+\end{tikzpicture} 
+\caption{$(a^*)^*b$ 
+against 
+$\protect\underbrace{aa\ldots a}_\text{n \textit{a}s}$ Using $\textit{matcher}_{simp}$}\label{BetterMatcher}
+\end{figure}
+\noindent
+The running time of $\textit{ders}\_\textit{simp}$
+on the same example of \ref{NaiveMatcher}
+is now very tame in terms of the length of inputs,
+as shown in \ref{BetterMatcher}.
 
-\noindent
-Note how the x-axis is in logarithmic scale.
 Building derivatives and then testing the existence
 of empty string in the resulting regular expression's language,
-and add simplification rules when necessary.
+adding simplifications when necessary.
 So far, so good. But what if we want to 
 do lexing instead of just getting a YES/NO answer?
-\citeauthor{Sulzmann2014} first came up with a nice and 
+Sulzmanna and Lu \cite{Sulzmann2014} first came up with a nice and 
 elegant (arguably as beautiful as the definition of the original derivative) solution for this.
 
 \section{Values and the Lexing Algorithm by Sulzmann and Lu}
-Here we present the hybrid phases of a regular expression lexing 
-algorithm using the function $\inj$, as given by Sulzmann and Lu.
-They first defined the datatypes for storing the 
-lexing information called a \emph{value} or
-sometimes also \emph{lexical value}.  These values and regular
+In this section, we present a two-phase regular expression lexing 
+algorithm.
+The first phase takes successive derivatives with 
+respect to the input string,
+and the second phase does the reverse, \emph{injecting} back
+characters, in the meantime constructing a lexing result.
+We will introduce the injection phase in detail slightly
+later, but as a preliminary we have to first define 
+the datatype for lexing results, 
+called \emph{value} or
+sometimes also \emph{lexical value}.  Values and regular
 expressions correspond to each other as illustrated in the following
 table:
 
@@ -466,56 +498,116 @@
 		\end{tabular}
 	\end{tabular}
 \end{center}
-
 \noindent
-We have a formal binary relation for telling whether the structure
-of a regular expression agrees with the value.
+A value has an underlying string, which 
+can be calculated by the ``flatten" function $|\_|$:
+\begin{center}
+	\begin{tabular}{lcl}
+		$|\Empty|$ & $\dn$ &  $[]$\\
+		$|\Char \; c|$ & $ \dn$ & $ [c]$\\
+		$|\Seq(v_1, v_2)|$ & $ \dn$ & $ v_1| @ |v_2|$\\
+		$|\Left(v)|$ & $ \dn$ & $ |v|$\\
+		$|\Right(v)|$ & $ \dn$ & $ |v|$\\
+		$|\Stars([])|$ & $\dn$ & $[]$\\
+		$|\Stars(v::vs)|$ &  $\dn$ & $ |v| @ |\Stars(vs)|$
+	\end{tabular}
+\end{center}
+Sulzmann and Lu used a binary predicate, written $\vdash v:r $,
+to indicate that a value $v$ could be generated from a lexing algorithm
+with input $r$. They call it the value inhabitation relation. 
 \begin{mathpar}
-\inferrule{}{\vdash \Char(c) : \mathbf{c}} \hspace{2em}
-\inferrule{}{\vdash \Empty :  \ONE} \hspace{2em}
-\inferrule{\vdash v_1 : r_1 \\ \vdash v_2 : r_2 }{\vdash \Seq(v_1, v_2) : (r_1 \cdot r_2)}
-\end{mathpar}
+	\inferrule{\mbox{}}{\vdash \Char(c) : \mathbf{c}} \hspace{2em}
+
+	\inferrule{\mbox{}}{\vdash \Empty :  \ONE} \hspace{2em}
+
+\inferrule{\vdash v_1 : r_1 \;\; \vdash v_2 : r_2 }{\vdash \Seq(v_1, v_2) : (r_1 \cdot r_2)}
+
+\inferrule{\vdash v_1 : r_1}{\vdash \Left(v_1):r_1+r_2}
+
+\inferrule{\vdash v_2 : r_2}{\vdash \Right(v_2):r_1 + r_2}
 
-Building on top of Sulzmann and Lu's attempt to formalise the 
-notion of POSIX lexing rules \parencite{Sulzmann2014}, 
-Ausaf and Urban\parencite{AusafDyckhoffUrban2016} modelled
-POSIX matching as a ternary relation recursively defined in a
-natural deduction style.
-The formal definition of a $\POSIX$ value $v$ for a regular expression
+\inferrule{\forall v \in vs. \vdash v:r \land  |v| \neq []}{\vdash \Stars(vs):r^*}
+\end{mathpar}
+\noindent
+The condition $|v| \neq []$ in the premise of star's rule
+is to make sure that for a given pair of regular 
+expression $r$ and string $s$, the number of values 
+satisfying $|v| = s$ and $\vdash v:r$ is finite.
+Given the same string and regular expression, there can be
+multiple values for it. For example, both
+$\vdash \Seq(\Left \; ab)(\Right \; c):(ab+a)(bc+c)$ and
+$\vdash \Seq(\Right\; a)(\Left \; bc ):(ab+a)(bc+c)$ hold
+and the values both flatten to $abc$.
+Lexers therefore have to disambiguate and choose only
+one of the values to output. $\POSIX$ is one of the
+disambiguation strategies that is widely adopted.
+
+Ausaf and Urban\parencite{AusafDyckhoffUrban2016} 
+formalised the property 
+as a ternary relation.
+The $\POSIX$ value $v$ for a regular expression
 $r$ and string $s$, denoted as $(s, r) \rightarrow v$, can be specified 
-in the following set of rules:
-\ChristianComment{Will complete later}
-\newcommand*{\inference}[3][t]{%
-   \begingroup
-   \def\and{\\}%
-   \begin{tabular}[#1]{@{\enspace}c@{\enspace}}
-   #2 \\
-   \hline
-   #3
-   \end{tabular}%
-   \endgroup
-}
-\begin{center}
-\inference{$s_1 @ s_2 = s$ \and $(\nexists s_3 s_4 s_5. s_1 @ s_5 = s_3 \land s_5 \neq [] \land s_3 @ s_4 = s \land (s_3, r_1) \rightarrow v_3 \land (s_4, r_2) \rightarrow v_4)$ \and $(s_1, r_1) \rightarrow v_1$ \and $(s_2, r_2) \rightarrow v_2$  }{$(s, r_1 \cdot r_2) \rightarrow \Seq(v_1, v_2)$ }
-\end{center}
+in the following set of rules\footnote{The names of the rules are used
+as they were originally given in \cite{AusafDyckhoffUrban2016}}:
+\noindent
+\begin{figure}
+\begin{mathpar}
+	\inferrule[P1]{\mbox{}}{([], \ONE) \rightarrow \Empty}
+		
+	\inferrule[PC]{\mbox{}}{([c], c) \rightarrow \Char \; c}
+
+	\inferrule[P+L]{(s,r_1)\rightarrow v_1}{(s, r_1+r_2)\rightarrow \Left \; v_1}
+
+	\inferrule[P+R]{(s,r_2)\rightarrow v_2\\ s \notin L \; r_1}{(s, r_1+r_2)\rightarrow \Right \; v_2}
+
+	\inferrule[PS]{(s_1, v_1) \rightarrow r_1 \\ (s_2, v_2)\rightarrow r_2\\
+		\nexists s_3 \; s_4. s_3 \neq [] \land s_3 @ s_4 = s_2 \land 
+		s_1@ s_3 \in L \; r_1 \land s_4 \in L \; r_2}{(s_1 @ s_2, r_1\cdot r_2) \rightarrow
+	\Seq \; v_1 \; v_2}
+
+	\inferrule[P{[]}]{\mbox{}}{([], r^*) \rightarrow \Stars([])}
+
+	\inferrule[P*]{(s_1, v) \rightarrow v \\ (s_2, r^*) \rightarrow \Stars \; vs \\
+		|v| \neq []\\ \nexists s_3 \; s_4. s_3 \neq [] \land s_3@s_4 = s_2 \land
+		s_1@s_3 \in L \; r \land s_4 \in L \; r^*}{(s_1@s_2, r^*)\rightarrow \Stars \;
+	(v::vs)}
+\end{mathpar}
+\caption{POSIX Lexing Rules}
+\end{figure}
 \noindent
-The above $\POSIX$ rules could be explained intuitionally as
+The above $\POSIX$ rules follows the intuition described below: 
 \begin{itemize}
-\item
-match the leftmost regular expression when multiple options of matching
-are available  
-\item 
-always match a subpart as much as possible before proceeding
-to the next token.
+	\item (Left Priority)\\
+		Match the leftmost regular expression when multiple options of matching
+		are available.
+	\item (Maximum munch)\\
+		Always match a subpart as much as possible before proceeding
+		to the next token.
 \end{itemize}
-
-The reason why we are interested in $\POSIX$ values is that they can
-be practically used in the lexing phase of a compiler front end.
+\noindent
+These disambiguation strategies can be 
+quite practical.
 For instance, when lexing a code snippet 
-$\textit{iffoo} = 3$ with the regular expression $\textit{keyword} + \textit{identifier}$, we want $\textit{iffoo}$ to be recognized
-as an identifier rather than a keyword.
-\ChristianComment{Do I also introduce lexical values $LV$ here?}
-We know that $\POSIX$ values are also part of the normal values:
+\[ 
+	\textit{iffoo} = 3
+\]
+using the regular expression (with 
+keyword and identifier having their 
+usualy definitions on any formal
+language textbook, for instance
+keyword is a nonempty string starting with letters 
+followed by alphanumeric characters or underscores):
+\[
+	\textit{keyword} + \textit{identifier},
+\]
+we want $\textit{iffoo}$ to be recognized
+as an identifier rather than a keyword (if)
+followed by
+an identifier (foo).
+POSIX lexing achieves this.
+
+We know that a $\POSIX$ value is also a normal underlying
+value:
 \begin{lemma}
 $(r, s) \rightarrow v \implies \vdash v: r$
 \end{lemma}
@@ -527,10 +619,13 @@
 $\textit{if} \,(s, r) \rightarrow v_1 \land (s, r) \rightarrow v_2\quad  \textit{then} \; v_1 = v_2$
 \end{lemma}
 \begin{proof}
-By induction on $s$, $r$ and $v_1$. The induction principle is
-the \POSIX rules. Each case is proven by a combination of
-the induction rules for $\POSIX$ values and the inductive hypothesis.
-Probably the most cumbersome cases are the sequence and star with non-empty iterations.
+By induction on $s$, $r$ and $v_1$. The inductive cases
+are all the POSIX rules. 
+Each case is proven by a combination of
+the induction rules for $\POSIX$ 
+values and the inductive hypothesis.
+Probably the most cumbersome cases are 
+the sequence and star with non-empty iterations.
 
 We give the reasoning about the sequence case as follows:
 When we have $(s_1, r_1) \rightarrow v_1$ and $(s_2, r_2) \rightarrow v_2$, 
@@ -544,15 +639,14 @@
 which means this "different" $\POSIX$ value $\Seq(v_1', v_2')$
 is the same as $\Seq(v_1, v_2)$. 
 \end{proof}
-Now we know what a $\POSIX$ value is, the problem is how do we achieve 
+Now we know what a $\POSIX$ value is; the problem is how do we achieve 
 such a value in a lexing algorithm, using derivatives?
 
 \subsection{Sulzmann and Lu's Injection-based Lexing Algorithm}
 
 The contribution of Sulzmann and Lu is an extension of Brzozowski's
 algorithm by a second phase (the first phase being building successive
-derivatives---see \ref{graph:successive_ders}). In this second phase, a POSIX value 
-is generated if the regular expression matches the string.
+derivatives---see \ref{graph:successive_ders}). This second phase generates a POSIX value if the regular expression matches the string.
 Two functions are involved: $\inj$ and $\mkeps$.
 The function $\mkeps$ constructs a value from the last
 one of all the successive derivatives:
@@ -570,13 +664,13 @@
 
 	\begin{center}
 		\begin{tabular}{lcl}
-			$\mkeps(\ONE)$ 		& $\dn$ & $\Empty$ \\
-			$\mkeps(r_{1}+r_{2})$	& $\dn$ 
-			& \textit{if} $\nullable(r_{1})$\\ 
-			& & \textit{then} $\Left(\mkeps(r_{1}))$\\ 
-			& & \textit{else} $\Right(\mkeps(r_{2}))$\\
-			$\mkeps(r_1\cdot r_2)$ 	& $\dn$ & $\Seq\,(\mkeps\,r_1)\,(\mkeps\,r_2)$\\
-			$mkeps(r^*)$	        & $\dn$ & $\Stars\,[]$
+			$\mkeps \; \ONE$ 		& $\dn$ & $\Empty$ \\
+			$\mkeps \; r_{1}+r_{2}$	& $\dn$ 
+						& \textit{if} ($\nullable \; r_{1})$\\ 
+			& & \textit{then} $\Left (\mkeps \; r_{1})$\\ 
+			& & \textit{else} $\Right(\mkeps \; r_{2})$\\
+			$\mkeps \; r_1\cdot r_2$ 	& $\dn$ & $\Seq\;(\mkeps\;r_1)\;(\mkeps \; r_2)$\\
+			$\mkeps \; r^* $	        & $\dn$ & $\Stars\;[]$
 		\end{tabular}
 	\end{center}
 
@@ -669,9 +763,10 @@
  The central property of the $\lexer$ is that it gives the correct result by
  $\POSIX$ standards:
  \begin{theorem}
- \begin{tabular}{l}
- $\lexer \; r \; s = \Some(v) \Longleftrightarrow (r, \; s) \rightarrow v$\\
- $\lexer \;r \; s = \None \Longleftrightarrow \neg(\exists v. (r, s) \rightarrow v)$
+	 The $\lexer$ based on derivatives and injections is both sound and complete:
+ \begin{tabular}{lcl}
+	 $\lexer \; r \; s = \Some(v)$ & $ \Longleftrightarrow$ & $ (r, \; s) \rightarrow v$\\
+	 $\lexer \;r \; s = \None $ & $\Longleftrightarrow$ & $ \neg(\exists v. (r, s) \rightarrow v)$
  \end{tabular}
  \end{theorem}
  
@@ -776,7 +871,7 @@
 of a match. This means Sulzmann and Lu's injection-based algorithm 
  exponential by nature.
 Somehow one has to make sure which
- lexical values are $\POSIX$ and need to be kept in a lexing algorithm.
+ lexical values are $\POSIX$ and must be kept in a lexing algorithm.
 
 
  For example, the above $r= (a^*\cdot a^*)^*$  and 
@@ -788,7 +883,7 @@
 Can we not create those intermediate values $v_1,\ldots v_n$,
 and get the lexing information that should be already there while
 doing derivatives in one pass, without a second injection phase?
-In the meantime, can we make sure that simplifications
+In the meantime, can we ensure that simplifications
 are easily handled without breaking the correctness of the algorithm?
 
 Sulzmann and Lu solved this problem by
--- a/ChengsongTanPhdThesis/Chapters/Introduction.tex	Tue Jul 05 00:42:06 2022 +0100
+++ b/ChengsongTanPhdThesis/Chapters/Introduction.tex	Wed Jul 13 08:27:28 2022 +0100
@@ -21,6 +21,7 @@
 \newcommand{\ASEQ}[3]{\textit{ASEQ}_{#1} \, #2 \, #3}
 \newcommand{\bderssimp}[2]{#1 \backslash_{bsimps} #2}
 \newcommand{\rderssimp}[2]{#1 \backslash_{rsimp} #2}
+\def\derssimp{\textit{ders}\_\textit{simp}}
 \def\rders{\textit{rders}}
 \newcommand{\bders}[2]{#1 \backslash #2}
 \newcommand{\bsimp}[1]{\textit{bsimp}(#1)}
@@ -39,6 +40,7 @@
 
 \newcommand\myequiv{\mathrel{\stackrel{\makebox[0pt]{\mbox{\normalfont\tiny equiv}}}{=}}}
 
+\def\case{\textit{case}}
 \def\sequal{\stackrel{\mbox{\scriptsize rsimp}}{=}}
 \def\rsimpalts{\textit{rsimp}_{ALTS}}
 \def\good{\textit{good}}
@@ -82,7 +84,7 @@
 \def\blexer{\textit{blexer}}
 \def\flex{\textit{flex}}
 \def\inj{\mathit{inj}}
-\def\Empty{\mathit{Empty}}
+\def\Empty{\textit{Empty}}
 \def\Left{\mathit{Left}}
 \def\Right{\mathit{Right}}
 \def\Stars{\mathit{Stars}}
--- a/ChengsongTanPhdThesis/main.tex	Tue Jul 05 00:42:06 2022 +0100
+++ b/ChengsongTanPhdThesis/main.tex	Wed Jul 13 08:27:28 2022 +0100
@@ -46,7 +46,8 @@
 \usepackage{mathpazo} % Use the Palatino font by default
 \usepackage{hyperref}
 \usepackage{lipsum}
-\usepackage[backend=bibtex,style=authoryear,natbib=true]{biblatex} % Use the bibtex backend with the authoryear citation style (which resembles APA)
+\usepackage[backend=bibtex]{biblatex} % Use the bibtex backend with the authoryear citation style (which resembles APA)
+%style=authoryear, natbib=true 
 \usepackage{stmaryrd}
 \usepackage{caption}
 
@@ -71,7 +72,11 @@
 \usepackage{tikz}
 \usetikzlibrary{automata, positioning, calc}
 \usetikzlibrary{fit,
-                shapes.geometric}
+                shapes.geometric,
+		patterns,
+		backgrounds,
+		graphs}
+\usetikzlibrary{babel}
 \usepackage{mathpartir}
 \usepackage{stackrel}
 
--- a/thys2/blexer2.sc	Tue Jul 05 00:42:06 2022 +0100
+++ b/thys2/blexer2.sc	Wed Jul 13 08:27:28 2022 +0100
@@ -781,7 +781,7 @@
 
 // fun pAKC_aux :: "arexp list ⇒ arexp ⇒ rexp ⇒ arexp"
 //   where
-// "pAKC_aux rsa r ctx = (if (regConcat (SEQ (erase r) ( ctx) )) ⊆ (regConcat (erase (AALTs [] rsa))) then AZERO else
+// "pAKC_aux rsa r ctx = (if (seqFold (SEQ (erase r) ( ctx) )) ⊆ (seqFold (erase (AALTs [] rsa))) then AZERO else
 //                           case r of (ASEQ bs r1 r2) ⇒ 
 //                             bsimp_ASEQ bs (pAKC_aux rsa r1 (SEQ  (erase r2) ( ctx) )) r2   |
 //                                     (AALTs bs rs) ⇒ 
@@ -796,13 +796,13 @@
 
 
 //the "fake" Language interpretation: just concatenates!
-def regConcat(acc: Rexp, rs: List[Rexp]) : Rexp = rs match {
+def seqFold(acc: Rexp, rs: List[Rexp]) : Rexp = rs match {
   case Nil => acc
   case r :: rs1 => 
     // if(acc == ONE) 
-    //   regConcat(r, rs1) 
+    //   seqFold(r, rs1) 
     // else
-      regConcat(SEQ(acc, r), rs1)
+      seqFold(SEQ(acc, r), rs1)
 }
 
 def rprint(r: Rexp) : Unit = println(shortRexpOutput(r))
@@ -819,9 +819,9 @@
   // println("ctx---------")
   // rsprint(ctx)
   // println("ctx---------end")
-  // rsprint(breakIntoTerms(regConcat(erase(r), ctx)).map(oneSimp))
+  // rsprint(breakIntoTerms(seqFold(erase(r), ctx)).map(oneSimp))
 
-  if (breakIntoTerms(regConcat(erase(r), ctx)).map(oneSimp).forall(acc.contains)) {//acc.flatMap(breakIntoTerms
+  if (breakIntoTerms(seqFold(erase(r), ctx)).map(oneSimp).forall(acc.contains)) {//acc.flatMap(breakIntoTerms
     AZERO
   }
   else{
@@ -928,12 +928,12 @@
 }
 
 def attachCtx(r: ARexp, ctx: List[Rexp]) : Set[Rexp] = {
-  turnIntoTerms((regConcat(erase(r), ctx)))
+  turnIntoTerms((seqFold(erase(r), ctx)))
     .toSet
 }
 
 def attachCtxcc(r: Rexp, ctx: List[Rexp]) : Set[Rexp] =
-  turnIntoTerms(regConcat(r, ctx)).toSet
+  turnIntoTerms(seqFold(r, ctx)).toSet
 
 def ABIncludedByC[A, B, C](a: A, b: B, c: C, f: (A, B) => C, 
 subseteqPred: (C, C) => Boolean) : Boolean = {
--- a/thys3/BlexerSimp.thy	Tue Jul 05 00:42:06 2022 +0100
+++ b/thys3/BlexerSimp.thy	Wed Jul 13 08:27:28 2022 +0100
@@ -77,23 +77,27 @@
 | "turnIntoTerms (SEQ r1 r2) = concat (map (\<lambda>r11. furtherSEQ r11 r2) (turnIntoTerms r1))"
 | "turnIntoTerms r = [r]"
 
-fun regConcat :: "rexp \<Rightarrow> rexp list \<Rightarrow> rexp" where
-  "regConcat acc [] = acc"
-| "regConcat ONE (r # rs1) = regConcat r rs1"
-| "regConcat acc (r # rs1) = regConcat (SEQ acc r) rs1"
+abbreviation "tint \<equiv> turnIntoTerms"
+
+fun seqFold :: "rexp \<Rightarrow> rexp list \<Rightarrow> rexp" where
+  "seqFold acc [] = acc"
+| "seqFold ONE (r # rs1) = seqFold r rs1"
+| "seqFold acc (r # rs1) = seqFold (SEQ acc r) rs1"
+
 
 fun attachCtx :: "arexp \<Rightarrow> rexp list \<Rightarrow> rexp set" where
-  "attachCtx r ctx = set (turnIntoTerms (regConcat (erase r) ctx))"
-
+  "attachCtx r ctx = set (turnIntoTerms (seqFold (erase r) ctx))"
 
 
 fun rs1_subseteq_rs2 :: "rexp set \<Rightarrow> rexp set \<Rightarrow> bool" where
   "rs1_subseteq_rs2 rs1 rs2 = (rs1 \<subseteq> rs2)"
 
+
 fun notZero :: "arexp \<Rightarrow> bool" where
   "notZero AZERO = True"
 | "notZero _ = False"
 
+
 fun tset :: "arexp list \<Rightarrow> rexp set" where
   "tset rs = set (concat (map turnIntoTerms (map erase rs)))"
 
@@ -101,10 +105,14 @@
 fun prune6 :: "rexp set \<Rightarrow> arexp \<Rightarrow> rexp list \<Rightarrow> arexp" where
   "prune6 acc a ctx = (if (ABIncludedByC a ctx acc attachCtx rs1_subseteq_rs2) then AZERO else 
                         (case a of (ASEQ bs r1 r2) \<Rightarrow> bsimp_ASEQ bs (prune6 acc r1 (erase r2 # ctx)) r2
-                                 | AALTs bs rs0 \<Rightarrow> bsimp_AALTs bs (filter notZero (map (\<lambda>r.(prune6 acc r ctx)) rs0)))  )"
+                                 | AALTs bs rs0 \<Rightarrow> bsimp_AALTs bs (filter notZero (map (\<lambda>r.(prune6 acc r ctx)) rs0))
+                                 | r \<Rightarrow> r
+                        )
+                      )
+            "
 
 abbreviation
-  "p acc r \<equiv> prune6 (set (concat (map turnIntoTerms (map erase acc)) ) ) r Nil"
+  "p6 acc r \<equiv> prune6 (tset acc) r Nil"
 
 
 fun dB6 :: "arexp list \<Rightarrow> rexp set \<Rightarrow> arexp list" where
@@ -154,8 +162,9 @@
 | ss4:  "(AZERO # rs) s\<leadsto> rs"
 | ss5:  "(AALTs bs1 rs1 # rsb) s\<leadsto> ((map (fuse bs1) rs1) @ rsb)"
 | ss6:  "L (erase a2) \<subseteq> L (erase a1) \<Longrightarrow> (rsa@[a1]@rsb@[a2]@rsc) s\<leadsto> (rsa@[a1]@rsb@rsc)"
-| ss7:  " (as @ [a] @ as1) s\<leadsto> (as @ [prune6 (set (concat (map (\<lambda>r. turnIntoTerms (erase r)) as))) a Nil] @ as1)"
+| ss7:  " (as @ [a] @ as1) s\<leadsto> (as @ [p6 as a] @ as1)"
 
+thm tset.simps
 
 inductive 
   rrewrites:: "arexp \<Rightarrow> arexp \<Rightarrow> bool" ("_ \<leadsto>* _" [100, 100] 100)
@@ -291,10 +300,10 @@
   apply(subgoal_tac "(cc @ a # cc') s\<leadsto>* (cc @ a # dB6 cc' (tset (cc @ [a])))")
    prefer 2
   apply (metis append.assoc append.left_neutral append_Cons)
-  apply(subgoal_tac "(cc @ a # dB6 cc' (tset (cc @ [a]))) s\<leadsto>* (cc @ (p cc a) # dB6 cc' (tset (cc @ [a])))") 
+  apply(subgoal_tac "(cc @ a # dB6 cc' (tset (cc @ [a]))) s\<leadsto>* (cc @ (p6 cc a) # dB6 cc' (tset (cc @ [a])))") 
   sorry
 
-
+(*
 lemma ss6_stronger_aux:
   shows "(rs1 @ rs2) s\<leadsto>* (rs1 @ dB6 rs2 (set (map erase rs1)))"
   apply(induct rs2 arbitrary: rs1)
@@ -313,22 +322,83 @@
   prefer 2
   apply(drule_tac x="rs1 @ [a]" in meta_spec)
   apply(simp)
-  apply(drule_tac x="rs1" in meta_spec)
-  apply(subgoal_tac "(rs1 @ a # rs2) s\<leadsto>* (rs1 @ rs2)")
-  using srewrites_trans apply blast
-  apply(subgoal_tac "\<exists>rs1a rs1b. rs1 = rs1a @ [x] @ rs1b")
-  prefer 2
-  apply (simp add: split_list)
-  apply(erule exE)+
-  apply(simp)
-  using eq1_L rs_in_rstar ss
   sorry
+*)
 
 
 lemma ss6_stronger:
   shows "rs1 s\<leadsto>* dB6 rs1 {}"
+  using ss6
+  by (metis append_Nil concat.simps(1) list.set(1) list.simps(8) ss6_realistic tset.simps)
+
+ 
+lemma tint_fuse:
+  shows "tint (erase a) = tint (erase (fuse bs a))"
+  by (simp add: erase_fuse)
+
+lemma tint_fuse2:
+  shows " set (tint (erase a)) =
+     set (tint (erase (fuse bs a)))"
+  using tint_fuse by auto
+
+lemma fused_bits_at_head:
+  shows "fuse bs a = ASEQ bs1 a1 a2 \<Longrightarrow> \<exists>bs2. bs1 = bs @ bs2"
+  
+(* by (smt (verit) arexp.distinct(13) arexp.distinct(19) arexp.distinct(25) arexp.distinct(27) arexp.distinct(5) arexp.inject(3) fuse.elims)
+harmless sorry
+*)
+
   sorry
 
+thm seqFold.simps
+
+lemma seqFold_concats:
+  shows "r \<noteq> ONE \<Longrightarrow> seqFold r [r1] = SEQ r r1"
+  apply(case_tac r)
+       apply simp+
+done
+
+
+lemma "set (tint (seqFold (erase x42) [erase x43])) = 
+           set (tint (SEQ (erase x42) (erase x43)))"
+  apply(case_tac "erase x42 = ONE")
+   apply simp
+  apply simp
+
+lemma prune6_preserves_fuse:
+  shows "fuse bs (p6 as a) = p6 as (fuse bs a)"
+  using tint_fuse2
+  apply simp
+  apply(case_tac a)
+       apply simp
+  apply simp
+     apply simp
+
+  using fused_bits_at_head
+
+    apply simp
+  apply(case_tac "erase x42 = ONE")
+  apply simp
+
+  sorry
+
+corollary prune6_preserves_fuse_srewrite:
+  shows "(as @ map (fuse bs) [a] @ as2) s\<leadsto>* (as @ map (fuse bs) [p6 as a] @ as2)"
+  apply(subgoal_tac "map (fuse bs) [a] = [fuse bs a]")
+  apply(subgoal_tac "(as @ [fuse bs a] @ as2) s\<leadsto>* (as @ [ (p6 as (fuse bs a))] @ as2)")
+  using prune6_preserves_fuse apply auto[1]
+  using rs_in_rstar ss7 apply presburger
+  by simp
+
+lemma prune6_invariant_fuse:
+  shows "p6 as a = p6 (map (fuse bs) as) a"
+  apply simp
+  using erase_fuse by presburger
+
+
+lemma p6pfs_cor:
+  shows "(map (fuse bs) as @ map (fuse bs) [a] @ map (fuse bs) as1) s\<leadsto>* (map (fuse bs) as @ map (fuse bs) [p6 as a] @ map (fuse bs) as1)"
+  by (metis prune6_invariant_fuse prune6_preserves_fuse_srewrite)
 
 lemma rewrite_preserves_fuse: 
   shows "r2 \<leadsto> r3 \<Longrightarrow> fuse bs r2 \<leadsto> fuse bs r3"
@@ -358,7 +428,13 @@
   then show ?case 
     apply(simp only: map_append)
     by (smt (verit, best) erase_fuse list.simps(8) list.simps(9) rrewrite_srewrite.ss6 srewrites.simps)
+next
+  case (ss7 as a as1)
+  then show ?case
+    apply(simp only: map_append)
+    using p6pfs_cor by presburger
 qed (auto intro: rrewrite_srewrite.intros)
+  
 
 
 lemma rewrites_fuse:  
@@ -424,11 +500,14 @@
 lemma bnullable0:
 shows "r1 \<leadsto> r2 \<Longrightarrow> bnullable r1 = bnullable r2" 
   and "rs1 s\<leadsto> rs2 \<Longrightarrow> bnullables rs1 = bnullables rs2" 
-  apply(induct rule: rrewrite_srewrite.inducts)
-  apply(auto simp add:  bnullable_fuse)
-   apply (meson UnCI bnullable_fuse imageI)
-  using bnullable_correctness nullable_correctness by blast
-
+   apply(induct rule: rrewrite_srewrite.inducts)
+              apply simp
+             apply simp
+              apply (simp add: bnullable_fuse)
+  using bnullable.simps(5) apply presburger
+          apply simp
+         apply simp
+  sorry   
 
 
 
@@ -438,7 +517,7 @@
 using assms 
   apply(induction r1 r2 rule: rrewrites.induct)
   apply simp
-  using bnullable0(1) by auto
+  using bnullable0(1) by presburger
 
 lemma rewrite_bmkeps_aux: 
   shows "r1 \<leadsto> r2 \<Longrightarrow> (bnullable r1 \<and> bnullable r2 \<Longrightarrow> bmkeps r1 = bmkeps r2)"