diff -r 0f00d440f484 -r 15d182ffbc76 ChengsongTanPhdThesis/Chapters/Finite.tex
--- a/ChengsongTanPhdThesis/Chapters/Finite.tex	Fri Jun 24 21:49:23 2022 +0100
+++ b/ChengsongTanPhdThesis/Chapters/Finite.tex	Sun Jun 26 22:22:47 2022 +0100
@@ -136,6 +136,7 @@
 \begin{lemma}
 $\rsize{\rerase a} = \asize a$
 \end{lemma}
+\noindent
 This does not hold for plain $\rexp$s. 
 
 Similarly we could define the derivative  and simplification on 
@@ -197,6 +198,9 @@
 \end{center}
 %TODO: definition of rsimp (maybe only the alternative clause)
 \noindent
+The prefix $r$ in front of $\rdistinct{}{}$ is used mainly to 
+differentiate with $\textit{distinct}$, which is a built-in predicate
+in Isabelle that says all the elements of a list are unique.
 With $\textit{rdistinct}$ one can chain together all the other modules
 of $\bsimp{\_}$ (removing the functionalities related to bit-sequences)
 and get $\textit{rsimp}$ and $\rderssimp{\_}{\_}$.
@@ -209,10 +213,10 @@
 	The following equalities hold:
 	\begin{itemize}
 		\item
-$\asize{\bsimp{a}} = \rsize{\rsimp{\rerase{a}}}$
-\item
-$\asize{\bderssimp{r}{s}} =  \rsize{\rderssimp{\rerase{r}}{s}}$
-\end{itemize}
+			$\asize{\bsimp{a}} = \rsize{\rsimp{\rerase{a}}}$
+		\item
+			$\asize{\bderssimp{r}{s}} =  \rsize{\rderssimp{\rerase{r}}{s}}$
+	\end{itemize}
 \end{lemma}
 \noindent
 In the following content, we will focus on $\rrexp$'s size bound.
@@ -297,9 +301,18 @@
 on the resulting list, the output will be as if we had called $\rdistinct{\_}{\_}$
 without the prepending of $rs$:
 \begin{lemma}
-	If $rs \subseteq rset$, then 
-	$\rdistinct{rs@rsa }{acc} = \rdistinct{rsa }{acc}$.
+	The elements appearing in the accumulator will always be removed.
+	More precisely,
+	\begin{itemize}
+		\item
+			If $rs \subseteq rset$, then 
+			$\rdistinct{rs@rsa }{acc} = \rdistinct{rsa }{acc}$.
+		\item
+			Furthermore, if $a \in rset$ and $\rdistinct{rs}{\{a\}} = []$,
+			then $\rdistinct{(rs @ rs')}{rset} = \rdistinct{rs'}{rset}$
+	\end{itemize}
 \end{lemma}
+
 \begin{proof}
 	By induction on $rs$.
 \end{proof}
@@ -326,15 +339,44 @@
 for any prefix of an input list, in other words, when can 
 we ``push out" the arguments of $\rdistinct{\_}{\_}$:
 \begin{lemma}
-	If $rs_1$'s elements are all unique, and not appearing in the accumulator $acc$, 
+	If $\textit{isDistinct} \; rs_1$, and $rs_1 \cap acc = \varnothing$,
 	then it can be taken out and left untouched in the output:
 	\[\textit{rdistinct}\;  (rs_1 @ rsa)\;\, acc
 	= rs_1@(\textit{rdistinct} rsa \; (acc \cup rs_1))\]
 \end{lemma}
-
+\noindent
+The predicate $\textit{isDistinct}$ is for testing
+whether a list's elements are all unique. It is defined
+recursively on the structure of a regular expression,
+and we omit the precise definition here.
 \begin{proof}
 By an induction on $rs_1$, where $rsa$ and $acc$ are allowed to be arbitrary.
 \end{proof}
+\noindent
+$\rdistinct{}{}$ removes any element in anywhere of a list, if it
+had appeared previously:
+\begin{lemma}\label{distinctRemovesMiddle}
+	The two properties hold if $r \in rs$:
+	\begin{itemize}
+		\item
+			$\rdistinct{rs}{rset} = \rdistinct{(rs @ [r])}{rset}$
+			and 
+			$\rdistinct{(ab :: rs @ [ab])}{rset'} = \rdistinct{(ab :: rs)}{rset'}$
+		\item
+			$\rdistinct{ (rs @ rs') }{rset} = \rdistinct{rs @ [r] @ rs'}{rset}$
+			and
+			$\rdistinct{(ab :: rs @ [ab] @ rs'')}{rset'} = 
+			 \rdistinct{(ab :: rs @ rs'')}{rset'}$
+	\end{itemize}
+\end{lemma}
+\noindent
+\begin{proof}
+By induction on $rs$. All other variables are allowed to be arbitrary.
+The second half of the lemma requires the first half.
+Note that for each half's two sub-propositions need to be proven concurrently,
+so that the induction goes through.
+\end{proof}
+
 \subsubsection{The Properties of $\backslash_r$, $\backslash_{rsimp}$, $\textit{Rflts}$ and $\textit{Rsimp}_{ALTS}$} 
 We give in this subsection some properties of how $\backslash_r$, $\backslash_{rsimp}$, $\textit{Rflts}$ and $\textit{Rsimp}_{ALTS} $ interact with each other and with $@$, the concatenation operator.
 These will be helpful in later closed form proofs, when
@@ -345,41 +387,290 @@
 When the function $\textit{Rflts}$ 
 is applied to the concatenation of two lists, the output can be calculated by first applying the
 functions on two lists separately, and then concatenating them together.
-\begin{lemma}
+\begin{lemma}\label{rfltsProps}
 	The function $\rflts$ has the below properties:\\
 	\begin{itemize}
 		\item
-	$\rflts \; (rs_1 @ rs_2) = \rflts \; rs_1 @ \rflts \; rs_2$
-\item
-	If $r \neq \RZERO$ and $\nexists rs_1. r = \RALTS{rs}_1$, then $\rflts \; (r::rs) = r :: \rflts \; rs$
+			$\rflts \; (rs_1 @ rs_2) = \rflts \; rs_1 @ \rflts \; rs_2$
+		\item
+			If $r \neq \RZERO$ and $\nexists rs_1. r = \RALTS{rs}_1$, then $\rflts \; (r::rs) = r :: \rflts \; rs$
+		\item
+			$\rflts \; (rs @ [\RZERO]) = \rflts \; rs$
+		\item
+			$\rflts \; (rs' @ [\RALTS{rs}]) = \rflts \; rs'@rs$
+		\item
+			$\rflts \; (rs @ [\RONE]) = \rflts \; rs @ [\RONE]$
+		\item
+			If $r \neq \RZERO$ and $\nexists rs'. r = \RALTS{rs'}$ then $\rflts \; (rs @ [r])
+			= (\rflts \; rs) @ [r]$
 	\end{itemize}
 \end{lemma}
 \noindent
 \begin{proof}
-	By induction on $rs_1$.
+	By induction on $rs_1$ in the first part, and induction on $r$ in the second part,
+	and induction on $rs$, $rs'$ in the third and fourth sub-lemma.
 \end{proof}
-
+\subsubsection{The $RL$ Function: Language Interpretation of $\textit{Rrexp}$s}
+Much like the definition of $L$ on plain regular expressions, one could also 
+define the language interpretation of $\rrexp$s.
+\begin{center}
+\begin{tabular}{lcl}
+$RL \; (\ZERO)$ & $\dn$ & $\phi$\\
+$RL \; (\ONE)$ & $\dn$ & $\{[]\}$\\
+$RL \; (c)$ & $\dn$ & $\{[c]\}$\\
+$RL \; \sum rs$ & $\dn$ & $ \bigcup_{r \in rs} (RL \; r)$\\
+$RL \; (r_1 \cdot r_2)$ & $\dn$ & $ RL \; (r_1) @ RL \; (r_2)$\\
+$RL \; (r^*)$ & $\dn$ & $ (RL(r))^*$
+\end{tabular}
+\end{center}
+\noindent
+The main use of $RL$ is to establish some connections between $\rsimp{}$ 
+and $\rnullable{}$:
+\begin{lemma}
+	The following properties hold:
+	\begin{itemize}
+		\item
+			If $\rnullable{r}$, then $\rsimp{r} \neq \RZERO$.
+		\item
+			$\rnullable{r \backslash s} \quad $ if and only if $\quad \rnullable{\rderssimp{r}{s}}$.
+	\end{itemize}
+\end{lemma}
+\begin{proof}
+	The first part is by induction on $r$. 
+	The second part is true because property 
+	\[ RL \; r = RL \; (\rsimp{r})\] holds.
+\end{proof}
+	
+\subsubsection{$\rsimp{}$ is Non-Increasing}
+In this subsection, we prove that the function $\rsimp{}$ does not 
+make the $\llbracket \rrbracket_r$ size increase.
 
 
-\subsection{A Closed Form for the Sequence Regular Expression}
-\begin{quote}\it
-	Claim: For regular expressions $r_1 \cdot r_2$, we claim that
-	\begin{center}
-		$ \rderssimp{r_1 \cdot r_2}{s} = 
-	\rsimp{(\sum (r_1 \backslash s \cdot r_2 ) \; :: \;(\map \; \rderssimp{r2}{\_} \;(\vsuf{s}{r_1})))}$
-	\end{center}
-\end{quote}
+\begin{lemma}\label{rsimpSize}
+	$\llbracket \rsimp{r} \rrbracket_r \leq \llbracket r \rrbracket_r$
+\end{lemma}
+\subsubsection{Simplified $\textit{Rrexp}$s are Good}
+We formalise the notion of ``good" regular expressions,
+which means regular expressions that
+are not fully simplified. For alternative regular expressions that means they
+do not contain any nested alternatives like 
+\[ r_1 + (r_2 + r_3) \], un-removed $\RZERO$s like \[\RZERO + r\]
+or duplicate elements in a children regular expression list like \[ \sum [r, r, \ldots]\]:
+\begin{center}
+	\begin{tabular}{@{}lcl@{}}
+		$\good\; \RZERO$ & $\dn$ & $\textit{false}$\\
+		$\good\; \RONE$ & $\dn$ & $\textit{true}$\\
+		$\good\; \RCHAR{c}$ & $\dn$ & $\btrue$\\
+		$\good\; \RALTS{[]}$ & $\dn$ & $\bfalse$\\
+		$\good\; \RALTS{[r]}$ & $\dn$ & $\bfalse$\\
+		$\good\; \RALTS{r_1 :: r_2 :: rs}$ & $\dn$ & 
+		$\textit{isDistinct} \; (r_1 :: r_2 :: rs) \;$\\
+		& & $\textit{and}\; (\forall r' \in (r_1 :: r_2 :: rs).\; \good \; r'\; \,  \textit{and}\; \, \textit{nonAlt}\; r')$\\
+		$\good \; \RSEQ{\RZERO}{r}$ & $\dn$ & $\bfalse$\\
+		$\good \; \RSEQ{\RONE}{r}$ & $\dn$ & $\bfalse$\\
+		$\good \; \RSEQ{r}{\RZERO}$ & $\dn$ & $\bfalse$\\
+		$\good \; \RSEQ{r_1}{r_2}$ & $\dn$ & $\good \; r_1 \;\, \textit{and} \;\, \good \; r_2$\\
+		$\good \; \RSTAR{r}$ & $\dn$ & $\btrue$\\
+	\end{tabular}
+\end{center}
+\noindent
+The predicate $\textit{nonAlt}$ evaluates to true when the regular expression is not an
+alternative, and false otherwise.
+The $\good$ property is preserved under $\rsimp_{ALTS}$, provided that
+its non-empty argument list of expressions are all good themsleves, and $\textit{nonAlt}$, 
+and unique:
+\begin{lemma}\label{rsimpaltsGood}
+	If $rs \neq []$ and forall $r \in rs. \textit{nonAlt} \; r$ and $\textit{isDistinct} \; rs$,
+	then $\good \; (\rsimpalts \; rs)$ if and only if forall $r \in rs. \; \good \; r$.
+\end{lemma}
+\noindent
+We also note that
+if a regular expression $r$ is good, then $\rflts$ on the singleton
+list $[r]$ will not break goodness:
+\begin{lemma}\label{flts2}
+	If $\good \; r$, then forall $r' \in \rflts \; [r]. \; \good \; r'$ and $\textit{nonAlt} \; r'$.
+\end{lemma}
+\begin{proof}
+	By an induction on $r$.
+\end{proof}
 \noindent
+The other observation we make about $\rsimp{r}$ is that it never
+comes with nested alternatives, which we describe as the $\nonnested$
+property:
+\begin{center}
+	\begin{tabular}{lcl}
+		$\nonnested \; \, \sum []$ & $\dn$ & $\btrue$\\
+		$\nonnested \; \, \sum ((\sum rs_1) :: rs_2)$ & $\dn$ & $\bfalse$\\
+		$\nonnested \; \, \sum (r :: rs)$ & $\dn$ & $\nonnested (\sum rs)$\\
+		$\nonnested \; \, r $ & $\dn$ & $\btrue$
+	\end{tabular}
+\end{center}
+\noindent
+The $\rflts$ function
+always opens up nested alternatives,
+which enables $\rsimp$ to be non-nested:
+
+\begin{lemma}\label{nonnestedRsimp}
+	$\nonnested \; (\rsimp{r})$
+\end{lemma}
+\begin{proof}
+	By an induction on $r$.
+\end{proof}
+\noindent
+With this we could prove that a regular expressions
+after simplification and flattening and de-duplication,
+will not contain any alternative regular expression directly:
+\begin{lemma}\label{nonaltFltsRd}
+	If $x \in \rdistinct{\rflts\; (\map \; \rsimp{} \; rs)}{\varnothing}$ 
+	then $\textit{nonAlt} \; x$.
+\end{lemma}
+\begin{proof}
+	By \ref{nonnestedRsimp}.
+\end{proof}
+\noindent
+The other thing we know is that once $\rsimp{}$ had finished
+processing an alternative regular expression, it will not
+contain any $\RZERO$s, this is because all the recursive 
+calls to the simplification on the children regular expressions
+make the children good, and $\rflts$ will not take out
+any $\RZERO$s out of a good regular expression list,
+and $\rdistinct{}$ will not mess with the result.
+\begin{lemma}\label{flts3Obv}
+	The following are true:
+	\begin{itemize}
+		\item
+			If for all $r \in rs. \, \good \; r $ or $r = \RZERO$,
+			then for all $r \in \rflts\; rs. \, \good \; r$.
+		\item
+			If $x \in \rdistinct{\rflts\; (\map \; rsimp{}\; rs)}{\varnothing}$
+			and for all $y$ such that $\llbracket y \rrbracket_r$ less than
+			$\llbracket rs \rrbracket_r + 1$, either
+			$\good \; (\rsimp{y})$ or $\rsimp{y} = \RZERO$,
+			then $\good \; x$.
+	\end{itemize}
+\end{lemma}
+\begin{proof}
+	The first part is by induction on $rs$, where the induction
+	rule is the inductive cases for $\rflts$.
+	The second part is a corollary from the first part.
+\end{proof}
 
-Before we get to the proof that says the intermediate result of our lexer will
-remain finitely bounded, which is an important efficiency/liveness guarantee,
-we shall first develop a few preparatory properties and definitions to 
-make the process of proving that a breeze.
+And this leads to good structural property of $\rsimp{}$,
+that after simplification, a regular expression is
+either good or $\RZERO$:
+\begin{lemma}\label{good1}
+	For any r-regular expression $r$, $\good \; \rsimp{r}$ or $\rsimp{r} = \RZERO$.
+\end{lemma}
+\begin{proof}
+	By an induction on $r$. The inductive measure is the size $\llbracket \rrbracket_r$.
+	Lemma \ref{rsimpSize} says that 
+	$\llbracket \rsimp{r}\rrbracket_r$ is smaller than or equal to
+	$\llbracket r \rrbracket_r$.
+	Therefore, in the $r_1 \cdot r_2$ and $\sum rs$ case,
+	Inductive hypothesis applies to the children regular expressions
+	$r_1$, $r_2$, etc. The lemma \ref{flts3Obv}'s precondition is satisfied
+	by that as well.
+	The lemmas \ref{nonnestedRsimp} and  \ref{nonaltFltsRd} are used
+	to ensure that goodness is preserved at the topmost level.
+\end{proof}
+We shall prove that any good regular expression is 
+a fixed-point for $\rsimp{}$.
+First we prove an auxiliary lemma:
+\begin{lemma}\label{goodaltsNonalt}
+	If $\good \; \sum rs$, then $\rflts\; rs = rs$.
+\end{lemma}
+\begin{proof}
+	By an induction on $\sum rs$. The inductive rules are the cases
+	for $\good$.
+\end{proof}
+\noindent
+Now we are ready to prove that good regular expressions are invariant
+of $\rsimp{}$ application:
+\begin{lemma}\label{test}
+	If $\good \;r$ then $\rsimp{r} = r$.
+\end{lemma}
+\begin{proof}
+	By an induction on the inductive cases of $\good$.
+	The lemma \ref{goodaltsNonalt} is used in the alternative
+	case where 2 or more elements are present in the list.
+\end{proof}
+
+\subsubsection{$\rsimp$ is Idempotent}
+The idempotency of $\rsimp$ is very useful in 
+manipulating regular expression terms into desired
+forms so that key steps allowing further rewriting to closed forms
+are possible.
+\begin{lemma}\label{rsimpIdem}
+$\rsimp{r} = \rsimp{\rsimp{r}}$
+\end{lemma}
+
+\begin{proof}
+	By \ref{test} and \ref{good1}.
+\end{proof}
+\noindent
+This property means we do not have to repeatedly
+apply simplification in each step, which justifies
+our definition of $\blexersimp$.
+
 
-We define rewriting relations for $\rrexp$s, which allows us to do the 
-same trick as we did for the correctness proof,
-but this time we will have stronger equalities established.
-\subsection{"hrewrite" relation}
+On the other hand, we could repeat the same $\rsimp{}$ applications
+on regular expressions as many times as we want, if we have at least
+one simplification applied to it, and apply it wherever we would like to:
+\begin{corollary}\label{headOneMoreSimp}
+	$\map \; \rsimp{(r :: rs)} = \map \; \rsimp{} \; (\rsimp{r} :: rs)$
+\end{corollary}
+\noindent
+This will be useful in later closed form proof's rewriting steps.
+Similarly, we point out the following useful facts below:
+\begin{lemma}
+	The following equalities hold if $r = \rsimp{r'}$ for some $r'$:
+	\begin{itemize}
+		\item
+			If $r  = \sum rs$ then $\rsimpalts \; rs = \sum rs$.
+		\item
+			If $r = \sum rs$ then $\rdistinct{rs}{\varnothing} = rs$.
+		\item
+			$\rsimpalts \; (\rdistinct{\rflts \; [r]}{\varnothing}) = r$.
+	\end{itemize}
+\end{lemma}
+\begin{proof}
+	By application of \ref{rsimpIdem} and \ref{good1}.
+\end{proof}
+
+\noindent
+With the idempotency of $\rsimp{}$ and its corollaries, 
+we can start proving some key equalities leading to the 
+closed forms.
+Now presented are a few equivalent terms under $\rsimp{}$.
+We use $r_1 \sequal r_2 $ here to denote $\rsimp{r_1} = \rsimp{r_2}$.
+\begin{lemma}
+\begin{itemize}
+	\item
+		$\rsimpalts \; (\RZERO :: rs) \sequal \rsimpalts\; rs$
+	\item
+		$\rsimpalts \; rs \sequal \rsimpalts (\map \; \rsimp{} \; rs)$
+	\item
+		$\RALTS{\RALTS{rs}} \sequal \RALTS{rs}$
+\end{itemize}
+\end{lemma}
+\noindent
+We need more equalities like the above to enable a closed form,
+but to proceed we need to introduce two rewrite relations,
+to make things smoother.
+\subsubsection{The rewrite relation $\hrewrite$ and $\grewrite$}
+Insired by the success we had in the correctness proof 
+in \ref{Bitcoded2}, where we invented
+a term rewriting system to capture the similarity between terms
+and prove equivalence, we follow suit here defining simplification
+steps as rewriting steps.
+The presentation will be more concise than that in \ref{Bitcoded2}.
+To differentiate between the rewriting steps for annotated regular expressions
+and $\rrexp$s, we add characters $h$ and $g$ below the squig arrow symbol
+to mean atomic simplification transitions 
+of $\rrexp$s and $\rrexp$ lists, respectively.
+
+\begin{center}
+
 List of 1-step rewrite rules for regular expressions simplification without bitcodes:
 \begin{figure}
 \caption{the "h-rewrite" rules}
@@ -419,6 +710,25 @@
 
 Fortunately this is provable by induction on the list $rs$
 
+\subsection{A Closed Form for the Sequence Regular Expression}
+\begin{quote}\it
+	Claim: For regular expressions $r_1 \cdot r_2$, we claim that
+	\begin{center}
+		$ \rderssimp{r_1 \cdot r_2}{s} = 
+	\rsimp{(\sum (r_1 \backslash s \cdot r_2 ) \; :: \;(\map \; \rderssimp{r2}{\_} \;(\vsuf{s}{r_1})))}$
+	\end{center}
+\end{quote}
+\noindent
+
+Before we get to the proof that says the intermediate result of our lexer will
+remain finitely bounded, which is an important efficiency/liveness guarantee,
+we shall first develop a few preparatory properties and definitions to 
+make the process of proving that a breeze.
+
+We define rewriting relations for $\rrexp$s, which allows us to do the 
+same trick as we did for the correctness proof,
+but this time we will have stronger equalities established.
+
 
 \section{Estimating the Closed Forms' sizes}
 
@@ -787,11 +1097,8 @@
 \begin{lemma}
 $\rsimp{\sflat{(r_1 \cdot r_2) \backslash s} }= \rsimp{\AALTS{[[]}{ (r_1 \backslash s) \cdot r_2 :: (\map (r_2 \backslash \_) (\vsuf{s}{r_1}))}}$
 \end{lemma}
-Together with the idempotency property of $\rsimp$,
+Together with the idempotency property of $\rsimp{}$\ref{rsimpIdem},
 %TODO: state the idempotency property of rsimp
-\begin{lemma}
-$\rsimp{r} = \rsimp{\rsimp{r}}$
-\end{lemma}
 it is possible to convert the above lemma to obtain a "closed form"
 for  derivatives nested with simplification:
 \begin{lemma}
@@ -1088,17 +1395,6 @@
 %	SECTION 1
 %----------------------------------------------------------------------------------------
 
-\section{Idempotency of $\simp$}
-
-\begin{equation}
-	\simp \;r = \simp\; \simp \; r 
-\end{equation}
-This property means we do not have to repeatedly
-apply simplification in each step, which justifies
-our definition of $\blexersimp$.
-It will also be useful in future proofs where properties such as 
-closed forms are needed.
-The proof is by structural induction on $r$.
 
 %-----------------------------------
 %	SUBSECTION 1