--- a/ChengsongTanPhdThesis/Chapters/Finite.tex Fri Jun 24 21:49:23 2022 +0100
+++ b/ChengsongTanPhdThesis/Chapters/Finite.tex Sun Jun 26 22:22:47 2022 +0100
@@ -136,6 +136,7 @@
\begin{lemma}
$\rsize{\rerase a} = \asize a$
\end{lemma}
+\noindent
This does not hold for plain $\rexp$s.
Similarly we could define the derivative and simplification on
@@ -197,6 +198,9 @@
\end{center}
%TODO: definition of rsimp (maybe only the alternative clause)
\noindent
+The prefix $r$ in front of $\rdistinct{}{}$ is used mainly to
+differentiate with $\textit{distinct}$, which is a built-in predicate
+in Isabelle that says all the elements of a list are unique.
With $\textit{rdistinct}$ one can chain together all the other modules
of $\bsimp{\_}$ (removing the functionalities related to bit-sequences)
and get $\textit{rsimp}$ and $\rderssimp{\_}{\_}$.
@@ -209,10 +213,10 @@
The following equalities hold:
\begin{itemize}
\item
-$\asize{\bsimp{a}} = \rsize{\rsimp{\rerase{a}}}$
-\item
-$\asize{\bderssimp{r}{s}} = \rsize{\rderssimp{\rerase{r}}{s}}$
-\end{itemize}
+ $\asize{\bsimp{a}} = \rsize{\rsimp{\rerase{a}}}$
+ \item
+ $\asize{\bderssimp{r}{s}} = \rsize{\rderssimp{\rerase{r}}{s}}$
+ \end{itemize}
\end{lemma}
\noindent
In the following content, we will focus on $\rrexp$'s size bound.
@@ -297,9 +301,18 @@
on the resulting list, the output will be as if we had called $\rdistinct{\_}{\_}$
without the prepending of $rs$:
\begin{lemma}
- If $rs \subseteq rset$, then
- $\rdistinct{rs@rsa }{acc} = \rdistinct{rsa }{acc}$.
+ The elements appearing in the accumulator will always be removed.
+ More precisely,
+ \begin{itemize}
+ \item
+ If $rs \subseteq rset$, then
+ $\rdistinct{rs@rsa }{acc} = \rdistinct{rsa }{acc}$.
+ \item
+ Furthermore, if $a \in rset$ and $\rdistinct{rs}{\{a\}} = []$,
+ then $\rdistinct{(rs @ rs')}{rset} = \rdistinct{rs'}{rset}$
+ \end{itemize}
\end{lemma}
+
\begin{proof}
By induction on $rs$.
\end{proof}
@@ -326,15 +339,44 @@
for any prefix of an input list, in other words, when can
we ``push out" the arguments of $\rdistinct{\_}{\_}$:
\begin{lemma}
- If $rs_1$'s elements are all unique, and not appearing in the accumulator $acc$,
+ If $\textit{isDistinct} \; rs_1$, and $rs_1 \cap acc = \varnothing$,
then it can be taken out and left untouched in the output:
\[\textit{rdistinct}\; (rs_1 @ rsa)\;\, acc
= rs_1@(\textit{rdistinct} rsa \; (acc \cup rs_1))\]
\end{lemma}
-
+\noindent
+The predicate $\textit{isDistinct}$ is for testing
+whether a list's elements are all unique. It is defined
+recursively on the structure of a regular expression,
+and we omit the precise definition here.
\begin{proof}
By an induction on $rs_1$, where $rsa$ and $acc$ are allowed to be arbitrary.
\end{proof}
+\noindent
+$\rdistinct{}{}$ removes any element in anywhere of a list, if it
+had appeared previously:
+\begin{lemma}\label{distinctRemovesMiddle}
+ The two properties hold if $r \in rs$:
+ \begin{itemize}
+ \item
+ $\rdistinct{rs}{rset} = \rdistinct{(rs @ [r])}{rset}$
+ and
+ $\rdistinct{(ab :: rs @ [ab])}{rset'} = \rdistinct{(ab :: rs)}{rset'}$
+ \item
+ $\rdistinct{ (rs @ rs') }{rset} = \rdistinct{rs @ [r] @ rs'}{rset}$
+ and
+ $\rdistinct{(ab :: rs @ [ab] @ rs'')}{rset'} =
+ \rdistinct{(ab :: rs @ rs'')}{rset'}$
+ \end{itemize}
+\end{lemma}
+\noindent
+\begin{proof}
+By induction on $rs$. All other variables are allowed to be arbitrary.
+The second half of the lemma requires the first half.
+Note that for each half's two sub-propositions need to be proven concurrently,
+so that the induction goes through.
+\end{proof}
+
\subsubsection{The Properties of $\backslash_r$, $\backslash_{rsimp}$, $\textit{Rflts}$ and $\textit{Rsimp}_{ALTS}$}
We give in this subsection some properties of how $\backslash_r$, $\backslash_{rsimp}$, $\textit{Rflts}$ and $\textit{Rsimp}_{ALTS} $ interact with each other and with $@$, the concatenation operator.
These will be helpful in later closed form proofs, when
@@ -345,41 +387,290 @@
When the function $\textit{Rflts}$
is applied to the concatenation of two lists, the output can be calculated by first applying the
functions on two lists separately, and then concatenating them together.
-\begin{lemma}
+\begin{lemma}\label{rfltsProps}
The function $\rflts$ has the below properties:\\
\begin{itemize}
\item
- $\rflts \; (rs_1 @ rs_2) = \rflts \; rs_1 @ \rflts \; rs_2$
-\item
- If $r \neq \RZERO$ and $\nexists rs_1. r = \RALTS{rs}_1$, then $\rflts \; (r::rs) = r :: \rflts \; rs$
+ $\rflts \; (rs_1 @ rs_2) = \rflts \; rs_1 @ \rflts \; rs_2$
+ \item
+ If $r \neq \RZERO$ and $\nexists rs_1. r = \RALTS{rs}_1$, then $\rflts \; (r::rs) = r :: \rflts \; rs$
+ \item
+ $\rflts \; (rs @ [\RZERO]) = \rflts \; rs$
+ \item
+ $\rflts \; (rs' @ [\RALTS{rs}]) = \rflts \; rs'@rs$
+ \item
+ $\rflts \; (rs @ [\RONE]) = \rflts \; rs @ [\RONE]$
+ \item
+ If $r \neq \RZERO$ and $\nexists rs'. r = \RALTS{rs'}$ then $\rflts \; (rs @ [r])
+ = (\rflts \; rs) @ [r]$
\end{itemize}
\end{lemma}
\noindent
\begin{proof}
- By induction on $rs_1$.
+ By induction on $rs_1$ in the first part, and induction on $r$ in the second part,
+ and induction on $rs$, $rs'$ in the third and fourth sub-lemma.
\end{proof}
-
+\subsubsection{The $RL$ Function: Language Interpretation of $\textit{Rrexp}$s}
+Much like the definition of $L$ on plain regular expressions, one could also
+define the language interpretation of $\rrexp$s.
+\begin{center}
+\begin{tabular}{lcl}
+$RL \; (\ZERO)$ & $\dn$ & $\phi$\\
+$RL \; (\ONE)$ & $\dn$ & $\{[]\}$\\
+$RL \; (c)$ & $\dn$ & $\{[c]\}$\\
+$RL \; \sum rs$ & $\dn$ & $ \bigcup_{r \in rs} (RL \; r)$\\
+$RL \; (r_1 \cdot r_2)$ & $\dn$ & $ RL \; (r_1) @ RL \; (r_2)$\\
+$RL \; (r^*)$ & $\dn$ & $ (RL(r))^*$
+\end{tabular}
+\end{center}
+\noindent
+The main use of $RL$ is to establish some connections between $\rsimp{}$
+and $\rnullable{}$:
+\begin{lemma}
+ The following properties hold:
+ \begin{itemize}
+ \item
+ If $\rnullable{r}$, then $\rsimp{r} \neq \RZERO$.
+ \item
+ $\rnullable{r \backslash s} \quad $ if and only if $\quad \rnullable{\rderssimp{r}{s}}$.
+ \end{itemize}
+\end{lemma}
+\begin{proof}
+ The first part is by induction on $r$.
+ The second part is true because property
+ \[ RL \; r = RL \; (\rsimp{r})\] holds.
+\end{proof}
+
+\subsubsection{$\rsimp{}$ is Non-Increasing}
+In this subsection, we prove that the function $\rsimp{}$ does not
+make the $\llbracket \rrbracket_r$ size increase.
-\subsection{A Closed Form for the Sequence Regular Expression}
-\begin{quote}\it
- Claim: For regular expressions $r_1 \cdot r_2$, we claim that
- \begin{center}
- $ \rderssimp{r_1 \cdot r_2}{s} =
- \rsimp{(\sum (r_1 \backslash s \cdot r_2 ) \; :: \;(\map \; \rderssimp{r2}{\_} \;(\vsuf{s}{r_1})))}$
- \end{center}
-\end{quote}
+\begin{lemma}\label{rsimpSize}
+ $\llbracket \rsimp{r} \rrbracket_r \leq \llbracket r \rrbracket_r$
+\end{lemma}
+\subsubsection{Simplified $\textit{Rrexp}$s are Good}
+We formalise the notion of ``good" regular expressions,
+which means regular expressions that
+are not fully simplified. For alternative regular expressions that means they
+do not contain any nested alternatives like
+\[ r_1 + (r_2 + r_3) \], un-removed $\RZERO$s like \[\RZERO + r\]
+or duplicate elements in a children regular expression list like \[ \sum [r, r, \ldots]\]:
+\begin{center}
+ \begin{tabular}{@{}lcl@{}}
+ $\good\; \RZERO$ & $\dn$ & $\textit{false}$\\
+ $\good\; \RONE$ & $\dn$ & $\textit{true}$\\
+ $\good\; \RCHAR{c}$ & $\dn$ & $\btrue$\\
+ $\good\; \RALTS{[]}$ & $\dn$ & $\bfalse$\\
+ $\good\; \RALTS{[r]}$ & $\dn$ & $\bfalse$\\
+ $\good\; \RALTS{r_1 :: r_2 :: rs}$ & $\dn$ &
+ $\textit{isDistinct} \; (r_1 :: r_2 :: rs) \;$\\
+ & & $\textit{and}\; (\forall r' \in (r_1 :: r_2 :: rs).\; \good \; r'\; \, \textit{and}\; \, \textit{nonAlt}\; r')$\\
+ $\good \; \RSEQ{\RZERO}{r}$ & $\dn$ & $\bfalse$\\
+ $\good \; \RSEQ{\RONE}{r}$ & $\dn$ & $\bfalse$\\
+ $\good \; \RSEQ{r}{\RZERO}$ & $\dn$ & $\bfalse$\\
+ $\good \; \RSEQ{r_1}{r_2}$ & $\dn$ & $\good \; r_1 \;\, \textit{and} \;\, \good \; r_2$\\
+ $\good \; \RSTAR{r}$ & $\dn$ & $\btrue$\\
+ \end{tabular}
+\end{center}
+\noindent
+The predicate $\textit{nonAlt}$ evaluates to true when the regular expression is not an
+alternative, and false otherwise.
+The $\good$ property is preserved under $\rsimp_{ALTS}$, provided that
+its non-empty argument list of expressions are all good themsleves, and $\textit{nonAlt}$,
+and unique:
+\begin{lemma}\label{rsimpaltsGood}
+ If $rs \neq []$ and forall $r \in rs. \textit{nonAlt} \; r$ and $\textit{isDistinct} \; rs$,
+ then $\good \; (\rsimpalts \; rs)$ if and only if forall $r \in rs. \; \good \; r$.
+\end{lemma}
+\noindent
+We also note that
+if a regular expression $r$ is good, then $\rflts$ on the singleton
+list $[r]$ will not break goodness:
+\begin{lemma}\label{flts2}
+ If $\good \; r$, then forall $r' \in \rflts \; [r]. \; \good \; r'$ and $\textit{nonAlt} \; r'$.
+\end{lemma}
+\begin{proof}
+ By an induction on $r$.
+\end{proof}
\noindent
+The other observation we make about $\rsimp{r}$ is that it never
+comes with nested alternatives, which we describe as the $\nonnested$
+property:
+\begin{center}
+ \begin{tabular}{lcl}
+ $\nonnested \; \, \sum []$ & $\dn$ & $\btrue$\\
+ $\nonnested \; \, \sum ((\sum rs_1) :: rs_2)$ & $\dn$ & $\bfalse$\\
+ $\nonnested \; \, \sum (r :: rs)$ & $\dn$ & $\nonnested (\sum rs)$\\
+ $\nonnested \; \, r $ & $\dn$ & $\btrue$
+ \end{tabular}
+\end{center}
+\noindent
+The $\rflts$ function
+always opens up nested alternatives,
+which enables $\rsimp$ to be non-nested:
+
+\begin{lemma}\label{nonnestedRsimp}
+ $\nonnested \; (\rsimp{r})$
+\end{lemma}
+\begin{proof}
+ By an induction on $r$.
+\end{proof}
+\noindent
+With this we could prove that a regular expressions
+after simplification and flattening and de-duplication,
+will not contain any alternative regular expression directly:
+\begin{lemma}\label{nonaltFltsRd}
+ If $x \in \rdistinct{\rflts\; (\map \; \rsimp{} \; rs)}{\varnothing}$
+ then $\textit{nonAlt} \; x$.
+\end{lemma}
+\begin{proof}
+ By \ref{nonnestedRsimp}.
+\end{proof}
+\noindent
+The other thing we know is that once $\rsimp{}$ had finished
+processing an alternative regular expression, it will not
+contain any $\RZERO$s, this is because all the recursive
+calls to the simplification on the children regular expressions
+make the children good, and $\rflts$ will not take out
+any $\RZERO$s out of a good regular expression list,
+and $\rdistinct{}$ will not mess with the result.
+\begin{lemma}\label{flts3Obv}
+ The following are true:
+ \begin{itemize}
+ \item
+ If for all $r \in rs. \, \good \; r $ or $r = \RZERO$,
+ then for all $r \in \rflts\; rs. \, \good \; r$.
+ \item
+ If $x \in \rdistinct{\rflts\; (\map \; rsimp{}\; rs)}{\varnothing}$
+ and for all $y$ such that $\llbracket y \rrbracket_r$ less than
+ $\llbracket rs \rrbracket_r + 1$, either
+ $\good \; (\rsimp{y})$ or $\rsimp{y} = \RZERO$,
+ then $\good \; x$.
+ \end{itemize}
+\end{lemma}
+\begin{proof}
+ The first part is by induction on $rs$, where the induction
+ rule is the inductive cases for $\rflts$.
+ The second part is a corollary from the first part.
+\end{proof}
-Before we get to the proof that says the intermediate result of our lexer will
-remain finitely bounded, which is an important efficiency/liveness guarantee,
-we shall first develop a few preparatory properties and definitions to
-make the process of proving that a breeze.
+And this leads to good structural property of $\rsimp{}$,
+that after simplification, a regular expression is
+either good or $\RZERO$:
+\begin{lemma}\label{good1}
+ For any r-regular expression $r$, $\good \; \rsimp{r}$ or $\rsimp{r} = \RZERO$.
+\end{lemma}
+\begin{proof}
+ By an induction on $r$. The inductive measure is the size $\llbracket \rrbracket_r$.
+ Lemma \ref{rsimpSize} says that
+ $\llbracket \rsimp{r}\rrbracket_r$ is smaller than or equal to
+ $\llbracket r \rrbracket_r$.
+ Therefore, in the $r_1 \cdot r_2$ and $\sum rs$ case,
+ Inductive hypothesis applies to the children regular expressions
+ $r_1$, $r_2$, etc. The lemma \ref{flts3Obv}'s precondition is satisfied
+ by that as well.
+ The lemmas \ref{nonnestedRsimp} and \ref{nonaltFltsRd} are used
+ to ensure that goodness is preserved at the topmost level.
+\end{proof}
+We shall prove that any good regular expression is
+a fixed-point for $\rsimp{}$.
+First we prove an auxiliary lemma:
+\begin{lemma}\label{goodaltsNonalt}
+ If $\good \; \sum rs$, then $\rflts\; rs = rs$.
+\end{lemma}
+\begin{proof}
+ By an induction on $\sum rs$. The inductive rules are the cases
+ for $\good$.
+\end{proof}
+\noindent
+Now we are ready to prove that good regular expressions are invariant
+of $\rsimp{}$ application:
+\begin{lemma}\label{test}
+ If $\good \;r$ then $\rsimp{r} = r$.
+\end{lemma}
+\begin{proof}
+ By an induction on the inductive cases of $\good$.
+ The lemma \ref{goodaltsNonalt} is used in the alternative
+ case where 2 or more elements are present in the list.
+\end{proof}
+
+\subsubsection{$\rsimp$ is Idempotent}
+The idempotency of $\rsimp$ is very useful in
+manipulating regular expression terms into desired
+forms so that key steps allowing further rewriting to closed forms
+are possible.
+\begin{lemma}\label{rsimpIdem}
+$\rsimp{r} = \rsimp{\rsimp{r}}$
+\end{lemma}
+
+\begin{proof}
+ By \ref{test} and \ref{good1}.
+\end{proof}
+\noindent
+This property means we do not have to repeatedly
+apply simplification in each step, which justifies
+our definition of $\blexersimp$.
+
-We define rewriting relations for $\rrexp$s, which allows us to do the
-same trick as we did for the correctness proof,
-but this time we will have stronger equalities established.
-\subsection{"hrewrite" relation}
+On the other hand, we could repeat the same $\rsimp{}$ applications
+on regular expressions as many times as we want, if we have at least
+one simplification applied to it, and apply it wherever we would like to:
+\begin{corollary}\label{headOneMoreSimp}
+ $\map \; \rsimp{(r :: rs)} = \map \; \rsimp{} \; (\rsimp{r} :: rs)$
+\end{corollary}
+\noindent
+This will be useful in later closed form proof's rewriting steps.
+Similarly, we point out the following useful facts below:
+\begin{lemma}
+ The following equalities hold if $r = \rsimp{r'}$ for some $r'$:
+ \begin{itemize}
+ \item
+ If $r = \sum rs$ then $\rsimpalts \; rs = \sum rs$.
+ \item
+ If $r = \sum rs$ then $\rdistinct{rs}{\varnothing} = rs$.
+ \item
+ $\rsimpalts \; (\rdistinct{\rflts \; [r]}{\varnothing}) = r$.
+ \end{itemize}
+\end{lemma}
+\begin{proof}
+ By application of \ref{rsimpIdem} and \ref{good1}.
+\end{proof}
+
+\noindent
+With the idempotency of $\rsimp{}$ and its corollaries,
+we can start proving some key equalities leading to the
+closed forms.
+Now presented are a few equivalent terms under $\rsimp{}$.
+We use $r_1 \sequal r_2 $ here to denote $\rsimp{r_1} = \rsimp{r_2}$.
+\begin{lemma}
+\begin{itemize}
+ \item
+ $\rsimpalts \; (\RZERO :: rs) \sequal \rsimpalts\; rs$
+ \item
+ $\rsimpalts \; rs \sequal \rsimpalts (\map \; \rsimp{} \; rs)$
+ \item
+ $\RALTS{\RALTS{rs}} \sequal \RALTS{rs}$
+\end{itemize}
+\end{lemma}
+\noindent
+We need more equalities like the above to enable a closed form,
+but to proceed we need to introduce two rewrite relations,
+to make things smoother.
+\subsubsection{The rewrite relation $\hrewrite$ and $\grewrite$}
+Insired by the success we had in the correctness proof
+in \ref{Bitcoded2}, where we invented
+a term rewriting system to capture the similarity between terms
+and prove equivalence, we follow suit here defining simplification
+steps as rewriting steps.
+The presentation will be more concise than that in \ref{Bitcoded2}.
+To differentiate between the rewriting steps for annotated regular expressions
+and $\rrexp$s, we add characters $h$ and $g$ below the squig arrow symbol
+to mean atomic simplification transitions
+of $\rrexp$s and $\rrexp$ lists, respectively.
+
+\begin{center}
+
List of 1-step rewrite rules for regular expressions simplification without bitcodes:
\begin{figure}
\caption{the "h-rewrite" rules}
@@ -419,6 +710,25 @@
Fortunately this is provable by induction on the list $rs$
+\subsection{A Closed Form for the Sequence Regular Expression}
+\begin{quote}\it
+ Claim: For regular expressions $r_1 \cdot r_2$, we claim that
+ \begin{center}
+ $ \rderssimp{r_1 \cdot r_2}{s} =
+ \rsimp{(\sum (r_1 \backslash s \cdot r_2 ) \; :: \;(\map \; \rderssimp{r2}{\_} \;(\vsuf{s}{r_1})))}$
+ \end{center}
+\end{quote}
+\noindent
+
+Before we get to the proof that says the intermediate result of our lexer will
+remain finitely bounded, which is an important efficiency/liveness guarantee,
+we shall first develop a few preparatory properties and definitions to
+make the process of proving that a breeze.
+
+We define rewriting relations for $\rrexp$s, which allows us to do the
+same trick as we did for the correctness proof,
+but this time we will have stronger equalities established.
+
\section{Estimating the Closed Forms' sizes}
@@ -787,11 +1097,8 @@
\begin{lemma}
$\rsimp{\sflat{(r_1 \cdot r_2) \backslash s} }= \rsimp{\AALTS{[[]}{ (r_1 \backslash s) \cdot r_2 :: (\map (r_2 \backslash \_) (\vsuf{s}{r_1}))}}$
\end{lemma}
-Together with the idempotency property of $\rsimp$,
+Together with the idempotency property of $\rsimp{}$\ref{rsimpIdem},
%TODO: state the idempotency property of rsimp
-\begin{lemma}
-$\rsimp{r} = \rsimp{\rsimp{r}}$
-\end{lemma}
it is possible to convert the above lemma to obtain a "closed form"
for derivatives nested with simplification:
\begin{lemma}
@@ -1088,17 +1395,6 @@
% SECTION 1
%----------------------------------------------------------------------------------------
-\section{Idempotency of $\simp$}
-
-\begin{equation}
- \simp \;r = \simp\; \simp \; r
-\end{equation}
-This property means we do not have to repeatedly
-apply simplification in each step, which justifies
-our definition of $\blexersimp$.
-It will also be useful in future proofs where properties such as
-closed forms are needed.
-The proof is by structural induction on $r$.
%-----------------------------------
% SUBSECTION 1