ChengsongTanPhdThesis/Chapters/Chapter2.tex
changeset 503 35b80e37dfe7
parent 500 4d9eecfc936a
child 505 5ce3bd8e5696
--- a/ChengsongTanPhdThesis/Chapters/Chapter2.tex	Tue May 03 13:11:41 2022 +0100
+++ b/ChengsongTanPhdThesis/Chapters/Chapter2.tex	Fri May 06 13:22:20 2022 +0100
@@ -71,20 +71,52 @@
 \section{Finiteness Property}
 In this section let us sketch our argument for why the size of the simplified
 derivatives with the aggressive simplification function can be finitely bounded.
-For convenience, we use a new datatype which we call $\textit{rrexp}$ to denote
+
+For convenience, we use a new datatype which we call $\rrexp$ to denote
 the difference between it and annotated regular expressions. 
-For $\rrexp$ we throw away the bitcodes on the annotated regular expressions, but keeps
-everything else intact.
-It is similar to annotated regular expression being $\erase$ed, but with all its structure being intact
+\[			\rrexp ::=   \RZERO \mid  \RONE
+			 \mid  \RCHAR{c}  
+			 \mid  \RSEQ{r_1}{r_2}
+			 \mid  \RALTS{rs}
+			 \mid \RSTAR{r}        
+\]
+For $\rrexp$ we throw away the bitcodes on the annotated regular expressions, 
+but keep everything else intact.
+It is similar to annotated regular expressions being $\erase$-ed, but with all its structure preserved
 (the $\erase$ function unfortunately does not preserve structure in the case of empty and singleton
-$\AALTS$.
+$\ALTS$).
 We denote the operation of erasing the bits and turning an annotated regular expression 
-into an $\rrexp$ as $\rerase$.
-%TODO: definition of rerase
-That we can bound the size of annotated regular expressions by 
-$\rrexp$ is that the bitcodes grow linearly w.r.t the input string, and don't contribute to the structural size of 
-an annotated regular expression:
-$\rsize (\rerase a) = \asize a$
+into an $\rrexp{}$ as $\rerase{}$.
+\begin{center}
+\begin{tabular}{lcr}
+$\rerase{\AZERO}$ & $=$ & $\RZERO$\\
+$\rerase{\ASEQ{bs}{r_1}{r_2}}$ & $=$ & $\RSEQ{\rerase{r_1}}{\rerase{r_2}}$\\
+$\rerase{\AALTS{bs}{rs}}$ & $ =$ & $ \RALTS{rs}$
+\end{tabular}
+\end{center}
+%TODO: FINISH definition of rerase
+Similarly we could define the derivative  and simplification on 
+$\rrexp$, which would be identical to those we defined for plain $\rexp$s in chapter1, 
+except that now they can operate on alternatives taking multiple arguments.
+%TODO: definition of rder rsimp (maybe only the alternative clause)
+The reason why these definitions  mirror precisely the corresponding operations
+on annotated regualar expressions is that we can calculate sizes more easily:
+
+\begin{lemma}
+$\rsize{\rerase a} = \asize a$
+\end{lemma}
+A similar equation holds for annotated regular expressions' simplification
+and the plain regular expressions' simplification:
+\begin{lemma}
+$\asize{\bsimp{a}} = \rsize{\rsimp{\rerase{a}}}$
+\end{lemma}
+Putting these two together we get a property that allows us to estimate the 
+size of an annotated regular expression derivative using its un-annotated counterpart:
+\begin{lemma}
+$\asize{\bderssimp{r}{s}} =  \rsize{\rderssimp{\rerase{r}}{s}}$
+\end{lemma}
+Unless stated otherwise in this chapter all $\textit{rexp}$s without
+ bitcodes are seen as $\rrexp$s.
 
  Suppose
 we have a size function for bitcoded regular expressions, written
@@ -94,28 +126,28 @@
 such that 
 
 \begin{center}
-$\forall s. \; \llbracket{\bderssimp r s}\rrbracket \leq N$
+$\forall s. \; \llbracket{\bderssimp{r}{s}}\rrbracket \leq N$
 \end{center}
 
 \noindent
 We prove this by induction on $r$. The base cases for $\AZERO$,
 $\AONE \textit{bs}$ and $\ACHAR \textit{bs} c$ are straightforward. The interesting case is
-for sequences of the form $\ASEQ \textit{bs} r_1 r_2$. In this case our induction
-hypotheses state $\exists N_1. \forall s. \; \llbracket \bderssimp(r_1, s) \rrbracket \leq N_1$ and
-$\exists N_2. \forall s. \; \llbracket \bderssimp(r_2, s) \rrbracket \leq N_2$. We can reason as follows
+for sequences of the form $\ASEQ{bs}{r_1}{r_2}$. In this case our induction
+hypotheses state $\exists N_1. \forall s. \; \llbracket \bderssimp{r}{s} \rrbracket \leq N_1$ and
+$\exists N_2. \forall s. \; \llbracket \bderssimp{r_2}{s} \rrbracket \leq N_2$. We can reason as follows
 %
 \begin{center}
 \begin{tabular}{lcll}
-& & $ \llbracket \bderssimp( (\ASEQ\, \textit{bs} \, r_1 \,r_2),  s) \rrbracket$\\
-& $ = $ & $\llbracket bsimp\,(\textit{ALTs}\;bs\;((ASEQ [] (\bderssimp r_1 s) r_2) ::
-    [\bderssimp r_2 s' \;|\; s' \in \textit{Suffix}( r_1, s)]))\rrbracket $ & (1) \\
+& & $ \llbracket   \bderssimp{\ASEQ{bs}{r_1}{r_2} }{s} \rrbracket$\\
+& $ = $ & $\llbracket bsimp\,(\textit{ALTs}\;bs\;(\ASEQ{nil}{\bderssimp{ r_1}{s}}{ r_2} ::
+    [\bderssimp{ r_2}{s'} \;|\; s' \in \textit{Suffix}( r_1, s)]))\rrbracket $ & (1) \\
 & $\leq$ &
-    $\llbracket\textit{\distinctWith}\,((\ASEQ [] (\bderssimp r_1 s) r_2$) ::
-    [$\bderssimp r_2 s' \;|\; s' \in \textit{Suffix}( r_1, s)])\,\approx\;{}\rrbracket + 1 $ & (2) \\
-& $\leq$ & $\llbracket\ASEQ [] (\bderssimp r_1 s) r_2\rrbracket +
-             \llbracket\textit{distinctWith}\,[\bderssimp r_2 s' \;|\; s' \in \textit{Suffix}( r_1, s)]\,\approx\;{}\rrbracket + 1 $ & (3) \\
+    $\llbracket\textit{\distinctWith}\,((\ASEQ{nil}{\bderssimp{r_1}{s}}{r_2}$) ::
+    [$\bderssimp{ r_2}{s'} \;|\; s' \in \textit{Suffix}( r_1, s)])\,\approx\;{}\rrbracket + 1 $ & (2) \\
+& $\leq$ & $\llbracket\ASEQ{bs}{\bderssimp{ r_1}{s}}{r_2}\rrbracket +
+             \llbracket\textit{distinctWith}\,[\bderssimp{r_2}{s'} \;|\; s' \in \textit{Suffix}( r_1, s)]\,\approx\;{}\rrbracket + 1 $ & (3) \\
 & $\leq$ & $N_1 + \llbracket r_2\rrbracket + 2 +
-      \llbracket \distinctWith\,[ \bderssimp r_2 s' \;|\; s' \in \textit{Suffix}( r_1, s)] \,\approx\;\rrbracket$ & (4)\\
+      \llbracket \distinctWith\,[ \bderssimp{r_2}{s'} \;|\; s' \in \textit{Suffix}( r_1, s)] \,\approx\;\rrbracket$ & (4)\\
 & $\leq$ & $N_1 + \llbracket r_2\rrbracket + 2 + l_{N_{2}} * N_{2}$ & (5)
 \end{tabular}
 \end{center}
@@ -123,13 +155,14 @@
 % tell Chengsong about Indian paper of closed forms of derivatives
 
 \noindent
-where in (1) the $\textit{Suffix}( r_1, s)$ are the all the suffixes of $s$ where $\bderssimp r_1 s'$ is nullable ($s'$ being a suffix of $s$).
+where in (1) the $\textit{Suffix}( r_1, s)$ are the all the suffixes of $s$ where $\bderssimp{ r_1}{s'}$ is nullable ($s'$ being a suffix of $s$).
 The reason why we could write the derivatives of a sequence this way is described in section 2.
-The term (2) is used to control (1) since it we know that you can obtain an overall
+The term (2) is used to control (1). 
+That is because one can obtain an overall
 smaller regex list
 by flattening it and removing $\ZERO$s first before applying $\distinctWith$ on it.
 Section 3 is dedicated to its proof.
-In (3) we know that  $\llbracket\ASEQ [] (\bderssimp r_1 s) r_2\rrbracket$ is 
+In (3) we know that  $\llbracket \ASEQ{bs}{(\bderssimp{ r_1}{s}}{r_2}\rrbracket$ is 
 bounded by $N_1 + \llbracket{}r_2\rrbracket + 1$. In (5) we know the list comprehension contains only regular expressions of size smaller
 than $N_2$. The list length after  $\distinctWith$ is bounded by a number, which we call $l_{N_2}$. It stands
 for the number of distinct regular expressions smaller than $N_2$ (there can only be finitely many of them).
@@ -155,14 +188,106 @@
 %----------------------------------------------------------------------------------------
 
 \section{"Closed Forms" of regular expressions' derivatives w.r.t strings}
-There is a nice property about the compound regular expression 
-$r_1 \cdot r_2$ in general,
-that the derivatives of it against a string $s$ can be described by
-the derivatives w.r.t $r_1$ and $r_2$ over substrings of $s$:
+To embark on getting the "closed forms" of regexes, we first
+define a few auxiliary definitions to make expressing them smoothly.
+
+ \begin{center}  
+ \begin{tabular}{ccc}
+ $\sflataux{\AALTS{ }{r :: rs}}$ & $=$ & $\sflataux{r} @ rs$\\
+$\sflataux{\AALTS{ }{[]}}$ & $ = $ & $ []$\\
+$\sflataux r$ & $=$ & $ [r]$
+\end{tabular}
+\end{center}
+The intuition behind $\sflataux{\_}$ is to break up nested regexes 
+of the $(\ldots((r_1 + r_2) + r_3) + \ldots )$(left-associated) shape
+into a more "balanced" list: $\AALTS{\_}{[r_1,\, r_2 ,\, r_3, \ldots]}$.
+It will return the singleton list $[r]$ otherwise.
+
+$\sflat{\_}$ works the same  as $\sflataux{\_}$, except that it keeps
+the output type a regular expression, not a list:
+ \begin{center} 
+ \begin{tabular}{ccc}
+ $\sflat{\AALTS{ }{r :: rs}}$ & $=$ & $\sflataux{r} @ rs$\\
+$\sflat{\AALTS{ }{[]}}$ & $ = $ & $ \AALTS{ }{[]}$\\
+$\sflat r$ & $=$ & $ [r]$
+\end{tabular}
+\end{center}
+$\sflataux{\_}$  and $\sflat{\_}$ is only recursive in terms of the
+ first element of the list of children of
+an alternative regex ($\AALTS{ }{rs}$), and is purposefully built for  dealing with the regular expression
+$r_1 \cdot r_2 \backslash s$.
+
+With $\sflat{\_}$ and $\sflataux{\_}$ we make 
+precise what  "closed forms" we have for the sequence derivatives and their simplifications,
+in other words, how can we construct $(r_1 \cdot r_2) \backslash s$
+and $\bderssimp{(r_1\cdot r_2)}{s}$,
+if we are only allowed to use a combination of $r_1 \backslash s'$ ($\bderssimp{r_1}{s'}$)
+and  $r_2 \backslash s'$ ($\bderssimp{r_2}{s'}$), where $s'$  ranges over 
+the substring of $s$?
+First let's look at a series of derivatives steps on a sequence 
+regular expression,  (assuming) that each time the first
+component of the sequence is always nullable):
+\begin{center}
+
+$r_1 \cdot r_2 \quad \longrightarrow_{\backslash c}  \quad   r_1  \backslash c \cdot r_2 + r_2 \backslash c \quad \longrightarrow_{\backslash c'} \quad (r_1 \backslash cc' \cdot r_2 + r_2 \backslash c') + r_2 \backslash cc' \longrightarrow_{\backslash c''} \quad$\\
+$((r_1 \backslash cc'c'' \cdot r_2 + r_2 \backslash c'') + r_2 \backslash c'c'') + r_2 \backslash cc'c''   \longrightarrow_{\backslash c''} \quad
+ \ldots$
+
+\end{center}
+%TODO: cite indian paper
+Indianpaper have  come up with a slightly more formal way of putting the above process:
+\begin{center}
+$r_1 \cdot r_2 \backslash (c_1 :: c_2 ::\ldots c_n) \myequiv r_1 \backslash (c_1 :: c_2:: \ldots c_n) +
+\delta(\nullable(r_1 \backslash (c_1 :: c_2 \ldots c_{n-1}) ), r_2 \backslash (c_n)) + \ldots
++ \delta (\nullable(r_1), r_2\backslash (c_1 :: c_2 ::\ldots c_n))$
+\end{center}
+where  $\delta(b, r)$ will produce $r$
+if $b$ evaluates to true, 
+and $\ZERO$ otherwise.
+
+ But the $\myequiv$ sign in the above equation means language equivalence instead of syntactical
+ equivalence. To make this intuition useful 
+ for a formal proof, we need something
+stronger than language equivalence.
+With the help of $\sflat{\_}$ we can state the equation in Indianpaper
+more rigorously:
 \begin{lemma}
-$\textit{sflat}\_{aux} ( (r_1 \cdot r_2) \backslash s) = (r_1 \backslash s) \cdot r_2 :: (\map (r_2 \backslash \_) (\vsuf s r_1))$
+$\sflat{(r_1 \cdot r_2) \backslash s} = \AALTS{[[]}{ (r_1 \backslash s) \cdot r_2 :: (\map (r_2 \backslash \_) (\vsuf{s}{r_1}))}$
 \end{lemma}
+With this property we can prove the finiteness of the size of the regex $(r_1 \cdot r_2) \backslash s$,
+much like a recursive function's termination proof.
+The function $\vsuf{\_}{\_}$ is defined this way:
+%TODO: DEF of vsuf
+\begin{center}
+\begin{tabular}{lcl}
+$\vsuf{[]}{\_} $ & $=$ &  $[]$\\
+$\vsuf{c::cs}{r_1}$ & $ =$ & $ \textit{if} (\rnullable{r_1}) \textit{then}  (\vsuf{cs}{(rder c r1)}) @ [c :: cs]$\\
+                                     && $\textit{else}  (\vsuf{cs}{rder c r1})  $
+\end{tabular}
+\end{center}                   
+It takes into account which prefixes $s'$ of $s$ would make $r \backslash s'$ nullable,
+and keep a list of suffixes $s''$ such that $s' @ s'' = s$. The list is sorted in
+the order $r_2\backslash s''$ appears in $(r_1\cdot r_2)\backslash s$.
+In essence, $\vsuf{\_}{\_}$ is doing a "virtual derivative" of $r$, but instead of producing 
+the entire result of  $(r_1 \cdot r_2) \backslash s$, 
+it only stores all the $r_2 \backslash s''$ terms.
 
+With this we can also add simplifications to both sides and get
+\begin{lemma}
+$\rsimp{\sflat{(r_1 \cdot r_2) \backslash s} }= \rsimp{\AALTS{[[]}{ (r_1 \backslash s) \cdot r_2 :: (\map (r_2 \backslash \_) (\vsuf{s}{r_1}))}}$
+\end{lemma}
+Together with the idempotency property of $\rsimp$,
+%TODO: state the idempotency property of rsimp
+it is possible to convert the above lemma to obtain a "closed form"
+for our lexer's intermediate result without bitcodes:
+\begin{lemma}
+$\rderssimp{\sflat{(r_1 \cdot r_2) \backslash s} }= \rsimp{\AALTS{[[]}{ (r_1 \backslash s) \cdot r_2 :: (\map (r_2 \backslash \_) (\vsuf{s}{r_1}))}}$
+\end{lemma}
+And now the reason we have (2) in section 1 is clear:
+$\rsize{\rsimp{\AALTS{[[]}{ (r_1 \backslash s) \cdot r_2 :: (\map (r_2 \backslash \_) (\vsuf{s}{r_1}))}}}$
+is bounded by     $\llbracket\textit{\distinctWith}\,((\ASEQ{nil}{\bderssimp{r_1}{s}}{r_2}$) ::
+    [$\bderssimp{ r_2}{s'} \;|\; s' \in \textit{Suffix}( r_1, s)])\,\approx\;{}\rrbracket + 1 $ ,
+    as $\vsuf{s}{r_1}$ is a subset of s $\textit{Suffix}( r_1, s)])$.
 
 %----------------------------------------------------------------------------------------
 %	SECTION 3