ChengsongTanPhdThesis/Chapters/Finite.tex
changeset 593 83fab852d72d
parent 591 b2d0de6aee18
child 594 62f8fa03863e
equal deleted inserted replaced
592:7f4c353c0f6b 593:83fab852d72d
    11 given an annotated regular expression $a$, for any string $s$
    11 given an annotated regular expression $a$, for any string $s$
    12 our algorithm $\blexersimp$'s internal annotated regular expression 
    12 our algorithm $\blexersimp$'s internal annotated regular expression 
    13 size  is finitely bounded
    13 size  is finitely bounded
    14 by a constant $N_a$ that only depends on $a$:
    14 by a constant $N_a$ that only depends on $a$:
    15 \begin{center}
    15 \begin{center}
    16 $\llbracket \bderssimp{a}{s} \rrbracket \leq N_a$
    16 	$\llbracket \bderssimp{a}{s} \rrbracket \leq N_a$
    17 \end{center}
    17 \end{center}
    18 \noindent
    18 \noindent
    19 where the size of an annotated regular expression is defined
    19 where the size of an annotated regular expression is defined
    20 in terms of the number of nodes in its tree structure:
    20 in terms of the number of nodes in its tree structure:
    21 \begin{center}
    21 \begin{center}
    22 \begin{tabular}{ccc}
    22 	\begin{tabular}{ccc}
    23 	$\llbracket _{bs}\ONE \rrbracket$ & $\dn$ & $1$\\
    23 		$\llbracket _{bs}\ONE \rrbracket$ & $\dn$ & $1$\\
    24 	$\llbracket \ZERO \rrbracket$ & $\dn$ & $1$ \\
    24 		$\llbracket \ZERO \rrbracket$ & $\dn$ & $1$ \\
    25 	$\llbracket _{bs} r_1 \cdot r_2 \rrbracket$ & $\dn$ & $\llbracket r_1 \rrbracket + \llbracket r_2 \rrbracket + 1$\\
    25 		$\llbracket _{bs} r_1 \cdot r_2 \rrbracket$ & $\dn$ & $\llbracket r_1 \rrbracket + \llbracket r_2 \rrbracket + 1$\\
    26 $\llbracket _{bs}\mathbf{c} \rrbracket $ & $\dn$ & $1$\\
    26 		$\llbracket _{bs}\mathbf{c} \rrbracket $ & $\dn$ & $1$\\
    27 $\llbracket _{bs}\sum as \rrbracket $ & $\dn$ & $\map \; (\llbracket \_ \rrbracket)\; as   + 1$\\
    27 		$\llbracket _{bs}\sum as \rrbracket $ & $\dn$ & $\map \; (\llbracket \_ \rrbracket)\; as   + 1$\\
    28 $\llbracket _{bs} a^* \rrbracket $ & $\dn$ & $\llbracket a \rrbracket + 1$.
    28 		$\llbracket _{bs} a^* \rrbracket $ & $\dn$ & $\llbracket a \rrbracket + 1$.
    29 \end{tabular}
    29 	\end{tabular}
    30 \end{center}
    30 \end{center}
    31 We believe this size formalisation 
    31 We believe this size formalisation 
    32 of the algorithm is important in our context, because 
    32 of the algorithm is important in our context, because 
    33 \begin{itemize}
    33 \begin{itemize}
    34 	\item
    34 	\item
    46 \noindent
    46 \noindent
    47 In our lexer $\blexersimp$,
    47 In our lexer $\blexersimp$,
    48 The regular expression is repeatedly being taken derivative of
    48 The regular expression is repeatedly being taken derivative of
    49 and then simplified.
    49 and then simplified.
    50 \begin{figure}[H]
    50 \begin{figure}[H]
    51 \begin{tikzpicture}[scale=2,
    51 	\begin{tikzpicture}[scale=2,
    52                     every node/.style={minimum size=11mm},
    52 		every node/.style={minimum size=11mm},
    53 		    ->,>=stealth',shorten >=1pt,auto,thick
    53 		->,>=stealth',shorten >=1pt,auto,thick
    54 		    ]
    54 		]
    55 \node (r0) [rectangle, draw=black, thick, minimum size = 5mm, draw=blue] {$a$};
    55 		\node (r0) [rectangle, draw=black, thick, minimum size = 5mm, draw=blue] {$a$};
    56 \node (r1) [rectangle, draw=black, thick, right=of r0, minimum size = 7mm]{$a_1$};
    56 		\node (r1) [rectangle, draw=black, thick, right=of r0, minimum size = 7mm]{$a_1$};
    57 \draw[->,line width=0.2mm](r0)--(r1) node[above,midway] {$\backslash c_1$};
    57 		\draw[->,line width=0.2mm](r0)--(r1) node[above,midway] {$\backslash c_1$};
    58 
    58 
    59 \node (r1s) [rectangle, draw=blue, thick, right=of r1, minimum size=6mm]{$a_{1s}$};
    59 		\node (r1s) [rectangle, draw=blue, thick, right=of r1, minimum size=6mm]{$a_{1s}$};
    60 \draw[->, line width=0.2mm](r1)--(r1s) node[above, midway] {$\simp$};
    60 		\draw[->, line width=0.2mm](r1)--(r1s) node[above, midway] {$\simp$};
    61 
    61 
    62 \node (r2) [rectangle, draw=black, thick,  right=of r1s, minimum size = 12mm]{$a_2$};
    62 		\node (r2) [rectangle, draw=black, thick,  right=of r1s, minimum size = 12mm]{$a_2$};
    63 \draw[->,line width=0.2mm](r1s)--(r2) node[above,midway] {$\backslash c_2$};
    63 		\draw[->,line width=0.2mm](r1s)--(r2) node[above,midway] {$\backslash c_2$};
    64 
    64 
    65 \node (r2s) [rectangle, draw = blue, thick, right=of r2,minimum size=6mm]{$a_{2s}$};
    65 		\node (r2s) [rectangle, draw = blue, thick, right=of r2,minimum size=6mm]{$a_{2s}$};
    66 \draw[->,line width=0.2mm](r2)--(r2s) node[above,midway] {$\simp$};
    66 		\draw[->,line width=0.2mm](r2)--(r2s) node[above,midway] {$\simp$};
    67 
    67 
    68 \node (rns) [rectangle, draw = blue, thick, right=of r2s,minimum size=6mm]{$a_{ns}$};
    68 		\node (rns) [rectangle, draw = blue, thick, right=of r2s,minimum size=6mm]{$a_{ns}$};
    69 \draw[->,line width=0.2mm, dashed](r2s)--(rns) node[above,midway] {$\backslash \ldots$};
    69 		\draw[->,line width=0.2mm, dashed](r2s)--(rns) node[above,midway] {$\backslash \ldots$};
    70 
    70 
    71 \node (v) [circle, thick, draw, right=of rns, minimum size=6mm, right=1.7cm]{$v$};
    71 		\node (v) [circle, thick, draw, right=of rns, minimum size=6mm, right=1.7cm]{$v$};
    72 \draw[->, line width=0.2mm](rns)--(v) node[above, midway] {\bmkeps} node [below, midway] {\decode};
    72 		\draw[->, line width=0.2mm](rns)--(v) node[above, midway] {\bmkeps} node [below, midway] {\decode};
    73 \end{tikzpicture}
    73 	\end{tikzpicture}
    74 \caption{Regular expression size change during our $\blexersimp$ algorithm}\label{simpShrinks}
    74 	\caption{Regular expression size change during our $\blexersimp$ algorithm}\label{simpShrinks}
    75 \end{figure}
    75 \end{figure}
    76 \noindent
    76 \noindent
    77 Each time
    77 Each time
    78 a derivative is taken, a regular expression might grow a bit,
    78 a derivative is taken, a regular expression might grow a bit,
    79 but simplification always takes care that 
    79 but simplification always takes care that 
    86 
    86 
    87 \noindent
    87 \noindent
    88 Sulzmann and Lu's assumed something similar about their algorithm,
    88 Sulzmann and Lu's assumed something similar about their algorithm,
    89 though in fact their algorithm's size might be better depicted by the following graph:
    89 though in fact their algorithm's size might be better depicted by the following graph:
    90 \begin{figure}[H]
    90 \begin{figure}[H]
    91 \begin{tikzpicture}[scale=2,
    91 	\begin{tikzpicture}[scale=2,
    92                     every node/.style={minimum size=11mm},
    92 		every node/.style={minimum size=11mm},
    93 		    ->,>=stealth',shorten >=1pt,auto,thick
    93 		->,>=stealth',shorten >=1pt,auto,thick
    94 		    ]
    94 		]
    95 \node (r0) [rectangle, draw=black, thick, minimum size = 5mm, draw=blue] {$a$};
    95 		\node (r0) [rectangle, draw=black, thick, minimum size = 5mm, draw=blue] {$a$};
    96 \node (r1) [rectangle, draw=black, thick, right=of r0, minimum size = 7mm]{$a_1$};
    96 		\node (r1) [rectangle, draw=black, thick, right=of r0, minimum size = 7mm]{$a_1$};
    97 \draw[->,line width=0.2mm](r0)--(r1) node[above,midway] {$\backslash c_1$};
    97 		\draw[->,line width=0.2mm](r0)--(r1) node[above,midway] {$\backslash c_1$};
    98 
    98 
    99 \node (r1s) [rectangle, draw=blue, thick, right=of r1, minimum size=7mm]{$a_{1s}$};
    99 		\node (r1s) [rectangle, draw=blue, thick, right=of r1, minimum size=7mm]{$a_{1s}$};
   100 \draw[->, line width=0.2mm](r1)--(r1s) node[above, midway] {$\simp'$};
   100 		\draw[->, line width=0.2mm](r1)--(r1s) node[above, midway] {$\simp'$};
   101 
   101 
   102 \node (r2) [rectangle, draw=black, thick,  right=of r1s, minimum size = 17mm]{$a_2$};
   102 		\node (r2) [rectangle, draw=black, thick,  right=of r1s, minimum size = 17mm]{$a_2$};
   103 \draw[->,line width=0.2mm](r1s)--(r2) node[above,midway] {$\backslash c_2$};
   103 		\draw[->,line width=0.2mm](r1s)--(r2) node[above,midway] {$\backslash c_2$};
   104 
   104 
   105 \node (r2s) [rectangle, draw = blue, thick, right=of r2,minimum size=14mm]{$a_{2s}$};
   105 		\node (r2s) [rectangle, draw = blue, thick, right=of r2,minimum size=14mm]{$a_{2s}$};
   106 \draw[->,line width=0.2mm](r2)--(r2s) node[above,midway] {$\simp'$};
   106 		\draw[->,line width=0.2mm](r2)--(r2s) node[above,midway] {$\simp'$};
   107 
   107 
   108 \node (r3) [rectangle, draw = black, thick, right= of r2s, minimum size = 22mm]{$a_3$};
   108 		\node (r3) [rectangle, draw = black, thick, right= of r2s, minimum size = 22mm]{$a_3$};
   109 \draw[->,line width=0.2mm](r2s)--(r3) node[above,midway] {$\backslash c_3$};
   109 		\draw[->,line width=0.2mm](r2s)--(r3) node[above,midway] {$\backslash c_3$};
   110 
   110 
   111 \node (rns) [right = of r3, draw=blue, minimum size = 20mm]{$a_{3s}$};
   111 		\node (rns) [right = of r3, draw=blue, minimum size = 20mm]{$a_{3s}$};
   112 \draw[->,line width=0.2mm] (r3)--(rns) node [above, midway] {$\simp'$};
   112 		\draw[->,line width=0.2mm] (r3)--(rns) node [above, midway] {$\simp'$};
   113 
   113 
   114 \node (rnn) [right = of rns, minimum size = 1mm]{};
   114 		\node (rnn) [right = of rns, minimum size = 1mm]{};
   115 \draw[->, dashed] (rns)--(rnn) node [above, midway] {$\ldots$};
   115 		\draw[->, dashed] (rns)--(rnn) node [above, midway] {$\ldots$};
   116 
   116 
   117 \end{tikzpicture}
   117 	\end{tikzpicture}
   118 \caption{Regular expression size change during our $\blexersimp$ algorithm}\label{sulzShrinks}
   118 	\caption{Regular expression size change during our $\blexersimp$ algorithm}\label{sulzShrinks}
   119 \end{figure}
   119 \end{figure}
   120 \noindent
   120 \noindent
   121 That is, on certain cases their lexer 
   121 That is, on certain cases their lexer 
   122 will have an indefinite size explosion, causing the running time 
   122 will have an indefinite size explosion, causing the running time 
   123 of each derivative step to grow arbitrarily large (for example 
   123 of each derivative step to grow arbitrarily large (for example 
   130 We give details of the proof in the next sections.
   130 We give details of the proof in the next sections.
   131 
   131 
   132 \subsection{Overview of the Proof}
   132 \subsection{Overview of the Proof}
   133 Here is a bird's eye view of how the proof of finiteness works,
   133 Here is a bird's eye view of how the proof of finiteness works,
   134 which involves three steps:
   134 which involves three steps:
   135 \begin{center}
   135 \begin{figure}[H]
   136   \begin{tikzpicture}[scale=1,font=\bf,
   136 	\begin{tikzpicture}[scale=1,font=\bf,
   137                       node/.style={
   137 		node/.style={
   138                       rectangle,rounded corners=3mm,
   138 			rectangle,rounded corners=3mm,
   139                       ultra thick,draw=black!50,minimum height=18mm, 
   139 			ultra thick,draw=black!50,minimum height=18mm, 
   140                       minimum width=20mm,
   140 			minimum width=20mm,
   141                       top color=white,bottom color=black!20}]
   141 		top color=white,bottom color=black!20}]
   142 
   142 
   143 
   143 
   144 		      \node (0) at (-5,0) 
   144 		\node (0) at (-5,0) 
   145 			      [node, text width=1.8cm, text centered] 
   145 			[node, text width=1.8cm, text centered] 
   146 			      {$\llbracket \bderssimp{a}{s} \rrbracket$};
   146 			{$\llbracket \bderssimp{a}{s} \rrbracket$};
   147 		      \node (A) at (0,0) 
   147 		\node (A) at (0,0) 
   148 			      [node,text width=1.6cm,  text centered] 
   148 			[node,text width=1.6cm,  text centered] 
   149 			      {$\llbracket \rderssimp{r}{s} \rrbracket_r$};
   149 			{$\llbracket \rderssimp{r}{s} \rrbracket_r$};
   150 		      \node (B) at (3,0) 
   150 		\node (B) at (3,0) 
   151 			      [node,text width=3.0cm, anchor=west, minimum width = 40mm] 
   151 			[node,text width=3.0cm, anchor=west, minimum width = 40mm] 
   152 			      {$\llbracket \textit{ClosedForm}(r, s)\rrbracket_r$};
   152 			{$\llbracket \textit{ClosedForm}(r, s)\rrbracket_r$};
   153   \node (C) at (9.5,0) [node, minimum width=10mm] {$N_r$};
   153 		\node (C) at (9.5,0) [node, minimum width=10mm] {$N_r$};
   154  
   154 
   155   \draw [->,line width=0.5mm] (0) -- 
   155 		\draw [->,line width=0.5mm] (0) -- 
   156 	  node [above,pos=0.45] {=} (A) node [below, pos = 0.45] {$(r = a \downarrow_r)$} (A); 
   156 			node [above,pos=0.45] {=} (A) node [below, pos = 0.45] {$(r = a \downarrow_r)$} (A); 
   157   \draw [->,line width=0.5mm] (A) -- 
   157 		\draw [->,line width=0.5mm] (A) -- 
   158 	  node [above,pos=0.35] {$\quad =\ldots=$} (B); 
   158 			node [above,pos=0.35] {$\quad =\ldots=$} (B); 
   159   \draw [->,line width=0.5mm] (B) -- 
   159 		\draw [->,line width=0.5mm] (B) -- 
   160 	  node [above,pos=0.35] {$\quad \leq \ldots \leq$} (C); 
   160 			node [above,pos=0.35] {$\quad \leq \ldots \leq$} (C); 
   161   \end{tikzpicture}
   161 	\end{tikzpicture}
   162 \end{center}
   162 	%\caption{
       
   163 \end{figure}
   163 \noindent
   164 \noindent
   164 We explain the steps one by one:
   165 We explain the steps one by one:
   165 \begin{itemize}
   166 \begin{itemize}
   166 	\item
   167 	\item
   167 		We first introduce the operations such as 
   168 		We first introduce the operations such as 
   168 		derivatives, simplification, size calculation, etc.
   169 		derivatives, simplification, size calculation, etc.
   169 		associated with $\rrexp$s, which we have given
   170 		associated with $\rrexp$s, which we have given
   170 		a very brief introduction to in chapter \ref{Bitcoded2}.
   171 		a very brief introduction to in chapter \ref{Bitcoded2}.
       
   172 		The operations on $\rrexp$s are identical to those on
       
   173 		annotated regular expressions except that they are unaware
       
   174 		of bitcodes. This means that all proofs about size of $\rrexp$s will apply to
       
   175 		annotated regular expressions.
   171 	\item
   176 	\item
   172 		We have a set of equalities for this new datatype that enables one to
   177 		We prove that $\rderssimp{r}{s} = \textit{ClosedForm}(r, s)$,
   173 		rewrite $\rderssimp{r_1 \cdot r_2}{s}$ 
   178 		where $\textit{ClosedForm}(r, s)$ is entirely 
   174 		and $\rderssimp{r^*}{s}$ (the most involved
   179 		written in the derivatives of their children regular 
   175 		inductive cases)
   180 		expressions.
   176 		by a combinatioin of derivatives of their 
   181 		We call the right-hand-side the \emph{Closed Form}
   177 		children regular expressions ($r_1$, $r_2$, and $r$, respectively),
   182 		of the derivative $\rderssimp{r}{s}$.
   178 		which we call the \emph{Closed Forms}
       
   179 		of the derivatives.
       
   180 	\item
   183 	\item
   181 		The Closed Forms of the regular expressions
   184 		We estimate $\llbracket \textit{ClosedForm}(r, s) \rrbracket_r$.
   182 		are controlled by terms that
   185 		The key observation is that $\distinctBy$'s output is
   183 		are easier to deal with.
   186 		a list with a constant length bound.
   184 		Using inductive hypothesis, these terms
       
   185 		are in turn bounded loosely
       
   186 		by a large yet constant number.
       
   187 \end{itemize}
   187 \end{itemize}
   188 We give details of these steps in the next sections.
   188 We give details of these steps in the next sections.
   189 The first step is to use 
   189 The first step is to use 
   190 $\textit{rrexp}$s,
   190 $\textit{rrexp}$s,
   191 something simpler than
   191 something simpler than
   192 annotated regular expressions. 
   192 annotated regular expressions. 
   193 
   193 
   194 \section{the $\textit{rrexp}$ Datatype and Its Size Functions}
   194 \section{The $\textit{Rrexp}$ Datatype and Its Lexing-Related Functions}
   195 
   195 We first recap the definition of the new datatype $\rrexp$, called
   196 We first recap a bit about the new datatype
   196 \emph{r-regular expressions},
   197 we defined in \ref{rrexpDef},
   197 which we first defined in \ref{rrexpDef}.
   198 called $\textit{rrexp}$s.
   198 R-regular expressions are similar to basic regular expressions.
   199 We chose $\rrexp$ over 
   199 We call them \emph{r}-regular expressions
   200 the basic regular expressions
   200 to make a distinction  
   201 because it is more straightforward to tweak 
   201 with plain regular expressions.
   202 into the shape we want 
       
   203 compared with an annotated regular expression.
       
   204 We want to prove the size property about annotated regular expressions.
       
   205 The size is 
       
   206 written $\llbracket r\rrbracket$, whose intuitive definition is as below
       
   207 \noindent
       
   208 We first note that $\llbracket \_ \rrbracket$
       
   209 is unaware of bitcodes, since 
       
   210 it only counts the number of nodes
       
   211 if we regard $r$ as a tree.
       
   212 This suggests that if we define a new datatype that is similar to plain $\rexp$s,
       
   213 \[			\rrexp ::=   \RZERO \mid  \RONE
   202 \[			\rrexp ::=   \RZERO \mid  \RONE
   214 			 \mid  \RCHAR{c}  
   203 	\mid  \RCHAR{c}  
   215 			 \mid  \RSEQ{r_1}{r_2}
   204 	\mid  \RSEQ{r_1}{r_2}
   216 			 \mid  \RALTS{rs}
   205 	\mid  \RALTS{rs}
   217 			 \mid \RSTAR{r}        
   206 	\mid \RSTAR{r}        
   218 \]
   207 \]
   219 and define 
   208 The size of an r-regular expression is
   220 \begin{center}
   209 written $\llbracket r\rrbracket_r$, 
   221 \begin{tabular}{lcl}
   210 whose definition mirrors that of an annotated regular expression. 
   222 $\rerase{\ZERO}$ & $\dn$ & $\RZERO$\\
   211 \begin{center}
   223 $\rerase{_{bs}\ONE}$ & $\dn$ & $\RONE$\\
   212 	\begin{tabular}{ccc}
   224 	$\rerase{_{bs}\mathbf{c}}$ & $\dn$ & $\RCHAR{c}$\\
   213 		$\llbracket _{bs}\ONE \rrbracket_r$ & $\dn$ & $1$\\
   225 $\rerase{_{bs}r_1\cdot r_2}$ & $\dn$ & $\RSEQ{\rerase{r_1}}{\rerase{r_2}}$\\
   214 		$\llbracket \ZERO \rrbracket_r$ & $\dn$ & $1$ \\
   226 $\rerase{_{bs}\sum as}$ & $\dn$ & $\RALTS{\map \; \rerase{\_} \; as}$\\
   215 		$\llbracket _{bs} r_1 \cdot r_2 \rrbracket_r$ & $\dn$ & $\llbracket r_1 \rrbracket_r + \llbracket r_2 \rrbracket_r + 1$\\
   227 $\rerase{_{bs} a ^*}$ & $\dn$ & $\rerase{a}^*$
   216 		$\llbracket _{bs}\mathbf{c} \rrbracket_r $ & $\dn$ & $1$\\
   228 \end{tabular}
   217 		$\llbracket _{bs}\sum as \rrbracket_r $ & $\dn$ & $\map \; (\llbracket \_ \rrbracket_r)\; as   + 1$\\
   229 \end{center}
   218 		$\llbracket _{bs} a^* \rrbracket_r $ & $\dn$ & $\llbracket a \rrbracket_r + 1$.
   230 \noindent
   219 	\end{tabular}
   231 we could calculate the size of annotated regular expressions in terms of
   220 \end{center}
   232 its un-annotated counterpart: 
   221 \noindent
   233 \begin{lemma}
   222 The $r$ in the subscript of $\llbracket \rrbracket_r$ is to 
   234 $\rsize{\rerase a} = \asize a$
   223 differentiate with the same operation for annotated regular expressions.
   235 \end{lemma}
   224 Adding $r$ as subscript will be used in 
   236 \begin{proof}
   225 other operations as well.
   237 	By routine induction on the structure of $a$.
   226 
   238 \end{proof}
   227 The transformation from an annotated regular expression
   239 \noindent
   228 to an r-regular expression is straightforward.
   240 We call them \emph{r}-regular expressions:
   229 \begin{center}
   241 we use
   230 	\begin{tabular}{lcl}
   242 $\rrexp$ as its type name, so as to make a distinction  
   231 		$\rerase{\ZERO}$ & $\dn$ & $\RZERO$\\
   243 with plain regular expressions.
   232 		$\rerase{_{bs}\ONE}$ & $\dn$ & $\RONE$\\
   244 In $\rrexp$ we throw away the bitcodes on the annotated regular expressions, 
   233 		$\rerase{_{bs}\mathbf{c}}$ & $\dn$ & $\RCHAR{c}$\\
   245 but keep everything else intact.
   234 		$\rerase{_{bs}r_1\cdot r_2}$ & $\dn$ & $\RSEQ{\rerase{r_1}}{\rerase{r_2}}$\\
   246 To transform an annotated regular expression into an r-regular expression,
   235 		$\rerase{_{bs}\sum as}$ & $\dn$ & $\RALTS{\map \; \rerase{\_} \; as}$\\
   247 we use the function \emph{rerase}, written $\downarrow_r$. 
   236 		$\rerase{_{bs} a ^*}$ & $\dn$ & $\rerase{a} ^*$
   248 The $r$ in the subscript of $\downarrow$ is to 
   237 	\end{tabular}
   249 differentiate with the $\downarrow$ for the $\erase$ operation.
   238 \end{center}
       
   239 \noindent
       
   240 $\textit{Rerase}$ throws away the bitcodes on the annotated regular expressions, 
       
   241 but keeps everything else intact.
       
   242 
   250 Before we introduce more functions related to r-regular expressions,
   243 Before we introduce more functions related to r-regular expressions,
   251 we first give out the reason why we take all the trouble 
   244 we first give out the reason why we take all the trouble 
   252 defining a new datatype in the first place.
   245 defining a new datatype in the first place.
       
   246 We could calculate the size of annotated regular expressions in terms of
       
   247 their un-annotated $\rrexp$ counterpart: 
       
   248 \begin{lemma}
       
   249 	$\rsize{\rerase a} = \asize a$
       
   250 \end{lemma}
       
   251 \begin{proof}
       
   252 	By routine structural induction on $a$.
       
   253 \end{proof}
       
   254 \noindent
   253 \subsection{Motivation Behind a New Datatype}
   255 \subsection{Motivation Behind a New Datatype}
   254 The main difference between a plain regular expression
   256 The main difference between a plain regular expression
   255 and an r-regular expression is that it can take
   257 and an r-regular expression is that it can take
   256 non-binary arguments for its alternative constructor.
   258 non-binary arguments for its alternative constructor.
   257 This turned out to be necessary if we want our proofs to be
   259 This turned out to be necessary if we want our proofs to be
   273 and its (r-)erased version:
   275 and its (r-)erased version:
   274 
   276 
   275 This does not hold for plain $\rexp$s. 
   277 This does not hold for plain $\rexp$s. 
   276 
   278 
   277 
   279 
   278 		These operations are 
   280 These operations are 
   279 		almost identical to those of the annotated regular expressions,
   281 almost identical to those of the annotated regular expressions,
   280 		except that no bitcodes are attached.
   282 except that no bitcodes are attached.
   281 Of course, the bits which encode the lexing information would grow linearly with respect 
   283 Of course, the bits which encode the lexing information would grow linearly with respect 
   282 to the input, which should be taken into account when we wish to tackle the runtime comlexity.
   284 to the input, which should be taken into account when we wish to tackle the runtime comlexity.
   283 But at the current stage 
   285 But at the current stage 
   284 we can safely ignore them.
   286 we can safely ignore them.
   285 Similarly there is a size function for plain regular expressions:
   287 Similarly there is a size function for plain regular expressions:
   286 \begin{center}
   288 \begin{center}
   287 \begin{tabular}{ccc}
   289 	\begin{tabular}{ccc}
   288 	$\llbracket \ONE \rrbracket_p$ & $\dn$ & $1$\\
   290 		$\llbracket \ONE \rrbracket_p$ & $\dn$ & $1$\\
   289 	$\llbracket \ZERO \rrbracket_p$ & $\dn$ & $1$ \\
   291 		$\llbracket \ZERO \rrbracket_p$ & $\dn$ & $1$ \\
   290 	$\llbracket r_1 \cdot r_2 \rrbracket_p$ & $\dn$ & $\llbracket r_1 \rrbracket_p + \llbracket r_2 \rrbracket_p + 1$\\
   292 		$\llbracket r_1 \cdot r_2 \rrbracket_p$ & $\dn$ & $\llbracket r_1 \rrbracket_p + \llbracket r_2 \rrbracket_p + 1$\\
   291 $\llbracket \mathbf{c} \rrbracket_p $ & $\dn$ & $1$\\
   293 		$\llbracket \mathbf{c} \rrbracket_p $ & $\dn$ & $1$\\
   292 $\llbracket r_1 \cdot r_2 \rrbracket_p $ & $\dn$ & $\llbracket r_1 \rrbracket_p \; + \llbracket r_2 \rrbracket_p + 1$\\
   294 		$\llbracket r_1 \cdot r_2 \rrbracket_p $ & $\dn$ & $\llbracket r_1 \rrbracket_p \; + \llbracket r_2 \rrbracket_p + 1$\\
   293 $\llbracket a^* \rrbracket_p $ & $\dn$ & $\llbracket a \rrbracket_p + 1$
   295 		$\llbracket a^* \rrbracket_p $ & $\dn$ & $\llbracket a \rrbracket_p + 1$
   294 \end{tabular}
   296 	\end{tabular}
   295 \end{center}
   297 \end{center}
   296 
   298 
   297 \noindent
   299 \noindent
   298 The idea of obatining a bound for $\llbracket \bderssimp{a}{s} \rrbracket$
   300 The idea of obatining a bound for $\llbracket \bderssimp{a}{s} \rrbracket$
   299 is to get an equivalent form
   301 is to get an equivalent form
   317 But there is a minor nuisance: 
   319 But there is a minor nuisance: 
   318 the erase function unavoidbly messes with the structure of the regular expression,
   320 the erase function unavoidbly messes with the structure of the regular expression,
   319 due to the discrepancy between annotated regular expression's $\sum$ constructor
   321 due to the discrepancy between annotated regular expression's $\sum$ constructor
   320 and plain regular expression's $+$ constructor having different arity.
   322 and plain regular expression's $+$ constructor having different arity.
   321 \begin{center}
   323 \begin{center}
   322 \begin{tabular}{ccc}
   324 	\begin{tabular}{ccc}
   323 $\erase \; _{bs}\sum [] $ & $\dn$ & $\ZERO$\\
   325 		$\erase \; _{bs}\sum [] $ & $\dn$ & $\ZERO$\\
   324 $\erase \; _{bs}\sum [a]$ & $\dn$ & $a$\\
   326 		$\erase \; _{bs}\sum [a]$ & $\dn$ & $a$\\
   325 $\erase \; _{bs}\sum a :: as$ & $\dn$ & $a + (\erase \; _{[]} \sum as)\quad \text{if $as$ length over 1}$
   327 		$\erase \; _{bs}\sum a :: as$ & $\dn$ & $a + (\erase \; _{[]} \sum as)\quad \text{if $as$ length over 1}$
   326 \end{tabular}
   328 	\end{tabular}
   327 \end{center}
   329 \end{center}
   328 \noindent
   330 \noindent
   329 An alternative regular expression with an empty list of children
   331 An alternative regular expression with an empty list of children
   330  is turned into a $\ZERO$ during the
   332 is turned into a $\ZERO$ during the
   331 $\erase$ function, thereby changing the size and structure of the regex.
   333 $\erase$ function, thereby changing the size and structure of the regex.
   332 Therefore the equality in question does not hold.
   334 Therefore the equality in question does not hold.
   333 
   335 
   334 These will likely be fixable if we really want to use plain $\rexp$s for dealing
   336 These will likely be fixable if we really want to use plain $\rexp$s for dealing
   335 with size, but we choose a more straightforward (or stupid) method by 
   337 with size, but we choose a more straightforward (or stupid) method by 
   337 Similarly we could define the derivative  and simplification on 
   339 Similarly we could define the derivative  and simplification on 
   338 $\rrexp$, which would be identical to those we defined for plain $\rexp$s in chapter1, 
   340 $\rrexp$, which would be identical to those we defined for plain $\rexp$s in chapter1, 
   339 except that now they can operate on alternatives taking multiple arguments.
   341 except that now they can operate on alternatives taking multiple arguments.
   340 
   342 
   341 \begin{center}
   343 \begin{center}
   342 \begin{tabular}{lcr}
   344 	\begin{tabular}{lcr}
   343 	$(\RALTS{rs})\; \backslash c$ & $\dn$ &  $\RALTS{\map\; (\_ \backslash c) \;rs}$\\
   345 		$(\RALTS{rs})\; \backslash c$ & $\dn$ &  $\RALTS{\map\; (\_ \backslash c) \;rs}$\\
   344 (other clauses omitted)
   346 		(other clauses omitted)
   345 With the new $\rrexp$ datatype in place, one can define its size function,
   347 		With the new $\rrexp$ datatype in place, one can define its size function,
   346 which precisely mirrors that of the annotated regular expressions:
   348 		which precisely mirrors that of the annotated regular expressions:
   347 \end{tabular}
   349 	\end{tabular}
   348 \end{center}
   350 \end{center}
   349 \noindent
   351 \noindent
   350 \begin{center}
   352 \begin{center}
   351 \begin{tabular}{ccc}
   353 	\begin{tabular}{ccc}
   352 	$\llbracket _{bs}\ONE \rrbracket_r$ & $\dn$ & $1$\\
   354 		$\llbracket _{bs}\ONE \rrbracket_r$ & $\dn$ & $1$\\
   353 	$\llbracket \ZERO \rrbracket_r$ & $\dn$ & $1$ \\
   355 		$\llbracket \ZERO \rrbracket_r$ & $\dn$ & $1$ \\
   354 	$\llbracket _{bs} r_1 \cdot r_2 \rrbracket_r$ & $\dn$ & $\llbracket r_1 \rrbracket_r + \llbracket r_2 \rrbracket_r + 1$\\
   356 		$\llbracket _{bs} r_1 \cdot r_2 \rrbracket_r$ & $\dn$ & $\llbracket r_1 \rrbracket_r + \llbracket r_2 \rrbracket_r + 1$\\
   355 $\llbracket _{bs}\mathbf{c} \rrbracket_r $ & $\dn$ & $1$\\
   357 		$\llbracket _{bs}\mathbf{c} \rrbracket_r $ & $\dn$ & $1$\\
   356 $\llbracket _{bs}\sum as \rrbracket_r $ & $\dn$ & $\map \; (\llbracket \_ \rrbracket_r)\; as   + 1$\\
   358 		$\llbracket _{bs}\sum as \rrbracket_r $ & $\dn$ & $\map \; (\llbracket \_ \rrbracket_r)\; as   + 1$\\
   357 $\llbracket _{bs} a^* \rrbracket_r $ & $\dn$ & $\llbracket a \rrbracket_r + 1$
   359 		$\llbracket _{bs} a^* \rrbracket_r $ & $\dn$ & $\llbracket a \rrbracket_r + 1$
   358 \end{tabular}
   360 	\end{tabular}
   359 \end{center}
   361 \end{center}
   360 \noindent
   362 \noindent
   361 
   363 
   362 \subsection{Lexing Related Functions for $\rrexp$}
   364 \subsection{Lexing Related Functions for $\rrexp$}
   363 Everything else for $\rrexp$ will be precisely the same for annotated expressions,
   365 Everything else for $\rrexp$ will be precisely the same for annotated expressions,
   364 except that they do not involve rectifying and augmenting bit-encoded tokenization information.
   366 except that they do not involve rectifying and augmenting bit-encoded tokenization information.
   365 As expected, most functions are simpler, such as the derivative:
   367 As expected, most functions are simpler, such as the derivative:
   366 \begin{center}
   368 \begin{center}
   367   \begin{tabular}{@{}lcl@{}}
   369 	\begin{tabular}{@{}lcl@{}}
   368   $(\ZERO)\,\backslash_r c$ & $\dn$ & $\ZERO$\\  
   370 		$(\ZERO)\,\backslash_r c$ & $\dn$ & $\ZERO$\\  
   369   $(\ONE)\,\backslash_r c$ & $\dn$ &
   371 		$(\ONE)\,\backslash_r c$ & $\dn$ &
   370         $\textit{if}\;c=d\; \;\textit{then}\;
   372 		$\textit{if}\;c=d\; \;\textit{then}\;
   371          \ONE\;\textit{else}\;\ZERO$\\  
   373 		\ONE\;\textit{else}\;\ZERO$\\  
   372   $(\sum \;\textit{rs})\,\backslash_r c$ & $\dn$ &
   374 		$(\sum \;\textit{rs})\,\backslash_r c$ & $\dn$ &
   373   $\sum\;(\textit{map} \; (\_\backslash_r c) \; rs )$\\
   375 		$\sum\;(\textit{map} \; (\_\backslash_r c) \; rs )$\\
   374   $(r_1\cdot r_2)\,\backslash_r c$ & $\dn$ &
   376 		$(r_1\cdot r_2)\,\backslash_r c$ & $\dn$ &
   375      $\textit{if}\;\textit{rnullable}\,r_1$\\
   377 		$\textit{if}\;\textit{rnullable}\,r_1$\\
   376 					       & &$\textit{then}\;\sum\,[(r_1\,\backslash_r c)\cdot\,r_2,$\\
   378 						 & &$\textit{then}\;\sum\,[(r_1\,\backslash_r c)\cdot\,r_2,$\\
   377 					       & &$\phantom{\textit{then},\;\sum\,}((r_2\,\backslash_r c))]$\\
   379 						 & &$\phantom{\textit{then},\;\sum\,}((r_2\,\backslash_r c))]$\\
   378   & &$\textit{else}\;\,(r_1\,\backslash_r c)\cdot r_2$\\
   380 						 & &$\textit{else}\;\,(r_1\,\backslash_r c)\cdot r_2$\\
   379   $(r^*)\,\backslash_r c$ & $\dn$ &
   381 		$(r^*)\,\backslash_r c$ & $\dn$ &
   380       $( r\,\backslash_r c)\cdot
   382 		$( r\,\backslash_r c)\cdot
   381        (_{[]}r^*))$
   383 		(_{[]}r^*))$
   382 \end{tabular}    
   384 	\end{tabular}    
   383 \end{center}  
   385 \end{center}  
   384 \noindent
   386 \noindent
   385 The simplification function is simplified without annotation causing superficial differences.
   387 The simplification function is simplified without annotation causing superficial differences.
   386 Duplicate removal without  an equivalence relation:
   388 Duplicate removal without  an equivalence relation:
   387 \begin{center}
   389 \begin{center}
   388 \begin{tabular}{lcl}
   390 	\begin{tabular}{lcl}
   389 	$\rdistinct{[]}{rset} $ & $\dn$ & $[]$\\
   391 		$\rdistinct{[]}{rset} $ & $\dn$ & $[]$\\
   390 $\rdistinct{r :: rs}{rset}$ & $\dn$ & $\textit{if}(r \in \textit{rset}) \; \textit{then} \; \rdistinct{rs}{rset}$\\
   392 		$\rdistinct{r :: rs}{rset}$ & $\dn$ & $\textit{if}(r \in \textit{rset}) \; \textit{then} \; \rdistinct{rs}{rset}$\\
   391            			    &        & $\textit{else}\; r::\rdistinct{rs}{(rset \cup \{r\})}$
   393 					    &        & $\textit{else}\; r::\rdistinct{rs}{(rset \cup \{r\})}$
   392 \end{tabular}
   394 	\end{tabular}
   393 \end{center}
   395 \end{center}
   394 %TODO: definition of rsimp (maybe only the alternative clause)
   396 %TODO: definition of rsimp (maybe only the alternative clause)
   395 \noindent
   397 \noindent
   396 The prefix $r$ in front of $\rdistinct{}{}$ is used mainly to 
   398 The prefix $r$ in front of $\rdistinct{}{}$ is used mainly to 
   397 differentiate with $\textit{distinct}$, which is a built-in predicate
   399 differentiate with $\textit{distinct}$, which is a built-in predicate
   416 \noindent
   418 \noindent
   417 In the following content, we will focus on $\rrexp$'s size bound.
   419 In the following content, we will focus on $\rrexp$'s size bound.
   418 We will piece together this bound and show the same bound for annotated regular 
   420 We will piece together this bound and show the same bound for annotated regular 
   419 expressions in the end.
   421 expressions in the end.
   420 Unless stated otherwise in this chapter all $\textit{rexp}$s without
   422 Unless stated otherwise in this chapter all $\textit{rexp}$s without
   421  bitcodes are seen as $\rrexp$s.
   423 bitcodes are seen as $\rrexp$s.
   422  We also use $r_1 + r_2$ and $\RALTS{[r_1, r_2]}$ interchageably
   424 We also use $r_1 + r_2$ and $\RALTS{[r_1, r_2]}$ interchageably
   423  as the former suits people's intuitive way of stating a binary alternative
   425 as the former suits people's intuitive way of stating a binary alternative
   424  regular expression.
   426 regular expression.
   425 
   427 
   426 
   428 
   427 
   429 
   428 %-----------------------------------
   430 %-----------------------------------
   429 %	SECTION ?
   431 %	SECTION ?
   430 %-----------------------------------
   432 %-----------------------------------
   431  \subsection{Finiteness Proof Using $\rrexp$s}
   433 \subsection{Finiteness Proof Using $\rrexp$s}
   432  Now that we have defined the $\rrexp$ datatype, and proven that its size changes
   434 Now that we have defined the $\rrexp$ datatype, and proven that its size changes
   433  w.r.t derivatives and simplifications mirrors precisely those of annotated regular expressions,
   435 w.r.t derivatives and simplifications mirrors precisely those of annotated regular expressions,
   434  we aim to bound the size of $r \backslash s$ for any $\rrexp$  $r$.
   436 we aim to bound the size of $r \backslash s$ for any $\rrexp$  $r$.
   435  Once we have a bound like: 
   437 Once we have a bound like: 
   436  \[
   438 \[
   437 	 \llbracket r \backslash_{rsimp} s \rrbracket_r \leq N_r
   439 	\llbracket r \backslash_{rsimp} s \rrbracket_r \leq N_r
   438  \]
   440 \]
   439  \noindent
   441 \noindent
   440  we could easily extend that to 
   442 we could easily extend that to 
   441  \[
   443 \[
   442 	 \llbracket a \backslash_{bsimps} s \rrbracket \leq N_r.
   444 	\llbracket a \backslash_{bsimps} s \rrbracket \leq N_r.
   443  \]
   445 \]
   444 
   446 
   445  \subsection{Roadmap to a Bound for $\textit{Rrexp}$}
   447 \subsection{Roadmap to a Bound for $\textit{Rrexp}$}
   446 
   448 
   447 The way we obtain the bound for $\rrexp$s is by two steps:
   449 The way we obtain the bound for $\rrexp$s is by two steps:
   448 \begin{itemize}
   450 \begin{itemize}
   449 	\item
   451 	\item
   450 		First, we rewrite $r\backslash s$ into something else that is easier
   452 		First, we rewrite $r\backslash s$ into something else that is easier
   457 		by estimation, since $\distinctBy$ and $\flts$ are well-behaved and working together would only 
   459 		by estimation, since $\distinctBy$ and $\flts$ are well-behaved and working together would only 
   458 		reduce the size of a regular expression, not adding to it.
   460 		reduce the size of a regular expression, not adding to it.
   459 \end{itemize}
   461 \end{itemize}
   460 
   462 
   461 \section{Step One: Closed Forms}
   463 \section{Step One: Closed Forms}
   462 		We transform the function application $\rderssimp{r}{s}$
   464 We transform the function application $\rderssimp{r}{s}$
   463 		into an equivalent 
   465 into an equivalent 
   464 		form $f\; (g \; (\sum rs))$.
   466 form $f\; (g \; (\sum rs))$.
   465 		The functions $f$ and $g$ can be anything from $\flts$, $\distinctBy$ and other helper functions from $\bsimp{\_}$.
   467 The functions $f$ and $g$ can be anything from $\flts$, $\distinctBy$ and other helper functions from $\bsimp{\_}$.
   466 		This way we get a different but equivalent way of expressing : $r\backslash s = f \; (g\; (\sum rs))$, we call the
   468 This way we get a different but equivalent way of expressing : $r\backslash s = f \; (g\; (\sum rs))$, we call the
   467 		right hand side the "closed form" of $r\backslash s$.
   469 right hand side the "closed form" of $r\backslash s$.
   468 
   470 
   469 \begin{quote}\it
   471 \begin{quote}\it
   470 	Claim: For regular expressions $r_1 \cdot r_2$, we claim that
   472 	Claim: For regular expressions $r_1 \cdot r_2$, we claim that
   471 	\begin{center}
   473 	\begin{center}
   472 		$ \rderssimp{r_1 \cdot r_2}{s} = 
   474 		$ \rderssimp{r_1 \cdot r_2}{s} = 
   473 	\rsimp{(\sum (r_1 \backslash s \cdot r_2 ) \; :: \;(\map \; \rderssimp{r2}{\_} \;(\vsuf{s}{r_1})))}$
   475 		\rsimp{(\sum (r_1 \backslash s \cdot r_2 ) \; :: \;(\map \; \rderssimp{r2}{\_} \;(\vsuf{s}{r_1})))}$
   474 	\end{center}
   476 	\end{center}
   475 \end{quote}
   477 \end{quote}
   476 \noindent
   478 \noindent
   477 We explain in detail how we reached those claims.
   479 We explain in detail how we reached those claims.
   478 \subsection{Basic Properties needed for Closed Forms}
   480 \subsection{Basic Properties needed for Closed Forms}
   502 and we omit the precise definition here.
   504 and we omit the precise definition here.
   503 \begin{proof}
   505 \begin{proof}
   504 	The first part is by an induction on $rs$.
   506 	The first part is by an induction on $rs$.
   505 	The second and third part can be proven by using the 
   507 	The second and third part can be proven by using the 
   506 	induction rules of $\rdistinct{\_}{\_}$.
   508 	induction rules of $\rdistinct{\_}{\_}$.
   507 	
   509 
   508 \end{proof}
   510 \end{proof}
   509 
   511 
   510 \noindent
   512 \noindent
   511 $\rdistinct{\_}{\_}$ will cancel out all regular expression terms
   513 $\rdistinct{\_}{\_}$ will cancel out all regular expression terms
   512 that are in the accumulator, therefore prepending a list $rs_a$ with an arbitrary
   514 that are in the accumulator, therefore prepending a list $rs_a$ with an arbitrary
   535 \begin{lemma}
   537 \begin{lemma}
   536 	The accumulator can be augmented to include elements not appearing in the input list,
   538 	The accumulator can be augmented to include elements not appearing in the input list,
   537 	and the output will not change.	
   539 	and the output will not change.	
   538 	\begin{itemize}
   540 	\begin{itemize}
   539 		\item
   541 		\item
   540 		If $r \notin rs$, then $\rdistinct{rs}{acc} = \rdistinct{rs}{\{r\} \cup acc}$.
   542 			If $r \notin rs$, then $\rdistinct{rs}{acc} = \rdistinct{rs}{\{r\} \cup acc}$.
   541 		\item
   543 		\item
   542 			Particularly, when $acc = \varnothing$ and $rs$ de-duplicated, we have\\
   544 			Particularly, when $acc = \varnothing$ and $rs$ de-duplicated, we have\\
   543 			\[ \rdistinct{rs}{\varnothing} = rs \]
   545 			\[ \rdistinct{rs}{\varnothing} = rs \]
   544 	\end{itemize}
   546 	\end{itemize}
   545 \end{lemma}
   547 \end{lemma}
   558 	= rs_1@(\textit{rdistinct} rsa \; (acc \cup rs_1))\]
   560 	= rs_1@(\textit{rdistinct} rsa \; (acc \cup rs_1))\]
   559 \end{lemma}
   561 \end{lemma}
   560 \noindent
   562 \noindent
   561 In other words, it can be taken out and left untouched in the output.
   563 In other words, it can be taken out and left untouched in the output.
   562 \begin{proof}
   564 \begin{proof}
   563 By an induction on $rs_1$, where $rsa$ and $acc$ are allowed to be arbitrary.
   565 	By an induction on $rs_1$, where $rsa$ and $acc$ are allowed to be arbitrary.
   564 \end{proof}
   566 \end{proof}
   565 \noindent
   567 \noindent
   566 $\rdistinct{}{}$ removes any element in anywhere of a list, if it
   568 $\rdistinct{}{}$ removes any element in anywhere of a list, if it
   567 had appeared previously:
   569 had appeared previously:
   568 \begin{lemma}\label{distinctRemovesMiddle}
   570 \begin{lemma}\label{distinctRemovesMiddle}
   574 			$\rdistinct{(ab :: rs @ [ab])}{rset'} = \rdistinct{(ab :: rs)}{rset'}$
   576 			$\rdistinct{(ab :: rs @ [ab])}{rset'} = \rdistinct{(ab :: rs)}{rset'}$
   575 		\item
   577 		\item
   576 			$\rdistinct{ (rs @ rs') }{rset} = \rdistinct{rs @ [r] @ rs'}{rset}$\\
   578 			$\rdistinct{ (rs @ rs') }{rset} = \rdistinct{rs @ [r] @ rs'}{rset}$\\
   577 			and\\
   579 			and\\
   578 			$\rdistinct{(ab :: rs @ [ab] @ rs'')}{rset'} = 
   580 			$\rdistinct{(ab :: rs @ [ab] @ rs'')}{rset'} = 
   579 			 \rdistinct{(ab :: rs @ rs'')}{rset'}$
   581 			\rdistinct{(ab :: rs @ rs'')}{rset'}$
   580 	\end{itemize}
   582 	\end{itemize}
   581 \end{lemma}
   583 \end{lemma}
   582 \noindent
   584 \noindent
   583 \begin{proof}
   585 \begin{proof}
   584 By induction on $rs$. All other variables are allowed to be arbitrary.
   586 	By induction on $rs$. All other variables are allowed to be arbitrary.
   585 The second half of the lemma requires the first half.
   587 	The second half of the lemma requires the first half.
   586 Note that for each half's two sub-propositions need to be proven concurrently,
   588 	Note that for each half's two sub-propositions need to be proven concurrently,
   587 so that the induction goes through.
   589 	so that the induction goes through.
   588 \end{proof}
   590 \end{proof}
   589 
   591 
   590 \noindent
   592 \noindent
   591 This allows us to prove ``Idempotency" of $\rdistinct{}{}$ of some kind:
   593 This allows us to prove ``Idempotency" of $\rdistinct{}{}$ of some kind:
   592 \begin{lemma}\label{rdistinctConcatGeneral}
   594 \begin{lemma}\label{rdistinctConcatGeneral}
   656 
   658 
   657 \subsubsection{The $RL$ Function: Language Interpretation of $\textit{Rrexp}$s}
   659 \subsubsection{The $RL$ Function: Language Interpretation of $\textit{Rrexp}$s}
   658 Much like the definition of $L$ on plain regular expressions, one could also 
   660 Much like the definition of $L$ on plain regular expressions, one could also 
   659 define the language interpretation of $\rrexp$s.
   661 define the language interpretation of $\rrexp$s.
   660 \begin{center}
   662 \begin{center}
   661 \begin{tabular}{lcl}
   663 	\begin{tabular}{lcl}
   662 $RL \; (\ZERO)$ & $\dn$ & $\phi$\\
   664 		$RL \; (\ZERO)$ & $\dn$ & $\phi$\\
   663 $RL \; (\ONE)$ & $\dn$ & $\{[]\}$\\
   665 		$RL \; (\ONE)$ & $\dn$ & $\{[]\}$\\
   664 $RL \; (c)$ & $\dn$ & $\{[c]\}$\\
   666 		$RL \; (c)$ & $\dn$ & $\{[c]\}$\\
   665 $RL \; \sum rs$ & $\dn$ & $ \bigcup_{r \in rs} (RL \; r)$\\
   667 		$RL \; \sum rs$ & $\dn$ & $ \bigcup_{r \in rs} (RL \; r)$\\
   666 $RL \; (r_1 \cdot r_2)$ & $\dn$ & $ RL \; (r_1) @ RL \; (r_2)$\\
   668 		$RL \; (r_1 \cdot r_2)$ & $\dn$ & $ RL \; (r_1) @ RL \; (r_2)$\\
   667 $RL \; (r^*)$ & $\dn$ & $ (RL(r))^*$
   669 		$RL \; (r^*)$ & $\dn$ & $ (RL(r))^*$
   668 \end{tabular}
   670 	\end{tabular}
   669 \end{center}
   671 \end{center}
   670 \noindent
   672 \noindent
   671 The main use of $RL$ is to establish some connections between $\rsimp{}$ 
   673 The main use of $RL$ is to establish some connections between $\rsimp{}$ 
   672 and $\rnullable{}$:
   674 and $\rnullable{}$:
   673 \begin{lemma}
   675 \begin{lemma}
   682 \begin{proof}
   684 \begin{proof}
   683 	The first part is by induction on $r$. 
   685 	The first part is by induction on $r$. 
   684 	The second part is true because property 
   686 	The second part is true because property 
   685 	\[ RL \; r = RL \; (\rsimp{r})\] holds.
   687 	\[ RL \; r = RL \; (\rsimp{r})\] holds.
   686 \end{proof}
   688 \end{proof}
   687 	
   689 
   688 \subsubsection{$\rsimp{}$ is Non-Increasing}
   690 \subsubsection{$\rsimp{}$ is Non-Increasing}
   689 In this subsection, we prove that the function $\rsimp{}$ does not 
   691 In this subsection, we prove that the function $\rsimp{}$ does not 
   690 make the $\llbracket \rrbracket_r$ size increase.
   692 make the $\llbracket \rrbracket_r$ size increase.
   691 
   693 
   692 
   694 
   707 		$\good\; \RCHAR{c}$ & $\dn$ & $\btrue$\\
   709 		$\good\; \RCHAR{c}$ & $\dn$ & $\btrue$\\
   708 		$\good\; \RALTS{[]}$ & $\dn$ & $\bfalse$\\
   710 		$\good\; \RALTS{[]}$ & $\dn$ & $\bfalse$\\
   709 		$\good\; \RALTS{[r]}$ & $\dn$ & $\bfalse$\\
   711 		$\good\; \RALTS{[r]}$ & $\dn$ & $\bfalse$\\
   710 		$\good\; \RALTS{r_1 :: r_2 :: rs}$ & $\dn$ & 
   712 		$\good\; \RALTS{r_1 :: r_2 :: rs}$ & $\dn$ & 
   711 		$\textit{isDistinct} \; (r_1 :: r_2 :: rs) \;$\\
   713 		$\textit{isDistinct} \; (r_1 :: r_2 :: rs) \;$\\
   712 		& & $\textit{and}\; (\forall r' \in (r_1 :: r_2 :: rs).\; \good \; r'\; \,  \textit{and}\; \, \textit{nonAlt}\; r')$\\
   714 						   & & $\textit{and}\; (\forall r' \in (r_1 :: r_2 :: rs).\; \good \; r'\; \,  \textit{and}\; \, \textit{nonAlt}\; r')$\\
   713 		$\good \; \RSEQ{\RZERO}{r}$ & $\dn$ & $\bfalse$\\
   715 		$\good \; \RSEQ{\RZERO}{r}$ & $\dn$ & $\bfalse$\\
   714 		$\good \; \RSEQ{\RONE}{r}$ & $\dn$ & $\bfalse$\\
   716 		$\good \; \RSEQ{\RONE}{r}$ & $\dn$ & $\bfalse$\\
   715 		$\good \; \RSEQ{r}{\RZERO}$ & $\dn$ & $\bfalse$\\
   717 		$\good \; \RSEQ{r}{\RZERO}$ & $\dn$ & $\bfalse$\\
   716 		$\good \; \RSEQ{r_1}{r_2}$ & $\dn$ & $\good \; r_1 \;\, \textit{and} \;\, \good \; r_2$\\
   718 		$\good \; \RSEQ{r_1}{r_2}$ & $\dn$ & $\good \; r_1 \;\, \textit{and} \;\, \good \; r_2$\\
   717 		$\good \; \RSTAR{r}$ & $\dn$ & $\btrue$\\
   719 		$\good \; \RSTAR{r}$ & $\dn$ & $\btrue$\\
   848 	(\rdistinct{(\rflts \; (\map \; \rsimp{}\; rs))}{\varnothing})) :: 
   850 	(\rdistinct{(\rflts \; (\map \; \rsimp{}\; rs))}{\varnothing})) :: 
   849 	\map \; \rsimp{} \; rs' ) = 
   851 	\map \; \rsimp{} \; rs' ) = 
   850 	\rflts \; ( (\rdistinct{(\rflts \; (\map \; \rsimp{}\; rs))}{\varnothing}) @ (
   852 	\rflts \; ( (\rdistinct{(\rflts \; (\map \; \rsimp{}\; rs))}{\varnothing}) @ (
   851 	\map \; \rsimp{rs'}))$
   853 	\map \; \rsimp{rs'}))$
   852 
   854 
   853 	
   855 
   854 \end{lemma}
   856 \end{lemma}
   855 \begin{proof}
   857 \begin{proof}
   856 	By \ref{good1}.
   858 	By \ref{good1}.
   857 \end{proof}
   859 \end{proof}
   858 \noindent
   860 \noindent
   866 The idempotency of $\rsimp$ is very useful in 
   868 The idempotency of $\rsimp$ is very useful in 
   867 manipulating regular expression terms into desired
   869 manipulating regular expression terms into desired
   868 forms so that key steps allowing further rewriting to closed forms
   870 forms so that key steps allowing further rewriting to closed forms
   869 are possible.
   871 are possible.
   870 \begin{lemma}\label{rsimpIdem}
   872 \begin{lemma}\label{rsimpIdem}
   871 $\rsimp{r} = \rsimp{\rsimp{r}}$
   873 	$\rsimp{r} = \rsimp{\rsimp{r}}$
   872 \end{lemma}
   874 \end{lemma}
   873 
   875 
   874 \begin{proof}
   876 \begin{proof}
   875 	By \ref{test} and \ref{good1}.
   877 	By \ref{test} and \ref{good1}.
   876 \end{proof}
   878 \end{proof}
   916 we can start proving some key equalities leading to the 
   918 we can start proving some key equalities leading to the 
   917 closed forms.
   919 closed forms.
   918 Now presented are a few equivalent terms under $\rsimp{}$.
   920 Now presented are a few equivalent terms under $\rsimp{}$.
   919 We use $r_1 \sequal r_2 $ here to denote $\rsimp{r_1} = \rsimp{r_2}$.
   921 We use $r_1 \sequal r_2 $ here to denote $\rsimp{r_1} = \rsimp{r_2}$.
   920 \begin{lemma}
   922 \begin{lemma}
   921 \begin{itemize}
   923 	\begin{itemize}
   922 	The following equivalence hold:
   924 		The following equivalence hold:
   923 	\item
   925 	\item
   924 		$\rsimpalts \; (\RZERO :: rs) \sequal \rsimpalts\; rs$
   926 		$\rsimpalts \; (\RZERO :: rs) \sequal \rsimpalts\; rs$
   925 	\item
   927 	\item
   926 		$\rsimpalts \; rs \sequal \rsimpalts (\map \; \rsimp{} \; rs)$
   928 		$\rsimpalts \; rs \sequal \rsimpalts (\map \; \rsimp{} \; rs)$
   927 	\item
   929 	\item
   964 		\rsimpalts \; (\rdistinct{(\rflts \; rs' @ \rflts \; rs @ \rflts \; rs'')}{
   966 		\rsimpalts \; (\rdistinct{(\rflts \; rs' @ \rflts \; rs @ \rflts \; rs'')}{
   965 		\varnothing})$\\
   967 		\varnothing})$\\
   966 		$\stackrel{by \ref{rdistinctConcatGeneral}}{=}
   968 		$\stackrel{by \ref{rdistinctConcatGeneral}}{=}
   967 		\rsimpalts \; (\rdistinct{\rflts \; rs'}{\varnothing} @ \rdistinct{(
   969 		\rsimpalts \; (\rdistinct{\rflts \; rs'}{\varnothing} @ \rdistinct{(
   968 		\rflts\; rs @ \rflts \; rs'')}{\rflts \; rs'})$\\
   970 		\rflts\; rs @ \rflts \; rs'')}{\rflts \; rs'})$\\
   969 		
   971 
   970 	\end{center}
   972 	\end{center}
   971 \end{comment}
   973 \end{comment}
   972 %Rewriting steps not put in--too long and complicated-------------------------------
   974 %Rewriting steps not put in--too long and complicated-------------------------------
   973 \noindent
   975 \noindent
   974 We need more equalities like the above to enable a closed form,
   976 We need more equalities like the above to enable a closed form,
   996 to mean atomic simplification transitions 
   998 to mean atomic simplification transitions 
   997 of $\rrexp$s and $\rrexp$ lists, respectively.
   999 of $\rrexp$s and $\rrexp$ lists, respectively.
   998 
  1000 
   999 
  1001 
  1000 
  1002 
  1001 	List of one-step rewrite rules for $\rrexp$ ($\hrewrite$):
  1003 List of one-step rewrite rules for $\rrexp$ ($\hrewrite$):
  1002 
  1004 
  1003 
  1005 
  1004 \begin{center}
  1006 \begin{center}
  1005 \begin{mathpar}
  1007 	\begin{mathpar}
  1006 	\inferrule[RSEQ0L]{}{\RZERO \cdot r_2 \hrewrite \RZERO\\}
  1008 		\inferrule[RSEQ0L]{}{\RZERO \cdot r_2 \hrewrite \RZERO\\}
  1007 
  1009 
  1008 	\inferrule[RSEQ0R]{}{r_1 \cdot \RZERO \hrewrite \RZERO\\}
  1010 		\inferrule[RSEQ0R]{}{r_1 \cdot \RZERO \hrewrite \RZERO\\}
  1009 
  1011 
  1010 	\inferrule[RSEQ1]{}{(\RONE \cdot r) \hrewrite  r\\}\\	
  1012 		\inferrule[RSEQ1]{}{(\RONE \cdot r) \hrewrite  r\\}\\	
  1011 	
  1013 
  1012 	\inferrule[RSEQL]{ r_1 \hrewrite r_2}{r_1 \cdot r_3 \hrewrite r_2 \cdot r_3\\}
  1014 		\inferrule[RSEQL]{ r_1 \hrewrite r_2}{r_1 \cdot r_3 \hrewrite r_2 \cdot r_3\\}
  1013 
  1015 
  1014 	\inferrule[RSEQR]{ r_3 \hrewrite r_4}{r_1 \cdot r_3 \hrewrite r_1 \cdot r_4\\}\\
  1016 		\inferrule[RSEQR]{ r_3 \hrewrite r_4}{r_1 \cdot r_3 \hrewrite r_1 \cdot r_4\\}\\
  1015 
  1017 
  1016 	\inferrule[RALTSChild]{r \hrewrite r'}{\sum (rs_1 @ [r] @ rs_2) \hrewrite \sum (rs_1 @ [r'] @ rs_2)\\}
  1018 		\inferrule[RALTSChild]{r \hrewrite r'}{\sum (rs_1 @ [r] @ rs_2) \hrewrite \sum (rs_1 @ [r'] @ rs_2)\\}
  1017 
  1019 
  1018 	\inferrule[RALTS0]{}{\sum (rs_a @ [\RZERO] @ rs_b) \hrewrite \sum (rs_a @ rs_b)}
  1020 		\inferrule[RALTS0]{}{\sum (rs_a @ [\RZERO] @ rs_b) \hrewrite \sum (rs_a @ rs_b)}
  1019 
  1021 
  1020 	\inferrule[RALTSNested]{}{\sum (rs_a @ [\sum rs_1] @ rs_b) \hrewrite \sum (rs_a @ rs_1 @ rs_b)}
  1022 		\inferrule[RALTSNested]{}{\sum (rs_a @ [\sum rs_1] @ rs_b) \hrewrite \sum (rs_a @ rs_1 @ rs_b)}
  1021 
  1023 
  1022 	\inferrule[RALTSNil]{}{ \sum [] \hrewrite \RZERO\\}
  1024 		\inferrule[RALTSNil]{}{ \sum [] \hrewrite \RZERO\\}
  1023 
  1025 
  1024 	\inferrule[RALTSSingle]{}{ \sum [r] \hrewrite  r\\}	
  1026 		\inferrule[RALTSSingle]{}{ \sum [r] \hrewrite  r\\}	
  1025 
  1027 
  1026 	\inferrule[RALTSDelete]{\\ r_1 = r_2}{\sum rs_a @ [r_1] @ rs_b @ [r_2] @ rsc \hrewrite \sum rs_a @ [r_1] @ rs_b @ rs_c}
  1028 		\inferrule[RALTSDelete]{\\ r_1 = r_2}{\sum rs_a @ [r_1] @ rs_b @ [r_2] @ rsc \hrewrite \sum rs_a @ [r_1] @ rs_b @ rs_c}
  1027 
  1029 
  1028 \end{mathpar}
  1030 	\end{mathpar}
  1029 \end{center}
  1031 \end{center}
  1030 
  1032 
  1031 
  1033 
  1032 	List of rewrite rules for a list of regular expressions,
  1034 List of rewrite rules for a list of regular expressions,
  1033 	where each element can rewrite in many steps to the other (scf stands for
  1035 where each element can rewrite in many steps to the other (scf stands for
  1034 	li\emph{s}t \emph{c}losed \emph{f}orm). This relation is similar to the 
  1036 li\emph{s}t \emph{c}losed \emph{f}orm). This relation is similar to the 
  1035 	$\stackrel{s*}{\rightsquigarrow}$ for annotated regular expressions.
  1037 $\stackrel{s*}{\rightsquigarrow}$ for annotated regular expressions.
  1036 
  1038 
  1037 \begin{center}
  1039 \begin{center}
  1038 \begin{mathpar}
  1040 	\begin{mathpar}
  1039 	\inferrule{}{[] \scfrewrites [] }
  1041 		\inferrule{}{[] \scfrewrites [] }
  1040 	\inferrule{r \hrewrites r' \\ rs \scfrewrites rs'}{r :: rs \scfrewrites r' :: rs'}
  1042 		\inferrule{r \hrewrites r' \\ rs \scfrewrites rs'}{r :: rs \scfrewrites r' :: rs'}
  1041 \end{mathpar}
  1043 	\end{mathpar}
  1042 \end{center}
  1044 \end{center}
  1043 %frewrite
  1045 %frewrite
  1044 	List of one-step rewrite rules for flattening 
  1046 List of one-step rewrite rules for flattening 
  1045 	a list of  regular expressions($\frewrite$):
  1047 a list of  regular expressions($\frewrite$):
  1046 \begin{center}
  1048 \begin{center}
  1047 \begin{mathpar}
  1049 	\begin{mathpar}
  1048 	\inferrule{}{\RZERO :: rs \frewrite rs \\}
  1050 		\inferrule{}{\RZERO :: rs \frewrite rs \\}
  1049 
  1051 
  1050 	\inferrule{}{(\sum rs) :: rs_a \frewrite rs @ rs_a \\}
  1052 		\inferrule{}{(\sum rs) :: rs_a \frewrite rs @ rs_a \\}
  1051 
  1053 
  1052 	\inferrule{rs_1 \frewrite rs_2}{r :: rs_1 \frewrite r :: rs_2}
  1054 		\inferrule{rs_1 \frewrite rs_2}{r :: rs_1 \frewrite r :: rs_2}
  1053 \end{mathpar}
  1055 	\end{mathpar}
  1054 \end{center}
  1056 \end{center}
  1055 
  1057 
  1056 	Lists of one-step rewrite rules for flattening and de-duplicating
  1058 Lists of one-step rewrite rules for flattening and de-duplicating
  1057 	a list of regular expressions ($\grewrite$):
  1059 a list of regular expressions ($\grewrite$):
  1058 \begin{center}
  1060 \begin{center}
  1059 \begin{mathpar}
  1061 	\begin{mathpar}
  1060 	\inferrule{}{\RZERO :: rs \grewrite rs \\}
  1062 		\inferrule{}{\RZERO :: rs \grewrite rs \\}
  1061 
  1063 
  1062 	\inferrule{}{(\sum rs) :: rs_a \grewrite rs @ rs_a \\}
  1064 		\inferrule{}{(\sum rs) :: rs_a \grewrite rs @ rs_a \\}
  1063 
  1065 
  1064 	\inferrule{rs_1 \grewrite rs_2}{r :: rs_1 \grewrite r :: rs_2}
  1066 		\inferrule{rs_1 \grewrite rs_2}{r :: rs_1 \grewrite r :: rs_2}
  1065 
  1067 
  1066 	\inferrule[dB]{}{rs_a @ [a] @ rs_b @[a] @ rs_c \grewrite rs_a @ [a] @ rsb @ rsc}
  1068 		\inferrule[dB]{}{rs_a @ [a] @ rs_b @[a] @ rs_c \grewrite rs_a @ [a] @ rsb @ rsc}
  1067 \end{mathpar}
  1069 	\end{mathpar}
  1068 \end{center}
  1070 \end{center}
  1069 
  1071 
  1070 \noindent
  1072 \noindent
  1071 The reason why we take the trouble of defining 
  1073 The reason why we take the trouble of defining 
  1072 two separate list rewriting definitions $\frewrite$ and $\grewrite$
  1074 two separate list rewriting definitions $\frewrite$ and $\grewrite$
  1076 more straightforward to prove.
  1078 more straightforward to prove.
  1077 For example, when proving the closed-form for the alternative regular expression,
  1079 For example, when proving the closed-form for the alternative regular expression,
  1078 one of the rewriting steps would be:
  1080 one of the rewriting steps would be:
  1079 \begin{lemma}
  1081 \begin{lemma}
  1080 	$\sum (\rDistinct \;\; (\map \; (\_ \backslash x) \; (\rflts \; rs)) \;\; \varnothing) \sequal
  1082 	$\sum (\rDistinct \;\; (\map \; (\_ \backslash x) \; (\rflts \; rs)) \;\; \varnothing) \sequal
  1081 	 \sum (\rDistinct \;\;  (\rflts \; (\map \; (\_ \backslash x) \; rs)) \;\; \varnothing)
  1083 	\sum (\rDistinct \;\;  (\rflts \; (\map \; (\_ \backslash x) \; rs)) \;\; \varnothing)
  1082 	 $
  1084 	$
  1083 \end{lemma}
  1085 \end{lemma}
  1084 \noindent
  1086 \noindent
  1085 Proving this is by first showing 
  1087 Proving this is by first showing 
  1086 \begin{lemma}\label{earlyLaterDerFrewrites}
  1088 \begin{lemma}\label{earlyLaterDerFrewrites}
  1087 	$\map \; (\_ \backslash x) \;  (\rflts \; rs) \frewrites
  1089 	$\map \; (\_ \backslash x) \;  (\rflts \; rs) \frewrites
  1097 is a piece of cake.
  1099 is a piece of cake.
  1098 But this trick will not work for $\grewrites$.
  1100 But this trick will not work for $\grewrites$.
  1099 For example, a rewriting step in proving
  1101 For example, a rewriting step in proving
  1100 closed forms is:
  1102 closed forms is:
  1101 \begin{center}
  1103 \begin{center}
  1102 $\rsimp{(\rsimpalts \; (\map \; (\_ \backslash x) \; (\rdistinct{(\rflts \; (\map \; (\rsimp{} \; \circ \; (\lambda r. \rderssimp{r}{xs}))))}{\varnothing})))}$\\
  1104 	$\rsimp{(\rsimpalts \; (\map \; (\_ \backslash x) \; (\rdistinct{(\rflts \; (\map \; (\rsimp{} \; \circ \; (\lambda r. \rderssimp{r}{xs}))))}{\varnothing})))}$\\
  1103 $=$ \\
  1105 	$=$ \\
  1104 $\rsimp{(\rsimpalts \; (\rdistinct{(\map \; (\_ \backslash x) \; (\rflts \; (\map \; (\rsimp{} \; \circ \; (\lambda r. \rderssimp{r}{xs})))) ) }{\varnothing}))} $
  1106 	$\rsimp{(\rsimpalts \; (\rdistinct{(\map \; (\_ \backslash x) \; (\rflts \; (\map \; (\rsimp{} \; \circ \; (\lambda r. \rderssimp{r}{xs})))) ) }{\varnothing}))} $
  1105 \noindent
  1107 	\noindent
  1106 \end{center}
  1108 \end{center}
  1107 For this one would hope to have a rewriting relation between the two lists involved,
  1109 For this one would hope to have a rewriting relation between the two lists involved,
  1108 similar to \ref{earlyLaterDerFrewrites}. However, it turns out that 
  1110 similar to \ref{earlyLaterDerFrewrites}. However, it turns out that 
  1109 \begin{center}
  1111 \begin{center}
  1110 $\map \; (\_ \backslash x) \; (\rDistinct \; rs \; rset) \grewrites \rDistinct \; (\map \;
  1112 	$\map \; (\_ \backslash x) \; (\rDistinct \; rs \; rset) \grewrites \rDistinct \; (\map \;
  1111 (\_ \backslash x) \; rs) \; ( rset \backslash x)$
  1113 	(\_ \backslash x) \; rs) \; ( rset \backslash x)$
  1112 \end{center}
  1114 \end{center}
  1113 \noindent
  1115 \noindent
  1114 does $\mathbf{not}$ hold in general.
  1116 does $\mathbf{not}$ hold in general.
  1115 For this rewriting step we will introduce some slightly more cumbersome
  1117 For this rewriting step we will introduce some slightly more cumbersome
  1116 proof technique in later sections.
  1118 proof technique in later sections.
  1198 	\begin{itemize}
  1200 	\begin{itemize}
  1199 		\item
  1201 		\item
  1200 			If $s \neq []$ then $\rderssimp{r}{s} = \rsimp{(\rders \; r \; s)}$.
  1202 			If $s \neq []$ then $\rderssimp{r}{s} = \rsimp{(\rders \; r \; s)}$.
  1201 		\item
  1203 		\item
  1202 			$\rsimpalts \; (\map \; (\_ \backslash_r x) \;
  1204 			$\rsimpalts \; (\map \; (\_ \backslash_r x) \;
  1203 				(\rdistinct{rs}{\varnothing})) \sequal
  1205 			(\rdistinct{rs}{\varnothing})) \sequal
  1204 			 \rsimpalts \; (\rDistinct \; 
  1206 			\rsimpalts \; (\rDistinct \; 
  1205 			 (\map \; (\_ \backslash_r x) rs) \;\varnothing  )$
  1207 			(\map \; (\_ \backslash_r x) rs) \;\varnothing  )$
  1206 	 \end{itemize}
  1208 	\end{itemize}
  1207  \end{lemma}
  1209 \end{lemma}
  1208 \noindent
  1210 \noindent
  1209 
  1211 
  1210 Finally,
  1212 Finally,
  1211 together with 
  1213 together with 
  1212 \begin{lemma}\label{rderRsimpAltsCommute}
  1214 \begin{lemma}\label{rderRsimpAltsCommute}
  1213 	$\rder{x}{(\rsimpalts \; rs)} = \rsimpalts \; (\map \; (\rder{x}{\_}) \; rs)$
  1215 	$\rder{x}{(\rsimpalts \; rs)} = \rsimpalts \; (\map \; (\rder{x}{\_}) \; rs)$
  1214 \end{lemma}
  1216 \end{lemma}
  1215 \noindent
  1217 \noindent
  1216 this leads to the first closed form--
  1218 this leads to the first closed form--
  1217 \begin{lemma}\label{altsClosedForm}
  1219 \begin{lemma}\label{altsClosedForm}
  1218 \begin{center}
  1220 	\begin{center}
  1219 	$\rderssimp{(\sum rs)}{s} \sequal
  1221 		$\rderssimp{(\sum rs)}{s} \sequal
  1220 	 \sum \; (\map \; (\rderssimp{\_}{s}) \; rs)$
  1222 		\sum \; (\map \; (\rderssimp{\_}{s}) \; rs)$
  1221 \end{center}
  1223 	\end{center}
  1222 \end{lemma}
  1224 \end{lemma}
  1223 	
  1225 
  1224 \noindent
  1226 \noindent
  1225 \begin{proof}
  1227 \begin{proof}
  1226 	By a reverse induction on the string $s$.
  1228 	By a reverse induction on the string $s$.
  1227 	One rewriting step, as we mentioned earlier,
  1229 	One rewriting step, as we mentioned earlier,
  1228 	involves
  1230 	involves
  1230 		$\rsimpalts \; (\map \; (\_ \backslash x) \; 
  1232 		$\rsimpalts \; (\map \; (\_ \backslash x) \; 
  1231 		(\rdistinct{(\rflts \; (\map \; (\rsimp{} \; \circ \; 
  1233 		(\rdistinct{(\rflts \; (\map \; (\rsimp{} \; \circ \; 
  1232 		(\lambda r. \rderssimp{r}{xs}))))}{\varnothing}))
  1234 		(\lambda r. \rderssimp{r}{xs}))))}{\varnothing}))
  1233 		\sequal
  1235 		\sequal
  1234 		\rsimpalts \; (\rdistinct{(\map \; (\_ \backslash x) \; 
  1236 		\rsimpalts \; (\rdistinct{(\map \; (\_ \backslash x) \; 
  1235 		(\rflts \; (\map \; (\rsimp{} \; \circ \; 
  1237 			(\rflts \; (\map \; (\rsimp{} \; \circ \; 
  1236 		(\lambda r. \rderssimp{r}{xs})))) ) }{\varnothing}) $.
  1238 		(\lambda r. \rderssimp{r}{xs})))) ) }{\varnothing}) $.
  1237 	\end{center}
  1239 	\end{center}
  1238 	This can be proven by a combination of 
  1240 	This can be proven by a combination of 
  1239 	\ref{grewritesSimpalts}, \ref{gstarRdistinctGeneral}, \ref{rderRsimpAltsCommute}, and
  1241 	\ref{grewritesSimpalts}, \ref{gstarRdistinctGeneral}, \ref{rderRsimpAltsCommute}, and
  1240 	\ref{insideSimpRemoval}.
  1242 	\ref{insideSimpRemoval}.
  1259 First let's look at a series of derivatives steps on a sequence 
  1261 First let's look at a series of derivatives steps on a sequence 
  1260 regular expression, assuming that each time the first
  1262 regular expression, assuming that each time the first
  1261 component of the sequence is always nullable):
  1263 component of the sequence is always nullable):
  1262 \begin{center}
  1264 \begin{center}
  1263 
  1265 
  1264 $r_1 \cdot r_2 \quad \longrightarrow_{\backslash c}  \quad   r_1  \backslash c \cdot r_2 + r_2 \backslash c \quad \longrightarrow_{\backslash c'} \quad (r_1 \backslash cc' \cdot r_2 + r_2 \backslash c') + r_2 \backslash cc' \longrightarrow_{\backslash c''} \quad$\\
  1266 	$r_1 \cdot r_2 \quad \longrightarrow_{\backslash c}  \quad   r_1  \backslash c \cdot r_2 + r_2 \backslash c \quad \longrightarrow_{\backslash c'} \quad (r_1 \backslash cc' \cdot r_2 + r_2 \backslash c') + r_2 \backslash cc' \longrightarrow_{\backslash c''} \quad$\\
  1265 $((r_1 \backslash cc'c'' \cdot r_2 + r_2 \backslash c'') + r_2 \backslash c'c'') + r_2 \backslash cc'c''   \longrightarrow_{\backslash c''} \quad
  1267 	$((r_1 \backslash cc'c'' \cdot r_2 + r_2 \backslash c'') + r_2 \backslash c'c'') + r_2 \backslash cc'c''   \longrightarrow_{\backslash c''} \quad
  1266  \ldots$
  1268 	\ldots$
  1267 
  1269 
  1268 \end{center}
  1270 \end{center}
  1269 Roughly speaking $r_1 \cdot r_2 \backslash s$ can be expresssed as 
  1271 Roughly speaking $r_1 \cdot r_2 \backslash s$ can be expresssed as 
  1270 a giant alternative taking a list of terms 
  1272 a giant alternative taking a list of terms 
  1271 $[r_1 \backslash_r s \cdot r_2, r_2 \backslash_r s'', r_2 \backslash_r s_1'', \ldots]$,
  1273 $[r_1 \backslash_r s \cdot r_2, r_2 \backslash_r s'', r_2 \backslash_r s_1'', \ldots]$,
  1274 terms of the shape $r_2 \backslash_r s''$, $s''$ being a suffix of $s$.
  1276 terms of the shape $r_2 \backslash_r s''$, $s''$ being a suffix of $s$.
  1275 This intuition is also echoed by IndianPaper, where they gave
  1277 This intuition is also echoed by IndianPaper, where they gave
  1276 a pencil-and-paper derivation of $(r_1 \cdot r_2)\backslash s$:
  1278 a pencil-and-paper derivation of $(r_1 \cdot r_2)\backslash s$:
  1277 \begin{center}
  1279 \begin{center}
  1278 	\begin{tabular}{c}
  1280 	\begin{tabular}{c}
  1279 	$(r_1 \cdot r_2) \backslash_r (c_1 :: c_2 :: \ldots c_n) \myequiv$\\ 
  1281 		$(r_1 \cdot r_2) \backslash_r (c_1 :: c_2 :: \ldots c_n) \myequiv$\\ 
  1280 	\rule{0pt}{3ex} $((r_1 \backslash_r c_1) \cdot r_2 + (\delta\; (\rnullable \; r_1) \; r_2 \backslash_r c_1)) \backslash_r (c_2 :: \ldots c_n) 
  1282 		\rule{0pt}{3ex} $((r_1 \backslash_r c_1) \cdot r_2 + (\delta\; (\rnullable \; r_1) \; r_2 \backslash_r c_1)) \backslash_r (c_2 :: \ldots c_n) 
  1281 	\myequiv$\\
  1283 		\myequiv$\\
  1282 	\rule{0pt}{3ex} $((r_1 \backslash_r c_1c_2 \cdot r_2 + (\delta \; (\rnullable \; r_1) \; r_2 \backslash_r c_1c_2))
  1284 		\rule{0pt}{3ex} $((r_1 \backslash_r c_1c_2 \cdot r_2 + (\delta \; (\rnullable \; r_1) \; r_2 \backslash_r c_1c_2))
  1283 	+ (\delta \ (\rnullable \; r_1 \backslash_r c)\; r_2 \backslash_r c_2)) \backslash_r (c_3 \ldots c_n)
  1285 		+ (\delta \ (\rnullable \; r_1 \backslash_r c)\; r_2 \backslash_r c_2)) \backslash_r (c_3 \ldots c_n)
  1284 	$
  1286 		$
  1285 	\end{tabular}
  1287 	\end{tabular}
  1286 \end{center}
  1288 \end{center}
  1287 \noindent
  1289 \noindent
  1288 The equality in above should be interpretated
  1290 The equality in above should be interpretated
  1289 as language equivalence. 
  1291 as language equivalence. 
  1317 We denote the list consisting of such 
  1319 We denote the list consisting of such 
  1318 strings $s''$ as $\vsuf{s}{r_1}$.
  1320 strings $s''$ as $\vsuf{s}{r_1}$.
  1319 
  1321 
  1320 The function $\vsuf{\_}{\_}$ is defined recursively on the structure of the string:
  1322 The function $\vsuf{\_}{\_}$ is defined recursively on the structure of the string:
  1321 \begin{center}
  1323 \begin{center}
  1322 \begin{tabular}{lcl}
  1324 	\begin{tabular}{lcl}
  1323 $\vsuf{[]}{\_} $ & $=$ &  $[]$\\
  1325 		$\vsuf{[]}{\_} $ & $=$ &  $[]$\\
  1324 $\vsuf{c::cs}{r_1}$ & $ =$ & $ \textit{if} (\rnullable{r_1}) \textit{then} \; (\vsuf{cs}{(\rder{c}{r_1})}) @ [c :: cs]$\\
  1326 		$\vsuf{c::cs}{r_1}$ & $ =$ & $ \textit{if} (\rnullable{r_1}) \textit{then} \; (\vsuf{cs}{(\rder{c}{r_1})}) @ [c :: cs]$\\
  1325                                      && $\textit{else} \; (\vsuf{cs}{(\rder{c}{r_1}) })  $
  1327 				    && $\textit{else} \; (\vsuf{cs}{(\rder{c}{r_1}) })  $
  1326 \end{tabular}
  1328 	\end{tabular}
  1327 \end{center}
  1329 \end{center}
  1328 \noindent
  1330 \noindent
  1329 The list is sorted in the order $r_2\backslash s''$ 
  1331 The list is sorted in the order $r_2\backslash s''$ 
  1330 appears in $(r_1\cdot r_2)\backslash s$.
  1332 appears in $(r_1\cdot r_2)\backslash s$.
  1331 In essence, $\vsuf{\_}{\_}$ is doing a 
  1333 In essence, $\vsuf{\_}{\_}$ is doing a 
  1338 more straightforward,
  1340 more straightforward,
  1339 the flattetning function $\sflat{\_}$ is used to enable the transformation from 
  1341 the flattetning function $\sflat{\_}$ is used to enable the transformation from 
  1340 a left-associative nested sequence of alternatives into 
  1342 a left-associative nested sequence of alternatives into 
  1341 a flattened list:
  1343 a flattened list:
  1342 \[
  1344 \[
  1343   \sum [r_1, r_2, r_3, \ldots] \stackrel{\sflat{\_}}{\rightarrow} 
  1345 	\sum [r_1, r_2, r_3, \ldots] \stackrel{\sflat{\_}}{\rightarrow} 
  1344 (\ldots ((r_1 + r_2) + r_3) + \ldots)
  1346 	(\ldots ((r_1 + r_2) + r_3) + \ldots)
  1345 \]
  1347 \]
  1346 \noindent
  1348 \noindent
  1347 The definitions $\sflat{\_}$, $\sflataux{\_}$ are given below.
  1349 The definitions $\sflat{\_}$, $\sflataux{\_}$ are given below.
  1348  \begin{center}  
  1350 \begin{center}  
  1349  \begin{tabular}{ccc}
  1351 	\begin{tabular}{ccc}
  1350  $\sflataux{\AALTS{ }{r :: rs}}$ & $=$ & $\sflataux{r} @ rs$\\
  1352 		$\sflataux{\AALTS{ }{r :: rs}}$ & $=$ & $\sflataux{r} @ rs$\\
  1351 $\sflataux{\AALTS{ }{[]}}$ & $ = $ & $ []$\\
  1353 		$\sflataux{\AALTS{ }{[]}}$ & $ = $ & $ []$\\
  1352 $\sflataux r$ & $=$ & $ [r]$
  1354 		$\sflataux r$ & $=$ & $ [r]$
  1353 \end{tabular}
  1355 	\end{tabular}
  1354 \end{center}
  1356 \end{center}
  1355 
  1357 
  1356  \begin{center} 
  1358 \begin{center} 
  1357  \begin{tabular}{ccc}
  1359 	\begin{tabular}{ccc}
  1358 	 $\sflat{(\sum r :: rs)}$ & $=$ & $\sum (\sflataux{r} @ rs)$\\
  1360 		$\sflat{(\sum r :: rs)}$ & $=$ & $\sum (\sflataux{r} @ rs)$\\
  1359 	 $\sflat{\sum []}$ & $ = $ & $ \sum []$\\
  1361 		$\sflat{\sum []}$ & $ = $ & $ \sum []$\\
  1360 $\sflat r$ & $=$ & $ r$
  1362 		$\sflat r$ & $=$ & $ r$
  1361 \end{tabular}
  1363 	\end{tabular}
  1362 \end{center}
  1364 \end{center}
  1363 \noindent
  1365 \noindent
  1364 $\sflataux{\_}$ breaks up nested alternative regular expressions 
  1366 $\sflataux{\_}$ breaks up nested alternative regular expressions 
  1365 of the $(\ldots((r_1 + r_2) + r_3) + \ldots )$(left-associated) shape
  1367 of the $(\ldots((r_1 + r_2) + r_3) + \ldots )$(left-associated) shape
  1366 into a "balanced" list: $\AALTS{\_}{[r_1,\, r_2 ,\, r_3, \ldots]}$.
  1368 into a "balanced" list: $\AALTS{\_}{[r_1,\, r_2 ,\, r_3, \ldots]}$.
  1384 	Note the key identify holds:
  1386 	Note the key identify holds:
  1385 	\[
  1387 	\[
  1386 		\map \; (r_2 \backslash_r \_) \; (\vsuf{[x]}{(r_1 \backslash_r xs)}) \;\; @ \;\;
  1388 		\map \; (r_2 \backslash_r \_) \; (\vsuf{[x]}{(r_1 \backslash_r xs)}) \;\; @ \;\;
  1387 		\map \; (\_ \backslash_r x) \; (\map \; (r_2 \backslash \_) \; (\vsuf{xs}{r_1}))
  1389 		\map \; (\_ \backslash_r x) \; (\map \; (r_2 \backslash \_) \; (\vsuf{xs}{r_1}))
  1388 	\]
  1390 	\]
  1389 		=
  1391 	=
  1390 	\[
  1392 	\[
  1391 		\map \; (r_2 \backslash_r \_) \; (\vsuf{xs @ [x]}{r_1})
  1393 		\map \; (r_2 \backslash_r \_) \; (\vsuf{xs @ [x]}{r_1})
  1392 	\]
  1394 	\]
  1393 	This enables the inductive case to go through.
  1395 	This enables the inductive case to go through.
  1394 \end{proof}
  1396 \end{proof}
  1401 Now we are able to use this for the intuition that 
  1403 Now we are able to use this for the intuition that 
  1402 the different ways in which regular expressions are 
  1404 the different ways in which regular expressions are 
  1403 nested do not matter under $\rsimp{}$:
  1405 nested do not matter under $\rsimp{}$:
  1404 \begin{center}
  1406 \begin{center}
  1405 	$\rsimp{r} \stackrel{?}{\sequal} \rsimp{r'}$ if $r = \sum [r_1, r_2, r_3, \ldots]$ 
  1407 	$\rsimp{r} \stackrel{?}{\sequal} \rsimp{r'}$ if $r = \sum [r_1, r_2, r_3, \ldots]$ 
  1406 and $r' =(\ldots ((r_1 + r_2) + r_3) + \ldots)$
  1408 	and $r' =(\ldots ((r_1 + r_2) + r_3) + \ldots)$
  1407 \end{center}
  1409 \end{center}
  1408 Simply wrap with $\sum$ constructor and add 
  1410 Simply wrap with $\sum$ constructor and add 
  1409 simplifications to both sides of \ref{seqSfau0}
  1411 simplifications to both sides of \ref{seqSfau0}
  1410 and one gets
  1412 and one gets
  1411 \begin{corollary}\label{seqClosedFormGeneral}
  1413 \begin{corollary}\label{seqClosedFormGeneral}
  1412 	$\rsimp{\sflat{(r_1 \cdot r_2) \backslash s} }
  1414 	$\rsimp{\sflat{(r_1 \cdot r_2) \backslash s} }
  1413 	=\rsimp{(\sum (  (r_1 \backslash s) \cdot r_2 :: 
  1415 	=\rsimp{(\sum (  (r_1 \backslash s) \cdot r_2 :: 
  1414 		\map\; (r_2 \backslash \_) \; (\vsuf{s}{r_1})))}$
  1416 	\map\; (r_2 \backslash \_) \; (\vsuf{s}{r_1})))}$
  1415 \end{corollary}
  1417 \end{corollary}
  1416 Together with the idempotency property of $\rsimp{}$ (lemma \ref{rsimpIdem}),
  1418 Together with the idempotency property of $\rsimp{}$ (lemma \ref{rsimpIdem}),
  1417 it is possible to convert the above lemma to obtain a "closed form"
  1419 it is possible to convert the above lemma to obtain a "closed form"
  1418 for  derivatives nested with simplification:
  1420 for  derivatives nested with simplification:
  1419 \begin{lemma}\label{seqClosedForm}
  1421 \begin{lemma}\label{seqClosedForm}
  1430 of the sequence derivative $r_1 \cdot r_2 \backslash_r s$ using 
  1432 of the sequence derivative $r_1 \cdot r_2 \backslash_r s$ using 
  1431 an easier-to-handle expression:
  1433 an easier-to-handle expression:
  1432 \begin{corollary}\label{seqEstimate1}
  1434 \begin{corollary}\label{seqEstimate1}
  1433 	\begin{center}
  1435 	\begin{center}
  1434 
  1436 
  1435 	$\llbracket \rderssimp{(r_1 \cdot r_2)}{s} \rrbracket_r = \llbracket \rsimp{(\sum ((r_1 \backslash s) \cdot r_2 ) 
  1437 		$\llbracket \rderssimp{(r_1 \cdot r_2)}{s} \rrbracket_r = \llbracket \rsimp{(\sum ((r_1 \backslash s) \cdot r_2 ) 
  1436 	:: (\map \; (r_2 \backslash \_) (\vsuf{s}{r_1})))} \rrbracket_r$
  1438 		:: (\map \; (r_2 \backslash \_) (\vsuf{s}{r_1})))} \rrbracket_r$
  1437     	
  1439 
  1438 	\end{center}
  1440 	\end{center}
  1439 \end{corollary}
  1441 \end{corollary}
  1440 \noindent
  1442 \noindent
  1441 \subsection{Closed Forms for Star Regular Expressions}
  1443 \subsection{Closed Forms for Star Regular Expressions}
  1442 We have shown how to control the size of the sequence regular expression $r_1\cdot r_2$ using
  1444 We have shown how to control the size of the sequence regular expression $r_1\cdot r_2$ using
  1444 the property of the $\distinct$ function.
  1446 the property of the $\distinct$ function.
  1445 Now we try to get a bound on $r^* \backslash s$ as well.
  1447 Now we try to get a bound on $r^* \backslash s$ as well.
  1446 Again, we first look at how a star's derivatives evolve, if they grow maximally: 
  1448 Again, we first look at how a star's derivatives evolve, if they grow maximally: 
  1447 \begin{center}
  1449 \begin{center}
  1448 
  1450 
  1449 $r^* \quad \longrightarrow_{\backslash c}  \quad   (r\backslash c)  \cdot  r^* \quad \longrightarrow_{\backslash c'}  \quad
  1451 	$r^* \quad \longrightarrow_{\backslash c}  \quad   (r\backslash c)  \cdot  r^* \quad \longrightarrow_{\backslash c'}  \quad
  1450 r \backslash cc'  \cdot r^* + r \backslash c' \cdot r^*  \quad \longrightarrow_{\backslash c''} \quad 
  1452 	r \backslash cc'  \cdot r^* + r \backslash c' \cdot r^*  \quad \longrightarrow_{\backslash c''} \quad 
  1451 (r_1 \backslash cc'c'' \cdot r^* + r \backslash c'') + (r \backslash c'c'' \cdot r^* + r \backslash c'' \cdot r^*)   \quad \longrightarrow_{\backslash c'''}
  1453 	(r_1 \backslash cc'c'' \cdot r^* + r \backslash c'') + (r \backslash c'c'' \cdot r^* + r \backslash c'' \cdot r^*)   \quad \longrightarrow_{\backslash c'''}
  1452 \quad \ldots$
  1454 	\quad \ldots$
  1453 
  1455 
  1454 \end{center}
  1456 \end{center}
  1455 When we have a string $s = c :: c' :: c'' \ldots$  such that $r \backslash c$, $r \backslash cc'$, $r \backslash c'$, 
  1457 When we have a string $s = c :: c' :: c'' \ldots$  such that $r \backslash c$, $r \backslash cc'$, $r \backslash c'$, 
  1456 $r \backslash cc'c''$, $r \backslash c'c''$, $r\backslash c''$ etc. are all nullable,
  1458 $r \backslash cc'c''$, $r \backslash c'c''$, $r\backslash c''$ etc. are all nullable,
  1457 the number of terms in $r^* \backslash s$ will grow exponentially, causing the size
  1459 the number of terms in $r^* \backslash s$ will grow exponentially, causing the size
  1530 originated from a star's derivatives, so we do not attempt to open up all possible 
  1532 originated from a star's derivatives, so we do not attempt to open up all possible 
  1531 regular expressions of the form $\RALTS{rs}$, where $\textit{rs}$ might not contain precisely 2
  1533 regular expressions of the form $\RALTS{rs}$, where $\textit{rs}$ might not contain precisely 2
  1532 elements.
  1534 elements.
  1533 We give a predicate for such "star-created" regular expressions:
  1535 We give a predicate for such "star-created" regular expressions:
  1534 \begin{center}
  1536 \begin{center}
  1535 \begin{tabular}{lcr}
  1537 	\begin{tabular}{lcr}
  1536          &    &       $\createdByStar{(\RSEQ{ra}{\RSTAR{rb}}) }$\\
  1538 	 &    &       $\createdByStar{(\RSEQ{ra}{\RSTAR{rb}}) }$\\
  1537  $\createdByStar{r_1} \land \createdByStar{r_2} $ & $ \Longrightarrow$ & $\createdByStar{(r_1 + r_2)}$
  1539 		$\createdByStar{r_1} \land \createdByStar{r_2} $ & $ \Longrightarrow$ & $\createdByStar{(r_1 + r_2)}$
  1538  \end{tabular}
  1540 	\end{tabular}
  1539  \end{center}
  1541 \end{center}
  1540  
  1542 
  1541  These definitions allows us the flexibility to talk about 
  1543 These definitions allows us the flexibility to talk about 
  1542  regular expressions in their most convenient format,
  1544 regular expressions in their most convenient format,
  1543  for example, flattened out $\RALTS{[r_1, r_2, \ldots, r_n]} $
  1545 for example, flattened out $\RALTS{[r_1, r_2, \ldots, r_n]} $
  1544  instead of binary-nested: $((r_1 + r_2) + (r_3 + r_4)) + \ldots$.
  1546 instead of binary-nested: $((r_1 + r_2) + (r_3 + r_4)) + \ldots$.
  1545  These definitions help express that certain classes of syntatically 
  1547 These definitions help express that certain classes of syntatically 
  1546  distinct regular expressions are actually the same under simplification.
  1548 distinct regular expressions are actually the same under simplification.
  1547  This is not entirely true for annotated regular expressions: 
  1549 This is not entirely true for annotated regular expressions: 
  1548  %TODO: bsimp bders \neq bderssimp
  1550 %TODO: bsimp bders \neq bderssimp
  1549  \begin{center}
  1551 \begin{center}
  1550  $(1+ (c\cdot \ASEQ{bs}{c^*}{c} ))$
  1552 	$(1+ (c\cdot \ASEQ{bs}{c^*}{c} ))$
  1551  \end{center}
  1553 \end{center}
  1552  For bit-codes, the order in which simplification is applied
  1554 For bit-codes, the order in which simplification is applied
  1553  might cause a difference in the location they are placed.
  1555 might cause a difference in the location they are placed.
  1554  If we want something like
  1556 If we want something like
  1555  \begin{center}
  1557 \begin{center}
  1556  $\bderssimp{r}{s} \myequiv \bsimp{\bders{r}{s}}$
  1558 	$\bderssimp{r}{s} \myequiv \bsimp{\bders{r}{s}}$
  1557  \end{center}
  1559 \end{center}
  1558  Some "canonicalization" procedure is required,
  1560 Some "canonicalization" procedure is required,
  1559  which either pushes all the common bitcodes to nodes
  1561 which either pushes all the common bitcodes to nodes
  1560   as senior as possible:
  1562 as senior as possible:
  1561   \begin{center}
  1563 \begin{center}
  1562   $_{bs}(_{bs_1 @ bs'}r_1 + _{bs_1 @ bs''}r_2) \rightarrow _{bs @ bs_1}(_{bs'}r_1 + _{bs''}r_2) $
  1564 	$_{bs}(_{bs_1 @ bs'}r_1 + _{bs_1 @ bs''}r_2) \rightarrow _{bs @ bs_1}(_{bs'}r_1 + _{bs''}r_2) $
  1563   \end{center}
  1565 \end{center}
  1564  or does the reverse. However bitcodes are not of interest if we are talking about
  1566 or does the reverse. However bitcodes are not of interest if we are talking about
  1565  the $\llbracket r \rrbracket$ size of a regex.
  1567 the $\llbracket r \rrbracket$ size of a regex.
  1566  Therefore for the ease and simplicity of producing a
  1568 Therefore for the ease and simplicity of producing a
  1567  proof for a size bound, we are happy to restrict ourselves to 
  1569 proof for a size bound, we are happy to restrict ourselves to 
  1568  unannotated regular expressions, and obtain such equalities as
  1570 unannotated regular expressions, and obtain such equalities as
  1569  \begin{lemma}
  1571 \begin{lemma}
  1570  $\rsimp{r_1 + r_2} = \rsimp{\RALTS{\hflataux{r_1} @ \hflataux{r_2}}}$
  1572 	$\rsimp{r_1 + r_2} = \rsimp{\RALTS{\hflataux{r_1} @ \hflataux{r_2}}}$
  1571  \end{lemma}
  1573 \end{lemma}
  1572  
  1574 
  1573  \begin{proof}
  1575 \begin{proof}
  1574  By using the rewriting relation $\rightsquigarrow$
  1576 	By using the rewriting relation $\rightsquigarrow$
  1575  \end{proof}
  1577 \end{proof}
  1576  %TODO: rsimp sflat
  1578 %TODO: rsimp sflat
  1577 And from this we obtain a proof that a star's derivative will be the same
  1579 And from this we obtain a proof that a star's derivative will be the same
  1578 as if it had all its nested alternatives created during deriving being flattened out:
  1580 as if it had all its nested alternatives created during deriving being flattened out:
  1579  For example,
  1581 For example,
  1580  \begin{lemma}
  1582 \begin{lemma}
  1581  $\createdByStar{r} \implies \rsimp{r} = \rsimp{\RALTS{\hflataux{r}}}$
  1583 	$\createdByStar{r} \implies \rsimp{r} = \rsimp{\RALTS{\hflataux{r}}}$
  1582  \end{lemma}
  1584 \end{lemma}
  1583  \begin{proof}
  1585 \begin{proof}
  1584  By structural induction on $r$, where the induction rules are these of $\createdByStar{_}$.
  1586 	By structural induction on $r$, where the induction rules are these of $\createdByStar{_}$.
  1585  \end{proof}
  1587 \end{proof}
  1586 % The simplification of a flattened out regular expression, provided it comes
  1588 % The simplification of a flattened out regular expression, provided it comes
  1587 %from the derivative of a star, is the same as the one nested.
  1589 %from the derivative of a star, is the same as the one nested.
  1588  
  1590 
  1589 
  1591 
  1590 
  1592 
  1591 We first introduce an inductive property
  1593 We first introduce an inductive property
  1592 for $\starupdate$ and $\hflataux{\_}$, 
  1594 for $\starupdate$ and $\hflataux{\_}$, 
  1593 it says if we do derivatives of $r^*$
  1595 it says if we do derivatives of $r^*$
  1627 	$\rsimp{a+b} = \rsimp{(\sum \hflataux{a} @ \hflataux{b})}$.
  1629 	$\rsimp{a+b} = \rsimp{(\sum \hflataux{a} @ \hflataux{b})}$.
  1628 \end{lemma}
  1630 \end{lemma}
  1629 \noindent
  1631 \noindent
  1630 This yields
  1632 This yields
  1631 \begin{lemma}\label{hfauRsimpeq2}
  1633 \begin{lemma}\label{hfauRsimpeq2}
  1632 $\rsimp{r} = \rsimp{(\sum \hflataux{r})}$
  1634 	$\rsimp{r} = \rsimp{(\sum \hflataux{r})}$
  1633 \end{lemma}
  1635 \end{lemma}
  1634 \noindent
  1636 \noindent
  1635 Together with the rewriting relation
  1637 Together with the rewriting relation
  1636 \begin{lemma}\label{starClosedForm6Hrewrites}
  1638 \begin{lemma}\label{starClosedForm6Hrewrites}
  1637 	$\map \; (\lambda s. (\rsimp{r \backslash_r s}) \cdot (r^*)) \; Ss
  1639 	$\map \; (\lambda s. (\rsimp{r \backslash_r s}) \cdot (r^*)) \; Ss
  1638 	\scfrewrites
  1640 	\scfrewrites
  1639 	 \map \; (\lambda s. (\rsimp{r \backslash_r s}) \cdot (r^*)) \; Ss$
  1641 	\map \; (\lambda s. (\rsimp{r \backslash_r s}) \cdot (r^*)) \; Ss$
  1640 \end{lemma}
  1642 \end{lemma}
  1641 \noindent
  1643 \noindent
  1642 We obtain the closed form for star regular expression:
  1644 We obtain the closed form for star regular expression:
  1643 \begin{lemma}\label{starClosedForm}
  1645 \begin{lemma}\label{starClosedForm}
  1644 	$\rderssimp{r^*}{c::s} = 
  1646 	$\rderssimp{r^*}{c::s} = 
  1645 	\rsimp{
  1647 	\rsimp{
  1646 		(\sum (\map \; (\lambda s. (\rderssimp{r}{s})\cdot r^*) \; 
  1648 		(\sum (\map \; (\lambda s. (\rderssimp{r}{s})\cdot r^*) \; 
  1647 			(\starupdates \; s\; r \; [[c]])
  1649 		(\starupdates \; s\; r \; [[c]])
  1648 		      )
       
  1649 		)
  1650 		)
  1650 	      }
  1651 		)
       
  1652 	}
  1651 	$
  1653 	$
  1652 \end{lemma}
  1654 \end{lemma}
  1653 \begin{proof}
  1655 \begin{proof}
  1654 	By an induction on $s$.
  1656 	By an induction on $s$.
  1655 	The lemmas \ref{rsimpIdem}, \ref{starHfauInduct}, and \ref{hfauRsimpeq2}
  1657 	The lemmas \ref{rsimpIdem}, \ref{starHfauInduct}, and \ref{hfauRsimpeq2}
  1657 \end{proof}
  1659 \end{proof}
  1658 \section{Estimating the Closed Forms' sizes}
  1660 \section{Estimating the Closed Forms' sizes}
  1659 We now summarize the closed forms below:
  1661 We now summarize the closed forms below:
  1660 \begin{itemize}
  1662 \begin{itemize}
  1661 	\item
  1663 	\item
  1662 	$\rderssimp{(\sum rs)}{s} \sequal
  1664 		$\rderssimp{(\sum rs)}{s} \sequal
  1663 	 \sum \; (\map \; (\rderssimp{\_}{s}) \; rs)$
  1665 		\sum \; (\map \; (\rderssimp{\_}{s}) \; rs)$
  1664 	\item
  1666 	\item
  1665 	$\rderssimp{(r_1 \cdot r_2)}{s} \sequal \sum ((r_1 \backslash s) \cdot r_2 ) 
  1667 		$\rderssimp{(r_1 \cdot r_2)}{s} \sequal \sum ((r_1 \backslash s) \cdot r_2 ) 
  1666 	:: (\map \; (r_2 \backslash \_) (\vsuf{s}{r_1}))$
  1668 		:: (\map \; (r_2 \backslash \_) (\vsuf{s}{r_1}))$
  1667 	\item
  1669 	\item
  1668 
  1670 
  1669 	$\rderssimp{r^*}{c::s} = 
  1671 		$\rderssimp{r^*}{c::s} = 
  1670 	\rsimp{
  1672 		\rsimp{
  1671 		(\sum (\map \; (\lambda s. (\rderssimp{r}{s})\cdot r^*) \; 
  1673 			(\sum (\map \; (\lambda s. (\rderssimp{r}{s})\cdot r^*) \; 
  1672 			(\starupdates \; s\; r \; [[c]])
  1674 			(\starupdates \; s\; r \; [[c]])
  1673 		      )
  1675 			)
  1674 		)
  1676 			)
  1675 	      }
  1677 		}
  1676 	$
  1678 		$
  1677 \end{itemize}	
  1679 \end{itemize}	
  1678 \noindent	
  1680 \noindent	
  1679 The closed forms on the left-hand-side
  1681 The closed forms on the left-hand-side
  1680 are all of the same shape: $\rsimp{ (\sum rs)} $.
  1682 are all of the same shape: $\rsimp{ (\sum rs)} $.
  1681 Such regular expression will be bounded by the size of $\sum rs'$, 
  1683 Such regular expression will be bounded by the size of $\sum rs'$, 
  1708 \begin{proof}
  1710 \begin{proof}
  1709 	By overestimating the set $\sizeNregex \; N + 1$
  1711 	By overestimating the set $\sizeNregex \; N + 1$
  1710 	using union of sets like
  1712 	using union of sets like
  1711 	$\{r_1 \cdot r_2 \mid r_1 \in A
  1713 	$\{r_1 \cdot r_2 \mid r_1 \in A
  1712 		\text{and}
  1714 		\text{and}
  1713 		r_2 \in A\}
  1715 	r_2 \in A\}
  1714 	$ where $A = \sizeNregex \; N$.
  1716 	$ where $A = \sizeNregex \; N$.
  1715 \end{proof}
  1717 \end{proof}
  1716 \noindent
  1718 \noindent
  1717 From this we get a corollary that
  1719 From this we get a corollary that
  1718 if forall $r \in rs$, $\rsize{r} \leq N$, then the output of 
  1720 if forall $r \in rs$, $\rsize{r} \leq N$, then the output of 
  1756 
  1758 
  1757 
  1759 
  1758 Now we are ready to control the sizes of
  1760 Now we are ready to control the sizes of
  1759 $r_1 \cdot r_2 \backslash s$, $r^* \backslash s$.
  1761 $r_1 \cdot r_2 \backslash s$, $r^* \backslash s$.
  1760 \begin{theorem}
  1762 \begin{theorem}
  1761 For any regex $r$, $\exists N_r. \forall s. \; \rsize{\rderssimp{r}{s}} \leq N_r$
  1763 	For any regex $r$, $\exists N_r. \forall s. \; \rsize{\rderssimp{r}{s}} \leq N_r$
  1762 \end{theorem}
  1764 \end{theorem}
  1763 \noindent
  1765 \noindent
  1764 \begin{proof}
  1766 \begin{proof}
  1765 We prove this by induction on $r$. The base cases for $\RZERO$,
  1767 	We prove this by induction on $r$. The base cases for $\RZERO$,
  1766 $\RONE $ and $\RCHAR{c}$ are straightforward. 
  1768 	$\RONE $ and $\RCHAR{c}$ are straightforward. 
  1767 In the sequence $r_1 \cdot r_2$ case,
  1769 	In the sequence $r_1 \cdot r_2$ case,
  1768 the inductive hypotheses state 
  1770 	the inductive hypotheses state 
  1769 $\exists N_1. \forall s. \; \llbracket \rderssimp{r}{s} \rrbracket \leq N_1$ and
  1771 	$\exists N_1. \forall s. \; \llbracket \rderssimp{r}{s} \rrbracket \leq N_1$ and
  1770 $\exists N_2. \forall s. \; \llbracket \rderssimp{r_2}{s} \rrbracket \leq N_2$. 
  1772 	$\exists N_2. \forall s. \; \llbracket \rderssimp{r_2}{s} \rrbracket \leq N_2$. 
  1771 
  1773 
  1772 When the string $s$ is not empty, we can reason as follows
  1774 	When the string $s$ is not empty, we can reason as follows
  1773 %
  1775 	%
  1774 \begin{center}
  1776 	\begin{center}
  1775 \begin{tabular}{lcll}
  1777 		\begin{tabular}{lcll}
  1776 & & $ \llbracket   \rderssimp{r_1\cdot r_2 }{s} \rrbracket_r $\\
  1778 & & $ \llbracket   \rderssimp{r_1\cdot r_2 }{s} \rrbracket_r $\\
  1777 & $ = $ & $\llbracket \rsimp{(\sum(r_1 \backslash_{rsimp} s \cdot r_2 \; \;  :: \; \; 
  1779 & $ = $ & $\llbracket \rsimp{(\sum(r_1 \backslash_{rsimp} s \cdot r_2 \; \;  :: \; \; 
  1778 \map \; (r_2\backslash_{rsimp} \_)\; (\vsuf{s}{r})))} \rrbracket_r $ & (1) \\			
  1780 		\map \; (r_2\backslash_{rsimp} \_)\; (\vsuf{s}{r})))} \rrbracket_r $ & (1) \\			
  1779 & $\leq$ & $\llbracket \rdistinct{(r_1 \backslash_{rsimp} s \cdot r_2 \; \;  :: \; \; 
  1781 										     & $\leq$ & $\llbracket \rdistinct{(r_1 \backslash_{rsimp} s \cdot r_2 \; \;  :: \; \; 
  1780 \map \; (r_2\backslash_{rsimp} \_)\; (\vsuf{s}{r}))}{\varnothing} \rrbracket_r  + 1$ & (2) \\
  1782 	\map \; (r_2\backslash_{rsimp} \_)\; (\vsuf{s}{r}))}{\varnothing} \rrbracket_r  + 1$ & (2) \\
  1781 										     & $\leq$ & $2 + N_1 + \rsize{r_2} + (N_2 * (card\;(\sizeNregex \; N_2)))$ & (3)\\
  1783 											     & $\leq$ & $2 + N_1 + \rsize{r_2} + (N_2 * (card\;(\sizeNregex \; N_2)))$ & (3)\\
  1782 \end{tabular}
  1784 \end{tabular}
  1783 \end{center}
  1785 \end{center}
  1784 \noindent
  1786 \noindent
  1785 (1) is by the corollary \ref{seqEstimate1}.
  1787 (1) is by the corollary \ref{seqEstimate1}.
  1786 (2) is by \ref{altsSimpControl}.
  1788 (2) is by \ref{altsSimpControl}.
  1801 The inductive hypothesis is
  1803 The inductive hypothesis is
  1802 $\exists N. \forall s. \; \llbracket \rderssimp{r}{s} \rrbracket \leq N$.
  1804 $\exists N. \forall s. \; \llbracket \rderssimp{r}{s} \rrbracket \leq N$.
  1803 Let $n_r = \llbracket r^* \rrbracket_r$.
  1805 Let $n_r = \llbracket r^* \rrbracket_r$.
  1804 When $s = c :: cs$ is not empty,
  1806 When $s = c :: cs$ is not empty,
  1805 \begin{center}
  1807 \begin{center}
  1806 \begin{tabular}{lcll}
  1808 	\begin{tabular}{lcll}
  1807 & & $ \llbracket   \rderssimp{r^* }{c::cs} \rrbracket_r $\\
  1809 & & $ \llbracket   \rderssimp{r^* }{c::cs} \rrbracket_r $\\
  1808 & $ = $ & $\llbracket \rsimp{(\sum (\map \; (\lambda s. (r \backslash_{rsimp} s) \cdot r^*) \; (\starupdates\; 
  1810 & $ = $ & $\llbracket \rsimp{(\sum (\map \; (\lambda s. (r \backslash_{rsimp} s) \cdot r^*) \; (\starupdates\; 
  1809 cs \; r \; [[c]] )) )} \rrbracket_r $ & (5) \\			
  1811 	cs \; r \; [[c]] )) )} \rrbracket_r $ & (5) \\			
  1810 & $\leq$ & $\llbracket 
  1812 					      & $\leq$ & $\llbracket 
  1811 \rdistinct{
  1813 					      \rdistinct{
  1812 	(\map \; 
  1814 						      (\map \; 
  1813 		(\lambda s. (r \backslash_{rsimp} s) \cdot r^*) \; 
  1815 						      (\lambda s. (r \backslash_{rsimp} s) \cdot r^*) \; 
  1814 		(\starupdates\; cs \; r \; [[c]] )
  1816 						      (\starupdates\; cs \; r \; [[c]] )
  1815 	)}
  1817 					      )}
  1816 	{\varnothing} \rrbracket_r  + 1$ & (6) \\
  1818 	{\varnothing} \rrbracket_r  + 1$ & (6) \\
  1817 					 & $\leq$ & $1 + (\textit{card} (\sizeNregex \; (N + n_r)))
  1819 					 & $\leq$ & $1 + (\textit{card} (\sizeNregex \; (N + n_r)))
  1818 	* (1 + (N + n_r)) $ & (7)\\
  1820 	* (1 + (N + n_r)) $ & (7)\\
  1819 \end{tabular}
  1821 \end{tabular}
  1820 \end{center}
  1822 \end{center}
  1822 (5) is by the lemma  \ref{starClosedForm}.
  1824 (5) is by the lemma  \ref{starClosedForm}.
  1823 (6) is by \ref{altsSimpControl}.
  1825 (6) is by \ref{altsSimpControl}.
  1824 (7) is by \ref{finiteSizeNCorollary}.
  1826 (7) is by \ref{finiteSizeNCorollary}.
  1825 Combining with the case when $s = []$, one gets
  1827 Combining with the case when $s = []$, one gets
  1826 \begin{center}
  1828 \begin{center}
  1827 \begin{tabular}{lcll}
  1829 	\begin{tabular}{lcll}
  1828 	$\rsize{r^* \backslash_r s}$ & $\leq$ & $max \; n_r \; 1 + (\textit{card} (\sizeNregex \; (N + n_r)))
  1830 		$\rsize{r^* \backslash_r s}$ & $\leq$ & $max \; n_r \; 1 + (\textit{card} (\sizeNregex \; (N + n_r)))
  1829 	* (1 + (N + n_r)) $ & (8)\\
  1831 		* (1 + (N + n_r)) $ & (8)\\
  1830 \end{tabular}
  1832 	\end{tabular}
  1831 \end{center}
  1833 \end{center}
  1832 \noindent
  1834 \noindent
  1833 
  1835 
  1834 The alternative case is slightly less involved.
  1836 The alternative case is slightly less involved.
  1835 The inductive hypothesis 
  1837 The inductive hypothesis 
  1836 is equivalent to $\exists N. \forall r \in (\map \; (\_ \backslash_r s) \; rs). \rsize{r} \leq N$.
  1838 is equivalent to $\exists N. \forall r \in (\map \; (\_ \backslash_r s) \; rs). \rsize{r} \leq N$.
  1837 In the case when $s = c::cs$, we have 
  1839 In the case when $s = c::cs$, we have 
  1838 \begin{center}
  1840 \begin{center}
  1839 \begin{tabular}{lcll}
  1841 	\begin{tabular}{lcll}
  1840 & & $ \llbracket   \rderssimp{\sum rs }{c::cs} \rrbracket_r $\\
  1842 & & $ \llbracket   \rderssimp{\sum rs }{c::cs} \rrbracket_r $\\
  1841 & $ = $ & $\llbracket \rsimp{(\sum (\map \; (\_ \backslash_{rsimp} s)  \; rs) )} \rrbracket_r $ & (9) \\			
  1843 & $ = $ & $\llbracket \rsimp{(\sum (\map \; (\_ \backslash_{rsimp} s)  \; rs) )} \rrbracket_r $ & (9) \\			
  1842 & $\leq$ & $\llbracket (\sum (\map \; (\_ \backslash_{rsimp} s)  \; rs) ) \rrbracket_r $  & (10) \\
  1844 & $\leq$ & $\llbracket (\sum (\map \; (\_ \backslash_{rsimp} s)  \; rs) ) \rrbracket_r $  & (10) \\
  1843 & $\leq$ & $1 + N * (length \; rs) $ & (11)\\
  1845 & $\leq$ & $1 + N * (length \; rs) $ & (11)\\
  1844 \end{tabular}
  1846 	\end{tabular}
  1845 \end{center}
  1847 \end{center}
  1846 \noindent
  1848 \noindent
  1847 (9) is by \ref{altsClosedForm}, (10) by \ref{rsimpSize} and (11) by inductive hypothesis.
  1849 (9) is by \ref{altsClosedForm}, (10) by \ref{rsimpSize} and (11) by inductive hypothesis.
  1848 
  1850 
  1849 Combining with the case when $s = []$, one gets
  1851 Combining with the case when $s = []$, one gets
  1850 \begin{center}
  1852 \begin{center}
  1851 \begin{tabular}{lcll}
  1853 	\begin{tabular}{lcll}
  1852 	$\rsize{\sum rs \backslash_r s}$ & $\leq$ & $max \; \rsize{\sum rs} \; 1+N*(length \; rs)$ 
  1854 		$\rsize{\sum rs \backslash_r s}$ & $\leq$ & $max \; \rsize{\sum rs} \; 1+N*(length \; rs)$ 
  1853 	 & (12)\\
  1855 						 & (12)\\
  1854 \end{tabular}
  1856 	\end{tabular}
  1855 \end{center}
  1857 \end{center}
  1856 (4), (8), and (12) are all the inductive cases proven.
  1858 (4), (8), and (12) are all the inductive cases proven.
  1857 \end{proof}
  1859 \end{proof}
  1858 
  1860 
  1859 
  1861 
  1860 \begin{corollary}
  1862 \begin{corollary}
  1861 For any regex $a$, $\exists N_r. \forall s. \; \rsize{\bderssimp{a}{s}} \leq N_r$
  1863 	For any regex $a$, $\exists N_r. \forall s. \; \rsize{\bderssimp{a}{s}} \leq N_r$
  1862 \end{corollary}
  1864 \end{corollary}
  1863 \begin{proof}
  1865 \begin{proof}
  1864 	By \ref{sizeRelations}.
  1866 	By \ref{sizeRelations}.
  1865 \end{proof}
  1867 \end{proof}
  1866 \noindent
  1868 \noindent
  1892 What guarantee does this bound give us?
  1894 What guarantee does this bound give us?
  1893 
  1895 
  1894 Whatever the regex is, it will not grow indefinitely.
  1896 Whatever the regex is, it will not grow indefinitely.
  1895 Take our previous example $(a + aa)^*$ as an example:
  1897 Take our previous example $(a + aa)^*$ as an example:
  1896 \begin{center}
  1898 \begin{center}
  1897 \begin{tabular}{@{}c@{\hspace{0mm}}c@{\hspace{0mm}}c@{}}
  1899 	\begin{tabular}{@{}c@{\hspace{0mm}}c@{\hspace{0mm}}c@{}}
  1898 \begin{tikzpicture}
  1900 		\begin{tikzpicture}
  1899 \begin{axis}[
  1901 			\begin{axis}[
  1900     xlabel={number of $a$'s},
  1902 				xlabel={number of $a$'s},
  1901     x label style={at={(1.05,-0.05)}},
  1903 				x label style={at={(1.05,-0.05)}},
  1902     ylabel={regex size},
  1904 				ylabel={regex size},
  1903     enlargelimits=false,
  1905 				enlargelimits=false,
  1904     xtick={0,5,...,30},
  1906 				xtick={0,5,...,30},
  1905     xmax=33,
  1907 				xmax=33,
  1906     ymax= 40,
  1908 				ymax= 40,
  1907     ytick={0,10,...,40},
  1909 				ytick={0,10,...,40},
  1908     scaled ticks=false,
  1910 				scaled ticks=false,
  1909     axis lines=left,
  1911 				axis lines=left,
  1910     width=5cm,
  1912 				width=5cm,
  1911     height=4cm, 
  1913 				height=4cm, 
  1912     legend entries={$(a + aa)^*$},  
  1914 				legend entries={$(a + aa)^*$},  
  1913     legend pos=north west,
  1915 				legend pos=north west,
  1914     legend cell align=left]
  1916 				legend cell align=left]
  1915 \addplot[red,mark=*, mark options={fill=white}] table {a_aa_star.data};
  1917 				\addplot[red,mark=*, mark options={fill=white}] table {a_aa_star.data};
  1916 \end{axis}
  1918 			\end{axis}
  1917 \end{tikzpicture}
  1919 		\end{tikzpicture}
  1918 \end{tabular}
  1920 	\end{tabular}
  1919 \end{center}
  1921 \end{center}
  1920 We are able to limit the size of the regex $(a + aa)^*$'s derivatives
  1922 We are able to limit the size of the regex $(a + aa)^*$'s derivatives
  1921  with our simplification
  1923 with our simplification
  1922 rules very effectively.
  1924 rules very effectively.
  1923 
  1925 
  1924 
  1926 
  1925 In our proof for the inductive case $r_1 \cdot r_2$, the dominant term in the bound
  1927 In our proof for the inductive case $r_1 \cdot r_2$, the dominant term in the bound
  1926 is $l_{N_2} * N_2$, where $N_2$ is the bound we have for $\llbracket \bderssimp{r_2}{s} \rrbracket$.
  1928 is $l_{N_2} * N_2$, where $N_2$ is the bound we have for $\llbracket \bderssimp{r_2}{s} \rrbracket$.
  1939 %TODO: giving regex1_size_change.data showing a few regular expressions' size changes 
  1941 %TODO: giving regex1_size_change.data showing a few regular expressions' size changes 
  1940 %w;r;t the input characters number, where the size is usually cubic in terms of original size
  1942 %w;r;t the input characters number, where the size is usually cubic in terms of original size
  1941 %a*, aa*, aaa*, .....
  1943 %a*, aa*, aaa*, .....
  1942 %randomly generated regular expressions
  1944 %randomly generated regular expressions
  1943 \begin{center}
  1945 \begin{center}
  1944 \begin{tabular}{@{}c@{\hspace{0mm}}c@{\hspace{0mm}}c@{}}
  1946 	\begin{tabular}{@{}c@{\hspace{0mm}}c@{\hspace{0mm}}c@{}}
  1945 \begin{tikzpicture}
  1947 		\begin{tikzpicture}
  1946 \begin{axis}[
  1948 			\begin{axis}[
  1947     xlabel={number of $a$'s},
  1949 				xlabel={number of $a$'s},
  1948     x label style={at={(1.05,-0.05)}},
  1950 				x label style={at={(1.05,-0.05)}},
  1949     ylabel={regex size},
  1951 				ylabel={regex size},
  1950     enlargelimits=false,
  1952 				enlargelimits=false,
  1951     xtick={0,5,...,30},
  1953 				xtick={0,5,...,30},
  1952     xmax=33,
  1954 				xmax=33,
  1953     ymax=1000,
  1955 				ymax=1000,
  1954     ytick={0,100,...,1000},
  1956 				ytick={0,100,...,1000},
  1955     scaled ticks=false,
  1957 				scaled ticks=false,
  1956     axis lines=left,
  1958 				axis lines=left,
  1957     width=5cm,
  1959 				width=5cm,
  1958     height=4cm, 
  1960 				height=4cm, 
  1959     legend entries={regex1},  
  1961 				legend entries={regex1},  
  1960     legend pos=north west,
  1962 				legend pos=north west,
  1961     legend cell align=left]
  1963 				legend cell align=left]
  1962 \addplot[red,mark=*, mark options={fill=white}] table {regex1_size_change.data};
  1964 				\addplot[red,mark=*, mark options={fill=white}] table {regex1_size_change.data};
  1963 \end{axis}
  1965 			\end{axis}
  1964 \end{tikzpicture}
  1966 		\end{tikzpicture}
  1965   &
  1967   &
  1966 \begin{tikzpicture}
  1968   \begin{tikzpicture}
  1967 \begin{axis}[
  1969 	  \begin{axis}[
  1968     xlabel={$n$},
  1970 		  xlabel={$n$},
  1969     x label style={at={(1.05,-0.05)}},
  1971 		  x label style={at={(1.05,-0.05)}},
  1970     %ylabel={time in secs},
  1972 		  %ylabel={time in secs},
  1971     enlargelimits=false,
  1973 		  enlargelimits=false,
  1972     xtick={0,5,...,30},
  1974 		  xtick={0,5,...,30},
  1973     xmax=33,
  1975 		  xmax=33,
  1974     ymax=1000,
  1976 		  ymax=1000,
  1975     ytick={0,100,...,1000},
  1977 		  ytick={0,100,...,1000},
  1976     scaled ticks=false,
  1978 		  scaled ticks=false,
  1977     axis lines=left,
  1979 		  axis lines=left,
  1978     width=5cm,
  1980 		  width=5cm,
  1979     height=4cm, 
  1981 		  height=4cm, 
  1980     legend entries={regex2},  
  1982 		  legend entries={regex2},  
  1981     legend pos=north west,
  1983 		  legend pos=north west,
  1982     legend cell align=left]
  1984 		  legend cell align=left]
  1983 \addplot[blue,mark=*, mark options={fill=white}] table {regex2_size_change.data};
  1985 		  \addplot[blue,mark=*, mark options={fill=white}] table {regex2_size_change.data};
  1984 \end{axis}
  1986 	  \end{axis}
  1985 \end{tikzpicture}
  1987   \end{tikzpicture}
  1986   &
  1988   &
  1987 \begin{tikzpicture}
  1989   \begin{tikzpicture}
  1988 \begin{axis}[
  1990 	  \begin{axis}[
  1989     xlabel={$n$},
  1991 		  xlabel={$n$},
  1990     x label style={at={(1.05,-0.05)}},
  1992 		  x label style={at={(1.05,-0.05)}},
  1991     %ylabel={time in secs},
  1993 		  %ylabel={time in secs},
  1992     enlargelimits=false,
  1994 		  enlargelimits=false,
  1993     xtick={0,5,...,30},
  1995 		  xtick={0,5,...,30},
  1994     xmax=33,
  1996 		  xmax=33,
  1995     ymax=1000,
  1997 		  ymax=1000,
  1996     ytick={0,100,...,1000},
  1998 		  ytick={0,100,...,1000},
  1997     scaled ticks=false,
  1999 		  scaled ticks=false,
  1998     axis lines=left,
  2000 		  axis lines=left,
  1999     width=5cm,
  2001 		  width=5cm,
  2000     height=4cm, 
  2002 		  height=4cm, 
  2001     legend entries={regex3},  
  2003 		  legend entries={regex3},  
  2002     legend pos=north west,
  2004 		  legend pos=north west,
  2003     legend cell align=left]
  2005 		  legend cell align=left]
  2004 \addplot[cyan,mark=*, mark options={fill=white}] table {regex3_size_change.data};
  2006 		  \addplot[cyan,mark=*, mark options={fill=white}] table {regex3_size_change.data};
  2005 \end{axis}
  2007 	  \end{axis}
  2006 \end{tikzpicture}\\
  2008   \end{tikzpicture}\\
  2007 \multicolumn{3}{c}{Graphs: size change of 3 randomly generated regular expressions $w.r.t.$ input string length.}
  2009   \multicolumn{3}{c}{Graphs: size change of 3 randomly generated regular expressions $w.r.t.$ input string length.}
  2008 \end{tabular}    
  2010 	\end{tabular}    
  2009 \end{center}  
  2011 \end{center}  
  2010 \noindent
  2012 \noindent
  2011 Most of the regex's sizes seem to stay within a polynomial bound $w.r.t$ the 
  2013 Most of the regex's sizes seem to stay within a polynomial bound $w.r.t$ the 
  2012 original size.
  2014 original size.
  2013 We will discuss improvements to this bound in the next chapter.
  2015 We will discuss improvements to this bound in the next chapter.
  2024 \section{Syntactic Equivalence Under $\simp$}
  2026 \section{Syntactic Equivalence Under $\simp$}
  2025 We prove that minor differences can be annhilated
  2027 We prove that minor differences can be annhilated
  2026 by $\simp$.
  2028 by $\simp$.
  2027 For example,
  2029 For example,
  2028 \begin{center}
  2030 \begin{center}
  2029 $\simp \;(\simpALTs\; (\map \;(\_\backslash \; x)\; (\distinct \; \mathit{rs}\; \phi))) = 
  2031 	$\simp \;(\simpALTs\; (\map \;(\_\backslash \; x)\; (\distinct \; \mathit{rs}\; \phi))) = 
  2030  \simp \;(\simpALTs \;(\distinct \;(\map \;(\_ \backslash\; x) \; \mathit{rs}) \; \phi))$
  2032 	\simp \;(\simpALTs \;(\distinct \;(\map \;(\_ \backslash\; x) \; \mathit{rs}) \; \phi))$
  2031 \end{center}
  2033 \end{center}
  2032 
  2034 
  2033 
  2035 
  2034 %----------------------------------------------------------------------------------------
  2036 %----------------------------------------------------------------------------------------
  2035 %	SECTION ALTS CLOSED FORM
  2037 %	SECTION ALTS CLOSED FORM
  2051 
  2053 
  2052 Unfortunately the function $\backslash c$ is not an injective mapping.
  2054 Unfortunately the function $\backslash c$ is not an injective mapping.
  2053 
  2055 
  2054 \subsection{function $\backslash c$ is not injective (1-to-1)}
  2056 \subsection{function $\backslash c$ is not injective (1-to-1)}
  2055 \begin{center}
  2057 \begin{center}
  2056 The derivative $w.r.t$ character $c$ is not one-to-one.
  2058 	The derivative $w.r.t$ character $c$ is not one-to-one.
  2057 Formally,
  2059 	Formally,
  2058 	$\exists r_1 \;r_2. r_1 \neq r_2 \mathit{and} r_1 \backslash c = r_2 \backslash c$
  2060 	$\exists r_1 \;r_2. r_1 \neq r_2 \mathit{and} r_1 \backslash c = r_2 \backslash c$
  2059 \end{center}
  2061 \end{center}
  2060 This property is trivially true for the
  2062 This property is trivially true for the
  2061 character regex example:
  2063 character regex example:
  2062 \begin{center}
  2064 \begin{center}
  2082 \end{center}
  2084 \end{center}
  2083 
  2085 
  2084 There are two problems with this finiteness result, though.
  2086 There are two problems with this finiteness result, though.
  2085 \begin{itemize}
  2087 \begin{itemize}
  2086 	\item
  2088 	\item
  2087 First, It is not yet a direct formalisation of our lexer's complexity,
  2089 		First, It is not yet a direct formalisation of our lexer's complexity,
  2088 as a complexity proof would require looking into 
  2090 		as a complexity proof would require looking into 
  2089 the time it takes to execute {\bf all} the operations
  2091 		the time it takes to execute {\bf all} the operations
  2090 involved in the lexer (simp, collect, decode), not just the derivative.
  2092 		involved in the lexer (simp, collect, decode), not just the derivative.
  2091 \item
  2093 	\item
  2092 Second, the bound is not yet tight, and we seek to improve $N_a$ so that
  2094 		Second, the bound is not yet tight, and we seek to improve $N_a$ so that
  2093 it is polynomial on $\llbracket a \rrbracket$.
  2095 		it is polynomial on $\llbracket a \rrbracket$.
  2094 \end{itemize}
  2096 \end{itemize}
  2095 Still, we believe this contribution is fruitful,
  2097 Still, we believe this contribution is fruitful,
  2096 because
  2098 because
  2097 \begin{itemize}
  2099 \begin{itemize}
  2098 	\item
  2100 	\item
  2107 		backtracking is much less likely to occur in our $\blexersimp$
  2109 		backtracking is much less likely to occur in our $\blexersimp$
  2108 		algorithm.
  2110 		algorithm.
  2109 		We refine $\blexersimp$ with $\blexerStrong$ in the next chapter
  2111 		We refine $\blexersimp$ with $\blexerStrong$ in the next chapter
  2110 		so that the bound becomes polynomial.
  2112 		so that the bound becomes polynomial.
  2111 \end{itemize}
  2113 \end{itemize}
  2112 		
  2114 
  2113 %----------------------------------------------------------------------------------------
  2115 %----------------------------------------------------------------------------------------
  2114 %	SECTION 4
  2116 %	SECTION 4
  2115 %----------------------------------------------------------------------------------------
  2117 %----------------------------------------------------------------------------------------
  2116  
  2118 
  2117  
  2119 
  2118  
  2120 
  2119  
  2121 
  2120  
  2122 
  2121  
  2123 
  2122  
  2124 
  2123  
  2125 
  2124 One might wonder the actual bound rather than the loose bound we gave
  2126 One might wonder the actual bound rather than the loose bound we gave
  2125 for the convenience of an easier proof.
  2127 for the convenience of an easier proof.
  2126 How much can the regex $r^* \backslash s$ grow? 
  2128 How much can the regex $r^* \backslash s$ grow? 
  2127 As  earlier graphs have shown,
  2129 As  earlier graphs have shown,
  2128 %TODO: reference that graph where size grows quickly
  2130 %TODO: reference that graph where size grows quickly
  2129  they can grow at a maximum speed
  2131 they can grow at a maximum speed
  2130   exponential $w.r.t$ the number of characters, 
  2132 exponential $w.r.t$ the number of characters, 
  2131 but will eventually level off when the string $s$ is long enough.
  2133 but will eventually level off when the string $s$ is long enough.
  2132 If they grow to a size exponential $w.r.t$ the original regex, our algorithm
  2134 If they grow to a size exponential $w.r.t$ the original regex, our algorithm
  2133 would still be slow.
  2135 would still be slow.
  2134 And unfortunately, we have concrete examples
  2136 And unfortunately, we have concrete examples
  2135 where such regular expressions grew exponentially large before levelling off:
  2137 where such regular expressions grew exponentially large before levelling off:
  2136 $(a ^ * + (aa) ^ * + (aaa) ^ * + \ldots + 
  2138 $(a ^ * + (aa) ^ * + (aaa) ^ * + \ldots + 
  2137 (\underbrace{a \ldots a}_{\text{n a's}})^*$ will already have a maximum
  2139 (\underbrace{a \ldots a}_{\text{n a's}})^*$ will already have a maximum
  2138  size that is  exponential on the number $n$ 
  2140 size that is  exponential on the number $n$ 
  2139 under our current simplification rules:
  2141 under our current simplification rules:
  2140 %TODO: graph of a regex whose size increases exponentially.
  2142 %TODO: graph of a regex whose size increases exponentially.
  2141 \begin{center}
  2143 \begin{center}
  2142 \begin{tikzpicture}
  2144 	\begin{tikzpicture}
  2143     \begin{axis}[
  2145 		\begin{axis}[
  2144         height=0.5\textwidth,
  2146 			height=0.5\textwidth,
  2145         width=\textwidth,
  2147 			width=\textwidth,
  2146         xlabel=number of a's,
  2148 			xlabel=number of a's,
  2147         xtick={0,...,9},
  2149 			xtick={0,...,9},
  2148         ylabel=maximum size,
  2150 			ylabel=maximum size,
  2149         ymode=log,
  2151 			ymode=log,
  2150        log basis y={2}
  2152 			log basis y={2}
  2151 ]
  2153 			]
  2152         \addplot[mark=*,blue] table {re-chengsong.data};
  2154 			\addplot[mark=*,blue] table {re-chengsong.data};
  2153     \end{axis}
  2155 		\end{axis}
  2154 \end{tikzpicture}
  2156 	\end{tikzpicture}
  2155 \end{center}
  2157 \end{center}
  2156 
  2158 
  2157 For convenience we use $(\oplus_{i=1}^{n} (\underbrace{a \ldots a}_{\text{i a's}})^*)^*$
  2159 For convenience we use $(\oplus_{i=1}^{n} (\underbrace{a \ldots a}_{\text{i a's}})^*)^*$
  2158 to express $(a ^ * + (aa) ^ * + (aaa) ^ * + \ldots + 
  2160 to express $(a ^ * + (aa) ^ * + (aaa) ^ * + \ldots + 
  2159 (\underbrace{a \ldots a}_{\text{n a's}})^*$ in the below discussion.
  2161 (\underbrace{a \ldots a}_{\text{n a's}})^*$ in the below discussion.
  2163 different derivatives, despite those difference being minor.
  2165 different derivatives, despite those difference being minor.
  2164 $(\oplus_{i=1}^{n} (\underbrace{a \ldots a}_{\text{i a's}})^*)^*\backslash \underbrace{a \ldots a}_{\text{m a's}}$
  2166 $(\oplus_{i=1}^{n} (\underbrace{a \ldots a}_{\text{i a's}})^*)^*\backslash \underbrace{a \ldots a}_{\text{m a's}}$
  2165 will therefore contain the following terms (after flattening out all nested 
  2167 will therefore contain the following terms (after flattening out all nested 
  2166 alternatives):
  2168 alternatives):
  2167 \begin{center}
  2169 \begin{center}
  2168 $(\oplus_{i = 1]{n}  (\underbrace{a \ldots a}_{\text{((i - (m' \% i))\%i) a's}})\cdot  (\underbrace{a \ldots a}_{\text{i a's}})^* })\cdot (\oplus_{i=1}^{n} (\underbrace{a \ldots a}_{\text{i a's}})^*)$\\
  2170 	$(\oplus_{i = 1]{n}  (\underbrace{a \ldots a}_{\text{((i - (m' \% i))\%i) a's}})\cdot  (\underbrace{a \ldots a}_{\text{i a's}})^* })\cdot (\oplus_{i=1}^{n} (\underbrace{a \ldots a}_{\text{i a's}})^*)$\\
  2169 $(1 \leq m' \leq m )$
  2171 	$(1 \leq m' \leq m )$
  2170 \end{center}
  2172 \end{center}
  2171 These terms are distinct for $m' \leq L.C.M.(1, \ldots, n)$ (will be explained in appendix).
  2173 These terms are distinct for $m' \leq L.C.M.(1, \ldots, n)$ (will be explained in appendix).
  2172  With each new input character taking the derivative against the intermediate result, more and more such distinct
  2174 With each new input character taking the derivative against the intermediate result, more and more such distinct
  2173  terms will accumulate, 
  2175 terms will accumulate, 
  2174 until the length reaches $L.C.M.(1, \ldots, n)$.
  2176 until the length reaches $L.C.M.(1, \ldots, n)$.
  2175 $\textit{distinctBy}$ will not be able to de-duplicate any two of these terms 
  2177 $\textit{distinctBy}$ will not be able to de-duplicate any two of these terms 
  2176 $(\oplus_{i = 1}^{n}  (\underbrace{a \ldots a}_{\text{((i - (m' \% i))\%i) a's}})\cdot  (\underbrace{a \ldots a}_{\text{i a's}})^* )\cdot (\oplus_{i=1}^{n} (\underbrace{a \ldots a}_{\text{i a's}})^*)^*$\\
  2178 $(\oplus_{i = 1}^{n}  (\underbrace{a \ldots a}_{\text{((i - (m' \% i))\%i) a's}})\cdot  (\underbrace{a \ldots a}_{\text{i a's}})^* )\cdot (\oplus_{i=1}^{n} (\underbrace{a \ldots a}_{\text{i a's}})^*)^*$\\
  2177 
  2179 
  2178 $(\oplus_{i = 1}^{n}  (\underbrace{a \ldots a}_{\text{((i - (m'' \% i))\%i) a's}})\cdot  (\underbrace{a \ldots a}_{\text{i a's}})^* )\cdot (\oplus_{i=1}^{n} (\underbrace{a \ldots a}_{\text{i a's}})^*)^*$\\
  2180 $(\oplus_{i = 1}^{n}  (\underbrace{a \ldots a}_{\text{((i - (m'' \% i))\%i) a's}})\cdot  (\underbrace{a \ldots a}_{\text{i a's}})^* )\cdot (\oplus_{i=1}^{n} (\underbrace{a \ldots a}_{\text{i a's}})^*)^*$\\
  2179  where $m' \neq m''$ \\
  2181 where $m' \neq m''$ \\
  2180  as they are slightly different.
  2182 as they are slightly different.
  2181  This means that with our current simplification methods,
  2183 This means that with our current simplification methods,
  2182  we will not be able to control the derivative so that
  2184 we will not be able to control the derivative so that
  2183  $\llbracket \bderssimp{r}{s} \rrbracket$ stays polynomial %\leq O((\llbracket r\rrbacket)^c)$
  2185 $\llbracket \bderssimp{r}{s} \rrbracket$ stays polynomial %\leq O((\llbracket r\rrbacket)^c)$
  2184  as there are already exponentially many terms.
  2186 as there are already exponentially many terms.
  2185  These terms are similar in the sense that the head of those terms
  2187 These terms are similar in the sense that the head of those terms
  2186  are all consisted of sub-terms of the form: 
  2188 are all consisted of sub-terms of the form: 
  2187  $(\underbrace{a \ldots a}_{\text{j a's}})\cdot  (\underbrace{a \ldots a}_{\text{i a's}})^* $.
  2189 $(\underbrace{a \ldots a}_{\text{j a's}})\cdot  (\underbrace{a \ldots a}_{\text{i a's}})^* $.
  2188  For  $\oplus_{i=1}^{n} (\underbrace{a \ldots a}_{\text{i a's}})^*$, there will be at most
  2190 For  $\oplus_{i=1}^{n} (\underbrace{a \ldots a}_{\text{i a's}})^*$, there will be at most
  2189  $n * (n + 1) / 2$ such terms. 
  2191 $n * (n + 1) / 2$ such terms. 
  2190  For example, $(a^* + (aa)^* + (aaa)^*) ^*$'s derivatives
  2192 For example, $(a^* + (aa)^* + (aaa)^*) ^*$'s derivatives
  2191  can be described by 6 terms:
  2193 can be described by 6 terms:
  2192  $a^*$, $a\cdot (aa)^*$, $ (aa)^*$, 
  2194 $a^*$, $a\cdot (aa)^*$, $ (aa)^*$, 
  2193  $aa \cdot (aaa)^*$, $a \cdot (aaa)^*$, and $(aaa)^*$.
  2195 $aa \cdot (aaa)^*$, $a \cdot (aaa)^*$, and $(aaa)^*$.
  2194 The total number of different "head terms",  $n * (n + 1) / 2$,
  2196 The total number of different "head terms",  $n * (n + 1) / 2$,
  2195  is proportional to the number of characters in the regex 
  2197 is proportional to the number of characters in the regex 
  2196 $(\oplus_{i=1}^{n} (\underbrace{a \ldots a}_{\text{i a's}})^*)^*$.
  2198 $(\oplus_{i=1}^{n} (\underbrace{a \ldots a}_{\text{i a's}})^*)^*$.
  2197 This suggests a slightly different notion of size, which we call the 
  2199 This suggests a slightly different notion of size, which we call the 
  2198 alphabetic width:
  2200 alphabetic width:
  2199 %TODO:
  2201 %TODO:
  2200 (TODO: Alphabetic width def.)
  2202 (TODO: Alphabetic width def.)
  2201 
  2203 
  2202  
  2204 
  2203 Antimirov\parencite{Antimirov95} has proven that 
  2205 Antimirov\parencite{Antimirov95} has proven that 
  2204 $\textit{PDER}_{UNIV}(r) \leq \textit{awidth}(r)$.
  2206 $\textit{PDER}_{UNIV}(r) \leq \textit{awidth}(r)$.
  2205 where $\textit{PDER}_{UNIV}(r)$ is a set of all possible subterms
  2207 where $\textit{PDER}_{UNIV}(r)$ is a set of all possible subterms
  2206 created by doing derivatives of $r$ against all possible strings.
  2208 created by doing derivatives of $r$ against all possible strings.
  2207 If we can make sure that at any moment in our lexing algorithm our 
  2209 If we can make sure that at any moment in our lexing algorithm our 
  2224 \subsection{Syntactic Equivalence Under $\simp$}
  2226 \subsection{Syntactic Equivalence Under $\simp$}
  2225 We prove that minor differences can be annhilated
  2227 We prove that minor differences can be annhilated
  2226 by $\simp$.
  2228 by $\simp$.
  2227 For example,
  2229 For example,
  2228 \begin{center}
  2230 \begin{center}
  2229 $\simp \;(\simpALTs\; (\map \;(\_\backslash \; x)\; (\distinct \; \mathit{rs}\; \phi))) = 
  2231 	$\simp \;(\simpALTs\; (\map \;(\_\backslash \; x)\; (\distinct \; \mathit{rs}\; \phi))) = 
  2230  \simp \;(\simpALTs \;(\distinct \;(\map \;(\_ \backslash\; x) \; \mathit{rs}) \; \phi))$
  2232 	\simp \;(\simpALTs \;(\distinct \;(\map \;(\_ \backslash\; x) \; \mathit{rs}) \; \phi))$
  2231 \end{center}
  2233 \end{center}
  2232 
  2234