ChengsongTanPhdThesis/Chapters/Bitcoded2.tex
changeset 585 4969ef817d92
parent 584 1734bd5975a3
child 586 826af400b068
equal deleted inserted replaced
584:1734bd5975a3 585:4969ef817d92
    24 lexing we introduced in chapter \ref{Inj}.
    24 lexing we introduced in chapter \ref{Inj}.
    25 We then go on to prove the correctness with the improved version of 
    25 We then go on to prove the correctness with the improved version of 
    26 $\blexer$, called $\blexersimp$, by establishing 
    26 $\blexer$, called $\blexersimp$, by establishing 
    27 $\blexer \; r \; s= \blexersimp \; r \; s$ using a term rewriting system.
    27 $\blexer \; r \; s= \blexersimp \; r \; s$ using a term rewriting system.
    28 
    28 
    29 \section{Simplification for Annotated Regular Expressions}
    29 \section{Simplifications by Sulzmann and Lu}
    30 The first thing we notice in the fast growth of examples such as $(a^*a^*)^*$'s
    30 The first thing we notice in the fast growth of examples such as $(a^*a^*)^*$'s
    31 and $(a^* + (aa)^*)^*$'s derivatives is that a lot of duplicated sub-patterns
    31 and $(a^* + (aa)^*)^*$'s derivatives is that a lot of duplicated sub-patterns
    32 are scattered around different levels, and therefore requires 
    32 are scattered around different levels, and therefore requires 
    33 de-duplication at different levels:
    33 de-duplication at different levels:
    34 \begin{center}
    34 \begin{center}
    35 	$(a^*a^*)^* \rightarrow (a^*a^* + a^*)\cdot(a^*a^*)^* \rightarrow $\\
    35 	$(a^*a^*)^* \stackrel{\backslash a}{\longrightarrow} (a^*a^* + a^*)\cdot(a^*a^*)^* \stackrel{\backslash a}{\longrightarrow} $\\
    36 	$((a^*a^* + a^*) + a^*)\cdot(a^*a^*)^* + (a^*a^* + a^*)\cdot(a^*a^*)^* \rightarrow \ldots$
    36 	$((a^*a^* + a^*) + a^*)\cdot(a^*a^*)^* + (a^*a^* + a^*)\cdot(a^*a^*)^* \stackrel{\backslash a}{\longrightarrow} \ldots$
    37 \end{center}
    37 \end{center}
    38 \noindent
    38 \noindent
    39 As we have already mentioned in \ref{eqn:growth2},
    39 As we have already mentioned in \ref{eqn:growth2},
    40 a simple-minded simplification function cannot simplify
    40 a simple-minded simplification function cannot simplify
       
    41 the third regular expression in the above chain of derivative
       
    42 regular expressions:
    41 \begin{center}
    43 \begin{center}
    42 $((a^*a^* + a^*) + a^*)\cdot(a^*a^*)^* + (a^*a^* + a^*)\cdot(a^*a^*)^*$
    44 $((a^*a^* + a^*) + a^*)\cdot(a^*a^*)^* + (a^*a^* + a^*)\cdot(a^*a^*)^*$
    43 \end{center}
    45 \end{center}
    44 any further, one would expect a better simplification function to work in the 
    46 one would expect a better simplification function to work in the 
    45 following way:
    47 following way:
    46 \begin{gather*}
    48 \begin{gather*}
    47 	((a^*a^* + \underbrace{a^*}_\text{A})+\underbrace{a^*}_\text{duplicate of A})\cdot(a^*a^*)^* + 
    49 	((a^*a^* + \underbrace{a^*}_\text{A})+\underbrace{a^*}_\text{duplicate of A})\cdot(a^*a^*)^* + 
    48 	\underbrace{(a^*a^* + a^*)\cdot(a^*a^*)^*}_\text{further simp removes this}.\\
    50 	\underbrace{(a^*a^* + a^*)\cdot(a^*a^*)^*}_\text{further simp removes this}.\\
    49 	\bigg\downarrow \\
    51 	\bigg\downarrow \\
    63 algorithm: their simplification does 
    65 algorithm: their simplification does 
    64 not work!
    66 not work!
    65 We quote their $\textit{simp}$ function verbatim here:
    67 We quote their $\textit{simp}$ function verbatim here:
    66 \begin{center}
    68 \begin{center}
    67 	\begin{tabular}{lcl}
    69 	\begin{tabular}{lcl}
    68 		$\simp\; _{bs}(_{bs'}\ONE \cdot r)$ & $\dn$ & 
    70 		$\simpsulz \; _{bs}(_{bs'}\ONE \cdot r)$ & $\dn$ & 
    69 		$\textit{if} \; (\textit{zeroable} \; r)\; \textit{then} \;\; \ZERO$\\
    71 		$\textit{if} \; (\textit{zeroable} \; r)\; \textit{then} \;\; \ZERO$\\
    70 						   & &$\textit{else}\;\; \fuse \; (bs@ bs') \; r$\\
    72 						   & &$\textit{else}\;\; \fuse \; (bs@ bs') \; r$\\
    71 		$\simp\;(_{bs}r_1\cdot r_2)$ & $\dn$ & $\textit{if} 
    73 		$\simpsulz \;(_{bs}r_1\cdot r_2)$ & $\dn$ & $\textit{if} 
    72 		\; (\textit{zeroable} \; r_1 \; \textit{or} \; \textit{zeroable}\; r_2)\;
    74 		\; (\textit{zeroable} \; r_1 \; \textit{or} \; \textit{zeroable}\; r_2)\;
    73 		\textit{then} \;\; \ZERO$\\
    75 		\textit{then} \;\; \ZERO$\\
    74 					     & & $\textit{else}\;\;_{bs}((\simp\;r_1)\cdot
    76 					     & & $\textit{else}\;\;_{bs}((\simpsulz \;r_1)\cdot
    75 					     (\simp\; r_2))$\\
    77 					     (\simpsulz \; r_2))$\\
    76 		$\simp \; _{bs}\sum []$ & $\dn$ & $\ZERO$\\
    78 		$\simpsulz  \; _{bs}\sum []$ & $\dn$ & $\ZERO$\\
    77 		$\simp \; _{bs}\sum ((_{bs'}\sum rs_1) :: rs_2)$ & $\dn$ &
    79 		$\simpsulz  \; _{bs}\sum ((_{bs'}\sum rs_1) :: rs_2)$ & $\dn$ &
    78 		$_{bs}\sum ((\map \; (\fuse \; bs')\; rs_1) @ rs_2)$\\
    80 		$_{bs}\sum ((\map \; (\fuse \; bs')\; rs_1) @ rs_2)$\\
    79 		$\simp \; _{bs}\sum[r]$ & $\dn$ & $\fuse \; bs \; (\simp \; r)$\\
    81 		$\simpsulz  \; _{bs}\sum[r]$ & $\dn$ & $\fuse \; bs \; (\simpsulz  \; r)$\\
    80 		$\simp \; _{bs}\sum(r::rs)$ & $\dn$ & $_{bs}\sum 
    82 		$\simpsulz  \; _{bs}\sum(r::rs)$ & $\dn$ & $_{bs}\sum 
    81 		(\nub \; (\filter \; (\not \circ \zeroable)\;((\simp \; r) :: \map \; \simp \; rs)))$\\ 
    83 		(\nub \; (\filter \; (\not \circ \zeroable)\;((\simpsulz  \; r) :: \map \; \simpsulz  \; rs)))$\\ 
    82 		
    84 		
    83 	\end{tabular}
    85 	\end{tabular}
    84 \end{center}
    86 \end{center}
    85 \noindent
    87 \noindent
    86 the $\textit{zeroable}$ predicate 
    88 the $\textit{zeroable}$ predicate 
    97 		$\zeroable\;_{bs}\ONE$ & $\dn$ & $\textit{false}$\\
    99 		$\zeroable\;_{bs}\ONE$ & $\dn$ & $\textit{false}$\\
    98 		$\zeroable\;_{bs}\ZERO$ & $\dn$ & $\textit{true}$
   100 		$\zeroable\;_{bs}\ZERO$ & $\dn$ & $\textit{true}$
    99 	\end{tabular}
   101 	\end{tabular}
   100 \end{center}
   102 \end{center}
   101 \noindent
   103 \noindent
   102 They suggested that the $\simp$ function should be
   104 They suggested that the $\simpsulz $ function should be
   103 applied repeatedly until a fixpoint is reached.
   105 applied repeatedly until a fixpoint is reached.
   104 We implemented the algorithm as required in Scala, 
   106 We call this construction $\textit{sulzSimp}$:
   105 and found that final derivative regular expression
   107 \begin{center}
       
   108 	\begin{tabular}{lcl}
       
   109 		$\textit{sulzSimp} \; r$ & $\dn$ & 
       
   110 		$\textit{while}((\simpsulz  \; r)\; \neq \; r)$ \\
       
   111 		& & $\quad r := \simpsulz  \; r$\\
       
   112 		& & $\textit{return} \; r$
       
   113 	\end{tabular}
       
   114 \end{center}
       
   115 We call the operation of alternately 
       
   116 applying derivatives and simplifications
       
   117 (until the string is exhausted) Sulz-simp-derivative,
       
   118 written $\backslash_{sulzSimp}$:
       
   119 \begin{center}
       
   120 \begin{tabular}{lcl}
       
   121 	$r \backslash_{sulzSimp} (c\!::\!s) $ & $\dn$ & $(\textit{sulzSimp} \; (r \backslash c)) \backslash_{sulzSimp}\, s$ \\
       
   122 $r \backslash_{sulzSimp} [\,] $ & $\dn$ & $r$
       
   123 \end{tabular}
       
   124 \end{center}
       
   125 \noindent
       
   126 After the derivatives have been taken, the bitcodes
       
   127 are extracted and decoded in the same manner
       
   128 as $\blexer$:
       
   129 \begin{center}
       
   130 \begin{tabular}{lcl}
       
   131   $\textit{blexer\_sulzSimp}\;r\,s$ & $\dn$ &
       
   132       $\textit{let}\;a = (r^\uparrow)\backslash_{sulzSimp}\, s\;\textit{in}$\\                
       
   133   & & $\;\;\textit{if}\; \textit{bnullable}(a)$\\
       
   134   & & $\;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,a)\,r$\\
       
   135   & & $\;\;\textit{else}\;\textit{None}$
       
   136 \end{tabular}
       
   137 \end{center}
       
   138 \noindent
       
   139 We implemented this lexing algorithm in Scala, 
       
   140 and found that the final derivative regular expression
   106 size grows exponentially fast:
   141 size grows exponentially fast:
   107 \begin{center}
       
   108 \begin{figure}[H]
   142 \begin{figure}[H]
       
   143 	\centering
   109 \begin{tikzpicture}
   144 \begin{tikzpicture}
   110 \begin{axis}[
   145 \begin{axis}[
   111     xlabel={$n$},
   146     xlabel={$n$},
   112     ylabel={time in secs},
   147     ylabel={size},
   113     ymode = log,
   148     ymode = log,
   114     legend entries={Sulzmann and Lu's $\simp$},  
   149     legend entries={Final Derivative Size},  
   115     legend pos=north west,
   150     legend pos=north west,
   116     legend cell align=left]
   151     legend cell align=left]
   117 \addplot[red,mark=*, mark options={fill=white}] table {SulzmannLuLexer.data};
   152 \addplot[red,mark=*, mark options={fill=white}] table {SulzmannLuLexer.data};
   118 \end{axis}
   153 \end{axis}
   119 \end{tikzpicture} 
   154 \end{tikzpicture} 
   120 \caption{Matching the regular expression $(a^*a^*)^*$ against strings of the form
   155 \caption{Lexing the regular expression $(a^*a^*)^*$ against strings of the form
   121 $\protect\underbrace{aa\ldots a}_\text{n \textit{a}s}
   156 $\protect\underbrace{aa\ldots a}_\text{n \textit{a}s}
   122 $ using Sulzmann and Lu's lexer algorithm}\label{SulzmannLuLexer}
   157 $ using Sulzmann and Lu's lexer}\label{SulzmannLuLexer}
   123 \end{figure}
   158 \end{figure}
   124 \end{center}
       
   125 \noindent
   159 \noindent
   126 At $n= 20$ we already get an out of memory error with Scala's normal 
   160 At $n= 20$ we already get an out of memory error with Scala's normal 
   127 JVM heap size settings,
   161 JVM heap size settings.
   128 which seems a counterexample for their linear complexity claim:
   162 In fact their simplification does not improve over
       
   163 the simple-minded simplifications we have shown in \ref{fig:BetterWaterloo}.
       
   164 The time required also grows exponentially:
       
   165 \begin{figure}[H]
       
   166 	\centering
       
   167 \begin{tikzpicture}
       
   168 \begin{axis}[
       
   169     xlabel={$n$},
       
   170     ylabel={time},
       
   171     ymode = log,
       
   172     legend entries={time in secs},  
       
   173     legend pos=north west,
       
   174     legend cell align=left]
       
   175 \addplot[red,mark=*, mark options={fill=white}] table {SulzmannLuLexerTime.data};
       
   176 \end{axis}
       
   177 \end{tikzpicture} 
       
   178 \caption{Lexing the regular expression $(a^*a^*)^*$ against strings of the form
       
   179 $\protect\underbrace{aa\ldots a}_\text{n \textit{a}s}
       
   180 $ using Sulzmann and Lu's lexer}\label{SulzmannLuLexerTime}
       
   181 \end{figure}
       
   182 \noindent
       
   183 which seems like a counterexample for 
       
   184 their linear complexity claim:
   129 \begin{quote}\it
   185 \begin{quote}\it
   130 Linear-Time Complexity Claim It is easy to see that each call of one of the functions/operations\b,simp,fuse,mkEpsBCandisPhi leadstosubcallswhose number is bound by the size of the regular expression involved. We claim that thanks to aggressively applying simp this size remains finite. Hence, we can argue that the above mentioned functions/operations have constant time complexity which implies that we can incrementally compute bit-coded parse trees in linear time in the size of the input. We yet need to work out detailed estimates regarding the space complexity of our algorithm.
   186 Linear-Time Complexity Claim \\It is easy to see that each call of one of the functions/operations:
   131 \end{quote}  
   187 simp, fuse, mkEpsBC and isPhi leads to subcalls whose number is bound by the size of the regular expression involved. We claim that thanks to aggressively applying simp this size remains finite. Hence, we can argue that the above mentioned functions/operations have constant time complexity which implies that we can incrementally compute bit-coded parse trees in linear time in the size of the input. 
   132 
   188 \end{quote}
   133 
   189 \noindent
   134 
   190 The assumption that the size of the regular expressions
   135 Despite that we have already implemented the simple-minded simplifications 
   191 in the algorithm
   136 such as throwing away useless $\ONE$s and $\ZERO$s.
   192 would stay below a finite constant is not true.
   137 The simplification rule $r + r \rightarrow $ cannot make a difference either
   193 In addition to that, even if the regular expressions' sizes
   138 since it only removes duplicates on the same level, not something like 
   194 do stay finite, one has to take into account that
   139 $(r+a)+r \rightarrow r+a$.
   195 the $\simpsulz$ function is applied many times
   140 This requires us to break up nested alternatives into lists, for example 
   196 in each derivative step, and that number is not necessarily
   141 using the flatten operation similar to the one defined for any function language's
   197 a constant with respect to the size of the regular expression.
   142 list datatype:
   198 To not get ``caught off guard'' by
   143 
   199 these counterexamples,
       
   200 one needs to be more careful when designing the
       
   201 simplification function and making claims about it.
       
   202 
       
   203 \section{Our $\textit{Simp}$ Function}
       
   204 We will now introduce our simplification function,
       
   205 by making a contrast with $\simpsulz$.
       
   206 We describe
       
   207 the ideas behind components in their algorithm 
       
   208 and why they fail to achieve the desired effect, followed
       
   209 by our solution. These solutions come with correctness
       
   210 statements that are backed up by formal proofs.
       
   211 \subsection{Flattening Nested Alternatives}
       
   212 The idea behind the 
       
   213 \begin{center}
       
   214 $\simpsulz  \; _{bs}\sum ((_{bs'}\sum rs_1) :: rs_2) \quad \dn \quad
       
   215 	       _{bs}\sum ((\map \; (\fuse \; bs')\; rs_1) @ rs_2)$
       
   216 \end{center}
       
   217 clause is that it allows
       
   218 duplicate removal of regular expressions at different
       
   219 levels.
       
   220 For example, this would help with the
       
   221 following simplification:
       
   222 
       
   223 \begin{center}
       
   224 $(a+r)+r \longrightarrow a+r$
       
   225 \end{center}
       
   226 The problem here is that only the head element
       
   227 is ``spilled out'',
       
   228 whereas we would want to flatten
       
   229 an entire list to open up possibilities for further simplifications.\\
       
   230 Not flattening the rest of the elements also means that
       
   231 the later de-duplication process 
       
   232 does not fully remove apparent duplicates.
       
   233 For example,
       
   234 using $\simpsulz$ we could not 
       
   235 simplify
       
   236 \begin{center}
       
   237 $((a^* a^*)+ (a^* + a^*))\cdot (a^*a^*)^*+
       
   238 ((a^*a^*)+a^*)\cdot (a^*a^*)^*$
       
   239 \end{center}
       
   240 due to the underlined part not being in the first element
       
   241 of the alternative.\\
       
   242 We define a flatten operation that flattens not only 
       
   243 the first regular expression of an alternative,
       
   244 but the entire list: 
   144  \begin{center}
   245  \begin{center}
   145   \begin{tabular}{@{}lcl@{}}
   246   \begin{tabular}{@{}lcl@{}}
   146   $\textit{flts} \; (_{bs}\sum \textit{as}) :: \textit{as'}$ & $\dn$ & $(\textit{map} \;
   247   $\textit{flts} \; (_{bs}\sum \textit{as}) :: \textit{as'}$ & $\dn$ & $(\textit{map} \;
   147      (\textit{fuse}\;bs)\; \textit{as}) \; @ \; \textit{flts} \; as' $ \\
   248      (\textit{fuse}\;bs)\; \textit{as}) \; @ \; \textit{flts} \; as' $ \\
   148   $\textit{flts} \; \ZERO :: as'$ & $\dn$ & $ \textit{flts} \;  \textit{as'} $ \\
   249   $\textit{flts} \; \ZERO :: as'$ & $\dn$ & $ \textit{flts} \;  \textit{as'} $ \\
   149     $\textit{flts} \; a :: as'$ & $\dn$ & $a :: \textit{flts} \; \textit{as'}$ \quad(otherwise) 
   250     $\textit{flts} \; a :: as'$ & $\dn$ & $a :: \textit{flts} \; \textit{as'}$ \quad(otherwise) 
   150 \end{tabular}    
   251 \end{tabular}    
   151 \end{center}  
   252 \end{center}  
   152 
   253 \noindent
   153 \noindent
   254 Our $\flts$ operation 
   154 There is a minor difference though, in that our $\flts$ operation defined by us
   255 also throws away $\ZERO$s
   155 also throws away $\ZERO$s.
   256 as they do not contribute to a lexing result.
   156 For a flattened list of regular expressions, a de-duplication can be done efficiently:
   257 \subsection{Duplicate Removal}
   157 
   258 After flattening is done, we are ready to deduplicate.
   158 
   259 The de-duplication function is called $\distinctBy$,
       
   260 and that is where we make our second improvement over
       
   261 Sulzmann and Lu's.
       
   262 The process goes as follows:
       
   263 \begin{center}
       
   264 $rs \stackrel{\textit{flts}}{\longrightarrow} 
       
   265 rs_{flat} 
       
   266 \xrightarrow{\distinctBy \; 
       
   267 rs_{flat} \; \rerases\; \varnothing} rs_{distinct}$
       
   268 %\stackrel{\distinctBy \; 
       
   269 %rs_{flat} \; \erase\; \varnothing}{\longrightarrow} \; rs_{distinct}$
       
   270 \end{center}
       
   271 where the $\distinctBy$ function is defined as:
   159 \begin{center}
   272 \begin{center}
   160 	\begin{tabular}{@{}lcl@{}}
   273 	\begin{tabular}{@{}lcl@{}}
   161 		$\distinctBy \; [] \; f\; acc $ & $ =$ & $ []$\\
   274 		$\distinctBy \; [] \; f\; acc $ & $ =$ & $ []$\\
   162 		$\distinctBy \; (x :: xs) \; f \; acc$ & $=$ & \\
   275 		$\distinctBy \; (x :: xs) \; f \; acc$ & $=$ & $\quad \textit{if} (f \; x \in acc)\;\; \textit{then} \;\; \distinctBy \; xs \; f \; acc$\\
   163 						       & & $\quad \textit{if} (f \; x) \in acc \textit{then} \distinctBy \; xs \; f \; acc$\\
   276 						       & & $\quad \textit{else}\;\; x :: (\distinctBy \; xs \; f \; (\{f \; x\} \cup acc))$ 
   164 						       & & $\quad \textit{else} x :: (\distinctBy \; xs \; f \; (\{f \; x\} \cup acc))$ 
       
   165 	\end{tabular}
   277 	\end{tabular}
   166 \end{center}
   278 \end{center}
   167 \noindent
   279 \noindent
   168 The reason we define a distinct function under a mapping $f$ is because
   280 The reason we define a distinct function under a mapping $f$ is because
   169 we want to eliminate regular expressions that are the same syntactically,
   281 we want to eliminate regular expressions that are syntactically the same,
   170 but having different bit-codes (more on the reason why we can do this later).
   282 but with different bit-codes.
   171 To achieve this, we call $\erase$ as the function $f$ during the distinction
   283 For example, we can remove the second $a^*a^*$ from
   172 operation.
   284 $_{ZSZ}a^*a^* + _{SZZ}a^*a^*$, because it
       
   285 represents a match with a shorter initial sub-match 
       
   286 (and therefore is definitely not POSIX),
       
   287 and will be discarded by $\bmkeps$ later.
       
   288 \begin{center}
       
   289 	$_{ZSZ}\underbrace{a^*}_{ZS:\; match \; 1\; times\quad}\underbrace{a^*}_{Z: \;match\; 1 \;times} + 
       
   290 	_{SZZ}\underbrace{a^*}_{S: \; match \; 0 \; times\quad}\underbrace{a^*}_{ZZ: \; match \; 2 \; times}
       
   291 	$
       
   292 \end{center}
       
   293 %$_{bs1} r_1 + _{bs2} r_2 \text{where} (r_1)_{\downarrow} = (r_2)_{\downarrow}$
       
   294 Due to the way our algorithm works,
       
   295 the matches that conform to the POSIX standard 
       
   296 will always be placed further to the left. When we 
       
   297 traverse the list from left to right,
       
   298 regular expressions we have already seen
       
   299 will definitely not contribute to a POSIX value,
       
   300 even if they are attached with different bitcodes.
       
   301 These duplicates therefore need to be removed.
       
   302 To achieve this, we call $\rerases$ as the function $f$ during the distinction
       
   303 operation.\\
       
   304 $\rerases$ is very similar to $\erase$, except that it preserves the structure
       
   305 when erasing an alternative regular expression.
       
   306 The reason why we use $\rerases$ instead of $\erase$ is that
       
   307 it keeps the structures of alternative 
       
   308 annotated regular expressions
       
   309 whereas $\erase$ would turn it back into a binary structure.
       
   310 Not having to mess with the structure 
       
   311 greatly simplifies the finiteness proof in chapter \ref{Finite}.
       
   312 We give the definitions of $\rerases$ here together with
       
   313 the new datatype used by $\rerases$ (as our plain
       
   314 regular expression datatype does not allow non-binary alternatives),
       
   315 and explain in detail
       
   316 why we want it in the next chapter.
       
   317 For the moment the reader can just think of 
       
   318 $\rerases$ as $\erase$ and $\rrexp$ as plain regular expressions.
       
   319 \[			\rrexp ::=   \RZERO \mid  \RONE
       
   320 			 \mid  \RCHAR{c}  
       
   321 			 \mid  \RSEQ{r_1}{r_2}
       
   322 			 \mid  \RALTS{rs}
       
   323 			 \mid \RSTAR{r}        
       
   324 \]
       
   325 The notation of $\rerases$ also follows that of $\erase$,
       
   326 which is a postfix operator written as a subscript,
       
   327 except that it has an \emph{r} attached to it to distinguish against $\erase$:
       
   328 \begin{center}
       
   329 \begin{tabular}{lcl}
       
   330 $\rerase{\ZERO}$ & $\dn$ & $\RZERO$\\
       
   331 $\rerase{_{bs}\ONE}$ & $\dn$ & $\RONE$\\
       
   332 	$\rerase{_{bs}\mathbf{c}}$ & $\dn$ & $\RCHAR{c}$\\
       
   333 $\rerase{_{bs}r_1\cdot r_2}$ & $\dn$ & $\RSEQ{\rerase{r_1}}{\rerase{r_2}}$\\
       
   334 $\rerase{_{bs}\sum as}$ & $\dn$ & $\RALTS{\map \; \rerase{\_} \; as}$\\
       
   335 $\rerase{_{bs} a ^*}$ & $\dn$ & $\rerase{a}^*$
       
   336 \end{tabular}
       
   337 \end{center}
       
   338 
       
   339 \subsection{Putting Things Together}
   173 A recursive definition of our  simplification function 
   340 A recursive definition of our  simplification function 
   174 that looks somewhat similar to our Scala code is given below:
   341 is given below:
   175 
   342 %that looks somewhat similar to our Scala code is 
   176 \begin{center}
   343 \begin{center}
   177   \begin{tabular}{@{}lcl@{}}
   344   \begin{tabular}{@{}lcl@{}}
   178    
   345    
   179 	  $\textit{bsimp} \; (_{bs}a_1\cdot a_2)$ & $\dn$ & $ \textit{bsimp}_{ASEQ} \; bs \;(\textit{bsimp} \; a_1) \; (\textit{bsimp}  \; a_2)  $ \\
   346 	  $\textit{bsimp} \; (_{bs}a_1\cdot a_2)$ & $\dn$ & $ \textit{bsimp}_{ASEQ} \; bs \;(\textit{bsimp} \; a_1) \; (\textit{bsimp}  \; a_2)  $ \\
   180 	  $\textit{bsimp} \; (_{bs}\sum \textit{as})$ & $\dn$ & $\textit{bsimp}_{ALTS} \; \textit{bs} \; (\textit{distinctBy} \; ( \textit{flatten} ( \textit{map} \; bsimp \; as)) \; \erase \; \phi) $ \\
   347 	  $\textit{bsimp} \; (_{bs}\sum \textit{as})$ & $\dn$ & $\textit{bsimp}_{ALTS} \; \textit{bs} \; (\textit{distinctBy} \; ( \textit{flatten} ( \textit{map} \; bsimp \; as)) \; \rerases \; \varnothing) $ \\
   181    $\textit{bsimp} \; a$ & $\dn$ & $\textit{a} \qquad \textit{otherwise}$   
   348    $\textit{bsimp} \; a$ & $\dn$ & $\textit{a} \qquad \textit{otherwise}$   
   182 \end{tabular}    
   349 \end{tabular}    
   183 \end{center}    
   350 \end{center}    
   184 
   351 
   185 \noindent
   352 \noindent
   186 The simplification (named $\bsimp$ for \emph{b}it-coded) does a pattern matching on the regular expression.
   353 The simplification (named $\textit{bsimp}$ for \emph{b}it-coded) 
       
   354 does a pattern matching on the regular expression.
   187 When it detected that the regular expression is an alternative or
   355 When it detects that the regular expression is an alternative or
   188 sequence, it will try to simplify its children regular expressions
   356 sequence, it will try to simplify its children regular expressions
   189 recursively and then see if one of the children turns into $\ZERO$ or
   357 recursively and then see if one of the children turns into $\ZERO$ or
   190 $\ONE$, which might trigger further simplification at the current level.
   358 $\ONE$, which might trigger further simplification at the current level.
   191 Current level simplifications are handled by the function $\textit{bsimp}_{ASEQ}$,
   359 Current level simplifications are handled by the function $\textit{bsimp}_{ASEQ}$,
   201 \end{center}
   369 \end{center}
   202 \noindent
   370 \noindent
   203 The most involved part is the $\sum$ clause, where we first call $\flts$ on
   371 The most involved part is the $\sum$ clause, where we first call $\flts$ on
   204 the simplified children regular expression list $\textit{map}\; \textit{bsimp}\; \textit{as}$.
   372 the simplified children regular expression list $\textit{map}\; \textit{bsimp}\; \textit{as}$,
   205 and then call $\distinctBy$ on that list, the predicate determining whether two 
   373 and then call $\distinctBy$ on that list, the predicate determining whether two 
   206 elements are the same is $\erase \; r_1 = \erase\; r_2$.
   374 elements are the same is $\rerases \; r_1 = \rerases\; r_2$.
   207 Finally, depending on whether the regular expression list $as'$ has turned into a
   375 Finally, depending on whether the regular expression list $as'$ has turned into a
   208 singleton or empty list after $\flts$ and $\distinctBy$, $\textit{bsimp}_{AALTS}$
   376 singleton or empty list after $\flts$ and $\distinctBy$, $\textit{bsimp}_{AALTS}$
   209 decides whether to keep the current level constructor $\sum$ as it is, and 
   377 decides whether to keep the current level constructor $\sum$ as it is, and 
   210 removes it when there are less than two elements:
   378 removes it when there are less than two elements:
   211 \begin{center}
   379 \begin{center}
   223 \begin{center}
   391 \begin{center}
   224 	\begin{tabular}{lcl}
   392 	\begin{tabular}{lcl}
   225 		$r \backslash_{bsimp} s$ & $\dn$ & $\textit{bsimp}(r \backslash s)$
   393 		$r \backslash_{bsimp} s$ & $\dn$ & $\textit{bsimp}(r \backslash s)$
   226 	\end{tabular}
   394 	\end{tabular}
   227 \end{center}
   395 \end{center}
   228 Following previous notation of  natural
   396 %Following previous notations
   229 extension from derivative w.r.t.~character to derivative
   397 %when extending from derivatives w.r.t.~character to derivative
   230 w.r.t.~string, we define the derivative that nests simplifications with derivatives:%\comment{simp in  the [] case?}
   398 %w.r.t.~string, we define the derivative that nests simplifications 
   231 
   399 %with derivatives:%\comment{simp in  the [] case?}
       
   400 We extend this from character to string:
   232 \begin{center}
   401 \begin{center}
   233 \begin{tabular}{lcl}
   402 \begin{tabular}{lcl}
   234 $r \backslash_{bsimps} (c\!::\!s) $ & $\dn$ & $(r \backslash_{bsimp}\, c) \backslash_{bsimps}\, s$ \\
   403 $r \backslash_{bsimps} (c\!::\!s) $ & $\dn$ & $(r \backslash_{bsimp}\, c) \backslash_{bsimps}\, s$ \\
   235 $r \backslash_{bsimps} [\,] $ & $\dn$ & $r$
   404 $r \backslash_{bsimps} [\,] $ & $\dn$ & $r$
   236 \end{tabular}
   405 \end{tabular}
   237 \end{center}
   406 \end{center}
   238 
   407 
   239 \noindent
   408 \noindent
   240 Extracting bit-codes from the derivatives that had been simplified repeatedly after 
   409 The lexer that extracts bitcodes from the 
   241 each derivative run, the simplified $\blexer$, called $\blexersimp$,
   410 derivatives with simplifications from our $\simp$ function
   242 is defined as 
   411 is called $\blexersimp$:
   243  \begin{center}
   412 \begin{center}
   244 \begin{tabular}{lcl}
   413 \begin{tabular}{lcl}
   245   $\textit{blexer\_simp}\;r\,s$ & $\dn$ &
   414   $\textit{blexer\_simp}\;r\,s$ & $\dn$ &
   246       $\textit{let}\;a = (r^\uparrow)\backslash_{simp}\, s\;\textit{in}$\\                
   415       $\textit{let}\;a = (r^\uparrow)\backslash_{simp}\, s\;\textit{in}$\\                
   247   & & $\;\;\textit{if}\; \textit{bnullable}(a)$\\
   416   & & $\;\;\textit{if}\; \textit{bnullable}(a)$\\
   248   & & $\;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,a)\,r$\\
   417   & & $\;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,a)\,r$\\
   249   & & $\;\;\textit{else}\;\textit{None}$
   418   & & $\;\;\textit{else}\;\textit{None}$
   250 \end{tabular}
   419 \end{tabular}
   251 \end{center}
   420 \end{center}
   252 
   421 
   253 \noindent
   422 \noindent
   254 This algorithm keeps the regular expression size small, for example,
   423 This algorithm keeps the regular expression size small.
   255 with this simplification our previous $(a + aa)^*$ example's fast growth (over
   424 
   256 $10^5$ nodes at around $20$ input length)
   425 
   257 will be reduced to just 17 and stays constant, no matter how long the
   426 \subsection{$(a+aa)^*$ and $(a^*\cdot a^*)^*$  against 
       
   427 $\protect\underbrace{aa\ldots a}_\text{n \textit{a}s}$ After Simplification}
       
   428 For example,
       
   429 with our simplification the
       
   430 previous $(a^*a^*)^*$ example
       
   431 where $\simpsulz$ could not
       
   432 stop the fast growth (over
       
   433 3 million nodes just below $20$ input length)
       
   434 will be reduced to just 15 and stay constant, no matter how long the
   258 input string is.
   435 input string is.
   259 We show some graphs to better demonstrate this imrpovement.
   436 This is demonstrated in the graphs below.
   260 
   437 \begin{figure}[H]
   261 
       
   262 \section{$(a+aa)^*$ and $(a^*\cdot a^*)^*$  against $\protect\underbrace{aa\ldots a}_\text{n \textit{a}s}$ After Simplification}
       
   263 For $(a+aa)^*$, it used to grow to over $9000$ nodes with simple-minded simplification
       
   264 at only around $15$ input characters:
       
   265 \begin{center}
       
   266 \begin{tabular}{ll}
       
   267 \begin{tikzpicture}
       
   268 \begin{axis}[
       
   269     xlabel={$n$},
       
   270     ylabel={derivative size},
       
   271         width=7cm,
       
   272     height=4cm, 
       
   273     legend entries={Lexer with $\textit{bsimp}$},  
       
   274     legend pos=  south east,
       
   275     legend cell align=left]
       
   276 \addplot[red,mark=*, mark options={fill=white}] table {a_aa_star_bsimp.data};
       
   277 \end{axis}
       
   278 \end{tikzpicture} %\label{fig:BitcodedLexer}
       
   279 &
       
   280 \begin{tikzpicture}
       
   281 \begin{axis}[
       
   282     xlabel={$n$},
       
   283     ylabel={derivative size},
       
   284     width = 7cm,
       
   285     height = 4cm,
       
   286     legend entries={Lexer without $\textit{bsimp}$},  
       
   287     legend pos=  north west,
       
   288     legend cell align=left]
       
   289 \addplot[red,mark=*, mark options={fill=white}] table {a_aa_star_easysimp.data};
       
   290 \end{axis}
       
   291 \end{tikzpicture} 
       
   292 \end{tabular}
       
   293 \end{center}
       
   294 And for $(a^*\cdot a^*)^*$, unlike in \ref{fig:BetterWaterloo}, the size with simplification now stay nicely constant with current
       
   295 simplification rules (we put the graphs together to show the contrast)
       
   296 \begin{center}
   438 \begin{center}
   297 \begin{tabular}{ll}
   439 \begin{tabular}{ll}
   298 \begin{tikzpicture}
   440 \begin{tikzpicture}
   299 \begin{axis}[
   441 \begin{axis}[
   300     xlabel={$n$},
   442     xlabel={$n$},
   312 \begin{axis}[
   454 \begin{axis}[
   313     xlabel={$n$},
   455     xlabel={$n$},
   314     ylabel={derivative size},
   456     ylabel={derivative size},
   315     width = 7cm,
   457     width = 7cm,
   316     height = 4cm,
   458     height = 4cm,
   317     legend entries={Lexer without $\textit{bsimp}$},  
   459     legend entries={Lexer with $\simpsulz$},  
   318     legend pos=  north west,
   460     legend pos=  north west,
   319     legend cell align=left]
   461     legend cell align=left]
   320 \addplot[red,mark=*, mark options={fill=white}] table {BetterWaterloo.data};
   462 \addplot[red,mark=*, mark options={fill=white}] table {BetterWaterloo.data};
   321 \end{axis}
   463 \end{axis}
   322 \end{tikzpicture} 
   464 \end{tikzpicture} 
   323 \end{tabular}
   465 \end{tabular}
   324 \end{center}
   466 \end{center}
   325 
   467 \caption{Our Improvement over Sulzmann and Lu's in terms of size}
       
   468 \end{figure}
       
   469 \noindent
       
   470 Given the size difference, it is not
       
   471 surprising that our $\blexersimp$ significantly outperforms
       
   472 $\textit{blexer\_sulzSimp}$.
       
   473 In the next section we are going to establish the
       
   474 first important property of our lexer--the correctness.
   326 %----------------------------------------------------------------------------------------
   475 %----------------------------------------------------------------------------------------
   327 %	SECTION rewrite relation
   476 %	SECTION rewrite relation
   328 %----------------------------------------------------------------------------------------
   477 %----------------------------------------------------------------------------------------
   329 \section{The Rewriting Relation $\rrewrite$($\rightsquigarrow$)}
   478 \section{Correctness of $\blexersimp$}
       
   479 In this section we give details
       
   480 of the correctness proof of $\blexersimp$,
       
   481 an important contribution of this thesis.\\
       
   482 We first introduce the rewriting relation \emph{rrewrite}
       
   483 ($\rrewrite$) between two regular expressions,
       
   484 which expresses an atomic
       
   485 simplification step from the left-hand-side
       
   486 to the right-hand-side.
       
   487 We then prove properties about
       
   488 this rewriting relation and its reflexive transitive closure.
       
   489 Finally we leverage these properties to show
       
   490 an equivalence between the internal data structures of 
       
   491 $\blexer$ and $\blexersimp$.
       
   492 
       
   493 \subsection{The Rewriting Relation $\rrewrite$($\rightsquigarrow$)}
   330 In the $\blexer$'s correctness proof, we
   494 In the $\blexer$'s correctness proof, we
   331 did not directly derive the fact that $\blexer$ gives out the POSIX value,
   495 did not directly derive the fact that $\blexer$ gives out the POSIX value,
   332 but first prove that $\blexer$ is linked with $\lexer$.
   496 but first proved that $\blexer$ is linked with $\lexer$.
   333 Then we re-use
   497 Then we re-use
   334 the correctness of $\lexer$.
   498 the correctness of $\lexer$
   335 Here we follow suit by first proving that
   499 to obtain
       
   500 \begin{center}
       
   501 	$(r, s) \rightarrow v \;\; \textit{iff} \;\; \blexer \; r \;s = v$.
       
   502 \end{center}
       
   503 Here we apply this
       
   504 modularised technique again
       
   505 by first proving that
   336 $\blexersimp \; r \; s $ 
   506 $\blexersimp \; r \; s $ 
   337 produces the same output as $\blexer \; r\; s$,
   507 produces the same output as $\blexer \; r\; s$,
   338 and then putting it together with 
   508 and then piecing it together with 
   339 $\blexer$'s correctness result
   509 $\blexer$'s correctness to achieve our main
   340 ($(r, s) \rightarrow v \implies \blexer \; r \;s = v$)
   510 theorem:\footnote{The case when 
   341 to obtain half of the correctness property (the other half
   511 $s$ is not in $L \; r$ is routine to establish.}
   342 being when $s$ is not $L \; r$ which is routine to establish)
   512 \begin{center}
   343 \begin{center}
   513 	$(r, s) \rightarrow v \; \;   \textit{iff} \;\;  \blexersimp \; r \; s = v$
   344 	$(r, s) \rightarrow v \implies \blexersimp \; r \; s = v$
   514 \end{center}
   345 \end{center}
   515 \noindent
   346 \noindent
   516 The overall idea for the proof
   347 because it makes the proof simpler
   517 of $\blexer \;r \;s = \blexersimp \; r \;s$ 
   348 The overall idea for the correctness of our simplified bitcoded lexer
   518 is that the transition from $r$ to $\textit{bsimp}\; r$ can be
   349 \noindent
   519 broken down into finitely many rewrite steps:
   350 is that the $\textit{bsimp}$ will not change the regular expressions so much that
   520 \begin{center}
   351 it becomes impossible to extract the $\POSIX$ values.
   521 	$r \rightsquigarrow^* \textit{bsimp} \; r$
   352 To capture this "similarity" between unsimplified regular expressions and simplified ones,
   522 \end{center}
   353 we devise the rewriting relation $\rrewrite$, written infix as $\rightsquigarrow$:
   523 where each rewrite step, written $\rightsquigarrow$,
   354 
   524 is an ``atomic'' simplification that
   355 \begin{center}
   525 cannot be broken down any further:
       
   526 \begin{figure}[H]
   356 \begin{mathpar}
   527 \begin{mathpar}
   357 	\inferrule{}{_{bs} \ZERO \cdot r_2 \rightsquigarrow \ZERO\\}
   528 	\inferrule * [Right = $S\ZERO_l$]{\vspace{0em}}{_{bs} \ZERO \cdot r_2 \rightsquigarrow \ZERO\\}
   358 
   529 
   359 	\inferrule{}{_{bs} r_1 \cdot \ZERO \rightsquigarrow \ZERO\\}
   530 	\inferrule * [Right = $S\ZERO_r$]{\vspace{0em}}{_{bs} r_1 \cdot \ZERO \rightsquigarrow \ZERO\\}
   360 
   531 
   361 	\inferrule{}{_{bs1} ((_{bs2} \ONE) \cdot r) \rightsquigarrow \fuse \; (bs_1 @ bs_2) \; r\\}\\
   532 	\inferrule * [Right = $S_1$]{\vspace{0em}}{_{bs1} ((_{bs2} \ONE) \cdot r) \rightsquigarrow \fuse \; (bs_1 @ bs_2) \; r\\}\\
   362 
   533 
   363 	
   534 	
   364 	
   535 	
   365 	\inferrule{\\ r_1 \rightsquigarrow r_2}{_{bs} r_1 \cdot r_3 \rightsquigarrow _{bs} r_2 \cdot r_3\\}
   536 	\inferrule * [Right = $SL$] {\\ r_1 \rightsquigarrow r_2}{_{bs} r_1 \cdot r_3 \rightsquigarrow _{bs} r_2 \cdot r_3\\}
   366 
   537 
   367 	\inferrule{\\ r_3 \rightsquigarrow r_4}{_{bs} r_1 \cdot r_3 \rightsquigarrow _{bs} r_1 \cdot r_4\\}\\
   538 	\inferrule * [Right = $SR$] {\\ r_3 \rightsquigarrow r_4}{_{bs} r_1 \cdot r_3 \rightsquigarrow _{bs} r_1 \cdot r_4\\}\\
   368 
   539 
   369 	\inferrule{}{ _{bs}\sum [] \rightsquigarrow \ZERO\\}
   540 	\inferrule * [Right = $A0$] {\vspace{0em}}{ _{bs}\sum [] \rightsquigarrow \ZERO}
   370 
   541 
   371 	\inferrule{}{ _{bs}\sum [a] \rightsquigarrow \fuse \; bs \; a\\}
   542 	\inferrule * [Right = $A1$] {\vspace{0em}}{ _{bs}\sum [a] \rightsquigarrow \fuse \; bs \; a}
   372 
   543 
   373 	\inferrule{\\ rs_1 \stackrel{s}{\rightsquigarrow} rs_2}{_{bs}\sum rs_1 \rightsquigarrow rs_2\\}
   544 	\inferrule * [Right = $AL$] {\\ rs_1 \stackrel{s}{\rightsquigarrow} rs_2}{_{bs}\sum rs_1 \rightsquigarrow rs_2}
   374 
   545 
   375 	\inferrule{}{\\ [] \stackrel{s}{\rightsquigarrow} []}
   546 	\inferrule * [Right = $LE$] {\vspace{0em}}{ [] \stackrel{s}{\rightsquigarrow} []}
   376 
   547 
   377 	\inferrule{rs_1 \stackrel{s}{\rightsquigarrow} rs_2}{\\ r :: rs_1 \rightsquigarrow r :: rs_2 \rightsquigarrow}
   548 	\inferrule * [Right = $LT$] {rs_1 \stackrel{s}{\rightsquigarrow} rs_2}{ r :: rs_1 \stackrel{s}{\rightsquigarrow} r :: rs_2 }
   378 
   549 
   379 	\inferrule{r_1 \rightsquigarrow r_2}{ r_1 :: rs \stackrel{s}{\rightsquigarrow} r_2 :: rs\\}
   550 	\inferrule * [Right = $LH$] {r_1 \rightsquigarrow r_2}{ r_1 :: rs \stackrel{s}{\rightsquigarrow} r_2 :: rs}
   380 
   551 
   381 	\inferrule{}{\ZERO :: rs \stackrel{s}{\rightsquigarrow} rs}
   552 	\inferrule * [Right = $L\ZERO$] {\vspace{0em}}{\ZERO :: rs \stackrel{s}{\rightsquigarrow} rs}
   382 
   553 
   383 	\inferrule{}{_{bs} \sum (rs_1 :: rs_b) \stackrel{s}{\rightsquigarrow} ((\map \; (\fuse \; bs_1) \; rs_1) @ rs_b) }
   554 	\inferrule * [Right = $LS$] {\vspace{0em}}{_{bs} \sum (rs_1 :: rs_b) \stackrel{s}{\rightsquigarrow} ((\map \; (\fuse \; bs_1) \; rs_1) @ rs_b) }
   384 
   555 
   385 	\inferrule{\\ \rerase{a_1} = \rerase{a_2}}{rs_a @ [a_1] @ rs_b @ [a_2] @ rs_c \stackrel{s}{\rightsquigarrow} rs_a @ [a_1] @ rs_b @ rs_c}
   556 	\inferrule * [Right = $LD$] {\\ \rerase{a_1} = \rerase{a_2}}{rs_a @ [a_1] @ rs_b @ [a_2] @ rs_c \stackrel{s}{\rightsquigarrow} rs_a @ [a_1] @ rs_b @ rs_c}
   386 
   557 
   387 \end{mathpar}
   558 \end{mathpar}
   388 \end{center}
   559 \caption{
   389 These ``rewrite'' steps define the atomic simplifications we could impose on regular expressions
   560 The rewrite rules that generate simplified regular expressions 
   390 under our simplification algorithm.
   561 in small steps: $r_1 \rightsquigarrow r_2$ is for bitcoded regular expressions 
   391 For convenience, we define a relation $\stackrel{s}{\rightsquigarrow}$ for rewriting
   562 and $rs_1 \stackrel{s}{\rightsquigarrow} rs_2$ for 
   392 a list of regular expressions to another list.
   563 lists of bitcoded regular expressions. 
   393 The $\rerase{}$ function is used instead of $_\downarrow$ for the finiteness bound proof of next chapter,
   564 Interesting is the LD rule that allows copies of regular expressions 
   394 which we will discuss later. For the moment the reader can assume they basically do the same thing as $\erase$.
   565 to be removed provided a regular expression 
   395 
   566 earlier in the list can match the same strings.
   396 Usually more than one step takes place during the simplification of a regular expression,
   567 }\label{rrewriteRules}
   397 therefore we define the reflexive transitive closure of the $\rightsquigarrow$ and $\stackrel{s}{\rightsquigarrow}$
   568 \end{figure}
   398 relations:
   569 \noindent
   399 
   570 The rules such as $LT$ and $LH$ are for rewriting between two regular expression lists
   400 \begin{center}
   571 such that one regular expression
       
   572 in the left-hand-side list is rewritable in one step
       
   573 to the right-hand-side's regular expression at the same position.
       
   574 This helps with defining the ``context rules'' such as $AL$.\\
       
   575 The reflexive transitive closure of $\rightsquigarrow$ and $\stackrel{s}{\rightsquigarrow}$
       
   576 are defined in the usual way:
       
   577 \begin{figure}[H]
       
   578 	\centering
   401 \begin{mathpar}
   579 \begin{mathpar}
   402 	\inferrule{}{ r \stackrel{*}{\rightsquigarrow} r \\}
   580 	\inferrule{\vspace{0em}}{ r \rightsquigarrow^* r \\}
   403 	\inferrule{}{\\ rs \stackrel{s*}{\rightsquigarrow} rs \\}
   581 
   404 	\inferrule{\\r_1 \stackrel{*}{\rightsquigarrow}  r_2 \land \; r_2 \stackrel{*}{\rightsquigarrow} r_3}{r_1 \stackrel{*}{\rightsquigarrow} r_3\\}
   582 	\inferrule{\vspace{0em}}{rs \stackrel{s*}{\rightsquigarrow} rs \\}
   405 	\inferrule{\\rs_1 \stackrel{*}{\rightsquigarrow}  rs_2 \land \; rs_2 \stackrel{*}{\rightsquigarrow} rs_3}{rs_1 \stackrel{*}{\rightsquigarrow} rs_3}
   583 
       
   584 	\inferrule{r_1 \rightsquigarrow^*  r_2 \land \; r_2 \rightsquigarrow^* r_3}{r_1 \rightsquigarrow^* r_3\\}
       
   585 	
       
   586 	\inferrule{rs_1 \stackrel{s*}{\rightsquigarrow}  rs_2 \land \; rs_2 \stackrel{s*}{\rightsquigarrow} rs_3}{rs_1 \stackrel{s*}{\rightsquigarrow} rs_3}
   406 \end{mathpar}
   587 \end{mathpar}
   407 \end{center}
   588 \caption{The Reflexive Transitive Closure of 
   408 Now that we have modelled the rewriting behaviour of our simplifier $\bsimp$, we prove mainly 
   589 $\rightsquigarrow$ and $\stackrel{s}{\rightsquigarrow}$}\label{transClosure}
   409 three properties about how these relations connect to $\blexersimp$:
   590 \end{figure}
   410 \begin{itemize}
   591 Two rewritable terms will remain rewritable to each other
   411 	\item
   592 even after a derivative is taken:
   412 		$r \stackrel{*}{\rightsquigarrow} \bsimp{r}$
   593 \begin{center}
   413 		The algorithm $\bsimp$ only transforms the regex $r$ using steps specified by 
   594 	$r_1 \rightsquigarrow r_2 \implies (r_1 \backslash c) \rightsquigarrow^* (r_2 \backslash c)$
   414 		$\stackrel{*}{\rightsquigarrow}$ and nothing else.
   595 \end{center}
   415 	\item
   596 And finally, if two terms are rewritable to each other,
   416 		$r \rightsquigarrow r' \implies r \backslash c \rightsquigarrow r'\backslash c$.
   597 then they produce the same bitcodes:
   417 		The relation $\stackrel{*}{\rightsquigarrow}$ is preserved under derivative. 
   598 \begin{center}
   418 	\item
   599 	$r \rightsquigarrow^* r' \;\; \textit{then} \; \; \bmkeps \; r = \bmkeps \; r'$
   419 		$(r \stackrel{*}{\rightsquigarrow} r'\land \bnullable \; r_1) \implies \bmkeps \; r = \bmkeps \; r'$. If we reach another 
   600 \end{center}
   420 		expression in finitely many atomic simplification steps, then these two regular expressions
   601 The decoding phase of both $\blexer$ and $\blexersimp$
   421 		will produce the same bit-codes under the bit collection function $\bmkeps$ used by our $\blexer$.
   602 are the same, which means that if they get the same
   422 
   603 bitcodes before the decoding phase,
   423 \end{itemize}
   604 they get the same value after decoding is done.
   424 \section{Three Important properties}
   605 We will prove the three properties 
   425 These properties would work together towards the correctness theorem.
   606 we mentioned above in the next sub-section.
   426 We start proving each of these lemmas below.
   607 \subsection{Important Properties of $\rightsquigarrow$}
   427 \subsection{$(r \stackrel{*}{\rightsquigarrow} r'\land \bnullable \; r_1) \implies \bmkeps \; r = \bmkeps \; r'$ and $r \stackrel{*}{\rightsquigarrow} \bsimp{r}$}
   608 First we prove some basic facts 
   428 The first few properties we establish is that the inference rules we gave for $\rightsquigarrow$
   609 about $\rightsquigarrow$, $\stackrel{s}{\rightsquigarrow}$, 
   429 and $\stackrel{s}{\rightsquigarrow}$ also hold as implications for $\stackrel{*}{\rightsquigarrow}$ and 
   610 $\rightsquigarrow^*$ and $\stackrel{s*}{\rightsquigarrow}$,
   430 $\stackrel{s*}{\rightsquigarrow}$.
   611 which will be needed later.\\
   431 \begin{lemma}
   612 The inference rules (\ref{rrewriteRules}) we 
   432 	$rs_1 \stackrel{s*}{\rightsquigarrow} rs_2 \implies _{bs} \sum rs_1 \stackrel{*}{\rightsquigarrow} _{bs} \sum rs_2$
   613 gave in the previous section 
   433 \end{lemma}
   614 have their ``many-steps version'':
   434 \begin{proof}
   615 
   435 	By rule induction of $\stackrel{s*}{\rightsquigarrow}$.
   616 \begin{lemma}
   436 \end{proof}
   617 	\hspace{0em}
   437 \begin{lemma}
   618 	\begin{itemize}
   438 	$r \stackrel{*}{\rightsquigarrow} r' \implies _{bs} \sum r :: rs \stackrel{*}{\rightsquigarrow}  _{bs} \sum r' :: rs$
   619 		\item
   439 \end{lemma}
   620 			$rs_1 \stackrel{s*}{\rightsquigarrow} rs_2 \implies _{bs} \sum rs_1 \stackrel{*}{\rightsquigarrow} _{bs} \sum rs_2$
   440 \begin{proof}
   621 		\item
   441 	By rule induction of $\stackrel{*}{\rightsquigarrow} $.
   622 			$r \stackrel{*}{\rightsquigarrow} r' \implies _{bs} \sum r :: rs \stackrel{*}{\rightsquigarrow}  _{bs} \sum r' :: rs$
   442 \end{proof}
   623 	\end{itemize}
   443 \noindent
   624 \end{lemma}
   444 Then we establish that the $\stackrel{s}{\rightsquigarrow}$ and $\stackrel{s*}{\rightsquigarrow}$ relation is preserved w.r.t appending and prepending of a  list:
   625 \begin{proof}
   445 \begin{lemma}
   626 	By an induction on 
   446 	$rs_1 \stackrel{s}{\rightsquigarrow} rs_2 \implies rs @ rs_1 \stackrel{s}{\rightsquigarrow} rs @ rs_2$
   627 	the inductive cases of $\stackrel{s*}{\rightsquigarrow}$ and $\rightsquigarrow^*$ respectively.
   447 \end{lemma}
   628 \end{proof}
   448 \begin{proof}
   629 \noindent
   449 	By induction on the list $rs$.
   630 The inference rules of $\stackrel{s}{\rightsquigarrow}$
   450 \end{proof}
   631 are defined in terms of list cons operation, here
   451 
   632 we establish that the 
   452 \begin{lemma}
   633 $\stackrel{s}{\rightsquigarrow}$ and $\stackrel{s*}{\rightsquigarrow}$ 
   453 	$rs_1 \stackrel{s*}{\rightsquigarrow} rs_2 \implies rs @ rs_1 \stackrel{s*}{\rightsquigarrow} rs @ rs_2$
   634 relation is also preserved w.r.t appending and prepending of a list:
   454 \end{lemma}
       
   455 \begin{proof}
       
   456 	By rule induction of $\stackrel{s*}{\rightsquigarrow}$.
       
   457 \end{proof}
       
   458 
       
   459 \noindent
       
   460 The $\stackrel{s}{\rightsquigarrow} $ relation after appending a list becomes $\stackrel{s*}{\rightsquigarrow}$:
       
   461 \begin{lemma}\label{ssgqTossgs}
   635 \begin{lemma}\label{ssgqTossgs}
   462 	$rs_1 \stackrel{s}{\rightsquigarrow} rs_2 \implies rs_1 @ rs \stackrel{s*}{\rightsquigarrow} rs_2 @ rs$
   636 	\hspace{0em}
   463 \end{lemma}
   637 	\begin{itemize}
   464 \begin{proof}
   638 		\item
   465 	By rule induction of $\stackrel{s}{\rightsquigarrow}$.
   639 			$rs_1 \stackrel{s}{\rightsquigarrow} rs_2 \implies rs @ rs_1 \stackrel{s}{\rightsquigarrow} rs @ rs_2$
       
   640 
       
   641 		\item
       
   642 			$rs_1 \stackrel{s*}{\rightsquigarrow} rs_2 \implies 
       
   643 			rs @ rs_1 \stackrel{s*}{\rightsquigarrow} rs @ rs_2$
       
   644 		\item
       
   645 			The $\stackrel{s}{\rightsquigarrow} $ relation after appending 
       
   646 			a list becomes $\stackrel{s*}{\rightsquigarrow}$:\\
       
   647 			$rs_1 \stackrel{s}{\rightsquigarrow} rs_2 \implies rs_1 @ rs \stackrel{s*}{\rightsquigarrow} rs_2 @ rs$
       
   648 
       
   649 	\end{itemize}
       
   650 \end{lemma}
       
   651 \begin{proof}
       
   652 	The first part is by induction on the list $rs$.
       
   653 	The second part is by induction on the inductive cases of $\stackrel{s*}{\rightsquigarrow}$.
       
   654 	The third part is 
       
   655 	by rule induction of $\stackrel{s}{\rightsquigarrow}$.
   466 \end{proof}
   656 \end{proof}
   467 \begin{lemma}
   657 \begin{lemma}
   468 	$rs_1 \stackrel{s*}{\rightsquigarrow} rs_2 \implies rs_1 @ rs \stackrel{s*}{\rightsquigarrow} rs_2 @ rs$
   658 	$rs_1 \stackrel{s*}{\rightsquigarrow} rs_2 \implies rs_1 @ rs \stackrel{s*}{\rightsquigarrow} rs_2 @ rs$
   469 \end{lemma}
   659 \end{lemma}
   470 \begin{proof}
   660 \begin{proof}
   482 	r_2 :: rs_3 \stackrel{s*}{\rightsquigarrow} r_2 :: rs_4$
   672 	r_2 :: rs_3 \stackrel{s*}{\rightsquigarrow} r_2 :: rs_4$
   483 \end{lemma}
   673 \end{lemma}
   484 \begin{proof}
   674 \begin{proof}
   485 	By using \ref{singleton}.
   675 	By using \ref{singleton}.
   486 \end{proof}
   676 \end{proof}
       
   677 In this section we give details for the proofs of the below properties:
       
   678 \begin{itemize}
       
   679 	\item
       
   680 		$(r \stackrel{*}{\rightsquigarrow} r'\land \bnullable \; r_1) 
       
   681 		\implies \bmkeps \; r = \bmkeps \; r'$. \\
       
   682 		If we can rewrite 
       
   683 		in many steps from $r$ to $r'$, then they  
       
   684 		will produce the same bitcodes 
       
   685 		under $\bmkeps$. 
       
   686 	\item
       
   687 		$r \rightsquigarrow^* \textit{bsimp} \;r$.\\
       
   688 		The simplification function
       
   689 		$\textit{bsimp}$ only transforms the regex $r$ using steps specified by 
       
   690 		$\rightsquigarrow^*$ and nothing else.
       
   691 	\item
       
   692 		$r \rightsquigarrow r' \implies r \backslash c \rightsquigarrow^* r'\backslash c$.\\
       
   693 		The rewritability relation $\rightsquigarrow$ is preserved under derivatives--
       
   694 		it is just that we might need more steps.
       
   695 \end{itemize}
       
   696 These properties would work together towards the correctness theorem.
       
   697 We start proving each of these lemmas below.
       
   698 \subsubsection{Property 1: $(r \stackrel{*}{\rightsquigarrow} r'\land \bnullable \; r_1) \implies \bmkeps \; r = \bmkeps \; r'$}
       
   699 Intuitively this property says we can 
       
   700 extract the same bitcodes from the nullable
       
   701 components of two regular expressions
       
   702 if we can rewrite from one to the other.
       
   703 
       
   704 
       
   705 
       
   706 
   487 Now we get to the "meaty" part of the proof, which relates the relations $\stackrel{s*}{\rightsquigarrow}$ and 
   707 Now we get to the "meaty" part of the proof, which relates the relations $\stackrel{s*}{\rightsquigarrow}$ and 
   488 $\stackrel{*}{\rightsquigarrow} $ with our simplification components such $\distinctBy$ and $\flts$.
   708 $\stackrel{*}{\rightsquigarrow} $ with our simplification components such $\distinctBy$ and $\flts$.
   489 The first lemma below says that for a list made of two parts $rs_1 @ rs_2$, one can throw away the duplicate
   709 The first lemma below says that for a list made of two parts $rs_1 @ rs_2$, one can throw away the duplicate
   490 elements in $rs_2$, as well as those that have appeared in $rs_1$:
   710 elements in $rs_2$, as well as those that have appeared in $rs_1$:
   491 \begin{lemma}
   711 \begin{lemma}
   584 \end{lemma}
   804 \end{lemma}
   585 \begin{proof}
   805 \begin{proof}
   586 	By rule induction of $\stackrel{*}{\rightsquigarrow} $, using \ref{rewriteBmkepsAux} and \ref{rewritesBnullable}.
   806 	By rule induction of $\stackrel{*}{\rightsquigarrow} $, using \ref{rewriteBmkepsAux} and \ref{rewritesBnullable}.
   587 \end{proof}
   807 \end{proof}
   588 \noindent
   808 \noindent
       
   809 \subsubsection{Property 2: $r \stackrel{*}{\rightsquigarrow} \bsimp{r}$}
   589 The other property is also ready:
   810 The other property is also ready:
   590 \begin{lemma}
   811 \begin{lemma}
   591 	$r \stackrel{*}{\rightsquigarrow} \bsimp{r}$
   812 	$r \stackrel{*}{\rightsquigarrow} \bsimp{r}$
   592 \end{lemma}
   813 \end{lemma}
   593 \begin{proof}
   814 \begin{proof}
   599 	Then we could do the following regular expression many-steps rewrite:\\
   820 	Then we could do the following regular expression many-steps rewrite:\\
   600 	$ _{bs} \sum \distinctBy \; (\flts \; (\map \; \textit{bsimp}\; rs)) \; \rerase \; \phi \stackrel{*}{\rightsquigarrow} \bsimpalts \; bs \; (\distinctBy \; (\flts \; (\map \; \textit{bsimp}\; rs)) \; \rerase \; \phi)$
   821 	$ _{bs} \sum \distinctBy \; (\flts \; (\map \; \textit{bsimp}\; rs)) \; \rerase \; \phi \stackrel{*}{\rightsquigarrow} \bsimpalts \; bs \; (\distinctBy \; (\flts \; (\map \; \textit{bsimp}\; rs)) \; \rerase \; \phi)$
   601 	\\
   822 	\\
   602 	
   823 	
   603 \end{proof}
   824 \end{proof}
   604 \section{Proof for the Property: $r_1 \stackrel{*}{\rightsquigarrow}  r_2 \implies r_1 \backslash c \stackrel{*}{\rightsquigarrow} r_2 \backslash c$}
   825 \subsubsection{Property 3: $r_1 \stackrel{*}{\rightsquigarrow}  r_2 \implies r_1 \backslash c \stackrel{*}{\rightsquigarrow} r_2 \backslash c$}
   605 The first thing we prove is that if we could rewrite in one step, then after derivative
   826 The first thing we prove is that if we could rewrite in one step, then after derivative
   606 we could rewrite in many steps:
   827 we could rewrite in many steps:
   607 \begin{lemma}\label{rewriteBder}
   828 \begin{lemma}\label{rewriteBder}
   608 	$r_1 \rightsquigarrow r_2 \implies r_1 \backslash c \stackrel{*}{\rightsquigarrow}  r_2 \backslash c$ and
   829 	$r_1 \rightsquigarrow r_2 \implies r_1 \backslash c \stackrel{*}{\rightsquigarrow}  r_2 \backslash c$ and
   609 	$rs_1 \stackrel{s}{\rightsquigarrow} rs_2 \implies \map \; (\_\backslash c) \; rs_1 \stackrel{s*}{\rightsquigarrow} \map \; (\_ \backslash c) \; rs_2$
   830 	$rs_1 \stackrel{s}{\rightsquigarrow} rs_2 \implies \map \; (\_\backslash c) \; rs_1 \stackrel{s*}{\rightsquigarrow} \map \; (\_ \backslash c) \; rs_2$