ChengsongTanPhdThesis/Chapters/Bitcoded1.tex
changeset 532 cc54ce075db5
child 536 aff7bf93b9c7
equal deleted inserted replaced
531:c334f0b3ef52 532:cc54ce075db5
       
     1 % Chapter Template
       
     2 
       
     3 % Main chapter title
       
     4 \chapter{Correctness of Bit-coded Algorithm without Simplification}
       
     5 
       
     6 \label{Bitcoded1} % Change X to a consecutive number; for referencing this chapter elsewhere, use \ref{ChapterX}
       
     7 %Then we illustrate how the algorithm without bitcodes falls short for such aggressive 
       
     8 %simplifications and therefore introduce our version of the bitcoded algorithm and 
       
     9 %its correctness proof in 
       
    10 %Chapter 3\ref{Chapter3}. 
       
    11 
       
    12 \section*{Bit-coded Algorithm}
       
    13 Bits and bitcodes (lists of bits) are defined as:
       
    14 
       
    15 \begin{center}
       
    16 		$b ::=   1 \mid  0 \qquad
       
    17 bs ::= [] \mid b::bs    
       
    18 $
       
    19 \end{center}
       
    20 
       
    21 \noindent
       
    22 The $1$ and $0$ are not in bold in order to avoid 
       
    23 confusion with the regular expressions $\ZERO$ and $\ONE$. Bitcodes (or
       
    24 bit-lists) can be used to encode values (or potentially incomplete values) in a
       
    25 compact form. This can be straightforwardly seen in the following
       
    26 coding function from values to bitcodes: 
       
    27 
       
    28 \begin{center}
       
    29 \begin{tabular}{lcl}
       
    30   $\textit{code}(\Empty)$ & $\dn$ & $[]$\\
       
    31   $\textit{code}(\Char\,c)$ & $\dn$ & $[]$\\
       
    32   $\textit{code}(\Left\,v)$ & $\dn$ & $0 :: code(v)$\\
       
    33   $\textit{code}(\Right\,v)$ & $\dn$ & $1 :: code(v)$\\
       
    34   $\textit{code}(\Seq\,v_1\,v_2)$ & $\dn$ & $code(v_1) \,@\, code(v_2)$\\
       
    35   $\textit{code}(\Stars\,[])$ & $\dn$ & $[0]$\\
       
    36   $\textit{code}(\Stars\,(v\!::\!vs))$ & $\dn$ & $1 :: code(v) \;@\;
       
    37                                                  code(\Stars\,vs)$
       
    38 \end{tabular}    
       
    39 \end{center} 
       
    40 
       
    41 \noindent
       
    42 Here $\textit{code}$ encodes a value into a bitcodes by converting
       
    43 $\Left$ into $0$, $\Right$ into $1$, and marks the start of a non-empty
       
    44 star iteration by $1$. The border where a local star terminates
       
    45 is marked by $0$. This coding is lossy, as it throws away the information about
       
    46 characters, and also does not encode the ``boundary'' between two
       
    47 sequence values. Moreover, with only the bitcode we cannot even tell
       
    48 whether the $1$s and $0$s are for $\Left/\Right$ or $\Stars$. The
       
    49 reason for choosing this compact way of storing information is that the
       
    50 relatively small size of bits can be easily manipulated and ``moved
       
    51 around'' in a regular expression. In order to recover values, we will 
       
    52 need the corresponding regular expression as an extra information. This
       
    53 means the decoding function is defined as:
       
    54 
       
    55 
       
    56 %\begin{definition}[Bitdecoding of Values]\mbox{}
       
    57 \begin{center}
       
    58 \begin{tabular}{@{}l@{\hspace{1mm}}c@{\hspace{1mm}}l@{}}
       
    59   $\textit{decode}'\,bs\,(\ONE)$ & $\dn$ & $(\Empty, bs)$\\
       
    60   $\textit{decode}'\,bs\,(c)$ & $\dn$ & $(\Char\,c, bs)$\\
       
    61   $\textit{decode}'\,(0\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
       
    62      $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}\;
       
    63        (\Left\,v, bs_1)$\\
       
    64   $\textit{decode}'\,(1\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
       
    65      $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_2\;\textit{in}\;
       
    66        (\Right\,v, bs_1)$\\                           
       
    67   $\textit{decode}'\,bs\;(r_1\cdot r_2)$ & $\dn$ &
       
    68         $\textit{let}\,(v_1, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}$\\
       
    69   & &   $\textit{let}\,(v_2, bs_2) = \textit{decode}'\,bs_1\,r_2$\\
       
    70   & &   \hspace{35mm}$\textit{in}\;(\Seq\,v_1\,v_2, bs_2)$\\
       
    71   $\textit{decode}'\,(0\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\
       
    72   $\textit{decode}'\,(1\!::\!bs)\,(r^*)$ & $\dn$ & 
       
    73          $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r\;\textit{in}$\\
       
    74   & &   $\textit{let}\,(\Stars\,vs, bs_2) = \textit{decode}'\,bs_1\,r^*$\\
       
    75   & &   \hspace{35mm}$\textit{in}\;(\Stars\,v\!::\!vs, bs_2)$\bigskip\\
       
    76   
       
    77   $\textit{decode}\,bs\,r$ & $\dn$ &
       
    78      $\textit{let}\,(v, bs') = \textit{decode}'\,bs\,r\;\textit{in}$\\
       
    79   & & $\textit{if}\;bs' = []\;\textit{then}\;\textit{Some}\,v\;
       
    80        \textit{else}\;\textit{None}$                       
       
    81 \end{tabular}    
       
    82 \end{center}    
       
    83 %\end{definition}
       
    84 
       
    85 Sulzmann and Lu's integrated the bitcodes into regular expressions to
       
    86 create annotated regular expressions \cite{Sulzmann2014}.
       
    87 \emph{Annotated regular expressions} are defined by the following
       
    88 grammar:%\comment{ALTS should have  an $as$ in  the definitions, not  just $a_1$ and $a_2$}
       
    89 
       
    90 \begin{center}
       
    91 \begin{tabular}{lcl}
       
    92   $\textit{a}$ & $::=$  & $\ZERO$\\
       
    93                   & $\mid$ & $_{bs}\ONE$\\
       
    94                   & $\mid$ & $_{bs}{\bf c}$\\
       
    95                   & $\mid$ & $_{bs}\sum\,as$\\
       
    96                   & $\mid$ & $_{bs}a_1\cdot a_2$\\
       
    97                   & $\mid$ & $_{bs}a^*$
       
    98 \end{tabular}    
       
    99 \end{center}  
       
   100 %(in \textit{ALTS})
       
   101 
       
   102 \noindent
       
   103 where $bs$ stands for bitcodes, $a$  for $\mathbf{a}$nnotated regular
       
   104 expressions and $as$ for a list of annotated regular expressions.
       
   105 The alternative constructor($\sum$) has been generalized to 
       
   106 accept a list of annotated regular expressions rather than just 2.
       
   107 We will show that these bitcodes encode information about
       
   108 the (POSIX) value that should be generated by the Sulzmann and Lu
       
   109 algorithm.
       
   110 
       
   111 
       
   112 To do lexing using annotated regular expressions, we shall first
       
   113 transform the usual (un-annotated) regular expressions into annotated
       
   114 regular expressions. This operation is called \emph{internalisation} and
       
   115 defined as follows:
       
   116 
       
   117 %\begin{definition}
       
   118 \begin{center}
       
   119 \begin{tabular}{lcl}
       
   120   $(\ZERO)^\uparrow$ & $\dn$ & $\ZERO$\\
       
   121   $(\ONE)^\uparrow$ & $\dn$ & $_{[]}\ONE$\\
       
   122   $(c)^\uparrow$ & $\dn$ & $_{[]}{\bf c}$\\
       
   123   $(r_1 + r_2)^\uparrow$ & $\dn$ &
       
   124   $_{[]}\sum[\textit{fuse}\,[0]\,r_1^\uparrow,\,
       
   125   \textit{fuse}\,[1]\,r_2^\uparrow]$\\
       
   126   $(r_1\cdot r_2)^\uparrow$ & $\dn$ &
       
   127          $_{[]}r_1^\uparrow \cdot r_2^\uparrow$\\
       
   128   $(r^*)^\uparrow$ & $\dn$ &
       
   129          $_{[]}(r^\uparrow)^*$\\
       
   130 \end{tabular}    
       
   131 \end{center}    
       
   132 %\end{definition}
       
   133 
       
   134 \noindent
       
   135 We use up arrows here to indicate that the basic un-annotated regular
       
   136 expressions are ``lifted up'' into something slightly more complex. In the
       
   137 fourth clause, $\textit{fuse}$ is an auxiliary function that helps to
       
   138 attach bits to the front of an annotated regular expression. Its
       
   139 definition is as follows:
       
   140 
       
   141 \begin{center}
       
   142 \begin{tabular}{lcl}
       
   143   $\textit{fuse}\;bs \; \ZERO$ & $\dn$ & $\ZERO$\\
       
   144   $\textit{fuse}\;bs\; _{bs'}\ONE$ & $\dn$ &
       
   145      $_{bs @ bs'}\ONE$\\
       
   146   $\textit{fuse}\;bs\;_{bs'}{\bf c}$ & $\dn$ &
       
   147      $_{bs@bs'}{\bf c}$\\
       
   148   $\textit{fuse}\;bs\,_{bs'}\sum\textit{as}$ & $\dn$ &
       
   149      $_{bs@bs'}\sum\textit{as}$\\
       
   150   $\textit{fuse}\;bs\; _{bs'}a_1\cdot a_2$ & $\dn$ &
       
   151      $_{bs@bs'}a_1 \cdot a_2$\\
       
   152   $\textit{fuse}\;bs\,_{bs'}a^*$ & $\dn$ &
       
   153      $_{bs @ bs'}a^*$
       
   154 \end{tabular}    
       
   155 \end{center}  
       
   156 
       
   157 \noindent
       
   158 After internalising the regular expression, we perform successive
       
   159 derivative operations on the annotated regular expressions. This
       
   160 derivative operation is the same as what we had previously for the
       
   161 basic regular expressions, except that we beed to take care of
       
   162 the bitcodes:
       
   163 
       
   164 
       
   165 \iffalse
       
   166  %\begin{definition}{bder}
       
   167 \begin{center}
       
   168   \begin{tabular}{@{}lcl@{}}
       
   169   $(\textit{ZERO})\,\backslash c$ & $\dn$ & $\textit{ZERO}$\\  
       
   170   $(\textit{ONE}\;bs)\,\backslash c$ & $\dn$ & $\textit{ZERO}$\\  
       
   171   $(\textit{CHAR}\;bs\,d)\,\backslash c$ & $\dn$ &
       
   172         $\textit{if}\;c=d\; \;\textit{then}\;
       
   173          \textit{ONE}\;bs\;\textit{else}\;\textit{ZERO}$\\  
       
   174   $(\textit{ALTS}\;bs\,as)\,\backslash c$ & $\dn$ &
       
   175   $\textit{ALTS}\;bs\,(map (\backslash c) as)$\\
       
   176   $(\textit{SEQ}\;bs\,a_1\,a_2)\,\backslash c$ & $\dn$ &
       
   177      $\textit{if}\;\textit{bnullable}\,a_1$\\
       
   178 					       & &$\textit{then}\;\textit{ALTS}\,bs\,List((\textit{SEQ}\,[]\,(a_1\,\backslash c)\,a_2),$\\
       
   179 					       & &$\phantom{\textit{then}\;\textit{ALTS}\,bs\,}(\textit{fuse}\,(\textit{bmkeps}\,a_1)\,(a_2\,\backslash c)))$\\
       
   180   & &$\textit{else}\;\textit{SEQ}\,bs\,(a_1\,\backslash c)\,a_2$\\
       
   181   $(\textit{STAR}\,bs\,a)\,\backslash c$ & $\dn$ &
       
   182       $\textit{SEQ}\;bs\,(\textit{fuse}\, [\Z] (r\,\backslash c))\,
       
   183        (\textit{STAR}\,[]\,r)$
       
   184 \end{tabular}    
       
   185 \end{center}    
       
   186 %\end{definition}
       
   187 
       
   188 \begin{center}
       
   189   \begin{tabular}{@{}lcl@{}}
       
   190   $(\textit{ZERO})\,\backslash c$ & $\dn$ & $\textit{ZERO}$\\  
       
   191   $(_{bs}\textit{ONE})\,\backslash c$ & $\dn$ & $\textit{ZERO}$\\  
       
   192   $(_{bs}\textit{CHAR}\;d)\,\backslash c$ & $\dn$ &
       
   193         $\textit{if}\;c=d\; \;\textit{then}\;
       
   194          _{bs}\textit{ONE}\;\textit{else}\;\textit{ZERO}$\\  
       
   195   $(_{bs}\textit{ALTS}\;\textit{as})\,\backslash c$ & $\dn$ &
       
   196   $_{bs}\textit{ALTS}\;(\textit{as}.\textit{map}(\backslash c))$\\
       
   197   $(_{bs}\textit{SEQ}\;a_1\,a_2)\,\backslash c$ & $\dn$ &
       
   198      $\textit{if}\;\textit{bnullable}\,a_1$\\
       
   199 					       & &$\textit{then}\;_{bs}\textit{ALTS}\,List((_{[]}\textit{SEQ}\,(a_1\,\backslash c)\,a_2),$\\
       
   200 					       & &$\phantom{\textit{then}\;_{bs}\textit{ALTS}\,}(\textit{fuse}\,(\textit{bmkeps}\,a_1)\,(a_2\,\backslash c)))$\\
       
   201   & &$\textit{else}\;_{bs}\textit{SEQ}\,(a_1\,\backslash c)\,a_2$\\
       
   202   $(_{bs}\textit{STAR}\,a)\,\backslash c$ & $\dn$ &
       
   203       $_{bs}\textit{SEQ}\;(\textit{fuse}\, [0] \; r\,\backslash c )\,
       
   204        (_{bs}\textit{STAR}\,[]\,r)$
       
   205 \end{tabular}    
       
   206 \end{center}    
       
   207 %\end{definition}
       
   208 \fi
       
   209 
       
   210 \begin{center}
       
   211   \begin{tabular}{@{}lcl@{}}
       
   212   $(\ZERO)\,\backslash c$ & $\dn$ & $\ZERO$\\  
       
   213   $(_{bs}\ONE)\,\backslash c$ & $\dn$ & $\ZERO$\\  
       
   214   $(_{bs}{\bf d})\,\backslash c$ & $\dn$ &
       
   215         $\textit{if}\;c=d\; \;\textit{then}\;
       
   216          _{bs}\ONE\;\textit{else}\;\ZERO$\\  
       
   217   $(_{bs}\sum \;\textit{as})\,\backslash c$ & $\dn$ &
       
   218   $_{bs}\sum\;(\textit{map} (\_\backslash c) as )$\\
       
   219   $(_{bs}\;a_1\cdot a_2)\,\backslash c$ & $\dn$ &
       
   220      $\textit{if}\;\textit{bnullable}\,a_1$\\
       
   221 					       & &$\textit{then}\;_{bs}\sum\,[(_{[]}\,(a_1\,\backslash c)\cdot\,a_2),$\\
       
   222 					       & &$\phantom{\textit{then},\;_{bs}\sum\,}(\textit{fuse}\,(\textit{bmkeps}\,a_1)\,(a_2\,\backslash c))]$\\
       
   223   & &$\textit{else}\;_{bs}\,(a_1\,\backslash c)\cdot a_2$\\
       
   224   $(_{bs}a^*)\,\backslash c$ & $\dn$ &
       
   225       $_{bs}(\textit{fuse}\, [0] \; r\,\backslash c)\cdot
       
   226        (_{[]}r^*))$
       
   227 \end{tabular}    
       
   228 \end{center}    
       
   229 
       
   230 %\end{definition}
       
   231 \noindent
       
   232 For instance, when we do derivative of  $_{bs}a^*$ with respect to c,
       
   233 we need to unfold it into a sequence,
       
   234 and attach an additional bit $0$ to the front of $r \backslash c$
       
   235 to indicate one more star iteration. Also the sequence clause
       
   236 is more subtle---when $a_1$ is $\textit{bnullable}$ (here
       
   237 \textit{bnullable} is exactly the same as $\textit{nullable}$, except
       
   238 that it is for annotated regular expressions, therefore we omit the
       
   239 definition). Assume that $\textit{bmkeps}$ correctly extracts the bitcode for how
       
   240 $a_1$ matches the string prior to character $c$ (more on this later),
       
   241 then the right branch of alternative, which is $\textit{fuse} \; \bmkeps \;  a_1 (a_2
       
   242 \backslash c)$ will collapse the regular expression $a_1$(as it has
       
   243 already been fully matched) and store the parsing information at the
       
   244 head of the regular expression $a_2 \backslash c$ by fusing to it. The
       
   245 bitsequence $\textit{bs}$, which was initially attached to the
       
   246 first element of the sequence $a_1 \cdot a_2$, has
       
   247 now been elevated to the top-level of $\sum$, as this information will be
       
   248 needed whichever way the sequence is matched---no matter whether $c$ belongs
       
   249 to $a_1$ or $ a_2$. After building these derivatives and maintaining all
       
   250 the lexing information, we complete the lexing by collecting the
       
   251 bitcodes using a generalised version of the $\textit{mkeps}$ function
       
   252 for annotated regular expressions, called $\textit{bmkeps}$:
       
   253 
       
   254 
       
   255 %\begin{definition}[\textit{bmkeps}]\mbox{}
       
   256 \begin{center}
       
   257 \begin{tabular}{lcl}
       
   258   $\textit{bmkeps}\,(_{bs}\ONE)$ & $\dn$ & $bs$\\
       
   259   $\textit{bmkeps}\,(_{bs}\sum a::\textit{as})$ & $\dn$ &
       
   260      $\textit{if}\;\textit{bnullable}\,a$\\
       
   261   & &$\textit{then}\;bs\,@\,\textit{bmkeps}\,a$\\
       
   262   & &$\textit{else}\;bs\,@\,\textit{bmkeps}\,(_{bs}\sum \textit{as})$\\
       
   263   $\textit{bmkeps}\,(_{bs} a_1 \cdot a_2)$ & $\dn$ &
       
   264      $bs \,@\,\textit{bmkeps}\,a_1\,@\, \textit{bmkeps}\,a_2$\\
       
   265   $\textit{bmkeps}\,(_{bs}a^*)$ & $\dn$ &
       
   266      $bs \,@\, [0]$
       
   267 \end{tabular}    
       
   268 \end{center}    
       
   269 %\end{definition}
       
   270 
       
   271 \noindent
       
   272 This function completes the value information by travelling along the
       
   273 path of the regular expression that corresponds to a POSIX value and
       
   274 collecting all the bitcodes, and using $S$ to indicate the end of star
       
   275 iterations. If we take the bitcodes produced by $\textit{bmkeps}$ and
       
   276 decode them, we get the value we expect. The corresponding lexing
       
   277 algorithm looks as follows:
       
   278 
       
   279 \begin{center}
       
   280 \begin{tabular}{lcl}
       
   281   $\textit{blexer}\;r\,s$ & $\dn$ &
       
   282       $\textit{let}\;a = (r^\uparrow)\backslash s\;\textit{in}$\\                
       
   283   & & $\;\;\textit{if}\; \textit{bnullable}(a)$\\
       
   284   & & $\;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,a)\,r$\\
       
   285   & & $\;\;\textit{else}\;\textit{None}$
       
   286 \end{tabular}
       
   287 \end{center}
       
   288 
       
   289 \noindent
       
   290 In this definition $\_\backslash s$ is the  generalisation  of the derivative
       
   291 operation from characters to strings (just like the derivatives for un-annotated
       
   292 regular expressions).
       
   293 
       
   294 Now we introduce the simplifications, which is why we introduce the 
       
   295 bitcodes in the first place.
       
   296 
       
   297 \subsection*{Simplification Rules}
       
   298 
       
   299 This section introduces aggressive (in terms of size) simplification rules
       
   300 on annotated regular expressions
       
   301 to keep derivatives small. Such simplifications are promising
       
   302 as we have
       
   303 generated test data that show
       
   304 that a good tight bound can be achieved. We could only
       
   305 partially cover the search space as there are infinitely many regular
       
   306 expressions and strings. 
       
   307 
       
   308 One modification we introduced is to allow a list of annotated regular
       
   309 expressions in the $\sum$ constructor. This allows us to not just
       
   310 delete unnecessary $\ZERO$s and $\ONE$s from regular expressions, but
       
   311 also unnecessary ``copies'' of regular expressions (very similar to
       
   312 simplifying $r + r$ to just $r$, but in a more general setting). Another
       
   313 modification is that we use simplification rules inspired by Antimirov's
       
   314 work on partial derivatives. They maintain the idea that only the first
       
   315 ``copy'' of a regular expression in an alternative contributes to the
       
   316 calculation of a POSIX value. All subsequent copies can be pruned away from
       
   317 the regular expression. A recursive definition of our  simplification function 
       
   318 that looks somewhat similar to our Scala code is given below:
       
   319 %\comment{Use $\ZERO$, $\ONE$ and so on. 
       
   320 %Is it $ALTS$ or $ALTS$?}\\
       
   321 
       
   322 \begin{center}
       
   323   \begin{tabular}{@{}lcl@{}}
       
   324    
       
   325   $\textit{simp} \; (_{bs}a_1\cdot a_2)$ & $\dn$ & $ (\textit{simp} \; a_1, \textit{simp}  \; a_2) \; \textit{match} $ \\
       
   326    &&$\quad\textit{case} \; (\ZERO, \_) \Rightarrow  \ZERO$ \\
       
   327    &&$\quad\textit{case} \; (\_, \ZERO) \Rightarrow  \ZERO$ \\
       
   328    &&$\quad\textit{case} \;  (\ONE, a_2') \Rightarrow  \textit{fuse} \; bs \;  a_2'$ \\
       
   329    &&$\quad\textit{case} \; (a_1', \ONE) \Rightarrow  \textit{fuse} \; bs \;  a_1'$ \\
       
   330    &&$\quad\textit{case} \; (a_1', a_2') \Rightarrow   _{bs}a_1' \cdot a_2'$ \\
       
   331 
       
   332   $\textit{simp} \; (_{bs}\sum \textit{as})$ & $\dn$ & $\textit{distinct}( \textit{flatten} ( \textit{map} \; simp \; as)) \; \textit{match} $ \\
       
   333   &&$\quad\textit{case} \; [] \Rightarrow  \ZERO$ \\
       
   334    &&$\quad\textit{case} \; a :: [] \Rightarrow  \textit{fuse bs a}$ \\
       
   335    &&$\quad\textit{case} \;  as' \Rightarrow _{bs}\sum \textit{as'}$\\ 
       
   336 
       
   337    $\textit{simp} \; a$ & $\dn$ & $\textit{a} \qquad \textit{otherwise}$   
       
   338 \end{tabular}    
       
   339 \end{center}    
       
   340 
       
   341 \noindent
       
   342 The simplification does a pattern matching on the regular expression.
       
   343 When it detected that the regular expression is an alternative or
       
   344 sequence, it will try to simplify its child regular expressions
       
   345 recursively and then see if one of the children turns into $\ZERO$ or
       
   346 $\ONE$, which might trigger further simplification at the current level.
       
   347 The most involved part is the $\sum$ clause, where we use two
       
   348 auxiliary functions $\textit{flatten}$ and $\textit{distinct}$ to open up nested
       
   349 alternatives and reduce as many duplicates as possible. Function
       
   350 $\textit{distinct}$  keeps the first occurring copy only and removes all later ones
       
   351 when detected duplicates. Function $\textit{flatten}$ opens up nested $\sum$s.
       
   352 Its recursive definition is given below:
       
   353 
       
   354  \begin{center}
       
   355   \begin{tabular}{@{}lcl@{}}
       
   356   $\textit{flatten} \; (_{bs}\sum \textit{as}) :: \textit{as'}$ & $\dn$ & $(\textit{map} \;
       
   357      (\textit{fuse}\;bs)\; \textit{as}) \; @ \; \textit{flatten} \; as' $ \\
       
   358   $\textit{flatten} \; \ZERO :: as'$ & $\dn$ & $ \textit{flatten} \;  \textit{as'} $ \\
       
   359     $\textit{flatten} \; a :: as'$ & $\dn$ & $a :: \textit{flatten} \; \textit{as'}$ \quad(otherwise) 
       
   360 \end{tabular}    
       
   361 \end{center}  
       
   362 
       
   363 \noindent
       
   364 Here $\textit{flatten}$ behaves like the traditional functional programming flatten
       
   365 function, except that it also removes $\ZERO$s. Or in terms of regular expressions, it
       
   366 removes parentheses, for example changing $a+(b+c)$ into $a+b+c$.
       
   367 
       
   368 Having defined the $\simp$ function,
       
   369 we can use the previous notation of  natural
       
   370 extension from derivative w.r.t.~character to derivative
       
   371 w.r.t.~string:%\comment{simp in  the [] case?}
       
   372 
       
   373 \begin{center}
       
   374 \begin{tabular}{lcl}
       
   375 $r \backslash_{simp} (c\!::\!s) $ & $\dn$ & $(r \backslash_{simp}\, c) \backslash_{simp}\, s$ \\
       
   376 $r \backslash_{simp} [\,] $ & $\dn$ & $r$
       
   377 \end{tabular}
       
   378 \end{center}
       
   379 
       
   380 \noindent
       
   381 to obtain an optimised version of the algorithm:
       
   382 
       
   383  \begin{center}
       
   384 \begin{tabular}{lcl}
       
   385   $\textit{blexer\_simp}\;r\,s$ & $\dn$ &
       
   386       $\textit{let}\;a = (r^\uparrow)\backslash_{simp}\, s\;\textit{in}$\\                
       
   387   & & $\;\;\textit{if}\; \textit{bnullable}(a)$\\
       
   388   & & $\;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,a)\,r$\\
       
   389   & & $\;\;\textit{else}\;\textit{None}$
       
   390 \end{tabular}
       
   391 \end{center}
       
   392 
       
   393 \noindent
       
   394 This algorithm keeps the regular expression size small, for example,
       
   395 with this simplification our previous $(a + aa)^*$ example's 8000 nodes
       
   396 will be reduced to just 6 and stays constant, no matter how long the
       
   397 input string is.
       
   398 
       
   399 
       
   400 
       
   401 
       
   402 
       
   403 
       
   404 
       
   405 
       
   406 
       
   407 
       
   408 
       
   409 %-----------------------------------
       
   410 %	SUBSECTION 1
       
   411 %-----------------------------------
       
   412 \section{Specifications of Some Helper Functions}
       
   413 Here we give some functions' definitions, 
       
   414 which we will use later.
       
   415 \begin{center}
       
   416 \begin{tabular}{ccc}
       
   417 $\retrieve \; \ACHAR \, \textit{bs} \, c \; \Char(c) = \textit{bs}$
       
   418 \end{tabular}
       
   419 \end{center}
       
   420 
       
   421 
       
   422 %----------------------------------------------------------------------------------------
       
   423 %	SECTION  correctness proof
       
   424 %----------------------------------------------------------------------------------------
       
   425 \section{Correctness of Bit-coded Algorithm (Without Simplification)}
       
   426 We now give the proof the correctness of the algorithm with bit-codes.
       
   427 
       
   428 Ausaf and Urban cleverly defined an auxiliary function called $\flex$,
       
   429 defined as
       
   430 \[
       
   431 \flex \; r \; f \; [] \; v \; = \; f\; v
       
   432 \flex \; r \; f \; c :: s \; v =  \flex r \; \lambda v. \, f (\inj \; r\; c\; v)\; s \; v
       
   433 \]
       
   434 which accumulates the characters that needs to be injected back, 
       
   435 and does the injection in a stack-like manner (last taken derivative first injected).
       
   436 $\flex$ is connected to the $\lexer$:
       
   437 \begin{lemma}
       
   438 $\flex \; r \; \textit{id}\; s \; \mkeps (r\backslash s) = \lexer \; r \; s$
       
   439 \end{lemma}
       
   440 $\flex$ provides us a bridge between $\lexer$ and $\blexer$.
       
   441 What is even better about $\flex$ is that it allows us to 
       
   442 directly operate on the value $\mkeps (r\backslash v)$,
       
   443 which is pivotal in the definition of  $\lexer $ and $\blexer$, but not visible as an argument.
       
   444 When the value created by $\mkeps$ becomes available, one can 
       
   445 prove some stepwise properties of lexing nicely:
       
   446 \begin{lemma}\label{flexStepwise}
       
   447 $\textit{flex} \; r \; f \; s@[c] \; v= \flex \; r \; f\; s \; (\inj \; (r\backslash s) \; c \; v) $
       
   448 \end{lemma}
       
   449 
       
   450 And for $\blexer$ we have a function with stepwise properties like $\flex$ as well,
       
   451 called $\retrieve$\ref{retrieveDef}.
       
   452 $\retrieve$ takes bit-codes from annotated regular expressions
       
   453 guided by a value.
       
   454 $\retrieve$ is connected to the $\blexer$ in the following way:
       
   455 \begin{lemma}\label{blexer_retrieve}
       
   456 $\blexer \; r \; s = \decode  \; (\retrieve \; (\internalise \; r) \; (\mkeps \; (r \backslash s) )) \; r$
       
   457 \end{lemma}
       
   458 If you take derivative of an annotated regular expression, 
       
   459 you can $\retrieve$ the same bit-codes as before the derivative took place,
       
   460 provided that you use the corresponding value:
       
   461 
       
   462 \begin{lemma}\label{retrieveStepwise}
       
   463 $\retrieve \; (r \backslash c)  \;  v= \retrieve \; r \; (\inj \; r\; c\; v)$
       
   464 \end{lemma}
       
   465 The other good thing about $\retrieve$ is that it can be connected to $\flex$:
       
   466 %centralLemma1
       
   467 \begin{lemma}\label{flex_retrieve}
       
   468 $\flex \; r \; \textit{id}\; s\; v = \decode \; (\retrieve \; (r\backslash s )\; v) \; r$
       
   469 \end{lemma}
       
   470 \begin{proof}
       
   471 By induction on $s$. The induction tactic is reverse induction on strings.
       
   472 $v$ is allowed to be arbitrary.
       
   473 The crucial point is to rewrite 
       
   474 \[
       
   475 \retrieve \; (r \backslash s@[c]) \; \mkeps (r \backslash s@[c]) 
       
   476 \]
       
   477 as
       
   478 \[
       
   479 \retrieve \; (r \backslash s) \; (\inj \; (r \backslash s) \; c\;  \mkeps (r \backslash s@[c]))
       
   480 \].
       
   481 This enables us to equate 
       
   482 \[
       
   483 \retrieve \; (r \backslash s@[c]) \; \mkeps (r \backslash s@[c]) 
       
   484 \] 
       
   485 with 
       
   486 \[
       
   487 \flex \; r \; \textit{id} \; s \; (\inj \; (r\backslash s) \; c\; (\mkeps (r\backslash s@[c])))
       
   488 \],
       
   489 which in turn can be rewritten as
       
   490 \[
       
   491 \flex \; r \; \textit{id} \; s@[c] \;  (\mkeps (r\backslash s@[c]))
       
   492 \].
       
   493 \end{proof}
       
   494 
       
   495 With the above lemma we can now link $\flex$ and $\blexer$.
       
   496 
       
   497 \begin{lemma}\label{flex_blexer}
       
   498 $\textit{flex} \; r \; \textit{id} \; s \; \mkeps(r \backslash s)  = \blexer \; r \; s$
       
   499 \end{lemma}
       
   500 \begin{proof}
       
   501 Using two of the above lemmas: \ref{flex_retrieve} and \ref{blexer_retrieve}.
       
   502 \end{proof}
       
   503 Finally 
       
   504 
       
   505 
       
   506