lexing: ChengsongTanPhdThesis/Chapters/Bitcoded1.tex@3e19073e91f4


% Chapter Template

% Main chapter title
\chapter{Bit-coded Algorithm of Sulzmann and Lu}

\label{Bitcoded1} % Change X to a consecutive number; for referencing this chapter elsewhere, use \ref{ChapterX}
%Then we illustrate how the algorithm without bitcodes falls short for such aggressive 
%simplifications and therefore introduce our version of the bitcoded algorithm and 
%its correctness proof in 
%Chapter 3\ref{Chapter3}. 
In this chapter, we are going to describe the bit-coded algorithm
introduced by Sulzmann and Lu \parencite{Sulzmann2014} to address the growth problem of 
regular expressions. 
We have implemented their algorithm in Scala, and found out inefficiencies
in their algorithm such as de-duplication not working properly and redundant
fixpoint construction. Their algorithm is improved and verified with the help of
formal proofs.
\section{The Motivation Behind Using Bitcodes}
We first do a recap of what was going on 
in the lexer algorithm in Chapter \ref{Inj},
\begin{center}
\begin{tabular}{lcl}
	$\lexer \; r \; [] $ & $=$ & $\textit{if} \; (\nullable \; r)\; \textit{then}\;  \Some(\mkeps \; r) \; \textit{else} \; \None$\\
	$\lexer \; r \;c::s$ & $=$ & $\textit{case}\; (\lexer \; (r\backslash c) \; s) \;\textit{of}\; $\\
	& & $\quad \phantom{\mid}\; \None \implies \None$\\
	& & $\quad \mid           \Some(v) \implies \Some(\inj \; r\; c\; v)$
\end{tabular}
\end{center}
\noindent
The algorithm recursively calls $\lexer$ on 
each new character input,
and before starting a child call
it stores information of previous lexing steps
on a stack, in the form of regular expressions
and characters: $r_0$, $c_0$, $r_1$, $c_1$, etc.
Each descent into deeper recursive calls in $\lexer$
causes a new pair of $r_i, c_i$ to be pushed to the call stack.
\begin{figure}[H]
\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick] 
%\draw (-6,-6) grid (6,6);
\node  [ circle ] (r) at (-6, 5) {$r$};

%\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$};
\node  [circle, minimum size = 0.1, draw] (c1) at (-4, 5.4) {$c_1$};
%
%\node (-2, 5) (r1) circle [radius = 0.5] {$r_1$};
\node  [minimum size = 0.5, circle, draw] (r1) at (-2, 5) {$r_1$};

\node [minimum width = 2cm, rectangle, draw] (stack) at (0, 3) {Stack};

\path
	(r)
        edge [->, >=stealth',shorten >=1pt] node[left] {} (r1);

\path   (r1)
	edge [bend right, dashed] node {saved} (stack);
\path   (c1)
	edge [bend right, dashed] node {} (stack);


\end{tikzpicture}
\caption{First Derivative Taken}
\end{figure}



\begin{figure}[H]
\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick] 
%\draw (-6,-6) grid (6,6);
\node  [ circle ] (r) at (-6, 5) {$r$};

%\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$};
\node  [circle, minimum size = 0.1, ] (c1) at (-4, 5.4) {$c_1$};
%
%\node (-2, 5) (r1) circle [radius = 0.5] {$r_1$};
\node  [minimum size = 0.5, circle, ] (r1) at (-2, 5) {$r_1$};


\node [circle, minimum size = 0.1, draw] (c2) at (0, 5.4) {$c_2$};
%
%\node (2, 5) (r2) circle [radius = 0.5] {$r_2$};
\node [circle, draw] (r2) at (2, 5) {$r_2$};
\node [minimum width = 3cm, minimum height = 1cm, rectangle, draw] (stack) at (0, 2) {\large Stack};

\path
	(r)
        edge [->, >=stealth',shorten >=1pt] node[left] {} (r1);

\path   (r2)
	edge [bend right, dashed] node {} (stack);
\path   (c2)
	edge [bend right, dashed] node {} (stack);

\path   (r1)
	edge [] node {} (r2);

\end{tikzpicture}
\caption{Second Derivative Taken}
\end{figure}
\noindent
As the number of derivative steps increase,
the stack would increase:

\begin{figure}[H]
\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick] 
%\draw (-6,-6) grid (6,6);
\node  [ circle ] (r) at (-6, 5) {$r$};

%\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$};
\node  [circle, minimum size = 0.1, ] (c1) at (-4, 5.4) {$c_1$};
%
%\node (-2, 5) (r1) circle [radius = 0.5] {$r_1$};
\node  [minimum size = 0.5, circle, ] (r1) at (-2, 5) {$r_1$};


\node [circle, minimum size = 0.1, ] (c2) at (0, 5.4) {$c_2$};
%
%\node (2, 5) (r2) circle [radius = 0.5] {$r_2$};
\node [circle, ] (r2) at (2, 5) {$r_2$};
\node [minimum width = 4cm, minimum height = 2.5cm, rectangle, draw] (stack) at (0, 1) { \large Stack};

\node [] (ldots) at (3.5, 5) {};
%\node (6, 5) (rn) circle [radius = 0.5] {$r_n$};

\node [minimum size = 0.5, circle, ] (rn) at (6, 5) {};

\node (rldots) at ($(ldots)!.4!(rn)$) {\ldots};

\path
	(r)
        edge [->, >=stealth',shorten >=1pt] node[left] {} (r1);

\path   (rldots)
	edge [bend left, dashed] node {} (stack);

\path   (r1)
	edge [] node {} (r2);

\path   (r2)
	edge [] node {} (ldots);
\path   (ldots)
	edge [bend left, dashed] node {} (stack);
\path   (5.03, 4.9)
	edge [bend left, dashed] node {} (stack);
\end{tikzpicture}
\caption{More Derivatives Taken}
\end{figure}


\begin{figure}[H]
\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick] 
%\draw (-6,-6) grid (6,6);
\node  [radius = 0.5, circle, draw] (r) at (-6, 5) {$r$};

%\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$};
\node  [circle, minimum size = 0.1, draw] (c1) at (-4, 5.4) {$c_1$};
%
%\node (-2, 5) (r1) circle [radius = 0.5] {$r_1$};
\node  [minimum size = 0.5, circle, draw] (r1) at (-2, 5) {$r_1$};
%
%\node (0, 6)  (c2) circle [radius = 0.3] {$c_2$};
\node [circle, minimum size = 0.1, draw] (c2) at (0, 5.4) {$c_2$};
%
%\node (2, 5) (r2) circle [radius = 0.5] {$r_2$};
\node [circle, draw] (r2) at (2, 5) {$r_2$};
%
%
\node [] (ldots) at (4.5, 5) {};
%\node (6, 5) (rn) circle [radius = 0.5] {$r_n$};

\node [minimum size = 0.5, circle, draw] (rn) at (6, 5) {$r_n$};

\node at ($(ldots)!.4!(rn)$) {\ldots};




\node [minimum size = 6cm, rectangle, draw] (stack) at (0, 0) {\Huge Stack};

\path
	(r)
        edge [->, >=stealth',shorten >=1pt] node[left] {} (r1);
\path
	(r1)
        edge [] node {} (r2);
\path   (r2)
	edge [] node {} (ldots);
\path   (r)
	edge [dashed, bend right] node {} (stack);

\path   (r1)
	edge [dashed, ] node {} (stack);

\path   (c1)
	edge [dashed, bend right] node {} (stack);

\path   (c2)
	edge [dashed] node {} (stack);
\path   (4.5, 5)
	edge [dashed, bend left] node {} (stack);
\path   (4.9, 5)
	edge [dashed, bend left] node {} (stack);
\path   (5.3, 5)
	edge [dashed, bend left] node {} (stack);
\path (r2)
	edge [dashed, ] node {} (stack);
\path (rn)
	edge [dashed, bend left] node {} (stack);
\end{tikzpicture}
\caption{Before Injection Phase Starts}
\end{figure}


\noindent
After all derivatives have been taken, the stack grows to a maximum size
and the pair of regular expressions and characters $r_i, c_{i+1}$ 
are then popped out and used in the injection phase.



\begin{figure}[H]
\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick] 
%\draw (-6,-6) grid (6,6);
\node  [radius = 0.5, circle, ] (r) at (-6, 5) {$r$};

%\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$};
\node  [circle, minimum size = 0.1, ] (c1) at (-4, 5.4) {$c_1$};
%
%\node (-2, 5) (r1) circle [radius = 0.5] {$r_1$};
\node  [minimum size = 0.5, circle, ] (r1) at (-2, 5) {$r_1$};
%
%\node (0, 6)  (c2) circle [radius = 0.3] {$c_2$};
\node [circle, minimum size = 0.1, ] (c2) at (0, 5.4) {$c_2$};
%
%\node (2, 5) (r2) circle [radius = 0.5] {$r_2$};
\node [circle, ] (r2) at (2, 5) {$r_2$};
%
%
\node [] (ldots) at (4.5, 5) {};
%\node (6, 5) (rn) circle [radius = 0.5] {$r_n$};

\node [minimum size = 0.5, circle, ] (rn) at (6, 5) {$r_n$};

\node at ($(ldots)!.4!(rn)$) {\ldots};

\node [minimum size = 0.5, circle, ] (vn) at (6, -5) {$v_n$};

\node [] (ldots2) at (3.5, -5) {};

\node  [minimum size = 0.5, circle, ] (v2) at (2, -5) {$v_2$};

\node at ($(ldots2)!.4!(v2)$) {\ldots};


\node [circle, ] (v1) at (-2, -5) {$v_1$};

\node  [radius = 0.5, circle, ] (v) at (-6, -5) {$v$};

\node [minimum size = 6cm, rectangle, draw] (stack) at (0, 0) {\Huge Stack};

\path
	(r)
        edge [->, >=stealth',shorten >=1pt] node[left] {} (r1);
\path
	(r1)
        edge [] node {} (r2);
\path   (r2)
	edge [] node {} (ldots);
\path   (rn)
	edge [] node {$\mkeps$} (vn);
\path   (vn) 
	edge [] node {} (ldots2);
\path   (v2)
	edge [] node {$\inj \; r_1 \; c_2\;v_2$} (v1);

\path   (v1)
	edge [] node {$\inj \; r \; c_1 \; v_1$} (v);

\path (stack)
	edge [dashed] node {} (-4.2, -5.2);
\path (stack)
	edge [dashed] node {} (-4, -5.2);
\path (stack)
	edge [dashed] node {} (-0.1, -5.2);
\path (stack)
	edge [dashed] node {} (0.2, -5.26);
\path (stack)
	edge [dashed] node {} (3.2, -5);
\path (stack)
	edge [dashed] node {} (2.7, -5);
\path (stack)
	edge [dashed] node {} (3.7, -5);
\end{tikzpicture}
\caption{Stored $r_i, c_{i+1}$ Used by $\inj$}
\end{figure}
\noindent
Storing all $r_i, c_{i+1}$ pairs recursively
allows the algorithm to work in an elegant way, at the expense of 
storing quite a bit of verbose information.
The stack seems to grow at least quadratically fast with respect
to the input (not taking into account the size bloat of $r_i$),
which can be inefficient and prone to stack overflow.
\section{Bitcoded Algorithm}
To address this,
Sulzmann and Lu chose to  define a new datatype 
called \emph{annotated regular expression},
which condenses all the partial lexing information
(that was originally stored in $r_i, c_{i+1}$ pairs)
into bitcodes.
The bitcodes are then carried with the regular
expression, and augmented or moved around
as the lexing goes on.
It becomes unnecessary
to remember all the 
intermediate expresssions, but only the most recent one
with this bit-carrying regular expression.
Annotated regular expressions 
are defined as the following 
Isabelle datatype \footnote{ We use subscript notation to indicate
	that the bitcodes are auxiliary information that do not
interfere with the structure of the regular expressions }:
\begin{center}
\begin{tabular}{lcl}
  $\textit{a}$ & $::=$  & $\ZERO$\\
                  & $\mid$ & $_{bs}\ONE$\\
                  & $\mid$ & $_{bs}{\bf c}$\\
                  & $\mid$ & $_{bs}\sum\,as$\\
                  & $\mid$ & $_{bs}a_1\cdot a_2$\\
                  & $\mid$ & $_{bs}a^*$
\end{tabular}    
\end{center}  
\noindent
where $bs$ stands for bit-codes, $a$  for $\mathbf{a}$nnotated regular
expressions and $as$ for lists of annotated regular expressions.
The alternative constructor, written, $\sum$, has been generalised to 
accept a list of annotated regular expressions rather than just two.
Why is it generalised? This is because when we open up nested 
alternatives, there could be more than two elements at the same level
after de-duplication, which can no longer be stored in a binary
constructor.
Bits and bitcodes (lists of bits) are defined as:
\begin{center}
		$b ::=   S \mid  Z \qquad
bs ::= [] \mid b::bs    
$
\end{center}
\noindent
We use $S$ and $Z$ rather than $1$ and $0$ is to avoid
confusion with the regular expressions $\ZERO$ and $\ONE$.
The idea is to use the attached bitcodes
to indicate which choice was made at a certain point
during lexing.
For example,
$(_{Z}a+_{S}b) \backslash a$ gives us
$_{Z}\ONE + \ZERO$, this $Z$ bitcode will
later be used to decode that a left branch was
selected at the time when the part $a+b$ is being taken
derivative of.
\subsection{A Bird's Eye View of the Bit-coded Lexer}
Before we give out the rest of the functions and definitions 
related to our
$\blexer$ (\emph{b}-itcoded lexer), we first provide a high-level
view of the algorithm, so the reader does not get lost in
the details.
\begin{figure}[H]
\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick] 
%\draw (-6,-6) grid (6,6);

	\node [circle, draw] (r0) at (-6, 2) {$r$};

	\node  [radius = 0.5, circle, draw] (r) at (-6, 5) {$_{bs}a$};
	\path (r0)
		edge [] node {$\internalise$} (r);
%\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$};
\node  [circle, minimum size = 0.1, draw] (c1) at (-4, 5.4) {$c_1$};
%
%\node (-2, 5) (r1) circle [radius = 0.5] {$r_1$};
\node  [minimum size = 1.0cm, circle, draw] (r1) at (-2, 5) {$_{bs_1}a_1$};
%
%\node (0, 6)  (c2) circle [radius = 0.3] {$c_2$};
\node [circle, minimum size = 0.1, draw] (c2) at (0, 5.4) {$c_2$};
%
%\node (2, 5) (r2) circle [radius = 0.5] {$r_2$};
\node [circle, draw, minimum size = 1.4cm] (r2) at (2, 5) {$_{bs_2}a_2$};
%
%
\node [] (ldots) at (4.5, 5) {};
%\node (6, 5) (rn) circle [radius = 0.5] {$r_n$};

\node [minimum size = 2.2cm, circle, draw] (rn) at (6, 5) {$_{bs_n}a_n$};

\node at ($(ldots)!.1!(rn)$) {\ldots};

\node [minimum size = 0.5, circle, ] (v) at (6, 2) {$v$};

%\node [] (v2) at (4, -5) {};
%
%\node [draw, cross out] (ldots2) at (5, -5) {};
%
%\node at ($(ldots2)!.4!(v2)$) {\ldots};


\node [align = center] (decode) at (6.6, 3.2) {$\bmkeps$\\$\decode$};

\path (c1)
	edge [dashed, bend left] node {} (r0);

\path (c2)
	edge [dashed, bend left] node {} (r0);

\path (r1)
	edge [dashed, bend right] node {} (r2);


\path
	(r)
        edge [dashed, bend right] node[left] {} (r1);

\path
	(r)
        edge [->, >=stealth',shorten >=1pt] node[left] {} (r1);

\path
	(r1)
        edge [] node {} (r2);
\path   (r2)
	edge [] node {} (ldots);
\path   (rn)
	edge [] node {} (v);

\path	(r0)
	edge [dashed, bend right] node {} (decode);
%\path	(v)
	%edge [] node {} (ldots2);



\end{tikzpicture}
\caption{Bird's Eye View of $\blexer$}
\end{figure}
\noindent
The plain regular expressions
are first ``lifted'' to an annotated regular expression,
with the function called \emph{internalise}.
Then the annotated regular expression $_{bs}a$ will
go through successive derivatives with respect 
to the input stream of characters $c_1, c_2$ etc.
Each time a derivative is taken, the bitcodes
are moved around, discarded or augmented together 
with the regular expression they are attached to.
After all input has been consumed, the 
bitcodes are collected by $\bmkeps$,
which traverses the nullable part of the regular expression
and collect the bitcodes along the way.
The collected bitcodes are then $\decode$d with the guidance
of the input regular expression $r$.
The most notable improvements of $\blexer$
over $\lexer$ are 
\begin{itemize}
	\item

		An absence of the second injection phase.
	\item
		One need not store each pair of the 
		intermediate regular expressions $_{bs_i}a_i$ and  $c_{i+1}$. 
		The previous annotated regular expression $_{bs_i}a_i$'s information is passed
		on without loss to its successor $_{bs_{i+1}}a_{i+1}$,
		and $c_{i+1}$'s already contained in the strings in $L\;r$ \footnote{
		which can be easily recovered if we decode in the correct order}.
	\item
		The simplification works much better--one can maintain correctness
		while applying quite strong simplifications, as we shall wee.
\end{itemize}
\noindent
In the next section we will 
give the operations needed in $\blexer$,
with some remarks on the idea behind their definitions.
\subsection{Operations in $\textit{Blexer}$}
The first operation we define related to bit-coded regular expressions
is how we move bits to the inside of regular expressions.
Called $\fuse$, this operation attaches bit-codes 
to the front of an annotated regular expression:
\begin{center}
\begin{tabular}{lcl}
  $\textit{fuse}\;bs \; \ZERO$ & $\dn$ & $\ZERO$\\
  $\textit{fuse}\;bs\; _{bs'}\ONE$ & $\dn$ &
     $_{bs @ bs'}\ONE$\\
  $\textit{fuse}\;bs\;_{bs'}{\bf c}$ & $\dn$ &
     $_{bs@bs'}{\bf c}$\\
  $\textit{fuse}\;bs\,_{bs'}\sum\textit{as}$ & $\dn$ &
     $_{bs@bs'}\sum\textit{as}$\\
  $\textit{fuse}\;bs\; _{bs'}a_1\cdot a_2$ & $\dn$ &
     $_{bs@bs'}a_1 \cdot a_2$\\
  $\textit{fuse}\;bs\,_{bs'}a^*$ & $\dn$ &
     $_{bs @ bs'}a^*$
\end{tabular}    
\end{center} 

\noindent
With \emph{fuse} we are able to define the $\internalise$ function
that translates a ``standard'' regular expression into an
annotated regular expression.
This function will be applied before we start
with the derivative phase of the algorithm.

\begin{center}
\begin{tabular}{lcl}
  $(\ZERO)^\uparrow$ & $\dn$ & $\ZERO$\\
  $(\ONE)^\uparrow$ & $\dn$ & $_{[]}\ONE$\\
  $(c)^\uparrow$ & $\dn$ & $_{[]}{\bf c}$\\
  $(r_1 + r_2)^\uparrow$ & $\dn$ &
  $_{[]}\sum[\textit{fuse}\,[Z]\,r_1^\uparrow,\,
  \textit{fuse}\,[S]\,r_2^\uparrow]$\\
  $(r_1\cdot r_2)^\uparrow$ & $\dn$ &
         $_{[]}r_1^\uparrow \cdot r_2^\uparrow$\\
  $(r^*)^\uparrow$ & $\dn$ &
         $_{[]}(r^\uparrow)^*$\\
\end{tabular}    
\end{center}    
\noindent
We use an up arrow with postfix notation
to denote this operation
for convenience. 
The opposite of $\textit{internalise}$ is
$\erase$, where all the bit-codes are removed,
and the alternative operator $\sum$ for annotated
regular expressions is transformed to the binary version 
in plain regular expressions.
\begin{center}
	\begin{tabular}{lcl}
		$\ZERO_\downarrow$ & $\dn$ & $\ZERO$\\
		$( _{bs}\ONE )_\downarrow$ & $\dn$ & $\ONE$\\
		$( _{bs}\mathbf{c} )_\downarrow$ & $\dn$ & $\mathbf{c}$\\
		$( _{bs} a_1 \cdot a_2 )_\downarrow$ & $\dn$ & 
		$ (a_1) _\downarrow \cdot  (a_2) _\downarrow$\\
		$( _{bs} [])_\downarrow $ & $\dn$ & $\ZERO $\\
		$( _{bs} [a]  )_\downarrow$ & $\dn$ & $a_\downarrow$\\
		$_{bs} \sum [a_1, \; a_2]$ & $\dn$ & $ (a_1) _\downarrow + ( a_2 ) _\downarrow $\\
		$(_{bs} \sum (a :: as))_\downarrow$ & $\dn$ & $ a_\downarrow + \; (_{[]} \sum as)_\downarrow$\\
		$( _{bs} a^* )_\downarrow$ & $\dn$ & $(a_\downarrow)^*$
	\end{tabular}
\end{center}
\noindent
We also abbreviate the $\erase\; a$ operation
as $a_\downarrow$, for conciseness.

Testing whether an annotated regular expression
contains the empty string in its lauguage requires
a dedicated function $\bnullable$.
$\bnullable$ simply calls $\erase$ before $\nullable$.
\begin{definition}
		$\bnullable \; a \dn  \nullable \; (a_\downarrow)$
\end{definition}
The function for collecting 
bitcodes from a 
(b)nullable regular expression is called $\bmkeps$.
$\bmkeps$ is a variation of the $\textit{mkeps}$ function,
which follows the path $\mkeps$ takes to traverse the
$\nullable$ branches when visiting a regular expression,
but gives back bitcodes instead of a value.
\begin{center}
\begin{tabular}{lcl}
  $\textit{bmkeps}\,(_{bs}\ONE)$ & $\dn$ & $bs$\\
  $\textit{bmkeps}\,(_{bs}\sum a::\textit{as})$ & $\dn$ &
     $\textit{if}\;\textit{bnullable}\,a$\\
  & &$\textit{then}\;bs\,@\,\textit{bmkeps}\,a$\\
  & &$\textit{else}\;bs\,@\,\textit{bmkeps}\,(_{[]}\sum \textit{as})$\\
  $\textit{bmkeps}\,(_{bs} a_1 \cdot a_2)$ & $\dn$ &
     $bs \,@\,\textit{bmkeps}\,a_1\,@\, \textit{bmkeps}\,a_2$\\
  $\textit{bmkeps}\,(_{bs}a^*)$ & $\dn$ &
     $bs \,@\, [Z]$
\end{tabular}    
\end{center}    
\noindent
$\bmkeps$, just like $\mkeps$, 
visits a regular expression tree respecting
the POSIX rules. The difference, however, is that
it does not create values, but only bitcodes.
It traverses each child of the sequence regular expression
from left to right and creates a bitcode by stitching
together bitcodes obtained from the children expressions.
In the case of alternative regular expressions, 
it looks for the leftmost
$\nullable$ branch
to visit and ignores other siblings.
%Whenever there is some bitcodes attached to a
%node, it returns the bitcodes concatenated with whatever
%child recursive calls return.
The only time when $\bmkeps$ creates new bitcodes
is when it completes a star's iterations by attaching a $S$ to the end of the bitcode
list it returns.\\
The bitcodes extracted by $\bmkeps$ need to be 
$\decode$d (with the guidance of a plain regular expression):
%\begin{definition}[Bitdecoding of Values]\mbox{}
\begin{center}
\begin{tabular}{@{}l@{\hspace{1mm}}c@{\hspace{1mm}}l@{}}
  $\textit{decode}'\,bs\,(\ONE)$ & $\dn$ & $(\Empty, bs)$\\
  $\textit{decode}'\,bs\,(c)$ & $\dn$ & $(\Char\,c, bs)$\\
  $\textit{decode}'\,(Z\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
     $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}\;
       (\Left\,v, bs_1)$\\
  $\textit{decode}'\,(S\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
     $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_2\;\textit{in}\;
       (\Right\,v, bs_1)$\\                           
  $\textit{decode}'\,bs\;(r_1\cdot r_2)$ & $\dn$ &
        $\textit{let}\,(v_1, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}$\\
  & &   $\textit{let}\,(v_2, bs_2) = \textit{decode}'\,bs_1\,r_2$\\
  & &   \hspace{35mm}$\textit{in}\;(\Seq\,v_1\,v_2, bs_2)$\\
  $\textit{decode}'\,(Z\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\
  $\textit{decode}'\,(S\!::\!bs)\,(r^*)$ & $\dn$ & 
         $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r\;\textit{in}$\\
  & &   $\textit{let}\,(\Stars\,vs, bs_2) = \textit{decode}'\,bs_1\,r^*$\\
  & &   \hspace{35mm}$\textit{in}\;(\Stars\,v\!::\!vs, bs_2)$\bigskip\\
  
  $\textit{decode}\,bs\,r$ & $\dn$ &
     $\textit{let}\,(v, bs') = \textit{decode}'\,bs\,r\;\textit{in}$\\
  & & $\textit{if}\;bs' = []\;\textit{then}\;\textit{Some}\,v\;
       \textit{else}\;\textit{None}$                       
\end{tabular}    
\end{center} 
\noindent
The function $\decode'$ returns a pair consisting of 
a partially decoded value and some leftover bit list that cannot
be decide yet.
The function $\decode'$ succeeds if the left-over 
bit-sequence is empty.
$\decode$ is terminating as $\decode'$ is terminating.
$\decode'$ is terminating 
because at least one of $\decode'$'s parameters will go down in terms
of size.\\
The reverse operation of $\decode$ is $\code$.
$\textit{code}$ encodes a value into a bitcode by converting
$\Left$ into $Z$, $\Right$ into $S$, and marks the start of any non-empty
star iteration by $S$. The border where a star iteration
terminates is marked by $Z$. 
This coding is lossy, as it throws away the information about
characters, and does not encode the ``boundary'' between two
sequence values. Moreover, with only the bitcode we cannot even tell
whether the $S$s and $Z$s are for $\Left/\Right$ or $\Stars$.  
\begin{center}
\begin{tabular}{lcl}
  $\textit{code}(\Empty)$ & $\dn$ & $[]$\\
  $\textit{code}(\Char\,c)$ & $\dn$ & $[]$\\
  $\textit{code}(\Left\,v)$ & $\dn$ & $Z :: code(v)$\\
  $\textit{code}(\Right\,v)$ & $\dn$ & $S :: code(v)$\\
  $\textit{code}(\Seq\,v_1\,v_2)$ & $\dn$ & $code(v_1) \,@\, code(v_2)$\\
  $\textit{code}(\Stars\,[])$ & $\dn$ & $[Z]$\\
  $\textit{code}(\Stars\,(v\!::\!vs))$ & $\dn$ & $S :: code(v) \;@\;
                                                 code(\Stars\,vs)$
\end{tabular}    
\end{center} 
\noindent
Assume we have a value $v$ and regular expression $r$
with $\vdash v:r$,
then we have the property that $\decode$ and $\code$ are
reverse operations of one another:
\begin{lemma}
\[If \vdash v : r \; then \;\decode \; (\code \; v) \; r = \textit{Some}(v) \]
\end{lemma}
\begin{proof}
By proving a more general version of the lemma, on $\decode'$:
\[\vdash v : r \implies \decode' \; ((\code \; v) @ ds) \; r = (v, ds) \]
Then setting $ds$ to be $[]$ and unfolding $\decode$ definition,
we obtain the property.
\end{proof}
\noindent
Now we give out the central part of this lexing algorithm,
the $\bder$ function (stands for \emph{b}itcoded-derivative):
\begin{center}
  \begin{tabular}{@{}lcl@{}}
  $(\ZERO)\,\backslash c$ & $\dn$ & $\ZERO$\\  
  $(_{bs}\ONE)\,\backslash c$ & $\dn$ & $\ZERO$\\  
  $(_{bs}{\bf d})\,\backslash c$ & $\dn$ &
        $\textit{if}\;c=d\; \;\textit{then}\;
         _{bs}\ONE\;\textit{else}\;\ZERO$\\  
  $(_{bs}\sum \;\textit{as})\,\backslash c$ & $\dn$ &
  $_{bs}\sum\;(\textit{map} \; (\_\backslash c) \; as )$\\
  $(_{bs}\;a_1\cdot a_2)\,\backslash c$ & $\dn$ &
     $\textit{if}\;\textit{bnullable}\,a_1$\\
					       & &$\textit{then}\;_{bs}\sum\,[(_{[]}\,(a_1\,\backslash c)\cdot\,a_2),$\\
					       & &$\phantom{\textit{then},\;_{bs}\sum\,}(\textit{fuse}\,(\textit{bmkeps}\,a_1)\,(a_2\,\backslash c))]$\\
  & &$\textit{else}\;_{bs}\,(a_1\,\backslash c)\cdot a_2$\\
  $(_{bs}a^*)\,\backslash c$ & $\dn$ &
      $_{bs}(\textit{fuse}\, [Z] \; r\,\backslash c)\cdot
       (_{[]}r^*))$
\end{tabular}    
\end{center}    
\noindent
For most time we use the infix notation $(\_\backslash\_)$ 
to mean $\bder$ for brevity when
there is no danger of confusion with derivatives on plain regular expressions.
For example, we write $( _{[]}r^* ) \backslash c$ instead of $\bder \;c \; _{[]}r^*$,
as the bitcodes at the front of $r^*$ indicates that it is 
a bit-coded regular expression, not a plain one.\\
$\bder$ tells us how regular expressions can be recursively traversed,
where the bitcodes are augmented and carried around 
when a derivative is taken.
We give the intuition behind some of the more involved cases in 
$\bder$. \\
For example,
in the \emph{star} case,
a derivative on $_{bs}a^*$ means 
that one more star iteratoin needs to be taken.
we need to unfold it into a sequence,
and attach an additional bit $Z$ to the front of $r \backslash c$
as a record to indicate one new star iteration is unfolded.

\noindent
\begin{center}
  \begin{tabular}{@{}lcl@{}}
  $(_{bs}a^*)\,\backslash c$ & $\dn$ &
  $_{bs}(\underbrace{\textit{fuse}\, [Z] \; a\,\backslash c}_{\text{One more iteration}})\cdot
       (_{[]}a^*))$
\end{tabular}    
\end{center}   

\noindent
This information will be recovered later by the $\decode$ function.
The intuition is that the bit $Z$ will be decoded at the right location,
because we accumulate bits from left to right (a rigorous proof will be given
later).

%\begin{tikzpicture}[ > = stealth, % arrow head style
%        shorten > = 1pt, % don't touch arrow head to node
%        semithick % line style
%    ]
%
%    \tikzstyle{every state}=[
%        draw = black,
%        thin,
%        fill = cyan!29,
%        minimum size = 7mm
%    ]
%    \begin{scope}[node distance=1cm and 0cm, every node/.style=state]
%		\node (k) [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
%        {$bs$
%         \nodepart{two} $a^*$ };
%	 \node (l) [below =of k, rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
%        { $bs$ + [Z]
%         \nodepart{two}  $(a\backslash c )\cdot a^*$ };
%    \end{scope}
%    \path[->] 
%	      (k) edge (l);
%\end{tikzpicture}
%
%Pictorially the process looks like below.
%Like before, the red region denotes
%previous lexing information (stored as bitcodes in $bs$).

%\begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}]
%	\begin{scope}[node distance=1cm]   
%		\node (a) [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
%        {$bs$
%         \nodepart{two} $a^*$ };
%	 \node (b) [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
%        { $bs$ + [Z]
%         \nodepart{two}  $(a\backslash c )\cdot a^*$ };
%%\caption{term 1 \ref{term:1}'s matching configuration}
% 	\end{scope}
%\end{tikzpicture}

\noindent
Another place the $\bder$ function differs
from derivatives on plain regular expressions
is the sequence case:
\begin{center}
  \begin{tabular}{@{}lcl@{}}

  $(_{bs}\;a_1\cdot a_2)\,\backslash c$ & $\dn$ &
     $\textit{if}\;\textit{bnullable}\,a_1$\\
					       & &$\textit{then}\;_{bs}\sum\,[(_{[]}\,(a_1\,\backslash c)\cdot\,a_2),$\\
					       & &$\phantom{\textit{then},\;_{bs}\sum\,}(\textit{fuse}\,(\textit{bmkeps}\,a_1)\,(a_2\,\backslash c))]$\\
  & &$\textit{else}\;_{bs}\,(a_1\,\backslash c)\cdot a_2$
\end{tabular}    
\end{center}    
\noindent
The difference is that (when $a_1$ is $\bnullable$)
we use $\bmkeps$ to store the lexing information
in $a_1$ before collapsing 
it (as it has been fully matched by string prior to $c$),
and attach the collected bit-codes to the front of $a_2$
before throwing away $a_1$. We assume that $\bmkeps$ 
correctly extracts the bitcode for how $a_1$
matches the string prior to $c$ (more on this later).
The bitsequence $\textit{bs}$ which was initially 
attached to the first element of the sequence
$a_1 \cdot a_2$, has now been 
elevated to the top level of the $\sum$ constructor. 
This is because this piece of information will be needed 
whichever way the sequence is matched,
regardless of whether $c$ belongs to $a_1$ or $a_2$.

In the injection-based lexer, $r_1$ is immediately thrown away in 
subsequent derivatives on the right branch (when $r_1$ is $\nullable$),
depite $r_1$ potentially storing information of previous matches:
\begin{center}
	$(r_1 \cdot r_2 )\backslash c = (r_1 \backslash c) \cdot r_2 + r_2 \backslash c$
\end{center}
\noindent
this loss of information makes it necessary
to store on stack all the regular expressions'
``snapshots'' before they were
taken derivative of.
So that the related information will be available 
once the child recursive 
call finishes.\\
The rest of the clauses of $\bder$ is rather similar to
$\der$. \\
Generalising the derivative operation with bitcodes to strings, we have 
\begin{center}
	\begin{tabular}{@{}lcl@{}}
		$a\backslash_s [] $ & $\dn$ & $a$\\
		$a\backslash (c :: s) $ & $\dn$ & $(a \backslash c) \backslash_s s$
	\end{tabular}
\end{center}
\noindent
As we did earlier, we omit the $s$ subscript at $\backslash_s$ when there is no danger
of confusion.
\subsection{Putting Things Together}
Putting these operations altogether, we obtain a lexer with bit-coded regular expressions
as its internal data structures, which we call $\blexer$:

\begin{center}
\begin{tabular}{lcl}
  $\textit{blexer}\;r\,s$ & $\dn$ &
      $\textit{let}\;a = (r^\uparrow)\backslash s\;\textit{in}$\\                
  & & $\;\;\textit{if}\; \textit{bnullable}(a)$\\
  & & $\;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,a)\,r$\\
  & & $\;\;\textit{else}\;\textit{None}$
\end{tabular}
\end{center}

\noindent
$\blexer$ first attaches bitcodes to a
plain regular expression, and then do successive derivatives
with respect to the input string $s$, and
then test whether the result is nullable.
If yes, then extract the bitcodes out of the nullable expression,
and decodes the bitcodes into a lexical value.
If there does not exists a match between $r$ and $s$ the lexer
outputs $\None$ indicating a failed lex.\\
Ausaf and Urban formally proved the correctness of the $\blexer$, namely
\begin{property}
$\blexer \;r \; s = \lexer \; r \; s$.
\end{property}
\noindent
This was claimed but not formalised in Sulzmann and Lu's work.
We introduce the proof later, after we give all
the needed auxiliary functions and definitions.
\subsection{An Example $\blexer$ Run}
Before introducing the proof we shall first walk the reader 
through a concrete example of our $\blexer$ calculating the right 
lexical information through bit-coded regular expressions.\\
Consider the regular expression $(aa)^* \cdot (b+c)$ matching the string $aab$.
We give again the bird's eye view of this particular example 
in each stage of the algorithm:

\tikzset{three sided/.style={
        draw=none,
        append after command={
            [-,shorten <= -0.5\pgflinewidth]
            ([shift={(-1.5\pgflinewidth,-0.5\pgflinewidth)}]\tikzlastnode.north east)
        edge([shift={( 0.5\pgflinewidth,-0.5\pgflinewidth)}]\tikzlastnode.north west) 
            ([shift={( 0.5\pgflinewidth,-0.5\pgflinewidth)}]\tikzlastnode.north west)
        edge([shift={( 0.5\pgflinewidth,+0.5\pgflinewidth)}]\tikzlastnode.south west)            
            ([shift={( 0.5\pgflinewidth,+0.5\pgflinewidth)}]\tikzlastnode.south west)
        edge([shift={(-1.0\pgflinewidth,+0.5\pgflinewidth)}]\tikzlastnode.south east)
        }
    }
}

\tikzset{three sided1/.style={
        draw=none,
        append after command={
            [-,shorten <= -0.5\pgflinewidth]
            ([shift={(1.5\pgflinewidth,-0.5\pgflinewidth)}]\tikzlastnode.north west)
        edge([shift={(-0.5\pgflinewidth,-0.5\pgflinewidth)}]\tikzlastnode.north east) 
            ([shift={(-0.5\pgflinewidth,-0.5\pgflinewidth)}]\tikzlastnode.north east)
        edge([shift={(-0.5\pgflinewidth,+0.5\pgflinewidth)}]\tikzlastnode.south east)            
            ([shift={(-0.5\pgflinewidth,+0.5\pgflinewidth)}]\tikzlastnode.south east)
        edge([shift={(1.0\pgflinewidth,+0.5\pgflinewidth)}]\tikzlastnode.south west)
        }
    }
}

\begin{figure}[H]
	\begin{tikzpicture}[->, >=stealth', shorten >= 1pt, auto, thick]
		\node [rectangle, draw] (r) at (-6, -1) {$(aa)^*(b+c)$};
		\node [rectangle, draw] (a) at (-6, 4)	  {$(aa)^*(_{Z}b + _{S}c)$};
		\path	(r)
			edge [] node {$\internalise$} (a);
		\node [rectangle, draw] (a1) at (-3, 1) {$(_{Z}(\ONE \cdot a) \cdot (aa)^*) (_{Z}b + _Sc)$};
		\path	(a)
			edge [] node {$\backslash a$} (a1);

		\node [rectangle, draw, three sided] (a21) at (-2.5, 4) {$(_{Z}\ONE \cdot (aa)^*)$};
		\node [rectangle, draw, three sided1] (a22) at (-0.8, 4) {$(_{Z}b + _{S}c)$};
		\path	(a1)
			edge [] node {$\backslash a$} (a21);
		\node [rectangle, draw] (a3) at (0.5, 2) {$_{ZS}(_{Z}\ONE + \ZERO)$};
		\path	(a22)
			edge [] node {$\backslash b$} (a3);
		\path	(a21)
			edge [dashed, bend right] node {} (a3);
		\node [rectangle, draw] (bs) at (2, 4) {$ZSZ$};
		\path	(a3)
			edge [below] node {$\bmkeps$} (bs);
		\node [rectangle, draw] (v) at (3, 0) {$\Seq \; (\Stars\; [\Seq \; a \; a]) \; (\Left \; b)$};
		\path 	(bs)
			edge [] node {$\decode$} (v);


	\end{tikzpicture}
	\caption{$\blexer \;\;\;\; (aa)^*(b+c) \;\;\;\; aab$}
\end{figure}
\noindent
The one dashed arrow indicates that $_Z(\ONE \cdot (aa)^*)$
turned into $ZS$ after derivative w.r.t $b$
is taken, which calls $\bmkeps$ on the nuallable $_Z\ONE\cdot (aa)^*$
before processing $_Zb+_Sc$.\\
The annotated regular expressions
would look too cumbersome if we explicitly indicate all the
locations where bitcodes are attached.
For example,
$(aa)^*\cdot (b+c)$ would 
look like $_{[]}(_{[]}(_{[]}a \cdot _{[]}a)^*\cdot _{[]}(_{[]}b+_{[]}c))$ 
after 
internalise.
Therefore for readability we omit bitcodes if they are empty. 
This applies to all example annotated 
regular expressions in this thesis.\\
%and assume we have just read the first character $a$:
%\begin{center}
%\begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}]
%    \node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
%	    {$(_{[Z]}(\ONE \cdot a) \cdot (aa)^* )\cdot bc$ 
%	    \nodepart{two} $[Z] \iff \Seq \; (\Stars \; [\Seq\; a \; ??, \;??]) \; ??$};
%\end{tikzpicture}
%\end{center}
%\noindent
%We use the red area for (annotated) regular expressions and the blue 
%area the (partially calculated) bitcodes 
%and its corresponding (partial) values.
%The first derivative 
%generates a $Z$ bitcode to indicate
%a new iteration has been started.
%This bitcode is attached to the front of
%the unrolled star iteration $\ONE\cdot a$
%for later decoding.
%\begin{center}
%\begin{tikzpicture}[]
%    \node [rectangle split, rectangle split horizontal, 
%	    rectangle split parts=2, rectangle split part fill={red!30,blue!20}, draw, rounded corners, inner sep=10pt]
%	    (der2) at (0,0)
%	    {$(_{[Z]}(\ONE \cdot \ONE) \cdot (aa)^*) \cdot bc $ 
%	    \nodepart{two} $[Z] \iff \Seq \; (\Stars \; [\Seq \; a\;a, ??]) \; ??$};
%
%\node [draw=none, minimum size = 0.1, ] (r) at (-7, 0) {$a_1$};
%\path
%	(r)
%	edge [->, >=stealth',shorten >=1pt, above] node {$\backslash a$} (der2);
%%\caption{term 1 \ref{term:1}'s matching configuration}
%\end{tikzpicture}
%\end{center}
%\noindent
%After we take derivative with respect to 
%second input character $a$, the annotated
%regular expression has the second $a$ chopped off.
%The second derivative does not involve any 
%new bitcodes being generated, because
%there are no new iterations or bifurcations
%in the regular expression requiring any $S$ or $Z$ marker
%to indicate choices.
%\begin{center}
%\begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}]
%    \node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
%	    {$(_{[Z]}(\ONE \cdot \ONE) \cdot (aa)^*) \cdot (\ONE \cdot c) $ 
%	    \nodepart{two} $[Z] \iff \Seq \; (\Stars \; [\Seq \; a\;a, ??]) \; ??$};
%%\caption{term 1 \ref{term:1}'s matching configuration}
%\end{tikzpicture}
%\end{center}
%\noindent
%
%
%\begin{center}
%\begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}]
%    \node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
%	    {$\stackrel{Bitcoded}{\longrightarrow} \Seq(\Stars[\Char(a), \Char(a)], ???)$
%	\nodepart{two} $\Seq(\ldots, \Seq(\Char(b), \Char(c)))$  $\stackrel{Inj}{\longleftarrow}$};
%%\caption{term 1 \ref{term:1}'s matching configuration}
%\end{tikzpicture}
%\end{center}
%\noindent
%If we do this kind of "attachment"
%and each time augment the attached partially
%constructed value when taking off a 
%character:
%\begin{center}
%\begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}]
%	\node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},] (spPoint)
%        {$\Seq(\Stars[\Char(a), \Char(a)], \ldots)$
%         \nodepart{two} Remaining: $b c$};
%\end{tikzpicture}\\
%$\downarrow$\\
%\begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}]
%    \node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
%        {$\Seq(\Stars[\Char(a), \Char(a)], \Seq(\Char(b), \ldots))$
%         \nodepart{two} Remaining: $c$};
%\end{tikzpicture}\\
%$\downarrow$\\
%\begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}]
%    \node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
%        {$\Seq(\Stars[\Char(a), \Char(a)], \Seq(\Char(b), \Char(c)))$
%         \nodepart{two} EOF};
%\end{tikzpicture}
%\end{center}
\noindent
In the next section we introduce the correctness proof
found by Ausaf and Urban
of the bitcoded lexer.
%-----------------------------------
%	SUBSECTION 1
%-----------------------------------
\section{Correctness Proof of $\textit{Blexer}$}
Why is $\blexer$ correct?
In other words, why is it the case that 
$\blexer$ outputs the same value as $\lexer$?
Intuitively,
that is because 
\begin{itemize}
	\item
		$\blexer$ follows an almost identical
		path as that of $\lexer$, 
		for example $r_1, r_2, \ldots$ and $a_1, a_2, \ldots$ being produced,
		which are the same up to the application of $\erase$.
	\item
		The bit-encodings work properly,
		allowing the possibility of 
		pulling out the right lexical value from an
		annotated regular expression at 
		any stage of the algorithm.
\end{itemize}
We will elaborate on this, with the help of
some helper functions such as $\retrieve$ and
$\flex$.
\subsection{Specifications of Some Helper Functions}
The functions we introduce will give a more detailed glimpse into 
the lexing process, which is not be possible
using $\lexer$ or $\blexer$ alone.
\subsubsection{$\textit{Retrieve}$}
The first function we shall introduce is $\retrieve$.
Sulzmann and Lu gave its definition, and
Ausaf and Urban found
its usage in mechanised proofs.
Our bit-coded lexer ``retrieve''s the bitcodes using $\bmkeps$,
after all the derivatives has been taken:
\begin{center}
\begin{tabular}{lcl}
	& & $\ldots$\\
  & & $\;\;\textit{if}\; \textit{bnullable}(a)$\\
  & & $\;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,a)\,r$\\
  & & $\ldots$
\end{tabular}
\end{center}
\noindent
$\bmkeps$ retrieves the value $v$'s
information in the format
of bitcodes, by travelling along the
path of the regular expression that corresponds to a POSIX match,
collecting all the bitcodes.
We know that this "retrieved" bitcode leads to the correct value after decoding,
which is $v_0$ in the injection-based lexing diagram.
As an observation we pointed at the beginning of this section,
the annotated regular expressions generated in successive derivative steps
in $\blexer$ after $\erase$ has the same structure 
as those appeared in $\lexer$.
We redraw the diagram below to visualise this fact.
We pretend that all the values are 
ready despite they are only calculated in $\lexer$.
In general we have $\vdash v_i:(a_i)_\downarrow$.
\vspace{20mm}
\begin{figure}[H]%\label{graph:injLexer}
\begin{center}
\begin{tikzcd}[
	every matrix/.append style = {name=p},
	remember picture, overlay,
	]
	a_0 \arrow[r, "\backslash c_0"]  \arrow[d] & a_1 \arrow[r, "\backslash c_1"] \arrow[d] & a_2 \arrow[r, dashed] \arrow[d] & a_n \arrow[d] \\
v_0           & v_1 \arrow[l,"inj_{r_0} c_0"]                & v_2 \arrow[l, "inj_{r_1} c_1"]              & v_n \arrow[l, dashed]         
\end{tikzcd}
\end{center}
\begin{tikzpicture}[
	remember picture, overlay,
E/.style = {ellipse, draw=blue, dashed,
            inner xsep=4mm,inner ysep=-4mm, rotate=90, fit=#1}
                        ]
\node[E = (p-1-1) (p-2-1)] {};
\node[E = (p-1-4) (p-2-4)] {};
\node[E = (p-1-2) (p-2-2)] {};
\node[E = (p-1-3) (p-2-3)] {};
\end{tikzpicture}

\end{figure}
\vspace{20mm}
\noindent
We encircle in the diagram  all the pairs $v_i, a_i$ to show that these values
and regular expressions share the same structure.
These pairs all contain adequate information to 
obtain the final lexing result.
For example, in the leftmost pair the 
lexical information is condensed in 
$v_0$, whereas for the rightmost pair,
the lexing result hides is in the bitcodes of $a_n$.\\
$\bmkeps$ is able to extract from $a_n$ the result
by looking for nullable parts of the regular expression.
However for regular expressions $a_i$ in general,
they might not necessarily be nullable.
For those regular expressions that are not nullable,
$\bmkeps$ will not work. 
Therefore they need some additional ``help'' 
finding the POSIX bit-encoding.
The most straightforward ``help'' comes from $a_i$'s corresponding
value $v_i$, and this suggests a function $f$ satisfying the
following properties:
\begin{itemize}
	\item
		$f \; a_i\;v_i = f \; a_n \; v_n = \bmkeps \; a_n$%\decode \; (\bmkeps \; a_n) \; (\erase \; a_0)$
	\item
		$f \; a_i\;v_i = f \; a_0 \; v_0 = \code \; v_0$ %\decode \;(\code \; v_0) \; (\erase \; a_0)$
\end{itemize}
\noindent
Sulzmann and Lu came up with a function satisfying 
the above
requirements, named \emph{retrieve}:
\begin{center}
\begin{tabular}{llcl}
	$\retrieve \; \, _{bs} \ONE$ & $   \Empty$ & $\dn$ & $\textit{bs}$\\

	$\retrieve \; \, _{bs} \mathbf{c}$ & $ (\Char \; c) $ & $\dn$ & $ \textit{bs}$\\

	$\retrieve \; \, _{bs} a_1 \cdot a_2$ & $   (\Seq \; v_1 \; v_2) $ & 
	$\dn$ &  $\textit{bs} \; @\;  (\retrieve \; a_1\; v_1)\; @ \;(\retrieve \; a_2 \; v_2)$\\

	$\retrieve \; \, _{bs} \Sigma (a :: \textit{as})$ & $   (\Left \; v) $ & 
	$\dn$ & $\textit{bs}\; @\; (\retrieve \; a \; v)$\\

	$\retrieve \; \, _{bs} \Sigma (a :: \textit{as})$ & $   (\Right \; v) $ & 
	$\dn$ & $\textit{bs}\; @\; (\retrieve \; (_{[]}\Sigma \textit{as}) \; v)$\\

	$\retrieve \; \, _{bs} a^* $ & $   (\Stars \; (v :: vs)) $ & $\dn$ & 
	$\textit{bs}\; @\; (\retrieve \; a \; v)\; @ \; 
	(\retrieve \; (_{[]} a^*) \; (\Stars \; vs) )$\\

	$\retrieve \; \, _{bs} a^* $ & $  (\Stars \; []) $ & $\dn$ & $\textit{bs} \; @ \; [Z]$
\end{tabular}
\end{center}
\noindent
As promised, $\retrieve$ collects the right bit-codes from the 
final derivative $a_n$, guided by $v_n$:
\begin{lemma}\label{bmkepsRetrieve}
	$\bnullable \; a \implies \bmkeps \; a = \retrieve \; a \; (\mkeps \; (\erase \; a))$
\end{lemma}
\begin{proof}
	By a routine induction on $a$.
\end{proof}
\noindent
The design of $\retrieve$ enables us to extract bitcodes
from both annotated regular expressions and values.
$\retrieve$ always ``sucks up'' all the information.
When more information is stored in the value, we would be able to
extract more from the value, and vice versa.
For example in star iterations, $\retrieve$ will be able to extract from $\Stars \; vs$
exactly the same bitcodes as $\code \; (\Stars \; vs)$:
\begin{lemma}
  If $\forall v \in vs. \vdash v : r$, and  $\code \; v = \retrieve \; (\internalise\; r) \; v$\\
  then $\code \; (\Stars \; vs) = \retrieve \; _{[]}((\internalise \;  r)^*) \; (\Stars \; vs)$
\end{lemma}\label{retrieveEncodeSTARS}
\begin{proof}
	By induction on the value list $vs$.
\end{proof}
\noindent
Similarly, when the value list does not hold information,
only the bitcodes plus some delimiter will be recorded:
\begin{center}
  $\retrieve \; _{bs}((\internalise \;  r)^*) \; (\Stars \; [] )  = bs @ [S]$.
\end{center}
In general, if we have a regular expression just internalised
and the lexing result value, we could $\retrieve$ the bitcdoes
that look as if we have en$\code$-ed the value as bitcodes:
\begin{lemma}
	$\vdash v : r \implies \retrieve \; (r)^\uparrow \; v = \code \; v$
\end{lemma}
\begin{proof}
	By induction on $r$.
	The star case uses lemma \ref{retrieveEncodeSTARS}.
\end{proof}
\noindent
The following property is more interesting, as
it provides a "bridge" between $a_0$ and $a_n $ in the
lexing diagram by linking adjacent regular expressions $a_i$ and
$a_{i+1}$.
The property says that one 
can retrieve the same bits from a derivative 
regular expression as those from 
before the derivative took place,
provided that the right values are used respectively:
\begin{lemma}\label{retrieveStepwise}
	$\vdash v : (r\backslash c) \implies \retrieve \; (r \backslash c)  \;  v= \retrieve \; r \; (\inj \; r\; c\; v)$
\end{lemma}
\begin{proof}
	By induction on $r$, where $v$ is allowed to be arbitrary.
	The induction principle is function $\erase$'s cases.
\end{proof}
\noindent
Before we move on to the next
helper function, we rewrite $\blexer$ in
the following way which makes later proofs more convenient:
\begin{lemma}\label{blexer_retrieve}
$\blexer \; r \; s = \decode  \; (\retrieve \; (\internalise \; r) \; (\mkeps \; (r \backslash s) )) \; r$
\end{lemma}
\begin{proof}
	Using lemma \ref{bmkepsRetrieve}.
\end{proof}
\noindent
$\retrieve$ allows connecting
between the intermediate 
results $a_i$ and $a_{i+1}$ in $\blexer$.
For plain regular expressions something similar is required.

\subsection{$\flex$}
Ausaf and Urban cleverly defined an auxiliary function called $\flex$ for $\lexer$,
defined as
\begin{center}
\begin{tabular}{lcl}
$\flex \; r \; f \; [] \; v$       &  $=$ &   $f\; v$\\
$\flex \; r \; f \; (c :: s) \; v$ &  $=$ &   $\flex \; r \; (\lambda v. \, f (\inj \; r\; c\; v)) \;\; s \; v$
\end{tabular}
\end{center}
which accumulates the characters that need to be injected back, 
and does the injection in a stack-like manner (last taken derivative first injected).
$\flex$ can calculate what $\lexer$ calculates, given the input regular expression
$r$, the identity function $id$, the input string $s$ and the value
$v_n= \mkeps \; (r\backslash s)$:
\begin{lemma}
	$\flex \;\; r \;\; \textit{id}\;\; s \;\; (\mkeps \;(r\backslash s)) = \lexer \; r \; s$
\end{lemma}
\begin{proof}
	By reverse induction on $s$.
\end{proof}
\noindent
The main advantage of using $\flex$
in addition to $\lexer$ is that
$\flex$ makes the value $v$ and function $f$
that manipulates the value  explicit parameters,
which ``exposes'' $\lexer$'s recursive call
arguments and allows us to ``set breakpoints'' and ``resume''
at any point during $\lexer$'s recursive calls.\\
The following stepwise property holds. 
\begin{lemma}\label{flexStepwise}
	$\textit{flex} \;\; r \;\; f \;\; (s@[c]) \;\; v = \flex \;\; r \;\; f  \;\; s \;\; (\inj \; (r\backslash s) \; c \; v) $
\end{lemma}
\begin{proof}
	By induction on the shape of $r\backslash s$.
\end{proof}
\noindent
With $\flex$ and $\retrieve$, 
we are ready to connect $\lexer$ and $\blexer$,
giving the correctness proof.

%----------------------------------------------------------------------------------------
%	SECTION  correctness proof
%----------------------------------------------------------------------------------------
\section{Correctness of Bit-coded Algorithm (Without Simplification)}
$\flex$ and $\blexer$ essentially calculates the same thing.
\begin{lemma}\label{flex_retrieve}
	If $\vdash v: (r\backslash s)$, then\\
$\flex \; r \; \textit{id}\; s\; v = \decode \; (\retrieve \; (r\backslash s )\; v) \; r$
\end{lemma}
\begin{proof}
By induction on $s$. 
We prove the interesting case where
both $\flex$ and $\decode$ successfully terminates
with some value.
We take advantage of the stepwise properties  
both sides.
The induction tactic is reverse induction on string $s$.
The inductive hypothesis says that $\flex \; r \; id \;s \; v =
\decode \; (\retrieve \; (r\backslash s) \; v) \; r$ holds,
where $v$ can be any value satisfying 
the assumption $\vdash v: (r\backslash s)$.
The crucial point is to rewrite 
\[
	\retrieve \;\; (r \backslash (s@[c])) \;\; (\mkeps\; (r \backslash (s@[c]) ))
\]
as
\[
	\retrieve \;\; (r \backslash s) 
	\;\; (\inj \; (r \backslash s) \; c\;  \mkeps (r \backslash (s@[c])))
\]
using lemma \ref{retrieveStepwise}.
This enables us to equate 
\[
	\retrieve \; (r \backslash (s@[c])) \; (\mkeps \; (r \backslash (s@[c]) ))
\] 
with 
\[
\flex \; r \; \textit{id} \; s \; (\inj \; (r\backslash s) \; c\; (\mkeps (r\backslash s@[c])))
\]
using IH,
which in turn can be rewritten as
\[
	\flex \; r \; \textit{id} \; (s@[c]) \;  (\mkeps \; (r \backslash (s@[c])))
\].
\end{proof}
\noindent
With this pivotal lemma we can now link $\flex$ and $\blexer$
and finally give the correctness of $\blexer$--it outputs the same result as $\lexer$:
\begin{theorem}
	$\blexer\; r \; s = \lexer \; r \; s$
\end{theorem}
\begin{proof}
	From \ref{flex_retrieve} we have that 
	$\textit{flex} \; r \; \textit{id} \; s \; \mkeps(r \backslash s)  = \blexer \; r \; s$,
	which immediately implies this theorem.
\end{proof}
\noindent
To piece things together and spell out the correctness
result of the bitcoded lexer more explicitly,
we use the fact from the previous chapter that
\[
	 (r, s) \rightarrow v \;\; \textit{iff} \;\; \lexer \; r \; s = v
\]
to obtain this corollary:
\begin{corollary}\label{blexerPOSIX}
	$(r, s) \rightarrow v \;\; \textit{iff} \;\; \blexer \; r \; s = v$
\end{corollary}
Our main reason for wanting a bit-coded algorithm over 
the injection-based one is for its capabilities of allowing
more aggressive simplifications.
We will elaborate on this in the next chapter.
author	Chengsong
	Mon, 22 Aug 2022 17:58:29 +0100
changeset 582	3e19073e91f4
parent 581	9db2500629be
child 585	4969ef817d92
permissions	-rwxr-xr-x