% Chapter Template
% Main chapter title
\chapter{Bit-coded Algorithm of Sulzmann and Lu}
\label{Bitcoded1} % Change X to a consecutive number; for referencing this chapter elsewhere, use \ref{ChapterX}
%Then we illustrate how the algorithm without bitcodes falls short for such aggressive
%simplifications and therefore introduce our version of the bitcoded algorithm and
%its correctness proof in
%Chapter 3\ref{Chapter3}.
In this chapter, we are going to describe the bit-coded algorithm
introduced by Sulzmann and Lu \parencite{Sulzmann2014} to address the growth problem of
derivatives of
regular expressions.
We have implemented their algorithm in Scala and Isabelle,
and found problems
in their algorithm such as de-duplication not working properly and redundant
fixpoint construction.
\section{The Motivation Behind Using Bitcodes}
Let us give again the definition of $\lexer$ from Chapter \ref{Inj}:
\begin{center}
\begin{tabular}{lcl}
$\lexer \; r \; [] $ & $=$ & $\textit{if} \; (\nullable \; r)\; \textit{then}\; \Some(\mkeps \; r) \; \textit{else} \; \None$\\
$\lexer \; r \;c::s$ & $=$ & $\textit{case}\; (\lexer \; (r\backslash c) \; s) \;\textit{of}\; $\\
& & $\quad \phantom{\mid}\; \None \implies \None$\\
& & $\quad \mid \Some(v) \implies \Some(\inj \; r\; c\; v)$
\end{tabular}
\end{center}
\noindent
The first problem with this algorithm is that
for the function $\inj$ to work properly
we cannot destroy the structure of a regular expression,
and therefore cannot simplify too aggressively.
For example,
\[
r + (r + a) \rightarrow r + a
\]
cannot be applied because that would require
breaking up the inner alternative.
The $\lexer$ plus $\textit{simp}$ therefore only enables
same-level de-duplications like
\[
r + r \rightarrow r.
\]
Secondly, the algorithm recursively calls $\lexer$ on
each new character input,
and before starting a child call
it stores information of previous lexing steps
on a stack, in the form of regular expressions
and characters: $r_0$, $c_0$, $r_1$, $c_1$, etc.
Each descent into deeper recursive calls in $\lexer$
causes a new pair of $r_i, c_i$ to be pushed to the call stack.
\begin{figure}[H]
\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick]
%\draw (-6,-6) grid (6,6);
\node [ circle ] (r) at (-6, 5) {$r$};
%\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$};
\node [circle, minimum size = 0.1, draw] (c1) at (-4, 5.4) {$c_1$};
%
%\node (-2, 5) (r1) circle [radius = 0.5] {$r_1$};
\node [minimum size = 0.5, circle, draw] (r1) at (-2, 5) {$r_1$};
\node [minimum width = 2cm, rectangle, draw] (stack) at (0, 3) {Stack};
\path
(r)
edge [->, >=stealth',shorten >=1pt] node[left] {} (r1);
\path (r1)
edge [bend right, dashed] node {saved} (stack);
\path (c1)
edge [bend right, dashed] node {} (stack);
\end{tikzpicture}
\caption{First derivative taken}
\end{figure}
\begin{figure}[H]
\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick]
%\draw (-6,-6) grid (6,6);
\node [ circle ] (r) at (-6, 5) {$r$};
%\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$};
\node [circle, minimum size = 0.1, ] (c1) at (-4, 5.4) {$c_1$};
%
%\node (-2, 5) (r1) circle [radius = 0.5] {$r_1$};
\node [minimum size = 0.5, circle, ] (r1) at (-2, 5) {$r_1$};
\node [circle, minimum size = 0.1, draw] (c2) at (0, 5.4) {$c_2$};
%
%\node (2, 5) (r2) circle [radius = 0.5] {$r_2$};
\node [circle, draw] (r2) at (2, 5) {$r_2$};
\node [minimum width = 3cm, minimum height = 1cm, rectangle, draw] (stack) at (0, 2) {\large Stack};
\path
(r)
edge [->, >=stealth',shorten >=1pt] node[left] {} (r1);
\path (r2)
edge [bend right, dashed] node {} (stack);
\path (c2)
edge [bend right, dashed] node {} (stack);
\path (r1)
edge [] node {} (r2);
\end{tikzpicture}
\caption{Second derivative taken}
\end{figure}
\noindent
As the number of derivative steps increase,
the stack would increase:
\begin{figure}[H]
\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick]
%\draw (-6,-6) grid (6,6);
\node [ circle ] (r) at (-6, 5) {$r$};
%\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$};
\node [circle, minimum size = 0.1, ] (c1) at (-4, 5.4) {$c_1$};
%
%\node (-2, 5) (r1) circle [radius = 0.5] {$r_1$};
\node [minimum size = 0.5, circle, ] (r1) at (-2, 5) {$r_1$};
\node [circle, minimum size = 0.1, ] (c2) at (0, 5.4) {$c_2$};
%
%\node (2, 5) (r2) circle [radius = 0.5] {$r_2$};
\node [circle, ] (r2) at (2, 5) {$r_2$};
\node [minimum width = 4cm, minimum height = 2.5cm, rectangle, draw] (stack) at (0, 1) { \large Stack};
\node [] (ldots) at (3.5, 5) {};
%\node (6, 5) (rn) circle [radius = 0.5] {$r_n$};
\node [minimum size = 0.5, circle, ] (rn) at (6, 5) {};
\node (rldots) at ($(ldots)!.4!(rn)$) {\ldots};
\path
(r)
edge [->, >=stealth',shorten >=1pt] node[left] {} (r1);
\path (rldots)
edge [bend left, dashed] node {} (stack);
\path (r1)
edge [] node {} (r2);
\path (r2)
edge [] node {} (ldots);
\path (ldots)
edge [bend left, dashed] node {} (stack);
\path (5.03, 4.9)
edge [bend left, dashed] node {} (stack);
\end{tikzpicture}
\caption{More derivatives taken}
\end{figure}
\begin{figure}[H]
\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick]
%\draw (-6,-6) grid (6,6);
\node [radius = 0.5, circle, draw] (r) at (-6, 5) {$r$};
%\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$};
\node [circle, minimum size = 0.1, draw] (c1) at (-4, 5.4) {$c_1$};
%
%\node (-2, 5) (r1) circle [radius = 0.5] {$r_1$};
\node [minimum size = 0.5, circle, draw] (r1) at (-2, 5) {$r_1$};
%
%\node (0, 6) (c2) circle [radius = 0.3] {$c_2$};
\node [circle, minimum size = 0.1, draw] (c2) at (0, 5.4) {$c_2$};
%
%\node (2, 5) (r2) circle [radius = 0.5] {$r_2$};
\node [circle, draw] (r2) at (2, 5) {$r_2$};
%
%
\node [] (ldots) at (4.5, 5) {};
%\node (6, 5) (rn) circle [radius = 0.5] {$r_n$};
\node [minimum size = 0.5, circle, draw] (rn) at (6, 5) {$r_n$};
\node at ($(ldots)!.4!(rn)$) {\ldots};
\node [minimum size = 6cm, rectangle, draw] (stack) at (0, 0) {\Huge Stack};
\path
(r)
edge [->, >=stealth',shorten >=1pt] node[left] {} (r1);
\path
(r1)
edge [] node {} (r2);
\path (r2)
edge [] node {} (ldots);
\path (r)
edge [dashed, bend right] node {} (stack);
\path (r1)
edge [dashed, ] node {} (stack);
\path (c1)
edge [dashed, bend right] node {} (stack);
\path (c2)
edge [dashed] node {} (stack);
\path (4.5, 5)
edge [dashed, bend left] node {} (stack);
\path (4.9, 5)
edge [dashed, bend left] node {} (stack);
\path (5.3, 5)
edge [dashed, bend left] node {} (stack);
\path (r2)
edge [dashed, ] node {} (stack);
\path (rn)
edge [dashed, bend left] node {} (stack);
\end{tikzpicture}
\caption{Before injection phase starts}
\end{figure}
\noindent
After all derivatives have been taken, the stack grows to a maximum size
and the pair of regular expressions and characters $r_i, c_{i+1}$
are then popped out and used in the injection phase.
\begin{figure}[H]
\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick]
%\draw (-6,-6) grid (6,6);
\node [radius = 0.5, circle, ] (r) at (-6, 5) {$r$};
%\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$};
\node [circle, minimum size = 0.1, ] (c1) at (-4, 5.4) {$c_1$};
%
%\node (-2, 5) (r1) circle [radius = 0.5] {$r_1$};
\node [minimum size = 0.5, circle, ] (r1) at (-2, 5) {$r_1$};
%
%\node (0, 6) (c2) circle [radius = 0.3] {$c_2$};
\node [circle, minimum size = 0.1, ] (c2) at (0, 5.4) {$c_2$};
%
%\node (2, 5) (r2) circle [radius = 0.5] {$r_2$};
\node [circle, ] (r2) at (2, 5) {$r_2$};
%
%
\node [] (ldots) at (4.5, 5) {};
%\node (6, 5) (rn) circle [radius = 0.5] {$r_n$};
\node [minimum size = 0.5, circle, ] (rn) at (6, 5) {$r_n$};
\node at ($(ldots)!.4!(rn)$) {\ldots};
\node [minimum size = 0.5, circle, ] (vn) at (6, -5) {$v_n$};
\node [] (ldots2) at (3.5, -5) {};
\node [minimum size = 0.5, circle, ] (v2) at (2, -5) {$v_2$};
\node at ($(ldots2)!.4!(v2)$) {\ldots};
\node [circle, ] (v1) at (-2, -5) {$v_1$};
\node [radius = 0.5, circle, ] (v) at (-6, -5) {$v$};
\node [minimum size = 6cm, rectangle, draw] (stack) at (0, 0) {\Huge Stack};
\path
(r)
edge [->, >=stealth',shorten >=1pt] node[left] {} (r1);
\path
(r1)
edge [] node {} (r2);
\path (r2)
edge [] node {} (ldots);
\path (rn)
edge [] node {$\mkeps$} (vn);
\path (vn)
edge [] node {} (ldots2);
\path (v2)
edge [] node {$\inj \; r_1 \; c_2\;v_2$} (v1);
\path (v1)
edge [] node {$\inj \; r \; c_1 \; v_1$} (v);
\path (stack)
edge [dashed] node {} (-4.2, -5.2);
\path (stack)
edge [dashed] node {} (-4, -5.2);
\path (stack)
edge [dashed] node {} (-0.1, -5.2);
\path (stack)
edge [dashed] node {} (0.2, -5.26);
\path (stack)
edge [dashed] node {} (3.2, -5);
\path (stack)
edge [dashed] node {} (2.7, -5);
\path (stack)
edge [dashed] node {} (3.7, -5);
\end{tikzpicture}
\caption{Stored $r_i, c_{i+1}$ Used by $\inj$}
\end{figure}
\noindent
Storing all $r_i, c_{i+1}$ pairs recursively
allows the algorithm to work in an elegant way, at the expense of
storing quite a bit of verbose information on the stack.
The stack seems to grow at least quadratically with respect
to the input (not taking into account the size bloat of $r_i$),
which can be inefficient and prone to stack overflows.
\section{Bitcoded Algorithm}
To address this,
Sulzmann and Lu defined a new datatype
called \emph{annotated regular expression},
which condenses all the partial lexing information
(that was originally stored in $r_i, c_{i+1}$ pairs)
into bitcodes.
The bitcodes are then carried with the regular
expression, and augmented or moved around
as the lexing proceeds.
It becomes unnecessary
to remember all the
intermediate expresssions, but only the most recent one
with this bit-carrying regular expression.
Annotated regular expressions
are defined as the following
Isabelle datatype:\footnote{ We use subscript notation to indicate
that the bitcodes are auxiliary information that do not
interfere with the structure of the regular expressions }
\begin{center}
\begin{tabular}{lcl}
$\textit{a}$ & $::=$ & $\ZERO$\\
& $\mid$ & $_{bs}\ONE$\\
& $\mid$ & $_{bs}{\bf c}$\\
& $\mid$ & $_{bs}\sum\,as$\\
& $\mid$ & $_{bs}a_1\cdot a_2$\\
& $\mid$ & $_{bs}a^*$
\end{tabular}
\end{center}
\noindent
where $bs$ stands for bit-codes, $a$ for $\mathbf{a}$nnotated regular
expressions and $as$ for lists of annotated regular expressions.
The alternative constructor, written $\sum$, has been generalised to
take a list of annotated regular expressions rather than just two.
Why is it generalised? This is because when we analyse nested
alternatives, there can be more than two elements at the same level
after de-duplication, which can no longer be stored in a binary
constructor.
Bits and bitcodes (lists of bits) are defined as:
\begin{center}
$b ::= S \mid Z \qquad
bs ::= [] \mid b::bs
$
\end{center}
\noindent
We use $S$ and $Z$ rather than $1$ and $0$ is to avoid
confusion with the regular expressions $\ZERO$ and $\ONE$.
The idea is to use the bitcodes
to indicate which choice was made at a certain point
during lexing.
For example,
$(_{Z}a+_{S}b) \backslash a$ gives us
$_{Z}\ONE + \ZERO$, where the $Z$ bitcode will
later be used to decode that a left branch was
selected at the time when the part $a+b$ was analysed by
derivative.
\subsection{A Bird's Eye View of the Bit-coded Lexer}
Before we give the details of the functions and definitions
related to our
$\blexer$ (\emph{b}-itcoded lexer), we first provide a high-level
view of the algorithm.
\begin{figure}[H]
\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick]
%\draw (-6,-6) grid (6,6);
\node [circle, draw] (r0) at (-6, 2) {$r$};
\node [radius = 0.5, circle, draw] (r) at (-6, 5) {$_{bs}a$};
\path (r0)
edge [] node {$\internalise$} (r);
%\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$};
\node [circle, minimum size = 0.1, draw] (c1) at (-4, 5.4) {$c_1$};
%
%\node (-2, 5) (r1) circle [radius = 0.5] {$r_1$};
\node [minimum size = 1.0cm, circle, draw] (r1) at (-2, 5) {$_{bs_1}a_1$};
%
%\node (0, 6) (c2) circle [radius = 0.3] {$c_2$};
\node [circle, minimum size = 0.1, draw] (c2) at (0, 5.4) {$c_2$};
%
%\node (2, 5) (r2) circle [radius = 0.5] {$r_2$};
\node [circle, draw, minimum size = 1.4cm] (r2) at (2, 5) {$_{bs_2}a_2$};
%
%
\node [] (ldots) at (4.5, 5) {};
%\node (6, 5) (rn) circle [radius = 0.5] {$r_n$};
\node [minimum size = 2.2cm, circle, draw] (rn) at (6, 5) {$_{bs_n}a_n$};
\node at ($(ldots)!.1!(rn)$) {\ldots};
\node [minimum size = 0.5, circle, ] (v) at (6, 2) {$v$};
%\node [] (v2) at (4, -5) {};
%
%\node [draw, cross out] (ldots2) at (5, -5) {};
%
%\node at ($(ldots2)!.4!(v2)$) {\ldots};
\node [align = center] (decode) at (6.6, 3.2) {$\bmkeps$\\$\decode$};
\path (c1)
edge [dashed, bend left] node {} (r0);
\path (c2)
edge [dashed, bend left] node {} (r0);
\path (r1)
edge [dashed, bend right] node {} (r2);
\path
(r)
edge [dashed, bend right] node[left] {} (r1);
\path
(r)
edge [->, >=stealth',shorten >=1pt] node[left] {} (r1);
\path
(r1)
edge [] node {} (r2);
\path (r2)
edge [] node {} (ldots);
\path (rn)
edge [] node {} (v);
\path (r0)
edge [dashed, bend right] node {} (decode);
%\path (v)
%edge [] node {} (ldots2);
\end{tikzpicture}
\caption{A bird's eye view of $\blexer$. The $_{bsi}a_{i}$s stand
for the annotated regular expressions in each derivative step.
The characters used in each derivative is written as $c_i$.
The size of the derivatives typically increase during each derivative step,
therefore we draw the larger circles to indicate that.
The process starts with an internalise phase and concludes with a decoding phase.}
\end{figure}
\noindent
The plain regular expressions
are first ``lifted'' to an annotated regular expression,
with the function called \emph{internalise} ($r \rightarrow _{bs}a$).
Then the annotated regular expression $_{bs}a$ will
be transformed by successive derivatives with respect
to the input stream of characters $c_1, c_2$ etc.
Each time a derivative is taken, the bitcodes
are moved around, discarded or augmented together
with the regular expression they are attached to.
After all input has been consumed, the
bitcodes are collected by $\bmkeps$,
which traverses the nullable part of the regular expression
and collects the bitcodes along the way.
The collected bitcodes are then $\decode$d with the guidance
of the input regular expression $r$ ( $_{bs}a \rightarrow r$).
The most notable improvements of $\blexer$
over $\lexer$ are
\begin{itemize}
\item
the absence of the second injection phase.
\item
one does not need to store each pair of the
intermediate regular expressions $_{bs_i}a_i$ and $c_{i+1}$.
The previous annotated regular expression $_{bs_i}a_i$'s information is passed
on without loss to its successor $_{bs_{i+1}}a_{i+1}$,
and $c_{i+1}$'s information is stored in $L\;r$.\footnote{
which will be used during the decode phase, where we use $r$ as
a source of information.}
\item
simplification works much smoother--one can maintain correctness
while applying quite aggressive simplifications.
\end{itemize}
%In the next section we will
%give the operations needed in $\blexer$,
%with some remarks on the idea behind their definitions.
\subsection{Operations in $\textit{Blexer}$}
The first operation we define related to bit-coded regular expressions
is how we move bits to the inside of regular expressions.
This operation is called $\fuse$:
\begin{center}
\begin{tabular}{lcl}
$\textit{fuse}\;bs \; (\ZERO)$ & $\dn$ & $\ZERO$\\
$\textit{fuse}\;bs\; (_{bs'})\ONE$ & $\dn$ &
$_{bs @ bs'}\ONE$\\
$\textit{fuse}\;bs\;(_{bs'}{\bf c})$ & $\dn$ &
$_{bs@bs'}{\bf c}$\\
$\textit{fuse}\;bs\;(_{bs'}\sum\textit{as})$ & $\dn$ &
$_{bs@bs'}\sum\textit{as}$\\
$\textit{fuse}\;bs\; (_{bs'}a_1\cdot a_2)$ & $\dn$ &
$_{bs@bs'}a_1 \cdot a_2$\\
$\textit{fuse}\;bs\;(_{bs'}a^*)$ & $\dn$ &
$_{bs @ bs'}a^*$
\end{tabular}
\end{center}
\noindent
With \emph{fuse} we are able to define the $\internalise$ function,
written $(\_)^\uparrow$,
that translates a ``standard'' regular expression into an
annotated regular expression.
This function will be applied before we start
with the derivative phase of the algorithm.
\begin{center}
\begin{tabular}{lcl}
$(\ZERO)^\uparrow$ & $\dn$ & $\ZERO$\\
$(\ONE)^\uparrow$ & $\dn$ & $_{[]}\ONE$\\
$(c)^\uparrow$ & $\dn$ & $_{[]}{\bf c}$\\
$(r_1 + r_2)^\uparrow$ & $\dn$ &
$_{[]}\sum[\textit{fuse}\,[Z]\,r_1^\uparrow,\,
\textit{fuse}\,[S]\,r_2^\uparrow]$\\
$(r_1\cdot r_2)^\uparrow$ & $\dn$ &
$_{[]}r_1^\uparrow \cdot r_2^\uparrow$\\
$(r^*)^\uparrow$ & $\dn$ &
$_{[]}(r^\uparrow)^*$\\
\end{tabular}
\end{center}
\noindent
The opposite of $\textit{internalise}$ is
$\erase$, where all bit-codes are removed,
and the alternative operator $\sum$ for annotated
regular expressions is transformed to the binary version
in (plain) regular expressions. This can be defined as follows:
\begin{center}\label{eraseDef}
\begin{tabular}{lcl}
$\ZERO_\downarrow$ & $\dn$ & $\ZERO$\\
$( _{bs}\ONE )_\downarrow$ & $\dn$ & $\ONE$\\
$( _{bs}\mathbf{c} )_\downarrow$ & $\dn$ & $\mathbf{c}$\\
$( _{bs} a_1 \cdot a_2 )_\downarrow$ & $\dn$ &
$ (a_1) _\downarrow \cdot (a_2) _\downarrow$\\
$( _{bs} [])_\downarrow $ & $\dn$ & $\ZERO $\\
$( _{bs} [a] )_\downarrow$ & $\dn$ & $a_\downarrow$\\
$(_{bs} \sum [a_1, \; a_2])_\downarrow$ & $\dn$ & $ (a_1) _\downarrow + ( a_2 ) _\downarrow $\\
$(_{bs} \sum (a :: as))_\downarrow$ & $\dn$ & $ a_\downarrow + \; (_{[]} \sum as)_\downarrow$\\
$( _{bs} a^* )_\downarrow$ & $\dn$ & $(a_\downarrow)^*$
\end{tabular}
\end{center}
\noindent
Where we abbreviate the $\erase\; a$ operation
as $(a)_\downarrow$, for conciseness.
Determining whether an annotated regular expression
contains the empty string in its language requires
a dedicated function $\bnullable$.
In our formalisation this function is defined by simply calling $\erase$
before $\nullable$.
\begin{definition}
$\bnullable \; a \dn \nullable \; (a_\downarrow)$
\end{definition}
The function for collecting
bitcodes from a
(b)nullable regular expression is called $\bmkeps$.
This function is a lifted version of the $\textit{mkeps}$ function,
which follows the path $\mkeps$ takes to traverse the
$\nullable$ branches when visiting a regular expression,
but collects bitcodes instead of generating a value.
\begin{center}
\begin{tabular}{lcl}
$\textit{bmkeps}\,(_{bs}\ONE)$ & $\dn$ & $bs$\\
$\textit{bmkeps}\,(_{bs}\sum a::\textit{as})$ & $\dn$ &
$\textit{if}\;\textit{bnullable}\,a$\\
& &$\textit{then}\;bs\,@\,\textit{bmkeps}\,a$\\
& &$\textit{else}\;bs\,@\,\textit{bmkeps}\,(_{[]}\sum \textit{as})$\\
$\textit{bmkeps}\,(_{bs} a_1 \cdot a_2)$ & $\dn$ &
$bs \,@\,\textit{bmkeps}\,a_1\,@\, \textit{bmkeps}\,a_2$\\
$\textit{bmkeps}\,(_{bs}a^*)$ & $\dn$ &
$bs \,@\, [S]$
\end{tabular}
\end{center}
\noindent
This function, just like $\mkeps$,
visits a regular expression tree respecting
the POSIX rules.
It traverses each child of the sequence regular expression
from left to right and creates a bitcode by stitching
together bitcodes obtained from the children expressions.
In the case of alternative regular expressions,
it looks for the leftmost
$\nullable$ branch
to visit and ignores the other siblings.
%Whenever there is some bitcodes attached to a
%node, it returns the bitcodes concatenated with whatever
%child recursive calls return.
The only time when $\bmkeps$ creates new bitcodes
is when it completes a star's iterations by attaching a $S$ to the end of the bitcode
list it returns.
The bitcodes extracted by $\bmkeps$ need to be
$\decode$d (with the guidance of a ``plain'' regular expression):
%\begin{definition}[Bitdecoding of Values]\mbox{}
\begin{center}
\begin{tabular}{@{}l@{\hspace{1mm}}c@{\hspace{1mm}}l@{}}
$\textit{decode}'\,bs\,(\ONE)$ & $\dn$ & $(\Empty, bs)$\\
$\textit{decode}'\,bs\,(c)$ & $\dn$ & $(\Char\,c, bs)$\\
$\textit{decode}'\,(Z\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
$\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}\;
(\Left\,v, bs_1)$\\
$\textit{decode}'\,(S\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
$\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_2\;\textit{in}\;
(\Right\,v, bs_1)$\\
$\textit{decode}'\,bs\;(r_1\cdot r_2)$ & $\dn$ &
$\textit{let}\,(v_1, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}$\\
& & $\textit{let}\,(v_2, bs_2) = \textit{decode}'\,bs_1\,r_2$\\
& & \hspace{35mm}$\textit{in}\;(\Seq\,v_1\,v_2, bs_2)$\\
$\textit{decode}'\,(S\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\
$\textit{decode}'\,(Z\!::\!bs)\,(r^*)$ & $\dn$ &
$\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r\;\textit{in}$\\
& & $\textit{let}\,(\Stars\,vs, bs_2) = \textit{decode}'\,bs_1\,r^*$\\
& & \hspace{35mm}$\textit{in}\;(\Stars\,v\!::\!vs, bs_2)$\bigskip\\
$\textit{decode}\,bs\,r$ & $\dn$ &
$\textit{let}\,(v, bs') = \textit{decode}'\,bs\,r\;\textit{in}$\\
& & $\textit{if}\;bs' = []\;\textit{then}\;\textit{Some}\,v\;
\textit{else}\;\textit{None}$
\end{tabular}
\end{center}
\noindent
The function $\decode'$ returns a pair consisting of
a partially decoded value and some leftover bit-list.
The function $\decode'$ succeeds if the left-over
bit-sequence is empty.
%$\decode$ is terminating as $\decode'$ is terminating.
%$\decode'$ is terminating
%because at least one of $\decode'$'s parameters will go down in terms
%of size.\\
The reverse operation of $\decode$ is $\code$.
This function encodes a value into a bitcode by converting
$\Left$ into $Z$, $\Right$ into $S$, and marks the start of any non-empty
star iteration by $S$. The border where a star iteration
terminates is marked by $Z$.
This coding is lossy, as it throws away the information about
characters, and does not encode the ``boundary'' between two
sequence values. Moreover, with only the bitcode we cannot even tell
whether the $S$s and $Z$s are for $\Left/\Right$ or $\Stars$,
but this will not be necessary in our correctness proof.
\begin{center}
\begin{tabular}{lcl}
$\textit{code}(\Empty)$ & $\dn$ & $[]$\\
$\textit{code}(\Char\,c)$ & $\dn$ & $[]$\\
$\textit{code}(\Left\,v)$ & $\dn$ & $Z :: code(v)$\\
$\textit{code}(\Right\,v)$ & $\dn$ & $S :: code(v)$\\
$\textit{code}(\Seq\,v_1\,v_2)$ & $\dn$ & $code(v_1) \,@\, code(v_2)$\\
$\textit{code}(\Stars\,[])$ & $\dn$ & $[Z]$\\
$\textit{code}(\Stars\,(v\!::\!vs))$ & $\dn$ & $S :: code(v) \;@\;
code(\Stars\,vs)$
\end{tabular}
\end{center}
\noindent
We can prove that given a value $v$ and regular expression $r$
with $\vdash v:r$,
then we have the property that $\decode$ and $\code$ are
reverse operations of one another:
\begin{lemma}
\[If \vdash v : r \; then \;\decode \; (\code \; v) \; r = \textit{Some}(v) \]
\end{lemma}
\begin{proof}
By proving a more general version of the lemma, on $\decode'$:
\[\vdash v : r \implies \decode' \; ((\code \; v) @ ds) \; r = (v, ds) \]
Then setting $ds$ to be $[]$ and unfolding $\decode$ definition,
we obtain the property.
\end{proof}
\noindent
Now we define the central part of Sulzmann and Lu's second lexing algorithm,
the $\bder$ function (which stands for \emph{b}itcoded-derivative):
\begin{center}
\begin{tabular}{@{}lcl@{}}
$(\ZERO)\,\backslash c$ & $\dn$ & $\ZERO$\\
$(_{bs}\ONE)\,\backslash c$ & $\dn$ & $\ZERO$\\
$(_{bs}{\bf d})\,\backslash c$ & $\dn$ &
$\textit{if}\;c=d\; \;\textit{then}\;
_{bs}\ONE\;\textit{else}\;\ZERO$\\
$(_{bs}\sum \;\textit{as})\,\backslash c$ & $\dn$ &
$_{bs}\sum\;(\textit{map} \; (\_\backslash c) \; as )$\\
$(_{bs}\;a_1\cdot a_2)\,\backslash c$ & $\dn$ &
$\textit{if}\;\textit{bnullable}\,a_1$\\
& &$\textit{then}\;_{bs}\sum\,[(_{[]}\,(a_1\,\backslash c)\cdot\,a_2),$\\
& &$\phantom{\textit{then},\;_{bs}\sum\,}(\textit{fuse}\,(\textit{bmkeps}\,a_1)\,(a_2\,\backslash c))]$\\
& &$\textit{else}\;_{bs}\,(a_1\,\backslash c)\cdot a_2$\\
$(_{bs}a^*)\,\backslash c$ & $\dn$ &
$_{bs}(\textit{fuse}\, [Z] \; r\,\backslash c)\cdot
(_{[]}r^*))$
\end{tabular}
\end{center}
\noindent
For $\bder \; c\; a$, we use the infix notation $a\backslash c$ for readability.
The $\bder$ function tells us how regular expressions can be recursively traversed,
where the bitcodes are augmented and carried around
when a derivative is taken.
We give the intuition behind some of the more involved cases in
$\bder$.
For example,
in the \emph{star} case,
a derivative of $_{bs}a^*$ means
that one more star iteration needs to be taken.
We therefore need to unfold it into a sequence,
and attach an additional bit $Z$ to the front of $a \backslash c$
as a record to indicate one new star iteration is unfolded.
\noindent
\begin{center}
\begin{tabular}{@{}lcl@{}}
$(_{bs}a^*)\,\backslash c$ & $\dn$ &
$_{bs}(\underbrace{\textit{fuse}\, [Z] \; a\,\backslash c}_{\text{One more iteration}})\cdot
(_{[]}a^*))$
\end{tabular}
\end{center}
\noindent
This information will be recovered later by the $\decode$ function.
%The intuition is that the bit $Z$ will be decoded at the right location,
%because we accumulate bits from left to right (a rigorous proof will be given
%later).
%\begin{tikzpicture}[ > = stealth, % arrow head style
% shorten > = 1pt, % don't touch arrow head to node
% semithick % line style
% ]
%
% \tikzstyle{every state}=[
% draw = black,
% thin,
% fill = cyan!29,
% minimum size = 7mm
% ]
% \begin{scope}[node distance=1cm and 0cm, every node/.style=state]
% \node (k) [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
% {$bs$
% \nodepart{two} $a^*$ };
% \node (l) [below =of k, rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
% { $bs$ + [Z]
% \nodepart{two} $(a\backslash c )\cdot a^*$ };
% \end{scope}
% \path[->]
% (k) edge (l);
%\end{tikzpicture}
%
%Pictorially the process looks like below.
%Like before, the red region denotes
%previous lexing information (stored as bitcodes in $bs$).
%\begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}]
% \begin{scope}[node distance=1cm]
% \node (a) [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
% {$bs$
% \nodepart{two} $a^*$ };
% \node (b) [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
% { $bs$ + [Z]
% \nodepart{two} $(a\backslash c )\cdot a^*$ };
%%\caption{term 1 \ref{term:1}'s matching configuration}
% \end{scope}
%\end{tikzpicture}
\noindent
Another place the $\bder$ function differs
from derivatives on regular expressions
is the sequence case:
\begin{center}
\begin{tabular}{@{}lcl@{}}
$(_{bs}\;a_1\cdot a_2)\,\backslash c$ & $\dn$ &
$\textit{if}\;\textit{bnullable}\,a_1$\\
& &$\textit{then}\;_{bs}\sum\,[(_{[]}\,(a_1\,\backslash c)\cdot\,a_2),$\\
& &$\phantom{\textit{then},\;_{bs}\sum\,}(\textit{fuse}\,(\textit{bmkeps}\,a_1)\,(a_2\,\backslash c))]$\\
& &$\textit{else}\;_{bs}\,(a_1\,\backslash c)\cdot a_2$
\end{tabular}
\end{center}
\noindent
%We assume that $\bmkeps$
%correctly extracts the bitcode for how $a_1$
%matches the string prior to $c$ (more on this later).
%The bitsequence $\textit{bs}$ which was initially
%attached to the first element of the sequence
%$a_1 \cdot a_2$, has now been
%elevated to the top level of the $\sum$ constructor.
%This is because this piece of information will be needed
%whichever way the sequence is matched,
%regardless of whether $c$ belongs to $a_1$ or $a_2$.
The difference mainly lies in the $\textit{if}$ branch (when $a_1$ is $\bnullable$):
we use $\bmkeps$ to store the lexing information
in $a_1$ before collapsing
it (as it has been fully matched by string prior to $c$),
and attach the collected bit-codes to the front of $a_2$
before throwing away $a_1$.
In the injection-based lexer, $r_1$ is immediately thrown away in
the $\textit{if}$ branch,
the information $r_1$ stores is therefore lost:
\begin{center}
\begin{tabular}{lcl}
$(r_1 \cdot r_2)\backslash c$ & $\dn$ & $\mathit{if} \, [] \in L(r_1)$\\
& & $\mathit{then}\;(r_1\backslash c) \cdot r_2 \,+\, r_2\backslash c$\\
& & $\mathit{else}\; \ldots$\\
\end{tabular}
\end{center}
\noindent
%\noindent
%this loss of information makes it necessary
%to store on stack all the regular expressions'
%``snapshots'' before they were
%taken derivative of.
%So that the related information will be available
%once the child recursive
%call finishes.\\
The rest of the clauses of $\bder$ is rather similar to
$\der$. \\
Generalising the derivative operation with bitcodes to strings, we have
\begin{center}
\begin{tabular}{@{}lcl@{}}
$a\backslash_s [] $ & $\dn$ & $a$\\
$a\backslash (c :: s) $ & $\dn$ & $(a \backslash c) \backslash_s s$
\end{tabular}
\end{center}
\noindent
As we did earlier, we omit the $s$ subscript at $\backslash_s$ when there is no danger
of confusion.
\subsection{Putting Things Together}
Putting these operations altogether, we obtain a lexer with bit-coded regular expressions
as its internal data structures, which we call $\blexer$:
\begin{center}
\begin{tabular}{lcl}
$\textit{blexer}\;r\,s$ & $\dn$ &
$\textit{let}\;a = (r^\uparrow)\backslash s\;\textit{in}$\\
& & $\;\;\textit{if}\; \textit{bnullable}(a)$\\
& & $\;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,a)\,r$\\
& & $\;\;\textit{else}\;\textit{None}$
\end{tabular}
\end{center}
\noindent
This function first attaches bitcodes to a
plain regular expression $r$, and then builds successive derivatives
with respect to the input string $s$, and
then test whether the result is (b)nullable.
If yes, then extract the bitcodes from the nullable expression,
and decodes the bitcodes into a lexical value.
If there does not exists a match between $r$ and $s$ the lexer
outputs $\None$ indicating a mismatch.
Ausaf and Urban formally proved the correctness of the $\blexer$, namely
\begin{property}
$\blexer \;r \; s = \lexer \; r \; s$.
\end{property}
\noindent
This was claimed but not formalised in Sulzmann and Lu's work.
We introduce the proof later, after we give all
the needed auxiliary functions and definitions.
\subsection{An Example $\blexer$ Run}
Before introducing the proof we shall first walk
through a concrete example of our $\blexer$ calculating the right
lexical information through bit-coded regular expressions.
Consider the regular expression $(aa)^* \cdot (b+c)$ matching the string $aab$.
We give again the bird's eye view of this particular example
in each stage of the algorithm:
\tikzset{three sided/.style={
draw=none,
append after command={
[-,shorten <= -0.5\pgflinewidth]
([shift={(-1.5\pgflinewidth,-0.5\pgflinewidth)}]\tikzlastnode.north east)
edge([shift={( 0.5\pgflinewidth,-0.5\pgflinewidth)}]\tikzlastnode.north west)
([shift={( 0.5\pgflinewidth,-0.5\pgflinewidth)}]\tikzlastnode.north west)
edge([shift={( 0.5\pgflinewidth,+0.5\pgflinewidth)}]\tikzlastnode.south west)
([shift={( 0.5\pgflinewidth,+0.5\pgflinewidth)}]\tikzlastnode.south west)
edge([shift={(-1.0\pgflinewidth,+0.5\pgflinewidth)}]\tikzlastnode.south east)
}
}
}
\tikzset{three sided1/.style={
draw=none,
append after command={
[-,shorten <= -0.5\pgflinewidth]
([shift={(1.5\pgflinewidth,-0.5\pgflinewidth)}]\tikzlastnode.north west)
edge([shift={(-0.5\pgflinewidth,-0.5\pgflinewidth)}]\tikzlastnode.north east)
([shift={(-0.5\pgflinewidth,-0.5\pgflinewidth)}]\tikzlastnode.north east)
edge([shift={(-0.5\pgflinewidth,+0.5\pgflinewidth)}]\tikzlastnode.south east)
([shift={(-0.5\pgflinewidth,+0.5\pgflinewidth)}]\tikzlastnode.south east)
edge([shift={(1.0\pgflinewidth,+0.5\pgflinewidth)}]\tikzlastnode.south west)
}
}
}
\begin{figure}[H]
\begin{tikzpicture}[->, >=stealth', shorten >= 1pt, auto, thick]
\node [rectangle, draw] (r) at (-6, -1) {$(aa)^*(b+c)$};
\node [rectangle, draw] (a) at (-6, 4) {$(aa)^*(_{Z}b + _{S}c)$};
\path (r)
edge [] node {$\internalise$} (a);
\node [rectangle, draw] (a1) at (-3, 1) {$(_{Z}(\ONE \cdot a) \cdot (aa)^*) (_{Z}b + _Sc)$};
\path (a)
edge [] node {$\backslash a$} (a1);
\node [rectangle, draw, three sided] (a21) at (-2.5, 4) {$(_{Z}\ONE \cdot (aa)^*)$};
\node [rectangle, draw, three sided1] (a22) at (-0.8, 4) {$(_{Z}b + _{S}c)$};
\path (a1)
edge [] node {$\backslash a$} (a21);
\node [rectangle, draw] (a3) at (0.5, 2) {$_{ZS}(_{Z}\ONE + \ZERO)$};
\path (a22)
edge [] node {$\backslash b$} (a3);
\path (a21)
edge [dashed, bend right] node {} (a3);
\node [rectangle, draw] (bs) at (2, 4) {$ZSZ$};
\path (a3)
edge [below] node {$\bmkeps$} (bs);
\node [rectangle, draw] (v) at (3, 0) {$\Seq \; (\Stars\; [\Seq \; a \; a]) \; (\Left \; b)$};
\path (bs)
edge [] node {$\decode$} (v);
\end{tikzpicture}
\caption{$\blexer$ with the regular expression $(aa)^*(b+c)$ and $aab$}
\end{figure}
\noindent
The one dashed arrow indicates that $_Z(\ONE \cdot (aa)^*)$
turned into $ZS$ after derivative w.r.t $b$
is taken, which calls $\bmkeps$ on the nullable $_Z\ONE\cdot (aa)^*$
before processing $_Zb+_Sc$.
The annotated regular expressions
would look overwhelming if we explicitly indicate all the
locations where bitcodes are attached.
For example,
$(aa)^*\cdot (b+c)$ would
look like $_{[]}(_{[]}(_{[]}a \cdot _{[]}a)^*\cdot _{[]}(_{[]}b+_{[]}c))$
after
internalise.
Therefore for readability we omit bitcodes if they are empty.
This applies to all annotated
regular expressions in this thesis.
%and assume we have just read the first character $a$:
%\begin{center}
%\begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}]
% \node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
% {$(_{[Z]}(\ONE \cdot a) \cdot (aa)^* )\cdot bc$
% \nodepart{two} $[Z] \iff \Seq \; (\Stars \; [\Seq\; a \; ??, \;??]) \; ??$};
%\end{tikzpicture}
%\end{center}
%\noindent
%We use the red area for (annotated) regular expressions and the blue
%area the (partially calculated) bitcodes
%and its corresponding (partial) values.
%The first derivative
%generates a $Z$ bitcode to indicate
%a new iteration has been started.
%This bitcode is attached to the front of
%the unrolled star iteration $\ONE\cdot a$
%for later decoding.
%\begin{center}
%\begin{tikzpicture}[]
% \node [rectangle split, rectangle split horizontal,
% rectangle split parts=2, rectangle split part fill={red!30,blue!20}, draw, rounded corners, inner sep=10pt]
% (der2) at (0,0)
% {$(_{[Z]}(\ONE \cdot \ONE) \cdot (aa)^*) \cdot bc $
% \nodepart{two} $[Z] \iff \Seq \; (\Stars \; [\Seq \; a\;a, ??]) \; ??$};
%
%\node [draw=none, minimum size = 0.1, ] (r) at (-7, 0) {$a_1$};
%\path
% (r)
% edge [->, >=stealth',shorten >=1pt, above] node {$\backslash a$} (der2);
%%\caption{term 1 \ref{term:1}'s matching configuration}
%\end{tikzpicture}
%\end{center}
%\noindent
%After we take derivative with respect to
%second input character $a$, the annotated
%regular expression has the second $a$ chopped off.
%The second derivative does not involve any
%new bitcodes being generated, because
%there are no new iterations or bifurcations
%in the regular expression requiring any $S$ or $Z$ marker
%to indicate choices.
%\begin{center}
%\begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}]
% \node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
% {$(_{[Z]}(\ONE \cdot \ONE) \cdot (aa)^*) \cdot (\ONE \cdot c) $
% \nodepart{two} $[Z] \iff \Seq \; (\Stars \; [\Seq \; a\;a, ??]) \; ??$};
%%\caption{term 1 \ref{term:1}'s matching configuration}
%\end{tikzpicture}
%\end{center}
%\noindent
%
%
%\begin{center}
%\begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}]
% \node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
% {$\stackrel{Bitcoded}{\longrightarrow} \Seq(\Stars[\Char(a), \Char(a)], ???)$
% \nodepart{two} $\Seq(\ldots, \Seq(\Char(b), \Char(c)))$ $\stackrel{Inj}{\longleftarrow}$};
%%\caption{term 1 \ref{term:1}'s matching configuration}
%\end{tikzpicture}
%\end{center}
%\noindent
%If we do this kind of "attachment"
%and each time augment the attached partially
%constructed value when taking off a
%character:
%\begin{center}
%\begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}]
% \node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},] (spPoint)
% {$\Seq(\Stars[\Char(a), \Char(a)], \ldots)$
% \nodepart{two} Remaining: $b c$};
%\end{tikzpicture}\\
%$\downarrow$\\
%\begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}]
% \node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
% {$\Seq(\Stars[\Char(a), \Char(a)], \Seq(\Char(b), \ldots))$
% \nodepart{two} Remaining: $c$};
%\end{tikzpicture}\\
%$\downarrow$\\
%\begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}]
% \node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
% {$\Seq(\Stars[\Char(a), \Char(a)], \Seq(\Char(b), \Char(c)))$
% \nodepart{two} EOF};
%\end{tikzpicture}
%\end{center}
In the next section we introduce the correctness proof
found by Ausaf and Urban
of the bitcoded lexer.
%-----------------------------------
% SUBSECTION 1
%-----------------------------------
\section{Correctness Proof of $\textit{Blexer}$}
Why is $\blexer$ correct?
In other words, why is it the case that
$\blexer$ outputs the same value as $\lexer$?
Intuitively,
that is because
\begin{itemize}
\item
$\blexer$ follows an almost identical
path as that of $\lexer$,
for example $r_1, r_2, \ldots$ and $a_1, a_2, \ldots$ being produced,
which are the same up to the application of $\erase$.
\item
The bit-encodings work properly,
allowing the possibility of
pulling out the right lexical value from an
annotated regular expression at
any stage, using $\bmkeps$ or $\retrieve$ (details in
lemmas \ref{bmkepsRetrieve} and \ref{blexer_retrieve}).
\end{itemize}
We will elaborate on this, with the help of
some helper functions such as $\retrieve$ and
$\flex$.
\subsection{Specifications of Some Helper Functions}
The functions we introduce will give a more detailed view into
the lexing process, which is not be possible
using $\lexer$ or $\blexer$ alone.
\subsubsection{$\textit{Retrieve}$}
The first function we shall introduce is $\retrieve$.
Sulzmann and Lu gave its definition, and
Ausaf and Urban found it
useful in their mechanised proofs.
Our bit-coded lexer ``retrieve''s the bitcodes using $\bmkeps$,
after all the derivatives have been taken:
\begin{center}
\begin{tabular}{lcl}
& & $\ldots$\\
& & $\;\;\textit{if}\; \textit{bnullable}(a)$\\
& & $\;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,a)\,r$\\
& & $\ldots$
\end{tabular}
\end{center}
\noindent
The function $\bmkeps$ retrieves the value $v$'s
information in the format
of bitcodes, by travelling along the
path of the regular expression that corresponds to a POSIX match,
collecting all the bitcodes.
We know that this "retrieved" bitcode leads to the correct value after decoding,
which is $v_0$ in the injection-based lexing diagram:
\vspace{20mm}
\begin{figure}[H]%\label{graph:injLexer}
\begin{center}
\begin{tikzcd}[
every matrix/.append style = {name=p},
remember picture, overlay,
]
a_0 \arrow[r, "\backslash c_0"] \arrow[d] & a_1 \arrow[r, "\backslash c_1"] \arrow[d] & a_2 \arrow[r, dashed] \arrow[d] & a_n \arrow[d] \\
v_0 & v_1 \arrow[l,"inj_{r_0} c_0"] & v_2 \arrow[l, "inj_{r_1} c_1"] & v_n \arrow[l, dashed]
\end{tikzcd}
\end{center}
\begin{tikzpicture}[
remember picture, overlay,
E/.style = {ellipse, draw=blue, dashed,
inner xsep=4mm,inner ysep=-4mm, rotate=90, fit=#1}
]
\node[E = (p-1-1) (p-2-1)] {};
\node[E = (p-1-4) (p-2-4)] {};
\node[E = (p-1-2) (p-2-2)] {};
\node[E = (p-1-3) (p-2-3)] {};
\end{tikzpicture}
\caption{Injection-based lexing diagram,
with matching value and regular expression pairs
encircled}\label{Inj2}
\end{figure}
\vspace{20mm}
\noindent
We have that $\vdash v_i:(a_i)_\downarrow$,
where $v_i$ are the intermediate values generated step by step in $\lexer$.
These values are not explicitly created in $\blexer$.
We encircled in the diagram all the pairs $v_i, a_i$ to show that these values
and regular expressions share the same structure.
These pairs all contain adequate information to
obtain the final lexing result.
For example, in the leftmost pair the
lexical information is condensed in
$v_0$, whereas for the rightmost pair,
the lexing result hides is in the bitcodes of $a_n$.
The function $\bmkeps$ is able to extract from $a_n$ the result
by looking for nullable parts of the regular expression.
However for regular expressions $a_i$ in general,
they might not necessarily be nullable.
For those regular expressions that are not nullable,
$\bmkeps$ will not work.
Therefore they need some additional ``help''
finding the POSIX bit-encoding.
The most straightforward ``help'' comes from $a_i$'s corresponding
value $v_i$, and this suggests a function $f$ satisfying the
following properties:
\begin{itemize}
\item
$f \; a_i\;v_i = f \; a_n \; v_n = \bmkeps \; a_n$%\decode \; (\bmkeps \; a_n) \; (\erase \; a_0)$
\item
$f \; a_i\;v_i = f \; a_0 \; v_0 = \code \; v_0$ %\decode \;(\code \; v_0) \; (\erase \; a_0)$
\end{itemize}
\noindent
Sulzmann and Lu came up with the following definition for $f$ satisfying
the above
requirements, and named it \emph{retrieve}:
\vspace{-7mm}
\begin{center}
\begin{tabular}{llcl}
$\retrieve \; \, _{bs} \ONE$ & $ \Empty$ & $\dn$ & $\textit{bs}$\\
$\retrieve \; \, _{bs} \mathbf{c}$ & $ (\Char \; c) $ & $\dn$ & $ \textit{bs}$\\
$\retrieve \; \, _{bs} a_1 \cdot a_2$ & $ (\Seq \; v_1 \; v_2) $ &
$\dn$ & $\textit{bs} \; @\; (\retrieve \; a_1\; v_1)\; @ \;(\retrieve \; a_2 \; v_2)$\\
$\retrieve \; \, _{bs} \Sigma (a :: \textit{as})$ & $ (\Left \; v) $ &
$\dn$ & $\textit{bs}\; @\; (\retrieve \; a \; v)$\\
$\retrieve \; \, _{bs} \Sigma (a :: \textit{as})$ & $ (\Right \; v) $ &
$\dn$ & $\textit{bs}\; @\; (\retrieve \; (_{[]}\Sigma \textit{as}) \; v)$\\
$\retrieve \; \, _{bs} a^* $ & $ (\Stars \; (v :: vs)) $ & $\dn$ &
$\textit{bs}\; @\; [Z] \; @ \; (\retrieve \; a \; v)\; @ \;
(\retrieve \; (_{[]} a^*) \; (\Stars \; vs) )$\\
$\retrieve \; \, _{bs} a^* $ & $ (\Stars \; []) $ & $\dn$ & $\textit{bs} \; @ \; [S]$
\end{tabular}
\end{center}
\noindent
As stated, $\retrieve$ collects the right bit-codes from the
final derivative $a_n$, guided by $v_n$. This can be proved
as follows:
\begin{lemma}\label{bmkepsRetrieve}
$\bnullable \; a \implies \bmkeps \; a = \retrieve \; a \; (\mkeps \; (\erase \; a))$
\end{lemma}
\begin{proof}
By a routine induction on $a$.
\end{proof}
\noindent
%The design of $\retrieve$ enables us to extract bitcodes
%from both annotated regular expressions and values.
%$\retrieve$ always ``sucks up'' all the information.
When more information is stored in the value, we would be able to
extract more from the value, and vice versa.
For example in star iterations, $\retrieve$ will be able to extract from $\Stars \; vs$
exactly the same bitcodes as $\code \; (\Stars \; vs)$:
\begin{lemma}\label{retrieveEncodeSTARS}
If $\forall v \in vs. \vdash v : r$, and $\code \; v = \retrieve \; (\internalise\; r) \; v$\\
then $\code \; (\Stars \; vs) = \retrieve \; _{[]}((\internalise \; r)^*) \; (\Stars \; vs)$
\end{lemma}
\begin{proof}
By induction on the value list $vs$.
\end{proof}
\noindent
Similarly, when the value list does not hold information,
only the bitcodes plus some delimiter will be recorded:
\begin{center}
$\retrieve \; _{bs}((\internalise \; r)^*) \; (\Stars \; [] ) = bs @ [S]$.
\end{center}
In general, if we have a regular expression that is just internalised
and the final lexing result value, we can $\retrieve$ the bitcodes
that look as if we have en$\code$-ed the value as bitcodes:
\begin{lemma}
$\vdash v : r \implies \retrieve \; (r)^\uparrow \; v = \code \; v$
\end{lemma}
\begin{proof}
By induction on $r$.
The star case uses lemma \ref{retrieveEncodeSTARS}.
\end{proof}
\noindent
The following property of $\retrieve$ is crucial, as
it provides a "bridge" between $a_0$ and $a_n $ in the
lexing diagram by linking adjacent regular expressions $a_i$ and
$a_{i+1}$.
The property says that one
can retrieve the same bits from a derivative
regular expression as those from
before the derivative took place,
provided that the corresponding values are used respectively:
\begin{lemma}\label{retrieveStepwise}
$\vdash v : (r\backslash c) \implies \retrieve \; (r \backslash c) \; v= \retrieve \; r \; (\inj \; r\; c\; v)$
\end{lemma}
\begin{proof}
By induction on $r$, generalising over $v$.
The induction principle in our formalisation
is function $\erase$'s cases.
\end{proof}
\noindent
Before we move on to the next
helper function, we rewrite $\blexer$ in
the following way which makes later proofs more convenient:
\begin{lemma}\label{blexer_retrieve}
$\blexer \; r \; s = \decode \; (\retrieve \; (\internalise \; r) \; (\mkeps \; (r \backslash s) )) \; r$
\end{lemma}
\begin{proof}
Using lemma \ref{bmkepsRetrieve}.
\end{proof}
\noindent
The function $\retrieve$ allows connecting
between the intermediate
results $a_i$ and $a_{i+1}$ in $\blexer$.
For plain regular expressions something similar is required.
\subsection{The Function $\flex$}
Ausaf and Urban defined an auxiliary function called $\flex$ for $\lexer$,
defined as
\begin{center}
\begin{tabular}{lcl}
$\flex \; r \; f \; [] \; v$ & $=$ & $f\; v$\\
$\flex \; r \; f \; (c :: s) \; v$ & $=$ & $\flex \; r \; (\lambda v. \, f (\inj \; r\; c\; v)) \;\; s \; v$
\end{tabular}
\end{center}
which accumulates the characters that need to be injected back,
and does the injection in a stack-like manner (the last taken derivative is first injected).
The function
$\flex$ can calculate what $\lexer$ calculates, given the input regular expression
$r$, the identity function $id$, the input string $s$ and the value
$v_n= \mkeps \; (r\backslash s)$:
\begin{lemma}
$\flex \;\; r \;\; \textit{id}\;\; s \;\; (\mkeps \;(r\backslash s)) = \lexer \; r \; s$
\end{lemma}
\begin{proof}
By reverse induction on $s$.
\end{proof}
\noindent
The main advantage of using $\flex$
in $\lexer$ is that
$\flex$ makes the value $v$ and function $f$
that manipulates the value explicit parameters,
which ``exposes'' $\lexer$'s recursive call
arguments and allows us to ``set breakpoints'' and ``resume''
at any point during $\lexer$'s recursive calls. Therefore
the following stepwise property holds.
\begin{lemma}\label{flexStepwise}
$\textit{flex} \;\; r \;\; f \;\; (s@[c]) \;\; v = \flex \;\; r \;\; f \;\; s \;\; (\inj \; (r\backslash s) \; c \; v) $
\end{lemma}
\begin{proof}
By induction on the shape of $r\backslash s$.
\end{proof}
\noindent
With $\flex$ and $\retrieve$,
we are ready to connect $\lexer$ and $\blexer$,
giving the correctness proof for $\blexer$.
%----------------------------------------------------------------------------------------
% SECTION correctness proof
%----------------------------------------------------------------------------------------
\section{Correctness of the Bit-coded Algorithm (Without Simplification)}
We can first show that
$\flex$ and $\blexer$ calculates the same value.
\begin{lemma}\label{flex_retrieve}
If $\vdash v: (r\backslash s)$, then\\
$\flex \; r \; \textit{id}\; s\; v = \decode \; (\retrieve \; (r\backslash s )\; v) \; r$
\end{lemma}
\begin{proof}
By induction on $s$.
We prove the interesting case where
both $\flex$ and $\decode$ successfully terminates
with some value.
We take advantage of the stepwise properties
both sides.
The induction tactic is reverse induction on string $s$.
The inductive hypothesis says that $\flex \; r \; id \;s \; v =
\decode \; (\retrieve \; (r\backslash s) \; v) \; r$ holds,
where $v$ can be any value satisfying
the assumption $\vdash v: (r\backslash s)$.
The crucial point is to rewrite
\[
\retrieve \;\; (r \backslash (s@[c])) \;\; (\mkeps\; (r \backslash (s@[c]) ))
\]
as
\[
\retrieve \;\; (r \backslash s)
\;\; (\inj \; (r \backslash s) \; c\; \mkeps (r \backslash (s@[c])))
\]
using lemma \ref{retrieveStepwise}.
This enables us to equate
\[
\retrieve \; (r \backslash (s@[c])) \; (\mkeps \; (r \backslash (s@[c]) ))
\]
with
\[
\flex \; r \; \textit{id} \; s \; (\inj \; (r\backslash s) \; c\; (\mkeps (r\backslash s@[c])))
\]
using IH,
which in turn can be rewritten as
\[
\flex \; r \; \textit{id} \; (s@[c]) \; (\mkeps \; (r \backslash (s@[c])))
\].
\end{proof}
\noindent
With this pivotal lemma we can now link $\flex$ and $\blexer$
and finally give the correctness of $\blexer$--it outputs the same result as $\lexer$:
\begin{theorem}
$\blexer\; r \; s = \lexer \; r \; s$
\end{theorem}
\begin{proof}
From \ref{flex_retrieve} we have that
$\textit{flex} \; r \; \textit{id} \; s \; \mkeps(r \backslash s) = \blexer \; r \; s$,
which immediately implies this theorem.
\end{proof}
\noindent
To piece things together and spell out the correctness
result of the bitcoded lexer more explicitly,
we use the fact from the previous chapter that
\[
(r, s) \rightarrow v \;\; \textit{iff} \;\; \lexer \; r \; s =\Some \; v
\quad \quad
\nexists v. (r, s) \rightarrow v\;\; \textit{iff} \;\; \lexer \;r\;s = None
\]
to obtain this corollary:
\begin{corollary}\label{blexerPOSIX}
The $\blexer$ function correctly implements a POSIX lexer, namely\\
$(r, s) \rightarrow v \;\; \textit{iff} \;\; \blexer \; r \; s = \Some \; v$\\
$\nexists v. (r, s) \rightarrow v\;\; \textit{iff} \;\; \blexer \;r\;s = None$
\end{corollary}
Our main reason for analysing the bit-coded algorithm over
the injection-based one is that it allows us to define
more aggressive simplifications.
We will elaborate on this in the next chapter.