% Chapter Template
% Main chapter title
\chapter{Bit-coded Algorithm of Sulzmann and Lu}
\label{Bitcoded1} % Change X to a consecutive number; for referencing this chapter elsewhere, use \ref{ChapterX}
%Then we illustrate how the algorithm without bitcodes falls short for such aggressive
%simplifications and therefore introduce our version of the bitcoded algorithm and
%its correctness proof in
%Chapter 3\ref{Chapter3}.
In this chapter, we are going to describe the bit-coded algorithm
introduced by Sulzmann and Lu \parencite{Sulzmann2014} to address the growth problem of
regular expressions.
We have implemented their algorithm in Scala, and found out inefficiencies
in their algorithm such as de-duplication not working properly and redundant
fixpoint construction. Their algorithm is improved and verified with the help of
formal proofs.
\section{Bit-coded Algorithm}
We first do a recap of what was going on
in the lexer algorithm in Chapter \ref{Inj},
\begin{center}
\begin{tabular}{lcl}
$\lexer \; r \; [] $ & $=$ & $\textit{if} \; (\nullable \; r)\; \textit{then}\; \Some(\mkeps \; r) \; \textit{else} \; \None$\\
$\lexer \; r \;c::s$ & $=$ & $\textit{case}\; (\lexer \; (r\backslash c) \; s) \;\textit{of}\; $\\
& & $\quad \phantom{\mid}\; \None \implies \None$\\
& & $\quad \mid \Some(v) \implies \Some(\inj \; r\; c\; v)$
\end{tabular}
\end{center}
\noindent
The algorithm recursively calls $\lexer$ on
each new character input,
and before starting a child call
it stores information of previous lexing steps
on a stack, in the form of regular expressions
and characters: $r_0$, $c_0$, $r_1$, $c_1$, etc.
Each descent into deeper recursive calls in $\lexer$
causes a new pair of $r_i, c_i$ to be pushed to the call stack.
\begin{figure}
\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick]
%\draw (-6,-6) grid (6,6);
\node [ circle ] (r) at (-6, 5) {$r$};
%\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$};
\node [circle, minimum size = 0.1, draw] (c1) at (-4, 5.4) {$c_1$};
%
%\node (-2, 5) (r1) circle [radius = 0.5] {$r_1$};
\node [minimum size = 0.5, circle, draw] (r1) at (-2, 5) {$r_1$};
\node [minimum width = 2cm, rectangle, draw] (stack) at (0, 3) {Stack};
\path
(r)
edge [->, >=stealth',shorten >=1pt] node[left] {} (r1);
\path (r1)
edge [bend right, dashed] node {saved} (stack);
\path (c1)
edge [bend right, dashed] node {} (stack);
\end{tikzpicture}
\caption{First Derivative Taken}
\end{figure}
\begin{figure}
\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick]
%\draw (-6,-6) grid (6,6);
\node [ circle ] (r) at (-6, 5) {$r$};
%\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$};
\node [circle, minimum size = 0.1, ] (c1) at (-4, 5.4) {$c_1$};
%
%\node (-2, 5) (r1) circle [radius = 0.5] {$r_1$};
\node [minimum size = 0.5, circle, ] (r1) at (-2, 5) {$r_1$};
\node [circle, minimum size = 0.1, draw] (c2) at (0, 5.4) {$c_2$};
%
%\node (2, 5) (r2) circle [radius = 0.5] {$r_2$};
\node [circle, draw] (r2) at (2, 5) {$r_2$};
\node [minimum width = 3cm, minimum height = 1cm, rectangle, draw] (stack) at (0, 2) {\large Stack};
\path
(r)
edge [->, >=stealth',shorten >=1pt] node[left] {} (r1);
\path (r2)
edge [bend right, dashed] node {} (stack);
\path (c2)
edge [bend right, dashed] node {} (stack);
\path (r1)
edge [] node {} (r2);
\end{tikzpicture}
\caption{Second Derivative Taken}
\end{figure}
\noindent
As the number of derivative steps increase,
the stack would increase:
\begin{figure}
\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick]
%\draw (-6,-6) grid (6,6);
\node [ circle ] (r) at (-6, 5) {$r$};
%\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$};
\node [circle, minimum size = 0.1, ] (c1) at (-4, 5.4) {$c_1$};
%
%\node (-2, 5) (r1) circle [radius = 0.5] {$r_1$};
\node [minimum size = 0.5, circle, ] (r1) at (-2, 5) {$r_1$};
\node [circle, minimum size = 0.1, ] (c2) at (0, 5.4) {$c_2$};
%
%\node (2, 5) (r2) circle [radius = 0.5] {$r_2$};
\node [circle, ] (r2) at (2, 5) {$r_2$};
\node [minimum width = 4cm, minimum height = 2.5cm, rectangle, draw] (stack) at (0, 1) { \large Stack};
\node [] (ldots) at (3.5, 5) {};
%\node (6, 5) (rn) circle [radius = 0.5] {$r_n$};
\node [minimum size = 0.5, circle, ] (rn) at (6, 5) {};
\node (rldots) at ($(ldots)!.4!(rn)$) {\ldots};
\path
(r)
edge [->, >=stealth',shorten >=1pt] node[left] {} (r1);
\path (rldots)
edge [bend left, dashed] node {} (stack);
\path (r1)
edge [] node {} (r2);
\path (r2)
edge [] node {} (ldots);
\path (ldots)
edge [bend left, dashed] node {} (stack);
\path (5.03, 4.9)
edge [bend left, dashed] node {} (stack);
\end{tikzpicture}
\caption{More Derivatives Taken}
\end{figure}
\noindent
After all derivatives have been taken, the stack grows to a maximum size
and the pair of regular expressions and characters $r_i, c_{i+1}$
are then popped out and used in the injection phase:
\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick]
%\draw (-6,-6) grid (6,6);
\node [radius = 0.5, circle, draw] (r) at (-6, 5) {$r$};
%\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$};
\node [circle, minimum size = 0.1, draw] (c1) at (-4, 5.4) {$c_1$};
%
%\node (-2, 5) (r1) circle [radius = 0.5] {$r_1$};
\node [minimum size = 0.5, circle, draw] (r1) at (-2, 5) {$r_1$};
%
%\node (0, 6) (c2) circle [radius = 0.3] {$c_2$};
\node [circle, minimum size = 0.1, draw] (c2) at (0, 5.4) {$c_2$};
%
%\node (2, 5) (r2) circle [radius = 0.5] {$r_2$};
\node [circle, draw] (r2) at (2, 5) {$r_2$};
%
%
\node [] (ldots) at (4.5, 5) {};
%\node (6, 5) (rn) circle [radius = 0.5] {$r_n$};
\node [minimum size = 0.5, circle, draw] (rn) at (6, 5) {$r_n$};
\node at ($(ldots)!.4!(rn)$) {\ldots};
\node [minimum size = 0.5, circle, draw] (vn) at (6, -5) {$v_n$};
\node [] (ldots2) at (3.5, -5) {};
\node [minimum size = 0.5, circle, draw] (v2) at (2, -5) {$v_2$};
\node at ($(ldots2)!.4!(v2)$) {\ldots};
\node [circle, draw] (v1) at (-2, -5) {$v_1$};
\node [radius = 0.5, circle, draw] (v) at (-6, -5) {$v$};
\node [minimum size = 6cm, rectangle, draw] (stack) at (0, 0) {\Huge Stack};
\path
(r)
edge [->, >=stealth',shorten >=1pt] node[left] {} (r1);
\path
(r1)
edge [] node {} (r2);
\path (r2)
edge [] node {} (ldots);
\path (rn)
edge [] node {$\mkeps$} (vn);
\path (vn)
edge [] node {} (ldots2);
\path (v2)
edge [] node {$\inj \; r_1 \; c_2\;v_2$} (v1);
\path (v1)
edge [] node {$\inj \; r \; c_1 \; v_1$} (v);
\path (r)
edge [dashed, bend right] node {} (stack);
\path (r1)
edge [dashed, ] node {} (stack);
\path (c1)
edge [dashed, bend right] node {} (stack);
\path (c2)
edge [dashed] node {} (stack);
\path (4.5, 5)
edge [dashed, bend left] node {} (stack);
\path (4.9, 5)
edge [dashed, bend left] node {} (stack);
\path (5.3, 5)
edge [dashed, bend left] node {} (stack);
\path (r2)
edge [dashed, ] node {} (stack);
\path (rn)
edge [dashed, bend left] node {} (stack);
\end{tikzpicture}
%\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick]
%%\draw (-6,-6) grid (6,6);
%\node [radius = 0.5, circle, draw] (r) at (-6, 5) {$r$};
%
%%\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$};
%\node [circle, minimum size = 0.1] (c1) at (-4, 5.4) {$c_1$};
%%
%%\node (-2, 5) (r1) circle [radius = 0.5] {$r_1$};
%\node [minimum size = 0.5, circle, draw] (r1) at (-2, 5) {$r_1$};
%%
%%\node (0, 6) (c2) circle [radius = 0.3] {$c_2$};
%\node [circle, minimum size = 0.1] (c2) at (0, 5.4) {$c_2$};
%%
%%\node (2, 5) (r2) circle [radius = 0.5] {$r_2$};
%\node [circle, draw] (r2) at (2, 5) {$r_2$};
%%
%%
%\node [] (ldots) at (4.5, 5) {};
%%\node (6, 5) (rn) circle [radius = 0.5] {$r_n$};
%
%\node [minimum size = 0.5, circle, draw] (rn) at (6, 5) {$r_n$};
%
%\node at ($(ldots)!.4!(rn)$) {\ldots};
%
%\node [minimum size = 0.5, circle, draw] (vn) at (6, -5) {$v_n$};
%
%\node [] (ldots2) at (3.5, -5) {};
%
%\node [minimum size = 0.5, circle, draw] (v2) at (2, -5) {$v_2$};
%
%\node at ($(ldots2)!.4!(v2)$) {\ldots};
%
%
%\node [circle, draw] (v1) at (-2, -5) {$v_1$};
%
%\node [radius = 0.5, circle, draw] (v) at (-6, -5) {$v$};
%
%\node [minimum size = 6cm, rectangle, draw] (stack) at (0, 0) {\Huge Stack};
%
%\path
% (r)
% edge [->, >=stealth',shorten >=1pt] node[left] {} (r1);
%\path
% (r1)
% edge [] node {} (r2);
%\path (r2)
% edge [] node {} (ldots);
%\path (rn)
% edge [] node {$\mkeps$} (vn);
%\path (vn)
% edge [] node {} (ldots2);
%\path (v2)
% edge [] node {} (v1);
%
%\path (v1)
% edge [] node {} (v);
%\path (r)
% edge [] node {saved} (stack);
%
%\path (r1)
% edge [] node {saved} (stack);
%\end{tikzpicture}
\noindent
The information stored in characters and regular expressions
make the algorithm work in an elegant way, at the expense of being
storing quite a bit of verbose information.
The lexer algorithm in Chapter \ref{Inj},
stores information of previous lexing steps
on a stack, in the form of regular expressions
and characters: $r_0$, $c_0$, $r_1$, $c_1$, etc.
The red part represents what we already know during the first
derivative phase,
and the blue part represents the unknown part of input.
\begin{ceqn}
\begin{equation}%\label{graph:injLexer}
\begin{tikzcd}[ampersand replacement=\&, execute at end picture={
\begin{scope}[on background layer]
\node[rectangle, fill={red!30},
pattern=north east lines, pattern color=red,
fit={(-3,-1) (-3, 1) (1, -1)
(1, 1)}
]
{}; ,
\node[rectangle, fill={blue!20},
pattern=north east lines, pattern color=blue,
fit= {(1, -1) (1, 1) (3, -1) (3, 1)}
]
{};
\end{scope}}
]
r_0 \arrow[r, "\backslash c_0"] \arrow[d] \& r_1 \arrow[r, "\backslash c_1"] \arrow[d] \& r_2 \arrow[r, dashed] \arrow[d] \& r_n \arrow[d, "mkeps" description] \\
v_0 \& v_1 \arrow[l,"inj_{r_0} c_0"] \& v_2 \arrow[l, "inj_{r_1} c_1"] \& v_n \arrow[l, dashed] \\
\end{tikzcd}
\end{equation}
\end{ceqn}
\noindent
The red area expands as we move towards $r_n$,
indicating more is known about the lexing result.
Despite knowing this partial lexing information during
the forward derivative phase, we choose to store them
all the way until $r_n$ is reached.
Then we reconstruct the value character by character
values at a later stage, using information in a Last-In-First-Out
manner. Although the algorithm is elegant and natural,
it can be inefficient and prone to stack overflow.\\
It turns out we can store lexing
information on the fly, while still using regular expression
derivatives.
If we remove the individual
lexing steps, and use red and blue areas as before
to indicate consumed (seen) input and constructed
partial value (before recovering the rest of the stack),
one could see that the seen part's lexical information
is stored in the form of a regular expression.
Consider the regular expression $(aa)^* \cdot bc$ matching the string $aabc$
and assume we have just read the two characters $aa$:
\begin{center}
\begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}]
\node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
{Partial lexing info: $\ONE \cdot a \cdot (aa)^* \cdot bc$ etc.
\nodepart{two} $\Seq(\ldots, \Seq(\Char(b), \Char(c)))$};
\end{tikzpicture}
\end{center}
\noindent
In the injection-based lexing algorithm, we ``neglect" the red area
by putting all the characters we have consumed and
intermediate regular expressions on the stack when
we go from left to right in the derivative phase.
The red area grows till the string is exhausted.
During the injection phase, the value in the blue area
is built up incrementally, while the red area shrinks.
Before we have recovered all characters and intermediate
derivative regular expressions from the stack,
what values these characters and regular expressions correspond
to are unknown:
\begin{center}
\begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}]
\node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={white!30,blue!20},]
{$(\ONE \cdot \ONE) \cdot (aa)^* \cdot bc $ correspond to:$???$
\nodepart{two} $b c$ corresponds to $\Seq(\ldots, \Seq(\Char(b), \Char(c)))$};
%\caption{term 1 \ref{term:1}'s matching configuration}
\end{tikzpicture}
\end{center}
\noindent
However, they should be calculable,
as characters and regular expression shapes
after taking derivative w.r.t those characters
have already been known, therefore in our example,
we know that the value starts with two $a$s,
and makes up to an iteration in a Kleene star:
(We have put the injection-based lexing's partial
result in the right part of the split rectangle
to contrast it with the partial valued produced
in a forward manner)
\begin{center}
\begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}]
\node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
{$\stackrel{Bitcoded}{\longrightarrow} \Seq(\Stars[\Char(a), \Char(a)], ???)$
\nodepart{two} $\Seq(\ldots, \Seq(\Char(b), \Char(c)))$ $\stackrel{Inj}{\longleftarrow}$};
%\caption{term 1 \ref{term:1}'s matching configuration}
\end{tikzpicture}
\end{center}
\noindent
If we do this kind of "attachment"
and each time augment the attached partially
constructed value when taking off a
character:
\begin{center}
\begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}]
\node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},] (spPoint)
{$\Seq(\Stars[\Char(a), \Char(a)], \ldots)$
\nodepart{two} Remaining: $b c$};
\end{tikzpicture}\\
$\downarrow$\\
\begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}]
\node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
{$\Seq(\Stars[\Char(a), \Char(a)], \Seq(\Char(b), \ldots))$
\nodepart{two} Remaining: $c$};
\end{tikzpicture}\\
$\downarrow$\\
\begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}]
\node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
{$\Seq(\Stars[\Char(a), \Char(a)], \Seq(\Char(b), \Char(c)))$
\nodepart{two} EOF};
\end{tikzpicture}
\end{center}
\noindent
In the end we could recover the value without a backward phase.
But (partial) values are a bit clumsy to stick together with a regular expression, so
we instead use bit-codes to encode them.
Bits and bitcodes (lists of bits) are defined as:
\begin{center}
$b ::= S \mid Z \qquad
bs ::= [] \mid b::bs
$
\end{center}
\noindent
Using $S$ and $Z$ rather than $1$ and $0$ is to avoid
confusion with the regular expressions $\ZERO$ and $\ONE$.
Bitcodes (or
bit-lists) can be used to encode values (or potentially incomplete values) in a
compact form. This can be straightforwardly seen in the following
coding function from values to bitcodes:
\begin{center}
\begin{tabular}{lcl}
$\textit{code}(\Empty)$ & $\dn$ & $[]$\\
$\textit{code}(\Char\,c)$ & $\dn$ & $[]$\\
$\textit{code}(\Left\,v)$ & $\dn$ & $Z :: code(v)$\\
$\textit{code}(\Right\,v)$ & $\dn$ & $S :: code(v)$\\
$\textit{code}(\Seq\,v_1\,v_2)$ & $\dn$ & $code(v_1) \,@\, code(v_2)$\\
$\textit{code}(\Stars\,[])$ & $\dn$ & $[Z]$\\
$\textit{code}(\Stars\,(v\!::\!vs))$ & $\dn$ & $S :: code(v) \;@\;
code(\Stars\,vs)$
\end{tabular}
\end{center}
\noindent
Here $\textit{code}$ encodes a value into a bit-code by converting
$\Left$ into $Z$, $\Right$ into $S$, and marks the start of any non-empty
star iteration by $S$. The border where a local star terminates
is marked by $Z$.
This coding is lossy, as it throws away the information about
characters, and also does not encode the ``boundary'' between two
sequence values. Moreover, with only the bitcode we cannot even tell
whether the $S$s and $Z$s are for $\Left/\Right$ or $\Stars$. The
reason for choosing this compact way of storing information is that the
relatively small size of bits can be easily manipulated and ``moved
around" in a regular expression.
Because of the lossiness, the process of decoding a bitlist requires additionally
a regular expression. The function $\decode$ is defined as:
%\begin{definition}[Bitdecoding of Values]\mbox{}
\begin{center}
\begin{tabular}{@{}l@{\hspace{1mm}}c@{\hspace{1mm}}l@{}}
$\textit{decode}'\,bs\,(\ONE)$ & $\dn$ & $(\Empty, bs)$\\
$\textit{decode}'\,bs\,(c)$ & $\dn$ & $(\Char\,c, bs)$\\
$\textit{decode}'\,(Z\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
$\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}\;
(\Left\,v, bs_1)$\\
$\textit{decode}'\,(S\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
$\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_2\;\textit{in}\;
(\Right\,v, bs_1)$\\
$\textit{decode}'\,bs\;(r_1\cdot r_2)$ & $\dn$ &
$\textit{let}\,(v_1, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}$\\
& & $\textit{let}\,(v_2, bs_2) = \textit{decode}'\,bs_1\,r_2$\\
& & \hspace{35mm}$\textit{in}\;(\Seq\,v_1\,v_2, bs_2)$\\
$\textit{decode}'\,(Z\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\
$\textit{decode}'\,(S\!::\!bs)\,(r^*)$ & $\dn$ &
$\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r\;\textit{in}$\\
& & $\textit{let}\,(\Stars\,vs, bs_2) = \textit{decode}'\,bs_1\,r^*$\\
& & \hspace{35mm}$\textit{in}\;(\Stars\,v\!::\!vs, bs_2)$\bigskip\\
$\textit{decode}\,bs\,r$ & $\dn$ &
$\textit{let}\,(v, bs') = \textit{decode}'\,bs\,r\;\textit{in}$\\
& & $\textit{if}\;bs' = []\;\textit{then}\;\textit{Some}\,v\;
\textit{else}\;\textit{None}$
\end{tabular}
\end{center}
%\end{definition}
\noindent
The function $\decode'$ returns a pair consisting of
a partially decoded value and some leftover bit list that cannot
be decide yet.
The function $\decode'$ succeeds if the left-over
bit-sequence is empty.
$\decode$ is terminating as $\decode'$ is terminating.
$\decode'$ is terminating
because at least one of $\decode'$'s parameters will go down in terms
of size.
Assuming we have a value $v$ and regular expression $r$
with $\vdash v:r$,
then we have the property that $\decode$ and $\code$ are
reverse operations of one another:
\begin{lemma}
\[If \vdash v : r \; then \;\decode \; (\code \; v) \; r = \textit{Some}(v) \]
\end{lemma}
\begin{proof}
By proving a more general version of the lemma, on $\decode'$:
\[\vdash v : r \implies \decode' \; ((\code \; v) @ ds) \; r = (v, ds) \]
Then setting $ds$ to be $[]$ and unfolding $\decode$ definition,
we obtain the property.
\end{proof}
With the $\code$ and $\decode$ functions in hand, we know how to
switch between bit-codes and values.
The next step is to integrate this information into regular expression.
Attaching bits to the front of regular expressions is the solution Sulzamann and Lu
gave for storing partial values in regular expressions.
Annotated regular expressions are therefore defined as the Isabelle
datatype:
\begin{center}
\begin{tabular}{lcl}
$\textit{a}$ & $::=$ & $\ZERO$\\
& $\mid$ & $_{bs}\ONE$\\
& $\mid$ & $_{bs}{\bf c}$\\
& $\mid$ & $_{bs}\sum\,as$\\
& $\mid$ & $_{bs}a_1\cdot a_2$\\
& $\mid$ & $_{bs}a^*$
\end{tabular}
\end{center}
%(in \textit{ALTS})
\noindent
where $bs$ stands for bit-codes, $a$ for $\mathbf{a}$nnotated regular
expressions and $as$ for lists of annotated regular expressions.
The alternative constructor, written, $\sum$, has been generalised to
accept a list of annotated regular expressions rather than just two.
Why is it generalised? This is because when we open up nested
alternatives, there could be more than two elements at the same level
after de-duplication, which can no longer be stored in a binary
constructor.
The first operation we define related to bit-coded regular expressions
is how we move bits to the inside of regular expressions.
Called $\fuse$, this operation is attaches bit-codes
to the front of an annotated regular expression:
\begin{center}
\begin{tabular}{lcl}
$\textit{fuse}\;bs \; \ZERO$ & $\dn$ & $\ZERO$\\
$\textit{fuse}\;bs\; _{bs'}\ONE$ & $\dn$ &
$_{bs @ bs'}\ONE$\\
$\textit{fuse}\;bs\;_{bs'}{\bf c}$ & $\dn$ &
$_{bs@bs'}{\bf c}$\\
$\textit{fuse}\;bs\,_{bs'}\sum\textit{as}$ & $\dn$ &
$_{bs@bs'}\sum\textit{as}$\\
$\textit{fuse}\;bs\; _{bs'}a_1\cdot a_2$ & $\dn$ &
$_{bs@bs'}a_1 \cdot a_2$\\
$\textit{fuse}\;bs\,_{bs'}a^*$ & $\dn$ &
$_{bs @ bs'}a^*$
\end{tabular}
\end{center}
\noindent
With \emph{fuse} we are able to define the $\internalise$ function
that translates a ``standard'' regular expression into an
annotated regular expression.
This function will be applied before we start
with the derivative phase of the algorithm.
\begin{center}
\begin{tabular}{lcl}
$(\ZERO)^\uparrow$ & $\dn$ & $\ZERO$\\
$(\ONE)^\uparrow$ & $\dn$ & $_{[]}\ONE$\\
$(c)^\uparrow$ & $\dn$ & $_{[]}{\bf c}$\\
$(r_1 + r_2)^\uparrow$ & $\dn$ &
$_{[]}\sum[\textit{fuse}\,[Z]\,r_1^\uparrow,\,
\textit{fuse}\,[S]\,r_2^\uparrow]$\\
$(r_1\cdot r_2)^\uparrow$ & $\dn$ &
$_{[]}r_1^\uparrow \cdot r_2^\uparrow$\\
$(r^*)^\uparrow$ & $\dn$ &
$_{[]}(r^\uparrow)^*$\\
\end{tabular}
\end{center}
%\end{definition}
\noindent
We use an up arrow with postfix notation
to denote this operation.
for convenience. The $\textit{internalise} \; r$
notation is more cumbersome.
The opposite of $\textit{internalise}$ is
$\erase$, where all the bit-codes are removed,
and the alternative operator $\sum$ for annotated
regular expressions is transformed to the binary alternatives
for plain regular expressions.
\begin{center}
\begin{tabular}{lcl}
$\ZERO_\downarrow$ & $\dn$ & $\ZERO$\\
$( _{bs}\ONE )_\downarrow$ & $\dn$ & $\ONE$\\
$( _{bs}\mathbf{c} )_\downarrow$ & $\dn$ & $\mathbf{c}$\\
$( _{bs} a_1 \cdot a_2 )_\downarrow$ & $\dn$ &
$ (a_1) _\downarrow \cdot (a_2) _\downarrow$\\
$( _{bs} [])_\downarrow $ & $\dn$ & $\ZERO $\\
$( _{bs} [a] )_\downarrow$ & $\dn$ & $a_\downarrow$\\
$_{bs} \sum [a_1, \; a_2]$ & $\dn$ & $ (a_1) _\downarrow + ( a_2 ) _\downarrow $\\
$(_{bs} \sum (a :: as))_\downarrow$ & $\dn$ & $ a_\downarrow + \; (_{[]} \sum as)_\downarrow$\\
$( _{bs} a^* )_\downarrow$ & $\dn$ & $(a_\downarrow)^*$
\end{tabular}
\end{center}
\noindent
We also abbreviate the $\erase\; a$ operation
as $a_\downarrow$, for conciseness.
For bit-coded regular expressions, as a different datatype,
testing whether they contain empty string in their lauguage requires
a dedicated function $\bnullable$
which simply calls $\erase$ first before testing whether it is $\nullable$.
\begin{definition}
$\bnullable \; a \dn \nullable \; (a_\downarrow)$
\end{definition}
The function for collecting the
bitcodes at the end of the derivative
phase from a (b)nullable regular expression
is a generalised version of the $\textit{mkeps}$ function
for annotated regular expressions, called $\textit{bmkeps}$:
%\begin{definition}[\textit{bmkeps}]\mbox{}
\begin{center}
\begin{tabular}{lcl}
$\textit{bmkeps}\,(_{bs}\ONE)$ & $\dn$ & $bs$\\
$\textit{bmkeps}\,(_{bs}\sum a::\textit{as})$ & $\dn$ &
$\textit{if}\;\textit{bnullable}\,a$\\
& &$\textit{then}\;bs\,@\,\textit{bmkeps}\,a$\\
& &$\textit{else}\;bs\,@\,\textit{bmkeps}\,(_{[]}\sum \textit{as})$\\
$\textit{bmkeps}\,(_{bs} a_1 \cdot a_2)$ & $\dn$ &
$bs \,@\,\textit{bmkeps}\,a_1\,@\, \textit{bmkeps}\,a_2$\\
$\textit{bmkeps}\,(_{bs}a^*)$ & $\dn$ &
$bs \,@\, [Z]$
\end{tabular}
\end{center}
%\end{definition}
\noindent
$\bmkeps$ completes the value information by travelling along the
path of the regular expression that corresponds to a POSIX value and
collecting all the bitcodes, and attaching $S$ to indicate the end of star
iterations.
Now we give out the central part of this lexing algorithm,
the $\bder$ function (stands for \emph{b}itcoded-derivative).
For most time we use the infix notation $(\_\backslash\_)$
to mean $\bder$ for brevity when
there is no danger of confusion with derivatives on plain regular expressions.
For example, we write $( _{[]}r^* ) \backslash c$ instead of $\bder \;c \; _{[]}r^*$,
as the bitcodes at the front of $r^*$ indicates that it is
a bit-coded regular expression, not a plain one.
$\bder$ tells us how regular expressions can be recursively traversed,
where the bitcodes are augmented and carried around
when a derivative is taken.
\begin{center}
\begin{tabular}{@{}lcl@{}}
$(\ZERO)\,\backslash c$ & $\dn$ & $\ZERO$\\
$(_{bs}\ONE)\,\backslash c$ & $\dn$ & $\ZERO$\\
$(_{bs}{\bf d})\,\backslash c$ & $\dn$ &
$\textit{if}\;c=d\; \;\textit{then}\;
_{bs}\ONE\;\textit{else}\;\ZERO$\\
$(_{bs}\sum \;\textit{as})\,\backslash c$ & $\dn$ &
$_{bs}\sum\;(\textit{map} \; (\_\backslash c) \; as )$\\
$(_{bs}\;a_1\cdot a_2)\,\backslash c$ & $\dn$ &
$\textit{if}\;\textit{bnullable}\,a_1$\\
& &$\textit{then}\;_{bs}\sum\,[(_{[]}\,(a_1\,\backslash c)\cdot\,a_2),$\\
& &$\phantom{\textit{then},\;_{bs}\sum\,}(\textit{fuse}\,(\textit{bmkeps}\,a_1)\,(a_2\,\backslash c))]$\\
& &$\textit{else}\;_{bs}\,(a_1\,\backslash c)\cdot a_2$\\
$(_{bs}a^*)\,\backslash c$ & $\dn$ &
$_{bs}(\textit{fuse}\, [Z] \; r\,\backslash c)\cdot
(_{[]}r^*))$
\end{tabular}
\end{center}
\noindent
We give the intuition behind some of the more involved cases in
$\bder$. For example,
in the \emph{star} case,
a derivative on $_{bs}a^*$ means
that one more star iteratoin needs to be taken.
we need to unfold it into a sequence,
and attach an additional bit $Z$ to the front of $r \backslash c$
as a record to indicate one new star iteration is unfolded.
\noindent
\begin{center}
\begin{tabular}{@{}lcl@{}}
$(_{bs}a^*)\,\backslash c$ & $\dn$ &
$_{bs}(\underbrace{\textit{fuse}\, [Z] \; a\,\backslash c}_{\text{One more iteration}})\cdot
(_{[]}a^*))$
\end{tabular}
\end{center}
\noindent
This information will be recovered later by the $\decode$ function.
The intuition is that the bit $Z$ will be decoded at the right location,
because we accumulate bits from left to right (a rigorous proof will be given
later).
\begin{tikzpicture}[ > = stealth, % arrow head style
shorten > = 1pt, % don't touch arrow head to node
semithick % line style
]
\tikzstyle{every state}=[
draw = black,
thin,
fill = cyan!29,
minimum size = 7mm
]
\begin{scope}[node distance=1cm and 0cm, every node/.style=state]
\node (k) [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
{$bs$
\nodepart{two} $a^*$ };
\node (l) [below =of k, rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
{ $bs$ + [Z]
\nodepart{two} $(a\backslash c )\cdot a^*$ };
\end{scope}
\path[->]
(k) edge (l);
\end{tikzpicture}
Pictorially the process looks like below.
Like before, the red region denotes
previous lexing information (stored as bitcodes in $bs$).
\begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}]
\begin{scope}[node distance=1cm]
\node (a) [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
{$bs$
\nodepart{two} $a^*$ };
\node (b) [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
{ $bs$ + [Z]
\nodepart{two} $(a\backslash c )\cdot a^*$ };
%\caption{term 1 \ref{term:1}'s matching configuration}
\end{scope}
\end{tikzpicture}
\noindent
Another place in the $\bder$ function where it differs
from normal derivatives (on un-annotated regular expressions)
is the sequence case:
\begin{center}
\begin{tabular}{@{}lcl@{}}
$(_{bs}\;a_1\cdot a_2)\,\backslash c$ & $\dn$ &
$\textit{if}\;\textit{bnullable}\,a_1$\\
& &$\textit{then}\;_{bs}\sum\,[(_{[]}\,(a_1\,\backslash c)\cdot\,a_2),$\\
& &$\phantom{\textit{then},\;_{bs}\sum\,}(\textit{fuse}\,(\textit{bmkeps}\,a_1)\,(a_2\,\backslash c))]$\\
& &$\textit{else}\;_{bs}\,(a_1\,\backslash c)\cdot a_2$
\end{tabular}
\end{center}
The difference is that (when $a_1$ is $\bnullable$)
we use $\bmkeps$ to store the lexing information
in $a_1$ before collapsing it (as it has been fully matched by string prior to $c$,
and attach the collected bit-codes to the front of $a_2$
before throwing away $a_1$. We assume that $\bmkeps$ correctly extracts the bitcode for how $a_1$
matches the string prior to $c$ (more on this later).
The bitsequence $\textit{bs}$ which was initially attached to the first element of the sequence
$a_1 \cdot a_2$, has now been elevated to the top level of teh $\sum$.
This is because this piece of information will be needed whichever way the sequence is matched,
regardless of whether $c$ belongs to $a_1$ or $a_2$.
In the injection-based lexing, $r_1$ is immediately thrown away in
subsequent derivatives on the right branch (when $r_1$ is $\nullable$),
\begin{center}
$(r_1 \cdot r_2 )\backslash c = (r_1 \backslash c) \cdot r_2 + r_2 \backslash c$
\end{center}
\noindent
as it knows $r_1$ is stored on stack and available once the recursive
call to later derivatives finish.
Therefore, if the $\Right$ branch is taken in a $\POSIX$ match,
we construct back the sequence value once step back by
calling a on $\mkeps(r_1)$.
\begin{center}
\begin{tabular}{lcr}
$\ldots r_1 \cdot r_2$ & $\rightarrow$ & $r_1\cdot r_2 + r_2 \backslash c \ldots $\\
$\ldots \Seq(v_1, v_2) (\Seq(\mkeps(r1), (\inj \; r_2 \; c\; v_{2c})))$ & $\leftarrow$ & $\Right(v_{2c})\ldots$
\end{tabular}
\end{center}
\noindent
The rest of the clauses of $\bder$ is rather similar to
$\der$, and is put together here as a wholesome definition
for $\bder$:
\noindent
Generalising the derivative operation with bitcodes to strings, we have
\begin{center}
\begin{tabular}{@{}lcl@{}}
$a\backslash_s [] $ & $\dn$ & $a$\\
$a\backslash (c :: s) $ & $\dn$ & $(a \backslash c) \backslash_s s$
\end{tabular}
\end{center}
As we did earlier, we omit the $s$ subscript at $\backslash_s$ when there is no danger
of confusion.
Putting this all together, we obtain a lexer with bit-coded regular expressions
as its internal data structures, which we call $\blexer$:
\begin{center}
\begin{tabular}{lcl}
$\textit{blexer}\;r\,s$ & $\dn$ &
$\textit{let}\;a = (r^\uparrow)\backslash s\;\textit{in}$\\
& & $\;\;\textit{if}\; \textit{bnullable}(a)$\\
& & $\;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,a)\,r$\\
& & $\;\;\textit{else}\;\textit{None}$
\end{tabular}
\end{center}
\noindent
Ausaf and Urban formally proved the correctness of the $\blexer$, namely
\begin{conjecture}
$\blexer \;r \; s = \lexer \; r \; s$.
\end{conjecture}
This was claimed but not formalised in Sulzmann and Lu's work.
We introduce the proof later, after we give out all
the needed auxiliary functions and definitions
%-----------------------------------
% SUBSECTION 1
%-----------------------------------
\section{Specifications of Some Helper Functions}
The functions we introduce will give a more detailed glimpse into
the lexing process, which might not be possible
using $\lexer$ or $\blexer$ themselves.
The first function we shall look at is $\retrieve$.
\subsection{$\textit{Retrieve}$}
Our bit-coded lexer "retrieve"s the bitcodes using $\bmkeps$
after we finished doing all the derivatives:
\begin{center}
\begin{tabular}{lcl}
& & $\ldots$\\
& & $\;\;\textit{if}\; \textit{bnullable}(a)$\\
& & $\;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,a)\,r$\\
& & $\ldots$
\end{tabular}
\end{center}
\noindent
Recall that $\bmkeps$ looks for the leftmost branch of an alternative
and completes a star's iterations by attaching a $Z$ at the end of the bitcodes
extracted. It "retrieves" a sequence by visiting both children and then stitch together
two bitcodes using concatenation. After the entire tree structure of the regular
expression has been traversed using the above manner, we get a bitcode encoding the
lexing result.
We know that this "retrieved" bitcode leads to the correct value after decoding,
which is $v_0$ in the bird's eye view of the injection-based lexing diagram.
Now assume we keep every other data structure in the diagram \ref{InjFigure},
and only replace all the plain regular expression by their annotated counterparts,
computed during a $\blexer$ run.
Then we obtain a diagram for the annotated regular expression derivatives and
their corresponding values, though the values are never calculated in $\blexer$.
We have that $a_n$ contains all the lexing result information.
\vspace{20mm}
\begin{center}%\label{graph:injLexer}
\begin{tikzcd}[
every matrix/.append style = {name=p},
remember picture, overlay,
]
a_0 \arrow[r, "\backslash c_0"] \arrow[d] & a_1 \arrow[r, "\backslash c_1"] \arrow[d] & a_2 \arrow[r, dashed] \arrow[d] & a_n \arrow[d] \\
v_0 & v_1 \arrow[l,"inj_{r_0} c_0"] & v_2 \arrow[l, "inj_{r_1} c_1"] & v_n \arrow[l, dashed]
\end{tikzcd}
\begin{tikzpicture}[
remember picture, overlay,
E/.style = {ellipse, draw=blue, dashed,
inner xsep=4mm,inner ysep=-4mm, rotate=90, fit=#1}
]
\node[E = (p-1-1) (p-2-1)] {};
\node[E = (p-1-4) (p-2-4)] {};
\end{tikzpicture}
\end{center}
\vspace{20mm}
\noindent
On the other hand, $v_0$ also encodes the correct lexing result, as we have proven for $\lexer$.
Encircled in the diagram are the two pairs $v_0, a_0$ and $v_n, a_n$, which both
encode the correct lexical result. Though for the leftmost pair, we have
the information condensed in $v_0$ the value part, whereas for the rightmost pair,
the information is concentrated on $a_n$.
We know that in the intermediate steps the pairs $v_i, a_i$, must in some way encode the complete
lexing information as well. Therefore, we need a unified approach to extract such lexing result
from a value $v_i$ and its annotated regular expression $a_i$.
And the function $f$ must satisfy these requirements:
\begin{itemize}
\item
$f \; a_i\;v_i = f \; a_n \; v_n = \decode \; (\bmkeps \; a_n) \; (\erase \; a_0)$
\item
$f \; a_i\;v_i = f \; a_0 \; v_0 = v_0 = \decode \;(\code \; v_0) \; (\erase \; a_0)$
\end{itemize}
\noindent
If we factor out the common part $\decode \; \_ \; (\erase \; a_0)$,
The core of the function $f$ is something that produces the bitcodes
$\code \; v_0$.
It is unclear how, but Sulzmann and Lu came up with a function satisfying all the above
requirements, named \emph{retrieve}:
\begin{center}
\begin{tabular}{lcr}
$\retrieve \; \, (_{bs} \ONE) \; \, (\Empty)$ & $\dn$ & $\textit{bs}$\\
$\retrieve \; \, (_{bs} \mathbf{c} ) \; \Char(c)$ & $\dn$ & $ \textit{bs}$\\
$\retrieve \; \, (_{bs} a_1 \cdot a_2) \; \Seq(v_1, v_2)$ & $\dn$ & $\textit{bs} @ (\retrieve \; a_1\; v_1) @ (\retrieve \; a_2 \; v_2)$\\
$\retrieve \; \, (_{bs} \Sigma (a :: \textit{as}) \; \,\Left(v)$ & $\dn$ & $\textit{bs} @ (\retrieve \; a \; v)$\\
$\retrieve \; \, (_{bs} \Sigma (a :: \textit{as} \; \, \Right(v)$ & $\dn$ & $\textit{bs} @ (\retrieve \; (_{[]}\Sigma \textit{as}) \; v)$\\
$\retrieve \; \, (_{bs} a^*) \; \, (\Stars(v :: vs)) $ & $\dn$ & $\textit{bs} @ (\retrieve \; a \; v) @ (\retrieve \; (_{[]} a^*) \; (\Stars(vs)))$\\
$\retrieve \; \, (_{bs} a^*) \; \, (\Stars([]) $ & $\dn$ & $\textit{bs} @ [Z]$
\end{tabular}
\end{center}
\noindent
As promised, $\retrieve$ collects the right bit-codes from the
final derivative $a_n$:
\begin{lemma}
$\bnullable \; a \implies \bmkeps \; a = \retrieve \; a \; (\mkeps \; (\erase \; a))$
\end{lemma}
\begin{proof}
By a routine induction on $a$.
\end{proof}
The design of $\retrieve$ enables extraction of bit-codes
from not only $\bnullable$ (annotated) regular expressions,
but also those that are not $\bnullable$.
For example, if we have the regular expression just internalised
and the lexing result value, we could $\retrieve$ the bitcdoes
that look as if we have en$\code$-ed the value:
\begin{lemma}
$\vdash v : r \implies \retrieve \; (r)^\uparrow \; v = \code \; v$
\end{lemma}
\begin{proof}
By induction on $r$.
\end{proof}
\noindent
The following property is more interesting, as
it provides a "bridge" between $a_0, v_0$ and $a_n, v_n$ in the
lexing diagram.
If you take derivative of an annotated regular expression,
you can $\retrieve$ the same bit-codes as before the derivative took place,
provided that you use the corresponding value:
\begin{lemma}\label{retrieveStepwise}
$\vdash v : (r\backslash c) \implies \retrieve \; (r \backslash c) \; v= \retrieve \; r \; (\inj \; r\; c\; v)$
\end{lemma}
\begin{proof}
By induction on $r$, where $v$ is allowed to be arbitrary.
The induction principle is function $\erase$'s cases.
\end{proof}
\noindent
$\retrieve$ is connected to the $\blexer$ in the following way:
\begin{lemma}\label{blexer_retrieve}
$\blexer \; r \; s = \decode \; (\retrieve \; (\internalise \; r) \; (\mkeps \; (r \backslash s) )) \; r$
\end{lemma}
\noindent
$\retrieve$ allows free navigation on the diagram \ref{InjFigure} for annotated regular expressiones of $\blexer$.
For plain regular expressions something similar is required as well.
\subsection{$\flex$}
Ausaf and Urban cleverly defined an auxiliary function called $\flex$ for $\lexer$,
defined as
\begin{center}
\begin{tabular}{lcr}
$\flex \; r \; f \; [] \; v$ & $=$ & $f\; v$\\
$\flex \; r \; f \; c :: s \; v$ & $=$ & $\flex \; r \; \lambda v. \, f (\inj \; r\; c\; v)\; s \; v$
\end{tabular}
\end{center}
which accumulates the characters that needs to be injected back,
and does the injection in a stack-like manner (last taken derivative first injected).
$\flex$ is connected to the $\lexer$:
\begin{lemma}
$\flex \; r \; \textit{id}\; s \; \mkeps (r\backslash s) = \lexer \; r \; s$
\end{lemma}
\begin{proof}
By reverse induction on $s$.
\end{proof}
$\flex$ provides us a bridge between $\lexer$'s intermediate steps.
What is even better about $\flex$ is that it allows us to
directly operate on the value $\mkeps (r\backslash v)$,
which is pivotal in the definition of $\lexer $ and $\blexer$, but not visible as an argument.
When the value created by $\mkeps$ becomes available, one can
prove some stepwise properties of lexing nicely:
\begin{lemma}\label{flexStepwise}
$\textit{flex} \; r \; f \; s@[c] \; v= \flex \; r \; f\; s \; (\inj \; (r\backslash s) \; c \; v) $
\end{lemma}
\begin{proof}
By induction on the shape of $r\backslash s$
\end{proof}
\noindent
With $\flex$ and $\retrieve$ ready, we are ready to connect $\lexer$ and $\blexer$ .
\subsection{Correctness Proof of Bit-coded Algorithm}
\begin{lemma}\label{flex_retrieve}
$\flex \; r \; \textit{id}\; s\; v = \decode \; (\retrieve \; (r\backslash s )\; v) \; r$
\end{lemma}
\begin{proof}
By induction on $s$. The induction tactic is reverse induction on strings.
$v$ is allowed to be arbitrary.
The crucial point is to rewrite
\[
\retrieve \; (r \backslash s@[c]) \; \mkeps (r \backslash s@[c])
\]
as
\[
\retrieve \; (r \backslash s) \; (\inj \; (r \backslash s) \; c\; \mkeps (r \backslash s@[c]))
\].
This enables us to equate
\[
\retrieve \; (r \backslash s@[c]) \; \mkeps (r \backslash s@[c])
\]
with
\[
\flex \; r \; \textit{id} \; s \; (\inj \; (r\backslash s) \; c\; (\mkeps (r\backslash s@[c])))
\],
which in turn can be rewritten as
\[
\flex \; r \; \textit{id} \; s@[c] \; (\mkeps (r\backslash s@[c]))
\].
\end{proof}
With the above lemma we can now link $\flex$ and $\blexer$.
%----------------------------------------------------------------------------------------
% SECTION correctness proof
%----------------------------------------------------------------------------------------
\section{Correctness of Bit-coded Algorithm (Without Simplification)}
We now give the proof the correctness of the algorithm with bit-codes.
\begin{lemma}\label{flex_blexer}
$\textit{flex} \; r \; \textit{id} \; s \; \mkeps(r \backslash s) = \blexer \; r \; s$
\end{lemma}
\begin{proof}
Using two of the above lemmas: \ref{flex_retrieve} and \ref{blexer_retrieve}.
\end{proof}
Finally the correctness of $\blexer$ is given as it outputs the same result as $\lexer$:
\begin{theorem}
$\blexer\; r \; s = \lexer \; r \; s$
\end{theorem}
\begin{proof}
Straightforward corollary of \ref{flex_blexer}.
\end{proof}
\noindent
To piece things together and spell out the exact correctness
of the bitcoded lexer
in terms of producing POSIX values,
we use the fact from the previous chapter that
\[
If \; (r, s) \rightarrow v \; then \; \lexer \; r \; s = v
\]
to obtain this corollary:
\begin{corollary}\label{blexerPOSIX}
$If \; (r, s) \rightarrow v \; then \blexer \; r \; s = v$
\end{corollary}
Our main reason for wanting a bit-coded algorithm over
the injection-based one is for its capabilities of allowing
more aggressive simplifications.
We will elaborate on this in the next chapter.