--- a/ChengsongTanPhdThesis/Chapters/Bitcoded1.tex Wed Aug 17 01:09:13 2022 +0100
+++ b/ChengsongTanPhdThesis/Chapters/Bitcoded1.tex Wed Aug 17 22:59:15 2022 +0100
@@ -15,7 +15,7 @@
in their algorithm such as de-duplication not working properly and redundant
fixpoint construction. Their algorithm is improved and verified with the help of
formal proofs.
-\section{Bit-coded Algorithm}
+\section{The Motivation Behind Using Bitcodes}
We first do a recap of what was going on
in the lexer algorithm in Chapter \ref{Inj},
\begin{center}
@@ -35,7 +35,7 @@
and characters: $r_0$, $c_0$, $r_1$, $c_1$, etc.
Each descent into deeper recursive calls in $\lexer$
causes a new pair of $r_i, c_i$ to be pushed to the call stack.
-\begin{figure}
+\begin{figure}[H]
\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick]
%\draw (-6,-6) grid (6,6);
\node [ circle ] (r) at (-6, 5) {$r$};
@@ -64,7 +64,7 @@
-\begin{figure}
+\begin{figure}[H]
\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick]
%\draw (-6,-6) grid (6,6);
\node [ circle ] (r) at (-6, 5) {$r$};
@@ -101,7 +101,7 @@
As the number of derivative steps increase,
the stack would increase:
-\begin{figure}
+\begin{figure}[H]
\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick]
%\draw (-6,-6) grid (6,6);
\node [ circle ] (r) at (-6, 5) {$r$};
@@ -145,11 +145,9 @@
\end{tikzpicture}
\caption{More Derivatives Taken}
\end{figure}
-\noindent
-After all derivatives have been taken, the stack grows to a maximum size
-and the pair of regular expressions and characters $r_i, c_{i+1}$
-are then popped out and used in the injection phase:
+
+\begin{figure}[H]
\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick]
%\draw (-6,-6) grid (6,6);
\node [radius = 0.5, circle, draw] (r) at (-6, 5) {$r$};
@@ -174,18 +172,8 @@
\node at ($(ldots)!.4!(rn)$) {\ldots};
-\node [minimum size = 0.5, circle, draw] (vn) at (6, -5) {$v_n$};
-
-\node [] (ldots2) at (3.5, -5) {};
-
-\node [minimum size = 0.5, circle, draw] (v2) at (2, -5) {$v_2$};
-
-\node at ($(ldots2)!.4!(v2)$) {\ldots};
-\node [circle, draw] (v1) at (-2, -5) {$v_1$};
-
-\node [radius = 0.5, circle, draw] (v) at (-6, -5) {$v$};
\node [minimum size = 6cm, rectangle, draw] (stack) at (0, 0) {\Huge Stack};
@@ -197,15 +185,6 @@
edge [] node {} (r2);
\path (r2)
edge [] node {} (ldots);
-\path (rn)
- edge [] node {$\mkeps$} (vn);
-\path (vn)
- edge [] node {} (ldots2);
-\path (v2)
- edge [] node {$\inj \; r_1 \; c_2\;v_2$} (v1);
-
-\path (v1)
- edge [] node {$\inj \; r \; c_1 \; v_1$} (v);
\path (r)
edge [dashed, bend right] node {} (stack);
@@ -228,133 +207,634 @@
\path (rn)
edge [dashed, bend left] node {} (stack);
\end{tikzpicture}
-%\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick]
-%%\draw (-6,-6) grid (6,6);
-%\node [radius = 0.5, circle, draw] (r) at (-6, 5) {$r$};
+\caption{Before Injection Phase Starts}
+\end{figure}
+
+
+\noindent
+After all derivatives have been taken, the stack grows to a maximum size
+and the pair of regular expressions and characters $r_i, c_{i+1}$
+are then popped out and used in the injection phase.
+
+
+
+\begin{figure}[H]
+\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick]
+%\draw (-6,-6) grid (6,6);
+\node [radius = 0.5, circle, ] (r) at (-6, 5) {$r$};
+
+%\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$};
+\node [circle, minimum size = 0.1, ] (c1) at (-4, 5.4) {$c_1$};
%
-%%\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$};
-%\node [circle, minimum size = 0.1] (c1) at (-4, 5.4) {$c_1$};
-%%
-%%\node (-2, 5) (r1) circle [radius = 0.5] {$r_1$};
-%\node [minimum size = 0.5, circle, draw] (r1) at (-2, 5) {$r_1$};
-%%
-%%\node (0, 6) (c2) circle [radius = 0.3] {$c_2$};
-%\node [circle, minimum size = 0.1] (c2) at (0, 5.4) {$c_2$};
-%%
-%%\node (2, 5) (r2) circle [radius = 0.5] {$r_2$};
-%\node [circle, draw] (r2) at (2, 5) {$r_2$};
-%%
-%%
-%\node [] (ldots) at (4.5, 5) {};
-%%\node (6, 5) (rn) circle [radius = 0.5] {$r_n$};
+%\node (-2, 5) (r1) circle [radius = 0.5] {$r_1$};
+\node [minimum size = 0.5, circle, ] (r1) at (-2, 5) {$r_1$};
%
-%\node [minimum size = 0.5, circle, draw] (rn) at (6, 5) {$r_n$};
-%
-%\node at ($(ldots)!.4!(rn)$) {\ldots};
-%
-%\node [minimum size = 0.5, circle, draw] (vn) at (6, -5) {$v_n$};
+%\node (0, 6) (c2) circle [radius = 0.3] {$c_2$};
+\node [circle, minimum size = 0.1, ] (c2) at (0, 5.4) {$c_2$};
%
-%\node [] (ldots2) at (3.5, -5) {};
-%
-%\node [minimum size = 0.5, circle, draw] (v2) at (2, -5) {$v_2$};
-%
-%\node at ($(ldots2)!.4!(v2)$) {\ldots};
+%\node (2, 5) (r2) circle [radius = 0.5] {$r_2$};
+\node [circle, ] (r2) at (2, 5) {$r_2$};
%
%
-%\node [circle, draw] (v1) at (-2, -5) {$v_1$};
-%
-%\node [radius = 0.5, circle, draw] (v) at (-6, -5) {$v$};
-%
-%\node [minimum size = 6cm, rectangle, draw] (stack) at (0, 0) {\Huge Stack};
+\node [] (ldots) at (4.5, 5) {};
+%\node (6, 5) (rn) circle [radius = 0.5] {$r_n$};
+
+\node [minimum size = 0.5, circle, ] (rn) at (6, 5) {$r_n$};
+
+\node at ($(ldots)!.4!(rn)$) {\ldots};
+
+\node [minimum size = 0.5, circle, ] (vn) at (6, -5) {$v_n$};
+
+\node [] (ldots2) at (3.5, -5) {};
+
+\node [minimum size = 0.5, circle, ] (v2) at (2, -5) {$v_2$};
+
+\node at ($(ldots2)!.4!(v2)$) {\ldots};
+
+
+\node [circle, ] (v1) at (-2, -5) {$v_1$};
+
+\node [radius = 0.5, circle, ] (v) at (-6, -5) {$v$};
+
+\node [minimum size = 6cm, rectangle, draw] (stack) at (0, 0) {\Huge Stack};
+
+\path
+ (r)
+ edge [->, >=stealth',shorten >=1pt] node[left] {} (r1);
+\path
+ (r1)
+ edge [] node {} (r2);
+\path (r2)
+ edge [] node {} (ldots);
+\path (rn)
+ edge [] node {$\mkeps$} (vn);
+\path (vn)
+ edge [] node {} (ldots2);
+\path (v2)
+ edge [] node {$\inj \; r_1 \; c_2\;v_2$} (v1);
+
+\path (v1)
+ edge [] node {$\inj \; r \; c_1 \; v_1$} (v);
+
+\path (stack)
+ edge [dashed] node {} (-4.2, -5.2);
+\path (stack)
+ edge [dashed] node {} (-4, -5.2);
+\path (stack)
+ edge [dashed] node {} (-0.1, -5.2);
+\path (stack)
+ edge [dashed] node {} (0.2, -5.26);
+\path (stack)
+ edge [dashed] node {} (3.2, -5);
+\path (stack)
+ edge [dashed] node {} (2.7, -5);
+\path (stack)
+ edge [dashed] node {} (3.7, -5);
+\end{tikzpicture}
+\caption{Stored $r_i, c_{i+1}$ Used by $\inj$}
+\end{figure}
+\noindent
+Storing all $r_i, c_{i+1}$ pairs recursively
+allows the algorithm to work in an elegant way, at the expense of
+storing quite a bit of verbose information.
+The stack seems to grow at least quadratically fast with respect
+to the input (not taking into account the size bloat of $r_i$),
+which can be inefficient and prone to stack overflow.
+\section{Bitcoded Algorithm}
+To address this,
+Sulzmann and Lu chose to define a new datatype
+called \emph{annotated regular expression},
+which condenses all the partial lexing information
+(that was originally stored in $r_i, c_{i+1}$ pairs)
+into bitcodes.
+Annotated regular expressions
+are defined as the following
+Isabelle datatype \footnote{ We use subscript notation to indicate
+ that the bitcodes are auxiliary information that do not
+interfere with the structure of the regular expressions }:
+\begin{center}
+\begin{tabular}{lcl}
+ $\textit{a}$ & $::=$ & $\ZERO$\\
+ & $\mid$ & $_{bs}\ONE$\\
+ & $\mid$ & $_{bs}{\bf c}$\\
+ & $\mid$ & $_{bs}\sum\,as$\\
+ & $\mid$ & $_{bs}a_1\cdot a_2$\\
+ & $\mid$ & $_{bs}a^*$
+\end{tabular}
+\end{center}
+\noindent
+where $bs$ stands for bit-codes, $a$ for $\mathbf{a}$nnotated regular
+expressions and $as$ for lists of annotated regular expressions.
+The alternative constructor, written, $\sum$, has been generalised to
+accept a list of annotated regular expressions rather than just two.
+Why is it generalised? This is because when we open up nested
+alternatives, there could be more than two elements at the same level
+after de-duplication, which can no longer be stored in a binary
+constructor.
+Bits and bitcodes (lists of bits) are defined as:
+\begin{center}
+ $b ::= S \mid Z \qquad
+bs ::= [] \mid b::bs
+$
+\end{center}
+\noindent
+We use $S$ and $Z$ rather than $1$ and $0$ is to avoid
+confusion with the regular expressions $\ZERO$ and $\ONE$.
+The idea is to use the attached bitcodes
+to indicate which choice was made at a certain point
+during lexing.
+For example,
+$(_{Z}a+_{S}b) \backslash a$ gives us
+$_{Z}\ONE + \ZERO$, this $Z$ bitcode will
+later be used to decode that a left branch was
+selected at the time when the part $a+b$ is being taken
+derivative of.
+\subsection{A Bird's Eye View of the Bit-coded Lexer}
+Before we give out the rest of the functions and definitions
+related to our
+$\blexer$ (\emph{b}-itcoded lexer), we first provide a high-level
+view of the algorithm, so the reader does not get lost in
+the details.
+\begin{figure}[H]
+\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick]
+%\draw (-6,-6) grid (6,6);
+
+ \node [circle, draw] (r0) at (-6, 2) {$r$};
+
+ \node [radius = 0.5, circle, draw] (r) at (-6, 5) {$_{bs}a$};
+ \path (r0)
+ edge [] node {$\internalise$} (r);
+%\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$};
+\node [circle, minimum size = 0.1, draw] (c1) at (-4, 5.4) {$c_1$};
%
-%\path
-% (r)
-% edge [->, >=stealth',shorten >=1pt] node[left] {} (r1);
-%\path
-% (r1)
-% edge [] node {} (r2);
-%\path (r2)
-% edge [] node {} (ldots);
-%\path (rn)
-% edge [] node {$\mkeps$} (vn);
-%\path (vn)
-% edge [] node {} (ldots2);
-%\path (v2)
-% edge [] node {} (v1);
+%\node (-2, 5) (r1) circle [radius = 0.5] {$r_1$};
+\node [minimum size = 1.0cm, circle, draw] (r1) at (-2, 5) {$_{bs_1}a_1$};
+%
+%\node (0, 6) (c2) circle [radius = 0.3] {$c_2$};
+\node [circle, minimum size = 0.1, draw] (c2) at (0, 5.4) {$c_2$};
+%
+%\node (2, 5) (r2) circle [radius = 0.5] {$r_2$};
+\node [circle, draw, minimum size = 1.4cm] (r2) at (2, 5) {$_{bs_2}a_2$};
+%
+%
+\node [] (ldots) at (4.5, 5) {};
+%\node (6, 5) (rn) circle [radius = 0.5] {$r_n$};
+
+\node [minimum size = 2.2cm, circle, draw] (rn) at (6, 5) {$_{bs_n}a_n$};
+
+\node at ($(ldots)!.1!(rn)$) {\ldots};
+
+\node [minimum size = 0.5, circle, ] (v) at (6, 2) {$v$};
+
+%\node [] (v2) at (4, -5) {};
+%
+%\node [draw, cross out] (ldots2) at (5, -5) {};
%
-%\path (v1)
-% edge [] node {} (v);
-%\path (r)
-% edge [] node {saved} (stack);
-%
-%\path (r1)
-% edge [] node {saved} (stack);
-%\end{tikzpicture}
-\noindent
-The information stored in characters and regular expressions
-make the algorithm work in an elegant way, at the expense of being
-storing quite a bit of verbose information.
+%\node at ($(ldots2)!.4!(v2)$) {\ldots};
+
+
+\node [align = center] (decode) at (6.6, 3.2) {$\bmkeps$\\$\decode$};
+
+\path (c1)
+ edge [dashed, bend left] node {} (r0);
+
+\path (c2)
+ edge [dashed, bend left] node {} (r0);
+
+\path (r1)
+ edge [dashed, bend right] node {} (r2);
+
+
+\path
+ (r)
+ edge [dashed, bend right] node[left] {} (r1);
+
+\path
+ (r)
+ edge [->, >=stealth',shorten >=1pt] node[left] {} (r1);
+
+\path
+ (r1)
+ edge [] node {} (r2);
+\path (r2)
+ edge [] node {} (ldots);
+\path (rn)
+ edge [] node {} (v);
+
+\path (r0)
+ edge [dashed, bend right] node {} (decode);
+%\path (v)
+ %edge [] node {} (ldots2);
-The lexer algorithm in Chapter \ref{Inj},
-stores information of previous lexing steps
-on a stack, in the form of regular expressions
-and characters: $r_0$, $c_0$, $r_1$, $c_1$, etc.
-The red part represents what we already know during the first
-derivative phase,
-and the blue part represents the unknown part of input.
-\begin{ceqn}
-\begin{equation}%\label{graph:injLexer}
- \begin{tikzcd}[ampersand replacement=\&, execute at end picture={
- \begin{scope}[on background layer]
- \node[rectangle, fill={red!30},
- pattern=north east lines, pattern color=red,
- fit={(-3,-1) (-3, 1) (1, -1)
- (1, 1)}
- ]
- {}; ,
- \node[rectangle, fill={blue!20},
- pattern=north east lines, pattern color=blue,
- fit= {(1, -1) (1, 1) (3, -1) (3, 1)}
- ]
- {};
- \end{scope}}
- ]
-r_0 \arrow[r, "\backslash c_0"] \arrow[d] \& r_1 \arrow[r, "\backslash c_1"] \arrow[d] \& r_2 \arrow[r, dashed] \arrow[d] \& r_n \arrow[d, "mkeps" description] \\
-v_0 \& v_1 \arrow[l,"inj_{r_0} c_0"] \& v_2 \arrow[l, "inj_{r_1} c_1"] \& v_n \arrow[l, dashed] \\
-\end{tikzcd}
-\end{equation}
-\end{ceqn}
+\end{tikzpicture}
+\caption{Bird's Eye View of $\blexer$}
+\end{figure}
+\noindent
+The plain regular expressions
+are first ``lifted'' to an annotated regular expression,
+with the function called \emph{internalise}.
+Then the annotated regular expression $_{bs}a$ will
+go through successive derivatives with respect
+to the input stream of characters $c_1, c_2$ etc.
+Each time a derivative is taken, the bitcodes
+are moved around, discarded or augmented together
+with the regular expression they are attached to.
+After all input has been consumed, the
+bitcodes are collected by $\bmkeps$,
+which traverses the nullable part of the regular expression
+and collect the bitcodes along the way.
+The collected bitcodes are then $\decode$d with the guidance
+of the input regular expression $r$.
+The most notable improvements of $\blexer$
+over $\lexer$ are
+\begin{itemize}
+ \item
+
+ An absence of the second injection phase.
+ \item
+ One need not store each pair of the
+ intermediate regular expressions $_{bs_i}a_i$ and $c_{i+1}$.
+ The previous annotated regular expression $_{bs_i}a_i$'s information is passed
+ on without loss to its successor $_{bs_{i+1}}a_{i+1}$,
+ and $c_{i+1}$'s already contained in the strings in $L\;r$ \footnote{
+ which can be easily recovered if we decode in the correct order}.
+ \item
+ The simplification works much better--one can maintain correctness
+ while applying quite strong simplifications, as we shall wee.
+\end{itemize}
+\noindent
+In the next section we will
+give the operations needed in $\blexer$,
+with some remarks on the idea behind their definitions.
+\subsection{Operations in $\textit{Blexer}$}
+The first operation we define related to bit-coded regular expressions
+is how we move bits to the inside of regular expressions.
+Called $\fuse$, this operation attaches bit-codes
+to the front of an annotated regular expression:
+\begin{center}
+\begin{tabular}{lcl}
+ $\textit{fuse}\;bs \; \ZERO$ & $\dn$ & $\ZERO$\\
+ $\textit{fuse}\;bs\; _{bs'}\ONE$ & $\dn$ &
+ $_{bs @ bs'}\ONE$\\
+ $\textit{fuse}\;bs\;_{bs'}{\bf c}$ & $\dn$ &
+ $_{bs@bs'}{\bf c}$\\
+ $\textit{fuse}\;bs\,_{bs'}\sum\textit{as}$ & $\dn$ &
+ $_{bs@bs'}\sum\textit{as}$\\
+ $\textit{fuse}\;bs\; _{bs'}a_1\cdot a_2$ & $\dn$ &
+ $_{bs@bs'}a_1 \cdot a_2$\\
+ $\textit{fuse}\;bs\,_{bs'}a^*$ & $\dn$ &
+ $_{bs @ bs'}a^*$
+\end{tabular}
+\end{center}
+
+\noindent
+With \emph{fuse} we are able to define the $\internalise$ function
+that translates a ``standard'' regular expression into an
+annotated regular expression.
+This function will be applied before we start
+with the derivative phase of the algorithm.
+
+\begin{center}
+\begin{tabular}{lcl}
+ $(\ZERO)^\uparrow$ & $\dn$ & $\ZERO$\\
+ $(\ONE)^\uparrow$ & $\dn$ & $_{[]}\ONE$\\
+ $(c)^\uparrow$ & $\dn$ & $_{[]}{\bf c}$\\
+ $(r_1 + r_2)^\uparrow$ & $\dn$ &
+ $_{[]}\sum[\textit{fuse}\,[Z]\,r_1^\uparrow,\,
+ \textit{fuse}\,[S]\,r_2^\uparrow]$\\
+ $(r_1\cdot r_2)^\uparrow$ & $\dn$ &
+ $_{[]}r_1^\uparrow \cdot r_2^\uparrow$\\
+ $(r^*)^\uparrow$ & $\dn$ &
+ $_{[]}(r^\uparrow)^*$\\
+\end{tabular}
+\end{center}
+\noindent
+We use an up arrow with postfix notation
+to denote this operation
+for convenience.
+The opposite of $\textit{internalise}$ is
+$\erase$, where all the bit-codes are removed,
+and the alternative operator $\sum$ for annotated
+regular expressions is transformed to the binary version
+in plain regular expressions.
+\begin{center}
+ \begin{tabular}{lcl}
+ $\ZERO_\downarrow$ & $\dn$ & $\ZERO$\\
+ $( _{bs}\ONE )_\downarrow$ & $\dn$ & $\ONE$\\
+ $( _{bs}\mathbf{c} )_\downarrow$ & $\dn$ & $\mathbf{c}$\\
+ $( _{bs} a_1 \cdot a_2 )_\downarrow$ & $\dn$ &
+ $ (a_1) _\downarrow \cdot (a_2) _\downarrow$\\
+ $( _{bs} [])_\downarrow $ & $\dn$ & $\ZERO $\\
+ $( _{bs} [a] )_\downarrow$ & $\dn$ & $a_\downarrow$\\
+ $_{bs} \sum [a_1, \; a_2]$ & $\dn$ & $ (a_1) _\downarrow + ( a_2 ) _\downarrow $\\
+ $(_{bs} \sum (a :: as))_\downarrow$ & $\dn$ & $ a_\downarrow + \; (_{[]} \sum as)_\downarrow$\\
+ $( _{bs} a^* )_\downarrow$ & $\dn$ & $(a_\downarrow)^*$
+ \end{tabular}
+\end{center}
+\noindent
+We also abbreviate the $\erase\; a$ operation
+as $a_\downarrow$, for conciseness.
+
+Testing whether an annotated regular expression
+contains the empty string in its lauguage requires
+a dedicated function $\bnullable$.
+$\bnullable$ simply calls $\erase$ before $\nullable$.
+\begin{definition}
+ $\bnullable \; a \dn \nullable \; (a_\downarrow)$
+\end{definition}
+The function for collecting
+bitcodes from a
+(b)nullable regular expression is called $\bmkeps$.
+$\bmkeps$ is a variation of the $\textit{mkeps}$ function,
+which follows the path $\mkeps$ takes to traverse the
+$\nullable$ branches when visiting a regular expression,
+but gives back bitcodes instead of a value.
+\begin{center}
+\begin{tabular}{lcl}
+ $\textit{bmkeps}\,(_{bs}\ONE)$ & $\dn$ & $bs$\\
+ $\textit{bmkeps}\,(_{bs}\sum a::\textit{as})$ & $\dn$ &
+ $\textit{if}\;\textit{bnullable}\,a$\\
+ & &$\textit{then}\;bs\,@\,\textit{bmkeps}\,a$\\
+ & &$\textit{else}\;bs\,@\,\textit{bmkeps}\,(_{[]}\sum \textit{as})$\\
+ $\textit{bmkeps}\,(_{bs} a_1 \cdot a_2)$ & $\dn$ &
+ $bs \,@\,\textit{bmkeps}\,a_1\,@\, \textit{bmkeps}\,a_2$\\
+ $\textit{bmkeps}\,(_{bs}a^*)$ & $\dn$ &
+ $bs \,@\, [Z]$
+\end{tabular}
+\end{center}
+\noindent
+$\bmkeps$ retrieves the value $v$'s
+information in the format
+of bitcodes, by travelling along the
+path of the regular expression that corresponds to a POSIX match,
+collecting all the bitcodes, and attaching $S$ to indicate the end of star
+iterations. \\
+The bitcodes extracted by $\bmkeps$ need to be
+$\decode$d (with the guidance of a plain regular expression):
+%\begin{definition}[Bitdecoding of Values]\mbox{}
+\begin{center}
+\begin{tabular}{@{}l@{\hspace{1mm}}c@{\hspace{1mm}}l@{}}
+ $\textit{decode}'\,bs\,(\ONE)$ & $\dn$ & $(\Empty, bs)$\\
+ $\textit{decode}'\,bs\,(c)$ & $\dn$ & $(\Char\,c, bs)$\\
+ $\textit{decode}'\,(Z\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
+ $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}\;
+ (\Left\,v, bs_1)$\\
+ $\textit{decode}'\,(S\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
+ $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_2\;\textit{in}\;
+ (\Right\,v, bs_1)$\\
+ $\textit{decode}'\,bs\;(r_1\cdot r_2)$ & $\dn$ &
+ $\textit{let}\,(v_1, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}$\\
+ & & $\textit{let}\,(v_2, bs_2) = \textit{decode}'\,bs_1\,r_2$\\
+ & & \hspace{35mm}$\textit{in}\;(\Seq\,v_1\,v_2, bs_2)$\\
+ $\textit{decode}'\,(Z\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\
+ $\textit{decode}'\,(S\!::\!bs)\,(r^*)$ & $\dn$ &
+ $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r\;\textit{in}$\\
+ & & $\textit{let}\,(\Stars\,vs, bs_2) = \textit{decode}'\,bs_1\,r^*$\\
+ & & \hspace{35mm}$\textit{in}\;(\Stars\,v\!::\!vs, bs_2)$\bigskip\\
+
+ $\textit{decode}\,bs\,r$ & $\dn$ &
+ $\textit{let}\,(v, bs') = \textit{decode}'\,bs\,r\;\textit{in}$\\
+ & & $\textit{if}\;bs' = []\;\textit{then}\;\textit{Some}\,v\;
+ \textit{else}\;\textit{None}$
+\end{tabular}
+\end{center}
\noindent
-The red area expands as we move towards $r_n$,
-indicating more is known about the lexing result.
-Despite knowing this partial lexing information during
-the forward derivative phase, we choose to store them
-all the way until $r_n$ is reached.
-Then we reconstruct the value character by character
-values at a later stage, using information in a Last-In-First-Out
-manner. Although the algorithm is elegant and natural,
-it can be inefficient and prone to stack overflow.\\
-It turns out we can store lexing
-information on the fly, while still using regular expression
-derivatives.
-If we remove the individual
-lexing steps, and use red and blue areas as before
-to indicate consumed (seen) input and constructed
-partial value (before recovering the rest of the stack),
-one could see that the seen part's lexical information
-is stored in the form of a regular expression.
+The function $\decode'$ returns a pair consisting of
+a partially decoded value and some leftover bit list that cannot
+be decide yet.
+The function $\decode'$ succeeds if the left-over
+bit-sequence is empty.
+$\decode$ is terminating as $\decode'$ is terminating.
+$\decode'$ is terminating
+because at least one of $\decode'$'s parameters will go down in terms
+of size.\\
+The reverse operation of $\decode$ is $\code$.
+$\textit{code}$ encodes a value into a bitcode by converting
+$\Left$ into $Z$, $\Right$ into $S$, and marks the start of any non-empty
+star iteration by $S$. The border where a star iteration
+terminates is marked by $Z$.
+This coding is lossy, as it throws away the information about
+characters, and does not encode the ``boundary'' between two
+sequence values. Moreover, with only the bitcode we cannot even tell
+whether the $S$s and $Z$s are for $\Left/\Right$ or $\Stars$.
+\begin{center}
+\begin{tabular}{lcl}
+ $\textit{code}(\Empty)$ & $\dn$ & $[]$\\
+ $\textit{code}(\Char\,c)$ & $\dn$ & $[]$\\
+ $\textit{code}(\Left\,v)$ & $\dn$ & $Z :: code(v)$\\
+ $\textit{code}(\Right\,v)$ & $\dn$ & $S :: code(v)$\\
+ $\textit{code}(\Seq\,v_1\,v_2)$ & $\dn$ & $code(v_1) \,@\, code(v_2)$\\
+ $\textit{code}(\Stars\,[])$ & $\dn$ & $[Z]$\\
+ $\textit{code}(\Stars\,(v\!::\!vs))$ & $\dn$ & $S :: code(v) \;@\;
+ code(\Stars\,vs)$
+\end{tabular}
+\end{center}
+\noindent
+Assume we have a value $v$ and regular expression $r$
+with $\vdash v:r$,
+then we have the property that $\decode$ and $\code$ are
+reverse operations of one another:
+\begin{lemma}
+\[If \vdash v : r \; then \;\decode \; (\code \; v) \; r = \textit{Some}(v) \]
+\end{lemma}
+\begin{proof}
+By proving a more general version of the lemma, on $\decode'$:
+\[\vdash v : r \implies \decode' \; ((\code \; v) @ ds) \; r = (v, ds) \]
+Then setting $ds$ to be $[]$ and unfolding $\decode$ definition,
+we obtain the property.
+\end{proof}
+\noindent
+Now we give out the central part of this lexing algorithm,
+the $\bder$ function (stands for \emph{b}itcoded-derivative):
+\begin{center}
+ \begin{tabular}{@{}lcl@{}}
+ $(\ZERO)\,\backslash c$ & $\dn$ & $\ZERO$\\
+ $(_{bs}\ONE)\,\backslash c$ & $\dn$ & $\ZERO$\\
+ $(_{bs}{\bf d})\,\backslash c$ & $\dn$ &
+ $\textit{if}\;c=d\; \;\textit{then}\;
+ _{bs}\ONE\;\textit{else}\;\ZERO$\\
+ $(_{bs}\sum \;\textit{as})\,\backslash c$ & $\dn$ &
+ $_{bs}\sum\;(\textit{map} \; (\_\backslash c) \; as )$\\
+ $(_{bs}\;a_1\cdot a_2)\,\backslash c$ & $\dn$ &
+ $\textit{if}\;\textit{bnullable}\,a_1$\\
+ & &$\textit{then}\;_{bs}\sum\,[(_{[]}\,(a_1\,\backslash c)\cdot\,a_2),$\\
+ & &$\phantom{\textit{then},\;_{bs}\sum\,}(\textit{fuse}\,(\textit{bmkeps}\,a_1)\,(a_2\,\backslash c))]$\\
+ & &$\textit{else}\;_{bs}\,(a_1\,\backslash c)\cdot a_2$\\
+ $(_{bs}a^*)\,\backslash c$ & $\dn$ &
+ $_{bs}(\textit{fuse}\, [Z] \; r\,\backslash c)\cdot
+ (_{[]}r^*))$
+\end{tabular}
+\end{center}
+\noindent
+For most time we use the infix notation $(\_\backslash\_)$
+to mean $\bder$ for brevity when
+there is no danger of confusion with derivatives on plain regular expressions.
+For example, we write $( _{[]}r^* ) \backslash c$ instead of $\bder \;c \; _{[]}r^*$,
+as the bitcodes at the front of $r^*$ indicates that it is
+a bit-coded regular expression, not a plain one.\\
+$\bder$ tells us how regular expressions can be recursively traversed,
+where the bitcodes are augmented and carried around
+when a derivative is taken.
+We give the intuition behind some of the more involved cases in
+$\bder$. \\
+For example,
+in the \emph{star} case,
+a derivative on $_{bs}a^*$ means
+that one more star iteratoin needs to be taken.
+we need to unfold it into a sequence,
+and attach an additional bit $Z$ to the front of $r \backslash c$
+as a record to indicate one new star iteration is unfolded.
+
+\noindent
+\begin{center}
+ \begin{tabular}{@{}lcl@{}}
+ $(_{bs}a^*)\,\backslash c$ & $\dn$ &
+ $_{bs}(\underbrace{\textit{fuse}\, [Z] \; a\,\backslash c}_{\text{One more iteration}})\cdot
+ (_{[]}a^*))$
+\end{tabular}
+\end{center}
+
+\noindent
+This information will be recovered later by the $\decode$ function.
+The intuition is that the bit $Z$ will be decoded at the right location,
+because we accumulate bits from left to right (a rigorous proof will be given
+later).
+
+%\begin{tikzpicture}[ > = stealth, % arrow head style
+% shorten > = 1pt, % don't touch arrow head to node
+% semithick % line style
+% ]
+%
+% \tikzstyle{every state}=[
+% draw = black,
+% thin,
+% fill = cyan!29,
+% minimum size = 7mm
+% ]
+% \begin{scope}[node distance=1cm and 0cm, every node/.style=state]
+% \node (k) [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
+% {$bs$
+% \nodepart{two} $a^*$ };
+% \node (l) [below =of k, rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
+% { $bs$ + [Z]
+% \nodepart{two} $(a\backslash c )\cdot a^*$ };
+% \end{scope}
+% \path[->]
+% (k) edge (l);
+%\end{tikzpicture}
+%
+%Pictorially the process looks like below.
+%Like before, the red region denotes
+%previous lexing information (stored as bitcodes in $bs$).
+
+%\begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}]
+% \begin{scope}[node distance=1cm]
+% \node (a) [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
+% {$bs$
+% \nodepart{two} $a^*$ };
+% \node (b) [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
+% { $bs$ + [Z]
+% \nodepart{two} $(a\backslash c )\cdot a^*$ };
+%%\caption{term 1 \ref{term:1}'s matching configuration}
+% \end{scope}
+%\end{tikzpicture}
+
+\noindent
+Another place the $\bder$ function differs
+from derivatives on plain regular expressions
+is the sequence case:
+\begin{center}
+ \begin{tabular}{@{}lcl@{}}
+
+ $(_{bs}\;a_1\cdot a_2)\,\backslash c$ & $\dn$ &
+ $\textit{if}\;\textit{bnullable}\,a_1$\\
+ & &$\textit{then}\;_{bs}\sum\,[(_{[]}\,(a_1\,\backslash c)\cdot\,a_2),$\\
+ & &$\phantom{\textit{then},\;_{bs}\sum\,}(\textit{fuse}\,(\textit{bmkeps}\,a_1)\,(a_2\,\backslash c))]$\\
+ & &$\textit{else}\;_{bs}\,(a_1\,\backslash c)\cdot a_2$
+\end{tabular}
+\end{center}
+\noindent
+The difference is that (when $a_1$ is $\bnullable$)
+we use $\bmkeps$ to store the lexing information
+in $a_1$ before collapsing
+it (as it has been fully matched by string prior to $c$),
+and attach the collected bit-codes to the front of $a_2$
+before throwing away $a_1$. We assume that $\bmkeps$
+correctly extracts the bitcode for how $a_1$
+matches the string prior to $c$ (more on this later).
+The bitsequence $\textit{bs}$ which was initially
+attached to the first element of the sequence
+$a_1 \cdot a_2$, has now been
+elevated to the top level of the $\sum$ constructor.
+This is because this piece of information will be needed
+whichever way the sequence is matched,
+regardless of whether $c$ belongs to $a_1$ or $a_2$.
+
+In the injection-based lexer, $r_1$ is immediately thrown away in
+subsequent derivatives on the right branch (when $r_1$ is $\nullable$),
+depite $r_1$ potentially storing information of previous matches:
+\begin{center}
+ $(r_1 \cdot r_2 )\backslash c = (r_1 \backslash c) \cdot r_2 + r_2 \backslash c$
+\end{center}
+\noindent
+this loss of information makes it necessary
+to store on stack all the regular expressions'
+``snapshots'' before they were
+taken derivative of.
+So that the related information will be available
+once the child recursive
+call finishes.\\
+The rest of the clauses of $\bder$ is rather similar to
+$\der$. \\
+Generalising the derivative operation with bitcodes to strings, we have
+\begin{center}
+ \begin{tabular}{@{}lcl@{}}
+ $a\backslash_s [] $ & $\dn$ & $a$\\
+ $a\backslash (c :: s) $ & $\dn$ & $(a \backslash c) \backslash_s s$
+ \end{tabular}
+\end{center}
+\noindent
+As we did earlier, we omit the $s$ subscript at $\backslash_s$ when there is no danger
+of confusion.
+\subsection{Putting Things Together}
+Putting these operations altogether, we obtain a lexer with bit-coded regular expressions
+as its internal data structures, which we call $\blexer$:
+
+\begin{center}
+\begin{tabular}{lcl}
+ $\textit{blexer}\;r\,s$ & $\dn$ &
+ $\textit{let}\;a = (r^\uparrow)\backslash s\;\textit{in}$\\
+ & & $\;\;\textit{if}\; \textit{bnullable}(a)$\\
+ & & $\;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,a)\,r$\\
+ & & $\;\;\textit{else}\;\textit{None}$
+\end{tabular}
+\end{center}
+
+\noindent
+Ausaf and Urban formally proved the correctness of the $\blexer$, namely
+\begin{property}
+$\blexer \;r \; s = \lexer \; r \; s$.
+\end{property}
+This was claimed but not formalised in Sulzmann and Lu's work.
+We introduce the proof later, after we give all
+the needed auxiliary functions and definitions.
+But before this we shall first walk the reader
+through a concrete example of our $\blexer$ calculating the right
+lexical information through bit-coded regular expressions.\\
Consider the regular expression $(aa)^* \cdot bc$ matching the string $aabc$
-and assume we have just read the two characters $aa$:
+and assume we have just read the first character $a$:
\begin{center}
\begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}]
\node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
- {Partial lexing info: $\ONE \cdot a \cdot (aa)^* \cdot bc$ etc.
- \nodepart{two} $\Seq(\ldots, \Seq(\Char(b), \Char(c)))$};
+ {$\ONE \cdot a \cdot (aa)^* \cdot bc$
+ \nodepart{two} $[] \iff \Seq \; (\Stars \; [\Seq\; a \; ??, \;]), ??$};
\end{tikzpicture}
\end{center}
\noindent
+We use the red area for (annotated) regular expressions and the blue
+area the (partial) bitcodes and (partial) values.
In the injection-based lexing algorithm, we ``neglect" the red area
by putting all the characters we have consumed and
intermediate regular expressions on the stack when
@@ -430,401 +910,6 @@
\end{center}
\noindent
-Using $S$ and $Z$ rather than $1$ and $0$ is to avoid
-confusion with the regular expressions $\ZERO$ and $\ONE$.
-Bitcodes (or
-bit-lists) can be used to encode values (or potentially incomplete values) in a
-compact form. This can be straightforwardly seen in the following
-coding function from values to bitcodes:
-\begin{center}
-\begin{tabular}{lcl}
- $\textit{code}(\Empty)$ & $\dn$ & $[]$\\
- $\textit{code}(\Char\,c)$ & $\dn$ & $[]$\\
- $\textit{code}(\Left\,v)$ & $\dn$ & $Z :: code(v)$\\
- $\textit{code}(\Right\,v)$ & $\dn$ & $S :: code(v)$\\
- $\textit{code}(\Seq\,v_1\,v_2)$ & $\dn$ & $code(v_1) \,@\, code(v_2)$\\
- $\textit{code}(\Stars\,[])$ & $\dn$ & $[Z]$\\
- $\textit{code}(\Stars\,(v\!::\!vs))$ & $\dn$ & $S :: code(v) \;@\;
- code(\Stars\,vs)$
-\end{tabular}
-\end{center}
-\noindent
-Here $\textit{code}$ encodes a value into a bit-code by converting
-$\Left$ into $Z$, $\Right$ into $S$, and marks the start of any non-empty
-star iteration by $S$. The border where a local star terminates
-is marked by $Z$.
-This coding is lossy, as it throws away the information about
-characters, and also does not encode the ``boundary'' between two
-sequence values. Moreover, with only the bitcode we cannot even tell
-whether the $S$s and $Z$s are for $\Left/\Right$ or $\Stars$. The
-reason for choosing this compact way of storing information is that the
-relatively small size of bits can be easily manipulated and ``moved
-around" in a regular expression.
-
-Because of the lossiness, the process of decoding a bitlist requires additionally
-a regular expression. The function $\decode$ is defined as:
-
-%\begin{definition}[Bitdecoding of Values]\mbox{}
-\begin{center}
-\begin{tabular}{@{}l@{\hspace{1mm}}c@{\hspace{1mm}}l@{}}
- $\textit{decode}'\,bs\,(\ONE)$ & $\dn$ & $(\Empty, bs)$\\
- $\textit{decode}'\,bs\,(c)$ & $\dn$ & $(\Char\,c, bs)$\\
- $\textit{decode}'\,(Z\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
- $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}\;
- (\Left\,v, bs_1)$\\
- $\textit{decode}'\,(S\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
- $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_2\;\textit{in}\;
- (\Right\,v, bs_1)$\\
- $\textit{decode}'\,bs\;(r_1\cdot r_2)$ & $\dn$ &
- $\textit{let}\,(v_1, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}$\\
- & & $\textit{let}\,(v_2, bs_2) = \textit{decode}'\,bs_1\,r_2$\\
- & & \hspace{35mm}$\textit{in}\;(\Seq\,v_1\,v_2, bs_2)$\\
- $\textit{decode}'\,(Z\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\
- $\textit{decode}'\,(S\!::\!bs)\,(r^*)$ & $\dn$ &
- $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r\;\textit{in}$\\
- & & $\textit{let}\,(\Stars\,vs, bs_2) = \textit{decode}'\,bs_1\,r^*$\\
- & & \hspace{35mm}$\textit{in}\;(\Stars\,v\!::\!vs, bs_2)$\bigskip\\
-
- $\textit{decode}\,bs\,r$ & $\dn$ &
- $\textit{let}\,(v, bs') = \textit{decode}'\,bs\,r\;\textit{in}$\\
- & & $\textit{if}\;bs' = []\;\textit{then}\;\textit{Some}\,v\;
- \textit{else}\;\textit{None}$
-\end{tabular}
-\end{center}
-%\end{definition}
-
-\noindent
-The function $\decode'$ returns a pair consisting of
-a partially decoded value and some leftover bit list that cannot
-be decide yet.
-The function $\decode'$ succeeds if the left-over
-bit-sequence is empty.
-$\decode$ is terminating as $\decode'$ is terminating.
-$\decode'$ is terminating
-because at least one of $\decode'$'s parameters will go down in terms
-of size.
-Assuming we have a value $v$ and regular expression $r$
-with $\vdash v:r$,
-then we have the property that $\decode$ and $\code$ are
-reverse operations of one another:
-\begin{lemma}
-\[If \vdash v : r \; then \;\decode \; (\code \; v) \; r = \textit{Some}(v) \]
-\end{lemma}
-\begin{proof}
-By proving a more general version of the lemma, on $\decode'$:
-\[\vdash v : r \implies \decode' \; ((\code \; v) @ ds) \; r = (v, ds) \]
-Then setting $ds$ to be $[]$ and unfolding $\decode$ definition,
-we obtain the property.
-\end{proof}
-With the $\code$ and $\decode$ functions in hand, we know how to
-switch between bit-codes and values.
-The next step is to integrate this information into regular expression.
-Attaching bits to the front of regular expressions is the solution Sulzamann and Lu
-gave for storing partial values in regular expressions.
-Annotated regular expressions are therefore defined as the Isabelle
-datatype:
-
-\begin{center}
-\begin{tabular}{lcl}
- $\textit{a}$ & $::=$ & $\ZERO$\\
- & $\mid$ & $_{bs}\ONE$\\
- & $\mid$ & $_{bs}{\bf c}$\\
- & $\mid$ & $_{bs}\sum\,as$\\
- & $\mid$ & $_{bs}a_1\cdot a_2$\\
- & $\mid$ & $_{bs}a^*$
-\end{tabular}
-\end{center}
-%(in \textit{ALTS})
-
-\noindent
-where $bs$ stands for bit-codes, $a$ for $\mathbf{a}$nnotated regular
-expressions and $as$ for lists of annotated regular expressions.
-The alternative constructor, written, $\sum$, has been generalised to
-accept a list of annotated regular expressions rather than just two.
-Why is it generalised? This is because when we open up nested
-alternatives, there could be more than two elements at the same level
-after de-duplication, which can no longer be stored in a binary
-constructor.
-
-The first operation we define related to bit-coded regular expressions
-is how we move bits to the inside of regular expressions.
-Called $\fuse$, this operation is attaches bit-codes
-to the front of an annotated regular expression:
-\begin{center}
-\begin{tabular}{lcl}
- $\textit{fuse}\;bs \; \ZERO$ & $\dn$ & $\ZERO$\\
- $\textit{fuse}\;bs\; _{bs'}\ONE$ & $\dn$ &
- $_{bs @ bs'}\ONE$\\
- $\textit{fuse}\;bs\;_{bs'}{\bf c}$ & $\dn$ &
- $_{bs@bs'}{\bf c}$\\
- $\textit{fuse}\;bs\,_{bs'}\sum\textit{as}$ & $\dn$ &
- $_{bs@bs'}\sum\textit{as}$\\
- $\textit{fuse}\;bs\; _{bs'}a_1\cdot a_2$ & $\dn$ &
- $_{bs@bs'}a_1 \cdot a_2$\\
- $\textit{fuse}\;bs\,_{bs'}a^*$ & $\dn$ &
- $_{bs @ bs'}a^*$
-\end{tabular}
-\end{center}
-
-\noindent
-With \emph{fuse} we are able to define the $\internalise$ function
-that translates a ``standard'' regular expression into an
-annotated regular expression.
-This function will be applied before we start
-with the derivative phase of the algorithm.
-
-\begin{center}
-\begin{tabular}{lcl}
- $(\ZERO)^\uparrow$ & $\dn$ & $\ZERO$\\
- $(\ONE)^\uparrow$ & $\dn$ & $_{[]}\ONE$\\
- $(c)^\uparrow$ & $\dn$ & $_{[]}{\bf c}$\\
- $(r_1 + r_2)^\uparrow$ & $\dn$ &
- $_{[]}\sum[\textit{fuse}\,[Z]\,r_1^\uparrow,\,
- \textit{fuse}\,[S]\,r_2^\uparrow]$\\
- $(r_1\cdot r_2)^\uparrow$ & $\dn$ &
- $_{[]}r_1^\uparrow \cdot r_2^\uparrow$\\
- $(r^*)^\uparrow$ & $\dn$ &
- $_{[]}(r^\uparrow)^*$\\
-\end{tabular}
-\end{center}
-%\end{definition}
-
-\noindent
-We use an up arrow with postfix notation
-to denote this operation.
-for convenience. The $\textit{internalise} \; r$
-notation is more cumbersome.
-The opposite of $\textit{internalise}$ is
-$\erase$, where all the bit-codes are removed,
-and the alternative operator $\sum$ for annotated
-regular expressions is transformed to the binary alternatives
-for plain regular expressions.
-\begin{center}
- \begin{tabular}{lcl}
- $\ZERO_\downarrow$ & $\dn$ & $\ZERO$\\
- $( _{bs}\ONE )_\downarrow$ & $\dn$ & $\ONE$\\
- $( _{bs}\mathbf{c} )_\downarrow$ & $\dn$ & $\mathbf{c}$\\
- $( _{bs} a_1 \cdot a_2 )_\downarrow$ & $\dn$ &
- $ (a_1) _\downarrow \cdot (a_2) _\downarrow$\\
- $( _{bs} [])_\downarrow $ & $\dn$ & $\ZERO $\\
- $( _{bs} [a] )_\downarrow$ & $\dn$ & $a_\downarrow$\\
- $_{bs} \sum [a_1, \; a_2]$ & $\dn$ & $ (a_1) _\downarrow + ( a_2 ) _\downarrow $\\
- $(_{bs} \sum (a :: as))_\downarrow$ & $\dn$ & $ a_\downarrow + \; (_{[]} \sum as)_\downarrow$\\
- $( _{bs} a^* )_\downarrow$ & $\dn$ & $(a_\downarrow)^*$
- \end{tabular}
-\end{center}
-\noindent
-We also abbreviate the $\erase\; a$ operation
-as $a_\downarrow$, for conciseness.
-For bit-coded regular expressions, as a different datatype,
-testing whether they contain empty string in their lauguage requires
-a dedicated function $\bnullable$
-which simply calls $\erase$ first before testing whether it is $\nullable$.
-\begin{definition}
- $\bnullable \; a \dn \nullable \; (a_\downarrow)$
-\end{definition}
-The function for collecting the
-bitcodes at the end of the derivative
-phase from a (b)nullable regular expression
-is a generalised version of the $\textit{mkeps}$ function
-for annotated regular expressions, called $\textit{bmkeps}$:
-
-
-%\begin{definition}[\textit{bmkeps}]\mbox{}
-\begin{center}
-\begin{tabular}{lcl}
- $\textit{bmkeps}\,(_{bs}\ONE)$ & $\dn$ & $bs$\\
- $\textit{bmkeps}\,(_{bs}\sum a::\textit{as})$ & $\dn$ &
- $\textit{if}\;\textit{bnullable}\,a$\\
- & &$\textit{then}\;bs\,@\,\textit{bmkeps}\,a$\\
- & &$\textit{else}\;bs\,@\,\textit{bmkeps}\,(_{[]}\sum \textit{as})$\\
- $\textit{bmkeps}\,(_{bs} a_1 \cdot a_2)$ & $\dn$ &
- $bs \,@\,\textit{bmkeps}\,a_1\,@\, \textit{bmkeps}\,a_2$\\
- $\textit{bmkeps}\,(_{bs}a^*)$ & $\dn$ &
- $bs \,@\, [Z]$
-\end{tabular}
-\end{center}
-%\end{definition}
-
-\noindent
-$\bmkeps$ completes the value information by travelling along the
-path of the regular expression that corresponds to a POSIX value and
-collecting all the bitcodes, and attaching $S$ to indicate the end of star
-iterations.
-
-Now we give out the central part of this lexing algorithm,
-the $\bder$ function (stands for \emph{b}itcoded-derivative).
-For most time we use the infix notation $(\_\backslash\_)$
-to mean $\bder$ for brevity when
-there is no danger of confusion with derivatives on plain regular expressions.
-For example, we write $( _{[]}r^* ) \backslash c$ instead of $\bder \;c \; _{[]}r^*$,
-as the bitcodes at the front of $r^*$ indicates that it is
-a bit-coded regular expression, not a plain one.
-$\bder$ tells us how regular expressions can be recursively traversed,
-where the bitcodes are augmented and carried around
-when a derivative is taken.
-\begin{center}
- \begin{tabular}{@{}lcl@{}}
- $(\ZERO)\,\backslash c$ & $\dn$ & $\ZERO$\\
- $(_{bs}\ONE)\,\backslash c$ & $\dn$ & $\ZERO$\\
- $(_{bs}{\bf d})\,\backslash c$ & $\dn$ &
- $\textit{if}\;c=d\; \;\textit{then}\;
- _{bs}\ONE\;\textit{else}\;\ZERO$\\
- $(_{bs}\sum \;\textit{as})\,\backslash c$ & $\dn$ &
- $_{bs}\sum\;(\textit{map} \; (\_\backslash c) \; as )$\\
- $(_{bs}\;a_1\cdot a_2)\,\backslash c$ & $\dn$ &
- $\textit{if}\;\textit{bnullable}\,a_1$\\
- & &$\textit{then}\;_{bs}\sum\,[(_{[]}\,(a_1\,\backslash c)\cdot\,a_2),$\\
- & &$\phantom{\textit{then},\;_{bs}\sum\,}(\textit{fuse}\,(\textit{bmkeps}\,a_1)\,(a_2\,\backslash c))]$\\
- & &$\textit{else}\;_{bs}\,(a_1\,\backslash c)\cdot a_2$\\
- $(_{bs}a^*)\,\backslash c$ & $\dn$ &
- $_{bs}(\textit{fuse}\, [Z] \; r\,\backslash c)\cdot
- (_{[]}r^*))$
-\end{tabular}
-\end{center}
-\noindent
-We give the intuition behind some of the more involved cases in
-$\bder$. For example,
-in the \emph{star} case,
-a derivative on $_{bs}a^*$ means
-that one more star iteratoin needs to be taken.
-we need to unfold it into a sequence,
-and attach an additional bit $Z$ to the front of $r \backslash c$
-as a record to indicate one new star iteration is unfolded.
-
-\noindent
-\begin{center}
- \begin{tabular}{@{}lcl@{}}
- $(_{bs}a^*)\,\backslash c$ & $\dn$ &
- $_{bs}(\underbrace{\textit{fuse}\, [Z] \; a\,\backslash c}_{\text{One more iteration}})\cdot
- (_{[]}a^*))$
-\end{tabular}
-\end{center}
-
-\noindent
-This information will be recovered later by the $\decode$ function.
-The intuition is that the bit $Z$ will be decoded at the right location,
-because we accumulate bits from left to right (a rigorous proof will be given
-later).
-
-\begin{tikzpicture}[ > = stealth, % arrow head style
- shorten > = 1pt, % don't touch arrow head to node
- semithick % line style
- ]
-
- \tikzstyle{every state}=[
- draw = black,
- thin,
- fill = cyan!29,
- minimum size = 7mm
- ]
- \begin{scope}[node distance=1cm and 0cm, every node/.style=state]
- \node (k) [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
- {$bs$
- \nodepart{two} $a^*$ };
- \node (l) [below =of k, rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
- { $bs$ + [Z]
- \nodepart{two} $(a\backslash c )\cdot a^*$ };
- \end{scope}
- \path[->]
- (k) edge (l);
-\end{tikzpicture}
-
-Pictorially the process looks like below.
-Like before, the red region denotes
-previous lexing information (stored as bitcodes in $bs$).
-
-\begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}]
- \begin{scope}[node distance=1cm]
- \node (a) [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
- {$bs$
- \nodepart{two} $a^*$ };
- \node (b) [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},]
- { $bs$ + [Z]
- \nodepart{two} $(a\backslash c )\cdot a^*$ };
-%\caption{term 1 \ref{term:1}'s matching configuration}
- \end{scope}
-\end{tikzpicture}
-
-\noindent
-Another place in the $\bder$ function where it differs
-from normal derivatives (on un-annotated regular expressions)
-is the sequence case:
-\begin{center}
- \begin{tabular}{@{}lcl@{}}
-
- $(_{bs}\;a_1\cdot a_2)\,\backslash c$ & $\dn$ &
- $\textit{if}\;\textit{bnullable}\,a_1$\\
- & &$\textit{then}\;_{bs}\sum\,[(_{[]}\,(a_1\,\backslash c)\cdot\,a_2),$\\
- & &$\phantom{\textit{then},\;_{bs}\sum\,}(\textit{fuse}\,(\textit{bmkeps}\,a_1)\,(a_2\,\backslash c))]$\\
- & &$\textit{else}\;_{bs}\,(a_1\,\backslash c)\cdot a_2$
-\end{tabular}
-\end{center}
-
-The difference is that (when $a_1$ is $\bnullable$)
-we use $\bmkeps$ to store the lexing information
-in $a_1$ before collapsing it (as it has been fully matched by string prior to $c$,
-and attach the collected bit-codes to the front of $a_2$
-before throwing away $a_1$. We assume that $\bmkeps$ correctly extracts the bitcode for how $a_1$
-matches the string prior to $c$ (more on this later).
-The bitsequence $\textit{bs}$ which was initially attached to the first element of the sequence
-$a_1 \cdot a_2$, has now been elevated to the top level of teh $\sum$.
-This is because this piece of information will be needed whichever way the sequence is matched,
-regardless of whether $c$ belongs to $a_1$ or $a_2$.
-
-In the injection-based lexing, $r_1$ is immediately thrown away in
-subsequent derivatives on the right branch (when $r_1$ is $\nullable$),
-\begin{center}
- $(r_1 \cdot r_2 )\backslash c = (r_1 \backslash c) \cdot r_2 + r_2 \backslash c$
-\end{center}
-\noindent
-as it knows $r_1$ is stored on stack and available once the recursive
-call to later derivatives finish.
-Therefore, if the $\Right$ branch is taken in a $\POSIX$ match,
-we construct back the sequence value once step back by
-calling a on $\mkeps(r_1)$.
-\begin{center}
- \begin{tabular}{lcr}
- $\ldots r_1 \cdot r_2$ & $\rightarrow$ & $r_1\cdot r_2 + r_2 \backslash c \ldots $\\
- $\ldots \Seq(v_1, v_2) (\Seq(\mkeps(r1), (\inj \; r_2 \; c\; v_{2c})))$ & $\leftarrow$ & $\Right(v_{2c})\ldots$
- \end{tabular}
-\end{center}
-\noindent
-The rest of the clauses of $\bder$ is rather similar to
-$\der$, and is put together here as a wholesome definition
-for $\bder$:
-\noindent
-Generalising the derivative operation with bitcodes to strings, we have
-\begin{center}
- \begin{tabular}{@{}lcl@{}}
- $a\backslash_s [] $ & $\dn$ & $a$\\
- $a\backslash (c :: s) $ & $\dn$ & $(a \backslash c) \backslash_s s$
- \end{tabular}
-\end{center}
-As we did earlier, we omit the $s$ subscript at $\backslash_s$ when there is no danger
-of confusion.
-Putting this all together, we obtain a lexer with bit-coded regular expressions
-as its internal data structures, which we call $\blexer$:
-
-\begin{center}
-\begin{tabular}{lcl}
- $\textit{blexer}\;r\,s$ & $\dn$ &
- $\textit{let}\;a = (r^\uparrow)\backslash s\;\textit{in}$\\
- & & $\;\;\textit{if}\; \textit{bnullable}(a)$\\
- & & $\;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,a)\,r$\\
- & & $\;\;\textit{else}\;\textit{None}$
-\end{tabular}
-\end{center}
-
-\noindent
-Ausaf and Urban formally proved the correctness of the $\blexer$, namely
-\begin{conjecture}
-$\blexer \;r \; s = \lexer \; r \; s$.
-\end{conjecture}
-This was claimed but not formalised in Sulzmann and Lu's work.
-We introduce the proof later, after we give out all
-the needed auxiliary functions and definitions
-
%-----------------------------------
% SUBSECTION 1