# HG changeset patch # User Chengsong # Date 1660773555 -3600 # Node ID e0f0a81f907be87ed77adfc7c597a616b1227d35 # Parent 35df9cdd36ca00a49af21cd3fbf3db5304c26e9a halfway chap3 diff -r 35df9cdd36ca -r e0f0a81f907b ChengsongTanPhdThesis/Chapters/Bitcoded1.tex --- a/ChengsongTanPhdThesis/Chapters/Bitcoded1.tex Wed Aug 17 01:09:13 2022 +0100 +++ b/ChengsongTanPhdThesis/Chapters/Bitcoded1.tex Wed Aug 17 22:59:15 2022 +0100 @@ -15,7 +15,7 @@ in their algorithm such as de-duplication not working properly and redundant fixpoint construction. Their algorithm is improved and verified with the help of formal proofs. -\section{Bit-coded Algorithm} +\section{The Motivation Behind Using Bitcodes} We first do a recap of what was going on in the lexer algorithm in Chapter \ref{Inj}, \begin{center} @@ -35,7 +35,7 @@ and characters: $r_0$, $c_0$, $r_1$, $c_1$, etc. Each descent into deeper recursive calls in $\lexer$ causes a new pair of $r_i, c_i$ to be pushed to the call stack. -\begin{figure} +\begin{figure}[H] \begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick] %\draw (-6,-6) grid (6,6); \node [ circle ] (r) at (-6, 5) {$r$}; @@ -64,7 +64,7 @@ -\begin{figure} +\begin{figure}[H] \begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick] %\draw (-6,-6) grid (6,6); \node [ circle ] (r) at (-6, 5) {$r$}; @@ -101,7 +101,7 @@ As the number of derivative steps increase, the stack would increase: -\begin{figure} +\begin{figure}[H] \begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick] %\draw (-6,-6) grid (6,6); \node [ circle ] (r) at (-6, 5) {$r$}; @@ -145,11 +145,9 @@ \end{tikzpicture} \caption{More Derivatives Taken} \end{figure} -\noindent -After all derivatives have been taken, the stack grows to a maximum size -and the pair of regular expressions and characters $r_i, c_{i+1}$ -are then popped out and used in the injection phase: + +\begin{figure}[H] \begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick] %\draw (-6,-6) grid (6,6); \node [radius = 0.5, circle, draw] (r) at (-6, 5) {$r$}; @@ -174,18 +172,8 @@ \node at ($(ldots)!.4!(rn)$) {\ldots}; -\node [minimum size = 0.5, circle, draw] (vn) at (6, -5) {$v_n$}; - -\node [] (ldots2) at (3.5, -5) {}; - -\node [minimum size = 0.5, circle, draw] (v2) at (2, -5) {$v_2$}; - -\node at ($(ldots2)!.4!(v2)$) {\ldots}; -\node [circle, draw] (v1) at (-2, -5) {$v_1$}; - -\node [radius = 0.5, circle, draw] (v) at (-6, -5) {$v$}; \node [minimum size = 6cm, rectangle, draw] (stack) at (0, 0) {\Huge Stack}; @@ -197,15 +185,6 @@ edge [] node {} (r2); \path (r2) edge [] node {} (ldots); -\path (rn) - edge [] node {$\mkeps$} (vn); -\path (vn) - edge [] node {} (ldots2); -\path (v2) - edge [] node {$\inj \; r_1 \; c_2\;v_2$} (v1); - -\path (v1) - edge [] node {$\inj \; r \; c_1 \; v_1$} (v); \path (r) edge [dashed, bend right] node {} (stack); @@ -228,133 +207,634 @@ \path (rn) edge [dashed, bend left] node {} (stack); \end{tikzpicture} -%\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick] -%%\draw (-6,-6) grid (6,6); -%\node [radius = 0.5, circle, draw] (r) at (-6, 5) {$r$}; +\caption{Before Injection Phase Starts} +\end{figure} + + +\noindent +After all derivatives have been taken, the stack grows to a maximum size +and the pair of regular expressions and characters $r_i, c_{i+1}$ +are then popped out and used in the injection phase. + + + +\begin{figure}[H] +\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick] +%\draw (-6,-6) grid (6,6); +\node [radius = 0.5, circle, ] (r) at (-6, 5) {$r$}; + +%\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$}; +\node [circle, minimum size = 0.1, ] (c1) at (-4, 5.4) {$c_1$}; % -%%\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$}; -%\node [circle, minimum size = 0.1] (c1) at (-4, 5.4) {$c_1$}; -%% -%%\node (-2, 5) (r1) circle [radius = 0.5] {$r_1$}; -%\node [minimum size = 0.5, circle, draw] (r1) at (-2, 5) {$r_1$}; -%% -%%\node (0, 6) (c2) circle [radius = 0.3] {$c_2$}; -%\node [circle, minimum size = 0.1] (c2) at (0, 5.4) {$c_2$}; -%% -%%\node (2, 5) (r2) circle [radius = 0.5] {$r_2$}; -%\node [circle, draw] (r2) at (2, 5) {$r_2$}; -%% -%% -%\node [] (ldots) at (4.5, 5) {}; -%%\node (6, 5) (rn) circle [radius = 0.5] {$r_n$}; +%\node (-2, 5) (r1) circle [radius = 0.5] {$r_1$}; +\node [minimum size = 0.5, circle, ] (r1) at (-2, 5) {$r_1$}; % -%\node [minimum size = 0.5, circle, draw] (rn) at (6, 5) {$r_n$}; -% -%\node at ($(ldots)!.4!(rn)$) {\ldots}; -% -%\node [minimum size = 0.5, circle, draw] (vn) at (6, -5) {$v_n$}; +%\node (0, 6) (c2) circle [radius = 0.3] {$c_2$}; +\node [circle, minimum size = 0.1, ] (c2) at (0, 5.4) {$c_2$}; % -%\node [] (ldots2) at (3.5, -5) {}; -% -%\node [minimum size = 0.5, circle, draw] (v2) at (2, -5) {$v_2$}; -% -%\node at ($(ldots2)!.4!(v2)$) {\ldots}; +%\node (2, 5) (r2) circle [radius = 0.5] {$r_2$}; +\node [circle, ] (r2) at (2, 5) {$r_2$}; % % -%\node [circle, draw] (v1) at (-2, -5) {$v_1$}; -% -%\node [radius = 0.5, circle, draw] (v) at (-6, -5) {$v$}; -% -%\node [minimum size = 6cm, rectangle, draw] (stack) at (0, 0) {\Huge Stack}; +\node [] (ldots) at (4.5, 5) {}; +%\node (6, 5) (rn) circle [radius = 0.5] {$r_n$}; + +\node [minimum size = 0.5, circle, ] (rn) at (6, 5) {$r_n$}; + +\node at ($(ldots)!.4!(rn)$) {\ldots}; + +\node [minimum size = 0.5, circle, ] (vn) at (6, -5) {$v_n$}; + +\node [] (ldots2) at (3.5, -5) {}; + +\node [minimum size = 0.5, circle, ] (v2) at (2, -5) {$v_2$}; + +\node at ($(ldots2)!.4!(v2)$) {\ldots}; + + +\node [circle, ] (v1) at (-2, -5) {$v_1$}; + +\node [radius = 0.5, circle, ] (v) at (-6, -5) {$v$}; + +\node [minimum size = 6cm, rectangle, draw] (stack) at (0, 0) {\Huge Stack}; + +\path + (r) + edge [->, >=stealth',shorten >=1pt] node[left] {} (r1); +\path + (r1) + edge [] node {} (r2); +\path (r2) + edge [] node {} (ldots); +\path (rn) + edge [] node {$\mkeps$} (vn); +\path (vn) + edge [] node {} (ldots2); +\path (v2) + edge [] node {$\inj \; r_1 \; c_2\;v_2$} (v1); + +\path (v1) + edge [] node {$\inj \; r \; c_1 \; v_1$} (v); + +\path (stack) + edge [dashed] node {} (-4.2, -5.2); +\path (stack) + edge [dashed] node {} (-4, -5.2); +\path (stack) + edge [dashed] node {} (-0.1, -5.2); +\path (stack) + edge [dashed] node {} (0.2, -5.26); +\path (stack) + edge [dashed] node {} (3.2, -5); +\path (stack) + edge [dashed] node {} (2.7, -5); +\path (stack) + edge [dashed] node {} (3.7, -5); +\end{tikzpicture} +\caption{Stored $r_i, c_{i+1}$ Used by $\inj$} +\end{figure} +\noindent +Storing all $r_i, c_{i+1}$ pairs recursively +allows the algorithm to work in an elegant way, at the expense of +storing quite a bit of verbose information. +The stack seems to grow at least quadratically fast with respect +to the input (not taking into account the size bloat of $r_i$), +which can be inefficient and prone to stack overflow. +\section{Bitcoded Algorithm} +To address this, +Sulzmann and Lu chose to define a new datatype +called \emph{annotated regular expression}, +which condenses all the partial lexing information +(that was originally stored in $r_i, c_{i+1}$ pairs) +into bitcodes. +Annotated regular expressions +are defined as the following +Isabelle datatype \footnote{ We use subscript notation to indicate + that the bitcodes are auxiliary information that do not +interfere with the structure of the regular expressions }: +\begin{center} +\begin{tabular}{lcl} + $\textit{a}$ & $::=$ & $\ZERO$\\ + & $\mid$ & $_{bs}\ONE$\\ + & $\mid$ & $_{bs}{\bf c}$\\ + & $\mid$ & $_{bs}\sum\,as$\\ + & $\mid$ & $_{bs}a_1\cdot a_2$\\ + & $\mid$ & $_{bs}a^*$ +\end{tabular} +\end{center} +\noindent +where $bs$ stands for bit-codes, $a$ for $\mathbf{a}$nnotated regular +expressions and $as$ for lists of annotated regular expressions. +The alternative constructor, written, $\sum$, has been generalised to +accept a list of annotated regular expressions rather than just two. +Why is it generalised? This is because when we open up nested +alternatives, there could be more than two elements at the same level +after de-duplication, which can no longer be stored in a binary +constructor. +Bits and bitcodes (lists of bits) are defined as: +\begin{center} + $b ::= S \mid Z \qquad +bs ::= [] \mid b::bs +$ +\end{center} +\noindent +We use $S$ and $Z$ rather than $1$ and $0$ is to avoid +confusion with the regular expressions $\ZERO$ and $\ONE$. +The idea is to use the attached bitcodes +to indicate which choice was made at a certain point +during lexing. +For example, +$(_{Z}a+_{S}b) \backslash a$ gives us +$_{Z}\ONE + \ZERO$, this $Z$ bitcode will +later be used to decode that a left branch was +selected at the time when the part $a+b$ is being taken +derivative of. +\subsection{A Bird's Eye View of the Bit-coded Lexer} +Before we give out the rest of the functions and definitions +related to our +$\blexer$ (\emph{b}-itcoded lexer), we first provide a high-level +view of the algorithm, so the reader does not get lost in +the details. +\begin{figure}[H] +\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick] +%\draw (-6,-6) grid (6,6); + + \node [circle, draw] (r0) at (-6, 2) {$r$}; + + \node [radius = 0.5, circle, draw] (r) at (-6, 5) {$_{bs}a$}; + \path (r0) + edge [] node {$\internalise$} (r); +%\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$}; +\node [circle, minimum size = 0.1, draw] (c1) at (-4, 5.4) {$c_1$}; % -%\path -% (r) -% edge [->, >=stealth',shorten >=1pt] node[left] {} (r1); -%\path -% (r1) -% edge [] node {} (r2); -%\path (r2) -% edge [] node {} (ldots); -%\path (rn) -% edge [] node {$\mkeps$} (vn); -%\path (vn) -% edge [] node {} (ldots2); -%\path (v2) -% edge [] node {} (v1); +%\node (-2, 5) (r1) circle [radius = 0.5] {$r_1$}; +\node [minimum size = 1.0cm, circle, draw] (r1) at (-2, 5) {$_{bs_1}a_1$}; +% +%\node (0, 6) (c2) circle [radius = 0.3] {$c_2$}; +\node [circle, minimum size = 0.1, draw] (c2) at (0, 5.4) {$c_2$}; +% +%\node (2, 5) (r2) circle [radius = 0.5] {$r_2$}; +\node [circle, draw, minimum size = 1.4cm] (r2) at (2, 5) {$_{bs_2}a_2$}; +% +% +\node [] (ldots) at (4.5, 5) {}; +%\node (6, 5) (rn) circle [radius = 0.5] {$r_n$}; + +\node [minimum size = 2.2cm, circle, draw] (rn) at (6, 5) {$_{bs_n}a_n$}; + +\node at ($(ldots)!.1!(rn)$) {\ldots}; + +\node [minimum size = 0.5, circle, ] (v) at (6, 2) {$v$}; + +%\node [] (v2) at (4, -5) {}; +% +%\node [draw, cross out] (ldots2) at (5, -5) {}; % -%\path (v1) -% edge [] node {} (v); -%\path (r) -% edge [] node {saved} (stack); -% -%\path (r1) -% edge [] node {saved} (stack); -%\end{tikzpicture} -\noindent -The information stored in characters and regular expressions -make the algorithm work in an elegant way, at the expense of being -storing quite a bit of verbose information. +%\node at ($(ldots2)!.4!(v2)$) {\ldots}; + + +\node [align = center] (decode) at (6.6, 3.2) {$\bmkeps$\\$\decode$}; + +\path (c1) + edge [dashed, bend left] node {} (r0); + +\path (c2) + edge [dashed, bend left] node {} (r0); + +\path (r1) + edge [dashed, bend right] node {} (r2); + + +\path + (r) + edge [dashed, bend right] node[left] {} (r1); + +\path + (r) + edge [->, >=stealth',shorten >=1pt] node[left] {} (r1); + +\path + (r1) + edge [] node {} (r2); +\path (r2) + edge [] node {} (ldots); +\path (rn) + edge [] node {} (v); + +\path (r0) + edge [dashed, bend right] node {} (decode); +%\path (v) + %edge [] node {} (ldots2); -The lexer algorithm in Chapter \ref{Inj}, -stores information of previous lexing steps -on a stack, in the form of regular expressions -and characters: $r_0$, $c_0$, $r_1$, $c_1$, etc. -The red part represents what we already know during the first -derivative phase, -and the blue part represents the unknown part of input. -\begin{ceqn} -\begin{equation}%\label{graph:injLexer} - \begin{tikzcd}[ampersand replacement=\&, execute at end picture={ - \begin{scope}[on background layer] - \node[rectangle, fill={red!30}, - pattern=north east lines, pattern color=red, - fit={(-3,-1) (-3, 1) (1, -1) - (1, 1)} - ] - {}; , - \node[rectangle, fill={blue!20}, - pattern=north east lines, pattern color=blue, - fit= {(1, -1) (1, 1) (3, -1) (3, 1)} - ] - {}; - \end{scope}} - ] -r_0 \arrow[r, "\backslash c_0"] \arrow[d] \& r_1 \arrow[r, "\backslash c_1"] \arrow[d] \& r_2 \arrow[r, dashed] \arrow[d] \& r_n \arrow[d, "mkeps" description] \\ -v_0 \& v_1 \arrow[l,"inj_{r_0} c_0"] \& v_2 \arrow[l, "inj_{r_1} c_1"] \& v_n \arrow[l, dashed] \\ -\end{tikzcd} -\end{equation} -\end{ceqn} +\end{tikzpicture} +\caption{Bird's Eye View of $\blexer$} +\end{figure} +\noindent +The plain regular expressions +are first ``lifted'' to an annotated regular expression, +with the function called \emph{internalise}. +Then the annotated regular expression $_{bs}a$ will +go through successive derivatives with respect +to the input stream of characters $c_1, c_2$ etc. +Each time a derivative is taken, the bitcodes +are moved around, discarded or augmented together +with the regular expression they are attached to. +After all input has been consumed, the +bitcodes are collected by $\bmkeps$, +which traverses the nullable part of the regular expression +and collect the bitcodes along the way. +The collected bitcodes are then $\decode$d with the guidance +of the input regular expression $r$. +The most notable improvements of $\blexer$ +over $\lexer$ are +\begin{itemize} + \item + + An absence of the second injection phase. + \item + One need not store each pair of the + intermediate regular expressions $_{bs_i}a_i$ and $c_{i+1}$. + The previous annotated regular expression $_{bs_i}a_i$'s information is passed + on without loss to its successor $_{bs_{i+1}}a_{i+1}$, + and $c_{i+1}$'s already contained in the strings in $L\;r$ \footnote{ + which can be easily recovered if we decode in the correct order}. + \item + The simplification works much better--one can maintain correctness + while applying quite strong simplifications, as we shall wee. +\end{itemize} +\noindent +In the next section we will +give the operations needed in $\blexer$, +with some remarks on the idea behind their definitions. +\subsection{Operations in $\textit{Blexer}$} +The first operation we define related to bit-coded regular expressions +is how we move bits to the inside of regular expressions. +Called $\fuse$, this operation attaches bit-codes +to the front of an annotated regular expression: +\begin{center} +\begin{tabular}{lcl} + $\textit{fuse}\;bs \; \ZERO$ & $\dn$ & $\ZERO$\\ + $\textit{fuse}\;bs\; _{bs'}\ONE$ & $\dn$ & + $_{bs @ bs'}\ONE$\\ + $\textit{fuse}\;bs\;_{bs'}{\bf c}$ & $\dn$ & + $_{bs@bs'}{\bf c}$\\ + $\textit{fuse}\;bs\,_{bs'}\sum\textit{as}$ & $\dn$ & + $_{bs@bs'}\sum\textit{as}$\\ + $\textit{fuse}\;bs\; _{bs'}a_1\cdot a_2$ & $\dn$ & + $_{bs@bs'}a_1 \cdot a_2$\\ + $\textit{fuse}\;bs\,_{bs'}a^*$ & $\dn$ & + $_{bs @ bs'}a^*$ +\end{tabular} +\end{center} + +\noindent +With \emph{fuse} we are able to define the $\internalise$ function +that translates a ``standard'' regular expression into an +annotated regular expression. +This function will be applied before we start +with the derivative phase of the algorithm. + +\begin{center} +\begin{tabular}{lcl} + $(\ZERO)^\uparrow$ & $\dn$ & $\ZERO$\\ + $(\ONE)^\uparrow$ & $\dn$ & $_{[]}\ONE$\\ + $(c)^\uparrow$ & $\dn$ & $_{[]}{\bf c}$\\ + $(r_1 + r_2)^\uparrow$ & $\dn$ & + $_{[]}\sum[\textit{fuse}\,[Z]\,r_1^\uparrow,\, + \textit{fuse}\,[S]\,r_2^\uparrow]$\\ + $(r_1\cdot r_2)^\uparrow$ & $\dn$ & + $_{[]}r_1^\uparrow \cdot r_2^\uparrow$\\ + $(r^*)^\uparrow$ & $\dn$ & + $_{[]}(r^\uparrow)^*$\\ +\end{tabular} +\end{center} +\noindent +We use an up arrow with postfix notation +to denote this operation +for convenience. +The opposite of $\textit{internalise}$ is +$\erase$, where all the bit-codes are removed, +and the alternative operator $\sum$ for annotated +regular expressions is transformed to the binary version +in plain regular expressions. +\begin{center} + \begin{tabular}{lcl} + $\ZERO_\downarrow$ & $\dn$ & $\ZERO$\\ + $( _{bs}\ONE )_\downarrow$ & $\dn$ & $\ONE$\\ + $( _{bs}\mathbf{c} )_\downarrow$ & $\dn$ & $\mathbf{c}$\\ + $( _{bs} a_1 \cdot a_2 )_\downarrow$ & $\dn$ & + $ (a_1) _\downarrow \cdot (a_2) _\downarrow$\\ + $( _{bs} [])_\downarrow $ & $\dn$ & $\ZERO $\\ + $( _{bs} [a] )_\downarrow$ & $\dn$ & $a_\downarrow$\\ + $_{bs} \sum [a_1, \; a_2]$ & $\dn$ & $ (a_1) _\downarrow + ( a_2 ) _\downarrow $\\ + $(_{bs} \sum (a :: as))_\downarrow$ & $\dn$ & $ a_\downarrow + \; (_{[]} \sum as)_\downarrow$\\ + $( _{bs} a^* )_\downarrow$ & $\dn$ & $(a_\downarrow)^*$ + \end{tabular} +\end{center} +\noindent +We also abbreviate the $\erase\; a$ operation +as $a_\downarrow$, for conciseness. + +Testing whether an annotated regular expression +contains the empty string in its lauguage requires +a dedicated function $\bnullable$. +$\bnullable$ simply calls $\erase$ before $\nullable$. +\begin{definition} + $\bnullable \; a \dn \nullable \; (a_\downarrow)$ +\end{definition} +The function for collecting +bitcodes from a +(b)nullable regular expression is called $\bmkeps$. +$\bmkeps$ is a variation of the $\textit{mkeps}$ function, +which follows the path $\mkeps$ takes to traverse the +$\nullable$ branches when visiting a regular expression, +but gives back bitcodes instead of a value. +\begin{center} +\begin{tabular}{lcl} + $\textit{bmkeps}\,(_{bs}\ONE)$ & $\dn$ & $bs$\\ + $\textit{bmkeps}\,(_{bs}\sum a::\textit{as})$ & $\dn$ & + $\textit{if}\;\textit{bnullable}\,a$\\ + & &$\textit{then}\;bs\,@\,\textit{bmkeps}\,a$\\ + & &$\textit{else}\;bs\,@\,\textit{bmkeps}\,(_{[]}\sum \textit{as})$\\ + $\textit{bmkeps}\,(_{bs} a_1 \cdot a_2)$ & $\dn$ & + $bs \,@\,\textit{bmkeps}\,a_1\,@\, \textit{bmkeps}\,a_2$\\ + $\textit{bmkeps}\,(_{bs}a^*)$ & $\dn$ & + $bs \,@\, [Z]$ +\end{tabular} +\end{center} +\noindent +$\bmkeps$ retrieves the value $v$'s +information in the format +of bitcodes, by travelling along the +path of the regular expression that corresponds to a POSIX match, +collecting all the bitcodes, and attaching $S$ to indicate the end of star +iterations. \\ +The bitcodes extracted by $\bmkeps$ need to be +$\decode$d (with the guidance of a plain regular expression): +%\begin{definition}[Bitdecoding of Values]\mbox{} +\begin{center} +\begin{tabular}{@{}l@{\hspace{1mm}}c@{\hspace{1mm}}l@{}} + $\textit{decode}'\,bs\,(\ONE)$ & $\dn$ & $(\Empty, bs)$\\ + $\textit{decode}'\,bs\,(c)$ & $\dn$ & $(\Char\,c, bs)$\\ + $\textit{decode}'\,(Z\!::\!bs)\;(r_1 + r_2)$ & $\dn$ & + $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}\; + (\Left\,v, bs_1)$\\ + $\textit{decode}'\,(S\!::\!bs)\;(r_1 + r_2)$ & $\dn$ & + $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_2\;\textit{in}\; + (\Right\,v, bs_1)$\\ + $\textit{decode}'\,bs\;(r_1\cdot r_2)$ & $\dn$ & + $\textit{let}\,(v_1, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}$\\ + & & $\textit{let}\,(v_2, bs_2) = \textit{decode}'\,bs_1\,r_2$\\ + & & \hspace{35mm}$\textit{in}\;(\Seq\,v_1\,v_2, bs_2)$\\ + $\textit{decode}'\,(Z\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\ + $\textit{decode}'\,(S\!::\!bs)\,(r^*)$ & $\dn$ & + $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r\;\textit{in}$\\ + & & $\textit{let}\,(\Stars\,vs, bs_2) = \textit{decode}'\,bs_1\,r^*$\\ + & & \hspace{35mm}$\textit{in}\;(\Stars\,v\!::\!vs, bs_2)$\bigskip\\ + + $\textit{decode}\,bs\,r$ & $\dn$ & + $\textit{let}\,(v, bs') = \textit{decode}'\,bs\,r\;\textit{in}$\\ + & & $\textit{if}\;bs' = []\;\textit{then}\;\textit{Some}\,v\; + \textit{else}\;\textit{None}$ +\end{tabular} +\end{center} \noindent -The red area expands as we move towards $r_n$, -indicating more is known about the lexing result. -Despite knowing this partial lexing information during -the forward derivative phase, we choose to store them -all the way until $r_n$ is reached. -Then we reconstruct the value character by character -values at a later stage, using information in a Last-In-First-Out -manner. Although the algorithm is elegant and natural, -it can be inefficient and prone to stack overflow.\\ -It turns out we can store lexing -information on the fly, while still using regular expression -derivatives. -If we remove the individual -lexing steps, and use red and blue areas as before -to indicate consumed (seen) input and constructed -partial value (before recovering the rest of the stack), -one could see that the seen part's lexical information -is stored in the form of a regular expression. +The function $\decode'$ returns a pair consisting of +a partially decoded value and some leftover bit list that cannot +be decide yet. +The function $\decode'$ succeeds if the left-over +bit-sequence is empty. +$\decode$ is terminating as $\decode'$ is terminating. +$\decode'$ is terminating +because at least one of $\decode'$'s parameters will go down in terms +of size.\\ +The reverse operation of $\decode$ is $\code$. +$\textit{code}$ encodes a value into a bitcode by converting +$\Left$ into $Z$, $\Right$ into $S$, and marks the start of any non-empty +star iteration by $S$. The border where a star iteration +terminates is marked by $Z$. +This coding is lossy, as it throws away the information about +characters, and does not encode the ``boundary'' between two +sequence values. Moreover, with only the bitcode we cannot even tell +whether the $S$s and $Z$s are for $\Left/\Right$ or $\Stars$. +\begin{center} +\begin{tabular}{lcl} + $\textit{code}(\Empty)$ & $\dn$ & $[]$\\ + $\textit{code}(\Char\,c)$ & $\dn$ & $[]$\\ + $\textit{code}(\Left\,v)$ & $\dn$ & $Z :: code(v)$\\ + $\textit{code}(\Right\,v)$ & $\dn$ & $S :: code(v)$\\ + $\textit{code}(\Seq\,v_1\,v_2)$ & $\dn$ & $code(v_1) \,@\, code(v_2)$\\ + $\textit{code}(\Stars\,[])$ & $\dn$ & $[Z]$\\ + $\textit{code}(\Stars\,(v\!::\!vs))$ & $\dn$ & $S :: code(v) \;@\; + code(\Stars\,vs)$ +\end{tabular} +\end{center} +\noindent +Assume we have a value $v$ and regular expression $r$ +with $\vdash v:r$, +then we have the property that $\decode$ and $\code$ are +reverse operations of one another: +\begin{lemma} +\[If \vdash v : r \; then \;\decode \; (\code \; v) \; r = \textit{Some}(v) \] +\end{lemma} +\begin{proof} +By proving a more general version of the lemma, on $\decode'$: +\[\vdash v : r \implies \decode' \; ((\code \; v) @ ds) \; r = (v, ds) \] +Then setting $ds$ to be $[]$ and unfolding $\decode$ definition, +we obtain the property. +\end{proof} +\noindent +Now we give out the central part of this lexing algorithm, +the $\bder$ function (stands for \emph{b}itcoded-derivative): +\begin{center} + \begin{tabular}{@{}lcl@{}} + $(\ZERO)\,\backslash c$ & $\dn$ & $\ZERO$\\ + $(_{bs}\ONE)\,\backslash c$ & $\dn$ & $\ZERO$\\ + $(_{bs}{\bf d})\,\backslash c$ & $\dn$ & + $\textit{if}\;c=d\; \;\textit{then}\; + _{bs}\ONE\;\textit{else}\;\ZERO$\\ + $(_{bs}\sum \;\textit{as})\,\backslash c$ & $\dn$ & + $_{bs}\sum\;(\textit{map} \; (\_\backslash c) \; as )$\\ + $(_{bs}\;a_1\cdot a_2)\,\backslash c$ & $\dn$ & + $\textit{if}\;\textit{bnullable}\,a_1$\\ + & &$\textit{then}\;_{bs}\sum\,[(_{[]}\,(a_1\,\backslash c)\cdot\,a_2),$\\ + & &$\phantom{\textit{then},\;_{bs}\sum\,}(\textit{fuse}\,(\textit{bmkeps}\,a_1)\,(a_2\,\backslash c))]$\\ + & &$\textit{else}\;_{bs}\,(a_1\,\backslash c)\cdot a_2$\\ + $(_{bs}a^*)\,\backslash c$ & $\dn$ & + $_{bs}(\textit{fuse}\, [Z] \; r\,\backslash c)\cdot + (_{[]}r^*))$ +\end{tabular} +\end{center} +\noindent +For most time we use the infix notation $(\_\backslash\_)$ +to mean $\bder$ for brevity when +there is no danger of confusion with derivatives on plain regular expressions. +For example, we write $( _{[]}r^* ) \backslash c$ instead of $\bder \;c \; _{[]}r^*$, +as the bitcodes at the front of $r^*$ indicates that it is +a bit-coded regular expression, not a plain one.\\ +$\bder$ tells us how regular expressions can be recursively traversed, +where the bitcodes are augmented and carried around +when a derivative is taken. +We give the intuition behind some of the more involved cases in +$\bder$. \\ +For example, +in the \emph{star} case, +a derivative on $_{bs}a^*$ means +that one more star iteratoin needs to be taken. +we need to unfold it into a sequence, +and attach an additional bit $Z$ to the front of $r \backslash c$ +as a record to indicate one new star iteration is unfolded. + +\noindent +\begin{center} + \begin{tabular}{@{}lcl@{}} + $(_{bs}a^*)\,\backslash c$ & $\dn$ & + $_{bs}(\underbrace{\textit{fuse}\, [Z] \; a\,\backslash c}_{\text{One more iteration}})\cdot + (_{[]}a^*))$ +\end{tabular} +\end{center} + +\noindent +This information will be recovered later by the $\decode$ function. +The intuition is that the bit $Z$ will be decoded at the right location, +because we accumulate bits from left to right (a rigorous proof will be given +later). + +%\begin{tikzpicture}[ > = stealth, % arrow head style +% shorten > = 1pt, % don't touch arrow head to node +% semithick % line style +% ] +% +% \tikzstyle{every state}=[ +% draw = black, +% thin, +% fill = cyan!29, +% minimum size = 7mm +% ] +% \begin{scope}[node distance=1cm and 0cm, every node/.style=state] +% \node (k) [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},] +% {$bs$ +% \nodepart{two} $a^*$ }; +% \node (l) [below =of k, rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},] +% { $bs$ + [Z] +% \nodepart{two} $(a\backslash c )\cdot a^*$ }; +% \end{scope} +% \path[->] +% (k) edge (l); +%\end{tikzpicture} +% +%Pictorially the process looks like below. +%Like before, the red region denotes +%previous lexing information (stored as bitcodes in $bs$). + +%\begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}] +% \begin{scope}[node distance=1cm] +% \node (a) [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},] +% {$bs$ +% \nodepart{two} $a^*$ }; +% \node (b) [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},] +% { $bs$ + [Z] +% \nodepart{two} $(a\backslash c )\cdot a^*$ }; +%%\caption{term 1 \ref{term:1}'s matching configuration} +% \end{scope} +%\end{tikzpicture} + +\noindent +Another place the $\bder$ function differs +from derivatives on plain regular expressions +is the sequence case: +\begin{center} + \begin{tabular}{@{}lcl@{}} + + $(_{bs}\;a_1\cdot a_2)\,\backslash c$ & $\dn$ & + $\textit{if}\;\textit{bnullable}\,a_1$\\ + & &$\textit{then}\;_{bs}\sum\,[(_{[]}\,(a_1\,\backslash c)\cdot\,a_2),$\\ + & &$\phantom{\textit{then},\;_{bs}\sum\,}(\textit{fuse}\,(\textit{bmkeps}\,a_1)\,(a_2\,\backslash c))]$\\ + & &$\textit{else}\;_{bs}\,(a_1\,\backslash c)\cdot a_2$ +\end{tabular} +\end{center} +\noindent +The difference is that (when $a_1$ is $\bnullable$) +we use $\bmkeps$ to store the lexing information +in $a_1$ before collapsing +it (as it has been fully matched by string prior to $c$), +and attach the collected bit-codes to the front of $a_2$ +before throwing away $a_1$. We assume that $\bmkeps$ +correctly extracts the bitcode for how $a_1$ +matches the string prior to $c$ (more on this later). +The bitsequence $\textit{bs}$ which was initially +attached to the first element of the sequence +$a_1 \cdot a_2$, has now been +elevated to the top level of the $\sum$ constructor. +This is because this piece of information will be needed +whichever way the sequence is matched, +regardless of whether $c$ belongs to $a_1$ or $a_2$. + +In the injection-based lexer, $r_1$ is immediately thrown away in +subsequent derivatives on the right branch (when $r_1$ is $\nullable$), +depite $r_1$ potentially storing information of previous matches: +\begin{center} + $(r_1 \cdot r_2 )\backslash c = (r_1 \backslash c) \cdot r_2 + r_2 \backslash c$ +\end{center} +\noindent +this loss of information makes it necessary +to store on stack all the regular expressions' +``snapshots'' before they were +taken derivative of. +So that the related information will be available +once the child recursive +call finishes.\\ +The rest of the clauses of $\bder$ is rather similar to +$\der$. \\ +Generalising the derivative operation with bitcodes to strings, we have +\begin{center} + \begin{tabular}{@{}lcl@{}} + $a\backslash_s [] $ & $\dn$ & $a$\\ + $a\backslash (c :: s) $ & $\dn$ & $(a \backslash c) \backslash_s s$ + \end{tabular} +\end{center} +\noindent +As we did earlier, we omit the $s$ subscript at $\backslash_s$ when there is no danger +of confusion. +\subsection{Putting Things Together} +Putting these operations altogether, we obtain a lexer with bit-coded regular expressions +as its internal data structures, which we call $\blexer$: + +\begin{center} +\begin{tabular}{lcl} + $\textit{blexer}\;r\,s$ & $\dn$ & + $\textit{let}\;a = (r^\uparrow)\backslash s\;\textit{in}$\\ + & & $\;\;\textit{if}\; \textit{bnullable}(a)$\\ + & & $\;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,a)\,r$\\ + & & $\;\;\textit{else}\;\textit{None}$ +\end{tabular} +\end{center} + +\noindent +Ausaf and Urban formally proved the correctness of the $\blexer$, namely +\begin{property} +$\blexer \;r \; s = \lexer \; r \; s$. +\end{property} +This was claimed but not formalised in Sulzmann and Lu's work. +We introduce the proof later, after we give all +the needed auxiliary functions and definitions. +But before this we shall first walk the reader +through a concrete example of our $\blexer$ calculating the right +lexical information through bit-coded regular expressions.\\ Consider the regular expression $(aa)^* \cdot bc$ matching the string $aabc$ -and assume we have just read the two characters $aa$: +and assume we have just read the first character $a$: \begin{center} \begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}] \node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},] - {Partial lexing info: $\ONE \cdot a \cdot (aa)^* \cdot bc$ etc. - \nodepart{two} $\Seq(\ldots, \Seq(\Char(b), \Char(c)))$}; + {$\ONE \cdot a \cdot (aa)^* \cdot bc$ + \nodepart{two} $[] \iff \Seq \; (\Stars \; [\Seq\; a \; ??, \;]), ??$}; \end{tikzpicture} \end{center} \noindent +We use the red area for (annotated) regular expressions and the blue +area the (partial) bitcodes and (partial) values. In the injection-based lexing algorithm, we ``neglect" the red area by putting all the characters we have consumed and intermediate regular expressions on the stack when @@ -430,401 +910,6 @@ \end{center} \noindent -Using $S$ and $Z$ rather than $1$ and $0$ is to avoid -confusion with the regular expressions $\ZERO$ and $\ONE$. -Bitcodes (or -bit-lists) can be used to encode values (or potentially incomplete values) in a -compact form. This can be straightforwardly seen in the following -coding function from values to bitcodes: -\begin{center} -\begin{tabular}{lcl} - $\textit{code}(\Empty)$ & $\dn$ & $[]$\\ - $\textit{code}(\Char\,c)$ & $\dn$ & $[]$\\ - $\textit{code}(\Left\,v)$ & $\dn$ & $Z :: code(v)$\\ - $\textit{code}(\Right\,v)$ & $\dn$ & $S :: code(v)$\\ - $\textit{code}(\Seq\,v_1\,v_2)$ & $\dn$ & $code(v_1) \,@\, code(v_2)$\\ - $\textit{code}(\Stars\,[])$ & $\dn$ & $[Z]$\\ - $\textit{code}(\Stars\,(v\!::\!vs))$ & $\dn$ & $S :: code(v) \;@\; - code(\Stars\,vs)$ -\end{tabular} -\end{center} -\noindent -Here $\textit{code}$ encodes a value into a bit-code by converting -$\Left$ into $Z$, $\Right$ into $S$, and marks the start of any non-empty -star iteration by $S$. The border where a local star terminates -is marked by $Z$. -This coding is lossy, as it throws away the information about -characters, and also does not encode the ``boundary'' between two -sequence values. Moreover, with only the bitcode we cannot even tell -whether the $S$s and $Z$s are for $\Left/\Right$ or $\Stars$. The -reason for choosing this compact way of storing information is that the -relatively small size of bits can be easily manipulated and ``moved -around" in a regular expression. - -Because of the lossiness, the process of decoding a bitlist requires additionally -a regular expression. The function $\decode$ is defined as: - -%\begin{definition}[Bitdecoding of Values]\mbox{} -\begin{center} -\begin{tabular}{@{}l@{\hspace{1mm}}c@{\hspace{1mm}}l@{}} - $\textit{decode}'\,bs\,(\ONE)$ & $\dn$ & $(\Empty, bs)$\\ - $\textit{decode}'\,bs\,(c)$ & $\dn$ & $(\Char\,c, bs)$\\ - $\textit{decode}'\,(Z\!::\!bs)\;(r_1 + r_2)$ & $\dn$ & - $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}\; - (\Left\,v, bs_1)$\\ - $\textit{decode}'\,(S\!::\!bs)\;(r_1 + r_2)$ & $\dn$ & - $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_2\;\textit{in}\; - (\Right\,v, bs_1)$\\ - $\textit{decode}'\,bs\;(r_1\cdot r_2)$ & $\dn$ & - $\textit{let}\,(v_1, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}$\\ - & & $\textit{let}\,(v_2, bs_2) = \textit{decode}'\,bs_1\,r_2$\\ - & & \hspace{35mm}$\textit{in}\;(\Seq\,v_1\,v_2, bs_2)$\\ - $\textit{decode}'\,(Z\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\ - $\textit{decode}'\,(S\!::\!bs)\,(r^*)$ & $\dn$ & - $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r\;\textit{in}$\\ - & & $\textit{let}\,(\Stars\,vs, bs_2) = \textit{decode}'\,bs_1\,r^*$\\ - & & \hspace{35mm}$\textit{in}\;(\Stars\,v\!::\!vs, bs_2)$\bigskip\\ - - $\textit{decode}\,bs\,r$ & $\dn$ & - $\textit{let}\,(v, bs') = \textit{decode}'\,bs\,r\;\textit{in}$\\ - & & $\textit{if}\;bs' = []\;\textit{then}\;\textit{Some}\,v\; - \textit{else}\;\textit{None}$ -\end{tabular} -\end{center} -%\end{definition} - -\noindent -The function $\decode'$ returns a pair consisting of -a partially decoded value and some leftover bit list that cannot -be decide yet. -The function $\decode'$ succeeds if the left-over -bit-sequence is empty. -$\decode$ is terminating as $\decode'$ is terminating. -$\decode'$ is terminating -because at least one of $\decode'$'s parameters will go down in terms -of size. -Assuming we have a value $v$ and regular expression $r$ -with $\vdash v:r$, -then we have the property that $\decode$ and $\code$ are -reverse operations of one another: -\begin{lemma} -\[If \vdash v : r \; then \;\decode \; (\code \; v) \; r = \textit{Some}(v) \] -\end{lemma} -\begin{proof} -By proving a more general version of the lemma, on $\decode'$: -\[\vdash v : r \implies \decode' \; ((\code \; v) @ ds) \; r = (v, ds) \] -Then setting $ds$ to be $[]$ and unfolding $\decode$ definition, -we obtain the property. -\end{proof} -With the $\code$ and $\decode$ functions in hand, we know how to -switch between bit-codes and values. -The next step is to integrate this information into regular expression. -Attaching bits to the front of regular expressions is the solution Sulzamann and Lu -gave for storing partial values in regular expressions. -Annotated regular expressions are therefore defined as the Isabelle -datatype: - -\begin{center} -\begin{tabular}{lcl} - $\textit{a}$ & $::=$ & $\ZERO$\\ - & $\mid$ & $_{bs}\ONE$\\ - & $\mid$ & $_{bs}{\bf c}$\\ - & $\mid$ & $_{bs}\sum\,as$\\ - & $\mid$ & $_{bs}a_1\cdot a_2$\\ - & $\mid$ & $_{bs}a^*$ -\end{tabular} -\end{center} -%(in \textit{ALTS}) - -\noindent -where $bs$ stands for bit-codes, $a$ for $\mathbf{a}$nnotated regular -expressions and $as$ for lists of annotated regular expressions. -The alternative constructor, written, $\sum$, has been generalised to -accept a list of annotated regular expressions rather than just two. -Why is it generalised? This is because when we open up nested -alternatives, there could be more than two elements at the same level -after de-duplication, which can no longer be stored in a binary -constructor. - -The first operation we define related to bit-coded regular expressions -is how we move bits to the inside of regular expressions. -Called $\fuse$, this operation is attaches bit-codes -to the front of an annotated regular expression: -\begin{center} -\begin{tabular}{lcl} - $\textit{fuse}\;bs \; \ZERO$ & $\dn$ & $\ZERO$\\ - $\textit{fuse}\;bs\; _{bs'}\ONE$ & $\dn$ & - $_{bs @ bs'}\ONE$\\ - $\textit{fuse}\;bs\;_{bs'}{\bf c}$ & $\dn$ & - $_{bs@bs'}{\bf c}$\\ - $\textit{fuse}\;bs\,_{bs'}\sum\textit{as}$ & $\dn$ & - $_{bs@bs'}\sum\textit{as}$\\ - $\textit{fuse}\;bs\; _{bs'}a_1\cdot a_2$ & $\dn$ & - $_{bs@bs'}a_1 \cdot a_2$\\ - $\textit{fuse}\;bs\,_{bs'}a^*$ & $\dn$ & - $_{bs @ bs'}a^*$ -\end{tabular} -\end{center} - -\noindent -With \emph{fuse} we are able to define the $\internalise$ function -that translates a ``standard'' regular expression into an -annotated regular expression. -This function will be applied before we start -with the derivative phase of the algorithm. - -\begin{center} -\begin{tabular}{lcl} - $(\ZERO)^\uparrow$ & $\dn$ & $\ZERO$\\ - $(\ONE)^\uparrow$ & $\dn$ & $_{[]}\ONE$\\ - $(c)^\uparrow$ & $\dn$ & $_{[]}{\bf c}$\\ - $(r_1 + r_2)^\uparrow$ & $\dn$ & - $_{[]}\sum[\textit{fuse}\,[Z]\,r_1^\uparrow,\, - \textit{fuse}\,[S]\,r_2^\uparrow]$\\ - $(r_1\cdot r_2)^\uparrow$ & $\dn$ & - $_{[]}r_1^\uparrow \cdot r_2^\uparrow$\\ - $(r^*)^\uparrow$ & $\dn$ & - $_{[]}(r^\uparrow)^*$\\ -\end{tabular} -\end{center} -%\end{definition} - -\noindent -We use an up arrow with postfix notation -to denote this operation. -for convenience. The $\textit{internalise} \; r$ -notation is more cumbersome. -The opposite of $\textit{internalise}$ is -$\erase$, where all the bit-codes are removed, -and the alternative operator $\sum$ for annotated -regular expressions is transformed to the binary alternatives -for plain regular expressions. -\begin{center} - \begin{tabular}{lcl} - $\ZERO_\downarrow$ & $\dn$ & $\ZERO$\\ - $( _{bs}\ONE )_\downarrow$ & $\dn$ & $\ONE$\\ - $( _{bs}\mathbf{c} )_\downarrow$ & $\dn$ & $\mathbf{c}$\\ - $( _{bs} a_1 \cdot a_2 )_\downarrow$ & $\dn$ & - $ (a_1) _\downarrow \cdot (a_2) _\downarrow$\\ - $( _{bs} [])_\downarrow $ & $\dn$ & $\ZERO $\\ - $( _{bs} [a] )_\downarrow$ & $\dn$ & $a_\downarrow$\\ - $_{bs} \sum [a_1, \; a_2]$ & $\dn$ & $ (a_1) _\downarrow + ( a_2 ) _\downarrow $\\ - $(_{bs} \sum (a :: as))_\downarrow$ & $\dn$ & $ a_\downarrow + \; (_{[]} \sum as)_\downarrow$\\ - $( _{bs} a^* )_\downarrow$ & $\dn$ & $(a_\downarrow)^*$ - \end{tabular} -\end{center} -\noindent -We also abbreviate the $\erase\; a$ operation -as $a_\downarrow$, for conciseness. -For bit-coded regular expressions, as a different datatype, -testing whether they contain empty string in their lauguage requires -a dedicated function $\bnullable$ -which simply calls $\erase$ first before testing whether it is $\nullable$. -\begin{definition} - $\bnullable \; a \dn \nullable \; (a_\downarrow)$ -\end{definition} -The function for collecting the -bitcodes at the end of the derivative -phase from a (b)nullable regular expression -is a generalised version of the $\textit{mkeps}$ function -for annotated regular expressions, called $\textit{bmkeps}$: - - -%\begin{definition}[\textit{bmkeps}]\mbox{} -\begin{center} -\begin{tabular}{lcl} - $\textit{bmkeps}\,(_{bs}\ONE)$ & $\dn$ & $bs$\\ - $\textit{bmkeps}\,(_{bs}\sum a::\textit{as})$ & $\dn$ & - $\textit{if}\;\textit{bnullable}\,a$\\ - & &$\textit{then}\;bs\,@\,\textit{bmkeps}\,a$\\ - & &$\textit{else}\;bs\,@\,\textit{bmkeps}\,(_{[]}\sum \textit{as})$\\ - $\textit{bmkeps}\,(_{bs} a_1 \cdot a_2)$ & $\dn$ & - $bs \,@\,\textit{bmkeps}\,a_1\,@\, \textit{bmkeps}\,a_2$\\ - $\textit{bmkeps}\,(_{bs}a^*)$ & $\dn$ & - $bs \,@\, [Z]$ -\end{tabular} -\end{center} -%\end{definition} - -\noindent -$\bmkeps$ completes the value information by travelling along the -path of the regular expression that corresponds to a POSIX value and -collecting all the bitcodes, and attaching $S$ to indicate the end of star -iterations. - -Now we give out the central part of this lexing algorithm, -the $\bder$ function (stands for \emph{b}itcoded-derivative). -For most time we use the infix notation $(\_\backslash\_)$ -to mean $\bder$ for brevity when -there is no danger of confusion with derivatives on plain regular expressions. -For example, we write $( _{[]}r^* ) \backslash c$ instead of $\bder \;c \; _{[]}r^*$, -as the bitcodes at the front of $r^*$ indicates that it is -a bit-coded regular expression, not a plain one. -$\bder$ tells us how regular expressions can be recursively traversed, -where the bitcodes are augmented and carried around -when a derivative is taken. -\begin{center} - \begin{tabular}{@{}lcl@{}} - $(\ZERO)\,\backslash c$ & $\dn$ & $\ZERO$\\ - $(_{bs}\ONE)\,\backslash c$ & $\dn$ & $\ZERO$\\ - $(_{bs}{\bf d})\,\backslash c$ & $\dn$ & - $\textit{if}\;c=d\; \;\textit{then}\; - _{bs}\ONE\;\textit{else}\;\ZERO$\\ - $(_{bs}\sum \;\textit{as})\,\backslash c$ & $\dn$ & - $_{bs}\sum\;(\textit{map} \; (\_\backslash c) \; as )$\\ - $(_{bs}\;a_1\cdot a_2)\,\backslash c$ & $\dn$ & - $\textit{if}\;\textit{bnullable}\,a_1$\\ - & &$\textit{then}\;_{bs}\sum\,[(_{[]}\,(a_1\,\backslash c)\cdot\,a_2),$\\ - & &$\phantom{\textit{then},\;_{bs}\sum\,}(\textit{fuse}\,(\textit{bmkeps}\,a_1)\,(a_2\,\backslash c))]$\\ - & &$\textit{else}\;_{bs}\,(a_1\,\backslash c)\cdot a_2$\\ - $(_{bs}a^*)\,\backslash c$ & $\dn$ & - $_{bs}(\textit{fuse}\, [Z] \; r\,\backslash c)\cdot - (_{[]}r^*))$ -\end{tabular} -\end{center} -\noindent -We give the intuition behind some of the more involved cases in -$\bder$. For example, -in the \emph{star} case, -a derivative on $_{bs}a^*$ means -that one more star iteratoin needs to be taken. -we need to unfold it into a sequence, -and attach an additional bit $Z$ to the front of $r \backslash c$ -as a record to indicate one new star iteration is unfolded. - -\noindent -\begin{center} - \begin{tabular}{@{}lcl@{}} - $(_{bs}a^*)\,\backslash c$ & $\dn$ & - $_{bs}(\underbrace{\textit{fuse}\, [Z] \; a\,\backslash c}_{\text{One more iteration}})\cdot - (_{[]}a^*))$ -\end{tabular} -\end{center} - -\noindent -This information will be recovered later by the $\decode$ function. -The intuition is that the bit $Z$ will be decoded at the right location, -because we accumulate bits from left to right (a rigorous proof will be given -later). - -\begin{tikzpicture}[ > = stealth, % arrow head style - shorten > = 1pt, % don't touch arrow head to node - semithick % line style - ] - - \tikzstyle{every state}=[ - draw = black, - thin, - fill = cyan!29, - minimum size = 7mm - ] - \begin{scope}[node distance=1cm and 0cm, every node/.style=state] - \node (k) [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},] - {$bs$ - \nodepart{two} $a^*$ }; - \node (l) [below =of k, rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},] - { $bs$ + [Z] - \nodepart{two} $(a\backslash c )\cdot a^*$ }; - \end{scope} - \path[->] - (k) edge (l); -\end{tikzpicture} - -Pictorially the process looks like below. -Like before, the red region denotes -previous lexing information (stored as bitcodes in $bs$). - -\begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}] - \begin{scope}[node distance=1cm] - \node (a) [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},] - {$bs$ - \nodepart{two} $a^*$ }; - \node (b) [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},] - { $bs$ + [Z] - \nodepart{two} $(a\backslash c )\cdot a^*$ }; -%\caption{term 1 \ref{term:1}'s matching configuration} - \end{scope} -\end{tikzpicture} - -\noindent -Another place in the $\bder$ function where it differs -from normal derivatives (on un-annotated regular expressions) -is the sequence case: -\begin{center} - \begin{tabular}{@{}lcl@{}} - - $(_{bs}\;a_1\cdot a_2)\,\backslash c$ & $\dn$ & - $\textit{if}\;\textit{bnullable}\,a_1$\\ - & &$\textit{then}\;_{bs}\sum\,[(_{[]}\,(a_1\,\backslash c)\cdot\,a_2),$\\ - & &$\phantom{\textit{then},\;_{bs}\sum\,}(\textit{fuse}\,(\textit{bmkeps}\,a_1)\,(a_2\,\backslash c))]$\\ - & &$\textit{else}\;_{bs}\,(a_1\,\backslash c)\cdot a_2$ -\end{tabular} -\end{center} - -The difference is that (when $a_1$ is $\bnullable$) -we use $\bmkeps$ to store the lexing information -in $a_1$ before collapsing it (as it has been fully matched by string prior to $c$, -and attach the collected bit-codes to the front of $a_2$ -before throwing away $a_1$. We assume that $\bmkeps$ correctly extracts the bitcode for how $a_1$ -matches the string prior to $c$ (more on this later). -The bitsequence $\textit{bs}$ which was initially attached to the first element of the sequence -$a_1 \cdot a_2$, has now been elevated to the top level of teh $\sum$. -This is because this piece of information will be needed whichever way the sequence is matched, -regardless of whether $c$ belongs to $a_1$ or $a_2$. - -In the injection-based lexing, $r_1$ is immediately thrown away in -subsequent derivatives on the right branch (when $r_1$ is $\nullable$), -\begin{center} - $(r_1 \cdot r_2 )\backslash c = (r_1 \backslash c) \cdot r_2 + r_2 \backslash c$ -\end{center} -\noindent -as it knows $r_1$ is stored on stack and available once the recursive -call to later derivatives finish. -Therefore, if the $\Right$ branch is taken in a $\POSIX$ match, -we construct back the sequence value once step back by -calling a on $\mkeps(r_1)$. -\begin{center} - \begin{tabular}{lcr} - $\ldots r_1 \cdot r_2$ & $\rightarrow$ & $r_1\cdot r_2 + r_2 \backslash c \ldots $\\ - $\ldots \Seq(v_1, v_2) (\Seq(\mkeps(r1), (\inj \; r_2 \; c\; v_{2c})))$ & $\leftarrow$ & $\Right(v_{2c})\ldots$ - \end{tabular} -\end{center} -\noindent -The rest of the clauses of $\bder$ is rather similar to -$\der$, and is put together here as a wholesome definition -for $\bder$: -\noindent -Generalising the derivative operation with bitcodes to strings, we have -\begin{center} - \begin{tabular}{@{}lcl@{}} - $a\backslash_s [] $ & $\dn$ & $a$\\ - $a\backslash (c :: s) $ & $\dn$ & $(a \backslash c) \backslash_s s$ - \end{tabular} -\end{center} -As we did earlier, we omit the $s$ subscript at $\backslash_s$ when there is no danger -of confusion. -Putting this all together, we obtain a lexer with bit-coded regular expressions -as its internal data structures, which we call $\blexer$: - -\begin{center} -\begin{tabular}{lcl} - $\textit{blexer}\;r\,s$ & $\dn$ & - $\textit{let}\;a = (r^\uparrow)\backslash s\;\textit{in}$\\ - & & $\;\;\textit{if}\; \textit{bnullable}(a)$\\ - & & $\;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,a)\,r$\\ - & & $\;\;\textit{else}\;\textit{None}$ -\end{tabular} -\end{center} - -\noindent -Ausaf and Urban formally proved the correctness of the $\blexer$, namely -\begin{conjecture} -$\blexer \;r \; s = \lexer \; r \; s$. -\end{conjecture} -This was claimed but not formalised in Sulzmann and Lu's work. -We introduce the proof later, after we give out all -the needed auxiliary functions and definitions - %----------------------------------- % SUBSECTION 1 diff -r 35df9cdd36ca -r e0f0a81f907b ChengsongTanPhdThesis/main.tex --- a/ChengsongTanPhdThesis/main.tex Wed Aug 17 01:09:13 2022 +0100 +++ b/ChengsongTanPhdThesis/main.tex Wed Aug 17 22:59:15 2022 +0100 @@ -43,6 +43,7 @@ %\usepackage{fdsymbol} % Loads unicode-math \usepackage{verbatim} +\usepackage{float} \usepackage{mathpazo} % Use the Palatino font by default \usepackage{hyperref} \usepackage{lipsum} @@ -94,6 +95,7 @@ \usepackage{tikz} \usetikzlibrary{automata, positioning, calc} \usetikzlibrary{arrows} +\usetikzlibrary{shapes.misc} \usetikzlibrary{fit, shapes.geometric, patterns,