# HG changeset patch # User Chengsong # Date 1655226393 -3600 # Node ID a7344c9afbaff21b3bb5210650a9897755081ca2 # Parent 5bf9f94c02e17101e16027e4b78e534ed97ac4a1 chapter3 finished diff -r 5bf9f94c02e1 -r a7344c9afbaf ChengsongTanPhdThesis/Chapters/Bitcoded1.tex --- a/ChengsongTanPhdThesis/Chapters/Bitcoded1.tex Sun Jun 12 17:03:09 2022 +0100 +++ b/ChengsongTanPhdThesis/Chapters/Bitcoded1.tex Tue Jun 14 18:06:33 2022 +0100 @@ -248,17 +248,126 @@ expressions and $as$ for a list of annotated regular expressions. The alternative constructor ($\sum$) has been generalised to accept a list of annotated regular expressions rather than just 2. -%We will show that these bitcodes encode information about -%the ($\POSIX$) value that should be generated by the Sulzmann and Lu -%algorithm. + + +The first thing we define related to bit-coded regular expressions +is how we move bits, for instance pasting it at the front of an annotated regex. +The operation $\fuse$ is just to attach bit-codes +to the front of an annotated regular expression: +\begin{center} +\begin{tabular}{lcl} + $\textit{fuse}\;bs \; \ZERO$ & $\dn$ & $\ZERO$\\ + $\textit{fuse}\;bs\; _{bs'}\ONE$ & $\dn$ & + $_{bs @ bs'}\ONE$\\ + $\textit{fuse}\;bs\;_{bs'}{\bf c}$ & $\dn$ & + $_{bs@bs'}{\bf c}$\\ + $\textit{fuse}\;bs\,_{bs'}\sum\textit{as}$ & $\dn$ & + $_{bs@bs'}\sum\textit{as}$\\ + $\textit{fuse}\;bs\; _{bs'}a_1\cdot a_2$ & $\dn$ & + $_{bs@bs'}a_1 \cdot a_2$\\ + $\textit{fuse}\;bs\,_{bs'}a^*$ & $\dn$ & + $_{bs @ bs'}a^*$ +\end{tabular} +\end{center} + +\noindent +With that we are able to define $\internalise$. + +To do lexing using annotated regular expressions, we shall first +transform the usual (un-annotated) regular expressions into annotated +regular expressions. This operation is called \emph{internalisation} and +defined as follows: + +%\begin{definition} +\begin{center} +\begin{tabular}{lcl} + $(\ZERO)^\uparrow$ & $\dn$ & $\ZERO$\\ + $(\ONE)^\uparrow$ & $\dn$ & $_{[]}\ONE$\\ + $(c)^\uparrow$ & $\dn$ & $_{[]}{\bf c}$\\ + $(r_1 + r_2)^\uparrow$ & $\dn$ & + $_{[]}\sum[\textit{fuse}\,[Z]\,r_1^\uparrow,\, + \textit{fuse}\,[S]\,r_2^\uparrow]$\\ + $(r_1\cdot r_2)^\uparrow$ & $\dn$ & + $_{[]}r_1^\uparrow \cdot r_2^\uparrow$\\ + $(r^*)^\uparrow$ & $\dn$ & + $_{[]}(r^\uparrow)^*$\\ +\end{tabular} +\end{center} +%\end{definition} + +\noindent +We use an up arrow with postfix notation +to denote the operation, +for convenience. The $\textit{internalise} \; r$ +notation is more cumbersome. +The opposite of $\textit{internalise}$ is +$\erase$, where all the bit-codes are removed, +and the alternative operator $\sum$ for annotated +regular expressions is transformed to the binary alternatives +for plain regular expressions. +\begin{center} + \begin{tabular}{lcr} + $\erase \; \ZERO$ & $\dn$ & $\ZERO$\\ + $\erase \; _{bs}\ONE$ & $\dn$ & $\ONE$\\ + $\erase \; _{bs}\mathbf{c}$ & $\dn$ & $\mathbf{c}$\\ + $\erase \; _{bs} a_1 \cdot a_2$ & $\dn$ & $\erase \; r_1\cdot\erase\; r_2$\\ + $\erase \; _{bs} [] $ & $\dn$ & $\ZERO$\\ + $\erase \; _{bs} [a] $ & $\dn$ & $\erase \; a$\\ + $\erase \; _{bs} \sum [a_1, \; a_2]$ & $\dn$ & $\erase \; a_1 +\erase \; a_2$\\ + $\erase \; _{bs} \sum (a :: as)$ & $\dn$ & $\erase \; a + \erase \; _{[]} \sum as$\\ + $\erase \; _{bs} a^*$ & $\dn$ & $(\erase \; a)^*$ + \end{tabular} +\end{center} +\noindent +We also abbreviate the $\erase\; a$ operation +as $a_\downarrow$, for conciseness. +For bit-coded regular expressions, as a different datatype, +testing whether they contain empty string in their lauguage requires +a dedicated function $\bnullable$ +which simply calls $\erase$ first before testing whether it is $\nullable$. +\begin{center} + \begin{tabular}{lcr} + $\bnullable \; a $ & $\dn$ & $\nullable \; (a_\downarrow)$ + \end{tabular} +\end{center} +The function for collecting the +bitcodes of a $\bnullable$ annotated regular expression +is a generalised version of the $\textit{mkeps}$ function +for annotated regular expressions, called $\textit{bmkeps}$: + + +%\begin{definition}[\textit{bmkeps}]\mbox{} +\begin{center} +\begin{tabular}{lcl} + $\textit{bmkeps}\,(_{bs}\ONE)$ & $\dn$ & $bs$\\ + $\textit{bmkeps}\,(_{bs}\sum a::\textit{as})$ & $\dn$ & + $\textit{if}\;\textit{bnullable}\,a$\\ + & &$\textit{then}\;bs\,@\,\textit{bmkeps}\,a$\\ + & &$\textit{else}\;bs\,@\,\textit{bmkeps}\,(_{[]}\sum \textit{as})$\\ + $\textit{bmkeps}\,(_{bs} a_1 \cdot a_2)$ & $\dn$ & + $bs \,@\,\textit{bmkeps}\,a_1\,@\, \textit{bmkeps}\,a_2$\\ + $\textit{bmkeps}\,(_{bs}a^*)$ & $\dn$ & + $bs \,@\, [Z]$ +\end{tabular} +\end{center} +%\end{definition} + +\noindent +This function completes the value information by travelling along the +path of the regular expression that corresponds to a POSIX value and +collecting all the bitcodes, and using $S$ to indicate the end of star +iterations. + The most central question is how these partial lexing information represented as bit-codes is augmented and carried around during a derivative is taken. - This is done by adding bitcodes to the derivatives, for example when one more star iteratoin is taken (we call the operation of derivatives on annotated regular expressions $\bder$ -because it is derivatives on regexes with bitcodes): +because it is derivatives on regexes with \emph{b}itcodes), +we need to unfold it into a sequence, +and attach an additional bit $Z$ to the front of $r \backslash c$ +to indicate one more star iteration. \begin{center} \begin{tabular}{@{}lcl@{}} $\bder \; c\; (_{bs}a^*) $ & $\dn$ & @@ -279,10 +388,10 @@ \end{tabular} \end{center} - +\noindent Using the picture we used earlier to depict this, the transformation when taking a derivative w.r.t a star is like below: -\centering + \begin{tabular}{@{}l@{\hspace{1mm}}l@{\hspace{0mm}}c@{}} \begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}] \node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},] @@ -313,27 +422,8 @@ \end{tikzpicture} \end{tabular} \noindent -The operation $\fuse$ is just to attach bit-codes -to the front of an annotated regular expression: -\begin{center} -\begin{tabular}{lcl} - $\textit{fuse}\;bs \; \ZERO$ & $\dn$ & $\ZERO$\\ - $\textit{fuse}\;bs\; _{bs'}\ONE$ & $\dn$ & - $_{bs @ bs'}\ONE$\\ - $\textit{fuse}\;bs\;_{bs'}{\bf c}$ & $\dn$ & - $_{bs@bs'}{\bf c}$\\ - $\textit{fuse}\;bs\,_{bs'}\sum\textit{as}$ & $\dn$ & - $_{bs@bs'}\sum\textit{as}$\\ - $\textit{fuse}\;bs\; _{bs'}a_1\cdot a_2$ & $\dn$ & - $_{bs@bs'}a_1 \cdot a_2$\\ - $\textit{fuse}\;bs\,_{bs'}a^*$ & $\dn$ & - $_{bs @ bs'}a^*$ -\end{tabular} -\end{center} - -\noindent Another place in the $\bder$ function where it differs -from normal derivatives on un-annotated regular expressions +from normal derivatives (on un-annotated regular expressions) is the sequence case: \begin{center} \begin{tabular}{@{}lcl@{}} @@ -345,9 +435,39 @@ & &$\textit{else}\;_{bs}\,(a_1\,\backslash c)\cdot a_2$ \end{tabular} \end{center} -Here + +The difference is that (when $a_1$ is $\bnullable$) +we use $\bmkeps$ to store the lexing information +in $a_1$ before collapsing it (as it has been fully matched by string prior to $c$, +and attach the collected bit-codes to the front of $a_2$ +before throwing away $a_1$. We assume that $\bmkeps$ correctly extracts the bitcode for how $a_1$ +matches the string prior to $c$ (more on this later). +The bitsequence $\textit{bs}$ which was initially attached to the first element of the sequence +$a_1 \cdot a_2$, has now been elevated to the top level of teh $\sum$. +This is because this piece of information will be needed whichever way the sequence is matched, +regardless of whether $c$ belongs to $a_1$ or $a_2$. - +In the injection-based lexing, $r_1$ is immediately thrown away in +subsequent derivatives on the right branch (when $r_1$ is $\nullable$), +\begin{center} + $(r_1 \cdot r_2 )\backslash c = (r_1 \backslash c) \cdot r_2 + r_2 \backslash c$ +\end{center} +\noindent +as it knows $r_1$ is stored on stack and available once the recursive +call to later derivatives finish. +Therefore, if the $\Right$ branch is taken in a $\POSIX$ match, +we construct back the sequence value once step back by +calling a on $\mkeps(r_1)$. +\begin{center} + \begin{tabular}{lcr} + $\ldots r_1 \cdot r_2$ & $\rightarrow$ & $r_1\cdot r_2 + r_2 \backslash c \ldots $\\ + $\ldots \Seq(v_1, v_2) (\Seq(\mkeps(r1), (\inj \; r_2 \; c\; v_{2c})))$ & $\leftarrow$ & $\Right(v_{2c})\ldots$ + \end{tabular} +\end{center} +\noindent +The rest of the clauses of $\bder$ is rather similar to +$\der$, and is put together here as a wholesome definition +for $\bder$: \begin{center} \begin{tabular}{@{}lcl@{}} $(\ZERO)\,\backslash c$ & $\dn$ & $\ZERO$\\ @@ -356,7 +476,7 @@ $\textit{if}\;c=d\; \;\textit{then}\; _{bs}\ONE\;\textit{else}\;\ZERO$\\ $(_{bs}\sum \;\textit{as})\,\backslash c$ & $\dn$ & - $_{bs}\sum\;(\textit{map} (\_\backslash c) as )$\\ + $_{bs}\sum\;(\textit{map} \; (\_\backslash c) \; as )$\\ $(_{bs}\;a_1\cdot a_2)\,\backslash c$ & $\dn$ & $\textit{if}\;\textit{bnullable}\,a_1$\\ & &$\textit{then}\;_{bs}\sum\,[(_{[]}\,(a_1\,\backslash c)\cdot\,a_2),$\\ @@ -367,98 +487,18 @@ (_{[]}r^*))$ \end{tabular} \end{center} - - -To do lexing using annotated regular expressions, we shall first -transform the usual (un-annotated) regular expressions into annotated -regular expressions. This operation is called \emph{internalisation} and -defined as follows: - -%\begin{definition} +\noindent +Generalising the derivative operation with bitcodes to strings, we have \begin{center} -\begin{tabular}{lcl} - $(\ZERO)^\uparrow$ & $\dn$ & $\ZERO$\\ - $(\ONE)^\uparrow$ & $\dn$ & $_{[]}\ONE$\\ - $(c)^\uparrow$ & $\dn$ & $_{[]}{\bf c}$\\ - $(r_1 + r_2)^\uparrow$ & $\dn$ & - $_{[]}\sum[\textit{fuse}\,[Z]\,r_1^\uparrow,\, - \textit{fuse}\,[S]\,r_2^\uparrow]$\\ - $(r_1\cdot r_2)^\uparrow$ & $\dn$ & - $_{[]}r_1^\uparrow \cdot r_2^\uparrow$\\ - $(r^*)^\uparrow$ & $\dn$ & - $_{[]}(r^\uparrow)^*$\\ -\end{tabular} -\end{center} -%\end{definition} - -\noindent -We use up arrows here to indicate that the basic un-annotated regular -expressions are ``lifted up'' into something slightly more complex. In the -fourth clause, $\textit{fuse}$ is an auxiliary function that helps to -attach bits to the front of an annotated regular expression. Its -definition is as follows: - - - -\noindent -After internalising the regular expression, we perform successive -derivative operations on the annotated regular expressions. This -derivative operation is the same as what we had previously for the -basic regular expressions, except that we beed to take care of -the bitcodes: - - - - - -%\end{definition} -\noindent -For instance, when we do derivative of $_{bs}a^*$ with respect to c, -we need to unfold it into a sequence, -and attach an additional bit $Z$ to the front of $r \backslash c$ -to indicate one more star iteration. Also the sequence clause -is more subtle---when $a_1$ is $\textit{bnullable}$ (here -\textit{bnullable} is exactly the same as $\textit{nullable}$, except -that it is for annotated regular expressions, therefore we omit the -definition). Assume that $\textit{bmkeps}$ correctly extracts the bitcode for how -$a_1$ matches the string prior to character $c$ (more on this later), -then the right branch of alternative, which is $\textit{fuse} \; \bmkeps \; a_1 (a_2 -\backslash c)$ will collapse the regular expression $a_1$(as it has -already been fully matched) and store the parsing information at the -head of the regular expression $a_2 \backslash c$ by fusing to it. The -bitsequence $\textit{bs}$, which was initially attached to the -first element of the sequence $a_1 \cdot a_2$, has -now been elevated to the top-level of $\sum$, as this information will be -needed whichever way the sequence is matched---no matter whether $c$ belongs -to $a_1$ or $ a_2$. After building these derivatives and maintaining all -the lexing information, we complete the lexing by collecting the -bitcodes using a generalised version of the $\textit{mkeps}$ function -for annotated regular expressions, called $\textit{bmkeps}$: - - -%\begin{definition}[\textit{bmkeps}]\mbox{} -\begin{center} -\begin{tabular}{lcl} - $\textit{bmkeps}\,(_{bs}\ONE)$ & $\dn$ & $bs$\\ - $\textit{bmkeps}\,(_{bs}\sum a::\textit{as})$ & $\dn$ & - $\textit{if}\;\textit{bnullable}\,a$\\ - & &$\textit{then}\;bs\,@\,\textit{bmkeps}\,a$\\ - & &$\textit{else}\;bs\,@\,\textit{bmkeps}\,(_{bs}\sum \textit{as})$\\ - $\textit{bmkeps}\,(_{bs} a_1 \cdot a_2)$ & $\dn$ & - $bs \,@\,\textit{bmkeps}\,a_1\,@\, \textit{bmkeps}\,a_2$\\ - $\textit{bmkeps}\,(_{bs}a^*)$ & $\dn$ & - $bs \,@\, [Z]$ -\end{tabular} -\end{center} -%\end{definition} - -\noindent -This function completes the value information by travelling along the -path of the regular expression that corresponds to a POSIX value and -collecting all the bitcodes, and using $S$ to indicate the end of star -iterations. If we take the bitcodes produced by $\textit{bmkeps}$ and -decode them, we get the value we expect. The corresponding lexing -algorithm looks as follows: + \begin{tabular}{@{}lcl@{}} + $a\backslash_s [] $ & $\dn$ & $a$\\ + $a\backslash (c :: s) $ & $\dn$ & $(a \backslash c) \backslash_s s$ + \end{tabular} +\end{center} +As we did earlier, we omit the $s$ subscript at $\backslash_s$ when there is no danger +of confusion. +Putting this all together, we obtain a lexer with bit-coded regular expressions +as its internal data structures, which we call $\blexer$: \begin{center} \begin{tabular}{lcl} @@ -471,32 +511,152 @@ \end{center} \noindent -In this definition $\_\backslash s$ is the generalisation of the derivative -operation from characters to strings (just like the derivatives for un-annotated -regular expressions). - +Ausaf and Urban formally proved the correctness of the $\blexer$, namely +\begin{conjecture} +$\blexer \;r \; s = \lexer \; r \; s$. +\end{conjecture} +This was claimed but not formalised in Sulzmann and Lu's work. +We introduce the proof later, after we give out all +the needed auxiliary functions and definitions %----------------------------------- % SUBSECTION 1 %----------------------------------- \section{Specifications of Some Helper Functions} -Here we give some functions' definitions, -which we will use later. +The functions we introduce will give a more detailed glimpse into +the lexing process, which might not be possible +using $\lexer$ or $\blexer$ themselves. +The first function we shall look at is $\retrieve$. +\subsection{$\retrieve$} +Our bit-coded lexer "retrieve"s the bitcodes using $\bmkeps$ +after we finished doing all the derivatives: \begin{center} -\begin{tabular}{ccc} -$\retrieve \; \ACHAR \, \textit{bs} \, c \; \Char(c) = \textit{bs}$ +\begin{tabular}{lcl} + & & $\ldots$\\ + & & $\;\;\textit{if}\; \textit{bnullable}(a)$\\ + & & $\;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,a)\,r$\\ + & & $\ldots$ \end{tabular} \end{center} +\noindent +Recall that $\bmkeps$ looks for the leftmost branch of an alternative +and completes a star's iterations by attaching a $Z$ at the end of the bitcodes +extracted. It "retrieves" a sequence by visiting both children and then stitch together +two bitcodes using concatenation. After the entire tree structure of the regular +expression has been traversed using the above manner, we get a bitcode encoding the +lexing result. +We know that this "retrieved" bitcode leads to the correct value after decoding, +which is $v_0$ in the bird's eye view of the injection-based lexing diagram. +Now assume we keep every other data structure in the diagram \ref{InjFigure}, +and only replace all the plain regular expression by their annotated counterparts, +computed during a $\blexer$ run. +Then we obtain a diagram for the annotated regular expression derivatives and +their corresponding values, though the values are never calculated in $\blexer$. +We have that $a_n$ contains all the lexing result information. +\vspace{20mm} +\begin{center}%\label{graph:injLexer} +\begin{tikzcd}[ + every matrix/.append style = {name=p}, + remember picture, overlay, + ] + a_0 \arrow[r, "\backslash c_0"] \arrow[d] & a_1 \arrow[r, "\backslash c_1"] \arrow[d] & a_2 \arrow[r, dashed] \arrow[d] & a_n \arrow[d] \\ +v_0 & v_1 \arrow[l,"inj_{r_0} c_0"] & v_2 \arrow[l, "inj_{r_1} c_1"] & v_n \arrow[l, dashed] +\end{tikzcd} +\begin{tikzpicture}[ + remember picture, overlay, +E/.style = {ellipse, draw=blue, dashed, + inner xsep=4mm,inner ysep=-4mm, rotate=90, fit=#1} + ] +\node[E = (p-1-1) (p-2-1)] {}; +\node[E = (p-1-4) (p-2-4)] {}; +\end{tikzpicture} + +\end{center} +\vspace{20mm} +\noindent +On the other hand, $v_0$ also encodes the correct lexing result, as we have proven for $\lexer$. +Encircled in the diagram are the two pairs $v_0, a_0$ and $v_n, a_n$, which both +encode the correct lexical result. Though for the leftmost pair, we have +the information condensed in $v_0$ the value part, whereas for the rightmost pair, +the information is concentrated on $a_n$. +We know that in the intermediate steps the pairs $v_i, a_i$, must in some way encode the complete +lexing information as well. Therefore, we need a unified approach to extract such lexing result +from a value $v_i$ and its annotated regular expression $a_i$. +And the function $f$ must satisfy these requirements: +\begin{itemize} + \item + $f \; a_i\;v_i = f \; a_n \; v_n = \decode \; (\bmkeps \; a_n) \; (\erase \; a_0)$ + \item + $f \; a_i\;v_i = f \; a_0 \; v_0 = v_0 = \decode \;(\code \; v_0) \; (\erase \; a_0)$ +\end{itemize} +\noindent +If we factor out the common part $\decode \; \_ \; (\erase \; a_0)$, +The core of the function $f$ is something that produces the bitcodes +$\code \; v_0$. +It is unclear how, but Sulzmann and Lu came up with a function satisfying all the above +requirements, named \emph{retrieve}: -%---------------------------------------------------------------------------------------- -% SECTION correctness proof -%---------------------------------------------------------------------------------------- -\section{Correctness of Bit-coded Algorithm (Without Simplification)} -We now give the proof the correctness of the algorithm with bit-codes. -Ausaf and Urban cleverly defined an auxiliary function called $\flex$, +\begin{center} +\begin{tabular}{lcr} + $\retrieve \; \, (_{bs} \ONE) \; \, (\Empty)$ & $\dn$ & $\textit{bs}$\\ + $\retrieve \; \, (_{bs} \mathbf{c} ) \; \Char(c)$ & $\dn$ & $ \textit{bs}$\\ + $\retrieve \; \, (_{bs} a_1 \cdot a_2) \; \Seq(v_1, v_2)$ & $\dn$ & $\textit{bs} @ (\retrieve \; a_1\; v_1) @ (\retrieve \; a_2 \; v_2)$\\ + $\retrieve \; \, (_{bs} \Sigma (a :: \textit{as}) \; \,\Left(v)$ & $\dn$ & $\textit{bs} @ (\retrieve \; a \; v)$\\ + $\retrieve \; \, (_{bs} \Sigma (a :: \textit{as} \; \, \Right(v)$ & $\dn$ & $\textit{bs} @ (\retrieve \; (_{[]}\Sigma \textit{as}) \; v)$\\ + $\retrieve \; \, (_{bs} a^*) \; \, (\Stars(v :: vs)) $ & $\dn$ & $\textit{bs} @ (\retrieve \; a \; v) @ (\retrieve \; (_{[]} a^*) \; (\Stars(vs)))$\\ + $\retrieve \; \, (_{bs} a^*) \; \, (\Stars([]) $ & $\dn$ & $\textit{bs} @ [Z]$ +\end{tabular} +\end{center} +\noindent +As promised, $\retrieve$ collects the right bit-codes from the +final derivative $a_n$: +\begin{lemma} + $\bnullable \; a \implies \bmkeps \; a = \retrieve \; a \; (\mkeps \; (\erase \; a))$ +\end{lemma} +\begin{proof} + By a routine induction on $a$. +\end{proof} +The design of $\retrieve$ enables extraction of bit-codes +from not only $\bnullable$ (annotated) regular expressions, +but also those that are not $\bnullable$. +For example, if we have the regular expression just internalised +and the lexing result value, we could $\retrieve$ the bitcdoes +that look as if we have en$\code$-ed the value: +\begin{lemma} + $\vdash v : r \implies \retrieve \; (r)^\uparrow \; v = \code \; v$ +\end{lemma} +\begin{proof} + By induction on $r$. +\end{proof} +\noindent +The following property is more interesting, as +it provides a "bridge" between $a_0, v_0$ and $a_n, v_n$ in the +lexing diagram. +If you take derivative of an annotated regular expression, +you can $\retrieve$ the same bit-codes as before the derivative took place, +provided that you use the corresponding value: + +\begin{lemma}\label{retrieveStepwise} + $\vdash v : (r\backslash c) \implies \retrieve \; (r \backslash c) \; v= \retrieve \; r \; (\inj \; r\; c\; v)$ +\end{lemma} +\begin{proof} + By induction on $r$, where $v$ is allowed to be arbitrary. + The induction principle is function $\erase$'s cases. +\end{proof} +\noindent +$\retrieve$ is connected to the $\blexer$ in the following way: +\begin{lemma}\label{blexer_retrieve} +$\blexer \; r \; s = \decode \; (\retrieve \; (\internalise \; r) \; (\mkeps \; (r \backslash s) )) \; r$ +\end{lemma} +\noindent +$\retrieve$ allows free navigation on the diagram \ref{InjFigure} for annotated regexes of $\blexer$. +For plain regular expressions something similar is required as well. + +\subsection{$\flex$} +Ausaf and Urban cleverly defined an auxiliary function called $\flex$ for $\lexer$, defined as \begin{center} \begin{tabular}{lcr} @@ -510,7 +670,10 @@ \begin{lemma} $\flex \; r \; \textit{id}\; s \; \mkeps (r\backslash s) = \lexer \; r \; s$ \end{lemma} -$\flex$ provides us a bridge between $\lexer$ and $\blexer$. +\begin{proof} + By reverse induction on $s$. +\end{proof} +$\flex$ provides us a bridge between $\lexer$'s intermediate steps. What is even better about $\flex$ is that it allows us to directly operate on the value $\mkeps (r\backslash v)$, which is pivotal in the definition of $\lexer $ and $\blexer$, but not visible as an argument. @@ -519,24 +682,13 @@ \begin{lemma}\label{flexStepwise} $\textit{flex} \; r \; f \; s@[c] \; v= \flex \; r \; f\; s \; (\inj \; (r\backslash s) \; c \; v) $ \end{lemma} +\begin{proof} + By induction on the shape of $r\backslash s$ +\end{proof} +\noindent +With $\flex$ and $\retrieve$ ready, we are ready to connect $\lexer$ and $\blexer$ . -And for $\blexer$ we have a function with stepwise properties like $\flex$ as well, -called $\retrieve$\ref{retrieveDef}. -$\retrieve$ takes bit-codes from annotated regular expressions -guided by a value. -$\retrieve$ is connected to the $\blexer$ in the following way: -\begin{lemma}\label{blexer_retrieve} -$\blexer \; r \; s = \decode \; (\retrieve \; (\internalise \; r) \; (\mkeps \; (r \backslash s) )) \; r$ -\end{lemma} -If you take derivative of an annotated regular expression, -you can $\retrieve$ the same bit-codes as before the derivative took place, -provided that you use the corresponding value: - -\begin{lemma}\label{retrieveStepwise} -$\retrieve \; (r \backslash c) \; v= \retrieve \; r \; (\inj \; r\; c\; v)$ -\end{lemma} -The other good thing about $\retrieve$ is that it can be connected to $\flex$: -%centralLemma1 +\subsection{Correctness Proof of Bit-coded Algorithm} \begin{lemma}\label{flex_retrieve} $\flex \; r \; \textit{id}\; s\; v = \decode \; (\retrieve \; (r\backslash s )\; v) \; r$ \end{lemma} @@ -567,13 +719,29 @@ With the above lemma we can now link $\flex$ and $\blexer$. +%---------------------------------------------------------------------------------------- +% SECTION correctness proof +%---------------------------------------------------------------------------------------- +\section{Correctness of Bit-coded Algorithm (Without Simplification)} +We now give the proof the correctness of the algorithm with bit-codes. + \begin{lemma}\label{flex_blexer} $\textit{flex} \; r \; \textit{id} \; s \; \mkeps(r \backslash s) = \blexer \; r \; s$ \end{lemma} \begin{proof} Using two of the above lemmas: \ref{flex_retrieve} and \ref{blexer_retrieve}. \end{proof} -Finally +Finally the correctness of $\blexer$ is given as it outputs the same result as $\lexer$: +\begin{theorem} + $\blexer\; r \; s = \lexer \; r \; s$ +\end{theorem} +\begin{proof} + Straightforward corollary of \ref{flex_blexer}. +\end{proof} +\noindent +Our main reason for wanting a bit-coded algorithm over +the injection-based one is for its capabilities of allowing +more aggressive simplifications. +We will elaborate on this in the next chapter. - diff -r 5bf9f94c02e1 -r a7344c9afbaf ChengsongTanPhdThesis/Chapters/Introduction.tex --- a/ChengsongTanPhdThesis/Chapters/Introduction.tex Sun Jun 12 17:03:09 2022 +0100 +++ b/ChengsongTanPhdThesis/Chapters/Introduction.tex Tue Jun 14 18:06:33 2022 +0100 @@ -37,6 +37,7 @@ \newcommand\myequiv{\mathrel{\stackrel{\makebox[0pt]{\mbox{\normalfont\tiny equiv}}}{=}}} +\def\bnullable{\textit{bnullable}} \def\Some{\textit{Some}} \def\None{\textit{None}} \def\code{\textit{code}} @@ -52,6 +53,7 @@ \def\fuse{\textit{fuse}} \def\bder{\textit{bder}} +\def\der{\textit{der}} \def\POSIX{\textit{POSIX}} \def\ALTS{\textit{ALTS}} \def\ASTAR{\textit{ASTAR}} diff -r 5bf9f94c02e1 -r a7344c9afbaf ChengsongTanPhdThesis/main.tex --- a/ChengsongTanPhdThesis/main.tex Sun Jun 12 17:03:09 2022 +0100 +++ b/ChengsongTanPhdThesis/main.tex Tue Jun 14 18:06:33 2022 +0100 @@ -44,6 +44,7 @@ \usepackage{mathpazo} % Use the Palatino font by default \usepackage{hyperref} +\usepackage{lipsum} \usepackage[backend=bibtex,style=authoryear,natbib=true]{biblatex} % Use the bibtex backend with the authoryear citation style (which resembles APA) \usepackage{stmaryrd} \usepackage{caption} @@ -68,6 +69,8 @@ \usepackage{tikz-cd} \usepackage{tikz} \usetikzlibrary{automata, positioning, calc} +\usetikzlibrary{fit, + shapes.geometric} \usepackage{mathpartir} @@ -270,6 +273,7 @@ \newtheorem{theorem}{Theorem} \newtheorem{lemma}{Lemma} \newtheorem{definition}{Definition} +\newtheorem{conjecture}{Conjecture} %proof