diff -r e0db3242d8b5 -r b1e365afa29c etnms/etnms.tex --- a/etnms/etnms.tex Wed Jan 15 13:01:10 2020 +0000 +++ b/etnms/etnms.tex Thu Jan 16 22:34:23 2020 +0000 @@ -37,6 +37,9 @@ \def\bders{\textit{bders}} \def\lexer{\mathit{lexer}} \def\blexer{\textit{blexer}} +\def\fuse{\textit{fuse}} +\def\flatten{\textit{flatten}} +\def\map{\textit{map}} \def\blexers{\mathit{blexer\_simp}} \def\simp{\mathit{simp}} \def\mkeps{\mathit{mkeps}} @@ -98,6 +101,581 @@ negation and back-references. \end{abstract} +\section{Recapitulation of Concepts From the Last Report} +\subsection{The Algorithm by Brzozowski based on Derivatives of Regular +Expressions} + +Suppose (basic) regular expressions are given by the following grammar: +\[ r ::= \ZERO \mid \ONE + \mid c + \mid r_1 \cdot r_2 + \mid r_1 + r_2 + \mid r^* +\] + +\noindent + +The ingenious contribution by Brzozowski is the notion of +\emph{derivatives} of regular expressions. +\begin{center} + \begin{tabular}{lcl} + $\nullable(\ZERO)$ & $\dn$ & $\mathit{false}$ \\ + $\nullable(\ONE)$ & $\dn$ & $\mathit{true}$ \\ + $\nullable(c)$ & $\dn$ & $\mathit{false}$ \\ + $\nullable(r_1 + r_2)$ & $\dn$ & $\nullable(r_1) \vee \nullable(r_2)$ \\ + $\nullable(r_1\cdot r_2)$ & $\dn$ & $\nullable(r_1) \wedge \nullable(r_2)$ \\ + $\nullable(r^*)$ & $\dn$ & $\mathit{true}$ \\ + \end{tabular} + \end{center} + +\begin{center} +\begin{tabular}{lcl} + $\ZERO \backslash c$ & $\dn$ & $\ZERO$\\ + $\ONE \backslash c$ & $\dn$ & $\ZERO$\\ + $d \backslash c$ & $\dn$ & + $\mathit{if} \;c = d\;\mathit{then}\;\ONE\;\mathit{else}\;\ZERO$\\ +$(r_1 + r_2)\backslash c$ & $\dn$ & $r_1 \backslash c \,+\, r_2 \backslash c$\\ +$(r_1 \cdot r_2)\backslash c$ & $\dn$ & $\mathit{if} \, nullable(r_1)$\\ + & & $\mathit{then}\;(r_1\backslash c) \cdot r_2 \,+\, r_2\backslash c$\\ + & & $\mathit{else}\;(r_1\backslash c) \cdot r_2$\\ + $(r^*)\backslash c$ & $\dn$ & $(r\backslash c) \cdot r^*$\\ +\end{tabular} +\end{center} + +%Assuming the classic notion of a +%\emph{language} of a regular expression, written $L(\_)$, t + +\noindent +The main property of the derivative operation is that + +\begin{center} +$c\!::\!s \in L(r)$ holds +if and only if $s \in L(r\backslash c)$. +\end{center} + +\noindent + + +Now we can generalise the derivative operation to strings like this: + +\begin{center} +\begin{tabular}{lcl} +$r \backslash (c\!::\!s) $ & $\dn$ & $(r \backslash c) \backslash s$ \\ +$r \backslash [\,] $ & $\dn$ & $r$ +\end{tabular} +\end{center} + +\noindent +and then define as regular-expression matching algorithm: +\[ +match\;s\;r \;\dn\; nullable(r\backslash s) +\] + +\noindent +This algorithm looks graphically as follows: +\begin{equation}\label{graph:*} +\begin{tikzcd} +r_0 \arrow[r, "\backslash c_0"] & r_1 \arrow[r, "\backslash c_1"] & r_2 \arrow[r, dashed] & r_n \arrow[r,"\textit{nullable}?"] & \;\textrm{YES}/\textrm{NO} +\end{tikzcd} +\end{equation} + +\noindent +where we start with a regular expression $r_0$, build successive +derivatives until we exhaust the string and then use \textit{nullable} +to test whether the result can match the empty string. It can be +relatively easily shown that this matcher is correct (that is given +an $s = c_0...c_{n-1}$ and an $r_0$, it generates YES if and only if $s \in L(r_0)$). + + +\subsection{Values and the Algorithm by Sulzmann and Lu} + +One limitation of Brzozowski's algorithm is that it only produces a +YES/NO answer for whether a string is being matched by a regular +expression. Sulzmann and Lu~\cite{Sulzmann2014} extended this algorithm +to allow generation of an actual matching, called a \emph{value} or +sometimes also \emph{lexical value}. These values and regular +expressions correspond to each other as illustrated in the following +table: + + +\begin{center} + \begin{tabular}{c@{\hspace{20mm}}c} + \begin{tabular}{@{}rrl@{}} + \multicolumn{3}{@{}l}{\textbf{Regular Expressions}}\medskip\\ + $r$ & $::=$ & $\ZERO$\\ + & $\mid$ & $\ONE$ \\ + & $\mid$ & $c$ \\ + & $\mid$ & $r_1 \cdot r_2$\\ + & $\mid$ & $r_1 + r_2$ \\ + \\ + & $\mid$ & $r^*$ \\ + \end{tabular} + & + \begin{tabular}{@{\hspace{0mm}}rrl@{}} + \multicolumn{3}{@{}l}{\textbf{Values}}\medskip\\ + $v$ & $::=$ & \\ + & & $\Empty$ \\ + & $\mid$ & $\Char(c)$ \\ + & $\mid$ & $\Seq\,v_1\, v_2$\\ + & $\mid$ & $\Left(v)$ \\ + & $\mid$ & $\Right(v)$ \\ + & $\mid$ & $\Stars\,[v_1,\ldots\,v_n]$ \\ + \end{tabular} + \end{tabular} +\end{center} + +\noindent + +The contribution of Sulzmann and Lu is an extension of Brzozowski's +algorithm by a second phase (the first phase being building successive +derivatives---see \eqref{graph:*}). In this second phase, a POSIX value +is generated in case the regular expression matches the string. +Pictorially, the Sulzmann and Lu algorithm is as follows: + +\begin{ceqn} +\begin{equation}\label{graph:2} +\begin{tikzcd} +r_0 \arrow[r, "\backslash c_0"] \arrow[d] & r_1 \arrow[r, "\backslash c_1"] \arrow[d] & r_2 \arrow[r, dashed] \arrow[d] & r_n \arrow[d, "mkeps" description] \\ +v_0 & v_1 \arrow[l,"inj_{r_0} c_0"] & v_2 \arrow[l, "inj_{r_1} c_1"] & v_n \arrow[l, dashed] +\end{tikzcd} +\end{equation} +\end{ceqn} + +\noindent +For convenience, we shall employ the following notations: the regular +expression we start with is $r_0$, and the given string $s$ is composed +of characters $c_0 c_1 \ldots c_{n-1}$. In the first phase from the +left to right, we build the derivatives $r_1$, $r_2$, \ldots according +to the characters $c_0$, $c_1$ until we exhaust the string and obtain +the derivative $r_n$. We test whether this derivative is +$\textit{nullable}$ or not. If not, we know the string does not match +$r$ and no value needs to be generated. If yes, we start building the +values incrementally by \emph{injecting} back the characters into the +earlier values $v_n, \ldots, v_0$. This is the second phase of the +algorithm from the right to left. For the first value $v_n$, we call the +function $\textit{mkeps}$, which builds the lexical value +for how the empty string has been matched by the (nullable) regular +expression $r_n$. This function is defined as + + \begin{center} + \begin{tabular}{lcl} + $\mkeps(\ONE)$ & $\dn$ & $\Empty$ \\ + $\mkeps(r_{1}+r_{2})$ & $\dn$ + & \textit{if} $\nullable(r_{1})$\\ + & & \textit{then} $\Left(\mkeps(r_{1}))$\\ + & & \textit{else} $\Right(\mkeps(r_{2}))$\\ + $\mkeps(r_1\cdot r_2)$ & $\dn$ & $\Seq\,(\mkeps\,r_1)\,(\mkeps\,r_2)$\\ + $mkeps(r^*)$ & $\dn$ & $\Stars\,[]$ + \end{tabular} + \end{center} + + +\noindent +After the $\mkeps$-call, we inject back the characters one by one in order to build +the lexical value $v_i$ for how the regex $r_i$ matches the string $s_i$ +($s_i = c_i \ldots c_{n-1}$ ) from the previous lexical value $v_{i+1}$. +After injecting back $n$ characters, we get the lexical value for how $r_0$ +matches $s$. For this Sulzmann and Lu defined a function that reverses +the ``chopping off'' of characters during the derivative phase. The +corresponding function is called \emph{injection}, written +$\textit{inj}$; it takes three arguments: the first one is a regular +expression ${r_{i-1}}$, before the character is chopped off, the second +is a character ${c_{i-1}}$, the character we want to inject and the +third argument is the value ${v_i}$, into which one wants to inject the +character (it corresponds to the regular expression after the character +has been chopped off). The result of this function is a new value. The +definition of $\textit{inj}$ is as follows: + +\begin{center} +\begin{tabular}{l@{\hspace{1mm}}c@{\hspace{1mm}}l} + $\textit{inj}\,(c)\,c\,Empty$ & $\dn$ & $Char\,c$\\ + $\textit{inj}\,(r_1 + r_2)\,c\,\Left(v)$ & $\dn$ & $\Left(\textit{inj}\,r_1\,c\,v)$\\ + $\textit{inj}\,(r_1 + r_2)\,c\,Right(v)$ & $\dn$ & $Right(\textit{inj}\,r_2\,c\,v)$\\ + $\textit{inj}\,(r_1 \cdot r_2)\,c\,Seq(v_1,v_2)$ & $\dn$ & $Seq(\textit{inj}\,r_1\,c\,v_1,v_2)$\\ + $\textit{inj}\,(r_1 \cdot r_2)\,c\,\Left(Seq(v_1,v_2))$ & $\dn$ & $Seq(\textit{inj}\,r_1\,c\,v_1,v_2)$\\ + $\textit{inj}\,(r_1 \cdot r_2)\,c\,Right(v)$ & $\dn$ & $Seq(\textit{mkeps}(r_1),\textit{inj}\,r_2\,c\,v)$\\ + $\textit{inj}\,(r^*)\,c\,Seq(v,Stars\,vs)$ & $\dn$ & $Stars((\textit{inj}\,r\,c\,v)\,::\,vs)$\\ +\end{tabular} +\end{center} + +\noindent This definition is by recursion on the ``shape'' of regular +expressions and values. + + +\subsection{Simplification of Regular Expressions} + +The main drawback of building successive derivatives according +to Brzozowski's definition is that they can grow very quickly in size. +This is mainly due to the fact that the derivative operation generates +often ``useless'' $\ZERO$s and $\ONE$s in derivatives. As a result, if +implemented naively both algorithms by Brzozowski and by Sulzmann and Lu +are excruciatingly slow. For example when starting with the regular +expression $(a + aa)^*$ and building 12 successive derivatives +w.r.t.~the character $a$, one obtains a derivative regular expression +with more than 8000 nodes (when viewed as a tree). Operations like +$\textit{der}$ and $\nullable$ need to traverse such trees and +consequently the bigger the size of the derivative the slower the +algorithm. + +Fortunately, one can simplify regular expressions after each derivative +step. Various simplifications of regular expressions are possible, such +as the simplification of $\ZERO + r$, $r + \ZERO$, $\ONE\cdot r$, $r +\cdot \ONE$, and $r + r$ to just $r$. These simplifications do not +affect the answer for whether a regular expression matches a string or +not, but fortunately also do not affect the POSIX strategy of how +regular expressions match strings---although the latter is much harder +to establish. Some initial results in this regard have been +obtained in \cite{AusafDyckhoffUrban2016}. + +If we want the size of derivatives in Sulzmann and Lu's algorithm to +stay below this bound, we would need more aggressive simplifications. +Essentially we need to delete useless $\ZERO$s and $\ONE$s, as well as +deleting duplicates whenever possible. For example, the parentheses in +$(a+b) \cdot c + bc$ can be opened up to get $a\cdot c + b \cdot c + b +\cdot c$, and then simplified to just $a \cdot c + b \cdot c$. Another +example is simplifying $(a^*+a) + (a^*+ \ONE) + (a +\ONE)$ to just +$a^*+a+\ONE$. Adding these more aggressive simplification rules helps us +to achieve the same size bound as that of the partial derivatives. + +In order to implement the idea of ``spilling out alternatives'' and to +make them compatible with the $\text{inj}$-mechanism, we use +\emph{bitcodes}. Bits and bitcodes (lists of bits) are just: + + +\begin{center} + $b ::= S \mid Z \qquad +bs ::= [] \mid b:bs +$ +\end{center} + +\noindent +The $S$ and $Z$ are arbitrary names for the bits in order to avoid +confusion with the regular expressions $\ZERO$ and $\ONE$. Bitcodes (or +bit-lists) can be used to encode values (or incomplete values) in a +compact form. This can be straightforwardly seen in the following +coding function from values to bitcodes: + +\begin{center} +\begin{tabular}{lcl} + $\textit{code}(\Empty)$ & $\dn$ & $[]$\\ + $\textit{code}(\Char\,c)$ & $\dn$ & $[]$\\ + $\textit{code}(\Left\,v)$ & $\dn$ & $\Z :: code(v)$\\ + $\textit{code}(\Right\,v)$ & $\dn$ & $\S :: code(v)$\\ + $\textit{code}(\Seq\,v_1\,v_2)$ & $\dn$ & $code(v_1) \,@\, code(v_2)$\\ + $\textit{code}(\Stars\,[])$ & $\dn$ & $[\Z]$\\ + $\textit{code}(\Stars\,(v\!::\!vs))$ & $\dn$ & $\S :: code(v) \;@\; + code(\Stars\,vs)$ +\end{tabular} +\end{center} + +\noindent +Here $\textit{code}$ encodes a value into a bitcodes by converting +$\Left$ into $\Z$, $\Right$ into $\S$, the start point of a non-empty +star iteration into $\S$, and the border where a local star terminates +into $\Z$. This coding is lossy, as it throws away the information about +characters, and also does not encode the ``boundary'' between two +sequence values. Moreover, with only the bitcode we cannot even tell +whether the $\S$s and $\Z$s are for $\Left/\Right$ or $\Stars$. The +reason for choosing this compact way of storing information is that the +relatively small size of bits can be easily manipulated and ``moved +around'' in a regular expression. In order to recover values, we will +need the corresponding regular expression as an extra information. This +means the decoding function is defined as: + + +%\begin{definition}[Bitdecoding of Values]\mbox{} +\begin{center} +\begin{tabular}{@{}l@{\hspace{1mm}}c@{\hspace{1mm}}l@{}} + $\textit{decode}'\,bs\,(\ONE)$ & $\dn$ & $(\Empty, bs)$\\ + $\textit{decode}'\,bs\,(c)$ & $\dn$ & $(\Char\,c, bs)$\\ + $\textit{decode}'\,(\Z\!::\!bs)\;(r_1 + r_2)$ & $\dn$ & + $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}\; + (\Left\,v, bs_1)$\\ + $\textit{decode}'\,(\S\!::\!bs)\;(r_1 + r_2)$ & $\dn$ & + $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_2\;\textit{in}\; + (\Right\,v, bs_1)$\\ + $\textit{decode}'\,bs\;(r_1\cdot r_2)$ & $\dn$ & + $\textit{let}\,(v_1, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}$\\ + & & $\textit{let}\,(v_2, bs_2) = \textit{decode}'\,bs_1\,r_2$\\ + & & \hspace{35mm}$\textit{in}\;(\Seq\,v_1\,v_2, bs_2)$\\ + $\textit{decode}'\,(\Z\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\ + $\textit{decode}'\,(\S\!::\!bs)\,(r^*)$ & $\dn$ & + $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r\;\textit{in}$\\ + & & $\textit{let}\,(\Stars\,vs, bs_2) = \textit{decode}'\,bs_1\,r^*$\\ + & & \hspace{35mm}$\textit{in}\;(\Stars\,v\!::\!vs, bs_2)$\bigskip\\ + + $\textit{decode}\,bs\,r$ & $\dn$ & + $\textit{let}\,(v, bs') = \textit{decode}'\,bs\,r\;\textit{in}$\\ + & & $\textit{if}\;bs' = []\;\textit{then}\;\textit{Some}\,v\; + \textit{else}\;\textit{None}$ +\end{tabular} +\end{center} +%\end{definition} + +Sulzmann and Lu's integrated the bitcodes into regular expressions to +create annotated regular expressions \cite{Sulzmann2014}. +\emph{Annotated regular expressions} are defined by the following +grammar:%\comment{ALTS should have an $as$ in the definitions, not just $a_1$ and $a_2$} + +\begin{center} +\begin{tabular}{lcl} + $\textit{a}$ & $::=$ & $\textit{ZERO}$\\ + & $\mid$ & $\textit{ONE}\;\;bs$\\ + & $\mid$ & $\textit{CHAR}\;\;bs\,c$\\ + & $\mid$ & $\textit{ALTS}\;\;bs\,as$\\ + & $\mid$ & $\textit{SEQ}\;\;bs\,a_1\,a_2$\\ + & $\mid$ & $\textit{STAR}\;\;bs\,a$ +\end{tabular} +\end{center} +%(in \textit{ALTS}) + +\noindent +where $bs$ stands for bitcodes, $a$ for $\bold{a}$nnotated regular +expressions and $as$ for a list of annotated regular expressions. +The alternative constructor($\textit{ALTS}$) has been generalized to +accept a list of annotated regular expressions rather than just 2. +We will show that these bitcodes encode information about +the (POSIX) value that should be generated by the Sulzmann and Lu +algorithm. + + +To do lexing using annotated regular expressions, we shall first +transform the usual (un-annotated) regular expressions into annotated +regular expressions. This operation is called \emph{internalisation} and +defined as follows: + +%\begin{definition} +\begin{center} +\begin{tabular}{lcl} + $(\ZERO)^\uparrow$ & $\dn$ & $\textit{ZERO}$\\ + $(\ONE)^\uparrow$ & $\dn$ & $\textit{ONE}\,[]$\\ + $(c)^\uparrow$ & $\dn$ & $\textit{CHAR}\,[]\,c$\\ + $(r_1 + r_2)^\uparrow$ & $\dn$ & + $\textit{ALTS}\;[]\,List((\textit{fuse}\,[\Z]\,r_1^\uparrow),\, + (\textit{fuse}\,[\S]\,r_2^\uparrow))$\\ + $(r_1\cdot r_2)^\uparrow$ & $\dn$ & + $\textit{SEQ}\;[]\,r_1^\uparrow\,r_2^\uparrow$\\ + $(r^*)^\uparrow$ & $\dn$ & + $\textit{STAR}\;[]\,r^\uparrow$\\ +\end{tabular} +\end{center} +%\end{definition} + +\noindent +We use up arrows here to indicate that the basic un-annotated regular +expressions are ``lifted up'' into something slightly more complex. In the +fourth clause, $\textit{fuse}$ is an auxiliary function that helps to +attach bits to the front of an annotated regular expression. Its +definition is as follows: + +\begin{center} +\begin{tabular}{lcl} + $\textit{fuse}\;bs\,(\textit{ZERO})$ & $\dn$ & $\textit{ZERO}$\\ + $\textit{fuse}\;bs\,(\textit{ONE}\,bs')$ & $\dn$ & + $\textit{ONE}\,(bs\,@\,bs')$\\ + $\textit{fuse}\;bs\,(\textit{CHAR}\,bs'\,c)$ & $\dn$ & + $\textit{CHAR}\,(bs\,@\,bs')\,c$\\ + $\textit{fuse}\;bs\,(\textit{ALTS}\,bs'\,as)$ & $\dn$ & + $\textit{ALTS}\,(bs\,@\,bs')\,as$\\ + $\textit{fuse}\;bs\,(\textit{SEQ}\,bs'\,a_1\,a_2)$ & $\dn$ & + $\textit{SEQ}\,(bs\,@\,bs')\,a_1\,a_2$\\ + $\textit{fuse}\;bs\,(\textit{STAR}\,bs'\,a)$ & $\dn$ & + $\textit{STAR}\,(bs\,@\,bs')\,a$ +\end{tabular} +\end{center} + +\noindent +After internalising the regular expression, we perform successive +derivative operations on the annotated regular expressions. This +derivative operation is the same as what we had previously for the +basic regular expressions, except that we beed to take care of +the bitcodes: + + %\begin{definition}{bder} +\begin{center} + \begin{tabular}{@{}lcl@{}} + $(\textit{ZERO})\,\backslash c$ & $\dn$ & $\textit{ZERO}$\\ + $(\textit{ONE}\;bs)\,\backslash c$ & $\dn$ & $\textit{ZERO}$\\ + $(\textit{CHAR}\;bs\,d)\,\backslash c$ & $\dn$ & + $\textit{if}\;c=d\; \;\textit{then}\; + \textit{ONE}\;bs\;\textit{else}\;\textit{ZERO}$\\ + $(\textit{ALTS}\;bs\,as)\,\backslash c$ & $\dn$ & + $\textit{ALTS}\;bs\,(as.map(\backslash c))$\\ + $(\textit{SEQ}\;bs\,a_1\,a_2)\,\backslash c$ & $\dn$ & + $\textit{if}\;\textit{bnullable}\,a_1$\\ + & &$\textit{then}\;\textit{ALTS}\,bs\,List((\textit{SEQ}\,[]\,(a_1\,\backslash c)\,a_2),$\\ + & &$\phantom{\textit{then}\;\textit{ALTS}\,bs\,}(\textit{fuse}\,(\textit{bmkeps}\,a_1)\,(a_2\,\backslash c)))$\\ + & &$\textit{else}\;\textit{SEQ}\,bs\,(a_1\,\backslash c)\,a_2$\\ + $(\textit{STAR}\,bs\,a)\,\backslash c$ & $\dn$ & + $\textit{SEQ}\;bs\,(\textit{fuse}\, [\Z] (r\,\backslash c))\, + (\textit{STAR}\,[]\,r)$ +\end{tabular} +\end{center} +%\end{definition} + +\noindent +For instance, when we unfold $\textit{STAR} \; bs \; a$ into a sequence, +we need to attach an additional bit $Z$ to the front of $r \backslash c$ +to indicate that there is one more star iteration. Also the $SEQ$ clause +is more subtle---when $a_1$ is $\textit{bnullable}$ (here +\textit{bnullable} is exactly the same as $\textit{nullable}$, except +that it is for annotated regular expressions, therefore we omit the +definition). Assume that $bmkeps$ correctly extracts the bitcode for how +$a_1$ matches the string prior to character $c$ (more on this later), +then the right branch of $ALTS$, which is $fuse \; bmkeps \; a_1 (a_2 +\backslash c)$ will collapse the regular expression $a_1$(as it has +already been fully matched) and store the parsing information at the +head of the regular expression $a_2 \backslash c$ by fusing to it. The +bitsequence $bs$, which was initially attached to the head of $SEQ$, has +now been elevated to the top-level of $ALTS$, as this information will be +needed whichever way the $SEQ$ is matched---no matter whether $c$ belongs +to $a_1$ or $ a_2$. After building these derivatives and maintaining all +the lexing information, we complete the lexing by collecting the +bitcodes using a generalised version of the $\textit{mkeps}$ function +for annotated regular expressions, called $\textit{bmkeps}$: + + +%\begin{definition}[\textit{bmkeps}]\mbox{} +\begin{center} +\begin{tabular}{lcl} + $\textit{bmkeps}\,(\textit{ONE}\;bs)$ & $\dn$ & $bs$\\ + $\textit{bmkeps}\,(\textit{ALTS}\;bs\,a::as)$ & $\dn$ & + $\textit{if}\;\textit{bnullable}\,a$\\ + & &$\textit{then}\;bs\,@\,\textit{bmkeps}\,a$\\ + & &$\textit{else}\;bs\,@\,\textit{bmkeps}\,(\textit{ALTS}\;bs\,as)$\\ + $\textit{bmkeps}\,(\textit{SEQ}\;bs\,a_1\,a_2)$ & $\dn$ & + $bs \,@\,\textit{bmkeps}\,a_1\,@\, \textit{bmkeps}\,a_2$\\ + $\textit{bmkeps}\,(\textit{STAR}\;bs\,a)$ & $\dn$ & + $bs \,@\, [\S]$ +\end{tabular} +\end{center} +%\end{definition} + +\noindent +This function completes the value information by travelling along the +path of the regular expression that corresponds to a POSIX value and +collecting all the bitcodes, and using $S$ to indicate the end of star +iterations. If we take the bitcodes produced by $\textit{bmkeps}$ and +decode them, we get the value we expect. The corresponding lexing +algorithm looks as follows: + +\begin{center} +\begin{tabular}{lcl} + $\textit{blexer}\;r\,s$ & $\dn$ & + $\textit{let}\;a = (r^\uparrow)\backslash s\;\textit{in}$\\ + & & $\;\;\textit{if}\; \textit{bnullable}(a)$\\ + & & $\;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,a)\,r$\\ + & & $\;\;\textit{else}\;\textit{None}$ +\end{tabular} +\end{center} + +\noindent +In this definition $\_\backslash s$ is the generalisation of the derivative +operation from characters to strings (just like the derivatives for un-annotated +regular expressions). + +The main point of the bitcodes and annotated regular expressions is that +we can apply rather aggressive (in terms of size) simplification rules +in order to keep derivatives small. We have developed such +``aggressive'' simplification rules and generated test data that show +that the expected bound can be achieved. Obviously we could only +partially cover the search space as there are infinitely many regular +expressions and strings. + +One modification we introduced is to allow a list of annotated regular +expressions in the \textit{ALTS} constructor. This allows us to not just +delete unnecessary $\ZERO$s and $\ONE$s from regular expressions, but +also unnecessary ``copies'' of regular expressions (very similar to +simplifying $r + r$ to just $r$, but in a more general setting). Another +modification is that we use simplification rules inspired by Antimirov's +work on partial derivatives. They maintain the idea that only the first +``copy'' of a regular expression in an alternative contributes to the +calculation of a POSIX value. All subsequent copies can be pruned away from +the regular expression. A recursive definition of our simplification function +that looks somewhat similar to our Scala code is given below: +%\comment{Use $\ZERO$, $\ONE$ and so on. +%Is it $ALTS$ or $ALTS$?}\\ + +\begin{center} + \begin{tabular}{@{}lcl@{}} + + $\textit{simp} \; (\textit{SEQ}\;bs\,a_1\,a_2)$ & $\dn$ & $ (\textit{simp} \; a_1, \textit{simp} \; a_2) \; \textit{match} $ \\ + &&$\quad\textit{case} \; (\ZERO, \_) \Rightarrow \ZERO$ \\ + &&$\quad\textit{case} \; (\_, \ZERO) \Rightarrow \ZERO$ \\ + &&$\quad\textit{case} \; (\ONE, a_2') \Rightarrow \textit{fuse} \; bs \; a_2'$ \\ + &&$\quad\textit{case} \; (a_1', \ONE) \Rightarrow \textit{fuse} \; bs \; a_1'$ \\ + &&$\quad\textit{case} \; (a_1', a_2') \Rightarrow \textit{SEQ} \; bs \; a_1' \; a_2'$ \\ + + $\textit{simp} \; (\textit{ALTS}\;bs\,as)$ & $\dn$ & $\textit{distinct}( \textit{flatten} ( \textit{map simp as})) \; \textit{match} $ \\ + &&$\quad\textit{case} \; [] \Rightarrow \ZERO$ \\ + &&$\quad\textit{case} \; a :: [] \Rightarrow \textit{fuse bs a}$ \\ + &&$\quad\textit{case} \; as' \Rightarrow \textit{ALTS}\;bs\;as'$\\ + + $\textit{simp} \; a$ & $\dn$ & $\textit{a} \qquad \textit{otherwise}$ +\end{tabular} +\end{center} + +\noindent +The simplification does a pattern matching on the regular expression. +When it detected that the regular expression is an alternative or +sequence, it will try to simplify its children regular expressions +recursively and then see if one of the children turn into $\ZERO$ or +$\ONE$, which might trigger further simplification at the current level. +The most involved part is the $\textit{ALTS}$ clause, where we use two +auxiliary functions $\textit{flatten}$ and $\textit{distinct}$ to open up nested +$\textit{ALTS}$ and reduce as many duplicates as possible. Function +$\textit{distinct}$ keeps the first occurring copy only and remove all later ones +when detected duplicates. Function $\textit{flatten}$ opens up nested \textit{ALTS}. +Its recursive definition is given below: + + \begin{center} + \begin{tabular}{@{}lcl@{}} + $\textit{flatten} \; (\textit{ALTS}\;bs\,as) :: as'$ & $\dn$ & $(\textit{map} \; + (\textit{fuse}\;bs)\; \textit{as}) \; @ \; \textit{flatten} \; as' $ \\ + $\textit{flatten} \; \textit{ZERO} :: as'$ & $\dn$ & $ \textit{flatten} \; as' $ \\ + $\textit{flatten} \; a :: as'$ & $\dn$ & $a :: \textit{flatten} \; as'$ \quad(otherwise) +\end{tabular} +\end{center} + +\noindent +Here $\textit{flatten}$ behaves like the traditional functional programming flatten +function, except that it also removes $\ZERO$s. Or in terms of regular expressions, it +removes parentheses, for example changing $a+(b+c)$ into $a+b+c$. + +Suppose we apply simplification after each derivative step, and view +these two operations as an atomic one: $a \backslash_{simp}\,c \dn +\textit{simp}(a \backslash c)$. Then we can use the previous natural +extension from derivative w.r.t.~character to derivative +w.r.t.~string:%\comment{simp in the [] case?} + +\begin{center} +\begin{tabular}{lcl} +$r \backslash_{simp} (c\!::\!s) $ & $\dn$ & $(r \backslash_{simp}\, c) \backslash_{simp}\, s$ \\ +$r \backslash_{simp} [\,] $ & $\dn$ & $r$ +\end{tabular} +\end{center} + +\noindent +we obtain an optimised version of the algorithm: + + \begin{center} +\begin{tabular}{lcl} + $\textit{blexer\_simp}\;r\,s$ & $\dn$ & + $\textit{let}\;a = (r^\uparrow)\backslash_{simp}\, s\;\textit{in}$\\ + & & $\;\;\textit{if}\; \textit{bnullable}(a)$\\ + & & $\;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,a)\,r$\\ + & & $\;\;\textit{else}\;\textit{None}$ +\end{tabular} +\end{center} + +\noindent +This algorithm keeps the regular expression size small, for example, +with this simplification our previous $(a + aa)^*$ example's 8000 nodes +will be reduced to just 6 and stays constant, no matter how long the +input string is. + + + \section{Introduction} While we believe derivatives of regular expressions, written @@ -287,6 +865,67 @@ $ \flex \; r\; id\; (s@[c])\; v = \textit{decode} \;( \textit{retrieve}\; (\rup \backslash s) \; (\inj \; (r\backslash s) \;c\;v)\;) r$ \end{center} + +\subsubsection{Retrieve Function} +The crucial point is to find the +$\textit{POSIX}$ information of a regular expression and how it is modified, +augmented and propagated +during simplification in parallel with the regular expression that +has not been simplified in the subsequent derivative operations. To aid this, +we use the helper function retrieve described by Sulzmann and Lu: +\begin{center} +\begin{tabular}{@{}l@{\hspace{2mm}}c@{\hspace{2mm}}l@{}} + $\textit{retrieve}\,(\textit{ONE}\,bs)\,\Empty$ & $\dn$ & $bs$\\ + $\textit{retrieve}\,(\textit{CHAR}\,bs\,c)\,(\Char\,d)$ & $\dn$ & $bs$\\ + $\textit{retrieve}\,(\textit{ALTS}\,bs\,a::as)\,(\Left\,v)$ & $\dn$ & + $bs \,@\, \textit{retrieve}\,a\,v$\\ + $\textit{retrieve}\,(\textit{ALTS}\,bs\,a::as)\,(\Right\,v)$ & $\dn$ & + $bs \,@\, \textit{retrieve}\,(\textit{ALTS}\,bs\,as)\,v$\\ + $\textit{retrieve}\,(\textit{SEQ}\,bs\,a_1\,a_2)\,(\Seq\,v_1\,v_2)$ & $\dn$ & + $bs \,@\,\textit{retrieve}\,a_1\,v_1\,@\, \textit{retrieve}\,a_2\,v_2$\\ + $\textit{retrieve}\,(\textit{STAR}\,bs\,a)\,(\Stars\,[])$ & $\dn$ & + $bs \,@\, [\S]$\\ + $\textit{retrieve}\,(\textit{STAR}\,bs\,a)\,(\Stars\,(v\!::\!vs))$ & $\dn$ &\\ + \multicolumn{3}{l}{ + \hspace{3cm}$bs \,@\, [\Z] \,@\, \textit{retrieve}\,a\,v\,@\, + \textit{retrieve}\,(\textit{STAR}\,[]\,a)\,(\Stars\,vs)$}\\ +\end{tabular} +\end{center} +%\comment{Did not read further}\\ +This function assembles the bitcode +%that corresponds to a lexical value for how +%the current derivative matches the suffix of the string(the characters that +%have not yet appeared, but will appear as the successive derivatives go on. +%How do we get this "future" information? By the value $v$, which is +%computed by a pass of the algorithm that uses +%$inj$ as described in the previous section). +using information from both the derivative regular expression and the +value. Sulzmann and Lu poroposed this function, but did not prove +anything about it. Ausaf and Urban used it to connect the bitcoded +algorithm to the older algorithm by the following equation: + + \begin{center} $inj \;a\; c \; v = \textit{decode} \; (\textit{retrieve}\; + (r^\uparrow)\backslash_{simp} \,c)\,v)$ + \end{center} + +\noindent +whereby $r^\uparrow$ stands for the internalised version of $r$. Ausaf +and Urban also used this fact to prove the correctness of bitcoded +algorithm without simplification. Our purpose of using this, however, +is to establish + +\begin{center} +$ \textit{retrieve} \; +a \; v \;=\; \textit{retrieve} \; (\textit{simp}\,a) \; v'.$ +\end{center} +The idea is that using $v'$, a simplified version of $v$ that had gone +through the same simplification step as $\textit{simp}(a)$, we are able +to extract the bitcode that gives the same parsing information as the +unsimplified one. However, we noticed that constructing such a $v'$ +from $v$ is not so straightforward. The point of this is that we might +be able to finally bridge the gap by proving + + \noindent\rule[1.5ex]{\linewidth}{4pt} There is no mention of retrieve yet .... this is the second trick in the existing proof. I am not sure whether you need to explain annotated regular @@ -356,6 +995,7 @@ $f$ rectifies $r\backslash s$ so the value $\mkeps(f(r\backslash s))$ becomes something simpler to make the retrieve function defined.\\ +\subsubsection{Ways to Rectify Value} One way to do this is to prove the following: \begin{center} $\retrieve \; \rup\backslash_{simp} s \; \mkeps(\simp(r\backslash s))=\textit{retrieve} \; \rup\backslash s \; \mkeps(r\backslash s)$ @@ -374,6 +1014,7 @@ regular expression(just like "erasing" the bits). Its definition is omitted. $\rup\backslash_{simp} \, s$ and $\simp(\rup\backslash s)$ are very closely related, but not identical. +\subsubsection{Example for $\rup\backslash_{simp} \, s \neq \simp(\rup\backslash s)$} For example, let $r$ be the regular expression $(a+b)(a+a*)$ and $s$ be the string $aa$, then both $\erase (\rup\backslash_{simp} \, s)$ and $\erase (\simp (\rup\backslash s))$ @@ -387,43 +1028,46 @@ $\simp(\rup\backslash s)$ is equal to $(_{00}\ONE +_{011}a^*)$ \end{center} \noindent -For the sake of visual simplicity, we use numbers to denote the bits +(For the sake of visual simplicity, we use numbers to denote the bits in bitcodes as we have previously defined for annotated regular expressions. $\S$ is replaced by -subscript $_1$ and $\Z$ by $_0$. +subscript $_1$ and $\Z$ by $_0$.) -Two "rules" might be inferred from the above example. +What makes the difference? + +%Two "rules" might be inferred from the above example. -First, after erasing the bits the two regular expressions -are exactly the same: both become $1+a^*$. Here the -function $\simp$ exhibits the "one in the end equals many times -at the front" -property: one simplification in the end causes the -same regular expression structure as -successive simplifications done alongside derivatives. -$\rup\backslash_{simp} \, s$ unfolds to -$\simp((\simp(r\backslash a))\backslash a)$ -and $\simp(\rup\backslash s)$ unfolds to -$\simp((r\backslash a)\backslash a)$. The one simplification -in the latter causes the resulting regular expression to -become $1+a^*$, exactly the same as the former with -two simplifications. +%First, after erasing the bits the two regular expressions +%are exactly the same: both become $1+a^*$. Here the +%function $\simp$ exhibits the "one in the end equals many times +%at the front" +%property: one simplification in the end causes the +%same regular expression structure as +%successive simplifications done alongside derivatives. +%$\rup\backslash_{simp} \, s$ unfolds to +%$\simp((\simp(r\backslash a))\backslash a)$ +%and $\simp(\rup\backslash s)$ unfolds to +%$\simp((r\backslash a)\backslash a)$. The one simplification +%in the latter causes the resulting regular expression to +%become $1+a^*$, exactly the same as the former with +%two simplifications. -Second, the bit-codes are different, but they are essentially -the same: if we push the outmost bits ${\bf_0}(_0\ONE +_{11}a^*)$ of $\rup\backslash_{simp} \, s$ -inside then we get $(_{00}\ONE +_{011}a^*)$, exactly the -same as that of $\rup\backslash \, s$. And this difference -does not matter when we try to apply $\bmkeps$ or $\retrieve$ -to it. This seems a good news if we want to use $\retrieve$ -to prove things. +%Second, the bit-codes are different, but they are essentially +%the same: if we push the outmost bits ${\bf_0}(_0\ONE +_{11}a^*)$ of $\rup\backslash_{simp} \, s$ +%inside then we get $(_{00}\ONE +_{011}a^*)$, exactly the +%same as that of $\rup\backslash \, s$. And this difference +%does not matter when we try to apply $\bmkeps$ or $\retrieve$ +%to it. This seems a good news if we want to use $\retrieve$ +%to prove things. -If we look into the difference above, we could see that the -difference is not fundamental: the bits are just being moved -around in a way that does not hurt the correctness. +%If we look into the difference above, we could see that the +%difference is not fundamental: the bits are just being moved +%around in a way that does not hurt the correctness. During the first derivative operation, $\rup\backslash a=(_0\ONE + \ZERO)(_0a + _1a^*)$ is -in the form of a sequence regular expression with the first -part being nullable. +in the form of a sequence regular expression with +two components, the first +one $\ONE + \ZERO$ being nullable. Recall the simplification function definition: \begin{center} \begin{tabular}{@{}lcl@{}} @@ -445,11 +1089,24 @@ \end{center} \noindent -If we call $\simp$ on $\rup$, just as $\backslash_{simp}$ + +and the difinition of $\flatten$: + \begin{center} + \begin{tabular}{c c c} + $\flatten \; []$ & $\dn$ & $[]$\\ + $\flatten \; \ZERO::rs$ & $\dn$ & $rs$\\ + $\flatten \;(_{\textit{bs}_1}\oplus \textit{rs}_1 ::rs)$ & $\dn$ & $(\map \, (\fuse \, \textit{bs}_1) \,\textit{rs}_1) ::: \flatten(rs)$\\ + $\flatten \; r :: rs$ & $\dn$ & $r::\flatten(rs)$ + \end{tabular} + \end{center} + + \noindent +If we call $\simp$ on $\rup\backslash a$, just as $\backslash_{simp}$ requires, then we would go throught the third clause of the sequence case:$\quad\textit{case} \; (\ONE, a_2') \Rightarrow \textit{fuse} \; bs \; a_2'$. -The $\ZERO$ of $(_0\ONE + \ZERO)$ is simplified away and -$_0\ONE$ merged into $_0a + _1a^*$ by simply +The $\ZERO$ of $(_0\ONE + \ZERO)$ is thrown away +by $\flatten$ and +$_0\ONE$ merged into $(_0a + _1a^*)$ by simply putting its bits($_0$) to the front of the second component: ${\bf_0}(_0a + _1a^*)$. After a second derivative operation, @@ -459,25 +1116,56 @@ $, and this simplifies to $_0(_0 \ONE + _{11} a^*)$ by the third clause of the alternative case: $\quad\textit{case} \; as' \Rightarrow \textit{ALTS}\;bs\;as'$. -The outmost bit $_0$ remains unchanged and stays with -the outmost regular expression. However, things are a bit -different when it comes to $\simp(\rup\backslash \, s)$, because -without simplification, first term of the sequence -$\rup\backslash a=(_0\ONE + \ZERO)(_0a + _1a^*)$ -is not merged into the second component -and is nullable. +The outmost bit $_0$ stays with +the outmost regular expression, rather than being fused to +its child regular expressions, as what we will later see happens +to $\simp(\rup\backslash \, s)$. +If we choose to not simplify in the midst of derivative operations, +but only do it at the end after the string has been exhausted, +namely, $\simp(\rup\backslash \, s)=\simp((\rup\backslash a)\backslash a)$, +then at the {\bf second} derivative of +$(\rup\backslash a)\bf{\backslash a}$ +we will go throught the clause of $\backslash$: +\begin{center} +\begin{tabular}{lcl} +$(\textit{SEQ}\;bs\,a_1\,a_2)\,\backslash c$ & $\dn$ & + $(when \; \textit{bnullable}\,a_1)$\\ + & &$\textit{ALTS}\,bs\,List(\;\;(\textit{SEQ}\,[]\,(a_1\,\backslash c)\,a_2),$\\ + & &$(\textit{fuse}\,(\textit{bmkeps}\,a_1)\,(a_2\,\backslash c))\;\;)$\\ +\end{tabular} +\end{center} + +because +$\rup\backslash a = (_0\ONE + \ZERO)(_0a + _1a^*)$ +is a sequence +with the first component being nullable +(unsimplified, unlike the first round of running$\backslash_{simp}$). Therefore $((_0\ONE + \ZERO)(_0a + _1a^*))\backslash a$ splits into $([(\ZERO + \ZERO)\cdot(_0a + _1a^*)] + _0( _0\ONE + _1[_1\ONE \cdot a^*]))$. After these two successive derivatives without simplification, we apply $\simp$ to this regular expression, which goes through -the alternative clause, and each component of $([(\ZERO + \ZERO)\cdot(_0a + _1a^*)] + _0( _0\ONE + _1[_1\ONE \cdot a^*]))$ will be simplified, giving us the list:$[\ZERO, _0(_0\ONE + _{11}a^*]$,this -list is then flattened--for -$([(\ZERO + \ZERO)\cdot(_0a + _1a^*)]$ it will be simplified into $\ZERO$ -and then thrown away by $\textit{flatten}$; $ _0( _0\ONE + _1[_1\ONE \cdot a^*]))$ - becomes $ _{00}\ONE + _{011}a^*]))$ because flatten opens up the alternative - $\ONE + a^*$ and fuses the front bit(s) $_0$ to the front of $_0\ONE $ and $_{11}a^*$ - and get $_{00}$ and $_{011}$. - %CONSTRUCTION SITE +the alternative clause, and each component of +$([(\ZERO + \ZERO)\cdot(_0a + _1a^*)] + _0( _0\ONE + _1[_1\ONE \cdot a^*]))$ +will be simplified, giving us the list:$[\ZERO, _0(_0\ONE + _{11}a^*)]$ +This list is then "flattened"--$\ZERO$ will be +thrown away by $\textit{flatten}$; $ _0(_0\ONE + _{11}a^*)$ +is opened up to make the list consisting of two separate elements +$_{00}\ONE$ and $_{011}a^*$, note that $flatten$ +$\fuse$s the bit(s) $_0$ to the front of $_0\ONE $ and $_{11}a^*$. +In a nutshell, the order of simplification causes +the bits to be moved differently. + + \subsubsection{A Failed Attempt To Remedy the Problem Above} +A simple class of regular expressions and strings +pairs can be deduced +from the above example which +trigger the difference between +$\rup\backslash_{simp} \, s$ +and $\simp(\rup\backslash s)$: +\[ +D = (c_1c_2, \{r_1\cdot r_2 \mid \text{$ c_1\in L(r_1), r_1 \; not \; \nullable, c_2 \in L(r_2), +r_2 \backslash c_2$ is of shape alternative and $c_1c_2 \notin L(r_1)$}\}) +\] and we can use this to construct a set of examples based on this type of behaviour of two operations: $r_1$ @@ -487,6 +1175,16 @@ same sequence of derivative operations regardless of whether we did simplification along the way. + One way would be to give a function that calls + %CONSTRUCTION SITE +fuse is the culprit: it causes the order in which alternatives +are opened up to be remembered and finally the difference +appear in $\simp(\rup \backslash s)$ and $\rup \backslash{simp} \,s$. +but we have to use them as they are essential in the simplification: +flatten needs them. + + + However, without erase the above equality does not hold: for the regular expression $(a+b)(a+a*)$,