diff -r fd068f39ac23 -r ce4e5151a836 ChengsongTanPhdThesis/Chapters/Inj.tex --- a/ChengsongTanPhdThesis/Chapters/Inj.tex Mon Sep 12 23:32:18 2022 +0200 +++ b/ChengsongTanPhdThesis/Chapters/Inj.tex Thu Sep 22 00:31:09 2022 +0100 @@ -516,7 +516,7 @@ Assuming the string is given as a sequence of characters, say $c_0c_1..c_n$, this algorithm presented graphically is as follows: -\begin{equation}\label{graph:successive_ders} +\begin{equation}\label{matcher} \begin{tikzcd} r_0 \arrow[r, "\backslash c_0"] & r_1 \arrow[r, "\backslash c_1"] & r_2 \arrow[r, dashed] & r_n \arrow[r,"\textit{nullable}?"] & @@ -535,13 +535,13 @@ lemma \ref{derDer}. \end{proof} \noindent +\begin{figure} \begin{center} -\begin{figure} \begin{tikzpicture} \begin{axis}[ xlabel={$n$}, ylabel={time in secs}, - ymode = log, + %ymode = log, legend entries={Naive Matcher}, legend pos=north west, legend cell align=left] @@ -551,8 +551,8 @@ \caption{Matching the regular expression $(a^*)^*b$ against strings of the form $\protect\underbrace{aa\ldots a}_\text{n \textit{a}s} $ using Brzozowski's original algorithm}\label{NaiveMatcher} +\end{center} \end{figure} -\end{center} \noindent If we implement the above algorithm naively, however, the algorithm can be excruciatingly slow, as shown in @@ -600,8 +600,8 @@ \begin{axis}[ xlabel={$n$}, ylabel={time in secs}, - ymode = log, - xmode = log, + %ymode = log, + %xmode = log, grid = both, legend entries={Matcher With Simp}, legend pos=north west, @@ -702,6 +702,8 @@ is to make sure that for a given pair of regular expression $r$ and string $s$, the number of values satisfying $|v| = s$ and $\vdash v:r$ is finite. +This additional condition was +imposed by Ausaf and Urban to make their proofs easier. Given the same string and regular expression, there can be multiple values for it. For example, both $\vdash \Seq(\Left \; ab)(\Right \; c):(ab+a)(bc+c)$ and @@ -717,9 +719,8 @@ The $\POSIX$ value $v$ for a regular expression $r$ and string $s$, denoted as $(s, r) \rightarrow v$, can be specified in the following set of rules\footnote{The names of the rules are used -as they were originally given in \cite{AusafDyckhoffUrban2016}}: -\noindent -\begin{figure} +as they were originally given in \cite{AusafDyckhoffUrban2016} }: +\begin{figure}[H] \begin{mathpar} \inferrule[P1]{\mbox{}}{([], \ONE) \rightarrow \Empty} @@ -741,7 +742,13 @@ s_1@s_3 \in L \; r \land s_4 \in L \; r^*}{(s_1@s_2, r^*)\rightarrow \Stars \; (v::vs)} \end{mathpar} -\caption{POSIX Lexing Rules} +\caption{The inductive POSIX Lexing Rules defined by Ausaf, Dyckhoff and Urban \cite{AusafDyckhoffUrban2016}. +The ternary relation, written $(s, r) \rightarrow v$, formalises the POSIX constraints on the +value $v$ given a string $s$ and +regular expression $r$. +For example, this specification says that all matches for an alternative +must always prefer a left value to a right one. +} \end{figure} \noindent @@ -832,7 +839,7 @@ When we have \[ (s_1, r_1) \rightarrow v_1 \;\, and \;\, - (s_2, r_2) \rightarrow v_2 \;\, and \;\,\\ + (s_2, r_2) \rightarrow v_2 \;\, and \;\, \nexists s_3 \; s_4. s_3 \neq [] \land s_3 @ s_4 = s_2 \land s_1@ s_3 \in L \; r_1 \land s_4 \in L \; r_2 \] @@ -942,14 +949,25 @@ of $s$ (i.e. $s_i = c_i \ldots c_{n-1}$ ) from the previous lexical value $v_{i+1}$. After injecting back $n$ characters, we get the lexical value for how $r_0$ matches $s$. +\begin{figure}[H] +\begin{center} \begin{ceqn} -\begin{equation}\label{graph:inj} \begin{tikzcd} r_0 \arrow[r, dashed] \arrow[d]& r_i \arrow[r, "\backslash c_i"] \arrow[d] & r_{i+1} \arrow[r, dashed] \arrow[d] & r_n \arrow[d, "mkeps" description] \\ v_0 \arrow[u] & v_i \arrow[l, dashed] & v_{i+1} \arrow[l,"inj_{r_i} c_i"] & v_n \arrow[l, dashed] \end{tikzcd} -\end{equation} \end{ceqn} +\end{center} +\caption{The two-phase lexing algorithm by Sulzmann and Lu \cite{AusafDyckhoffUrban2016}, + matching the regular expression $r_0$ and string of the form $[c_0, c_1, \ldots, c_{n-1}]$. + The first phase involves taking successive derivatives w.r.t the characters $c_0$, + $c_1$, and so on. These are the same operations as they have appeared in the matcher + \ref{matcher}. When the final derivative regular expression is nullable (contains the empty string), + then the second phase starts. First $\mkeps$ generates a POSIX value which tells us how $r_n$ matches + the empty string , by always selecting the leftmost + nullable regular expression. After that $\inj$ ``injects'' back the character in reverse order as they + appeared in the string, always preserving POSIXness.}\label{graph:inj} +\end{figure} \noindent $\textit{inj}$ takes three arguments: a regular expression ${r_{i}}$, before the character is chopped off, @@ -1188,7 +1206,7 @@ even with simplification, which is not much better compared with the naive version without any simplifications: \begin{figure}[H] - \centering +\begin{center} \begin{tikzpicture} \begin{axis}[ xlabel={$n$}, @@ -1200,6 +1218,7 @@ \addplot[blue,mark=*, mark options={fill=white}] table {BetterWaterloo1.data}; \end{axis} \end{tikzpicture} +\end{center} \caption{Size of $(a^*\cdot a^*)^*$ against $\protect\underbrace{aa\ldots a}_\text{n \textit{a}s}$} \end{figure}\label{fig:BetterWaterloo}