--- a/ChengsongTanPhdThesis/Chapters/Inj.tex Mon Sep 12 23:32:18 2022 +0200
+++ b/ChengsongTanPhdThesis/Chapters/Inj.tex Thu Sep 22 00:31:09 2022 +0100
@@ -516,7 +516,7 @@
Assuming the string is given as a sequence of characters, say $c_0c_1..c_n$,
this algorithm presented graphically is as follows:
-\begin{equation}\label{graph:successive_ders}
+\begin{equation}\label{matcher}
\begin{tikzcd}
r_0 \arrow[r, "\backslash c_0"] & r_1 \arrow[r, "\backslash c_1"] &
r_2 \arrow[r, dashed] & r_n \arrow[r,"\textit{nullable}?"] &
@@ -535,13 +535,13 @@
lemma \ref{derDer}.
\end{proof}
\noindent
+\begin{figure}
\begin{center}
-\begin{figure}
\begin{tikzpicture}
\begin{axis}[
xlabel={$n$},
ylabel={time in secs},
- ymode = log,
+ %ymode = log,
legend entries={Naive Matcher},
legend pos=north west,
legend cell align=left]
@@ -551,8 +551,8 @@
\caption{Matching the regular expression $(a^*)^*b$ against strings of the form
$\protect\underbrace{aa\ldots a}_\text{n \textit{a}s}
$ using Brzozowski's original algorithm}\label{NaiveMatcher}
+\end{center}
\end{figure}
-\end{center}
\noindent
If we implement the above algorithm naively, however,
the algorithm can be excruciatingly slow, as shown in
@@ -600,8 +600,8 @@
\begin{axis}[
xlabel={$n$},
ylabel={time in secs},
- ymode = log,
- xmode = log,
+ %ymode = log,
+ %xmode = log,
grid = both,
legend entries={Matcher With Simp},
legend pos=north west,
@@ -702,6 +702,8 @@
is to make sure that for a given pair of regular
expression $r$ and string $s$, the number of values
satisfying $|v| = s$ and $\vdash v:r$ is finite.
+This additional condition was
+imposed by Ausaf and Urban to make their proofs easier.
Given the same string and regular expression, there can be
multiple values for it. For example, both
$\vdash \Seq(\Left \; ab)(\Right \; c):(ab+a)(bc+c)$ and
@@ -717,9 +719,8 @@
The $\POSIX$ value $v$ for a regular expression
$r$ and string $s$, denoted as $(s, r) \rightarrow v$, can be specified
in the following set of rules\footnote{The names of the rules are used
-as they were originally given in \cite{AusafDyckhoffUrban2016}}:
-\noindent
-\begin{figure}
+as they were originally given in \cite{AusafDyckhoffUrban2016} }:
+\begin{figure}[H]
\begin{mathpar}
\inferrule[P1]{\mbox{}}{([], \ONE) \rightarrow \Empty}
@@ -741,7 +742,13 @@
s_1@s_3 \in L \; r \land s_4 \in L \; r^*}{(s_1@s_2, r^*)\rightarrow \Stars \;
(v::vs)}
\end{mathpar}
-\caption{POSIX Lexing Rules}
+\caption{The inductive POSIX Lexing Rules defined by Ausaf, Dyckhoff and Urban \cite{AusafDyckhoffUrban2016}.
+The ternary relation, written $(s, r) \rightarrow v$, formalises the POSIX constraints on the
+value $v$ given a string $s$ and
+regular expression $r$.
+For example, this specification says that all matches for an alternative
+must always prefer a left value to a right one.
+}
\end{figure}
\noindent
@@ -832,7 +839,7 @@
When we have
\[
(s_1, r_1) \rightarrow v_1 \;\, and \;\,
- (s_2, r_2) \rightarrow v_2 \;\, and \;\,\\
+ (s_2, r_2) \rightarrow v_2 \;\, and \;\,
\nexists s_3 \; s_4. s_3 \neq [] \land s_3 @ s_4 = s_2 \land
s_1@ s_3 \in L \; r_1 \land s_4 \in L \; r_2
\]
@@ -942,14 +949,25 @@
of $s$ (i.e. $s_i = c_i \ldots c_{n-1}$ ) from the previous lexical value $v_{i+1}$.
After injecting back $n$ characters, we get the lexical value for how $r_0$
matches $s$.
+\begin{figure}[H]
+\begin{center}
\begin{ceqn}
-\begin{equation}\label{graph:inj}
\begin{tikzcd}
r_0 \arrow[r, dashed] \arrow[d]& r_i \arrow[r, "\backslash c_i"] \arrow[d] & r_{i+1} \arrow[r, dashed] \arrow[d] & r_n \arrow[d, "mkeps" description] \\
v_0 \arrow[u] & v_i \arrow[l, dashed] & v_{i+1} \arrow[l,"inj_{r_i} c_i"] & v_n \arrow[l, dashed]
\end{tikzcd}
-\end{equation}
\end{ceqn}
+\end{center}
+\caption{The two-phase lexing algorithm by Sulzmann and Lu \cite{AusafDyckhoffUrban2016},
+ matching the regular expression $r_0$ and string of the form $[c_0, c_1, \ldots, c_{n-1}]$.
+ The first phase involves taking successive derivatives w.r.t the characters $c_0$,
+ $c_1$, and so on. These are the same operations as they have appeared in the matcher
+ \ref{matcher}. When the final derivative regular expression is nullable (contains the empty string),
+ then the second phase starts. First $\mkeps$ generates a POSIX value which tells us how $r_n$ matches
+ the empty string , by always selecting the leftmost
+ nullable regular expression. After that $\inj$ ``injects'' back the character in reverse order as they
+ appeared in the string, always preserving POSIXness.}\label{graph:inj}
+\end{figure}
\noindent
$\textit{inj}$ takes three arguments: a regular
expression ${r_{i}}$, before the character is chopped off,
@@ -1188,7 +1206,7 @@
even with simplification, which is not much better compared
with the naive version without any simplifications:
\begin{figure}[H]
- \centering
+\begin{center}
\begin{tikzpicture}
\begin{axis}[
xlabel={$n$},
@@ -1200,6 +1218,7 @@
\addplot[blue,mark=*, mark options={fill=white}] table {BetterWaterloo1.data};
\end{axis}
\end{tikzpicture}
+\end{center}
\caption{Size of $(a^*\cdot a^*)^*$ against $\protect\underbrace{aa\ldots a}_\text{n \textit{a}s}$}
\end{figure}\label{fig:BetterWaterloo}