--- a/ChengsongTanPhdThesis/Chapters/Inj.tex Mon Aug 15 17:26:08 2022 +0200
+++ b/ChengsongTanPhdThesis/Chapters/Inj.tex Wed Aug 17 01:09:13 2022 +0100
@@ -278,7 +278,7 @@
\end{center}
\noindent
Brzozowski noticed that this operation
-can be ``mirrored" on regular expressions which
+can be ``mirrored'' on regular expressions which
he calls the derivative of a regular expression $r$
with respect to a character $c$, written
$r \backslash c$. This infix operator
@@ -288,9 +288,17 @@
The derivative operation on regular expression
is defined such that the language of the derivative result
coincides with the language of the original
-regular expression's language being taken the language
+regular expression being taken
derivative with respect to the same character:
-\begin{center}
+\begin{property}
+
+\[
+ L \; (r \backslash c) = \Der \; c \; (L \; r)
+\]
+\end{property}
+\noindent
+Pictorially, this looks as follows:
+
\parskip \baselineskip
\def\myupbracefill#1{\rotatebox{90}{\stretchto{\{}{#1}}}
\def\rlwd{.5pt}
@@ -308,25 +316,98 @@
\;\ldots\}$}
}
\Longstack{
- $\stackrel{\backslash c}{\longrightarrow}$
+ $\stackrel{\backslash c}{\xrightarrow{\hspace*{8cm}}}$
}
\Longstack{
- \notate{$r\backslash c$}{2}{$L \; (r\backslash c)=
+ \notate{$r\backslash c$}{1}{$L \; (r\backslash c)=
\{\ldots,\;s_1,\;\ldots\}$}
}
-\end{center}
-\begin{center}
+\\
+The derivatives on regular expression can again be
+generalised to a string.
+One could compute $r\backslash s$ and test membership of $s$
+in $L \; r$ by checking
+whether the empty string is in the language of
+$r\backslash s$.
+
-\[
-L(r \backslash c) = \Der \; c \; L(r)
-\]
+\Longstack{
+ \notate{$r_{start}$}{4}{
+ \Longstack{$L \; r_{start} = \{\ldots, \;$
+ \notate{$s$}{1}{$c_1::s_1$}
+ $, \ldots\} $
+ }
+ }
+}
+\Longstack{
+ $\stackrel{\backslash c_1}{ \xrightarrow{\hspace*{1.8cm}} }$
+}
+\Longstack{
+ \notate{$r_1$}{3}{
+ $r_1 = r_{start}\backslash c_1$,
+ $L \; r_1 = $
+ \Longstack{
+ $\{ \ldots,\;$ \notate{$s_1$}{1}{$c_2::s_2$}
+ $,\; \ldots \}$
+ }
+}
+}
+\Longstack{
+ $\stackrel{\backslash c_2}{\xrightarrow{\hspace*{1.8cm}}}$
+}
+\Longstack{
+ $r_2$
+}
+\Longstack{
+ $ \xdashrightarrow{\hspace*{0.3cm} \backslash c_3 \ldots \ldots \ldots \hspace*{0.3cm}} $
+}
+\Longstack{
+ \notate{$r_{end}$}{1}{
+ $L \; r_{end} = \{\ldots, \; [], \ldots\}$}
+}
+
+
+
+\begin{property}
+ $s \in L \; r_{start} \iff [] \in L \; r_{end}$
+\end{property}
+\noindent
+Now we give the recursive definition of derivative on
+regular expressions, so that it satisfies the properties above.
+The derivative function, written $r\backslash c$,
+defines how a regular expression evolves into
+a new one after all the string it contains is acted on:
+if it starts with $c$, then the character is chopped of,
+if not, that string is removed.
+\begin{center}
+\begin{tabular}{lcl}
+ $\ZERO \backslash c$ & $\dn$ & $\ZERO$\\
+ $\ONE \backslash c$ & $\dn$ & $\ZERO$\\
+ $d \backslash c$ & $\dn$ &
+ $\mathit{if} \;c = d\;\mathit{then}\;\ONE\;\mathit{else}\;\ZERO$\\
+$(r_1 + r_2)\backslash c$ & $\dn$ & $r_1 \backslash c \,+\, r_2 \backslash c$\\
+$(r_1 \cdot r_2)\backslash c$ & $\dn$ & $\mathit{if} \, [] \in L(r_1)$\\
+ & & $\mathit{then}\;(r_1\backslash c) \cdot r_2 \,+\, r_2\backslash c$\\
+ & & $\mathit{else}\;(r_1\backslash c) \cdot r_2$\\
+ $(r^*)\backslash c$ & $\dn$ & $(r\backslash c) \cdot r^*$\\
+\end{tabular}
\end{center}
\noindent
-where we do derivatives on the regular expression
-$r$ and test membership of $s$ by checking
-whether the empty string is in the language of
-$r\backslash s$.
-For example in the sequence case we have
+The most involved cases are the sequence case
+and the star case.
+The sequence case says that if the first regular expression
+contains an empty string, then the second component of the sequence
+needs to be considered, as its derivative will contribute to the
+result of this derivative:
+\begin{center}
+ \begin{tabular}{lcl}
+ $(r_1 \cdot r_2 ) \backslash c$ & $\dn$ & $\textit{if}\;\,([] \in L(r_1))\; r_1 \backslash c \cdot r_2 + r_2 \backslash c$ \\
+ & & $\textit{else} \; (r_1 \backslash c) \cdot r_2$
+ \end{tabular}
+\end{center}
+\noindent
+Notice how this closely resembles
+the language derivative operation $\Der$:
\begin{center}
\begin{tabular}{lcl}
$\Der \; c \; (A @ B)$ & $\dn$ &
@@ -337,26 +418,19 @@
\end{tabular}
\end{center}
\noindent
-This can be translated to
-regular expressions in the following
-manner:
-\begin{center}
- \begin{tabular}{lcl}
- $(r_1 \cdot r_2 ) \backslash c$ & $\dn$ & $\textit{if}\;\,([] \in L(r_1)) r_1 \backslash c \cdot r_2 + r_2 \backslash c$ \\
- & & $\textit{else} \; (r_1 \backslash c) \cdot r_2$
- \end{tabular}
-\end{center}
-
-\noindent
-And similarly, the Kleene star's semantic derivative
-can be expressed as
+The star regular expression $r^*$'s derivative
+unwraps one iteration of $r$, turns it into $r\backslash c$,
+and attaches the original $r^*$
+after $r\backslash c$, so that
+we can further unfold it as many times as needed:
+\[
+ (r^*) \backslash c \dn (r \backslash c)\cdot r^*.
+\]
+Again,
+the structure is the same as the semantic derivative of Kleene star:
\[
\textit{Der} \;c \;(A*) \dn (\textit{Der}\; c A) @ (A*)
\]
-which translates to
-\[
- (r^*) \backslash c \dn (r \backslash c)\cdot r^*.
-\]
In the above definition of $(r_1\cdot r_2) \backslash c$,
the $\textit{if}$ clause's
boolean condition
@@ -390,54 +464,28 @@
would require both children to have the empty string
to compose an empty string, and the Kleene star
is always nullable because it naturally
-contains the empty string.
-
-The derivative function, written $r\backslash c$,
-defines how a regular expression evolves into
-a new one after all the string it contains is acted on:
-if it starts with $c$, then the character is chopped of,
-if not, that string is removed.
-\begin{center}
-\begin{tabular}{lcl}
- $\ZERO \backslash c$ & $\dn$ & $\ZERO$\\
- $\ONE \backslash c$ & $\dn$ & $\ZERO$\\
- $d \backslash c$ & $\dn$ &
- $\mathit{if} \;c = d\;\mathit{then}\;\ONE\;\mathit{else}\;\ZERO$\\
-$(r_1 + r_2)\backslash c$ & $\dn$ & $r_1 \backslash c \,+\, r_2 \backslash c$\\
-$(r_1 \cdot r_2)\backslash c$ & $\dn$ & $\mathit{if} \, [] \in L(r_1)$\\
- & & $\mathit{then}\;(r_1\backslash c) \cdot r_2 \,+\, r_2\backslash c$\\
- & & $\mathit{else}\;(r_1\backslash c) \cdot r_2$\\
- $(r^*)\backslash c$ & $\dn$ & $(r\backslash c) \cdot r^*$\\
-\end{tabular}
-\end{center}
+contains the empty string.
\noindent
-The most involved cases are the sequence case
-and the star case.
-The sequence case says that if the first regular expression
-contains an empty string, then the second component of the sequence
-needs to be considered, as its derivative will contribute to the
-result of this derivative.
-The star regular expression $r^*$'s derivative
-unwraps one iteration of $r$, turns it into $r\backslash c$,
-and attaches the original $r^*$
-after $r\backslash c$, so that
-we can further unfold it as many times as needed.
We have the following correspondence between
derivatives on regular expressions and
derivatives on a set of strings:
\begin{lemma}\label{derDer}
+ \begin{itemize}
+ \item
$\textit{Der} \; c \; L(r) = L (r\backslash c)$
+\item
+ $c\!::\!s \in L(r)$ \textit{iff} $s \in L(r\backslash c)$.
+ \end{itemize}
\end{lemma}
-
+\begin{proof}
+ By induction on $r$.
+\end{proof}
\noindent
The main property of the derivative operation
(that enables us to reason about the correctness of
derivative-based matching)
is
-\begin{lemma}\label{derStepwise}
- $c\!::\!s \in L(r)$ \textit{iff} $s \in L(r\backslash c)$.
-\end{lemma}
\noindent
We can generalise the derivative operation shown above for single characters
@@ -478,8 +526,8 @@
$\textit{match} \; s\; r = \textit{true} \; \textit{iff} \; s \in L(r)$
\end{lemma}
\begin{proof}
- By the stepwise property of derivatives (lemma \ref{derStepwise})
- and lemma \ref{derDer}.
+ By the stepwise property of derivatives
+ (lemma \ref{derDer}).
\end{proof}
\noindent
\begin{center}
@@ -689,6 +737,34 @@
\caption{POSIX Lexing Rules}
\end{figure}
\noindent
+
+\begin{figure}
+\begin{tikzpicture}[]
+ \node [minimum width = 6cm, rectangle split, rectangle split horizontal,
+ rectangle split parts=2, rectangle split part fill={red!30,blue!20}, style={draw, rounded corners, inner sep=10pt}]
+ (node1)
+ {$r_{token1}$
+ \nodepart{two} $\;\;\; \quad r_{token2}\;\;\;\quad$ };
+ %\node [left = 6.0cm of node1] (start1) {hi};
+ \node [left = 0.2cm of node1] (middle) {$v.s.$};
+ \node [minimum width = 6cm, left = 0.2cm of middle, rectangle split, rectangle split horizontal,
+ rectangle split parts=2, rectangle split part fill={red!30,blue!20}, style={draw, rounded corners, inner sep=10pt}]
+ (node2)
+ {$\quad\;\;\;r_{token1}\quad\;\;\;$
+ \nodepart{two} $r_{token2}$ };
+ \node [below = 0.1cm of node2] (text1) {\checkmark preferred by POSIX};
+ \node [above = 1.5cm of middle, minimum width = 6cm,
+ rectangle, style={draw, rounded corners, inner sep=10pt}]
+ (topNode) {$s$};
+ \path[->,draw]
+ (topNode) edge node {split $A$} (node2)
+ (topNode) edge node {split $B$} (node1)
+ ;
+
+
+\end{tikzpicture}
+\caption{Maximum munch example: $s$ matches $r_{token1} \cdot r_{token2}$}\label{munch}
+\end{figure}
The above $\POSIX$ rules follows the intuition described below:
\begin{itemize}
\item (Left Priority)\\
@@ -697,6 +773,13 @@
\item (Maximum munch)\\
Always match a subpart as much as possible before proceeding
to the next token.
+ For example, when the string $s$ matches
+ $r_{token1}\cdot r_{token2}$, and we have two ways $s$ can be split:
+ Then the split that matches a longer string for the first token
+ $r_{token1}$ is preferred by this maximum munch rule (See
+ \ref{munch} for an illustration).
+\noindent
+
\end{itemize}
\noindent
These disambiguation strategies can be
@@ -1205,10 +1288,10 @@
and one can remove ones that are absolutely not possible to
be POSIX.
In the above example,
-\[
+\begin{equation}\label{eqn:growth2}
((a^*a^* + \underbrace{a^*}_\text{A})+\underbrace{a^*}_\text{duplicate of A})\cdot(a^*a^*)^* +
\underbrace{(a^*a^* + a^*)\cdot(a^*a^*)^*}_\text{further simp removes this}.
-\]
+\end{equation}
can be further simplified by
removing the underlined term first,
which would open up possibilities