--- a/ChengsongTanPhdThesis/Chapters/Finite.tex Fri Sep 02 00:14:19 2022 +0100
+++ b/ChengsongTanPhdThesis/Chapters/Finite.tex Fri Sep 02 11:06:26 2022 +0100
@@ -13,20 +13,20 @@
size is finitely bounded
by a constant $N_a$ that only depends on $a$:
\begin{center}
-$\llbracket \bderssimp{a}{s} \rrbracket \leq N_a$
+ $\llbracket \bderssimp{a}{s} \rrbracket \leq N_a$
\end{center}
\noindent
where the size of an annotated regular expression is defined
in terms of the number of nodes in its tree structure:
\begin{center}
-\begin{tabular}{ccc}
- $\llbracket _{bs}\ONE \rrbracket$ & $\dn$ & $1$\\
- $\llbracket \ZERO \rrbracket$ & $\dn$ & $1$ \\
- $\llbracket _{bs} r_1 \cdot r_2 \rrbracket$ & $\dn$ & $\llbracket r_1 \rrbracket + \llbracket r_2 \rrbracket + 1$\\
-$\llbracket _{bs}\mathbf{c} \rrbracket $ & $\dn$ & $1$\\
-$\llbracket _{bs}\sum as \rrbracket $ & $\dn$ & $\map \; (\llbracket \_ \rrbracket)\; as + 1$\\
-$\llbracket _{bs} a^* \rrbracket $ & $\dn$ & $\llbracket a \rrbracket + 1$.
-\end{tabular}
+ \begin{tabular}{ccc}
+ $\llbracket _{bs}\ONE \rrbracket$ & $\dn$ & $1$\\
+ $\llbracket \ZERO \rrbracket$ & $\dn$ & $1$ \\
+ $\llbracket _{bs} r_1 \cdot r_2 \rrbracket$ & $\dn$ & $\llbracket r_1 \rrbracket + \llbracket r_2 \rrbracket + 1$\\
+ $\llbracket _{bs}\mathbf{c} \rrbracket $ & $\dn$ & $1$\\
+ $\llbracket _{bs}\sum as \rrbracket $ & $\dn$ & $\map \; (\llbracket \_ \rrbracket)\; as + 1$\\
+ $\llbracket _{bs} a^* \rrbracket $ & $\dn$ & $\llbracket a \rrbracket + 1$.
+ \end{tabular}
\end{center}
We believe this size formalisation
of the algorithm is important in our context, because
@@ -48,30 +48,30 @@
The regular expression is repeatedly being taken derivative of
and then simplified.
\begin{figure}[H]
-\begin{tikzpicture}[scale=2,
- every node/.style={minimum size=11mm},
- ->,>=stealth',shorten >=1pt,auto,thick
- ]
-\node (r0) [rectangle, draw=black, thick, minimum size = 5mm, draw=blue] {$a$};
-\node (r1) [rectangle, draw=black, thick, right=of r0, minimum size = 7mm]{$a_1$};
-\draw[->,line width=0.2mm](r0)--(r1) node[above,midway] {$\backslash c_1$};
+ \begin{tikzpicture}[scale=2,
+ every node/.style={minimum size=11mm},
+ ->,>=stealth',shorten >=1pt,auto,thick
+ ]
+ \node (r0) [rectangle, draw=black, thick, minimum size = 5mm, draw=blue] {$a$};
+ \node (r1) [rectangle, draw=black, thick, right=of r0, minimum size = 7mm]{$a_1$};
+ \draw[->,line width=0.2mm](r0)--(r1) node[above,midway] {$\backslash c_1$};
-\node (r1s) [rectangle, draw=blue, thick, right=of r1, minimum size=6mm]{$a_{1s}$};
-\draw[->, line width=0.2mm](r1)--(r1s) node[above, midway] {$\simp$};
+ \node (r1s) [rectangle, draw=blue, thick, right=of r1, minimum size=6mm]{$a_{1s}$};
+ \draw[->, line width=0.2mm](r1)--(r1s) node[above, midway] {$\simp$};
-\node (r2) [rectangle, draw=black, thick, right=of r1s, minimum size = 12mm]{$a_2$};
-\draw[->,line width=0.2mm](r1s)--(r2) node[above,midway] {$\backslash c_2$};
+ \node (r2) [rectangle, draw=black, thick, right=of r1s, minimum size = 12mm]{$a_2$};
+ \draw[->,line width=0.2mm](r1s)--(r2) node[above,midway] {$\backslash c_2$};
-\node (r2s) [rectangle, draw = blue, thick, right=of r2,minimum size=6mm]{$a_{2s}$};
-\draw[->,line width=0.2mm](r2)--(r2s) node[above,midway] {$\simp$};
+ \node (r2s) [rectangle, draw = blue, thick, right=of r2,minimum size=6mm]{$a_{2s}$};
+ \draw[->,line width=0.2mm](r2)--(r2s) node[above,midway] {$\simp$};
-\node (rns) [rectangle, draw = blue, thick, right=of r2s,minimum size=6mm]{$a_{ns}$};
-\draw[->,line width=0.2mm, dashed](r2s)--(rns) node[above,midway] {$\backslash \ldots$};
+ \node (rns) [rectangle, draw = blue, thick, right=of r2s,minimum size=6mm]{$a_{ns}$};
+ \draw[->,line width=0.2mm, dashed](r2s)--(rns) node[above,midway] {$\backslash \ldots$};
-\node (v) [circle, thick, draw, right=of rns, minimum size=6mm, right=1.7cm]{$v$};
-\draw[->, line width=0.2mm](rns)--(v) node[above, midway] {\bmkeps} node [below, midway] {\decode};
-\end{tikzpicture}
-\caption{Regular expression size change during our $\blexersimp$ algorithm}\label{simpShrinks}
+ \node (v) [circle, thick, draw, right=of rns, minimum size=6mm, right=1.7cm]{$v$};
+ \draw[->, line width=0.2mm](rns)--(v) node[above, midway] {\bmkeps} node [below, midway] {\decode};
+ \end{tikzpicture}
+ \caption{Regular expression size change during our $\blexersimp$ algorithm}\label{simpShrinks}
\end{figure}
\noindent
Each time
@@ -88,34 +88,34 @@
Sulzmann and Lu's assumed something similar about their algorithm,
though in fact their algorithm's size might be better depicted by the following graph:
\begin{figure}[H]
-\begin{tikzpicture}[scale=2,
- every node/.style={minimum size=11mm},
- ->,>=stealth',shorten >=1pt,auto,thick
- ]
-\node (r0) [rectangle, draw=black, thick, minimum size = 5mm, draw=blue] {$a$};
-\node (r1) [rectangle, draw=black, thick, right=of r0, minimum size = 7mm]{$a_1$};
-\draw[->,line width=0.2mm](r0)--(r1) node[above,midway] {$\backslash c_1$};
+ \begin{tikzpicture}[scale=2,
+ every node/.style={minimum size=11mm},
+ ->,>=stealth',shorten >=1pt,auto,thick
+ ]
+ \node (r0) [rectangle, draw=black, thick, minimum size = 5mm, draw=blue] {$a$};
+ \node (r1) [rectangle, draw=black, thick, right=of r0, minimum size = 7mm]{$a_1$};
+ \draw[->,line width=0.2mm](r0)--(r1) node[above,midway] {$\backslash c_1$};
-\node (r1s) [rectangle, draw=blue, thick, right=of r1, minimum size=7mm]{$a_{1s}$};
-\draw[->, line width=0.2mm](r1)--(r1s) node[above, midway] {$\simp'$};
+ \node (r1s) [rectangle, draw=blue, thick, right=of r1, minimum size=7mm]{$a_{1s}$};
+ \draw[->, line width=0.2mm](r1)--(r1s) node[above, midway] {$\simp'$};
-\node (r2) [rectangle, draw=black, thick, right=of r1s, minimum size = 17mm]{$a_2$};
-\draw[->,line width=0.2mm](r1s)--(r2) node[above,midway] {$\backslash c_2$};
+ \node (r2) [rectangle, draw=black, thick, right=of r1s, minimum size = 17mm]{$a_2$};
+ \draw[->,line width=0.2mm](r1s)--(r2) node[above,midway] {$\backslash c_2$};
-\node (r2s) [rectangle, draw = blue, thick, right=of r2,minimum size=14mm]{$a_{2s}$};
-\draw[->,line width=0.2mm](r2)--(r2s) node[above,midway] {$\simp'$};
+ \node (r2s) [rectangle, draw = blue, thick, right=of r2,minimum size=14mm]{$a_{2s}$};
+ \draw[->,line width=0.2mm](r2)--(r2s) node[above,midway] {$\simp'$};
-\node (r3) [rectangle, draw = black, thick, right= of r2s, minimum size = 22mm]{$a_3$};
-\draw[->,line width=0.2mm](r2s)--(r3) node[above,midway] {$\backslash c_3$};
+ \node (r3) [rectangle, draw = black, thick, right= of r2s, minimum size = 22mm]{$a_3$};
+ \draw[->,line width=0.2mm](r2s)--(r3) node[above,midway] {$\backslash c_3$};
-\node (rns) [right = of r3, draw=blue, minimum size = 20mm]{$a_{3s}$};
-\draw[->,line width=0.2mm] (r3)--(rns) node [above, midway] {$\simp'$};
+ \node (rns) [right = of r3, draw=blue, minimum size = 20mm]{$a_{3s}$};
+ \draw[->,line width=0.2mm] (r3)--(rns) node [above, midway] {$\simp'$};
-\node (rnn) [right = of rns, minimum size = 1mm]{};
-\draw[->, dashed] (rns)--(rnn) node [above, midway] {$\ldots$};
+ \node (rnn) [right = of rns, minimum size = 1mm]{};
+ \draw[->, dashed] (rns)--(rnn) node [above, midway] {$\ldots$};
-\end{tikzpicture}
-\caption{Regular expression size change during our $\blexersimp$ algorithm}\label{sulzShrinks}
+ \end{tikzpicture}
+ \caption{Regular expression size change during our $\blexersimp$ algorithm}\label{sulzShrinks}
\end{figure}
\noindent
That is, on certain cases their lexer
@@ -132,34 +132,35 @@
\subsection{Overview of the Proof}
Here is a bird's eye view of how the proof of finiteness works,
which involves three steps:
-\begin{center}
- \begin{tikzpicture}[scale=1,font=\bf,
- node/.style={
- rectangle,rounded corners=3mm,
- ultra thick,draw=black!50,minimum height=18mm,
- minimum width=20mm,
- top color=white,bottom color=black!20}]
+\begin{figure}[H]
+ \begin{tikzpicture}[scale=1,font=\bf,
+ node/.style={
+ rectangle,rounded corners=3mm,
+ ultra thick,draw=black!50,minimum height=18mm,
+ minimum width=20mm,
+ top color=white,bottom color=black!20}]
- \node (0) at (-5,0)
- [node, text width=1.8cm, text centered]
- {$\llbracket \bderssimp{a}{s} \rrbracket$};
- \node (A) at (0,0)
- [node,text width=1.6cm, text centered]
- {$\llbracket \rderssimp{r}{s} \rrbracket_r$};
- \node (B) at (3,0)
- [node,text width=3.0cm, anchor=west, minimum width = 40mm]
- {$\llbracket \textit{ClosedForm}(r, s)\rrbracket_r$};
- \node (C) at (9.5,0) [node, minimum width=10mm] {$N_r$};
-
- \draw [->,line width=0.5mm] (0) --
- node [above,pos=0.45] {=} (A) node [below, pos = 0.45] {$(r = a \downarrow_r)$} (A);
- \draw [->,line width=0.5mm] (A) --
- node [above,pos=0.35] {$\quad =\ldots=$} (B);
- \draw [->,line width=0.5mm] (B) --
- node [above,pos=0.35] {$\quad \leq \ldots \leq$} (C);
- \end{tikzpicture}
-\end{center}
+ \node (0) at (-5,0)
+ [node, text width=1.8cm, text centered]
+ {$\llbracket \bderssimp{a}{s} \rrbracket$};
+ \node (A) at (0,0)
+ [node,text width=1.6cm, text centered]
+ {$\llbracket \rderssimp{r}{s} \rrbracket_r$};
+ \node (B) at (3,0)
+ [node,text width=3.0cm, anchor=west, minimum width = 40mm]
+ {$\llbracket \textit{ClosedForm}(r, s)\rrbracket_r$};
+ \node (C) at (9.5,0) [node, minimum width=10mm] {$N_r$};
+
+ \draw [->,line width=0.5mm] (0) --
+ node [above,pos=0.45] {=} (A) node [below, pos = 0.45] {$(r = a \downarrow_r)$} (A);
+ \draw [->,line width=0.5mm] (A) --
+ node [above,pos=0.35] {$\quad =\ldots=$} (B);
+ \draw [->,line width=0.5mm] (B) --
+ node [above,pos=0.35] {$\quad \leq \ldots \leq$} (C);
+ \end{tikzpicture}
+ %\caption{
+\end{figure}
\noindent
We explain the steps one by one:
\begin{itemize}
@@ -168,22 +169,21 @@
derivatives, simplification, size calculation, etc.
associated with $\rrexp$s, which we have given
a very brief introduction to in chapter \ref{Bitcoded2}.
+ The operations on $\rrexp$s are identical to those on
+ annotated regular expressions except that they are unaware
+ of bitcodes. This means that all proofs about size of $\rrexp$s will apply to
+ annotated regular expressions.
\item
- We have a set of equalities for this new datatype that enables one to
- rewrite $\rderssimp{r_1 \cdot r_2}{s}$
- and $\rderssimp{r^*}{s}$ (the most involved
- inductive cases)
- by a combinatioin of derivatives of their
- children regular expressions ($r_1$, $r_2$, and $r$, respectively),
- which we call the \emph{Closed Forms}
- of the derivatives.
+ We prove that $\rderssimp{r}{s} = \textit{ClosedForm}(r, s)$,
+ where $\textit{ClosedForm}(r, s)$ is entirely
+ written in the derivatives of their children regular
+ expressions.
+ We call the right-hand-side the \emph{Closed Form}
+ of the derivative $\rderssimp{r}{s}$.
\item
- The Closed Forms of the regular expressions
- are controlled by terms that
- are easier to deal with.
- Using inductive hypothesis, these terms
- are in turn bounded loosely
- by a large yet constant number.
+ We estimate $\llbracket \textit{ClosedForm}(r, s) \rrbracket_r$.
+ The key observation is that $\distinctBy$'s output is
+ a list with a constant length bound.
\end{itemize}
We give details of these steps in the next sections.
The first step is to use
@@ -191,65 +191,67 @@
something simpler than
annotated regular expressions.
-\section{the $\textit{rrexp}$ Datatype and Its Size Functions}
-
-We first recap a bit about the new datatype
-we defined in \ref{rrexpDef},
-called $\textit{rrexp}$s.
-We chose $\rrexp$ over
-the basic regular expressions
-because it is more straightforward to tweak
-into the shape we want
-compared with an annotated regular expression.
-We want to prove the size property about annotated regular expressions.
-The size is
-written $\llbracket r\rrbracket$, whose intuitive definition is as below
-\noindent
-We first note that $\llbracket \_ \rrbracket$
-is unaware of bitcodes, since
-it only counts the number of nodes
-if we regard $r$ as a tree.
-This suggests that if we define a new datatype that is similar to plain $\rexp$s,
+\section{The $\textit{Rrexp}$ Datatype and Its Lexing-Related Functions}
+We first recap the definition of the new datatype $\rrexp$, called
+\emph{r-regular expressions},
+which we first defined in \ref{rrexpDef}.
+R-regular expressions are similar to basic regular expressions.
+We call them \emph{r}-regular expressions
+to make a distinction
+with plain regular expressions.
\[ \rrexp ::= \RZERO \mid \RONE
- \mid \RCHAR{c}
- \mid \RSEQ{r_1}{r_2}
- \mid \RALTS{rs}
- \mid \RSTAR{r}
+ \mid \RCHAR{c}
+ \mid \RSEQ{r_1}{r_2}
+ \mid \RALTS{rs}
+ \mid \RSTAR{r}
\]
-and define
+The size of an r-regular expression is
+written $\llbracket r\rrbracket_r$,
+whose definition mirrors that of an annotated regular expression.
\begin{center}
-\begin{tabular}{lcl}
-$\rerase{\ZERO}$ & $\dn$ & $\RZERO$\\
-$\rerase{_{bs}\ONE}$ & $\dn$ & $\RONE$\\
- $\rerase{_{bs}\mathbf{c}}$ & $\dn$ & $\RCHAR{c}$\\
-$\rerase{_{bs}r_1\cdot r_2}$ & $\dn$ & $\RSEQ{\rerase{r_1}}{\rerase{r_2}}$\\
-$\rerase{_{bs}\sum as}$ & $\dn$ & $\RALTS{\map \; \rerase{\_} \; as}$\\
-$\rerase{_{bs} a ^*}$ & $\dn$ & $\rerase{a}^*$
-\end{tabular}
+ \begin{tabular}{ccc}
+ $\llbracket _{bs}\ONE \rrbracket_r$ & $\dn$ & $1$\\
+ $\llbracket \ZERO \rrbracket_r$ & $\dn$ & $1$ \\
+ $\llbracket _{bs} r_1 \cdot r_2 \rrbracket_r$ & $\dn$ & $\llbracket r_1 \rrbracket_r + \llbracket r_2 \rrbracket_r + 1$\\
+ $\llbracket _{bs}\mathbf{c} \rrbracket_r $ & $\dn$ & $1$\\
+ $\llbracket _{bs}\sum as \rrbracket_r $ & $\dn$ & $\map \; (\llbracket \_ \rrbracket_r)\; as + 1$\\
+ $\llbracket _{bs} a^* \rrbracket_r $ & $\dn$ & $\llbracket a \rrbracket_r + 1$.
+ \end{tabular}
\end{center}
\noindent
-we could calculate the size of annotated regular expressions in terms of
-its un-annotated counterpart:
-\begin{lemma}
-$\rsize{\rerase a} = \asize a$
-\end{lemma}
-\begin{proof}
- By routine induction on the structure of $a$.
-\end{proof}
+The $r$ in the subscript of $\llbracket \rrbracket_r$ is to
+differentiate with the same operation for annotated regular expressions.
+Adding $r$ as subscript will be used in
+other operations as well.
+
+The transformation from an annotated regular expression
+to an r-regular expression is straightforward.
+\begin{center}
+ \begin{tabular}{lcl}
+ $\rerase{\ZERO}$ & $\dn$ & $\RZERO$\\
+ $\rerase{_{bs}\ONE}$ & $\dn$ & $\RONE$\\
+ $\rerase{_{bs}\mathbf{c}}$ & $\dn$ & $\RCHAR{c}$\\
+ $\rerase{_{bs}r_1\cdot r_2}$ & $\dn$ & $\RSEQ{\rerase{r_1}}{\rerase{r_2}}$\\
+ $\rerase{_{bs}\sum as}$ & $\dn$ & $\RALTS{\map \; \rerase{\_} \; as}$\\
+ $\rerase{_{bs} a ^*}$ & $\dn$ & $\rerase{a} ^*$
+ \end{tabular}
+\end{center}
\noindent
-We call them \emph{r}-regular expressions:
-we use
-$\rrexp$ as its type name, so as to make a distinction
-with plain regular expressions.
-In $\rrexp$ we throw away the bitcodes on the annotated regular expressions,
-but keep everything else intact.
-To transform an annotated regular expression into an r-regular expression,
-we use the function \emph{rerase}, written $\downarrow_r$.
-The $r$ in the subscript of $\downarrow$ is to
-differentiate with the $\downarrow$ for the $\erase$ operation.
+$\textit{Rerase}$ throws away the bitcodes on the annotated regular expressions,
+but keeps everything else intact.
+
Before we introduce more functions related to r-regular expressions,
we first give out the reason why we take all the trouble
defining a new datatype in the first place.
+We could calculate the size of annotated regular expressions in terms of
+their un-annotated $\rrexp$ counterpart:
+\begin{lemma}
+ $\rsize{\rerase a} = \asize a$
+\end{lemma}
+\begin{proof}
+ By routine structural induction on $a$.
+\end{proof}
+\noindent
\subsection{Motivation Behind a New Datatype}
The main difference between a plain regular expression
and an r-regular expression is that it can take
@@ -275,23 +277,23 @@
This does not hold for plain $\rexp$s.
- These operations are
- almost identical to those of the annotated regular expressions,
- except that no bitcodes are attached.
+These operations are
+almost identical to those of the annotated regular expressions,
+except that no bitcodes are attached.
Of course, the bits which encode the lexing information would grow linearly with respect
to the input, which should be taken into account when we wish to tackle the runtime comlexity.
But at the current stage
we can safely ignore them.
Similarly there is a size function for plain regular expressions:
\begin{center}
-\begin{tabular}{ccc}
- $\llbracket \ONE \rrbracket_p$ & $\dn$ & $1$\\
- $\llbracket \ZERO \rrbracket_p$ & $\dn$ & $1$ \\
- $\llbracket r_1 \cdot r_2 \rrbracket_p$ & $\dn$ & $\llbracket r_1 \rrbracket_p + \llbracket r_2 \rrbracket_p + 1$\\
-$\llbracket \mathbf{c} \rrbracket_p $ & $\dn$ & $1$\\
-$\llbracket r_1 \cdot r_2 \rrbracket_p $ & $\dn$ & $\llbracket r_1 \rrbracket_p \; + \llbracket r_2 \rrbracket_p + 1$\\
-$\llbracket a^* \rrbracket_p $ & $\dn$ & $\llbracket a \rrbracket_p + 1$
-\end{tabular}
+ \begin{tabular}{ccc}
+ $\llbracket \ONE \rrbracket_p$ & $\dn$ & $1$\\
+ $\llbracket \ZERO \rrbracket_p$ & $\dn$ & $1$ \\
+ $\llbracket r_1 \cdot r_2 \rrbracket_p$ & $\dn$ & $\llbracket r_1 \rrbracket_p + \llbracket r_2 \rrbracket_p + 1$\\
+ $\llbracket \mathbf{c} \rrbracket_p $ & $\dn$ & $1$\\
+ $\llbracket r_1 \cdot r_2 \rrbracket_p $ & $\dn$ & $\llbracket r_1 \rrbracket_p \; + \llbracket r_2 \rrbracket_p + 1$\\
+ $\llbracket a^* \rrbracket_p $ & $\dn$ & $\llbracket a \rrbracket_p + 1$
+ \end{tabular}
\end{center}
\noindent
@@ -319,15 +321,15 @@
due to the discrepancy between annotated regular expression's $\sum$ constructor
and plain regular expression's $+$ constructor having different arity.
\begin{center}
-\begin{tabular}{ccc}
-$\erase \; _{bs}\sum [] $ & $\dn$ & $\ZERO$\\
-$\erase \; _{bs}\sum [a]$ & $\dn$ & $a$\\
-$\erase \; _{bs}\sum a :: as$ & $\dn$ & $a + (\erase \; _{[]} \sum as)\quad \text{if $as$ length over 1}$
-\end{tabular}
+ \begin{tabular}{ccc}
+ $\erase \; _{bs}\sum [] $ & $\dn$ & $\ZERO$\\
+ $\erase \; _{bs}\sum [a]$ & $\dn$ & $a$\\
+ $\erase \; _{bs}\sum a :: as$ & $\dn$ & $a + (\erase \; _{[]} \sum as)\quad \text{if $as$ length over 1}$
+ \end{tabular}
\end{center}
\noindent
An alternative regular expression with an empty list of children
- is turned into a $\ZERO$ during the
+is turned into a $\ZERO$ during the
$\erase$ function, thereby changing the size and structure of the regex.
Therefore the equality in question does not hold.
@@ -339,23 +341,23 @@
except that now they can operate on alternatives taking multiple arguments.
\begin{center}
-\begin{tabular}{lcr}
- $(\RALTS{rs})\; \backslash c$ & $\dn$ & $\RALTS{\map\; (\_ \backslash c) \;rs}$\\
-(other clauses omitted)
-With the new $\rrexp$ datatype in place, one can define its size function,
-which precisely mirrors that of the annotated regular expressions:
-\end{tabular}
+ \begin{tabular}{lcr}
+ $(\RALTS{rs})\; \backslash c$ & $\dn$ & $\RALTS{\map\; (\_ \backslash c) \;rs}$\\
+ (other clauses omitted)
+ With the new $\rrexp$ datatype in place, one can define its size function,
+ which precisely mirrors that of the annotated regular expressions:
+ \end{tabular}
\end{center}
\noindent
\begin{center}
-\begin{tabular}{ccc}
- $\llbracket _{bs}\ONE \rrbracket_r$ & $\dn$ & $1$\\
- $\llbracket \ZERO \rrbracket_r$ & $\dn$ & $1$ \\
- $\llbracket _{bs} r_1 \cdot r_2 \rrbracket_r$ & $\dn$ & $\llbracket r_1 \rrbracket_r + \llbracket r_2 \rrbracket_r + 1$\\
-$\llbracket _{bs}\mathbf{c} \rrbracket_r $ & $\dn$ & $1$\\
-$\llbracket _{bs}\sum as \rrbracket_r $ & $\dn$ & $\map \; (\llbracket \_ \rrbracket_r)\; as + 1$\\
-$\llbracket _{bs} a^* \rrbracket_r $ & $\dn$ & $\llbracket a \rrbracket_r + 1$
-\end{tabular}
+ \begin{tabular}{ccc}
+ $\llbracket _{bs}\ONE \rrbracket_r$ & $\dn$ & $1$\\
+ $\llbracket \ZERO \rrbracket_r$ & $\dn$ & $1$ \\
+ $\llbracket _{bs} r_1 \cdot r_2 \rrbracket_r$ & $\dn$ & $\llbracket r_1 \rrbracket_r + \llbracket r_2 \rrbracket_r + 1$\\
+ $\llbracket _{bs}\mathbf{c} \rrbracket_r $ & $\dn$ & $1$\\
+ $\llbracket _{bs}\sum as \rrbracket_r $ & $\dn$ & $\map \; (\llbracket \_ \rrbracket_r)\; as + 1$\\
+ $\llbracket _{bs} a^* \rrbracket_r $ & $\dn$ & $\llbracket a \rrbracket_r + 1$
+ \end{tabular}
\end{center}
\noindent
@@ -364,32 +366,32 @@
except that they do not involve rectifying and augmenting bit-encoded tokenization information.
As expected, most functions are simpler, such as the derivative:
\begin{center}
- \begin{tabular}{@{}lcl@{}}
- $(\ZERO)\,\backslash_r c$ & $\dn$ & $\ZERO$\\
- $(\ONE)\,\backslash_r c$ & $\dn$ &
- $\textit{if}\;c=d\; \;\textit{then}\;
- \ONE\;\textit{else}\;\ZERO$\\
- $(\sum \;\textit{rs})\,\backslash_r c$ & $\dn$ &
- $\sum\;(\textit{map} \; (\_\backslash_r c) \; rs )$\\
- $(r_1\cdot r_2)\,\backslash_r c$ & $\dn$ &
- $\textit{if}\;\textit{rnullable}\,r_1$\\
- & &$\textit{then}\;\sum\,[(r_1\,\backslash_r c)\cdot\,r_2,$\\
- & &$\phantom{\textit{then},\;\sum\,}((r_2\,\backslash_r c))]$\\
- & &$\textit{else}\;\,(r_1\,\backslash_r c)\cdot r_2$\\
- $(r^*)\,\backslash_r c$ & $\dn$ &
- $( r\,\backslash_r c)\cdot
- (_{[]}r^*))$
-\end{tabular}
+ \begin{tabular}{@{}lcl@{}}
+ $(\ZERO)\,\backslash_r c$ & $\dn$ & $\ZERO$\\
+ $(\ONE)\,\backslash_r c$ & $\dn$ &
+ $\textit{if}\;c=d\; \;\textit{then}\;
+ \ONE\;\textit{else}\;\ZERO$\\
+ $(\sum \;\textit{rs})\,\backslash_r c$ & $\dn$ &
+ $\sum\;(\textit{map} \; (\_\backslash_r c) \; rs )$\\
+ $(r_1\cdot r_2)\,\backslash_r c$ & $\dn$ &
+ $\textit{if}\;\textit{rnullable}\,r_1$\\
+ & &$\textit{then}\;\sum\,[(r_1\,\backslash_r c)\cdot\,r_2,$\\
+ & &$\phantom{\textit{then},\;\sum\,}((r_2\,\backslash_r c))]$\\
+ & &$\textit{else}\;\,(r_1\,\backslash_r c)\cdot r_2$\\
+ $(r^*)\,\backslash_r c$ & $\dn$ &
+ $( r\,\backslash_r c)\cdot
+ (_{[]}r^*))$
+ \end{tabular}
\end{center}
\noindent
The simplification function is simplified without annotation causing superficial differences.
Duplicate removal without an equivalence relation:
\begin{center}
-\begin{tabular}{lcl}
- $\rdistinct{[]}{rset} $ & $\dn$ & $[]$\\
-$\rdistinct{r :: rs}{rset}$ & $\dn$ & $\textit{if}(r \in \textit{rset}) \; \textit{then} \; \rdistinct{rs}{rset}$\\
- & & $\textit{else}\; r::\rdistinct{rs}{(rset \cup \{r\})}$
-\end{tabular}
+ \begin{tabular}{lcl}
+ $\rdistinct{[]}{rset} $ & $\dn$ & $[]$\\
+ $\rdistinct{r :: rs}{rset}$ & $\dn$ & $\textit{if}(r \in \textit{rset}) \; \textit{then} \; \rdistinct{rs}{rset}$\\
+ & & $\textit{else}\; r::\rdistinct{rs}{(rset \cup \{r\})}$
+ \end{tabular}
\end{center}
%TODO: definition of rsimp (maybe only the alternative clause)
\noindent
@@ -418,31 +420,31 @@
We will piece together this bound and show the same bound for annotated regular
expressions in the end.
Unless stated otherwise in this chapter all $\textit{rexp}$s without
- bitcodes are seen as $\rrexp$s.
- We also use $r_1 + r_2$ and $\RALTS{[r_1, r_2]}$ interchageably
- as the former suits people's intuitive way of stating a binary alternative
- regular expression.
+bitcodes are seen as $\rrexp$s.
+We also use $r_1 + r_2$ and $\RALTS{[r_1, r_2]}$ interchageably
+as the former suits people's intuitive way of stating a binary alternative
+regular expression.
%-----------------------------------
% SECTION ?
%-----------------------------------
- \subsection{Finiteness Proof Using $\rrexp$s}
- Now that we have defined the $\rrexp$ datatype, and proven that its size changes
- w.r.t derivatives and simplifications mirrors precisely those of annotated regular expressions,
- we aim to bound the size of $r \backslash s$ for any $\rrexp$ $r$.
- Once we have a bound like:
- \[
- \llbracket r \backslash_{rsimp} s \rrbracket_r \leq N_r
- \]
- \noindent
- we could easily extend that to
- \[
- \llbracket a \backslash_{bsimps} s \rrbracket \leq N_r.
- \]
+\subsection{Finiteness Proof Using $\rrexp$s}
+Now that we have defined the $\rrexp$ datatype, and proven that its size changes
+w.r.t derivatives and simplifications mirrors precisely those of annotated regular expressions,
+we aim to bound the size of $r \backslash s$ for any $\rrexp$ $r$.
+Once we have a bound like:
+\[
+ \llbracket r \backslash_{rsimp} s \rrbracket_r \leq N_r
+\]
+\noindent
+we could easily extend that to
+\[
+ \llbracket a \backslash_{bsimps} s \rrbracket \leq N_r.
+\]
- \subsection{Roadmap to a Bound for $\textit{Rrexp}$}
+\subsection{Roadmap to a Bound for $\textit{Rrexp}$}
The way we obtain the bound for $\rrexp$s is by two steps:
\begin{itemize}
@@ -459,18 +461,18 @@
\end{itemize}
\section{Step One: Closed Forms}
- We transform the function application $\rderssimp{r}{s}$
- into an equivalent
- form $f\; (g \; (\sum rs))$.
- The functions $f$ and $g$ can be anything from $\flts$, $\distinctBy$ and other helper functions from $\bsimp{\_}$.
- This way we get a different but equivalent way of expressing : $r\backslash s = f \; (g\; (\sum rs))$, we call the
- right hand side the "closed form" of $r\backslash s$.
+We transform the function application $\rderssimp{r}{s}$
+into an equivalent
+form $f\; (g \; (\sum rs))$.
+The functions $f$ and $g$ can be anything from $\flts$, $\distinctBy$ and other helper functions from $\bsimp{\_}$.
+This way we get a different but equivalent way of expressing : $r\backslash s = f \; (g\; (\sum rs))$, we call the
+right hand side the "closed form" of $r\backslash s$.
\begin{quote}\it
Claim: For regular expressions $r_1 \cdot r_2$, we claim that
\begin{center}
$ \rderssimp{r_1 \cdot r_2}{s} =
- \rsimp{(\sum (r_1 \backslash s \cdot r_2 ) \; :: \;(\map \; \rderssimp{r2}{\_} \;(\vsuf{s}{r_1})))}$
+ \rsimp{(\sum (r_1 \backslash s \cdot r_2 ) \; :: \;(\map \; \rderssimp{r2}{\_} \;(\vsuf{s}{r_1})))}$
\end{center}
\end{quote}
\noindent
@@ -504,7 +506,7 @@
The first part is by an induction on $rs$.
The second and third part can be proven by using the
induction rules of $\rdistinct{\_}{\_}$.
-
+
\end{proof}
\noindent
@@ -537,7 +539,7 @@
and the output will not change.
\begin{itemize}
\item
- If $r \notin rs$, then $\rdistinct{rs}{acc} = \rdistinct{rs}{\{r\} \cup acc}$.
+ If $r \notin rs$, then $\rdistinct{rs}{acc} = \rdistinct{rs}{\{r\} \cup acc}$.
\item
Particularly, when $acc = \varnothing$ and $rs$ de-duplicated, we have\\
\[ \rdistinct{rs}{\varnothing} = rs \]
@@ -560,7 +562,7 @@
\noindent
In other words, it can be taken out and left untouched in the output.
\begin{proof}
-By an induction on $rs_1$, where $rsa$ and $acc$ are allowed to be arbitrary.
+ By an induction on $rs_1$, where $rsa$ and $acc$ are allowed to be arbitrary.
\end{proof}
\noindent
$\rdistinct{}{}$ removes any element in anywhere of a list, if it
@@ -576,15 +578,15 @@
$\rdistinct{ (rs @ rs') }{rset} = \rdistinct{rs @ [r] @ rs'}{rset}$\\
and\\
$\rdistinct{(ab :: rs @ [ab] @ rs'')}{rset'} =
- \rdistinct{(ab :: rs @ rs'')}{rset'}$
+ \rdistinct{(ab :: rs @ rs'')}{rset'}$
\end{itemize}
\end{lemma}
\noindent
\begin{proof}
-By induction on $rs$. All other variables are allowed to be arbitrary.
-The second half of the lemma requires the first half.
-Note that for each half's two sub-propositions need to be proven concurrently,
-so that the induction goes through.
+ By induction on $rs$. All other variables are allowed to be arbitrary.
+ The second half of the lemma requires the first half.
+ Note that for each half's two sub-propositions need to be proven concurrently,
+ so that the induction goes through.
\end{proof}
\noindent
@@ -658,14 +660,14 @@
Much like the definition of $L$ on plain regular expressions, one could also
define the language interpretation of $\rrexp$s.
\begin{center}
-\begin{tabular}{lcl}
-$RL \; (\ZERO)$ & $\dn$ & $\phi$\\
-$RL \; (\ONE)$ & $\dn$ & $\{[]\}$\\
-$RL \; (c)$ & $\dn$ & $\{[c]\}$\\
-$RL \; \sum rs$ & $\dn$ & $ \bigcup_{r \in rs} (RL \; r)$\\
-$RL \; (r_1 \cdot r_2)$ & $\dn$ & $ RL \; (r_1) @ RL \; (r_2)$\\
-$RL \; (r^*)$ & $\dn$ & $ (RL(r))^*$
-\end{tabular}
+ \begin{tabular}{lcl}
+ $RL \; (\ZERO)$ & $\dn$ & $\phi$\\
+ $RL \; (\ONE)$ & $\dn$ & $\{[]\}$\\
+ $RL \; (c)$ & $\dn$ & $\{[c]\}$\\
+ $RL \; \sum rs$ & $\dn$ & $ \bigcup_{r \in rs} (RL \; r)$\\
+ $RL \; (r_1 \cdot r_2)$ & $\dn$ & $ RL \; (r_1) @ RL \; (r_2)$\\
+ $RL \; (r^*)$ & $\dn$ & $ (RL(r))^*$
+ \end{tabular}
\end{center}
\noindent
The main use of $RL$ is to establish some connections between $\rsimp{}$
@@ -684,7 +686,7 @@
The second part is true because property
\[ RL \; r = RL \; (\rsimp{r})\] holds.
\end{proof}
-
+
\subsubsection{$\rsimp{}$ is Non-Increasing}
In this subsection, we prove that the function $\rsimp{}$ does not
make the $\llbracket \rrbracket_r$ size increase.
@@ -709,7 +711,7 @@
$\good\; \RALTS{[r]}$ & $\dn$ & $\bfalse$\\
$\good\; \RALTS{r_1 :: r_2 :: rs}$ & $\dn$ &
$\textit{isDistinct} \; (r_1 :: r_2 :: rs) \;$\\
- & & $\textit{and}\; (\forall r' \in (r_1 :: r_2 :: rs).\; \good \; r'\; \, \textit{and}\; \, \textit{nonAlt}\; r')$\\
+ & & $\textit{and}\; (\forall r' \in (r_1 :: r_2 :: rs).\; \good \; r'\; \, \textit{and}\; \, \textit{nonAlt}\; r')$\\
$\good \; \RSEQ{\RZERO}{r}$ & $\dn$ & $\bfalse$\\
$\good \; \RSEQ{\RONE}{r}$ & $\dn$ & $\bfalse$\\
$\good \; \RSEQ{r}{\RZERO}$ & $\dn$ & $\bfalse$\\
@@ -850,7 +852,7 @@
\rflts \; ( (\rdistinct{(\rflts \; (\map \; \rsimp{}\; rs))}{\varnothing}) @ (
\map \; \rsimp{rs'}))$
-
+
\end{lemma}
\begin{proof}
By \ref{good1}.
@@ -868,7 +870,7 @@
forms so that key steps allowing further rewriting to closed forms
are possible.
\begin{lemma}\label{rsimpIdem}
-$\rsimp{r} = \rsimp{\rsimp{r}}$
+ $\rsimp{r} = \rsimp{\rsimp{r}}$
\end{lemma}
\begin{proof}
@@ -918,8 +920,8 @@
Now presented are a few equivalent terms under $\rsimp{}$.
We use $r_1 \sequal r_2 $ here to denote $\rsimp{r_1} = \rsimp{r_2}$.
\begin{lemma}
-\begin{itemize}
- The following equivalence hold:
+ \begin{itemize}
+ The following equivalence hold:
\item
$\rsimpalts \; (\RZERO :: rs) \sequal \rsimpalts\; rs$
\item
@@ -966,7 +968,7 @@
$\stackrel{by \ref{rdistinctConcatGeneral}}{=}
\rsimpalts \; (\rdistinct{\rflts \; rs'}{\varnothing} @ \rdistinct{(
\rflts\; rs @ \rflts \; rs'')}{\rflts \; rs'})$\\
-
+
\end{center}
\end{comment}
%Rewriting steps not put in--too long and complicated-------------------------------
@@ -998,73 +1000,73 @@
- List of one-step rewrite rules for $\rrexp$ ($\hrewrite$):
+List of one-step rewrite rules for $\rrexp$ ($\hrewrite$):
\begin{center}
-\begin{mathpar}
- \inferrule[RSEQ0L]{}{\RZERO \cdot r_2 \hrewrite \RZERO\\}
+ \begin{mathpar}
+ \inferrule[RSEQ0L]{}{\RZERO \cdot r_2 \hrewrite \RZERO\\}
- \inferrule[RSEQ0R]{}{r_1 \cdot \RZERO \hrewrite \RZERO\\}
+ \inferrule[RSEQ0R]{}{r_1 \cdot \RZERO \hrewrite \RZERO\\}
- \inferrule[RSEQ1]{}{(\RONE \cdot r) \hrewrite r\\}\\
-
- \inferrule[RSEQL]{ r_1 \hrewrite r_2}{r_1 \cdot r_3 \hrewrite r_2 \cdot r_3\\}
+ \inferrule[RSEQ1]{}{(\RONE \cdot r) \hrewrite r\\}\\
- \inferrule[RSEQR]{ r_3 \hrewrite r_4}{r_1 \cdot r_3 \hrewrite r_1 \cdot r_4\\}\\
+ \inferrule[RSEQL]{ r_1 \hrewrite r_2}{r_1 \cdot r_3 \hrewrite r_2 \cdot r_3\\}
+
+ \inferrule[RSEQR]{ r_3 \hrewrite r_4}{r_1 \cdot r_3 \hrewrite r_1 \cdot r_4\\}\\
- \inferrule[RALTSChild]{r \hrewrite r'}{\sum (rs_1 @ [r] @ rs_2) \hrewrite \sum (rs_1 @ [r'] @ rs_2)\\}
+ \inferrule[RALTSChild]{r \hrewrite r'}{\sum (rs_1 @ [r] @ rs_2) \hrewrite \sum (rs_1 @ [r'] @ rs_2)\\}
- \inferrule[RALTS0]{}{\sum (rs_a @ [\RZERO] @ rs_b) \hrewrite \sum (rs_a @ rs_b)}
+ \inferrule[RALTS0]{}{\sum (rs_a @ [\RZERO] @ rs_b) \hrewrite \sum (rs_a @ rs_b)}
- \inferrule[RALTSNested]{}{\sum (rs_a @ [\sum rs_1] @ rs_b) \hrewrite \sum (rs_a @ rs_1 @ rs_b)}
+ \inferrule[RALTSNested]{}{\sum (rs_a @ [\sum rs_1] @ rs_b) \hrewrite \sum (rs_a @ rs_1 @ rs_b)}
- \inferrule[RALTSNil]{}{ \sum [] \hrewrite \RZERO\\}
+ \inferrule[RALTSNil]{}{ \sum [] \hrewrite \RZERO\\}
- \inferrule[RALTSSingle]{}{ \sum [r] \hrewrite r\\}
+ \inferrule[RALTSSingle]{}{ \sum [r] \hrewrite r\\}
- \inferrule[RALTSDelete]{\\ r_1 = r_2}{\sum rs_a @ [r_1] @ rs_b @ [r_2] @ rsc \hrewrite \sum rs_a @ [r_1] @ rs_b @ rs_c}
+ \inferrule[RALTSDelete]{\\ r_1 = r_2}{\sum rs_a @ [r_1] @ rs_b @ [r_2] @ rsc \hrewrite \sum rs_a @ [r_1] @ rs_b @ rs_c}
-\end{mathpar}
+ \end{mathpar}
\end{center}
- List of rewrite rules for a list of regular expressions,
- where each element can rewrite in many steps to the other (scf stands for
- li\emph{s}t \emph{c}losed \emph{f}orm). This relation is similar to the
- $\stackrel{s*}{\rightsquigarrow}$ for annotated regular expressions.
+List of rewrite rules for a list of regular expressions,
+where each element can rewrite in many steps to the other (scf stands for
+li\emph{s}t \emph{c}losed \emph{f}orm). This relation is similar to the
+$\stackrel{s*}{\rightsquigarrow}$ for annotated regular expressions.
\begin{center}
-\begin{mathpar}
- \inferrule{}{[] \scfrewrites [] }
- \inferrule{r \hrewrites r' \\ rs \scfrewrites rs'}{r :: rs \scfrewrites r' :: rs'}
-\end{mathpar}
+ \begin{mathpar}
+ \inferrule{}{[] \scfrewrites [] }
+ \inferrule{r \hrewrites r' \\ rs \scfrewrites rs'}{r :: rs \scfrewrites r' :: rs'}
+ \end{mathpar}
\end{center}
%frewrite
- List of one-step rewrite rules for flattening
- a list of regular expressions($\frewrite$):
+List of one-step rewrite rules for flattening
+a list of regular expressions($\frewrite$):
\begin{center}
-\begin{mathpar}
- \inferrule{}{\RZERO :: rs \frewrite rs \\}
+ \begin{mathpar}
+ \inferrule{}{\RZERO :: rs \frewrite rs \\}
- \inferrule{}{(\sum rs) :: rs_a \frewrite rs @ rs_a \\}
+ \inferrule{}{(\sum rs) :: rs_a \frewrite rs @ rs_a \\}
- \inferrule{rs_1 \frewrite rs_2}{r :: rs_1 \frewrite r :: rs_2}
-\end{mathpar}
+ \inferrule{rs_1 \frewrite rs_2}{r :: rs_1 \frewrite r :: rs_2}
+ \end{mathpar}
\end{center}
- Lists of one-step rewrite rules for flattening and de-duplicating
- a list of regular expressions ($\grewrite$):
+Lists of one-step rewrite rules for flattening and de-duplicating
+a list of regular expressions ($\grewrite$):
\begin{center}
-\begin{mathpar}
- \inferrule{}{\RZERO :: rs \grewrite rs \\}
+ \begin{mathpar}
+ \inferrule{}{\RZERO :: rs \grewrite rs \\}
- \inferrule{}{(\sum rs) :: rs_a \grewrite rs @ rs_a \\}
+ \inferrule{}{(\sum rs) :: rs_a \grewrite rs @ rs_a \\}
- \inferrule{rs_1 \grewrite rs_2}{r :: rs_1 \grewrite r :: rs_2}
+ \inferrule{rs_1 \grewrite rs_2}{r :: rs_1 \grewrite r :: rs_2}
- \inferrule[dB]{}{rs_a @ [a] @ rs_b @[a] @ rs_c \grewrite rs_a @ [a] @ rsb @ rsc}
-\end{mathpar}
+ \inferrule[dB]{}{rs_a @ [a] @ rs_b @[a] @ rs_c \grewrite rs_a @ [a] @ rsb @ rsc}
+ \end{mathpar}
\end{center}
\noindent
@@ -1078,8 +1080,8 @@
one of the rewriting steps would be:
\begin{lemma}
$\sum (\rDistinct \;\; (\map \; (\_ \backslash x) \; (\rflts \; rs)) \;\; \varnothing) \sequal
- \sum (\rDistinct \;\; (\rflts \; (\map \; (\_ \backslash x) \; rs)) \;\; \varnothing)
- $
+ \sum (\rDistinct \;\; (\rflts \; (\map \; (\_ \backslash x) \; rs)) \;\; \varnothing)
+ $
\end{lemma}
\noindent
Proving this is by first showing
@@ -1099,16 +1101,16 @@
For example, a rewriting step in proving
closed forms is:
\begin{center}
-$\rsimp{(\rsimpalts \; (\map \; (\_ \backslash x) \; (\rdistinct{(\rflts \; (\map \; (\rsimp{} \; \circ \; (\lambda r. \rderssimp{r}{xs}))))}{\varnothing})))}$\\
-$=$ \\
-$\rsimp{(\rsimpalts \; (\rdistinct{(\map \; (\_ \backslash x) \; (\rflts \; (\map \; (\rsimp{} \; \circ \; (\lambda r. \rderssimp{r}{xs})))) ) }{\varnothing}))} $
-\noindent
+ $\rsimp{(\rsimpalts \; (\map \; (\_ \backslash x) \; (\rdistinct{(\rflts \; (\map \; (\rsimp{} \; \circ \; (\lambda r. \rderssimp{r}{xs}))))}{\varnothing})))}$\\
+ $=$ \\
+ $\rsimp{(\rsimpalts \; (\rdistinct{(\map \; (\_ \backslash x) \; (\rflts \; (\map \; (\rsimp{} \; \circ \; (\lambda r. \rderssimp{r}{xs})))) ) }{\varnothing}))} $
+ \noindent
\end{center}
For this one would hope to have a rewriting relation between the two lists involved,
similar to \ref{earlyLaterDerFrewrites}. However, it turns out that
\begin{center}
-$\map \; (\_ \backslash x) \; (\rDistinct \; rs \; rset) \grewrites \rDistinct \; (\map \;
-(\_ \backslash x) \; rs) \; ( rset \backslash x)$
+ $\map \; (\_ \backslash x) \; (\rDistinct \; rs \; rset) \grewrites \rDistinct \; (\map \;
+ (\_ \backslash x) \; rs) \; ( rset \backslash x)$
\end{center}
\noindent
does $\mathbf{not}$ hold in general.
@@ -1200,11 +1202,11 @@
If $s \neq []$ then $\rderssimp{r}{s} = \rsimp{(\rders \; r \; s)}$.
\item
$\rsimpalts \; (\map \; (\_ \backslash_r x) \;
- (\rdistinct{rs}{\varnothing})) \sequal
- \rsimpalts \; (\rDistinct \;
- (\map \; (\_ \backslash_r x) rs) \;\varnothing )$
- \end{itemize}
- \end{lemma}
+ (\rdistinct{rs}{\varnothing})) \sequal
+ \rsimpalts \; (\rDistinct \;
+ (\map \; (\_ \backslash_r x) rs) \;\varnothing )$
+ \end{itemize}
+\end{lemma}
\noindent
Finally,
@@ -1215,12 +1217,12 @@
\noindent
this leads to the first closed form--
\begin{lemma}\label{altsClosedForm}
-\begin{center}
- $\rderssimp{(\sum rs)}{s} \sequal
- \sum \; (\map \; (\rderssimp{\_}{s}) \; rs)$
-\end{center}
+ \begin{center}
+ $\rderssimp{(\sum rs)}{s} \sequal
+ \sum \; (\map \; (\rderssimp{\_}{s}) \; rs)$
+ \end{center}
\end{lemma}
-
+
\noindent
\begin{proof}
By a reverse induction on the string $s$.
@@ -1232,7 +1234,7 @@
(\lambda r. \rderssimp{r}{xs}))))}{\varnothing}))
\sequal
\rsimpalts \; (\rdistinct{(\map \; (\_ \backslash x) \;
- (\rflts \; (\map \; (\rsimp{} \; \circ \;
+ (\rflts \; (\map \; (\rsimp{} \; \circ \;
(\lambda r. \rderssimp{r}{xs})))) ) }{\varnothing}) $.
\end{center}
This can be proven by a combination of
@@ -1261,9 +1263,9 @@
component of the sequence is always nullable):
\begin{center}
-$r_1 \cdot r_2 \quad \longrightarrow_{\backslash c} \quad r_1 \backslash c \cdot r_2 + r_2 \backslash c \quad \longrightarrow_{\backslash c'} \quad (r_1 \backslash cc' \cdot r_2 + r_2 \backslash c') + r_2 \backslash cc' \longrightarrow_{\backslash c''} \quad$\\
-$((r_1 \backslash cc'c'' \cdot r_2 + r_2 \backslash c'') + r_2 \backslash c'c'') + r_2 \backslash cc'c'' \longrightarrow_{\backslash c''} \quad
- \ldots$
+ $r_1 \cdot r_2 \quad \longrightarrow_{\backslash c} \quad r_1 \backslash c \cdot r_2 + r_2 \backslash c \quad \longrightarrow_{\backslash c'} \quad (r_1 \backslash cc' \cdot r_2 + r_2 \backslash c') + r_2 \backslash cc' \longrightarrow_{\backslash c''} \quad$\\
+ $((r_1 \backslash cc'c'' \cdot r_2 + r_2 \backslash c'') + r_2 \backslash c'c'') + r_2 \backslash cc'c'' \longrightarrow_{\backslash c''} \quad
+ \ldots$
\end{center}
Roughly speaking $r_1 \cdot r_2 \backslash s$ can be expresssed as
@@ -1276,12 +1278,12 @@
a pencil-and-paper derivation of $(r_1 \cdot r_2)\backslash s$:
\begin{center}
\begin{tabular}{c}
- $(r_1 \cdot r_2) \backslash_r (c_1 :: c_2 :: \ldots c_n) \myequiv$\\
- \rule{0pt}{3ex} $((r_1 \backslash_r c_1) \cdot r_2 + (\delta\; (\rnullable \; r_1) \; r_2 \backslash_r c_1)) \backslash_r (c_2 :: \ldots c_n)
- \myequiv$\\
- \rule{0pt}{3ex} $((r_1 \backslash_r c_1c_2 \cdot r_2 + (\delta \; (\rnullable \; r_1) \; r_2 \backslash_r c_1c_2))
- + (\delta \ (\rnullable \; r_1 \backslash_r c)\; r_2 \backslash_r c_2)) \backslash_r (c_3 \ldots c_n)
- $
+ $(r_1 \cdot r_2) \backslash_r (c_1 :: c_2 :: \ldots c_n) \myequiv$\\
+ \rule{0pt}{3ex} $((r_1 \backslash_r c_1) \cdot r_2 + (\delta\; (\rnullable \; r_1) \; r_2 \backslash_r c_1)) \backslash_r (c_2 :: \ldots c_n)
+ \myequiv$\\
+ \rule{0pt}{3ex} $((r_1 \backslash_r c_1c_2 \cdot r_2 + (\delta \; (\rnullable \; r_1) \; r_2 \backslash_r c_1c_2))
+ + (\delta \ (\rnullable \; r_1 \backslash_r c)\; r_2 \backslash_r c_2)) \backslash_r (c_3 \ldots c_n)
+ $
\end{tabular}
\end{center}
\noindent
@@ -1319,11 +1321,11 @@
The function $\vsuf{\_}{\_}$ is defined recursively on the structure of the string:
\begin{center}
-\begin{tabular}{lcl}
-$\vsuf{[]}{\_} $ & $=$ & $[]$\\
-$\vsuf{c::cs}{r_1}$ & $ =$ & $ \textit{if} (\rnullable{r_1}) \textit{then} \; (\vsuf{cs}{(\rder{c}{r_1})}) @ [c :: cs]$\\
- && $\textit{else} \; (\vsuf{cs}{(\rder{c}{r_1}) }) $
-\end{tabular}
+ \begin{tabular}{lcl}
+ $\vsuf{[]}{\_} $ & $=$ & $[]$\\
+ $\vsuf{c::cs}{r_1}$ & $ =$ & $ \textit{if} (\rnullable{r_1}) \textit{then} \; (\vsuf{cs}{(\rder{c}{r_1})}) @ [c :: cs]$\\
+ && $\textit{else} \; (\vsuf{cs}{(\rder{c}{r_1}) }) $
+ \end{tabular}
\end{center}
\noindent
The list is sorted in the order $r_2\backslash s''$
@@ -1340,25 +1342,25 @@
a left-associative nested sequence of alternatives into
a flattened list:
\[
- \sum [r_1, r_2, r_3, \ldots] \stackrel{\sflat{\_}}{\rightarrow}
-(\ldots ((r_1 + r_2) + r_3) + \ldots)
+ \sum [r_1, r_2, r_3, \ldots] \stackrel{\sflat{\_}}{\rightarrow}
+ (\ldots ((r_1 + r_2) + r_3) + \ldots)
\]
\noindent
The definitions $\sflat{\_}$, $\sflataux{\_}$ are given below.
- \begin{center}
- \begin{tabular}{ccc}
- $\sflataux{\AALTS{ }{r :: rs}}$ & $=$ & $\sflataux{r} @ rs$\\
-$\sflataux{\AALTS{ }{[]}}$ & $ = $ & $ []$\\
-$\sflataux r$ & $=$ & $ [r]$
-\end{tabular}
+\begin{center}
+ \begin{tabular}{ccc}
+ $\sflataux{\AALTS{ }{r :: rs}}$ & $=$ & $\sflataux{r} @ rs$\\
+ $\sflataux{\AALTS{ }{[]}}$ & $ = $ & $ []$\\
+ $\sflataux r$ & $=$ & $ [r]$
+ \end{tabular}
\end{center}
- \begin{center}
- \begin{tabular}{ccc}
- $\sflat{(\sum r :: rs)}$ & $=$ & $\sum (\sflataux{r} @ rs)$\\
- $\sflat{\sum []}$ & $ = $ & $ \sum []$\\
-$\sflat r$ & $=$ & $ r$
-\end{tabular}
+\begin{center}
+ \begin{tabular}{ccc}
+ $\sflat{(\sum r :: rs)}$ & $=$ & $\sum (\sflataux{r} @ rs)$\\
+ $\sflat{\sum []}$ & $ = $ & $ \sum []$\\
+ $\sflat r$ & $=$ & $ r$
+ \end{tabular}
\end{center}
\noindent
$\sflataux{\_}$ breaks up nested alternative regular expressions
@@ -1386,7 +1388,7 @@
\map \; (r_2 \backslash_r \_) \; (\vsuf{[x]}{(r_1 \backslash_r xs)}) \;\; @ \;\;
\map \; (\_ \backslash_r x) \; (\map \; (r_2 \backslash \_) \; (\vsuf{xs}{r_1}))
\]
- =
+ =
\[
\map \; (r_2 \backslash_r \_) \; (\vsuf{xs @ [x]}{r_1})
\]
@@ -1403,7 +1405,7 @@
nested do not matter under $\rsimp{}$:
\begin{center}
$\rsimp{r} \stackrel{?}{\sequal} \rsimp{r'}$ if $r = \sum [r_1, r_2, r_3, \ldots]$
-and $r' =(\ldots ((r_1 + r_2) + r_3) + \ldots)$
+ and $r' =(\ldots ((r_1 + r_2) + r_3) + \ldots)$
\end{center}
Simply wrap with $\sum$ constructor and add
simplifications to both sides of \ref{seqSfau0}
@@ -1411,7 +1413,7 @@
\begin{corollary}\label{seqClosedFormGeneral}
$\rsimp{\sflat{(r_1 \cdot r_2) \backslash s} }
=\rsimp{(\sum ( (r_1 \backslash s) \cdot r_2 ::
- \map\; (r_2 \backslash \_) \; (\vsuf{s}{r_1})))}$
+ \map\; (r_2 \backslash \_) \; (\vsuf{s}{r_1})))}$
\end{corollary}
Together with the idempotency property of $\rsimp{}$ (lemma \ref{rsimpIdem}),
it is possible to convert the above lemma to obtain a "closed form"
@@ -1432,9 +1434,9 @@
\begin{corollary}\label{seqEstimate1}
\begin{center}
- $\llbracket \rderssimp{(r_1 \cdot r_2)}{s} \rrbracket_r = \llbracket \rsimp{(\sum ((r_1 \backslash s) \cdot r_2 )
- :: (\map \; (r_2 \backslash \_) (\vsuf{s}{r_1})))} \rrbracket_r$
-
+ $\llbracket \rderssimp{(r_1 \cdot r_2)}{s} \rrbracket_r = \llbracket \rsimp{(\sum ((r_1 \backslash s) \cdot r_2 )
+ :: (\map \; (r_2 \backslash \_) (\vsuf{s}{r_1})))} \rrbracket_r$
+
\end{center}
\end{corollary}
\noindent
@@ -1446,10 +1448,10 @@
Again, we first look at how a star's derivatives evolve, if they grow maximally:
\begin{center}
-$r^* \quad \longrightarrow_{\backslash c} \quad (r\backslash c) \cdot r^* \quad \longrightarrow_{\backslash c'} \quad
-r \backslash cc' \cdot r^* + r \backslash c' \cdot r^* \quad \longrightarrow_{\backslash c''} \quad
-(r_1 \backslash cc'c'' \cdot r^* + r \backslash c'') + (r \backslash c'c'' \cdot r^* + r \backslash c'' \cdot r^*) \quad \longrightarrow_{\backslash c'''}
-\quad \ldots$
+ $r^* \quad \longrightarrow_{\backslash c} \quad (r\backslash c) \cdot r^* \quad \longrightarrow_{\backslash c'} \quad
+ r \backslash cc' \cdot r^* + r \backslash c' \cdot r^* \quad \longrightarrow_{\backslash c''} \quad
+ (r_1 \backslash cc'c'' \cdot r^* + r \backslash c'') + (r \backslash c'c'' \cdot r^* + r \backslash c'' \cdot r^*) \quad \longrightarrow_{\backslash c'''}
+ \quad \ldots$
\end{center}
When we have a string $s = c :: c' :: c'' \ldots$ such that $r \backslash c$, $r \backslash cc'$, $r \backslash c'$,
@@ -1532,60 +1534,60 @@
elements.
We give a predicate for such "star-created" regular expressions:
\begin{center}
-\begin{tabular}{lcr}
- & & $\createdByStar{(\RSEQ{ra}{\RSTAR{rb}}) }$\\
- $\createdByStar{r_1} \land \createdByStar{r_2} $ & $ \Longrightarrow$ & $\createdByStar{(r_1 + r_2)}$
- \end{tabular}
- \end{center}
-
- These definitions allows us the flexibility to talk about
- regular expressions in their most convenient format,
- for example, flattened out $\RALTS{[r_1, r_2, \ldots, r_n]} $
- instead of binary-nested: $((r_1 + r_2) + (r_3 + r_4)) + \ldots$.
- These definitions help express that certain classes of syntatically
- distinct regular expressions are actually the same under simplification.
- This is not entirely true for annotated regular expressions:
- %TODO: bsimp bders \neq bderssimp
- \begin{center}
- $(1+ (c\cdot \ASEQ{bs}{c^*}{c} ))$
- \end{center}
- For bit-codes, the order in which simplification is applied
- might cause a difference in the location they are placed.
- If we want something like
- \begin{center}
- $\bderssimp{r}{s} \myequiv \bsimp{\bders{r}{s}}$
- \end{center}
- Some "canonicalization" procedure is required,
- which either pushes all the common bitcodes to nodes
- as senior as possible:
- \begin{center}
- $_{bs}(_{bs_1 @ bs'}r_1 + _{bs_1 @ bs''}r_2) \rightarrow _{bs @ bs_1}(_{bs'}r_1 + _{bs''}r_2) $
- \end{center}
- or does the reverse. However bitcodes are not of interest if we are talking about
- the $\llbracket r \rrbracket$ size of a regex.
- Therefore for the ease and simplicity of producing a
- proof for a size bound, we are happy to restrict ourselves to
- unannotated regular expressions, and obtain such equalities as
- \begin{lemma}
- $\rsimp{r_1 + r_2} = \rsimp{\RALTS{\hflataux{r_1} @ \hflataux{r_2}}}$
- \end{lemma}
-
- \begin{proof}
- By using the rewriting relation $\rightsquigarrow$
- \end{proof}
- %TODO: rsimp sflat
+ \begin{tabular}{lcr}
+ & & $\createdByStar{(\RSEQ{ra}{\RSTAR{rb}}) }$\\
+ $\createdByStar{r_1} \land \createdByStar{r_2} $ & $ \Longrightarrow$ & $\createdByStar{(r_1 + r_2)}$
+ \end{tabular}
+\end{center}
+
+These definitions allows us the flexibility to talk about
+regular expressions in their most convenient format,
+for example, flattened out $\RALTS{[r_1, r_2, \ldots, r_n]} $
+instead of binary-nested: $((r_1 + r_2) + (r_3 + r_4)) + \ldots$.
+These definitions help express that certain classes of syntatically
+distinct regular expressions are actually the same under simplification.
+This is not entirely true for annotated regular expressions:
+%TODO: bsimp bders \neq bderssimp
+\begin{center}
+ $(1+ (c\cdot \ASEQ{bs}{c^*}{c} ))$
+\end{center}
+For bit-codes, the order in which simplification is applied
+might cause a difference in the location they are placed.
+If we want something like
+\begin{center}
+ $\bderssimp{r}{s} \myequiv \bsimp{\bders{r}{s}}$
+\end{center}
+Some "canonicalization" procedure is required,
+which either pushes all the common bitcodes to nodes
+as senior as possible:
+\begin{center}
+ $_{bs}(_{bs_1 @ bs'}r_1 + _{bs_1 @ bs''}r_2) \rightarrow _{bs @ bs_1}(_{bs'}r_1 + _{bs''}r_2) $
+\end{center}
+or does the reverse. However bitcodes are not of interest if we are talking about
+the $\llbracket r \rrbracket$ size of a regex.
+Therefore for the ease and simplicity of producing a
+proof for a size bound, we are happy to restrict ourselves to
+unannotated regular expressions, and obtain such equalities as
+\begin{lemma}
+ $\rsimp{r_1 + r_2} = \rsimp{\RALTS{\hflataux{r_1} @ \hflataux{r_2}}}$
+\end{lemma}
+
+\begin{proof}
+ By using the rewriting relation $\rightsquigarrow$
+\end{proof}
+%TODO: rsimp sflat
And from this we obtain a proof that a star's derivative will be the same
as if it had all its nested alternatives created during deriving being flattened out:
- For example,
- \begin{lemma}
- $\createdByStar{r} \implies \rsimp{r} = \rsimp{\RALTS{\hflataux{r}}}$
- \end{lemma}
- \begin{proof}
- By structural induction on $r$, where the induction rules are these of $\createdByStar{_}$.
- \end{proof}
+For example,
+\begin{lemma}
+ $\createdByStar{r} \implies \rsimp{r} = \rsimp{\RALTS{\hflataux{r}}}$
+\end{lemma}
+\begin{proof}
+ By structural induction on $r$, where the induction rules are these of $\createdByStar{_}$.
+\end{proof}
% The simplification of a flattened out regular expression, provided it comes
%from the derivative of a star, is the same as the one nested.
-
+
We first introduce an inductive property
@@ -1629,14 +1631,14 @@
\noindent
This yields
\begin{lemma}\label{hfauRsimpeq2}
-$\rsimp{r} = \rsimp{(\sum \hflataux{r})}$
+ $\rsimp{r} = \rsimp{(\sum \hflataux{r})}$
\end{lemma}
\noindent
Together with the rewriting relation
\begin{lemma}\label{starClosedForm6Hrewrites}
$\map \; (\lambda s. (\rsimp{r \backslash_r s}) \cdot (r^*)) \; Ss
\scfrewrites
- \map \; (\lambda s. (\rsimp{r \backslash_r s}) \cdot (r^*)) \; Ss$
+ \map \; (\lambda s. (\rsimp{r \backslash_r s}) \cdot (r^*)) \; Ss$
\end{lemma}
\noindent
We obtain the closed form for star regular expression:
@@ -1644,10 +1646,10 @@
$\rderssimp{r^*}{c::s} =
\rsimp{
(\sum (\map \; (\lambda s. (\rderssimp{r}{s})\cdot r^*) \;
- (\starupdates \; s\; r \; [[c]])
- )
+ (\starupdates \; s\; r \; [[c]])
)
- }
+ )
+ }
$
\end{lemma}
\begin{proof}
@@ -1659,21 +1661,21 @@
We now summarize the closed forms below:
\begin{itemize}
\item
- $\rderssimp{(\sum rs)}{s} \sequal
- \sum \; (\map \; (\rderssimp{\_}{s}) \; rs)$
+ $\rderssimp{(\sum rs)}{s} \sequal
+ \sum \; (\map \; (\rderssimp{\_}{s}) \; rs)$
\item
- $\rderssimp{(r_1 \cdot r_2)}{s} \sequal \sum ((r_1 \backslash s) \cdot r_2 )
- :: (\map \; (r_2 \backslash \_) (\vsuf{s}{r_1}))$
+ $\rderssimp{(r_1 \cdot r_2)}{s} \sequal \sum ((r_1 \backslash s) \cdot r_2 )
+ :: (\map \; (r_2 \backslash \_) (\vsuf{s}{r_1}))$
\item
- $\rderssimp{r^*}{c::s} =
- \rsimp{
- (\sum (\map \; (\lambda s. (\rderssimp{r}{s})\cdot r^*) \;
+ $\rderssimp{r^*}{c::s} =
+ \rsimp{
+ (\sum (\map \; (\lambda s. (\rderssimp{r}{s})\cdot r^*) \;
(\starupdates \; s\; r \; [[c]])
- )
- )
- }
- $
+ )
+ )
+ }
+ $
\end{itemize}
\noindent
The closed forms on the left-hand-side
@@ -1710,7 +1712,7 @@
using union of sets like
$\{r_1 \cdot r_2 \mid r_1 \in A
\text{and}
- r_2 \in A\}
+ r_2 \in A\}
$ where $A = \sizeNregex \; N$.
\end{proof}
\noindent
@@ -1758,27 +1760,27 @@
Now we are ready to control the sizes of
$r_1 \cdot r_2 \backslash s$, $r^* \backslash s$.
\begin{theorem}
-For any regex $r$, $\exists N_r. \forall s. \; \rsize{\rderssimp{r}{s}} \leq N_r$
+ For any regex $r$, $\exists N_r. \forall s. \; \rsize{\rderssimp{r}{s}} \leq N_r$
\end{theorem}
\noindent
\begin{proof}
-We prove this by induction on $r$. The base cases for $\RZERO$,
-$\RONE $ and $\RCHAR{c}$ are straightforward.
-In the sequence $r_1 \cdot r_2$ case,
-the inductive hypotheses state
-$\exists N_1. \forall s. \; \llbracket \rderssimp{r}{s} \rrbracket \leq N_1$ and
-$\exists N_2. \forall s. \; \llbracket \rderssimp{r_2}{s} \rrbracket \leq N_2$.
+ We prove this by induction on $r$. The base cases for $\RZERO$,
+ $\RONE $ and $\RCHAR{c}$ are straightforward.
+ In the sequence $r_1 \cdot r_2$ case,
+ the inductive hypotheses state
+ $\exists N_1. \forall s. \; \llbracket \rderssimp{r}{s} \rrbracket \leq N_1$ and
+ $\exists N_2. \forall s. \; \llbracket \rderssimp{r_2}{s} \rrbracket \leq N_2$.
-When the string $s$ is not empty, we can reason as follows
-%
-\begin{center}
-\begin{tabular}{lcll}
+ When the string $s$ is not empty, we can reason as follows
+ %
+ \begin{center}
+ \begin{tabular}{lcll}
& & $ \llbracket \rderssimp{r_1\cdot r_2 }{s} \rrbracket_r $\\
& $ = $ & $\llbracket \rsimp{(\sum(r_1 \backslash_{rsimp} s \cdot r_2 \; \; :: \; \;
-\map \; (r_2\backslash_{rsimp} \_)\; (\vsuf{s}{r})))} \rrbracket_r $ & (1) \\
-& $\leq$ & $\llbracket \rdistinct{(r_1 \backslash_{rsimp} s \cdot r_2 \; \; :: \; \;
-\map \; (r_2\backslash_{rsimp} \_)\; (\vsuf{s}{r}))}{\varnothing} \rrbracket_r + 1$ & (2) \\
- & $\leq$ & $2 + N_1 + \rsize{r_2} + (N_2 * (card\;(\sizeNregex \; N_2)))$ & (3)\\
+ \map \; (r_2\backslash_{rsimp} \_)\; (\vsuf{s}{r})))} \rrbracket_r $ & (1) \\
+ & $\leq$ & $\llbracket \rdistinct{(r_1 \backslash_{rsimp} s \cdot r_2 \; \; :: \; \;
+ \map \; (r_2\backslash_{rsimp} \_)\; (\vsuf{s}{r}))}{\varnothing} \rrbracket_r + 1$ & (2) \\
+ & $\leq$ & $2 + N_1 + \rsize{r_2} + (N_2 * (card\;(\sizeNregex \; N_2)))$ & (3)\\
\end{tabular}
\end{center}
\noindent
@@ -1803,16 +1805,16 @@
Let $n_r = \llbracket r^* \rrbracket_r$.
When $s = c :: cs$ is not empty,
\begin{center}
-\begin{tabular}{lcll}
+ \begin{tabular}{lcll}
& & $ \llbracket \rderssimp{r^* }{c::cs} \rrbracket_r $\\
& $ = $ & $\llbracket \rsimp{(\sum (\map \; (\lambda s. (r \backslash_{rsimp} s) \cdot r^*) \; (\starupdates\;
-cs \; r \; [[c]] )) )} \rrbracket_r $ & (5) \\
-& $\leq$ & $\llbracket
-\rdistinct{
- (\map \;
- (\lambda s. (r \backslash_{rsimp} s) \cdot r^*) \;
- (\starupdates\; cs \; r \; [[c]] )
- )}
+ cs \; r \; [[c]] )) )} \rrbracket_r $ & (5) \\
+ & $\leq$ & $\llbracket
+ \rdistinct{
+ (\map \;
+ (\lambda s. (r \backslash_{rsimp} s) \cdot r^*) \;
+ (\starupdates\; cs \; r \; [[c]] )
+ )}
{\varnothing} \rrbracket_r + 1$ & (6) \\
& $\leq$ & $1 + (\textit{card} (\sizeNregex \; (N + n_r)))
* (1 + (N + n_r)) $ & (7)\\
@@ -1824,10 +1826,10 @@
(7) is by \ref{finiteSizeNCorollary}.
Combining with the case when $s = []$, one gets
\begin{center}
-\begin{tabular}{lcll}
- $\rsize{r^* \backslash_r s}$ & $\leq$ & $max \; n_r \; 1 + (\textit{card} (\sizeNregex \; (N + n_r)))
- * (1 + (N + n_r)) $ & (8)\\
-\end{tabular}
+ \begin{tabular}{lcll}
+ $\rsize{r^* \backslash_r s}$ & $\leq$ & $max \; n_r \; 1 + (\textit{card} (\sizeNregex \; (N + n_r)))
+ * (1 + (N + n_r)) $ & (8)\\
+ \end{tabular}
\end{center}
\noindent
@@ -1836,29 +1838,29 @@
is equivalent to $\exists N. \forall r \in (\map \; (\_ \backslash_r s) \; rs). \rsize{r} \leq N$.
In the case when $s = c::cs$, we have
\begin{center}
-\begin{tabular}{lcll}
+ \begin{tabular}{lcll}
& & $ \llbracket \rderssimp{\sum rs }{c::cs} \rrbracket_r $\\
& $ = $ & $\llbracket \rsimp{(\sum (\map \; (\_ \backslash_{rsimp} s) \; rs) )} \rrbracket_r $ & (9) \\
& $\leq$ & $\llbracket (\sum (\map \; (\_ \backslash_{rsimp} s) \; rs) ) \rrbracket_r $ & (10) \\
& $\leq$ & $1 + N * (length \; rs) $ & (11)\\
-\end{tabular}
+ \end{tabular}
\end{center}
\noindent
(9) is by \ref{altsClosedForm}, (10) by \ref{rsimpSize} and (11) by inductive hypothesis.
Combining with the case when $s = []$, one gets
\begin{center}
-\begin{tabular}{lcll}
- $\rsize{\sum rs \backslash_r s}$ & $\leq$ & $max \; \rsize{\sum rs} \; 1+N*(length \; rs)$
- & (12)\\
-\end{tabular}
+ \begin{tabular}{lcll}
+ $\rsize{\sum rs \backslash_r s}$ & $\leq$ & $max \; \rsize{\sum rs} \; 1+N*(length \; rs)$
+ & (12)\\
+ \end{tabular}
\end{center}
(4), (8), and (12) are all the inductive cases proven.
\end{proof}
\begin{corollary}
-For any regex $a$, $\exists N_r. \forall s. \; \rsize{\bderssimp{a}{s}} \leq N_r$
+ For any regex $a$, $\exists N_r. \forall s. \; \rsize{\bderssimp{a}{s}} \leq N_r$
\end{corollary}
\begin{proof}
By \ref{sizeRelations}.
@@ -1894,31 +1896,31 @@
Whatever the regex is, it will not grow indefinitely.
Take our previous example $(a + aa)^*$ as an example:
\begin{center}
-\begin{tabular}{@{}c@{\hspace{0mm}}c@{\hspace{0mm}}c@{}}
-\begin{tikzpicture}
-\begin{axis}[
- xlabel={number of $a$'s},
- x label style={at={(1.05,-0.05)}},
- ylabel={regex size},
- enlargelimits=false,
- xtick={0,5,...,30},
- xmax=33,
- ymax= 40,
- ytick={0,10,...,40},
- scaled ticks=false,
- axis lines=left,
- width=5cm,
- height=4cm,
- legend entries={$(a + aa)^*$},
- legend pos=north west,
- legend cell align=left]
-\addplot[red,mark=*, mark options={fill=white}] table {a_aa_star.data};
-\end{axis}
-\end{tikzpicture}
-\end{tabular}
+ \begin{tabular}{@{}c@{\hspace{0mm}}c@{\hspace{0mm}}c@{}}
+ \begin{tikzpicture}
+ \begin{axis}[
+ xlabel={number of $a$'s},
+ x label style={at={(1.05,-0.05)}},
+ ylabel={regex size},
+ enlargelimits=false,
+ xtick={0,5,...,30},
+ xmax=33,
+ ymax= 40,
+ ytick={0,10,...,40},
+ scaled ticks=false,
+ axis lines=left,
+ width=5cm,
+ height=4cm,
+ legend entries={$(a + aa)^*$},
+ legend pos=north west,
+ legend cell align=left]
+ \addplot[red,mark=*, mark options={fill=white}] table {a_aa_star.data};
+ \end{axis}
+ \end{tikzpicture}
+ \end{tabular}
\end{center}
We are able to limit the size of the regex $(a + aa)^*$'s derivatives
- with our simplification
+with our simplification
rules very effectively.
@@ -1941,71 +1943,71 @@
%a*, aa*, aaa*, .....
%randomly generated regular expressions
\begin{center}
-\begin{tabular}{@{}c@{\hspace{0mm}}c@{\hspace{0mm}}c@{}}
-\begin{tikzpicture}
-\begin{axis}[
- xlabel={number of $a$'s},
- x label style={at={(1.05,-0.05)}},
- ylabel={regex size},
- enlargelimits=false,
- xtick={0,5,...,30},
- xmax=33,
- ymax=1000,
- ytick={0,100,...,1000},
- scaled ticks=false,
- axis lines=left,
- width=5cm,
- height=4cm,
- legend entries={regex1},
- legend pos=north west,
- legend cell align=left]
-\addplot[red,mark=*, mark options={fill=white}] table {regex1_size_change.data};
-\end{axis}
-\end{tikzpicture}
+ \begin{tabular}{@{}c@{\hspace{0mm}}c@{\hspace{0mm}}c@{}}
+ \begin{tikzpicture}
+ \begin{axis}[
+ xlabel={number of $a$'s},
+ x label style={at={(1.05,-0.05)}},
+ ylabel={regex size},
+ enlargelimits=false,
+ xtick={0,5,...,30},
+ xmax=33,
+ ymax=1000,
+ ytick={0,100,...,1000},
+ scaled ticks=false,
+ axis lines=left,
+ width=5cm,
+ height=4cm,
+ legend entries={regex1},
+ legend pos=north west,
+ legend cell align=left]
+ \addplot[red,mark=*, mark options={fill=white}] table {regex1_size_change.data};
+ \end{axis}
+ \end{tikzpicture}
&
-\begin{tikzpicture}
-\begin{axis}[
- xlabel={$n$},
- x label style={at={(1.05,-0.05)}},
- %ylabel={time in secs},
- enlargelimits=false,
- xtick={0,5,...,30},
- xmax=33,
- ymax=1000,
- ytick={0,100,...,1000},
- scaled ticks=false,
- axis lines=left,
- width=5cm,
- height=4cm,
- legend entries={regex2},
- legend pos=north west,
- legend cell align=left]
-\addplot[blue,mark=*, mark options={fill=white}] table {regex2_size_change.data};
-\end{axis}
-\end{tikzpicture}
+ \begin{tikzpicture}
+ \begin{axis}[
+ xlabel={$n$},
+ x label style={at={(1.05,-0.05)}},
+ %ylabel={time in secs},
+ enlargelimits=false,
+ xtick={0,5,...,30},
+ xmax=33,
+ ymax=1000,
+ ytick={0,100,...,1000},
+ scaled ticks=false,
+ axis lines=left,
+ width=5cm,
+ height=4cm,
+ legend entries={regex2},
+ legend pos=north west,
+ legend cell align=left]
+ \addplot[blue,mark=*, mark options={fill=white}] table {regex2_size_change.data};
+ \end{axis}
+ \end{tikzpicture}
&
-\begin{tikzpicture}
-\begin{axis}[
- xlabel={$n$},
- x label style={at={(1.05,-0.05)}},
- %ylabel={time in secs},
- enlargelimits=false,
- xtick={0,5,...,30},
- xmax=33,
- ymax=1000,
- ytick={0,100,...,1000},
- scaled ticks=false,
- axis lines=left,
- width=5cm,
- height=4cm,
- legend entries={regex3},
- legend pos=north west,
- legend cell align=left]
-\addplot[cyan,mark=*, mark options={fill=white}] table {regex3_size_change.data};
-\end{axis}
-\end{tikzpicture}\\
-\multicolumn{3}{c}{Graphs: size change of 3 randomly generated regular expressions $w.r.t.$ input string length.}
-\end{tabular}
+ \begin{tikzpicture}
+ \begin{axis}[
+ xlabel={$n$},
+ x label style={at={(1.05,-0.05)}},
+ %ylabel={time in secs},
+ enlargelimits=false,
+ xtick={0,5,...,30},
+ xmax=33,
+ ymax=1000,
+ ytick={0,100,...,1000},
+ scaled ticks=false,
+ axis lines=left,
+ width=5cm,
+ height=4cm,
+ legend entries={regex3},
+ legend pos=north west,
+ legend cell align=left]
+ \addplot[cyan,mark=*, mark options={fill=white}] table {regex3_size_change.data};
+ \end{axis}
+ \end{tikzpicture}\\
+ \multicolumn{3}{c}{Graphs: size change of 3 randomly generated regular expressions $w.r.t.$ input string length.}
+ \end{tabular}
\end{center}
\noindent
Most of the regex's sizes seem to stay within a polynomial bound $w.r.t$ the
@@ -2026,8 +2028,8 @@
by $\simp$.
For example,
\begin{center}
-$\simp \;(\simpALTs\; (\map \;(\_\backslash \; x)\; (\distinct \; \mathit{rs}\; \phi))) =
- \simp \;(\simpALTs \;(\distinct \;(\map \;(\_ \backslash\; x) \; \mathit{rs}) \; \phi))$
+ $\simp \;(\simpALTs\; (\map \;(\_\backslash \; x)\; (\distinct \; \mathit{rs}\; \phi))) =
+ \simp \;(\simpALTs \;(\distinct \;(\map \;(\_ \backslash\; x) \; \mathit{rs}) \; \phi))$
\end{center}
@@ -2053,8 +2055,8 @@
\subsection{function $\backslash c$ is not injective (1-to-1)}
\begin{center}
-The derivative $w.r.t$ character $c$ is not one-to-one.
-Formally,
+ The derivative $w.r.t$ character $c$ is not one-to-one.
+ Formally,
$\exists r_1 \;r_2. r_1 \neq r_2 \mathit{and} r_1 \backslash c = r_2 \backslash c$
\end{center}
This property is trivially true for the
@@ -2084,13 +2086,13 @@
There are two problems with this finiteness result, though.
\begin{itemize}
\item
-First, It is not yet a direct formalisation of our lexer's complexity,
-as a complexity proof would require looking into
-the time it takes to execute {\bf all} the operations
-involved in the lexer (simp, collect, decode), not just the derivative.
-\item
-Second, the bound is not yet tight, and we seek to improve $N_a$ so that
-it is polynomial on $\llbracket a \rrbracket$.
+ First, It is not yet a direct formalisation of our lexer's complexity,
+ as a complexity proof would require looking into
+ the time it takes to execute {\bf all} the operations
+ involved in the lexer (simp, collect, decode), not just the derivative.
+ \item
+ Second, the bound is not yet tight, and we seek to improve $N_a$ so that
+ it is polynomial on $\llbracket a \rrbracket$.
\end{itemize}
Still, we believe this contribution is fruitful,
because
@@ -2109,25 +2111,25 @@
We refine $\blexersimp$ with $\blexerStrong$ in the next chapter
so that the bound becomes polynomial.
\end{itemize}
-
+
%----------------------------------------------------------------------------------------
% SECTION 4
%----------------------------------------------------------------------------------------
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
One might wonder the actual bound rather than the loose bound we gave
for the convenience of an easier proof.
How much can the regex $r^* \backslash s$ grow?
As earlier graphs have shown,
%TODO: reference that graph where size grows quickly
- they can grow at a maximum speed
- exponential $w.r.t$ the number of characters,
+they can grow at a maximum speed
+exponential $w.r.t$ the number of characters,
but will eventually level off when the string $s$ is long enough.
If they grow to a size exponential $w.r.t$ the original regex, our algorithm
would still be slow.
@@ -2135,23 +2137,23 @@
where such regular expressions grew exponentially large before levelling off:
$(a ^ * + (aa) ^ * + (aaa) ^ * + \ldots +
(\underbrace{a \ldots a}_{\text{n a's}})^*$ will already have a maximum
- size that is exponential on the number $n$
+size that is exponential on the number $n$
under our current simplification rules:
%TODO: graph of a regex whose size increases exponentially.
\begin{center}
-\begin{tikzpicture}
- \begin{axis}[
- height=0.5\textwidth,
- width=\textwidth,
- xlabel=number of a's,
- xtick={0,...,9},
- ylabel=maximum size,
- ymode=log,
- log basis y={2}
-]
- \addplot[mark=*,blue] table {re-chengsong.data};
- \end{axis}
-\end{tikzpicture}
+ \begin{tikzpicture}
+ \begin{axis}[
+ height=0.5\textwidth,
+ width=\textwidth,
+ xlabel=number of a's,
+ xtick={0,...,9},
+ ylabel=maximum size,
+ ymode=log,
+ log basis y={2}
+ ]
+ \addplot[mark=*,blue] table {re-chengsong.data};
+ \end{axis}
+ \end{tikzpicture}
\end{center}
For convenience we use $(\oplus_{i=1}^{n} (\underbrace{a \ldots a}_{\text{i a's}})^*)^*$
@@ -2165,41 +2167,41 @@
will therefore contain the following terms (after flattening out all nested
alternatives):
\begin{center}
-$(\oplus_{i = 1]{n} (\underbrace{a \ldots a}_{\text{((i - (m' \% i))\%i) a's}})\cdot (\underbrace{a \ldots a}_{\text{i a's}})^* })\cdot (\oplus_{i=1}^{n} (\underbrace{a \ldots a}_{\text{i a's}})^*)$\\
-$(1 \leq m' \leq m )$
+ $(\oplus_{i = 1]{n} (\underbrace{a \ldots a}_{\text{((i - (m' \% i))\%i) a's}})\cdot (\underbrace{a \ldots a}_{\text{i a's}})^* })\cdot (\oplus_{i=1}^{n} (\underbrace{a \ldots a}_{\text{i a's}})^*)$\\
+ $(1 \leq m' \leq m )$
\end{center}
These terms are distinct for $m' \leq L.C.M.(1, \ldots, n)$ (will be explained in appendix).
- With each new input character taking the derivative against the intermediate result, more and more such distinct
- terms will accumulate,
+With each new input character taking the derivative against the intermediate result, more and more such distinct
+terms will accumulate,
until the length reaches $L.C.M.(1, \ldots, n)$.
$\textit{distinctBy}$ will not be able to de-duplicate any two of these terms
$(\oplus_{i = 1}^{n} (\underbrace{a \ldots a}_{\text{((i - (m' \% i))\%i) a's}})\cdot (\underbrace{a \ldots a}_{\text{i a's}})^* )\cdot (\oplus_{i=1}^{n} (\underbrace{a \ldots a}_{\text{i a's}})^*)^*$\\
$(\oplus_{i = 1}^{n} (\underbrace{a \ldots a}_{\text{((i - (m'' \% i))\%i) a's}})\cdot (\underbrace{a \ldots a}_{\text{i a's}})^* )\cdot (\oplus_{i=1}^{n} (\underbrace{a \ldots a}_{\text{i a's}})^*)^*$\\
- where $m' \neq m''$ \\
- as they are slightly different.
- This means that with our current simplification methods,
- we will not be able to control the derivative so that
- $\llbracket \bderssimp{r}{s} \rrbracket$ stays polynomial %\leq O((\llbracket r\rrbacket)^c)$
- as there are already exponentially many terms.
- These terms are similar in the sense that the head of those terms
- are all consisted of sub-terms of the form:
- $(\underbrace{a \ldots a}_{\text{j a's}})\cdot (\underbrace{a \ldots a}_{\text{i a's}})^* $.
- For $\oplus_{i=1}^{n} (\underbrace{a \ldots a}_{\text{i a's}})^*$, there will be at most
- $n * (n + 1) / 2$ such terms.
- For example, $(a^* + (aa)^* + (aaa)^*) ^*$'s derivatives
- can be described by 6 terms:
- $a^*$, $a\cdot (aa)^*$, $ (aa)^*$,
- $aa \cdot (aaa)^*$, $a \cdot (aaa)^*$, and $(aaa)^*$.
+where $m' \neq m''$ \\
+as they are slightly different.
+This means that with our current simplification methods,
+we will not be able to control the derivative so that
+$\llbracket \bderssimp{r}{s} \rrbracket$ stays polynomial %\leq O((\llbracket r\rrbacket)^c)$
+as there are already exponentially many terms.
+These terms are similar in the sense that the head of those terms
+are all consisted of sub-terms of the form:
+$(\underbrace{a \ldots a}_{\text{j a's}})\cdot (\underbrace{a \ldots a}_{\text{i a's}})^* $.
+For $\oplus_{i=1}^{n} (\underbrace{a \ldots a}_{\text{i a's}})^*$, there will be at most
+$n * (n + 1) / 2$ such terms.
+For example, $(a^* + (aa)^* + (aaa)^*) ^*$'s derivatives
+can be described by 6 terms:
+$a^*$, $a\cdot (aa)^*$, $ (aa)^*$,
+$aa \cdot (aaa)^*$, $a \cdot (aaa)^*$, and $(aaa)^*$.
The total number of different "head terms", $n * (n + 1) / 2$,
- is proportional to the number of characters in the regex
+is proportional to the number of characters in the regex
$(\oplus_{i=1}^{n} (\underbrace{a \ldots a}_{\text{i a's}})^*)^*$.
This suggests a slightly different notion of size, which we call the
alphabetic width:
%TODO:
(TODO: Alphabetic width def.)
-
+
Antimirov\parencite{Antimirov95} has proven that
$\textit{PDER}_{UNIV}(r) \leq \textit{awidth}(r)$.
where $\textit{PDER}_{UNIV}(r)$ is a set of all possible subterms
@@ -2226,7 +2228,7 @@
by $\simp$.
For example,
\begin{center}
-$\simp \;(\simpALTs\; (\map \;(\_\backslash \; x)\; (\distinct \; \mathit{rs}\; \phi))) =
- \simp \;(\simpALTs \;(\distinct \;(\map \;(\_ \backslash\; x) \; \mathit{rs}) \; \phi))$
+ $\simp \;(\simpALTs\; (\map \;(\_\backslash \; x)\; (\distinct \; \mathit{rs}\; \phi))) =
+ \simp \;(\simpALTs \;(\distinct \;(\map \;(\_ \backslash\; x) \; \mathit{rs}) \; \phi))$
\end{center}