--- a/ChengsongTanPhdThesis/Chapters/Bitcoded1.tex Tue Jul 25 17:28:29 2023 +0100
+++ b/ChengsongTanPhdThesis/Chapters/Bitcoded1.tex Wed Aug 23 03:02:31 2023 +0100
@@ -44,288 +44,351 @@
\end{tabular}
\end{center}
\noindent
-The first problem with this algorithm is that
-for the function $\inj$ to work properly
-we cannot destroy the structure of a regular expression,
-and therefore cannot simplify too aggressively.
-For example,
-\[
- r + (r + a) \rightarrow r + a
-\]
-cannot be applied because that would require
-breaking up the inner alternative.
-The $\lexer$ plus $\textit{simp}$ therefore only enables
-same-level de-duplications like
-\[
- r + r \rightarrow r.
-\]
-Secondly, the algorithm recursively calls $\lexer$ on
-each new character input,
-and before starting a child call
-it stores information of previous lexing steps
-on a stack, in the form of regular expressions
-and characters: $r_0$, $c_0$, $r_1$, $c_1$, etc.
-Each descent into deeper recursive calls in $\lexer$
-causes a new pair of $r_i, c_i$ to be pushed to the call stack.
-\begin{figure}[H]
-\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick]
-%\draw (-6,-6) grid (6,6);
-\node [ circle ] (r) at (-6, 5) {$r$};
+This algorithm works nicely as a functional program that utilizes Brzozowski derivatives:
+each character that has been taken off by a derivative is remembered and stacked up,
+and later injected back in the reverse order in which the derivatives were taken.
+The derivative operation $\backslash$ and its reverse operation
+$\inj$ are of similar shape and complexity, and work in lockstep with each other.
+However, if we take a closer look at the example run
+of $\lexer$ shown in chapter \ref{Inj},
+many inefficiencies become apparent:
+\begin{center}
+ \begin{tabular}{lcl}
+ $(a+\textcolor{magenta}{a}\textcolor{blue}{a})^* \cdot \textcolor{red}{c}$ & $\stackrel{\backslash \textcolor{magenta}{a}}{\rightarrow}$ & $((\ONE + \textcolor{magenta}{\ONE} \textcolor{blue}{a})\cdot (a+aa)^*)\cdot \textcolor{red}{c}+\ZERO$\\
+ %& $\stackrel{\rightarrow}{\backslash a}$ & $((\ONE + \ONE a)\cdot (a+aa)^*)\cdot c + \ZERO$\\
+ & $\stackrel{\backslash \textcolor{blue}{a}}{\rightarrow}$ &
+ $(((\ZERO+(\ZERO a+ \textcolor{blue}{\ONE}))\cdot (a+aa)^* + (\ONE+\ONE a)\cdot (a+aa)^* )\cdot \textcolor{red}{\mathbf{c}} + \ZERO)+\ZERO$\\
+ & $\stackrel{\backslash \textcolor{red}{c}}{\rightarrow}$ &
+ $((r_{=0}\cdot c + \textcolor{red}{\ONE})+\ZERO)+\ZERO$\\
+ & $\stackrel{\mkeps}{\rightarrow}$ & $\Left (\Left \; (\Right \; \textcolor{red}{\Empty}))$ \\
+ & $\stackrel{\inj \;\textcolor{red}{c} }{\rightarrow}$ &
+ $\Left \; (\Left \; (\Seq \;(\Left \; (\Seq \; (\Right \; (\Right\; \textcolor{blue}{\Empty})) \; \Stars \, [])) \; \textcolor{red}{c}))$\\
+ & $\stackrel{\inj \;\textcolor{blue}{a}}{\rightarrow}$ &
+ $\Left\; (\Seq \; (\Seq\; (\Right \; (\Seq \; \textcolor{magenta}{\Empty }\; \textcolor{blue}{ \mathbf{a}}) )
+ \;\Stars \,[]) \; \textcolor{red}{c})$\\
+ & $\stackrel{\inj \;\textcolor{magenta}{a}}{\rightarrow}$ & $\Seq \; (\Stars \; [\Right \; (\Seq \; \mathbf{\textcolor{magenta}{a}} \; \textcolor{blue}{a})]) \; \textcolor{red}{ c}$
-%\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$};
-\node [circle, minimum size = 0.1, draw] (c1) at (-4, 5.4) {$c_1$};
-%
-%\node (-2, 5) (r1) circle [radius = 0.5] {$r_1$};
-\node [minimum size = 0.5, circle, draw] (r1) at (-2, 5) {$r_1$};
-
-\node [minimum width = 2cm, rectangle, draw] (stack) at (0, 3) {Stack};
+ %$\inj \; r \; c\A$
+ %$\inj \; (a\cdot b)\cdot c \; \Seq \; \ONE \; b$ & $=$ & $(a+e)$\\
+ \end{tabular}
+\end{center}
+\noindent
+For instance, unfolding the injection of the second character
+$\textcolor{blue}{a}$ step by step shows how much work a single
+$\inj$ call involves:
-\path
- (r)
- edge [->, >=stealth',shorten >=1pt] node[left] {} (r1);
-
-\path (r1)
- edge [bend right, dashed] node {saved} (stack);
-\path (c1)
- edge [bend right, dashed] node {} (stack);
-
-
-\end{tikzpicture}
-\caption{First derivative taken}
-\end{figure}
+\begin{center}
+  \begin{tabular}{cl}
+  & $\inj \; (((\ONE + \ONE \textcolor{blue}{a})\cdot (a+aa)^*)\cdot c + \ZERO) \quad \textcolor{blue}{a} \quad
+    (\Left \; (\Left \; (\Seq \;(\Left \; (\Seq \; (\Right \; (\Right\; \textcolor{blue}{\Empty})) \; \Stars \, [])) \; c)))$\\
+  $=$ & $\Left \; (\inj \; (((\ONE + \ONE \textcolor{blue}{a})\cdot (a+aa)^*)\cdot c) \quad \textcolor{blue}{a} \quad
+    (\Left \; (\Seq\; (\Left \; (\Seq \; (\Right \; (\Right\; \textcolor{blue}{\Empty})) \; \Stars \, [])) \; c)))$\\
+  $=$ & $\Left \; (\Seq \; (\inj \; ((\ONE + \ONE \textcolor{blue}{a})\cdot(a+aa)^*) \quad \textcolor{blue}{a} \quad
+    (\Left \; (\Seq \; (\Right \; (\Right\;\textcolor{blue}{\Empty})) \; \Stars \, []))) \; c)$\\
+  $=$ & $\Left \; (\Seq \; (\Seq \; (\inj \; (\ONE + \ONE \textcolor{blue}{a}) \quad \textcolor{blue}{a} \quad (\Right \;(\Right \; \textcolor{blue}{\Empty}))) \; \Stars \,[]) \; c)$\\
+  $=$ & $\Left \; (\Seq \; (\Seq \; (\Right \; (\inj \; \ONE \textcolor{blue}{a} \quad \textcolor{blue}{a} \quad (\Right \;\textcolor{blue}{\Empty}))) \; \Stars \, []) \; c)$\\
+  $=$ & $\Left \; (\Seq \; (\Seq \; (\Right \; (\Seq \; (\mkeps \; \ONE)\;(\inj \;\textcolor{blue}{a} \; \textcolor{blue}{a} \; \textcolor{blue}{\Empty}))) \; \Stars \, []) \; c)$\\
+  $=$ & $\Left \; (\Seq \; (\Seq \; (\Right \; (\Seq \; \Empty \; \textcolor{blue}{a})) \; \Stars \, []) \; c)$
+  \end{tabular}
+\end{center}
-\begin{figure}[H]
-\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick]
-%\draw (-6,-6) grid (6,6);
-\node [ circle ] (r) at (-6, 5) {$r$};
-%\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$};
-\node [circle, minimum size = 0.1, ] (c1) at (-4, 5.4) {$c_1$};
-%
-%\node (-2, 5) (r1) circle [radius = 0.5] {$r_1$};
-\node [minimum size = 0.5, circle, ] (r1) at (-2, 5) {$r_1$};
-
-
-\node [circle, minimum size = 0.1, draw] (c2) at (0, 5.4) {$c_2$};
-%
-%\node (2, 5) (r2) circle [radius = 0.5] {$r_2$};
-\node [circle, draw] (r2) at (2, 5) {$r_2$};
-\node [minimum width = 3cm, minimum height = 1cm, rectangle, draw] (stack) at (0, 2) {\large Stack};
-
-\path
- (r)
- edge [->, >=stealth',shorten >=1pt] node[left] {} (r1);
-
-\path (r2)
- edge [bend right, dashed] node {} (stack);
-\path (c2)
- edge [bend right, dashed] node {} (stack);
-
-\path (r1)
- edge [] node {} (r2);
-
-\end{tikzpicture}
-\caption{Second derivative taken}
-\end{figure}
-\noindent
-As the number of derivative steps increases,
-the stack would increase:
-
-\begin{figure}[H]
-\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick]
-%\draw (-6,-6) grid (6,6);
-\node [ circle ] (r) at (-6, 5) {$r$};
-
-%\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$};
-\node [circle, minimum size = 0.1, ] (c1) at (-4, 5.4) {$c_1$};
-%
-%\node (-2, 5) (r1) circle [radius = 0.5] {$r_1$};
-\node [minimum size = 0.5, circle, ] (r1) at (-2, 5) {$r_1$};
-
-
-\node [circle, minimum size = 0.1, ] (c2) at (0, 5.4) {$c_2$};
-%
-%\node (2, 5) (r2) circle [radius = 0.5] {$r_2$};
-\node [circle, ] (r2) at (2, 5) {$r_2$};
-\node [minimum width = 4cm, minimum height = 2.5cm, rectangle, draw] (stack) at (0, 1) { \large Stack};
-
-\node [] (ldots) at (3.5, 5) {};
-%\node (6, 5) (rn) circle [radius = 0.5] {$r_n$};
-
-\node [minimum size = 0.5, circle, ] (rn) at (6, 5) {};
-
-\node (rldots) at ($(ldots)!.4!(rn)$) {\ldots};
-
-\path
- (r)
- edge [->, >=stealth',shorten >=1pt] node[left] {} (r1);
-
-\path (rldots)
- edge [bend left, dashed] node {} (stack);
-
-\path (r1)
- edge [] node {} (r2);
-
-\path (r2)
- edge [] node {} (ldots);
-\path (ldots)
- edge [bend left, dashed] node {} (stack);
-\path (5.03, 4.9)
- edge [bend left, dashed] node {} (stack);
-\end{tikzpicture}
-\caption{More derivatives taken}
-\end{figure}
-
-
-\begin{figure}[H]
-\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick]
-%\draw (-6,-6) grid (6,6);
-\node [radius = 0.5, circle, draw] (r) at (-6, 5) {$r$};
-
-%\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$};
-\node [circle, minimum size = 0.1, draw] (c1) at (-4, 5.4) {$c_1$};
-%
-%\node (-2, 5) (r1) circle [radius = 0.5] {$r_1$};
-\node [minimum size = 0.5, circle, draw] (r1) at (-2, 5) {$r_1$};
-%
-%\node (0, 6) (c2) circle [radius = 0.3] {$c_2$};
-\node [circle, minimum size = 0.1, draw] (c2) at (0, 5.4) {$c_2$};
-%
-%\node (2, 5) (r2) circle [radius = 0.5] {$r_2$};
-\node [circle, draw] (r2) at (2, 5) {$r_2$};
-%
-%
-\node [] (ldots) at (4.5, 5) {};
-%\node (6, 5) (rn) circle [radius = 0.5] {$r_n$};
-
-\node [minimum size = 0.5, circle, draw] (rn) at (6, 5) {$r_n$};
-
-\node at ($(ldots)!.4!(rn)$) {\ldots};
-\node [minimum size = 6cm, rectangle, draw] (stack) at (0, 0) {\Huge Stack};
-
-\path
- (r)
- edge [->, >=stealth',shorten >=1pt] node[left] {} (r1);
-\path
- (r1)
- edge [] node {} (r2);
-\path (r2)
- edge [] node {} (ldots);
-\path (r)
- edge [dashed, bend right] node {} (stack);
-
-\path (r1)
- edge [dashed, ] node {} (stack);
+% The first problem with this algorithm is that
+% for the function $\inj$ to work properly
+% we cannot destroy the structure of a regular expression,
+% and therefore cannot simplify along the way without mechanisms
+% that restores the values during the simplification process.
+% %too aggressively.
+% For example,
+% \[
+% r + (r + a) \rightarrow r + a
+% \]
+% cannot be applied because that would require
+% breaking up the inner alternative.
+% The $\lexer$ plus $\textit{simp}$ therefore only enables
+% same-level de-duplications like
+% \[
+% r + r \rightarrow r.
+% \]
+% Secondly, the algorithm recursively calls $\lexer$ on
+% each new character input,
+% and before starting a child call
+% it stores information of previous lexing steps
+% on a stack, in the form of regular expressions
+% and characters: $r_0$, $c_0$, $r_1$, $c_1$, etc.
+% Each descent into deeper recursive calls in $\lexer$
+% causes a new pair of $r_i, c_i$ to be pushed to the call stack.
+% \begin{figure}[H]
+% \begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick]
+% %\draw (-6,-6) grid (6,6);
+% \node [ circle ] (r) at (-6, 5) {$r$};
-\path (c1)
- edge [dashed, bend right] node {} (stack);
+% %\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$};
+% \node [circle, minimum size = 0.1, draw] (c1) at (-4, 5.4) {$c_1$};
+% %
+% %\node (-2, 5) (r1) circle [radius = 0.5] {$r_1$};
+% \node [minimum size = 0.5, circle, draw] (r1) at (-2, 5) {$r_1$};
+
+% \node [minimum width = 2cm, rectangle, draw] (stack) at (0, 3) {Stack};
+
+% \path
+% (r)
+% edge [->, >=stealth',shorten >=1pt] node[left] {} (r1);
+
+% \path (r1)
+% edge [bend right, dashed] node {saved} (stack);
+% \path (c1)
+% edge [bend right, dashed] node {} (stack);
-\path (c2)
- edge [dashed] node {} (stack);
-\path (4.5, 5)
- edge [dashed, bend left] node {} (stack);
-\path (4.9, 5)
- edge [dashed, bend left] node {} (stack);
-\path (5.3, 5)
- edge [dashed, bend left] node {} (stack);
-\path (r2)
- edge [dashed, ] node {} (stack);
-\path (rn)
- edge [dashed, bend left] node {} (stack);
-\end{tikzpicture}
-\caption{Before injection phase starts}
-\end{figure}
+
+% \end{tikzpicture}
+% \caption{First derivative taken}
+% \end{figure}
+
+
+
+% \begin{figure}[H]
+% \begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick]
+% %\draw (-6,-6) grid (6,6);
+% \node [ circle ] (r) at (-6, 5) {$r$};
+
+% %\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$};
+% \node [circle, minimum size = 0.1, ] (c1) at (-4, 5.4) {$c_1$};
+% %
+% %\node (-2, 5) (r1) circle [radius = 0.5] {$r_1$};
+% \node [minimum size = 0.5, circle, ] (r1) at (-2, 5) {$r_1$};
-\noindent
+% \node [circle, minimum size = 0.1, draw] (c2) at (0, 5.4) {$c_2$};
+% %
+% %\node (2, 5) (r2) circle [radius = 0.5] {$r_2$};
+% \node [circle, draw] (r2) at (2, 5) {$r_2$};
+% \node [minimum width = 3cm, minimum height = 1cm, rectangle, draw] (stack) at (0, 2) {\large Stack};
+
+% \path
+% (r)
+% edge [->, >=stealth',shorten >=1pt] node[left] {} (r1);
+
+% \path (r2)
+% edge [bend right, dashed] node {} (stack);
+% \path (c2)
+% edge [bend right, dashed] node {} (stack);
+
+% \path (r1)
+% edge [] node {} (r2);
+
+% \end{tikzpicture}
+% \caption{Second derivative taken}
+% \end{figure}
+% \noindent
+% As the number of derivative steps increases,
+% the stack would increase:
+
+% \begin{figure}[H]
+% \begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick]
+% %\draw (-6,-6) grid (6,6);
+% \node [ circle ] (r) at (-6, 5) {$r$};
+
+% %\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$};
+% \node [circle, minimum size = 0.1, ] (c1) at (-4, 5.4) {$c_1$};
+% %
+% %\node (-2, 5) (r1) circle [radius = 0.5] {$r_1$};
+% \node [minimum size = 0.5, circle, ] (r1) at (-2, 5) {$r_1$};
+
+
+% \node [circle, minimum size = 0.1, ] (c2) at (0, 5.4) {$c_2$};
+% %
+% %\node (2, 5) (r2) circle [radius = 0.5] {$r_2$};
+% \node [circle, ] (r2) at (2, 5) {$r_2$};
+% \node [minimum width = 4cm, minimum height = 2.5cm, rectangle, draw] (stack) at (0, 1) { \large Stack};
+
+% \node [] (ldots) at (3.5, 5) {};
+% %\node (6, 5) (rn) circle [radius = 0.5] {$r_n$};
+
+% \node [minimum size = 0.5, circle, ] (rn) at (6, 5) {};
+
+% \node (rldots) at ($(ldots)!.4!(rn)$) {\ldots};
+
+% \path
+% (r)
+% edge [->, >=stealth',shorten >=1pt] node[left] {} (r1);
+
+% \path (rldots)
+% edge [bend left, dashed] node {} (stack);
+
+% \path (r1)
+% edge [] node {} (r2);
+
+% \path (r2)
+% edge [] node {} (ldots);
+% \path (ldots)
+% edge [bend left, dashed] node {} (stack);
+% \path (5.03, 4.9)
+% edge [bend left, dashed] node {} (stack);
+% \end{tikzpicture}
+% \caption{More derivatives taken}
+% \end{figure}
+
+
+% \begin{figure}[H]
+% \begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick]
+% %\draw (-6,-6) grid (6,6);
+% \node [radius = 0.5, circle, draw] (r) at (-6, 5) {$r$};
+
+% %\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$};
+% \node [circle, minimum size = 0.1, draw] (c1) at (-4, 5.4) {$c_1$};
+% %
+% %\node (-2, 5) (r1) circle [radius = 0.5] {$r_1$};
+% \node [minimum size = 0.5, circle, draw] (r1) at (-2, 5) {$r_1$};
+% %
+% %\node (0, 6) (c2) circle [radius = 0.3] {$c_2$};
+% \node [circle, minimum size = 0.1, draw] (c2) at (0, 5.4) {$c_2$};
+% %
+% %\node (2, 5) (r2) circle [radius = 0.5] {$r_2$};
+% \node [circle, draw] (r2) at (2, 5) {$r_2$};
+% %
+% %
+% \node [] (ldots) at (4.5, 5) {};
+% %\node (6, 5) (rn) circle [radius = 0.5] {$r_n$};
+
+% \node [minimum size = 0.5, circle, draw] (rn) at (6, 5) {$r_n$};
+
+% \node at ($(ldots)!.4!(rn)$) {\ldots};
+
+
+
+
+% \node [minimum size = 6cm, rectangle, draw] (stack) at (0, 0) {\Huge Stack};
+
+% \path
+% (r)
+% edge [->, >=stealth',shorten >=1pt] node[left] {} (r1);
+% \path
+% (r1)
+% edge [] node {} (r2);
+% \path (r2)
+% edge [] node {} (ldots);
+% \path (r)
+% edge [dashed, bend right] node {} (stack);
+
+% \path (r1)
+% edge [dashed, ] node {} (stack);
+
+% \path (c1)
+% edge [dashed, bend right] node {} (stack);
+
+% \path (c2)
+% edge [dashed] node {} (stack);
+% \path (4.5, 5)
+% edge [dashed, bend left] node {} (stack);
+% \path (4.9, 5)
+% edge [dashed, bend left] node {} (stack);
+% \path (5.3, 5)
+% edge [dashed, bend left] node {} (stack);
+% \path (r2)
+% edge [dashed, ] node {} (stack);
+% \path (rn)
+% edge [dashed, bend left] node {} (stack);
+% \end{tikzpicture}
+% \caption{Before injection phase starts}
+% \end{figure}
+
+
+% \noindent
After all derivatives have been taken, the stack grows to a maximum size
and the pairs of regular expressions and characters $r_i, c_{i+1}$
are then popped out and used in the injection phase.
-\begin{figure}[H]
-\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick]
-%\draw (-6,-6) grid (6,6);
-\node [radius = 0.5, circle, ] (r) at (-6, 5) {$r$};
+% \begin{figure}[H]
+% \begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick]
+% %\draw (-6,-6) grid (6,6);
+% \node [radius = 0.5, circle, ] (r) at (-6, 5) {$r$};
-%\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$};
-\node [circle, minimum size = 0.1, ] (c1) at (-4, 5.4) {$c_1$};
-%
-%\node (-2, 5) (r1) circle [radius = 0.5] {$r_1$};
-\node [minimum size = 0.5, circle, ] (r1) at (-2, 5) {$r_1$};
-%
-%\node (0, 6) (c2) circle [radius = 0.3] {$c_2$};
-\node [circle, minimum size = 0.1, ] (c2) at (0, 5.4) {$c_2$};
-%
-%\node (2, 5) (r2) circle [radius = 0.5] {$r_2$};
-\node [circle, ] (r2) at (2, 5) {$r_2$};
-%
-%
-\node [] (ldots) at (4.5, 5) {};
-%\node (6, 5) (rn) circle [radius = 0.5] {$r_n$};
+% %\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$};
+% \node [circle, minimum size = 0.1, ] (c1) at (-4, 5.4) {$c_1$};
+% %
+% %\node (-2, 5) (r1) circle [radius = 0.5] {$r_1$};
+% \node [minimum size = 0.5, circle, ] (r1) at (-2, 5) {$r_1$};
+% %
+% %\node (0, 6) (c2) circle [radius = 0.3] {$c_2$};
+% \node [circle, minimum size = 0.1, ] (c2) at (0, 5.4) {$c_2$};
+% %
+% %\node (2, 5) (r2) circle [radius = 0.5] {$r_2$};
+% \node [circle, ] (r2) at (2, 5) {$r_2$};
+% %
+% %
+% \node [] (ldots) at (4.5, 5) {};
+% %\node (6, 5) (rn) circle [radius = 0.5] {$r_n$};
-\node [minimum size = 0.5, circle, ] (rn) at (6, 5) {$r_n$};
+% \node [minimum size = 0.5, circle, ] (rn) at (6, 5) {$r_n$};
-\node at ($(ldots)!.4!(rn)$) {\ldots};
+% \node at ($(ldots)!.4!(rn)$) {\ldots};
-\node [minimum size = 0.5, circle, ] (vn) at (6, -5) {$v_n$};
+% \node [minimum size = 0.5, circle, ] (vn) at (6, -5) {$v_n$};
-\node [] (ldots2) at (3.5, -5) {};
+% \node [] (ldots2) at (3.5, -5) {};
-\node [minimum size = 0.5, circle, ] (v2) at (2, -5) {$v_2$};
+% \node [minimum size = 0.5, circle, ] (v2) at (2, -5) {$v_2$};
-\node at ($(ldots2)!.4!(v2)$) {\ldots};
+% \node at ($(ldots2)!.4!(v2)$) {\ldots};
-\node [circle, ] (v1) at (-2, -5) {$v_1$};
+% \node [circle, ] (v1) at (-2, -5) {$v_1$};
-\node [radius = 0.5, circle, ] (v) at (-6, -5) {$v$};
+% \node [radius = 0.5, circle, ] (v) at (-6, -5) {$v$};
-\node [minimum size = 6cm, rectangle, draw] (stack) at (0, 0) {\Huge Stack};
+% \node [minimum size = 6cm, rectangle, draw] (stack) at (0, 0) {\Huge Stack};
-\path
- (r)
- edge [->, >=stealth',shorten >=1pt] node[left] {} (r1);
-\path
- (r1)
- edge [] node {} (r2);
-\path (r2)
- edge [] node {} (ldots);
-\path (rn)
- edge [] node {$\mkeps$} (vn);
-\path (vn)
- edge [] node {} (ldots2);
-\path (v2)
- edge [] node {$\inj \; r_1 \; c_2\;v_2$} (v1);
+% \path
+% (r)
+% edge [->, >=stealth',shorten >=1pt] node[left] {} (r1);
+% \path
+% (r1)
+% edge [] node {} (r2);
+% \path (r2)
+% edge [] node {} (ldots);
+% \path (rn)
+% edge [] node {$\mkeps$} (vn);
+% \path (vn)
+% edge [] node {} (ldots2);
+% \path (v2)
+% edge [] node {$\inj \; r_1 \; c_2\;v_2$} (v1);
-\path (v1)
- edge [] node {$\inj \; r \; c_1 \; v_1$} (v);
+% \path (v1)
+% edge [] node {$\inj \; r \; c_1 \; v_1$} (v);
-\path (stack)
- edge [dashed] node {} (-4.2, -5.2);
-\path (stack)
- edge [dashed] node {} (-4, -5.2);
-\path (stack)
- edge [dashed] node {} (-0.1, -5.2);
-\path (stack)
- edge [dashed] node {} (0.2, -5.26);
-\path (stack)
- edge [dashed] node {} (3.2, -5);
-\path (stack)
- edge [dashed] node {} (2.7, -5);
-\path (stack)
- edge [dashed] node {} (3.7, -5);
-\end{tikzpicture}
-\caption{Stored $r_i, c_{i+1}$ Used by $\inj$}
-\end{figure}
-\noindent
+% \path (stack)
+% edge [dashed] node {} (-4.2, -5.2);
+% \path (stack)
+% edge [dashed] node {} (-4, -5.2);
+% \path (stack)
+% edge [dashed] node {} (-0.1, -5.2);
+% \path (stack)
+% edge [dashed] node {} (0.2, -5.26);
+% \path (stack)
+% edge [dashed] node {} (3.2, -5);
+% \path (stack)
+% edge [dashed] node {} (2.7, -5);
+% \path (stack)
+% edge [dashed] node {} (3.7, -5);
+% \end{tikzpicture}
+% \caption{Stored $r_i, c_{i+1}$ Used by $\inj$}
+% \end{figure}
+% \noindent
Storing all $r_i, c_{i+1}$ pairs recursively
allows the algorithm to work in an elegant way, at the expense of
keeping quite a bit of verbose information on the stack.
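+To make this implicit stack concrete, the following is a small Scala sketch of the
+recursion in $\lexer$ (an illustration only, not the verified Isabelle code; the
+functions $\textit{der}$, $\textit{nullable}$, $\textit{mkeps}$ and $\textit{inj}$
+are assumed to be the ones defined in chapter \ref{Inj}):
+\begin{lstlisting}[language=Scala]
+// Each recursive call keeps its pair (r, c) alive on the call stack
+// until the corresponding injection happens on the way back up.
+def lexer(r: Rexp, s: List[Char]): Option[Val] = s match {
+  case Nil => if (nullable(r)) Some(mkeps(r)) else None
+  case c :: cs => lexer(der(c, r), cs).map(v => inj(r, c, v))
+}
+\end{lstlisting}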
--- a/ChengsongTanPhdThesis/Chapters/Finite.tex Tue Jul 25 17:28:29 2023 +0100
+++ b/ChengsongTanPhdThesis/Chapters/Finite.tex Wed Aug 23 03:02:31 2023 +0100
@@ -1,6 +1,6 @@
% Chapter Template
-\chapter{Finiteness Bound} % Main chapter title
+\chapter{A Formal Proof That $\textit{Blexer}\_\textit{simp}$'s Derivatives Do Not Grow Unboundedly} % Main chapter title
\label{Finite}
% In Chapter 4 \ref{Chapter4} we give the second guarantee
@@ -8,15 +8,18 @@
%regex's derivatives.
%(this is cahpter 5 now)
-In this chapter we give a bound in terms of the size of
+In this chapter we prove a bound in terms of the size of
the calculated derivatives:
-given an annotated regular expression $a$, for any string $s$
+given an annotated regular expression $a$, there exists
+a constant integer $N$, depending only on $a$, such that for any string $s$
our algorithm $\blexersimp$'s derivatives
-are finitely bounded
-by a constant that only depends on $a$.
-Formally we show that there exists a constant integer $N_a$ such that
+are bounded
+by $N$. %a constant that only depends on $a$.
+Formally this can be expressed
+as
+%we show that there exists a constant integer $N_a$ such that
\begin{center}
- $\llbracket \bderssimp{a}{s} \rrbracket \leq N_a$
+ $\llbracket \bderssimp{a}{s} \rrbracket \leq N$
\end{center}
\noindent
where the size ($\llbracket \_ \rrbracket$) of
@@ -36,7 +39,7 @@
save other possibilities on a stack when exploring one possible
path of matching. Catastrophic backtracking typically occurs
when the number of steps increase exponentially with respect
- to input. In other words, the runtime is $O((c_r)^n)$ of the input
+ to input. In other words, the complexity is $O((c_r)^n)$ in terms of the input
string length $n$, where the base of the exponent is determined by the
regular expression $r$.
%so that they
@@ -107,7 +110,7 @@
$\llbracket \ZERO \rrbracket$ & $\dn$ & $1$ \\
$\llbracket _{bs} r_1 \cdot r_2 \rrbracket$ & $\dn$ & $\llbracket r_1 \rrbracket + \llbracket r_2 \rrbracket + 1$\\
$\llbracket _{bs}\mathbf{c} \rrbracket $ & $\dn$ & $1$\\
- $\llbracket _{bs}\sum as \rrbracket $ & $\dn$ & $\map \; (\llbracket \_ \rrbracket)\; as + 1$\\
+ $\llbracket _{bs}\sum as \rrbracket $ & $\dn$ & $(\textit{sum}\; (\map \; (\llbracket \_ \rrbracket)\; as) ) + 1$\\
$\llbracket _{bs} a^* \rrbracket $ & $\dn$ & $\llbracket a \rrbracket + 1$.
\end{tabular}
\end{center}
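+As an illustration, the size function above can be phrased as the following small
+Scala sketch (the datatype and constructor names are assumed for illustration and
+are not the ones used in the formalisation):
+\begin{lstlisting}[language=Scala]
+// Size of an annotated regular expression: one node plus the sizes
+// of all sub-expressions; the bitcodes do not contribute to the size.
+def asize(a: ARexp): Int = a match {
+  case AZERO           => 1
+  case AONE(_)         => 1
+  case ACHAR(_, _)     => 1
+  case ASEQ(_, a1, a2) => asize(a1) + asize(a2) + 1
+  case AALTS(_, as)    => as.map(asize).sum + 1
+  case ASTAR(_, a1)    => asize(a1) + 1
+}
+\end{lstlisting}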
@@ -152,40 +155,40 @@
Our proof states that all the blue nodes
stay below a size bound $N_a$ determined by the input $a$.
-\noindent
Sulzmann and Lu assumed a similar picture of their algorithm,
-though in fact their algorithm's size might be better depicted by the following graph:
-\begin{figure}[H]
- \begin{tikzpicture}[scale=2,
- every node/.style={minimum size=11mm},
- ->,>=stealth',shorten >=1pt,auto,thick
- ]
- \node (r0) [rectangle, draw=black, thick, minimum size = 5mm, draw=blue] {$a$};
- \node (r1) [rectangle, draw=black, thick, right=of r0, minimum size = 7mm]{$a_1$};
- \draw[->,line width=0.2mm](r0)--(r1) node[above,midway] {$\backslash c_1$};
-
- \node (r1s) [rectangle, draw=blue, thick, right=of r1, minimum size=7mm]{$a_{1s}$};
- \draw[->, line width=0.2mm](r1)--(r1s) node[above, midway] {$\simp'$};
-
- \node (r2) [rectangle, draw=black, thick, right=of r1s, minimum size = 17mm]{$a_2$};
- \draw[->,line width=0.2mm](r1s)--(r2) node[above,midway] {$\backslash c_2$};
-
- \node (r2s) [rectangle, draw = blue, thick, right=of r2,minimum size=14mm]{$a_{2s}$};
- \draw[->,line width=0.2mm](r2)--(r2s) node[above,midway] {$\simp'$};
-
- \node (r3) [rectangle, draw = black, thick, right= of r2s, minimum size = 22mm]{$a_3$};
- \draw[->,line width=0.2mm](r2s)--(r3) node[above,midway] {$\backslash c_3$};
-
- \node (rns) [right = of r3, draw=blue, minimum size = 20mm]{$a_{3s}$};
- \draw[->,line width=0.2mm] (r3)--(rns) node [above, midway] {$\simp'$};
-
- \node (rnn) [right = of rns, minimum size = 1mm]{};
- \draw[->, dashed] (rns)--(rnn) node [above, midway] {$\ldots$};
-
- \end{tikzpicture}
- \caption{Regular expression size change during our $\blexersimp$ algorithm}\label{sulzShrinks}
-\end{figure}
-\noindent
+even though it did not work as they expected.
+%though in fact their algorithm's size might be better depicted by the following graph:
+%\begin{figure}[H]
+% \begin{tikzpicture}[scale=2,
+% every node/.style={minimum size=11mm},
+% ->,>=stealth',shorten >=1pt,auto,thick
+% ]
+% \node (r0) [rectangle, draw=black, thick, minimum size = 5mm, draw=blue] {$a$};
+% \node (r1) [rectangle, draw=black, thick, right=of r0, minimum size = 7mm]{$a_1$};
+% \draw[->,line width=0.2mm](r0)--(r1) node[above,midway] {$\backslash c_1$};
+%
+% \node (r1s) [rectangle, draw=blue, thick, right=of r1, minimum size=7mm]{$a_{1s}$};
+% \draw[->, line width=0.2mm](r1)--(r1s) node[above, midway] {$\simp'$};
+%
+% \node (r2) [rectangle, draw=black, thick, right=of r1s, minimum size = 17mm]{$a_2$};
+% \draw[->,line width=0.2mm](r1s)--(r2) node[above,midway] {$\backslash c_2$};
+%
+% \node (r2s) [rectangle, draw = blue, thick, right=of r2,minimum size=14mm]{$a_{2s}$};
+% \draw[->,line width=0.2mm](r2)--(r2s) node[above,midway] {$\simp'$};
+%
+% \node (r3) [rectangle, draw = black, thick, right= of r2s, minimum size = 22mm]{$a_3$};
+% \draw[->,line width=0.2mm](r2s)--(r3) node[above,midway] {$\backslash c_3$};
+%
+% \node (rns) [right = of r3, draw=blue, minimum size = 20mm]{$a_{3s}$};
+% \draw[->,line width=0.2mm] (r3)--(rns) node [above, midway] {$\simp'$};
+%
+% \node (rnn) [right = of rns, minimum size = 1mm]{};
+% \draw[->, dashed] (rns)--(rnn) node [above, midway] {$\ldots$};
+%
+% \end{tikzpicture}
+% \caption{Regular expression size change during our $\blexersimp$ algorithm}\label{sulzShrinks}
+%\end{figure}
+%\noindent
In fact, in some cases their lexer (where they use $\simpsulz$
as the simplification function)
will have a size explosion, causing the running time
@@ -765,10 +768,13 @@
we have to somehow get a more concrete form to begin.
We call such more concrete representations the ``closed forms'' of
string derivatives as opposed to their original definitions.
-The terminology ``closed form'' is borrowed from mathematics,
-which usually describe expressions that are solely comprised of finitely many
-well-known and easy-to-compute operations such as
-additions, multiplications, and exponential functions.
+The name ``closed form'' was inspired by closed forms in mathematics,
+and the similarity here is that such forms make
+estimating the size of the terms they describe easier.
+%The terminology ``closed form'' is borrowed from mathematics,
+%which usually describe expressions that are solely comprised of finitely many
+%well-known and easy-to-compute operations such as
+%additions, multiplications, and exponential functions.
We start by proving some basic identities
involving the simplification functions for r-regular expressions.
--- a/ChengsongTanPhdThesis/Chapters/Inj.tex Tue Jul 25 17:28:29 2023 +0100
+++ b/ChengsongTanPhdThesis/Chapters/Inj.tex Wed Aug 23 03:02:31 2023 +0100
@@ -7,906 +7,33 @@
% used for describing the lexing algorithm by Sulzmann and Lu,
%and then give the algorithm and its variant and discuss
%why more aggressive simplifications are needed.
-We start with a technical overview of the catastrophic backtracking problem,
-motivating rigorous approaches to regular expression matching and lexing.
-Then we introduce basic notations and definitions of our problem.
-
-\section{Technical Overview}
-Consider for example the regular expression $(a^*)^*\,b$ and
-strings of the form $aa..a$. These strings cannot be matched by this regular
-expression: obviously the expected $b$ in the last
-position is missing. One would assume that modern regular expression
-matching engines can find this out very quickly. Surprisingly, if one tries
-this example in JavaScript, Python or Java 8, even with small strings,
-say of lenght of around 30 $a$'s,
-the decision takes large amounts of time to finish.
-This is inproportional
-to the simplicity of the input (see graphs in figure \ref{fig:aStarStarb}).
-The algorithms clearly show exponential behaviour, and as can be seen
-is triggered by some relatively simple regular expressions.
-Java 9 and newer
-versions mitigates this behaviour by several magnitudes,
-but are still slow compared
-with the approach we are going to use in this thesis.
-
-
-
-This superlinear blowup in regular expression engines
-has caused grief in ``real life'' where it is
-given the name ``catastrophic backtracking'' or ``evil'' regular expressions.
-For example, on 20 July 2016 one evil
-regular expression brought the webpage
-\href{http://stackexchange.com}{Stack Exchange} to its
-knees.\footnote{\url{https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}(Last accessed in 2019)}
-In this instance, a regular expression intended to just trim white
-spaces from the beginning and the end of a line actually consumed
-massive amounts of CPU resources---causing the web servers to grind to a
-halt. In this example, the time needed to process
-the string was
-$O(n^2)$ with respect to the string length $n$. This
-quadratic overhead was enough for the homepage of Stack Exchange to
-respond so slowly that the load balancer assumed a $\mathit{DoS}$
-attack and therefore stopped the servers from responding to any
-requests. This made the whole site become unavailable.
-
-\begin{figure}[p]
-\begin{center}
-\begin{tabular}{@{}c@{\hspace{0mm}}c@{}}
-\begin{tikzpicture}
-\begin{axis}[
- xlabel={$n$},
- x label style={at={(1.05,-0.05)}},
- ylabel={time in secs},
- enlargelimits=false,
- xtick={0,5,...,30},
- xmax=33,
- ymax=35,
- ytick={0,5,...,30},
- scaled ticks=false,
- axis lines=left,
- width=5cm,
- height=4cm,
- legend entries={JavaScript},
- legend pos=north west,
- legend cell align=left]
-\addplot[red,mark=*, mark options={fill=white}] table {re-js.data};
-\end{axis}
-\end{tikzpicture}
- &
-\begin{tikzpicture}
-\begin{axis}[
- xlabel={$n$},
- x label style={at={(1.05,-0.05)}},
- %ylabel={time in secs},
- enlargelimits=false,
- xtick={0,5,...,30},
- xmax=33,
- ymax=35,
- ytick={0,5,...,30},
- scaled ticks=false,
- axis lines=left,
- width=5cm,
- height=4cm,
- legend entries={Python},
- legend pos=north west,
- legend cell align=left]
-\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
-\end{axis}
-\end{tikzpicture}\\
-\begin{tikzpicture}
-\begin{axis}[
- xlabel={$n$},
- x label style={at={(1.05,-0.05)}},
- ylabel={time in secs},
- enlargelimits=false,
- xtick={0,5,...,30},
- xmax=33,
- ymax=35,
- ytick={0,5,...,30},
- scaled ticks=false,
- axis lines=left,
- width=5cm,
- height=4cm,
- legend entries={Java 8},
- legend pos=north west,
- legend cell align=left]
-\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
-\end{axis}
-\end{tikzpicture}
- &
-\begin{tikzpicture}
-\begin{axis}[
- xlabel={$n$},
- x label style={at={(1.05,-0.05)}},
- %ylabel={time in secs},
- enlargelimits=false,
- xtick={0,5,...,30},
- xmax=33,
- ymax=35,
- ytick={0,5,...,30},
- scaled ticks=false,
- axis lines=left,
- width=5cm,
- height=4cm,
- legend entries={Dart},
- legend pos=north west,
- legend cell align=left]
-\addplot[green,mark=*, mark options={fill=white}] table {re-dart.data};
-\end{axis}
-\end{tikzpicture}\\
-\begin{tikzpicture}
-\begin{axis}[
- xlabel={$n$},
- x label style={at={(1.05,-0.05)}},
- ylabel={time in secs},
- enlargelimits=false,
- xtick={0,5,...,30},
- xmax=33,
- ymax=35,
- ytick={0,5,...,30},
- scaled ticks=false,
- axis lines=left,
- width=5cm,
- height=4cm,
- legend entries={Swift},
- legend pos=north west,
- legend cell align=left]
-\addplot[purple,mark=*, mark options={fill=white}] table {re-swift.data};
-\end{axis}
-\end{tikzpicture}
- &
-\begin{tikzpicture}
-\begin{axis}[
- xlabel={$n$},
- x label style={at={(1.05,-0.05)}},
- %ylabel={time in secs},
- enlargelimits=true,
- %xtick={0,5000,...,40000},
- %xmax=40000,
- %ymax=35,
- restrict x to domain*=0:40000,
- restrict y to domain*=0:35,
- %ytick={0,5,...,30},
- %scaled ticks=false,
- axis lines=left,
- width=5cm,
- height=4cm,
- legend entries={Java9+},
- legend pos=north west,
- legend cell align=left]
-\addplot[orange,mark=*, mark options={fill=white}] table {re-java9.data};
-\end{axis}
-\end{tikzpicture}\\
-\begin{tikzpicture}
-\begin{axis}[
- xlabel={$n$},
- x label style={at={(1.05,-0.05)}},
- ylabel={time in secs},
- enlargelimits=true,
- %xtick={0,5,...,30},
- %xmax=33,
- %ymax=35,
- restrict x to domain*=0:60000,
- restrict y to domain*=0:35,
- %ytick={0,5,...,30},
- %scaled ticks=false,
- axis lines=left,
- width=5cm,
- height=4cm,
- legend entries={Scala},
- legend pos=north west,
- legend cell align=left]
-\addplot[magenta,mark=*, mark options={fill=white}] table {re-blexersimp.data};
-\end{axis}
-\end{tikzpicture}
- &
-\begin{tikzpicture}
-\begin{axis}[
- xlabel={$n$},
- x label style={at={(1.05,-0.05)}},
- %ylabel={time in secs},
- enlargelimits=true,
- %xtick={0,5000,...,40000},
- %xmax=40000,
- %ymax=35,
- restrict x to domain*=0:60000,
- restrict y to domain*=0:45,
- %ytick={0,5,...,30},
- %scaled ticks=false,
- axis lines=left,
- width=5cm,
- height=4cm,
- legend style={cells={align=left}},
- legend entries={Isabelle \\ Extracted},
- legend pos=north west,
- legend cell align=left]
-\addplot[magenta,mark=*, mark options={fill=white}] table {re-fromIsabelle.data};
-\end{axis}
-\end{tikzpicture}\\
-\multicolumn{2}{c}{Graphs}
-\end{tabular}
-\end{center}
-\caption{Graphs showing runtime for matching $(a^*)^*\,b$ with strings
- of the form $\protect\underbrace{aa..a}_{n}$ in various existing regular expression libraries.
- The reason for their fast growth %superlinear behaviour
- is that they do a depth-first-search
- using NFAs.
- If the string does not match, the regular expression matching
- engine starts to explore all possibilities.
- The last two graphs are for our implementation in Scala, one manual
- and one extracted from the verified lexer in Isabelle by $\textit{codegen}$.
- Our lexer
- performs better in this case, and
- is formally verified.
- Despite being almost identical, the codegen-generated lexer
- % is generated directly from Isabelle using $\textit{codegen}$,
- is slower than the manually written version since the code synthesised by
- $\textit{codegen}$ does not use native integer or string
- types of the target language.
- %Note that we are comparing our \emph{lexer} against other languages' matchers.
-}\label{fig:aStarStarb}
-\end{figure}\afterpage{\clearpage}
-
-A more recent example is a global outage of all Cloudflare servers on 2 July
-2019. A poorly written regular expression exhibited catastrophic backtracking
-and exhausted CPUs that serve HTTP traffic. Although the outage
-had several causes, at the heart was a regular expression that
-was used to monitor network
-traffic.\footnote{\url{https://blog.cloudflare.com/details-of-the-cloudflare-outage-on-july-2-2019/}(Last accessed in 2022)}
-These problems with regular expressions
-are not isolated events that happen
-very rarely,
-%but actually widespread.
-%They occur so often that they have a
-but they occur actually often enough that they have a
-name: Regular-Expression-Denial-Of-Service (ReDoS)
-attacks.
-Davis et al. \cite{Davis18} detected more
-than 1000 evil regular expressions
-in Node.js, Python core libraries, npm and pypi.
-They therefore concluded that evil regular expressions
-are a real problem rather than just "a parlour trick".
-
-The work in this thesis aims to address this issue
-with the help of formal proofs.
-We describe a lexing algorithm based
-on Brzozowski derivatives with verified correctness
-and a finiteness property for the size of derivatives
-(which are all done in Isabelle/HOL).
-Such properties %guarantee the absence of
-are an important step in preventing
-catastrophic backtracking once and for all.
-We will give more details in the next sections
-on (i) why the slow cases in graph \ref{fig:aStarStarb}
-can occur in traditional regular expression engines
-and (ii) why we choose our
-approach based on Brzozowski derivatives and formal proofs.
-
-
-
-
-In this chapter, we define the basic notions
-for regular languages and regular expressions.
-This is essentially a description in ``English''
-the functions and datatypes of our formalisation in Isabelle/HOL.
-We also define what $\POSIX$ lexing means,
-followed by the first lexing algorithm by Sulzmanna and Lu \parencite{Sulzmann2014}
-that produces the output conforming
-to the $\POSIX$ standard.
-This is a preliminary chapter which describes the results of
-Sulzmann and Lu and Ausaf et al \ref{AusafDyckhoffUrban2016},
-but the proof details and function definitions are needed to motivate our work
-as we show in chapter \ref{Bitcoded2} how the proofs break down when
-simplifications are applied.
-%TODO: Actually show this in chapter 4.
-
-In what follows
-we choose to use the Isabelle-style notation
-for function and datatype constructor applications, where
-the parameters of a function are not enclosed
-inside a pair of parentheses (e.g. $f \;x \;y$
-instead of $f(x,\;y)$). This is mainly
-to make the text visually more concise.
-
-
-
-
-\section{Preliminaries}%Regex, and the Problems with Regex Matchers}
-Regular expressions and regular expression matchers
-have clearly been studied for many, many years.
-Theoretical results in automata theory state
-that basic regular expression matching should be linear
-w.r.t the input.
-This assumes that the regular expression
-$r$ was pre-processed and turned into a
-deterministic finite automaton (DFA) before matching \cite{Sakarovitch2009}.
-By basic we mean textbook definitions such as the one
-below, involving only regular expressions for characters, alternatives,
-sequences, and Kleene stars:
-\[
- r ::= c | r_1 + r_2 | r_1 \cdot r_2 | r^*
-\]
-Modern regular expression matchers used by programmers,
-however,
-support much richer constructs, such as bounded repetitions,
-negations,
-and back-references.
-To differentiate, we use the word \emph{regex} to refer
-to those expressions with richer constructs while reserving the
-term \emph{regular expression}
-for the more traditional meaning in formal languages theory.
-We follow this convention
-in this thesis.
-In the future, we aim to support all the popular features of regexes,
-but for this work we mainly look at basic regular expressions
-and bounded repetitions.
-
-
-
-%Most modern regex libraries
-%the so-called PCRE standard (Peral Compatible Regular Expressions)
-%has the back-references
-Regexes come with a number of constructs
-that make it more convenient for
-programmers to write regular expressions.
-Depending on the types of constructs
-the task of matching and lexing with them
-will have different levels of complexity.
-Some of those constructs are syntactic sugars that are
-simply short hand notations
-that save the programmers a few keystrokes.
-These will not cause problems for regex libraries.
-For example the
-non-binary alternative involving three or more choices just means:
-\[
- (a | b | c) \stackrel{means}{=} ((a + b)+ c)
-\]
-Similarly, the range operator
-%used to express the alternative
-%of all characters between its operands,
-is just a concise way
-of expressing an alternative of consecutive characters:
-\[
- [0~-9]\stackrel{means}{=} (0 | 1 | \ldots | 9 )
-\]
-for an alternative. The
-wildcard character '$.$' is used to refer to any single character,
-\[
- . \stackrel{means}{=} [0-9a-zA-Z+-()*\&\ldots]
-\]
-except the newline.
-
-\subsection{Bounded Repetitions}
-More interesting are bounded repetitions, which can
-make the regular expressions much
-more compact.
-Normally there are four kinds of bounded repetitions:
-$r^{\{n\}}$, $r^{\{\ldots m\}}$, $r^{\{n\ldots \}}$ and $r^{\{n\ldots m\}}$
-(where $n$ and $m$ are constant natural numbers).
-Like the star regular expressions, the set of strings or language
-a bounded regular expression can match
-is defined using the power operation on sets:
-\begin{center}
- \begin{tabular}{lcl}
- $L \; r^{\{n\}}$ & $\dn$ & $(L \; r)^n$\\
- $L \; r^{\{\ldots m\}}$ & $\dn$ & $\bigcup_{0 \leq i \leq m}. (L \; r)^i$\\
- $L \; r^{\{n\ldots \}}$ & $\dn$ & $\bigcup_{n \leq i}. (L \; r)^i$\\
- $L \; r^{\{n \ldots m\}}$ & $\dn$ & $\bigcup_{n \leq i \leq m}. (L \; r)^i$
- \end{tabular}
-\end{center}
-The attraction of bounded repetitions is that they can be
-used to avoid a size blow up: for example $r^{\{n\}}$
-is a shorthand for
-the much longer regular expression:
-\[
- \underbrace{r\ldots r}_\text{n copies of r}.
-\]
-%Therefore, a naive algorithm that simply unfolds
-%them into their desugared forms
-%will suffer from at least an exponential runtime increase.
-The problem with matching
-such bounded repetitions
-is that tools based on the classic notion of
-automata need to expand $r^{\{n\}}$ into $n$ connected
-copies of the automaton for $r$. This leads to very inefficient matching
-algorithms or algorithms that consume large amounts of memory.
-Implementations using $\DFA$s will
-in such situations
-either become very slow
-(for example Verbatim++ \cite{Verbatimpp}) or run
-out of memory (for example $\mathit{LEX}$ and
-$\mathit{JFLEX}$\footnote{LEX and JFLEX are lexer generators
-in C and JAVA that generate $\mathit{DFA}$-based
-lexers. The user provides a set of regular expressions
-and configurations, and then
-gets an output program encoding a minimized $\mathit{DFA}$
-that can be compiled and run.
-When given the above countdown regular expression,
-a small $n$ (say 20) would result in a program representing a
-DFA
-with millions of states.}) for large counters.
-A classic example for this phenomenon is the regular expression $(a+b)^* a (a+b)^{n}$
-where the minimal DFA requires at least $2^{n+1}$ states.
-For example, when $n$ is equal to 2,
-the corresponding $\mathit{NFA}$ looks like:
-\vspace{6mm}
-\begin{center}
-\begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto]
- \node[state,initial] (q_0) {$q_0$};
- \node[state, red] (q_1) [right=of q_0] {$q_1$};
- \node[state, red] (q_2) [right=of q_1] {$q_2$};
- \node[state, accepting, red](q_3) [right=of q_2] {$q_3$};
- \path[->]
- (q_0) edge node {a} (q_1)
- edge [loop below] node {a,b} ()
- (q_1) edge node {a,b} (q_2)
- (q_2) edge node {a,b} (q_3);
-\end{tikzpicture}
-\end{center}
-and when turned into a DFA by the subset construction
-requires at least $2^3$ states.\footnote{The
-red states are "countdown states" which count down
-the number of characters needed in addition to the current
-string to make a successful match.
-For example, state $q_1$ indicates a match that has
-gone past the $(a|b)^*$ part of $(a|b)^*a(a|b)^{\{2\}}$,
-and just consumed the "delimiter" $a$ in the middle, and
-needs to match 2 more iterations of $(a|b)$ to complete.
-State $q_2$ on the other hand, can be viewed as a state
-after $q_1$ has consumed 1 character, and just waits
-for 1 more character to complete.
-The state $q_3$ is the last (accepting) state, requiring 0
-more characters.
-Depending on the suffix of the
-input string up to the current read location,
-the states $q_1$ and $q_2$, $q_3$
-may or may
-not be active.
-A $\mathit{DFA}$ for such an $\mathit{NFA}$ would
-contain at least $2^3$ non-equivalent states that cannot be merged,
-because the subset construction during determinisation will generate
-all the elements in the power set $\mathit{Pow}\{q_1, q_2, q_3\}$.
-Generalizing this to regular expressions with larger
-bounded repetitions number, we have that
-regexes shaped like $r^*ar^{\{n\}}$ when converted to $\mathit{DFA}$s
-would require at least $2^{n+1}$ states, if $r$ itself contains
-more than 1 string.
-This is to represent all different
-scenarios in which "countdown" states are active.}
-
+\marginpar{Gerog comment: addressed the problem of attributing work correctly;
+made clear that these definitions come from Ausaf et al.}
-Bounded repetitions are important because they
-tend to occur frequently in practical use,
-for example in the regex library RegExLib, in
-the rules library of Snort \cite{Snort1999}\footnote{
-Snort is a network intrusion detection (NID) tool
-for monitoring network traffic.
-The network security community curates a list
-of malicious patterns written as regexes,
-which is used by Snort's detection engine
-to match against network traffic for any hostile
-activities such as buffer overflow attacks.},
-as well as in XML Schema definitions (XSDs).
-According to Bj\"{o}rklund et al \cite{xml2015},
-more than half of the
-XSDs they found on the Maven.org central repository
-have bounded regular expressions in them.
-Often the counters are quite large, with the largest being
-close to ten million.
-A smaller sample XSD they gave
-is:
-\lstset{
- basicstyle=\fontsize{8.5}{9}\ttfamily,
- language=XML,
- morekeywords={encoding,
- xs:schema,xs:element,xs:complexType,xs:sequence,xs:attribute}
-}
-\begin{lstlisting}
-<sequence minOccurs="0" maxOccurs="65535">
- <element name="TimeIncr" type="mpeg7:MediaIncrDurationType"/>
- <element name="MotionParams" type="float" minOccurs="2" maxOccurs="12"/>
-</sequence>
-\end{lstlisting}
-This can be seen as the regex
-$(ab^{2\ldots 12})^{0 \ldots 65535}$, where $a$ and $b$ are themselves
-regular expressions
-satisfying certain constraints (such as
-satisfying the floating point number format).
-It is therefore quite unsatisfying that
-some regular expressions matching libraries
-impose adhoc limits
-for bounded regular expressions:
-For example, in the regular expression matching library in the Go
-language the regular expression $a^{1001}$ is not permitted, because no counter
-can be above 1000, and in the built-in Rust regular expression library
-expressions such as $a^{\{1000\}\{100\}\{5\}}$ give an error message
-for being too big.
-As Becchi and Crawley \cite{Becchi08} have pointed out,
-the reason for these restrictions
-is that they simulate a non-deterministic finite
-automata (NFA) with a breadth-first search.
-This way the number of active states could
-be equal to the counter number.
-When the counters are large,
-the memory requirement could become
-infeasible, and a regex engine
-like in Go will reject this pattern straight away.
-\begin{figure}[H]
-\begin{center}
-\begin{tikzpicture} [node distance = 2cm, on grid, auto]
-
- \node (q0) [state, initial] {$0$};
- \node (q1) [state, right = of q0] {$1$};
- %\node (q2) [state, right = of q1] {$2$};
- \node (qdots) [right = of q1] {$\ldots$};
- \node (qn) [state, right = of qdots] {$n$};
- \node (qn1) [state, right = of qn] {$n+1$};
- \node (qn2) [state, right = of qn1] {$n+2$};
- \node (qn3) [state, accepting, right = of qn2] {$n+3$};
-
-\path [-stealth, thick]
- (q0) edge [loop above] node {a} ()
- (q0) edge node {a} (q1)
- %(q1) edge node {.} (q2)
- (q1) edge node {.} (qdots)
- (qdots) edge node {.} (qn)
- (qn) edge node {.} (qn1)
- (qn1) edge node {b} (qn2)
- (qn2) edge node {$c$} (qn3);
-\end{tikzpicture}
-%\begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto]
-% \node[state,initial] (q_0) {$0$};
-% \node[state, ] (q_1) [right=of q_0] {$1$};
-% \node[state, ] (q_2) [right=of q_1] {$2$};
-% \node[state,
-% \node[state, accepting, ](q_3) [right=of q_2] {$3$};
-% \path[->]
-% (q_0) edge node {a} (q_1)
-% edge [loop below] node {a,b} ()
-% (q_1) edge node {a,b} (q_2)
-% (q_2) edge node {a,b} (q_3);
-%\end{tikzpicture}
-\end{center}
-\caption{The example given by Becchi and Crawley
- that NFA simulation can consume large
- amounts of memory: $.^*a.^{\{n\}}bc$ matching
- strings of the form $aaa\ldots aaaabc$.
- When traversing in a breadth-first manner,
-all states from 0 till $n+1$ will become active.}\label{fig:inj}
+This is a preliminary chapter which describes the results of
+Sulzmann and Lu \cite{Sulzmann2014} and parts of the formalisation by
+Ausaf et al.~\cite{AusafDyckhoffUrban2016} (mainly the definitions).
+These results are not part of this PhD work,
+but the details are included to provide the necessary context
+for our work on the correctness proof of $\blexersimp$,
+because in chapter \ref{Bitcoded2} we show how the proofs break down when
+simplifications are applied.
-\end{figure}
-%Languages like $\mathit{Go}$ and $\mathit{Rust}$ use this
-%type of $\mathit{NFA}$ simulation and guarantees a linear runtime
-%in terms of input string length.
-%TODO:try out these lexers
-These problems can of course be solved in matching algorithms where
-automata go beyond the classic notion and for instance include explicit
-counters \cite{Turo_ov__2020}.
-These solutions can be quite efficient,
-with the ability to process
-gigabits of strings input per second
-even with large counters \cite{Becchi08}.
-These practical solutions do not come with
-formal guarantees, and as pointed out by
-Kuklewicz \cite{KuklewiczHaskell}, can be error-prone.
-%But formal reasoning about these automata especially in Isabelle
-%can be challenging
-%and un-intuitive.
-%Therefore, we take correctness and runtime claims made about these solutions
-%with a grain of salt.
-
-In the work reported in \cite{FoSSaCS2023} and here,
-we add better support using derivatives
-for bounded regular expression $r^{\{n\}}$.
-Our results
-extend straightforwardly to
-repetitions with intervals such as
-$r^{\{n\ldots m\}}$.
-The merit of Brzozowski derivatives (more on this later)
-on this problem is that
-it can be naturally extended to support bounded repetitions.
-Moreover these extensions are still made up of only small
-inductive datatypes and recursive functions,
-making it handy to deal with them in theorem provers.
-%The point here is that Brzozowski derivatives and the algorithms by Sulzmann and Lu can be
-%straightforwardly extended to deal with bounded regular expressions
-%and moreover the resulting code still consists of only simple
-%recursive functions and inductive datatypes.
-Finally, bounded regular expressions do not destroy our finite
-boundedness property, which we shall prove later on.
-
-
-
-
-
-\subsection{Back-References}
-The other way to simulate an $\mathit{NFA}$ for matching is choosing
-a single transition each time, keeping all the other options in
-a queue or stack, and backtracking if that choice eventually
-fails.
-This method, often called a "depth-first-search",
-is efficient in many cases, but could end up
-with exponential run time.
-The backtracking method is employed in regex libraries
-that support \emph{back-references}, for example
-in Java and Python.
-%\section{Back-references and The Terminology Regex}
-
-%When one constructs an $\NFA$ out of a regular expression
-%there is often very little to be done in the first phase, one simply
-%construct the $\NFA$ states based on the structure of the input regular expression.
-
-%In the lexing phase, one can simulate the $\mathit{NFA}$ running in two ways:
-%one by keeping track of all active states after consuming
-%a character, and update that set of states iteratively.
-%This can be viewed as a breadth-first-search of the $\mathit{NFA}$
-%for a path terminating
-%at an accepting state.
-
+In the coming section, the definitions of the basic notions
+for regular languages and regular expressions are given.
+This is essentially a description in ``English'' of
+the functions and datatypes used by Ausaf et al.
+\cite{AusafDyckhoffUrban2016,Ausaf}
+in their formalisation in Isabelle/HOL.
+We include them because we build on their formalisation,
+and therefore inherently use these definitions.
-Consider the following regular expression where the sequence
-operator is omitted for brevity:
-\begin{center}
- $r_1r_2r_3r_4$
-\end{center}
-In this example,
-one could label sub-expressions of interest
-by parenthesizing them and giving
-them a number in the order in which their opening parentheses appear.
-One possible way of parenthesizing and labelling is:
-\begin{center}
- $\underset{1}{(}r_1\underset{2}{(}r_2\underset{3}{(}r_3)\underset{4}{(}r_4)))$
-\end{center}
-The sub-expressions
-$r_1r_2r_3r_4$, $r_1r_2r_3$, $r_3$ and $r_4$ are labelled
-by 1 to 4, and can be ``referred back'' by their respective numbers.
-%These sub-expressions are called "capturing groups".
-To do so, one uses the syntax $\backslash i$
-to denote that we want the sub-string
-of the input matched by the i-th
-sub-expression to appear again,
-exactly the same as it first appeared:
-\begin{center}
-$\ldots\underset{\text{i-th lparen}}{(}{r_i})\ldots
-\underset{s_i \text{ which just matched} \;r_i}{\backslash i} \ldots$
-\end{center}
-Once the sub-string $s_i$ for the sub-expression $r_i$
-has been fixed, there is no variability on what the back-reference
-label $\backslash i$ can be---it is tied to $s_i$.
-The overall string will look like $\ldots s_i \ldots s_i \ldots $
-%The backslash and number $i$ are the
-%so-called "back-references".
-%Let $e$ be an expression made of regular expressions
-%and back-references. $e$ contains the expression $e_i$
-%as its $i$-th capturing group.
-%The semantics of back-reference can be recursively
-%written as:
-%\begin{center}
-% \begin{tabular}{c}
-% $L ( e \cdot \backslash i) = \{s @ s_i \mid s \in L (e)\quad s_i \in L(r_i)$\\
-% $s_i\; \text{match of ($e$, $s$)'s $i$-th capturing group string}\}$
-% \end{tabular}
-%\end{center}
-A concrete example
-for back-references is
-\begin{center}
-$(.^*)\backslash 1$,
-\end{center}
-which matches
-strings that can be split into two identical halves,
-for example $\mathit{foofoo}$, $\mathit{ww}$ and so on.
-Note that this is different from
-repeating the sub-expression verbatim like
-\begin{center}
- $(.^*)(.^*)$,
-\end{center}
-which does not impose any restrictions on what strings the second
-sub-expression $.^*$
-might match.
-Another example for back-references is
-\begin{center}
-$(.)(.)\backslash 2\backslash 1$
-\end{center}
-which matches four-character palindromes
-like $abba$, $x??x$ and so on.
-
-Back-references are a regex construct
-that programmers find quite useful.
-According to Becchi and Crawley \cite{Becchi08},
-6\% of Snort rules (up until 2008) use them.
-The most common use of back-references
-is to express well-formed html files,
-where back-references are convenient for matching
-opening and closing tags like
-\begin{center}
- $\langle html \rangle \ldots \langle / html \rangle$
-\end{center}
-A regex describing such a format
-is
-\begin{center}
- $\langle (.^+) \rangle \ldots \langle / \backslash 1 \rangle$
-\end{center}
-Despite being useful, the expressive power of regexes
-go beyond regular languages
-once back-references are included.
-In fact, they allow the regex construct to express
-languages that cannot be contained in context-free
-languages either.
-For example, the back-reference $(a^*)b\backslash1 b \backslash 1$
-expresses the language $\{a^n b a^n b a^n\mid n \in \mathbb{N}\}$,
-which cannot be expressed by
-context-free grammars \cite{campeanu2003formal}.
-Such a language is contained in the context-sensitive hierarchy
-of formal languages.
-Also solving the matching problem involving back-references
-is known to be NP-complete \parencite{alfred2014algorithms}.
-Regex libraries supporting back-references such as
-PCRE \cite{pcre} therefore have to
-revert to a depth-first search algorithm including backtracking.
-What is unexpected is that even in the cases
-not involving back-references, there is still
-a (non-negligible) chance they might backtrack super-linearly,
-as shown in the graphs in figure \ref{fig:aStarStarb}.
-
-Summing up, we can categorise existing
-practical regex libraries into two kinds:
-(i) The ones with linear
-time guarantees like Go and Rust. The downside with them is that
-they impose restrictions
-on the regular expressions (not allowing back-references,
-bounded repetitions cannot exceed an ad hoc limit etc.).
-And (ii) those
-that allow large bounded regular expressions and back-references
-at the expense of using backtracking algorithms.
-They can potentially ``grind to a halt''
-on some very simple cases, resulting
-ReDoS attacks if exposed to the internet.
-
-The problems with both approaches are the motivation for us
-to look again at the regular expression matching problem.
-Another motivation is that regular expression matching algorithms
-that follow the POSIX standard often contain errors and bugs
-as we shall explain next.
-
-%We would like to have regex engines that can
-%deal with the regular part (e.g.
-%bounded repetitions) of regexes more
-%efficiently.
-%Also we want to make sure that they do it correctly.
-%It turns out that such aim is not so easy to achieve.
- %TODO: give examples such as RE2 GOLANG 1000 restriction, rust no repetitions
-% For example, the Rust regex engine claims to be linear,
-% but does not support lookarounds and back-references.
-% The GoLang regex library does not support over 1000 repetitions.
-% Java and Python both support back-references, but shows
-%catastrophic backtracking behaviours on inputs without back-references(
-%when the language is still regular).
- %TODO: test performance of Rust on (((((a*a*)b*)b){20})*)c baabaabababaabaaaaaaaaababaaaababababaaaabaaabaaaaaabaabaabababaababaaaaaaaaababaaaababababaaaaaaaaaaaaac
- %TODO: verify the fact Rust does not allow 1000+ reps
-
-
-
-
-%The time cost of regex matching algorithms in general
-%involve two different phases, and different things can go differently wrong on
-%these phases.
-%$\DFA$s usually have problems in the first (construction) phase
-%, whereas $\NFA$s usually run into trouble
-%on the second phase.
-
-
-\section{Error-prone POSIX Implementations}
-Very often there are multiple ways of matching a string
-with a regular expression.
-In such cases the regular expressions matcher needs to
-disambiguate.
-The more widely used strategy is called POSIX,
-which roughly speaking always chooses the longest initial match.
-The POSIX strategy is widely adopted in many regular expression matchers
-because it is a natural strategy for lexers.
-However, many implementations (including the C libraries
-used by Linux and OS X distributions) contain bugs
-or do not meet the specification they claim to adhere to.
-Kuklewicz maintains a unit test repository which lists some
-problems with existing regular expression engines \cite{KuklewiczHaskell}.
-In some cases, they either fail to generate a
-result when there exists a match,
-or give results that are inconsistent with the POSIX standard.
-A concrete example is the regex:
-\begin{center}
- $(aba + ab + a)^* \text{and the string} \; ababa$
-\end{center}
-The correct POSIX match for the above
-involves the entire string $ababa$,
-split into two Kleene star iterations, namely $[ab], [aba]$ at positions
-$[0, 2), [2, 5)$
-respectively.
-But feeding this example to the different engines
-listed at regex101 \footnote{
- regex101 is an online regular expression matcher which
- provides API for trying out regular expression
- engines of multiple popular programming languages like
-Java, Python, Go, etc.} \parencite{regex101}.
-yields
-only two incomplete matches: $[aba]$ at $[0, 3)$
-and $a$ at $[4, 5)$.
-Fowler \cite{fowler2003} and Kuklewicz \cite{KuklewiczHaskell}
-commented that most regex libraries are not
-correctly implementing the central POSIX
-rule, called the maximum munch rule.
-Grathwohl \parencite{grathwohl2014crash} wrote:
-\begin{quote}\it
- ``The POSIX strategy is more complicated than the
- greedy because of the dependence on information about
- the length of matched strings in the various subexpressions.''
-\end{quote}
-%\noindent
-People have recognised that the implementation complexity of POSIX rules also come from
-the specification being not very precise.
-The developers of the regexp package of Go
-\footnote{\url{https://pkg.go.dev/regexp\#pkg-overview}}
-commented that
-\begin{quote}\it
-``
-The POSIX rule is computationally prohibitive
-and not even well-defined.
-``
-\end{quote}
-There are many informal summaries of this disambiguation
-strategy, which are often quite long and delicate.
-For example Kuklewicz \cite{KuklewiczHaskell}
-described the POSIX rule as (section 1, last paragraph):
-\begin{quote}
- \begin{itemize}
- \item
-regular expressions (REs) take the leftmost starting match, and the longest match starting there
-earlier subpatterns have leftmost-longest priority over later subpatterns\\
-\item
-higher-level subpatterns have leftmost-longest priority over their component subpatterns\\
-\item
-REs have right associative concatenation which can be changed with parenthesis\\
-\item
-parenthesized subexpressions return the match from their last usage\\
-\item
-text of component subexpressions must be contained in the text of the
-higher-level subexpressions\\
-\item
-if "p" and "q" can never match the same text then "p|q" and "q|p" are equivalent, up to trivial renumbering of captured subexpressions\\
-\item
-if "p" in "p*" is used to capture non-empty text then additional repetitions of "p" will not capture an empty string\\
-\end{itemize}
-\end{quote}
-%The text above
-%is trying to capture something very precise,
-%and is crying out for formalising.
-Ribeiro and Du Bois \cite{RibeiroAgda2017} have
-formalised the notion of bit-coded regular expressions
-and proved their relations with simple regular expressions in
-the dependently-typed proof assistant Agda.
-They also proved the soundness and completeness of a matching algorithm
-based on the bit-coded regular expressions.
-Ausaf et al. \cite{AusafDyckhoffUrban2016}
-are the first to
-give a quite simple formalised POSIX
-specification in Isabelle/HOL, and also prove
-that their specification coincides with an earlier (unformalised)
-POSIX specification given by Okui and Suzuki \cite{Okui10}.
-They then formally proved the correctness of
-a lexing algorithm by Sulzmann and Lu \cite{Sulzmann2014}
-with regards to that specification.
-They also found that the informal POSIX
-specification by Sulzmann and Lu needs to be substantially
-revised in order for the correctness proof to go through.
-Their original specification and proof were unfixable
-according to Ausaf et al.
-
-
-In the next section, we will briefly
-introduce Brzozowski derivatives and Sulzmann
-and Lu's algorithm, which the main point of this thesis builds on.
+% In the next section, we will briefly
+% introduce Brzozowski derivatives and Sulzmann
+% and Lu's algorithm, which the main point of this thesis builds on.
%We give a taste of what they
%are like and why they are suitable for regular expression
%matching and lexing.
@@ -1679,6 +806,24 @@
Sulzmann and Lu \cite{Sulzmann2014} first came up with a nice and
elegant (arguably as beautiful as the definition of the
Brzozowski derivative) solution for this.
+\marginpar{explicitly attribute the work}
+For the same reason described
+at the beginning of this chapter,
+we introduce the formal
+semantics of $\POSIX$ lexing given by
+Ausaf et al.~\cite{AusafDyckhoffUrban2016},
+followed by the first lexing algorithm by
+Sulzmann and Lu \cite{Sulzmann2014}
+that produces output conforming
+to the $\POSIX$ standard.
+%TODO: Actually show this in chapter 4.
+In what follows
+we choose to use the Isabelle-style notation
+for function and datatype constructor applications, where
+the parameters of a function are not enclosed
+inside a pair of parentheses (e.g. $f \;x \;y$
+instead of $f(x,\;y)$). This is mainly
+to make the text visually more concise.
\section{Values and the Lexing Algorithm by Sulzmann and Lu}
In this section, we present a two-phase regular expression lexing
--- a/ChengsongTanPhdThesis/Chapters/Introduction.tex Tue Jul 25 17:28:29 2023 +0100
+++ b/ChengsongTanPhdThesis/Chapters/Introduction.tex Wed Aug 23 03:02:31 2023 +0100
@@ -201,11 +201,15 @@
%----------------------------------------------------------------------------------------
%This part is about regular expressions, Brzozowski derivatives,
%and a bit-coded lexing algorithm with proven correctness and time bounds.
+% \marginpar{Totally rewritten introduction}
+% Formal proofs,
+
%TODO: look up snort rules to use here--give readers idea of what regexes look like
-\marginpar{rephrasing using "imprecise words"}
-Regular expressions, since their inception in the 1940s,
+\marginpar{rephrasing following Christian comment}
+Regular expressions, since their inception in the 1950s
+\cite{RegularExpressions},
have been subject to extensive study and implementation.
Their primary application lies in text processing--finding
matches and identifying patterns in a string.
@@ -215,7 +219,7 @@
to recognise email addresses is
\marginpar{rephrased from "the regex for recognising" to "a simple regex that tries to match email"}
\begin{center}
-\verb|[a-z0-9._]^+@[a-z0-9.-]^+\.\{a-z\}\{2,6\}|
+\verb|[a-z0-9._]+@[a-z0-9.-]+\.[a-z]{2,6}|
%$[a-z0-9._]^+@[a-z0-9.-]^+\.[a-z]{2,6}$.
\end{center}
\marginpar{Simplified example, but the distinction between . and escaped . is correct
@@ -223,35 +227,363 @@
%Using this, regular expression matchers and lexers are able to extract
%the domain names by the use of \verb|[a-zA-Z0-9.-]+|.
\marginpar{Rewrote explanation for the expression.}
-The bracketed sub-expressions are used to extract specific parts of an email address.
-The local part is recognised by the expression enclosed in
-the first pair of brackets: $[a-z0-9._]$, and after the ``@'' sign
-is the part that recognises the domain, where $[a-z]{2, 6}$ specifically
-matches the top-level domain.
+This expression assumes all letters in the email have been converted to lower-case.
+The local-part is recognised by the first
+bracketed expression $[a-z0-9.\_]^+$,
+where the
+operator ``+'' (to be read as a superscript)
+means that it matches a non-empty string
+made of lower-case letters, digits, dots and underscores.
+After the ``@'' sign
+comes the sub-expression that recognises the domain of the email,
+where $[a-z]^{\{2, 6\}}$ specifically
+matches the top-level domain, such as ``org'', ``com'' or ``uk''.
+The counters $2, 6$ in the superscript
+specify that a top-level domain
+must consist of between two and six characters.
+This regular expression does not capture all possible email addresses
+(e.g.\ local-parts involving ``-'' cannot be recognised), but patterns
+of a similar shape, using roughly the same syntax, are used
+everywhere in our daily life,
+% Indeed regular expressions are used everywhere
+% in a computer user's daily life--for example searching for a string
+% in a text document using ctrl-F.
+% When such regex searching
+% a useful and simple way to handle email-processing when the task does
+% not require absolute precision.
+%The same can be said about
+%applications like .
%Consequently, they are an indispensible components in text processing tools
%of software systems such as compilers, IDEs, and firewalls.
+% The picture gets more blurry when we start to ask questions like
+% why
+%TODO: Insert corresponding bibliography
+%There are many more scenarios in which regular exp% %use cases for regular expressions in computer science,
+for example in compilers \cite{LEX}, networking \cite{Snort1999},
+software engineering (IDEs) \cite{Atom}
+and operating systems \cite{GREP}, where the correctness
+of matching can be crucially important.
-The study of regular expressions is ongoing due to an
-issue known as catastrophic backtracking.
-This phenomenon refers to scenarios in which the regular expression
-matching or lexing engine exhibits a disproportionately long
-runtime despite the simplicity of the input and expression.
+Implementations of regular expression matching out in the wild, however,
+are surprisingly unreliable.
+%are not always reliable as %people think
+%they
+%should be. %Given the importance of the correct functioning
+% Indeed they have been heavily relied on and
+% extensively implemented and studied in modern computer systems--whenever you do
+% a
+%TODO: double check this is true
+An example is the regular expression $(a^*)^*b$ and the string
+$aa\ldots a$. The expectation is that any regex engine should
+be able to decide this (return ``no match'') in no time.
+However, if this regular expression and string pair
+is tried out in a
+regex engine like that of Java 8, the runtime quickly grows beyond 100 seconds
+with just a few dozen characters in the string.
+Such behaviour enables Regular expression Denial-of-Service (ReDoS) attacks
+with minimal resources--just a pair of ``evil''
+regular expression and string.
+The outage of the web service provider Cloudflare \cite{Cloudflare}
+in 2019 \cite{CloudflareOutage} is a recent example
+where such issues caused a serious negative impact.
+
+
+The reason why these regex matching engines become so slow
+is that they use backtracking, essentially a depth-first search (DFS) of the
+search space of possible matches. They employ backtracking algorithms in order to
+support back-references, a mechanism for
+expressing languages that involve repeating an arbitrarily long string,
+such as
+$\{ww \mid w\in \Sigma^*\}$. Such a construct makes matching
+NP-complete, and no non-backtracking algorithms are known for it.
+The regex engines of more modern programming languages such as
+Rust and Go do promise linear-time behaviour
+with respect to the input string,
+but they do not support back-references, and often
+impose ad-hoc restrictions on the input patterns.
+More importantly, these promises are only backed by empirical testing,
+making them prone to future ReDoS attacks and other types of errors.
+
+The other way in which industry regex engines are unreliable
+is that they do not always produce the desired output.
+Kuklewicz \cite{KuklewiczHaskell} found during his testing
+of practical regex engines that almost all of them are incorrect with respect
+to the POSIX standard, the disambiguation strategy most
+widely adopted in practice. Roughly speaking, POSIX lexing always
+goes for the longest initial submatch possible;
+for instance a single iteration
+$aa$ is preferred over two iterations $a,a$
+when matching $(a|aa)^*$ against the string $aa$.
+The POSIX strategy as summarised by Kuklewicz goes
+as follows:
+\marginpar{\em Deleted some peripheral specifications.}
+\begin{quote}
+ \begin{itemize}
+ \item
+regular expressions (REs) take the leftmost starting match, and the longest match starting there\\
+\item
+earlier subpatterns have leftmost-longest priority over later subpatterns\\
+\item
+higher-level subpatterns have leftmost-longest priority over their component subpatterns\\
+\item
+ $\ldots$
+%REs have right associative concatenation which can be changed with parenthesis\\
+%\item
+%parenthesized subexpressions return the match from their last usage\\
+%\item
+%text of component subexpressions must be contained in the text of the
+%higher-level subexpressions\\
+%\item
+%if "p" and "q" can never match the same text then "p|q" and "q|p" are equivalent, up to trivial renumbering of captured subexpressions\\
+%\item
+%if "p" in "p*" is used to capture non-empty text then additional repetitions of "p" will not capture an empty string\\
+\end{itemize}
+\end{quote}
+Kuklewicz noted that various lexers that come with POSIX operating systems
+are rarely correct according to this standard.
+A test case that exposes the bugs
+is the regular expression $(aba|ab|a)^*$ and the string $ababa$.
+A correct match splits the string into $ab, aba$, involving
+two iterations of the Kleene star $(aba|ab|a)^*$.
+Most regex engines instead return two (partial) matches
+$aba$ and $a$.\footnote{Try it out here: \url{https://regex101.com/r/c5hga5/1}, last accessed 22-Aug-2023}
+There are numerous occasions where programmers have realised the subtlety and
+difficulty of implementing POSIX matching correctly; one such quote is from the authors of Go's regexp library:%
+\footnote{\url{https://pkg.go.dev/regexp\#pkg-overview}, last accessed 22-Aug-2023}
+\begin{quote}\it
+``
+The POSIX rule is computationally prohibitive
+and not even well-defined.
+''
+\end{quote}
+% Being able to formally define and capture the idea of
+% POSIX rules and prove
+% the correctness of regular expression matching/lexing
+% algorithms against the POSIX semantics definitions
+% is valuable.
+All of this points towards a formal treatment of
+POSIX lexing to clear up inaccuracies and errors
+in the understanding and implementation of regexes. The work presented
+in this thesis uses formal proofs to establish the correctness
+and runtime properties
+of POSIX regular expression lexing.
+
+Formal proofs, or mechanised proofs, are
+computer-checked proof scripts
+ %such as the ones written in Isabelle/HOL, is a powerful means
+that certify the correctness of facts
+with respect to a set of axioms and definitions.
+% This is done by
+% recursively checking that every fact in a proof script
+% is either an axiom or a fact that is derived from
+% known axioms or verified facts.
+%The problem of regular expressions lexing and catastrophic backtracking are a suitable problem for such
+%methods as their implementation and complexity analysis tend to be error-prone.
+They provide an unprecedented level of assurance
+that an algorithm will perform as expected on all inputs.
+We believe such proofs can help eliminate catastrophic backtracking
+once and for all.
+The software systems that help people interactively build and check
+formal proofs are called proof assistants.
+% Many theorem-provers have been developed, such as Mizar,
+% Isabelle/HOL, HOL-Light, HOL4,
+% Coq, Agda, Idris, Lean and so on.
+Isabelle/HOL \cite{Isabelle} is a widely-used proof assistant with a simple type theory
+and powerful automated proof tools like Sledgehammer.
+We chose to use Isabelle/HOL for its powerful automation
+and for the ease and simplicity with which regular expressions and
+regular languages can be expressed in it.
+%Some of those employ
+%dependent types like Mizar, Coq, Agda, Lean and Idris.
+%Some support a constructivism approach, such as Coq.
+
-One cause of catastrophic backtracking lies within the
-ambiguities of lexing.\marginpar{rephrased "the origin of catastrophic ...}
-In the process of matching a multi-character string with
-a regular expression that encompasses several sub-expressions,
-different positions can be designated to mark
-the boundaries of corresponding substrings of the sub-expressions.
-For instance, in matching the string $aaa$ with the
-regular expression $(a+aa)^*$, the divide between
-the initial match and the subsequent iteration could either be
-set between the first and second characters ($a | aa$) or between the second and third characters ($aa | a$). As both the length of the input string and the structural complexity of the regular expression increase, the number of potential delimiter combinations can grow exponentially, leading to a corresponding increase in complexity for algorithms that do not optimally navigate these possibilities.
+% Formal proofs on regular expression matching and lexing
+% complements the efforts in
+% industry which tend to focus on overall speed
+% with techniques like parallelization (FPGA paper), tackling
+% the problem of catastrophic backtracking
+% in an ad-hoc manner (cloudflare and stackexchange article).
+The algorithm we work on is based on Brzozowski derivatives.
+Brzozowski invented the notion of ``derivatives'' on
+regular expressions \cite{Brzozowski1964}, and the idea is
+that we can reason about what regular expressions can match
+by taking derivatives of them.
+A derivative operation takes a regular expression $r$ and a character
+$c$,
+and returns a new regular expression $r\backslash c$.
+The derivative is taken with respect to $c$:
+it transforms $r$ in such a way that the resulting derivative
+$r\backslash c$ contains all strings in the language of $r$ that
+start with $c$, but now with the head character $c$ thrown away.
+For example, for $r$ equal to $(aba | ab | a)^*$
+as discussed earlier, its derivative with respect to character $a$ is
+\[
+ r\backslash a = (ba | b| \ONE)(aba | ab | a)^*.
+\]
+% By applying derivatives repeatedly one can c
+By taking derivatives repeatedly with respect to the sequence
+of characters in a string $s$, one obtains a regular expression
+that contains the information of how $r$ matched $s$.
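+As a small illustration of this classic construction (only a sketch: the
+names $\textit{matches}$ and $\textit{nullable}$ are chosen here for
+exposition, where $\textit{nullable}\;r$ tests whether $r$ can match the
+empty string), whole-string matching can be phrased as
+\[
+  r \backslash [] \dn r \qquad
+  r \backslash (c\!::\!s) \dn (r\backslash c)\backslash s \qquad
+  \textit{matches}\; r \; s \;\dn\; \textit{nullable}\,(r \backslash s)
+\]
+where $c\!::\!s$ stands for a string with head character $c$ and rest $s$.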
+
+Brzozowski derivatives are purely algebraic operations
+that can be implemented as a recursive function
+that does a pattern match on the structure of the regular expression.
+For example, the derivatives of character regular expressions,
+Kleene star and bounded repetitions can be described by the following
+code equations:
+% For example some clauses
+\begin{center}
+ \begin{tabular}{lcl}
+ $d \backslash c$ & $\dn$ &
+ $\mathit{if} \;c = d\;\mathit{then}\;\ONE\;\mathit{else}\;\ZERO$\\
+% $(r_1 + r_2)\backslash c$ & $\dn$ & $r_1 \backslash c \,+\, r_2 \backslash c$\\
+% $(r_1 \cdot r_2)\backslash c$ & $\dn$ & $\mathit{if} \, [] \in L(r_1)$\\
+% & & $\mathit{then}\;(r_1\backslash c) \cdot r_2 \,+\, r_2\backslash c$\\
+% & & $\mathit{else}\;(r_1\backslash c) \cdot r_2$\\
+ $(r^*)\backslash c$ & $\dn$ & $(r\backslash c) \cdot r^*$\\
+ $r^{\{n\}} \backslash c$ & $\dn$ & $(r \backslash c) \cdot
+ r^{\{n-1\}} \quad (\text{when} \; n > 0)$\\
+ \end{tabular}
+\end{center}
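+To give a flavour of how directly such clauses can be implemented, below is
+a minimal sketch in Scala; the datatype and function names are chosen purely
+for illustration, bounded repetitions are omitted, and the code shows only
+the plain matcher for the basic constructs, not the lexer $\blexer$
+discussed later:
+\begin{verbatim}
+// a sketch of basic regular expressions and their derivatives
+abstract class Rexp
+case object ZERO extends Rexp                  // matches nothing
+case object ONE extends Rexp                   // matches only the empty string
+case class CHAR(c: Char) extends Rexp
+case class ALT(r1: Rexp, r2: Rexp) extends Rexp
+case class SEQ(r1: Rexp, r2: Rexp) extends Rexp
+case class STAR(r: Rexp) extends Rexp
+
+// can r match the empty string?
+def nullable(r: Rexp): Boolean = r match {
+  case ZERO        => false
+  case ONE         => true
+  case CHAR(_)     => false
+  case ALT(r1, r2) => nullable(r1) || nullable(r2)
+  case SEQ(r1, r2) => nullable(r1) && nullable(r2)
+  case STAR(_)     => true
+}
+
+// the derivative of r with respect to the character c
+def der(c: Char, r: Rexp): Rexp = r match {
+  case ZERO        => ZERO
+  case ONE         => ZERO
+  case CHAR(d)     => if (c == d) ONE else ZERO
+  case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))
+  case SEQ(r1, r2) =>
+    if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
+    else SEQ(der(c, r1), r2)
+  case STAR(r1)    => SEQ(der(c, r1), STAR(r1))
+}
+
+// matching by iterating the derivative over the input string
+def matches(r: Rexp, s: List[Char]): Boolean =
+  nullable(s.foldLeft(r)((r1, c) => der(c, r1)))
+\end{verbatim}
+For instance, \verb|matches(SEQ(CHAR('a'), STAR(CHAR('b'))), "abb".toList)|
+evaluates to \verb|true|.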
+%(end of ready work, rest construction site)
+In particular, we are working on an improvement of the work
+by Ausaf et al. \cite{Ausaf} \cite{AusafDyckhoffUrban2016}
+and Sulzmann and Lu \cite{Sulzmann2014}.
+
+
+The variant of the problem we are looking at centers around
+an algorithm (which we call $\blexer$) developed by Sulzmann and Lu \cite{Sulzmann2014}.
+The reason we chose to look at $\blexer$ and its simplifications
+is that it allows a lexical value to be generated
+by an elegant and subtle procedure based on Brzozowski derivatives.
+The procedure is made of recursive functions and inductive datatypes just like derivatives,
+allowing intuitive and concise formal reasoning with theorem provers.
+Most importantly, $\blexer$ opens up a path to an optimized version,
+which we call $\blexersimp$, that improves
+performance with simplifications that aggressively change the structure of regular expressions,
+whereas most derivative-based methods
+rely on the structure being maintained in order for the induction
+in the correctness proof to go through.
+For example, Egolf et al. \cite{Verbatim} have developed a verified lexer
+with derivatives, but as soon as they started introducing
+optimizations such as memoization, they reverted to constructing
+DFAs first.
+Edelmann \cite{Edelmann2020} explored similar optimizations in his
+work on verified LL(1) parsing, with additional enhancements based on data structures like
+zippers.
-Catastrophic backtracking facilitates a category of computationally inexpensive attacks known as Regular Expression Denial of Service (ReDoS) attacks. Here, an attacker merely dispatches a small attack string to a server, provoking high-complexity behaviours in the server's regular expression engine. Such attacks, whether intentional or accidental, have led to the failure of real-world systems (additional details can be found in the practical overview section in chapter \ref{Introduction}).
+%Sulzmann and Lu have worked out an algorithm
+%that is especially suited for verification
+%which utilized the fact
+%that whenever ambiguity occurs one may duplicate a sub-expression and use
+%different copies to describe different matching choices.
+The idea behind the correctness of $\blexer$ is simple: when taking a derivative,
+multiple matches might be possible, in which case an alternative with multiple children,
+each corresponding to a
+different match, is created. At the end of
+the lexing process one always picks the leftmost alternative, which is guaranteed
+to be a POSIX value.
+This is achieved by consistently keeping sub-expressions in an alternative
+with longer submatches
+to the left of other copies
+(remember that POSIX values are roughly the values with the longest initial
+submatch).
+The idea behind the optimized version of $\blexer$, which we call $\blexersimp$,
+is that since only the leftmost copy is ever used, all re-occurring copies can be
+eliminated without losing the POSIX property; this can be done even for
+children of alternatives at different levels by merging them together.
+Proving $\blexersimp$ correct requires a different
+proof strategy from the one used by Ausaf \cite{FahadThesis}.
+We introduce a rewriting relation in the form of an
+inductive predicate which captures
+an invariant strong enough to ensure correctness
+and which commutes with the derivative operation.
+This predicate allows a simple
+induction on the input string to go through.
+
+%This idea has been repeatedly used in different variants of lexing
+%algorithms in their paper, one of which involves bit-codes. The bit-coded
+%derivative-based algorithm even allows relatively aggressive
+%%simplification rules which cause
+%structural changes that render the induction used in the correctness
+%proofs unusable.
+%More details will be given in \ref{Bitcoded2} including the
+%new correctness proof which involves a new inductive predicate which allows
+%rule induction to go through.
+
-Various disambiguation strategies are
-employed to select sub-matches, notably including Greedy and POSIX strategies. POSIX, the strategy most widely adopted in practice, invariably opts for the longest possible sub-match. Kuklewicz \cite{KuklewiczHaskell}, for instance, offers a descriptive definition of the POSIX rule in section 1, last paragraph:
+\marginpar{20230821 Addition: high-level idea in context}
+To summarise, we expect modern regex matching and lexing engines
+to be reliable, fast and correct, and to support rich syntactic constructs
+like bounded repetitions and back-references.
+Backtracking regex engines have exhibited exponential
+worst-case behaviours (catastrophic backtracking)
+because they employ a depth-first search on the search tree of possible matches,
+and complexity analysis of them also takes worst-case exponential
+time (ref Minamide and Birmingham work).
+%Expand notes: JFLEX generates DFAs, cannot support backrefs
+%Expand notes: Java 8, python supports pcre, but is exponential
+%Expand notes: more modern prog langs like GO, Rust claims linear, and forbids large counters and backrefs
+%The work described in this thesis is part of an ongoing
+%project which aims to build a formally verified lexer satisfying all these
+To ensure the correctness and predictable behaviour of lexical analysis,
+we propose to
+build a formally verified lexer that satisfies correctness and non-backtracking
+requirements in a bottom-up manner using Brzozowski derivatives.
+We build on the line of work by Ausaf et al.~\cite{AusafDyckhoffUrban2016}.
+A derivative-based matching algorithm avoids backtracking by explicitly
+representing possible matches on the same level of a search tree
+as regular expression terms
+in a breadth-first manner.
+The efficiency of such algorithms relies on limiting the size
+of the derivatives during the computation, for example by
+eliminating redundant
+terms that lead to identical matches.
+
+
+The end goal is a lexer that comes with additional formal guarantees
+on computational complexity and whose actual speed is competitive with other,
+unverified regex engines.
+We will report in the next section the outcome of this project
+so far and its contribution with respect to other work in
+lexical analysis and theorem proving.
+
+
+Derivatives of regular expressions are popular in the theorem-proving community
+because their algebraic nature is amenable to formal reasoning.
+As testimony to this, there exists a sizable number of formalisations of Brzozowski derivatives and
+regular expressions in different proof assistants.
+%Expand notes: verified lexers: proof-of-concept: Ruberio, Egolf, fast but no formal proof in a proof assistant: Magnus,
+
+
+
+% The study of regular expressions is ongoing due to an
+% issue known as catastrophic backtracking.
+% This phenomenon refers to scenarios in which the regular expression
+
+
+% One cause of catastrophic backtracking lies within the
+% ambiguities of lexing.\marginpar{rephrased "the origin of catastrophic ...}
+% In the process of matching a multi-character string with
+% a regular expression that encompasses several sub-expressions,
+% different positions can be designated to mark
+% the boundaries of corresponding substrings of the sub-expressions.
+% For instance, in matching the string $aaa$ with the
+% regular expression $(a+aa)^*$, the divide between
+% the initial match and the subsequent iteration could either be
+% set between the first and second characters ($a | aa$) or
+% between the second and third characters ($aa | a$).
+% As both the length of the input string and the structural complexity
+% of the regular expression increase, the number of potential delimiter
+% combinations can grow exponentially, leading to a corresponding increase
+% in complexity for algorithms that do not optimally navigate these possibilities.
+
+% Catastrophic backtracking facilitates a category of computationally inexpensive attacks known as Regular Expression Denial of Service (ReDoS) attacks. Here, an attacker merely dispatches a small attack string to a server, provoking high-complexity behaviours in the server's regular expression engine. Such attacks, whether intentional or accidental, have led to the failure of real-world systems (additional details can be found in the practical overview section in chapter \ref{Introduction}).
+
+% Various disambiguation strategies are
+% employed to select sub-matches, notably including Greedy and POSIX strategies. POSIX, the strategy most widely adopted in practice, invariably opts for the longest possible sub-match. Kuklewicz \cite{KuklewiczHaskell}, for instance, offers a descriptive definition of the POSIX rule in section 1, last paragraph:
%Regular expressions
@@ -295,83 +627,58 @@
%There have been prose definitions like the following
%by Kuklewicz \cite{KuklewiczHaskell}:
%described the POSIX rule as (section 1, last paragraph):
-\marginpar{\em Deleted some peripheral specifications.}
-\begin{quote}
- \begin{itemize}
- \item
-regular expressions (REs) take the leftmost starting match, and the longest match starting there
-earlier subpatterns have leftmost-longest priority over later subpatterns\\
-\item
-higher-level subpatterns have leftmost-longest priority over their component subpatterns\\
-\item
- $\ldots$
-%REs have right associative concatenation which can be changed with parenthesis\\
-%\item
-%parenthesized subexpressions return the match from their last usage\\
-%\item
-%text of component subexpressions must be contained in the text of the
-%higher-level subexpressions\\
-%\item
-%if "p" and "q" can never match the same text then "p|q" and "q|p" are equivalent, up to trivial renumbering of captured subexpressions\\
-%\item
-%if "p" in "p*" is used to capture non-empty text then additional repetitions of "p" will not capture an empty string\\
-\end{itemize}
-\end{quote}
-However, the author noted that various lexers that claim to be POSIX
-are rarely correct according to this standard.
-There are numerous occasions where programmers realised the subtlety and
-difficulty to implement correctly, one such quote from Go's regexp library author
-\footnote{\url{https://pkg.go.dev/regexp\#pkg-overview}}
-\begin{quote}\it
-``
-The POSIX rule is computationally prohibitive
-and not even well-defined.
-``
-\end{quote}
-Being able to formally define and capture the idea of
-POSIX rules and prove
-the correctness of regular expression matching/lexing
-algorithms against the POSIX semantics definitions
-is valuable.
+
+
+
+
+
+%first character is removed
+%state of the automaton after matching that character
+%where nodes are represented as
+%a sub-expression (for example tagged NFA
+%Working on regular expressions
+%Because of these problems with automata, we prefer regular expressions
+%and derivatives rather than an automata (or graph-based) approach which explicitly converts between
+%the regular expression and a different data structure.
+%
+%
+%The key idea
+
-Formal proofs are
-machine checked programs
- %such as the ones written in Isabelle/HOL, is a powerful means
-for computer scientists to be certain
-about the correctness of their algorithms.
-This is done by
-recursively checking that every fact in a proof script
-is either an axiom or a fact that is derived from
-known axioms or verified facts.
-%The problem of regular expressions lexing and catastrophic backtracking are a suitable problem for such
-%methods as their implementation and complexity analysis tend to be error-prone.
-Formal proofs provides an unprecendented level of asssurance
-that an algorithm will perform as expected under all inputs.
-The software systems that help people interactively build and check
-such proofs are called theorem-provers or proof assitants.
-Many theorem-provers have been developed, such as Mizar,
-Isabelle/HOL, HOL-Light, HOL4,
-Coq, Agda, Idris, Lean and so on.
-Isabelle/HOL is a theorem prover with a simple type theory
-and powerful automated proof generators like sledgehammer.
-We chose to use Isabelle/HOL for its powerful automation
-and ease and simplicity in expressing regular expressions and
-regular languages.
-%Some of those employ
-%dependent types like Mizar, Coq, Agda, Lean and Idris.
-%Some support a constructivism approach, such as Coq.
+%Regular expressions are widely used in computer science:
+%be it in text-editors \parencite{atomEditor} with syntax highlighting and auto-completion;
+%command-line tools like $\mathit{grep}$ that facilitate easy
+%text-processing \cite{grep}; network intrusion
+%detection systems that inspect suspicious traffic; or compiler
+%front ends.
+%Given their usefulness and ubiquity, one would assume that
+%modern regular expression matching implementations
+%are mature and fully studied.
+%Indeed, in many popular programming languages' regular expression engines,
+%one can supply a regular expression and a string, and in most cases get
+%get the matching information in a very short time.
+%Those engines can sometimes be blindingly fast--some
+%network intrusion detection systems
+%use regular expression engines that are able to process
+%hundreds of megabytes or even gigabytes of data per second \parencite{Turo_ov__2020}.
+%However, those engines can sometimes also exhibit a surprising security vulnerability
+%under a certain class of inputs.
+%However, , this is not the case for $\mathbf{all}$ inputs.
+%TODO: get source for SNORT/BRO's regex matching engine/speed
-Formal proofs on regular expression matching and lexing
-complements the efforts in
-industry which tend to focus on overall speed
-with techniques like parallelization (FPGA paper), tackling
-the problem of catastrophic backtracking
-in an ad-hoc manner (cloudflare and stackexchange article).
+%----------------------------------------------------------------------------------------
+\section{Contribution}
+%{\color{red} \rule{\linewidth}{0.5mm}}
+%\textbf{issue with this part: way too short, not enough details of what I have done.}
+ %{\color{red} \rule{\linewidth}{0.5mm}}
+\marginpar{\em Gerog: No sufficient context on related work making contribution hard to digest.}
-There have been many interesting steps in the theorem-proving community
-about formalising regular expressions and lexing.
+
+\marginpar{introducing formalisations on regex matching}
+There have been many formalisations in the theorem-proving community
+about regular expressions and lexing.
One flavour is to build from the regular expression an automaton, and determine
acceptance in terms of the resulting
state after executing the input string on that automaton.
@@ -441,115 +748,6 @@
%(Some person who's a big name in formal methods)
-The variant of the problem we are looking at centers around
-an algorithm (which we call $\blexer$) developed by Sulzmann and Lu \ref{Sulzmann2014}.
-The reason we chose to look at $\blexer$ and its simplifications
-is because it allows a lexical tree to be generated
-by some elegant and subtle procedure based on Brzozowski derivatives.
-The procedures are made of recursive functions and inductive datatypes just like derivatives,
-allowing intuitive and concise formal reasoning with theorem provers.
-Most importantly, $\blexer$ opens up a path to an optimized version
-of $\blexersimp$ possibilities to improve
-performance with simplifications that aggressively change the structure of regular expressions.
-While most derivative-based methods
-rely on structures to be maintained to allow induction to
-go through.
-For example, Egolf et al. \ref{Verbatim} have developed a verified lexer
-with derivatives, but as soon as they started introducing
-optimizations such as memoization, they reverted to constructing
-DFAs first.
-Edelmann \ref{Edelmann2020} explored similar optimizations in his
-work on verified LL(1) parsing, with additional enhancements with data structures like
-zippers.
-
-%Sulzmann and Lu have worked out an algorithm
-%that is especially suited for verification
-%which utilized the fact
-%that whenever ambiguity occurs one may duplicate a sub-expression and use
-%different copies to describe different matching choices.
-The idea behind the correctness of $\blexer$ is simple: during a derivative,
-multiple matches might be possible, where an alternative with multiple children
-each corresponding to a
-different match is created. In the end of
-a lexing process one always picks up the leftmost alternative, which is guarnateed
-to be a POSIX value.
-This is done by consistently keeping sub-regular expressions in an alternative
-with longer submatches
-to the left of other copies (
-Remember that POSIX values are roughly the values with the longest inital
-submatch).
-The idea behind the optimized version of $\blexer$, which we call $\blexersimp$,
-is that since we only take the leftmost copy, then all re-occurring copies can be
-eliminated without losing the POSIX property, and this can be done with
-children of alternatives at different levels by merging them together.
-Proving $\blexersimp$ requires a different
-proof strategy compared to that by Ausaf \cite{FahadThesis}.
-We invent a rewriting relation as an
-inductive predicate which captures
-a strong enough invariance that ensures correctness,
-which commutes with the derivative operation.
-This predicate allows a simple
-induction on the input string to go through.
-
-%This idea has been repeatedly used in different variants of lexing
-%algorithms in their paper, one of which involves bit-codes. The bit-coded
-%derivative-based algorithm even allows relatively aggressive
-%%simplification rules which cause
-%structural changes that render the induction used in the correctness
-%proofs unusable.
-%More details will be given in \ref{Bitcoded2} including the
-%new correctness proof which involves a new inductive predicate which allows
-%rule induction to go through.
-
-
-
-
-
-
-
-%first character is removed
-%state of the automaton after matching that character
-%where nodes are represented as
-%a sub-expression (for example tagged NFA
-%Working on regular expressions
-%Because of these problems with automata, we prefer regular expressions
-%and derivatives rather than an automata (or graph-based) approach which explicitly converts between
-%the regular expression and a different data structure.
-%
-%
-%The key idea
-
-(ends)
-
-%Regular expressions are widely used in computer science:
-%be it in text-editors \parencite{atomEditor} with syntax highlighting and auto-completion;
-%command-line tools like $\mathit{grep}$ that facilitate easy
-%text-processing \cite{grep}; network intrusion
-%detection systems that inspect suspicious traffic; or compiler
-%front ends.
-%Given their usefulness and ubiquity, one would assume that
-%modern regular expression matching implementations
-%are mature and fully studied.
-%Indeed, in many popular programming languages' regular expression engines,
-%one can supply a regular expression and a string, and in most cases get
-%get the matching information in a very short time.
-%Those engines can sometimes be blindingly fast--some
-%network intrusion detection systems
-%use regular expression engines that are able to process
-%hundreds of megabytes or even gigabytes of data per second \parencite{Turo_ov__2020}.
-%However, those engines can sometimes also exhibit a surprising security vulnerability
-%under a certain class of inputs.
-%However, , this is not the case for $\mathbf{all}$ inputs.
-%TODO: get source for SNORT/BRO's regex matching engine/speed
-
-
-%----------------------------------------------------------------------------------------
-\section{Contribution}
-%{\color{red} \rule{\linewidth}{0.5mm}}
-%\textbf{issue with this part: way too short, not enough details of what I have done.}
- %{\color{red} \rule{\linewidth}{0.5mm}}
-\marginpar{\em Gerog: No sufficient context on related work making contribution hard to digest.}
-
%In this thesis,
%we propose a solution to catastrophic
@@ -567,21 +765,47 @@
%lexer we developed based on Brzozowski derivatives and
%Sulzmanna and Lu's developments called
proving the lexer $\blexersimp$ is both i) correctness and ii)fast.
-It is correct w.r.t a formalisation of POSIX lexing by Ausaf et al.\ref{AusafDyckhoffUrban2016}.
+It is correct w.r.t.\ a formalisation of POSIX lexing by Ausaf et al.~\cite{AusafDyckhoffUrban2016}.
It is fast compared with un-optimised implementations like Sulzmann and Lu's orignal
development by our metric of internal data structures not growing unbounded.
+\subsection{Theorem Proving with Derivatives}
+Ribeiro and Du Bois \cite{RibeiroAgda2017} have
+formalised the notion of bit-coded regular expressions
+and proved their relations with simple regular expressions in
+the dependently-typed proof assistant Agda.
+They also proved the soundness and completeness of a matching algorithm
+based on the bit-coded regular expressions.
+Ausaf et al. \cite{AusafDyckhoffUrban2016}
+were the first to
+give a reasonably simple formalised POSIX
+specification in Isabelle/HOL, and also prove
+that their specification coincides with an earlier (unformalised)
+POSIX specification given by Okui and Suzuki \cite{Okui10}.
+They then formally proved the correctness of
+a lexing algorithm by Sulzmann and Lu \cite{Sulzmann2014}
+with regards to that specification.
+They also found that the informal POSIX
+specification by Sulzmann and Lu needs to be substantially
+revised in order for the correctness proof to go through.
+Their original specification and proof were unfixable
+according to Ausaf et al.
+
+
+
+\subsection{Complexity Results}
Our formalisation of complexity is unique among similar works in the sense that
%is about the size of internal data structures.
to our knowledge %we don't know of a
there are not other certified
lexing/parsing algorithms with similar data structure size bound theorems.
Common practices involve making empirical analysis of the complexity of the algorithm
-in question (\ref{Verbatim}, \ref{Verbatimpp}), or relying
-on prior (unformalised) complexity analysis of well-known algorithms (\ref{ValiantParsing}),
+in question (\cite{Verbatim}, \cite{Verbatimpp}), or relying
+on prior (unformalised) complexity analysis of well-known algorithms (\cite{ValiantParsing}),
making them prone to complexity bugs.
+
%TODO: add citation
-%, for example in the Verbatim work \ref{Verbatim}
+%, for example in the Verbatim work \cite{Verbatim}
%While formalised proofs have not included
%Issues known as "performance bugs" can
@@ -590,21 +814,28 @@
%while this work is done,
they do find themselves in the broader theorem-proving literature:
\emph{time credits} have been formalised for separation logic in Coq
-\ref{atkey2010amortised}%not used in
-to characterise the runtime complexity of an algorithm,
-where integer values are recorded %from the beginning of an execution
-as part of the program state
-and decremented in each step.
-The idea is that the total number of decrements
-from the time credits during execution represents the complexity of an algorithm.
+\cite{atkey2010amortised}
+%not used in
+to characterise the runtime complexity of union-find.
+%where integer values are recorded %from the beginning of an execution
+%as part of the program state
+%and decremented in each step.
+The idea is that the total number of instructions executed
+is equal to the time credits consumed
+during the execution of an algorithm.
%each time a basic step is executed.
%The way to use a time credit Time credit is an integer
%is assigned to a program prior to execution, and each step in the program consumes
%one credit.
-Arma{\"e}l et al. have extended the framework to allow expressing time
-credits using big-O notations
+Arma{\"e}l et al. \cite{bigOImperative} have extended the framework to allow expressing time
+credits using big-O notations,
so one can prove both the functional correctness and asymptotic complexity
-of higher-order imperative algorithms \ref{bigOImperative}.
+of higher-order imperative algorithms.
+A number of formal complexity proofs also exist for other
+algorithms in Coq \cite{subtypingFormalComplexity}.
+
+The big-O notation has also been formalised in Isabelle
+\cite{bigOIsabelle}.
%for example the complexity of
%the Quicksort algorithm
%is $\Theta n\ln n$
@@ -614,7 +845,7 @@
Our work focuses on the space complexity of the algorithm under our notion of the size of
a regular expression.
Despite not being a direct asymptotic time complexity proof,
-our result is an important stepping towards one.
+our result is an important stepping stone towards one.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ChengsongTanPhdThesis/Chapters/Overview.tex Wed Aug 23 03:02:31 2023 +0100
@@ -0,0 +1,872 @@
+\chapter{Technical Overview}
+\label{Overview}
+We start with a technical overview of the
+catastrophic backtracking problem,
+motivating rigorous approaches to regular expression matching and lexing.
+In doing so we also briefly
+introduce common terminology such as
+bounded repetitions and back-references.
+
+\section{Motivating Examples}
+Consider for example the regular expression $(a^*)^*\,b$ and
+strings of the form $aa\ldots a$. These strings cannot be matched by this regular
+expression: obviously the expected $b$ in the last
+position is missing. One would assume that modern regular expression
+matching engines can find this out very quickly. Surprisingly, if one tries
+this example in JavaScript, Python or Java 8, even with small strings,
+say of length around 30 $a$'s,
+the decision takes a large amount of time to finish.
+This is disproportionate
+to the simplicity of the input (see graphs in figure \ref{fig:aStarStarb}).
+The algorithms clearly show exponential behaviour, which, as can be seen,
+is triggered by some relatively simple regular expressions.
+Java 9 and newer
+versions mitigate this behaviour by several orders of magnitude,
+but are still slow compared
+with the approach we are going to use in this thesis.
+
+
+
+This superlinear blowup in regular expression engines
+has caused grief in ``real life'' where it is
+given the name ``catastrophic backtracking'' or ``evil'' regular expressions.
+A less serious example is a bug in the Atom editor:
+a user found that, after typing the following code,
+\begin{verbatim}
+ vVar.Type().Name() == "" && vVar.Kind() == reflect.Ptr
+\end{verbatim}
+\begin{verbatim}
+ && vVar.Type().Elem().Name() == "" && vVar.Type().Elem().Kind() ==
+\end{verbatim}
+\begin{verbatim}
+ reflect.Slice
+\end{verbatim}
+the editor took half an hour to finish computing.
+This was subsequently fixed by Galbraith \cite{fixedAtom};
+the issue was due to the regex engine of Atom.
+But evil regular expressions can be more than a nuisance in a text editor:
+on 20 July 2016 one evil
+regular expression brought the website
+\href{http://stackexchange.com}{Stack Exchange} to its
+knees.\footnote{\url{https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}(Last accessed in 2019)}
+In this instance, a regular expression intended to just trim white
+spaces from the beginning and the end of a line actually consumed
+massive amounts of CPU resources---causing the web servers to grind to a
+halt. In this example, the time needed to process
+the string was
+$O(n^2)$ with respect to the string length $n$. This
+quadratic overhead was enough for the homepage of Stack Exchange to
+respond so slowly that the load balancer assumed a $\mathit{DoS}$
+attack and therefore stopped the servers from responding to any
+requests. This made the whole site unavailable.
+
+\begin{figure}[p]
+\begin{center}
+\begin{tabular}{@{}c@{\hspace{0mm}}c@{}}
+\begin{tikzpicture}
+\begin{axis}[
+ xlabel={$n$},
+ x label style={at={(1.05,-0.05)}},
+ ylabel={time in secs},
+ enlargelimits=false,
+ xtick={0,5,...,30},
+ xmax=33,
+ ymax=35,
+ ytick={0,5,...,30},
+ scaled ticks=false,
+ axis lines=left,
+ width=5cm,
+ height=4cm,
+ legend entries={JavaScript},
+ legend pos=north west,
+ legend cell align=left]
+\addplot[red,mark=*, mark options={fill=white}] table {re-js.data};
+\end{axis}
+\end{tikzpicture}
+ &
+\begin{tikzpicture}
+\begin{axis}[
+ xlabel={$n$},
+ x label style={at={(1.05,-0.05)}},
+ %ylabel={time in secs},
+ enlargelimits=false,
+ xtick={0,5,...,30},
+ xmax=33,
+ ymax=35,
+ ytick={0,5,...,30},
+ scaled ticks=false,
+ axis lines=left,
+ width=5cm,
+ height=4cm,
+ legend entries={Python},
+ legend pos=north west,
+ legend cell align=left]
+\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
+\end{axis}
+\end{tikzpicture}\\
+\begin{tikzpicture}
+\begin{axis}[
+ xlabel={$n$},
+ x label style={at={(1.05,-0.05)}},
+ ylabel={time in secs},
+ enlargelimits=false,
+ xtick={0,5,...,30},
+ xmax=33,
+ ymax=35,
+ ytick={0,5,...,30},
+ scaled ticks=false,
+ axis lines=left,
+ width=5cm,
+ height=4cm,
+ legend entries={Java 8},
+ legend pos=north west,
+ legend cell align=left]
+\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
+\end{axis}
+\end{tikzpicture}
+ &
+\begin{tikzpicture}
+\begin{axis}[
+ xlabel={$n$},
+ x label style={at={(1.05,-0.05)}},
+ %ylabel={time in secs},
+ enlargelimits=false,
+ xtick={0,5,...,30},
+ xmax=33,
+ ymax=35,
+ ytick={0,5,...,30},
+ scaled ticks=false,
+ axis lines=left,
+ width=5cm,
+ height=4cm,
+ legend entries={Dart},
+ legend pos=north west,
+ legend cell align=left]
+\addplot[green,mark=*, mark options={fill=white}] table {re-dart.data};
+\end{axis}
+\end{tikzpicture}\\
+\begin{tikzpicture}
+\begin{axis}[
+ xlabel={$n$},
+ x label style={at={(1.05,-0.05)}},
+ ylabel={time in secs},
+ enlargelimits=false,
+ xtick={0,5,...,30},
+ xmax=33,
+ ymax=35,
+ ytick={0,5,...,30},
+ scaled ticks=false,
+ axis lines=left,
+ width=5cm,
+ height=4cm,
+ legend entries={Swift},
+ legend pos=north west,
+ legend cell align=left]
+\addplot[purple,mark=*, mark options={fill=white}] table {re-swift.data};
+\end{axis}
+\end{tikzpicture}
+ &
+\begin{tikzpicture}
+\begin{axis}[
+ xlabel={$n$},
+ x label style={at={(1.05,-0.05)}},
+ %ylabel={time in secs},
+ enlargelimits=true,
+ %xtick={0,5000,...,40000},
+ %xmax=40000,
+ %ymax=35,
+ restrict x to domain*=0:40000,
+ restrict y to domain*=0:35,
+ %ytick={0,5,...,30},
+ %scaled ticks=false,
+ axis lines=left,
+ width=5cm,
+ height=4cm,
+ legend entries={Java9+},
+ legend pos=north west,
+ legend cell align=left]
+\addplot[orange,mark=*, mark options={fill=white}] table {re-java9.data};
+\end{axis}
+\end{tikzpicture}\\
+\begin{tikzpicture}
+\begin{axis}[
+ xlabel={$n$},
+ x label style={at={(1.05,-0.05)}},
+ ylabel={time in secs},
+ enlargelimits=true,
+ %xtick={0,5,...,30},
+ %xmax=33,
+ %ymax=35,
+ restrict x to domain*=0:60000,
+ restrict y to domain*=0:35,
+ %ytick={0,5,...,30},
+ %scaled ticks=false,
+ axis lines=left,
+ width=5cm,
+ height=4cm,
+ legend entries={Scala},
+ legend pos=north west,
+ legend cell align=left]
+\addplot[magenta,mark=*, mark options={fill=white}] table {re-blexersimp.data};
+\end{axis}
+\end{tikzpicture}
+ &
+\begin{tikzpicture}
+\begin{axis}[
+ xlabel={$n$},
+ x label style={at={(1.05,-0.05)}},
+ %ylabel={time in secs},
+ enlargelimits=true,
+ %xtick={0,5000,...,40000},
+ %xmax=40000,
+ %ymax=35,
+ restrict x to domain*=0:60000,
+ restrict y to domain*=0:45,
+ %ytick={0,5,...,30},
+ %scaled ticks=false,
+ axis lines=left,
+ width=5cm,
+ height=4cm,
+ legend style={cells={align=left}},
+ legend entries={Isabelle \\ Extracted},
+ legend pos=north west,
+ legend cell align=left]
+\addplot[magenta,mark=*, mark options={fill=white}] table {re-fromIsabelle.data};
+\end{axis}
+\end{tikzpicture}\\
+\multicolumn{2}{c}{Graphs}
+\end{tabular}
+\end{center}
+\caption{Graphs showing runtime for matching $(a^*)^*\,b$ with strings
+ of the form $\protect\underbrace{aa..a}_{n}$ in various existing regular expression libraries.
+ The reason for their fast growth %superlinear behaviour
+ is that they do a depth-first-search
+ using NFAs.
+ If the string does not match, the regular expression matching
+ engine starts to explore all possibilities.
+ The last two graphs are for our implementation in Scala, one manual
+ and one extracted from the verified lexer in Isabelle by $\textit{codegen}$.
+ Our lexer
+ performs better in this case, and
+ is formally verified.
+ Despite being almost identical, the codegen-generated lexer
+ % is generated directly from Isabelle using $\textit{codegen}$,
+ is slower than the manually written version since the code synthesised by
+ $\textit{codegen}$ does not use native integer or string
+ types of the target language.
+ %Note that we are comparing our \emph{lexer} against other languages' matchers.
+}\label{fig:aStarStarb}
+\end{figure}\afterpage{\clearpage}
+
+A more recent example is a global outage of all Cloudflare servers on
+2 July 2019.
+Cloudflare serves more traffic each day than
+Twitter, Amazon, Instagram, Apple, Bing and Wikipedia combined, yet
+it became completely unavailable for half an hour because
+a poorly-written regular expression, roughly of the form $.^*.^*=.^*$,
+exhausted the CPU resources that serve HTTP traffic.
+Although the outage
+had several causes, at the heart was a regular expression that
+was used to monitor network
+traffic.\footnote{\url{https://blog.cloudflare.com/details-of-the-cloudflare-outage-on-july-2-2019/}(Last accessed in 2022)}
+These problems with regular expressions
+are not isolated events that happen
+only very rarely;
+they actually occur often enough that they have a
+name: Regular-Expression-Denial-Of-Service (ReDoS)
+attacks.
+Davis et al. \cite{Davis18} detected more
+than 1000 evil regular expressions
+in Node.js, Python core libraries, npm and pypi.
+They therefore concluded that evil regular expressions
+are a real problem rather than just ``a parlour trick''.
+
+The work in this thesis aims to address this issue
+with the help of formal proofs.
+We describe a lexing algorithm based
+on Brzozowski derivatives with verified correctness
+and a finiteness property for the size of derivatives
+(both of which are proven in Isabelle/HOL).
+Such properties %guarantee the absence of
+are an important step in preventing
+catastrophic backtracking once and for all.
+We will give more details in the next sections
+on (i) why the slow cases shown in figure \ref{fig:aStarStarb}
+can occur in traditional regular expression engines
+and (ii) why we choose our
+approach based on Brzozowski derivatives and formal proofs.
+
+
+
+\section{Preliminaries}%Regex, and the Problems with Regex Matchers}
+Regular expressions and regular expression matchers
+have been studied for many years.
+Theoretical results in automata theory state
+that basic regular expression matching should be linear
+w.r.t.\ the length of the input.
+This assumes that the regular expression
+$r$ was pre-processed and turned into a
+deterministic finite automaton (DFA) before matching \cite{Sakarovitch2009}.
+By basic we mean textbook definitions such as the one
+below, involving only regular expressions for characters, alternatives,
+sequences, and Kleene stars:
+\[
+ r ::= c | r_1 + r_2 | r_1 \cdot r_2 | r^*
+\]
+Modern regular expression matchers used by programmers,
+however,
+support much richer constructs, such as bounded repetitions,
+negations,
+and back-references.
+To differentiate, we use the word \emph{regex} to refer
+to those expressions with richer constructs while reserving the
+term \emph{regular expression}
+for the more traditional meaning in formal language theory.
+We follow this convention
+in this thesis.
+In the future, we aim to support all the popular features of regexes,
+but for this work we mainly look at basic regular expressions
+and bounded repetitions.
+
+
+%Most modern regex libraries
+%the so-called PCRE standard (Peral Compatible Regular Expressions)
+%has the back-references
+Regexes come with a number of constructs
+that make it more convenient for
+programmers to write regular expressions.
+Depending on the type of construct,
+the task of matching and lexing with them
+has different levels of complexity.
+Some of those constructs are syntactic sugar, that is
+simply shorthand notations
+that save programmers a few keystrokes.
+These will not cause problems for regex libraries.
+For example the
+non-binary alternative involving three or more choices just means:
+\[
+ (a | b | c) \stackrel{means}{=} ((a + b)+ c)
+\]
+Similarly, the range operator
+%used to express the alternative
+%of all characters between its operands,
+is just a concise way
+of expressing an alternative of consecutive characters:
+\[
+ [0-9]\stackrel{means}{=} (0 | 1 | \ldots | 9 )
+\]
+The
+wildcard character ``$.$'' is used to refer to any single character,
+\[
+ . \stackrel{means}{=} [0-9a-zA-Z+-()*\&\ldots]
+\]
+except the newline.
+
+\subsection{Bounded Repetitions}
+More interesting are bounded repetitions, which can
+make the regular expressions much
+more compact.
+Normally there are four kinds of bounded repetitions:
+$r^{\{n\}}$, $r^{\{\ldots m\}}$, $r^{\{n\ldots \}}$ and $r^{\{n\ldots m\}}$
+(where $n$ and $m$ are constant natural numbers).
+Like for the Kleene star, the set of strings (the language)
+a bounded repetition can match
+is defined using the power operation on sets:
+\begin{center}
+ \begin{tabular}{lcl}
+ $L \; r^{\{n\}}$ & $\dn$ & $(L \; r)^n$\\
+ $L \; r^{\{\ldots m\}}$ & $\dn$ & $\bigcup_{0 \leq i \leq m}. (L \; r)^i$\\
+ $L \; r^{\{n\ldots \}}$ & $\dn$ & $\bigcup_{n \leq i}. (L \; r)^i$\\
+ $L \; r^{\{n \ldots m\}}$ & $\dn$ & $\bigcup_{n \leq i \leq m}. (L \; r)^i$
+ \end{tabular}
+\end{center}
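+\noindent
+For instance, instantiating these definitions with a concrete counter gives
+\[
+  L \; (a^{\{2 \ldots 3\}}) \;=\; \bigcup_{2 \leq i \leq 3}. (L \; a)^i \;=\; \{aa,\; aaa\}.
+\]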
+The attraction of bounded repetitions is that they can be
+used to avoid a size blow-up: for example $r^{\{n\}}$
+is a shorthand for
+the much longer regular expression:
+\[
+  \underbrace{r \cdot \ldots \cdot r}_{\text{$n$ copies of $r$}}.
+\]
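+To give a rough idea of the difference in size (counting, informally,
+the number of nodes in a regular expression as $|r|$), the bounded form keeps
+only one copy of $r$ plus a counter, whereas the unfolded form consists of $n$
+copies of $r$ joined by $n-1$ sequence operators:
+\[
+  |r^{\{n\}}| \;=\; |r| + 1
+  \qquad\text{versus}\qquad
+  |\underbrace{r \cdot \ldots \cdot r}_{\text{$n$ copies}}| \;=\; n \cdot |r| + (n-1).
+\]
+For $a^{\{1000\}}$ the unfolded form is therefore roughly a thousand times larger.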
+%Therefore, a naive algorithm that simply unfolds
+%them into their desugared forms
+%will suffer from at least an exponential runtime increase.
+
+
+The problem with matching
+such bounded repetitions
+is that tools based on the classic notion of
+automata need to expand $r^{\{n\}}$ into $n$ connected
+copies of the automaton for $r$. This leads to very inefficient matching
+algorithms or algorithms that consume large amounts of memory.
+Implementations using $\DFA$s will
+in such situations
+either become very slow
+(for example Verbatim++ \cite{Verbatimpp}) or run
+out of memory (for example $\mathit{LEX}$ and
+$\mathit{JFLEX}$\footnote{LEX and JFLEX are lexer generators
+in C and Java that generate $\mathit{DFA}$-based
+lexers. The user provides a set of regular expressions
+and configurations, and then
+gets an output program encoding a minimized $\mathit{DFA}$
+that can be compiled and run.
+When given a bounded regular expression such as the one below,
+even a small $n$ (say 20) would result in a program representing a
+DFA
+with millions of states.}) for large counters.
+A classic example of this phenomenon is the regular expression $(a+b)^* a (a+b)^{\{n\}}$,
+whose minimal DFA requires at least $2^{n+1}$ states.
+For example, when $n$ is equal to 2,
+the corresponding $\mathit{NFA}$ looks like:
+\vspace{6mm}
+\begin{center}
+\begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto]
+ \node[state,initial] (q_0) {$q_0$};
+ \node[state, red] (q_1) [right=of q_0] {$q_1$};
+ \node[state, red] (q_2) [right=of q_1] {$q_2$};
+ \node[state, accepting, red](q_3) [right=of q_2] {$q_3$};
+ \path[->]
+ (q_0) edge node {a} (q_1)
+ edge [loop below] node {a,b} ()
+ (q_1) edge node {a,b} (q_2)
+ (q_2) edge node {a,b} (q_3);
+\end{tikzpicture}
+\end{center}
+and when turned into a DFA by the subset construction
+requires at least $2^3$ states.\footnote{The
+red states are ``countdown states'': they count down
+the number of characters still needed
+to complete a successful match.
+For example, state $q_1$ indicates a match that has
+gone past the $(a+b)^*$ part of $(a+b)^*a(a+b)^{\{2\}}$,
+and has just consumed the ``delimiter'' $a$ in the middle, and
+needs to match 2 more iterations of $(a+b)$ to complete.
+State $q_2$ on the other hand, can be viewed as a state
+after $q_1$ has consumed 1 character, and just waits
+for 1 more character to complete.
+The state $q_3$ is the last (accepting) state, requiring 0
+more characters.
+Depending on the prefix of the
+input string read so far,
+the states $q_1$, $q_2$ and $q_3$
+may or may
+not be active.
+A $\mathit{DFA}$ for such an $\mathit{NFA}$ would
+contain at least $2^3$ non-equivalent states that cannot be merged,
+because the subset construction during determinisation will generate
+all the elements in the power set $\mathit{Pow}\{q_1, q_2, q_3\}$.
+Generalizing this to larger
+counters, regular expressions of the shape $r^*ar^{\{n\}}$, when converted to $\mathit{DFA}$s,
+require at least $2^{n+1}$ states if $r$ itself can
+match more than one string.
+This is to represent all the different
+scenarios in which ``countdown'' states are active.}
+
+
+Bounded repetitions are important because they
+tend to occur frequently in practical use,
+for example in the regex library RegExLib, in
+the rules library of Snort \cite{Snort1999}\footnote{
+Snort is a network intrusion detection (NID) tool
+for monitoring network traffic.
+The network security community curates a list
+of malicious patterns written as regexes,
+which is used by Snort's detection engine
+to match against network traffic for any hostile
+activities such as buffer overflow attacks.},
+as well as in XML Schema definitions (XSDs).
+According to Bj\"{o}rklund et al.\ \cite{xml2015},
+more than half of the
+XSDs they found on the Maven.org central repository
+have bounded regular expressions in them.
+Often the counters are quite large, with the largest being
+close to ten million.
+A smaller sample XSD they gave
+is:
+\lstset{
+ basicstyle=\fontsize{8.5}{9}\ttfamily,
+ language=XML,
+ morekeywords={encoding,
+ xs:schema,xs:element,xs:complexType,xs:sequence,xs:attribute}
+}
+\begin{lstlisting}
+<sequence minOccurs="0" maxOccurs="65535">
+ <element name="TimeIncr" type="mpeg7:MediaIncrDurationType"/>
+ <element name="MotionParams" type="float" minOccurs="2" maxOccurs="12"/>
+</sequence>
+\end{lstlisting}
+This can be seen as the regex
+$(a\cdot b^{\{2\ldots 12\}})^{\{0 \ldots 65535\}}$, where $a$ and $b$ are themselves
+regular expressions
+satisfying certain constraints (such as
+satisfying the floating point number format).
+It is therefore quite unsatisfying that
+some regular expression matching libraries
+impose ad hoc limits
+on bounded regular expressions:
+for example, in the regular expression matching library of the Go
+language the regular expression $a^{\{1001\}}$ is not permitted, because no counter
+can be above 1000; and in the Rust regular expression library
+expressions such as $a^{\{1000\}\{100\}\{5\}}$ give an error message
+for being too big~\footnote{Try it out here: \url{https://rustexp.lpil.uk}}.
+As Becchi and Crawley \cite{Becchi08} have pointed out,
+the reason for these restrictions
+is that such libraries simulate a non-deterministic finite
+automaton (NFA) with a breadth-first search.
+In this simulation the number of active states can
+grow in proportion to the size of the counters.
+When the counters are large,
+the memory requirement can become
+infeasible, and a regex engine
+like the one in Go will reject such patterns straight away.
+\begin{figure}[H]
+\begin{center}
+\begin{tikzpicture} [node distance = 2cm, on grid, auto]
+
+ \node (q0) [state, initial] {$0$};
+ \node (q1) [state, right = of q0] {$1$};
+ %\node (q2) [state, right = of q1] {$2$};
+ \node (qdots) [right = of q1] {$\ldots$};
+ \node (qn) [state, right = of qdots] {$n$};
+ \node (qn1) [state, right = of qn] {$n+1$};
+ \node (qn2) [state, right = of qn1] {$n+2$};
+ \node (qn3) [state, accepting, right = of qn2] {$n+3$};
+
+\path [-stealth, thick]
+ (q0) edge [loop above] node {a} ()
+ (q0) edge node {a} (q1)
+ %(q1) edge node {.} (q2)
+ (q1) edge node {.} (qdots)
+ (qdots) edge node {.} (qn)
+ (qn) edge node {.} (qn1)
+ (qn1) edge node {b} (qn2)
+ (qn2) edge node {$c$} (qn3);
+\end{tikzpicture}
+%\begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto]
+% \node[state,initial] (q_0) {$0$};
+% \node[state, ] (q_1) [right=of q_0] {$1$};
+% \node[state, ] (q_2) [right=of q_1] {$2$};
+% \node[state,
+% \node[state, accepting, ](q_3) [right=of q_2] {$3$};
+% \path[->]
+% (q_0) edge node {a} (q_1)
+% edge [loop below] node {a,b} ()
+% (q_1) edge node {a,b} (q_2)
+% (q_2) edge node {a,b} (q_3);
+%\end{tikzpicture}
+\end{center}
+\caption{The example given by Becchi and Crawley
+  showing that NFA simulation can consume large
+  amounts of memory: $.^*a.^{\{n\}}bc$ matching
+  strings of the form $aaa\ldots aaaabc$.
+  When traversing in a breadth-first manner,
+all states from $0$ to $n+1$ will become active.}\label{fig:inj}
+
+\end{figure}
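+\noindent
+To make this more concrete, the following small Scala sketch simulates the
+automaton in figure \ref{fig:inj} breadth-first, directly on its state numbers
+(this is only our own illustration of the phenomenon, not code taken from any
+of the cited libraries):
+\begin{lstlisting}[language={}, basicstyle=\fontsize{8.5}{9}\ttfamily]
+// breadth-first simulation of the NFA for  .* a .{n} b c ,
+// using the state numbers 0 .. n+3 from the figure above
+def simulate(n: Int, s: String): Boolean = {
+  def step(active: Set[Int], c: Char): Set[Int] =
+    active.flatMap { q =>
+      if (q == 0) (if (c == 'a') Set(0, 1) else Set(0)) // the .* loop; on 'a' also guess the delimiter
+      else if (q <= n) Set(q + 1)                       // the n wildcard ("countdown") states
+      else if (q == n + 1 && c == 'b') Set(n + 2)
+      else if (q == n + 2 && c == 'c') Set(n + 3)
+      else Set.empty[Int]                               // accepting state: no outgoing edges
+    }
+  s.foldLeft(Set(0)) { (active, c) =>
+    val next = step(active, c)
+    println(s"after '$c': ${next.size} active states")
+    next
+  }.contains(n + 3)
+}
+// e.g. simulate(10, "a" * 20 + "bc") keeps up to n + 2 states active at once
+\end{lstlisting}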
+%Languages like $\mathit{Go}$ and $\mathit{Rust}$ use this
+%type of $\mathit{NFA}$ simulation and guarantees a linear runtime
+%in terms of input string length.
+%TODO:try out these lexers
+These problems can of course be solved in matching algorithms where
+automata go beyond the classic notion and for instance include explicit
+counters \cite{Turo_ov__2020}.
+These solutions can be quite efficient,
+with the ability to process
+gigabits of input per second
+even with large counters \cite{Becchi08}.
+However, these practical solutions do not come with
+formal guarantees, and as pointed out by
+Kuklewicz \cite{KuklewiczHaskell}, can be error-prone.
+%But formal reasoning about these automata especially in Isabelle
+%can be challenging
+%and un-intuitive.
+%Therefore, we take correctness and runtime claims made about these solutions
+%with a grain of salt.
+
+In the work reported in \cite{ITP2023} and here,
+we add better support using derivatives
+for bounded regular expressions of the form $r^{\{n\}}$.
+Our results
+extend straightforwardly to
+repetitions with intervals such as
+$r^{\{n\ldots m\}}$.
+The merit of Brzozowski derivatives (more on this later)
+on this problem is that
+they can be naturally extended to support bounded repetitions.
+Moreover these extensions are still made up of only small
+inductive datatypes and recursive functions,
+making it handy to deal with them in theorem provers.
+%The point here is that Brzozowski derivatives and the algorithms by Sulzmann and Lu can be
+%straightforwardly extended to deal with bounded regular expressions
+%and moreover the resulting code still consists of only simple
+%recursive functions and inductive datatypes.
+Finally, bounded regular expressions do not destroy the finiteness
+property for the size of derivatives, which we shall prove later on.
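+To illustrate this point, below is a small Scala sketch of derivatives extended
+with a constructor for bounded repetitions (the name \texttt{NTIMES} and the
+exact clauses are only meant as an illustration here, not as the definitions
+used in our formalisation):
+\begin{lstlisting}[language={}, basicstyle=\fontsize{8.5}{9}\ttfamily]
+abstract class Rexp
+case object ZERO extends Rexp
+case object ONE extends Rexp
+case class CHAR(c: Char) extends Rexp
+case class ALT(r1: Rexp, r2: Rexp) extends Rexp
+case class SEQ(r1: Rexp, r2: Rexp) extends Rexp
+case class STAR(r: Rexp) extends Rexp
+case class NTIMES(r: Rexp, n: Int) extends Rexp   // r{n}
+
+def nullable(r: Rexp): Boolean = r match {
+  case ZERO => false
+  case ONE => true
+  case CHAR(_) => false
+  case ALT(r1, r2) => nullable(r1) || nullable(r2)
+  case SEQ(r1, r2) => nullable(r1) && nullable(r2)
+  case STAR(_) => true
+  case NTIMES(r1, n) => n == 0 || nullable(r1)
+}
+
+def der(c: Char, r: Rexp): Rexp = r match {
+  case ZERO => ZERO
+  case ONE => ZERO
+  case CHAR(d) => if (c == d) ONE else ZERO
+  case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))
+  case SEQ(r1, r2) =>
+    if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
+    else SEQ(der(c, r1), r2)
+  case STAR(r1) => SEQ(der(c, r1), STAR(r1))
+  // the counter is simply decremented; r{n} is never unfolded
+  case NTIMES(r1, n) =>
+    if (n == 0) ZERO else SEQ(der(c, r1), NTIMES(r1, n - 1))
+}
+\end{lstlisting}
+\noindent
+The derivative of $r^{\{n\}}$ keeps the counter symbolic, so the size of the
+regular expression stays small even for very large $n$.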
+
+
+
+
+
+\subsection{Back-References}
+The other way to simulate an $\mathit{NFA}$ for matching is choosing
+a single transition each time, keeping all the other options in
+a queue or stack, and backtracking if that choice eventually
+fails.
+This method, often called ``depth-first search'',
+is efficient in many cases, but can end up
+with exponential run time.
+The backtracking method is employed in regex libraries
+that support \emph{back-references}, for example
+in Java and Python.
+%\section{Back-references and The Terminology Regex}
+
+%When one constructs an $\NFA$ out of a regular expression
+%there is often very little to be done in the first phase, one simply
+%construct the $\NFA$ states based on the structure of the input regular expression.
+
+%In the lexing phase, one can simulate the $\mathit{NFA}$ running in two ways:
+%one by keeping track of all active states after consuming
+%a character, and update that set of states iteratively.
+%This can be viewed as a breadth-first-search of the $\mathit{NFA}$
+%for a path terminating
+%at an accepting state.
+
+
+
+Consider the following regular expression where the sequence
+operator is omitted for brevity:
+\begin{center}
+ $r_1r_2r_3r_4$
+\end{center}
+In this example,
+one could label sub-expressions of interest
+by parenthesizing them and giving
+them a number in the order in which their opening parentheses appear.
+One possible way of parenthesizing and labelling is:
+\begin{center}
+ $\underset{1}{(}r_1\underset{2}{(}r_2\underset{3}{(}r_3)\underset{4}{(}r_4)))$
+\end{center}
+The sub-expressions
+$r_1r_2r_3r_4$, $r_2r_3r_4$, $r_3$ and $r_4$ are labelled
+by 1 to 4, and can be ``referred back'' to by their respective numbers.
+%These sub-expressions are called "capturing groups".
+To do so, one uses the syntax $\backslash i$
+to denote that we want the sub-string
+of the input matched by the $i$-th
+sub-expression to appear again,
+exactly the same as it first appeared:
+\begin{center}
+$\ldots\underset{\text{i-th lparen}}{(}{r_i})\ldots
+\underset{s_i \text{ which just matched} \;r_i}{\backslash i} \ldots$
+\end{center}
+Once the sub-string $s_i$ for the sub-expression $r_i$
+has been fixed, there is no variability on what the back-reference
+label $\backslash i$ can be---it is tied to $s_i$.
+The overall string will look like $\ldots s_i \ldots s_i \ldots$.
+%The backslash and number $i$ are the
+%so-called "back-references".
+%Let $e$ be an expression made of regular expressions
+%and back-references. $e$ contains the expression $e_i$
+%as its $i$-th capturing group.
+%The semantics of back-reference can be recursively
+%written as:
+%\begin{center}
+% \begin{tabular}{c}
+% $L ( e \cdot \backslash i) = \{s @ s_i \mid s \in L (e)\quad s_i \in L(r_i)$\\
+% $s_i\; \text{match of ($e$, $s$)'s $i$-th capturing group string}\}$
+% \end{tabular}
+%\end{center}
+A concrete example
+of back-references is
+\begin{center}
+$(.^*)\backslash 1$,
+\end{center}
+which matches
+strings that can be split into two identical halves,
+for example $\mathit{foofoo}$, $\mathit{ww}$ and so on.
+Note that this is different from
+repeating the sub-expression verbatim like
+\begin{center}
+ $(.^*)(.^*)$,
+\end{center}
+which does not impose any restrictions on what strings the second
+sub-expression $.^*$
+might match.
+Another example of back-references is
+\begin{center}
+$(.)(.)\backslash 2\backslash 1$
+\end{center}
+which matches four-character palindromes
+like $abba$, $x??x$ and so on.
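+These two patterns can be tried out directly, for instance with Java's
+backtracking regex engine called from Scala (a small illustrative snippet,
+not part of our own development):
+\begin{lstlisting}[language={}, basicstyle=\fontsize{8.5}{9}\ttfamily]
+import java.util.regex.Pattern
+
+// (.*)\1 : strings made of two identical halves
+val half = Pattern.compile("(.*)\\1")
+println(half.matcher("foofoo").matches())  // true
+println(half.matcher("foobar").matches())  // false
+
+// (.)(.)\2\1 : four-character palindromes
+val pal = Pattern.compile("(.)(.)\\2\\1")
+println(pal.matcher("abba").matches())     // true
+println(pal.matcher("abab").matches())     // false
+\end{lstlisting}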
+
+Back-references are a regex construct
+that programmers find quite useful.
+According to Becchi and Crawley \cite{Becchi08},
+6\% of Snort rules (up until 2008) use them.
+A common use of back-references
+is to describe well-formed HTML,
+where they are convenient for matching
+opening and closing tags like
+\begin{center}
+ $\langle html \rangle \ldots \langle / html \rangle$
+\end{center}
+A regex describing such a format
+is
+\begin{center}
+ $\langle (.^+) \rangle \ldots \langle / \backslash 1 \rangle$
+\end{center}
+Despite being useful, back-references push the expressive power of
+regexes
+beyond the regular languages.
+In fact, they allow regexes to express
+languages that are not even contained in the class of context-free
+languages.
+For example, the regex $(a^*)b\backslash1 b \backslash 1$
+expresses the language $\{a^n b a^n b a^n\mid n \in \mathbb{N}\}$,
+which cannot be expressed by
+context-free grammars \cite{campeanu2003formal}.
+Such a language is contained in the context-sensitive hierarchy
+of formal languages.
+Moreover, solving the matching problem in the presence of back-references
+is known to be NP-complete \parencite{alfred2014algorithms}.
+Regex libraries supporting back-references such as
+PCRE \cite{pcre} therefore have to
+revert to a depth-first search algorithm including backtracking.
+What is unexpected is that even in cases
+not involving back-references, there is still
+a (non-negligible) chance that these engines backtrack super-linearly,
+as shown in the graphs in figure \ref{fig:aStarStarb}.
+
+Summing up, we can categorise existing
+practical regex libraries into two kinds:
+(i) those with linear
+time guarantees like Go and Rust, whose downside is that
+they impose restrictions
+on the regular expressions (not allowing back-references,
+bounded repetitions cannot exceed an ad hoc limit, etc.);
+and (ii) those
+that allow large bounded regular expressions and back-references
+at the expense of using backtracking algorithms.
+The latter can potentially ``grind to a halt''
+on some very simple cases, resulting
+in ReDoS attacks if exposed to the internet.
+
+The problems with both approaches are the motivation for us
+to look again at the regular expression matching problem.
+Another motivation is that regular expression matching algorithms
+that follow the POSIX standard often contain errors and bugs
+as we shall explain next.
+
+%We would like to have regex engines that can
+%deal with the regular part (e.g.
+%bounded repetitions) of regexes more
+%efficiently.
+%Also we want to make sure that they do it correctly.
+%It turns out that such aim is not so easy to achieve.
+ %TODO: give examples such as RE2 GOLANG 1000 restriction, rust no repetitions
+% For example, the Rust regex engine claims to be linear,
+% but does not support lookarounds and back-references.
+% The GoLang regex library does not support over 1000 repetitions.
+% Java and Python both support back-references, but shows
+%catastrophic backtracking behaviours on inputs without back-references(
+%when the language is still regular).
+ %TODO: test performance of Rust on (((((a*a*)b*)b){20})*)c baabaabababaabaaaaaaaaababaaaababababaaaabaaabaaaaaabaabaabababaababaaaaaaaaababaaaababababaaaaaaaaaaaaac
+ %TODO: verify the fact Rust does not allow 1000+ reps
+
+
+
+
+%The time cost of regex matching algorithms in general
+%involve two different phases, and different things can go differently wrong on
+%these phases.
+%$\DFA$s usually have problems in the first (construction) phase
+%, whereas $\NFA$s usually run into trouble
+%on the second phase.
+
+
+\section{Error-prone POSIX Implementations}
+Very often there are multiple ways of matching a string
+with a regular expression.
+In such cases the regular expression matcher needs to
+disambiguate.
+The more widely used strategy is called POSIX,
+which roughly speaking always chooses the longest initial match.
+The POSIX strategy is widely adopted in many regular expression matchers
+because it is a natural strategy for lexers.
+However, many implementations (including the C libraries
+used by Linux and OS X distributions) contain bugs
+or do not meet the specification they claim to adhere to.
+Kuklewicz maintains a unit test repository which lists some
+problems with existing regular expression engines \cite{KuklewiczHaskell}.
+In some cases, they either fail to generate a
+result when there exists a match,
+or give results that are inconsistent with the POSIX standard.
+A concrete example is the regex:
+\begin{center}
+  $(aba + ab + a)^*$ \quad and the string \quad $ababa$
+\end{center}
+The correct POSIX match for the above
+involves the entire string $ababa$,
+split into two Kleene star iterations, namely $[ab], [aba]$ at positions
+$[0, 2), [2, 5)$
+respectively.
+But feeding this example to the different engines
+listed at regex101~\parencite{regex101}\footnote{
+  regex101 is an online regular expression matcher which
+  provides an API for trying out regular expression
+  engines of multiple popular programming languages like
+Java, Python, Go, etc.}
+yields
+only two incomplete matches: $[aba]$ at $[0, 3)$
+and $a$ at $[4, 5)$.
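+This kind of behaviour is easy to reproduce locally, for instance with Java's
+backtracking engine (a small sketch; the exact output differs between engines):
+\begin{lstlisting}[language={}, basicstyle=\fontsize{8.5}{9}\ttfamily]
+import java.util.regex.Pattern
+
+val m = Pattern.compile("(aba|ab|a)*").matcher("ababa")
+while (m.find()) {
+  if (m.end() > m.start())   // skip the zero-length matches the engine reports
+    println(s"'${m.group()}' at [${m.start()}, ${m.end()})")
+}
+// Java's engine reports 'aba' at [0, 3) and 'a' at [4, 5) here,
+// rather than the single POSIX match 'ababa' at [0, 5).
+\end{lstlisting}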
+Fowler \cite{fowler2003} and Kuklewicz \cite{KuklewiczHaskell}
+commented that most regex libraries are not
+correctly implementing the central POSIX
+rule, called the maximum munch rule.
+Grathwohl \parencite{grathwohl2014crash} wrote:
+\begin{quote}\it
+ ``The POSIX strategy is more complicated than the
+ greedy because of the dependence on information about
+ the length of matched strings in the various subexpressions.''
+\end{quote}
+%\noindent
+People have recognised that the implementation complexity of the POSIX rules also comes from
+the specification not being very precise.
+The developers of the regexp package of Go
+\footnote{\url{https://pkg.go.dev/regexp\#pkg-overview}}
+commented that
+\begin{quote}\it
+``The POSIX rule is computationally prohibitive
+and not even well-defined.''
+\end{quote}
+There are many informal summaries of this disambiguation
+strategy, which are often quite long and delicate.
+For example Kuklewicz \cite{KuklewiczHaskell}
+described the POSIX rule as (section 1, last paragraph):
+\begin{quote}
+  \begin{itemize}
+    \item
+regular expressions (REs) take the leftmost starting match, and the longest match starting there\\
+\item
+earlier subpatterns have leftmost-longest priority over later subpatterns\\
+\item
+higher-level subpatterns have leftmost-longest priority over their component subpatterns\\
+\item
+REs have right associative concatenation which can be changed with parenthesis\\
+\item
+parenthesized subexpressions return the match from their last usage\\
+\item
+text of component subexpressions must be contained in the text of the
+higher-level subexpressions\\
+\item
+if ``p'' and ``q'' can never match the same text then ``p|q'' and ``q|p'' are equivalent, up to trivial renumbering of captured subexpressions\\
+\item
+if ``p'' in ``p*'' is used to capture non-empty text then additional repetitions of ``p'' will not capture an empty string\\
+\end{itemize}
+\end{quote}
+%The text above
+%is trying to capture something very precise,
+%and is crying out for formalising.
--- a/ChengsongTanPhdThesis/Chapters/RelatedWork.tex Tue Jul 25 17:28:29 2023 +0100
+++ b/ChengsongTanPhdThesis/Chapters/RelatedWork.tex Wed Aug 23 03:02:31 2023 +0100
@@ -291,6 +291,14 @@
%flavours of back-references syntax (e.g. whether references can be circular,
%can labels be repeated etc.).
+The recently published work on a formally verified lexer
+\cite{Margus2023PLDI} by Moseley et al.
+is a big step forward in practical performance: their lexer can compete
+with unverified commercial regex engines such as those in Java and Python,
+and it supports lookaheads.
+To reach their speed on larger datasets like \cite{TwainRegex}, one would have to
+incorporate many fine-tunings; one of the first modifications needed would
+be to drop whole-string lexing and only extract the submatches needed by the user.
--- a/ChengsongTanPhdThesis/example.bib Tue Jul 25 17:28:29 2023 +0100
+++ b/ChengsongTanPhdThesis/example.bib Wed Aug 23 03:02:31 2023 +0100
@@ -2,8 +2,21 @@
%% https://bibdesk.sourceforge.io/
%% Created for CS TAN at 2022-11-20 17:26:32 +0000
-
-
+@inproceedings{LEX,
+ title={Lex—a lexical analyzer generator},
+ author={Michael E. Lesk and Eric E. Schmidt},
+ year={1990},
+ url={https://api.semanticscholar.org/CorpusID:7900881}
+}
+@article{RegularExpressions,
+  title={Representation of Events in Nerve Nets and Finite Automata},
+ author={Kleene, SC},
+ journal={Automata Studies: Annals of Mathematics Studies. Number 34},
+ number={34},
+ pages={3},
+ year={1956},
+ publisher={Princeton University Press}
+}
%% Saved with string encoding Unicode (UTF-8)
@inproceedings{Doczkal2013,
author = {C.~Doczkaland J.O.~Kaiser and G.~Smolka},
@@ -684,6 +697,38 @@
title = {{Snort Community Rules}},
year = {2022}}
+@misc{Atom,
+ howpublished = {\url{https://github.com/atom/atom}},
+ note = {[Online; last accessed 22-Aug-2023]},
+ title = {{Atom Editor}},
+ year = {2023}}
+
+@misc{fixedAtom,
+ howpublished = {\url{http://davidvgalbraith.com/how-i-fixed-atom/}},
+  note = {[Online; last accessed 22-Aug-2023]},
+ title = {{How I fixed Atom}},
+ year = {2016}}
+
+
+@misc{Cloudflare,
+ howpublished = {\url{https://www.cloudflare.com/en-gb/}},
+  note = {[Online; last accessed 22-Aug-2023]},
+ title = {{Cloudflare}},
+ year = {2023}}
+
+@misc{CloudflareOutage,
+ howpublished = {\url{https://blog.cloudflare.com/details-of-the-cloudflare-outage-on-july-2-2019/}},
+  note = {[Online; last accessed 22-Aug-2023]},
+  title = {{Details of the Cloudflare Outage on July 2, 2019}},
+ year = {2023}}
+
+@misc{TwainRegex,
+ howpublished = {\url{https://zherczeg.github.io/sljit/regex_perf.html}},
+ %note = {[Online; last accessed 19-November-2022]},
+ author = {Zoltan Herczeg},
+ title = {{Performance comparison of regular expression engines}},
+ year = {2015}}
+
@misc{KuklewiczHaskell,
author = {C.~Kuklewicz},
keywords = {Buggy C POSIX Lexing Libraries},
@@ -857,6 +902,15 @@
volume = 1479,
year = 1998}
+@book{Isabelle,
+ title={Isabelle/HOL: a proof assistant for higher-order logic},
+ author={Nipkow, Tobias and Paulson, Lawrence C and Wenzel, Markus},
+ volume={2283},
+ year={2002},
+ publisher={Springer Science \& Business Media}
+}
+
+
@article{Brzozowski1964,
author = {J.~A.~Brzozowski},
journal = {Journal of the {ACM}},
@@ -1054,6 +1108,37 @@
abstract="We present a framework for simultaneously verifying the functional correctness and the worst-case asymptotic time complexity of higher-order imperative programs. We build on top of Separation Logic with Time Credits, embedded in an interactive proof assistant. We formalize the O notation, which is key to enabling modular specifications and proofs. We cover the subtleties of the multivariate case, where the complexity of a program fragment depends on multiple parameters. We propose a way of integrating complexity bounds into specifications, present lemmas and tactics that support a natural reasoning style, and illustrate their use with a collection of examples.",
isbn="978-3-319-89884-1"
}
+
+
+@article{ValiantParsing,
+ title={General Context-Free Recognition in Less than Cubic Time},
+ author={Leslie G. Valiant},
+ journal={J. Comput. Syst. Sci.},
+ year={1975},
+ volume={10},
+ pages={308-315}
+}
+
+
+@inproceedings{bigOIsabelle,
+ title={Formalizing O notation in Isabelle/HOL},
+ author={Avigad, Jeremy and Donnelly, Kevin},
+ booktitle={International Joint Conference on Automated Reasoning},
+ pages={357--371},
+ year={2004},
+ organization={Springer}
+}
+
+
+@inbook{subtypingFormalComplexity,
+author = {Bessai, Jan and Rehof, Jakob and Düdder, Boris},
+year = {2019},
+month = {06},
+pages = {356-371},
+title = {Fast Verified BCD Subtyping},
+isbn = {978-3-030-22347-2},
+doi = {10.1007/978-3-030-22348-9_21}
+}
%@article{10.1145/640128.604148,
%author = {Hofmann, Martin and Jost, Steffen},
%title = {Static Prediction of Heap Space Usage for First-Order Functional Programs},
@@ -1093,3 +1178,23 @@
%}
+%10.1145/3591262,
+
+@article{Margus2023PLDI,
+author = {Moseley, Dan and Nishio, Mario and Perez Rodriguez, Jose and Saarikivi, Olli and Toub, Stephen and Veanes, Margus and Wan, Tiki and Xu, Eric},
+title = {Derivative Based Nonbacktracking Real-World Regex Matching with Backtracking Semantics},
+year = {2023},
+issue_date = {June 2023},
+publisher = {Association for Computing Machinery},
+address = {New York, NY, USA},
+volume = {7},
+number = {PLDI},
+url = {https://doi.org/10.1145/3591262},
+doi = {10.1145/3591262},
+abstract = {We develop a new derivative based theory and algorithm for nonbacktracking regex matching that supports anchors and counting, preserves backtracking semantics, and can be extended with lookarounds. The algorithm has been implemented as a new regex backend in .NET and was extensively tested as part of the formal release process of .NET7. We present a formal proof of the correctness of the algorithm, which we believe to be the first of its kind concerning industrial implementations of regex matchers. The paper describes the complete foundation, the matching algorithm, and key aspects of the implementation involving a regex rewrite system, as well as a comprehensive evaluation over industrial case studies and other regex engines.},
+journal = {Proc. ACM Program. Lang.},
+month = {jun},
+articleno = {148},
+numpages = {24},
+keywords = {automata, derivative, symbolic, regex, PCRE, matching}
+}
--- a/ChengsongTanPhdThesis/main.tex Tue Jul 25 17:28:29 2023 +0100
+++ b/ChengsongTanPhdThesis/main.tex Wed Aug 23 03:02:31 2023 +0100
@@ -481,7 +481,7 @@
% DEDICATION
%----------------------------------------------------------------------------------------
-\dedicatory{For/Dedicated to/To my\ldots}
+\dedicatory{To my granny, Shoucai Cheng. You are dearly missed.}
%----------------------------------------------------------------------------------------
% THESIS CONTENT - CHAPTERS
@@ -495,6 +495,7 @@
% Uncomment the lines as you write the chapters
\include{Chapters/Introduction}
+\include{Chapters/Overview}
\include{Chapters/Inj}
\include{Chapters/Bitcoded1}
\include{Chapters/Bitcoded2}
--- a/RegexExplosionPlot/injToSimp.sc Tue Jul 25 17:28:29 2023 +0100
+++ b/RegexExplosionPlot/injToSimp.sc Wed Aug 23 03:02:31 2023 +0100
@@ -418,11 +418,11 @@
println("Injection!");println("r: "+r); println("c: "+c); println("v:"+v);
inj(r, c, v ) }
}
-val r = STAR("a"|"aa") ~ ("c")
-println("unsimplified lexer's intermediate derivative sizes:")
-val v = print_lexer(r, "aac".toList)
-println("simplified lexer's intermediate derivative sizes:")
-val v2 = simp_lexer(r, "aac".toList)
+//val r = STAR("a"|"aa") ~ ("c")
+//println("unsimplified lexer's intermediate derivative sizes:")
+//val v = print_lexer(r, "aac".toList)
+//println("simplified lexer's intermediate derivative sizes:")
+//val v2 = simp_lexer(r, "aac".toList)
//println(v)
// some "rectification" functions for simplification
@@ -1222,16 +1222,16 @@
// Sequ(Sequ(Sequ(Empty,Chr(b)),Stars(List(Chr(b), Chr(b)))),Sequ(Right(Stars(List(Right(Sequ(Sequ(Stars(List(Right(Chr(a)), Right(Chr(a)))),Sequ(Chr(a),Sequ(Left(Chr(b)),Sequ(Empty,Chr(b))))),Empty)), Right(Sequ(Sequ(Stars(List(Right(Chr(a)), Right(Chr(a)))),Sequ(Chr(a),Sequ(Left(Chr(b)),Sequ(Empty,Chr(b))))),Empty))))),Empty))
-//def time[R](block: => R): R = {
-// val t0 = System.nanoTime()
-// val result = block // call-by-name
-// val t1 = System.nanoTime()
-// println(" " + (t1 - t0)/1e9 + "")
-// result
-//}
+def time[R](block: => R): R = {
+ val t0 = System.nanoTime()
+ val result = block // call-by-name
+ val t1 = System.nanoTime()
+ println(" " + (t1 - t0)/1e9 + "")
+ result
+}
val astarstarB = STAR(STAR("a"))~("b")
-val data_points_x_axis = List.range(0, 60000, 3000)
-//for (i <- data_points_x_axis) {print(i); time { blexing(astarstarB, ("a"*i)) } }
+val data_points_x_axis = List.range(0, 40, 1)
+for (i <- data_points_x_axis) {print(i); time { blexing_simp(astarstarB, ("a"*i)) } }
////println(data_points_x_axis.zip( List(0,1,3,4)))
//for(i <- List.range(0, 40000, 3000)) print(i + " ")