% Chapter 1
\chapter{Introduction} % Main chapter title
\label{Introduction} % For referencing the chapter elsewhere, use \ref{Introduction}
%----------------------------------------------------------------------------------------
% Define some commands to keep the formatting separated from the content
% Bold text for keywords and for table headings.
\newcommand{\keyword}[1]{\textbf{#1}}
\newcommand{\tabhead}[1]{\textbf{#1}}
% Typewriter text for code.
% NOTE(review): \code is redefined further down in this file by
% \def\code{\textit{code}} (no argument), which silently replaces this
% one-argument version -- confirm which definition is intended.
\newcommand{\code}[1]{\texttt{#1}}
% Bold typewriter text for file names, italic typewriter text for options.
\newcommand{\file}[1]{\texttt{\bfseries#1}}
\newcommand{\option}[1]{\texttt{\itshape#1}}
%boxes
% Framed box whose height is fixed by a \strut.
\newcommand*{\mybox}[1]{\framebox{\strut #1}}
%\newcommand{\sflataux}[1]{\textit{sflat}\_\textit{aux} \, #1}
% Argument wrapped in \llparenthesis ... \rrparenthesis.
\newcommand\sflat[1]{\llparenthesis #1 \rrparenthesis }
% ASEQ with #1 as subscript, followed by the two further arguments.
\newcommand{\ASEQ}[3]{\textit{ASEQ}_{#1} \, #2 \, #3}
% Infix notations "#1 \backslash_{...} #2"; the subscript tells the variants apart.
\newcommand{\bderssimp}[2]{#1 \backslash_{bsimps} #2}
\newcommand{\rderssimp}[2]{#1 \backslash_{rsimps} #2}
\def\derssimp{\textit{ders}\_\textit{simp}}
\def\rders{\textit{rders}}
\newcommand{\bders}[2]{#1 \backslash #2}
% \bsimp takes its argument in parentheses; \bsimps is the bare name.
\newcommand{\bsimp}[1]{\textit{bsimp}(#1)}
\def\bsimps{\textit{bsimp}}
\newcommand{\rsimp}[1]{\textit{rsimp}\; #1}
% Same brackets as \sflat, with a prime on the closing bracket.
\newcommand{\sflataux}[1]{\llparenthesis #1 \rrparenthesis'}
% "=" with a small "def" (respectively "denote") stacked on top.
\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%
\newcommand{\denote}{\stackrel{\mbox{\scriptsize denote}}{=}}%
% Bold 0 and 1 symbols. The obsolete two-letter switch \bf is spelled out as
% \normalfont\bfseries, which is what \bf expands to in text mode in the
% standard classes, so the output stays upright bold.
\newcommand{\ZERO}{\mbox{\normalfont\bfseries 0}}
\newcommand{\ONE}{\mbox{\normalfont\bfseries 1}}
% \oplus followed by #1 in script size and then #2.
\newcommand{\AALTS}[2]{\oplus {\scriptstyle #1}\, #2}
% \rdistinct applied to two arguments; \rdistincts and \rDistinct are the bare name.
\newcommand{\rdistinct}[2]{\textit{rdistinct} \;\; #1 \;\; #2}
\def\rdistincts{\textit{rdistinct}}
\def\rDistinct{\textit{rdistinct}}
% Bracket notations like \sflat, with a "*" subscript (and a prime for the aux variant).
\newcommand\hflat[1]{\llparenthesis #1 \rrparenthesis_*}
\newcommand\hflataux[1]{\llparenthesis #1 \rrparenthesis_*'}
\newcommand\createdByStar[1]{\textit{createdByStar}(#1)}
% "=" relation with a tiny zero-width "equiv" stacked on top.
\newcommand\myequiv{\mathrel{\stackrel{\makebox[0pt]{\mbox{\normalfont\tiny equiv}}}{=}}}
% Names used throughout the chapter. Unless noted otherwise, each macro below
% takes no argument and just typesets a fixed identifier in italics.
\def\SEQ{\textit{SEQ}}
\def\SEQs{\textit{SEQs}}
\def\case{\textit{case}}
% "=" with a small "rsimp" stacked on top.
\def\sequal{\stackrel{\mbox{\scriptsize rsimp}}{=}}
\def\rsimpalts{\textit{rsimp}_{ALTS}}
\def\good{\textit{good}}
\def\btrue{\textit{true}}
\def\bfalse{\textit{false}}
\def\bnullable{\textit{bnullable}}
\def\bnullables{\textit{bnullables}}
\def\Some{\textit{Some}}
\def\None{\textit{None}}
% NOTE(review): this \def silently replaces the one-argument \code{...} that is
% defined with \newcommand near the top of this file; from here on \code takes
% no argument -- confirm that this is intended.
\def\code{\textit{code}}
\def\decode{\textit{decode}}
\def\internalise{\textit{internalise}}
\def\lexer{\mathit{lexer}}
\def\mkeps{\textit{mkeps}}
% Note the argument order: the output is "#2 \backslash_r #1".
\newcommand{\rder}[2]{#2 \backslash_r #1}
\def\rerases{\textit{rerase}}
\def\nonnested{\textit{nonnested}}
\def\AZERO{\textit{AZERO}}
\def\sizeNregex{\textit{sizeNregex}}
\def\AONE{\textit{AONE}}
\def\ACHAR{\textit{ACHAR}}
\def\simpsulz{\textit{simp}_{Sulz}}
% Rewrite arrows with subscripts f, h, g and scf; the variants whose names end
% in "s" stack a "*" on top of the arrow.
\def\scfrewrites{\stackrel{*}{\rightsquigarrow_{scf}}}
\def\frewrite{\rightsquigarrow_f}
\def\hrewrite{\rightsquigarrow_h}
\def\grewrite{\rightsquigarrow_g}
\def\frewrites{\stackrel{*}{\rightsquigarrow_f}}
\def\hrewrites{\stackrel{*}{\rightsquigarrow_h}}
\def\grewrites{\stackrel{*}{\rightsquigarrow_g}}
\def\fuse{\textit{fuse}}
\def\bder{\textit{bder}}
\def\der{\textit{der}}
\def\POSIX{\textit{POSIX}}
\def\ALTS{\textit{ALTS}}
\def\ASTAR{\textit{ASTAR}}
\def\DFA{\textit{DFA}}
\def\NFA{\textit{NFA}}
\def\bmkeps{\textit{bmkeps}}
\def\bmkepss{\textit{bmkepss}}
\def\retrieve{\textit{retrieve}}
\def\blexer{\textit{blexer}}
\def\flex{\textit{flex}}
\def\inj{\textit{inj}}
\def\Empty{\textit{Empty}}
\def\Left{\textit{Left}}
\def\Right{\textit{Right}}
\def\Stars{\textit{Stars}}
\def\Char{\textit{Char}}
\def\Seq{\textit{Seq}}
\def\Der{\textit{Der}}
\def\Ders{\textit{Ders}}
\def\nullable{\mathit{nullable}}
\def\Z{\mathit{Z}}
% NOTE(review): \S is normally the LaTeX section-sign command; this \def
% replaces it for everything that follows -- confirm nothing relies on the
% section sign.
\def\S{\mathit{S}}
% r with an up-arrow superscript.
\def\rup{r^\uparrow}
%\def\bderssimp{\mathit{bders}\_\mathit{simp}}
\def\distinctWith{\textit{distinctWith}}
\def\lf{\textit{lf}}
\def\PD{\textit{PD}}
\def\suffix{\textit{Suffix}}
\def\distinctBy{\textit{distinctBy}}
\def\starupdate{\textit{starUpdate}}
\def\starupdates{\textit{starUpdates}}
% Further identifier macros; each typesets a fixed name.
\def\size{\mathit{size}}
\def\rexp{\mathbf{rexp}}
\def\simp{\mathit{simp}}
\def\simpALTs{\mathit{simp}\_\mathit{ALTs}}
\def\distinct{\mathit{distinct}}
\def\blexersimp{\mathit{blexer}\_\mathit{simp}}
\def\blexerStrong{\textit{blexerStrong}}
\def\bsimpStrong{\textit{bsimpStrong}}
% \bdersStrongs is the bare name; \bdersStrong is the infix notation with two arguments.
\def\bdersStrongs{\textit{bdersStrong}}
\newcommand{\bdersStrong}[2]{#1 \backslash_{bsimpStrongs} #2}
% \map used to be defined twice in this group (\mathit first, \textit second).
% Only the second definition ever took effect, so it is the one that is kept.
\def\map{\textit{map}}
% Macros whose names start with "r"/"R", followed by a few more function names.
\def\rrexp{\textit{rrexp}}
\newcommand\rnullable[1]{\textit{rnullable} \; #1 }
% Size notations: the argument in double brackets, with and without an "r" subscript.
\newcommand\rsize[1]{\llbracket #1 \rrbracket_r}
\newcommand\asize[1]{\llbracket #1 \rrbracket}
% Erase notation: parenthesised argument with a \downarrow_r subscript.
\newcommand\rerase[1]{ (#1)_{\downarrow_r}}
% Review remark typeset in blue, followed by a forced line break.
\newcommand\ChristianComment[1]{\textcolor{blue}{#1}\\}
\def\rflts{\textit{rflts}}
\def\rrewrite{\textit{rrewrite}}
\def\bsimpalts{\textit{bsimp}_{ALTS}}
\def\bsimpaseq{\textit{bsimp}_{ASEQ}}
% NOTE(review): \rsimlalts looks like a misspelling of "rsimpalts"; a macro
% \rsimpalts (with subscript ALTS rather than ALTs) is defined earlier in this
% file -- confirm which one callers use before renaming.
\def\rsimlalts{\textit{rsimp}_{ALTs}}
\def\rsimpseq{\textit{rsimp}_{SEQ}}
\def\erase{\textit{erase}}
\def\STAR{\textit{STAR}}
\def\flts{\textit{flts}}
\def\zeroable{\textit{zeroable}}
\def\nub{\textit{nub}}
\def\filter{\textit{filter}}
%\def\not{\textit{not}}
% Constructor notations with an "r" subscript: bold 0, bold 1 and a bold character.
\def\RZERO{\mathbf{0}_r }
\def\RONE{\mathbf{1}_r}
\newcommand\RCHAR[1]{\mathbf{#1}_r}
% Sequence as an infix \cdot, alternatives as \sum over the argument, star as a postfix ^*.
\newcommand\RSEQ[2]{#1 \cdot #2}
\newcommand\RALTS[1]{\sum #1}
\newcommand\RSTAR[1]{#1^*}
\newcommand\vsuf[2]{\textit{Suffix} \;#1\;#2}
% Listings style for Scala code: small typewriter font, no line numbers,
% a single frame around the listing, and long lines broken at whitespace.
% The style used to set `frame' twice (tb, then single); only the later
% value took effect, so the overridden `frame=tb' has been dropped.
% NOTE(review): the colours dkgreen and mauve are not defined in this part of
% the file; presumably they come from the preamble -- confirm.
\lstdefinestyle{myScalastyle}{
language=scala,
aboveskip=3mm,
belowskip=3mm,
showstringspaces=false,
columns=flexible,
basicstyle={\small\ttfamily},
numbers=none,
numberstyle=\tiny\color{gray},
keywordstyle=\color{blue},
commentstyle=\color{dkgreen},
stringstyle=\color{mauve},
frame=single,
breaklines=true,
breakatwhitespace=true,
tabsize=3,
}
%----------------------------------------------------------------------------------------
%This part is about regular expressions, Brzozowski derivatives,
%and a bit-coded lexing algorithm with proven correctness and time bounds.
%TODO: look up snort rules to use here--give readers idea of what regexes look like
Regular expressions are widely used in computer science:
be it in text-editors \parencite{atomEditor} with syntax highlighting and auto-completion;
command-line tools like $\mathit{grep}$ that facilitate easy
text-processing; network intrusion
detection systems that inspect suspicious traffic; or compiler
front ends.
Given their usefulness and ubiquity, one would assume that
modern regular expression matching implementations
are mature and fully studied.
Indeed, when the regex engine of a popular programming language
is supplied with regular expressions and strings,
in most cases one can
get the matching information in a very short time.
Those matchers can be blindingly fast--some
network intrusion detection systems
use regex engines that are able to process
megabytes or even gigabytes of data per second \parencite{Turo_ov__2020}.
However, those matchers can exhibit a surprising security vulnerability
under a certain class of inputs.
%However, , this is not the case for $\mathbf{all}$ inputs.
%TODO: get source for SNORT/BRO's regex matching engine/speed
Consider $(a^*)^*\,b$ and ask whether
strings of the form $aa..a$ can be matched by this regular
expression. Obviously this is not the case---the expected $b$ in the last
position is missing. One would expect that modern regular expression
matching engines can find this out very quickly. Surprisingly, if one tries
this example in JavaScript, Python or Java 8, even with small strings,
say of length around 30 $a$'s,
the decision takes an absurdly long time to finish (see the graphs in Figure~\ref{fig:aStarStarb}).
This is clearly exponential behaviour, and
is triggered by some relatively simple regular expressions.
Java 9 and newer
versions improve this behaviour somewhat, but are still slow compared
with the approach we are going to use in this thesis.
This superlinear blowup in regular expression engines
has repeatedly caused grief in ``real life'', where it is
known as ``catastrophic backtracking'' and the regular expressions triggering it are called ``evil''.
For example, on 20 July 2016 one evil
regular expression brought the webpage
\href{http://stackexchange.com}{Stack Exchange} to its
knees.\footnote{\url{https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016} (Last accessed in 2019)}
In this instance, a regular expression intended to just trim white
spaces from the beginning and the end of a line actually consumed
massive amounts of CPU resources---causing web servers to grind to a
halt. In this example, the time needed to process
the string was $O(n^2)$ with respect to the string length. This
quadratic overhead was enough for the homepage of Stack Exchange to
respond so slowly that the load balancer assumed a $\mathit{DoS}$
attack and therefore stopped the servers from responding to any
requests. This made the whole site become unavailable.
\begin{figure}[p]
% Runtime plots for matching (a^*)^* b against strings of n a's, one plot per
% regex library, arranged in a two-column table. Each plot reads its data from
% an external re-*.data table file.
%
% Axis options shared by the five plots that use the fixed 0--30 tick range.
% The definition is local to this figure environment.
% NOTE(review): the keys are kept in the same relative order as in the
% original per-plot option lists; pgfplots' `axis lines' key is documented to
% install styles that touch `enlargelimits', so confirm before reordering.
\pgfplotsset{
  regex runtime/.style={
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    enlargelimits=false,
    xtick={0,5,...,30},
    xmax=33,
    ymax=35,
    ytick={0,5,...,30},
    scaled ticks=false,
    axis lines=left,
    width=5cm,
    height=4cm,
    legend pos=north west,
    legend cell align=left,
  },
}
\begin{tabular}{@{}c@{\hspace{0mm}}c@{}}
% Row 1: JavaScript and Python. Only the left-hand plot of a row has a y label.
\begin{tikzpicture}
\begin{axis}[regex runtime, ylabel={time in secs}, legend entries={JavaScript}]
\addplot[red,mark=*, mark options={fill=white}] table {re-js.data};
\end{axis}
\end{tikzpicture}
&
\begin{tikzpicture}
\begin{axis}[regex runtime, legend entries={Python}]
\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
\end{axis}
\end{tikzpicture}\\
% Row 2: Java 8 and Dart.
\begin{tikzpicture}
\begin{axis}[regex runtime, ylabel={time in secs}, legend entries={Java 8}]
\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
\end{axis}
\end{tikzpicture}
&
\begin{tikzpicture}
\begin{axis}[regex runtime, legend entries={Dart}]
\addplot[green,mark=*, mark options={fill=white}] table {re-dart.data};
\end{axis}
\end{tikzpicture}\\
% Row 3: Swift and Java 9+.
\begin{tikzpicture}
\begin{axis}[regex runtime, ylabel={time in secs}, legend entries={Swift}]
\addplot[purple,mark=*, mark options={fill=white}] table {re-swift.data};
\end{axis}
\end{tikzpicture}
&
% The Java 9+ plot does not use the shared style: it has no fixed ticks and
% restricts the data to x in 0..40000 and y in 0..35 instead.
\begin{tikzpicture}
\begin{axis}[
xlabel={$n$},
x label style={at={(1.05,-0.05)}},
%ylabel={time in secs},
enlargelimits=true,
%xtick={0,5000,...,40000},
%xmax=40000,
%ymax=35,
restrict x to domain*=0:40000,
restrict y to domain*=0:35,
%ytick={0,5,...,30},
%scaled ticks=false,
axis lines=left,
width=5cm,
height=4cm,
legend entries={Java9+},
legend pos=north west,
legend cell align=left]
\addplot[orange,mark=*, mark options={fill=white}] table {re-java9.data};
\end{axis}
\end{tikzpicture}\\
\multicolumn{2}{c}{Graphs}
\end{tabular}
\caption{Graphs showing runtime for matching $(a^*)^*\,b$ with strings
of the form $\protect\underbrace{aa..a}_{n}$ in various existing regular expression libraries.
The reason for their superlinear behaviour is that they do a depth-first-search
using NFAs.
If the string does not match, the regular expression matching
engine starts to explore all possibilities.
}\label{fig:aStarStarb}
\end{figure}\afterpage{\clearpage}
A more recent example is a global outage of all Cloudflare servers on 2 July
2019. A poorly written regular expression exhibited catastrophic backtracking
and exhausted CPUs that serve HTTP traffic. Although the outage
had several causes, at the heart was a regular expression that
was used to monitor network
traffic.\footnote{\url{https://blog.cloudflare.com/details-of-the-cloudflare-outage-on-july-2-2019/} (Last accessed in 2022)}
These problems with regular expressions
are not isolated events that happen
very occasionally, but are actually widespread.
They occur so often that they have a
name: Regular-Expression-Denial-Of-Service (ReDoS)
attack.
\citeauthor{Davis18} detected more
than 1000 evil regular expressions
in Node.js, Python core libraries, npm and in pypi.
They therefore concluded that evil regular expressions
are real problems rather than ``a parlour trick''.
This work aims to address this issue
with the help of formal proofs.
We describe a lexing algorithm based
on Brzozowski derivatives with verified correctness (in
Isabelle/HOL)
and a finiteness property.
Such properties %guarantee the absence of
are an important step in preventing
catastrophic backtracking once and for all.
We will give more details in the next sections
on (i) why the slow cases in Figure~\ref{fig:aStarStarb}
can occur in traditional regular expression engines
and (ii) why we choose our
approach based on Brzozowski derivatives and formal proofs.
\section{Preliminaries}%Regex, and the Problems with Regex Matchers}
Regular expressions and regular expression matchers
have of course been studied for many, many years.
Theoretical results in automata theory state
that basic regular expression matching should be linear
w.r.t.\ the input.
This assumes that the regular expression
$r$ was pre-processed and turned into a
deterministic finite automaton (DFA) before matching~\cite{Sakarovitch2009}.
By basic we mean textbook definitions such as the one
below, involving only regular expressions for characters, alternatives,
sequences, and Kleene stars:
\[
r ::= c \mid r_1 + r_2 \mid r_1 \cdot r_2 \mid r^*
\]
Modern regular expression matchers used by programmers,
however,
support much richer constructs, such as bounded repetitions
and back-references.
To differentiate, we use the word \emph{regex} to refer
to those expressions with richer constructs while reserving the
term \emph{regular expression}
for the more traditional meaning in formal languages theory.
We follow this convention
in this thesis.
In the future, we aim to support all the popular features of regexes,
but for this work we mainly look at basic regular expressions
and bounded repetitions.
%Most modern regex libraries
%the so-called PCRE standard (Peral Compatible Regular Expressions)
%has the back-references
Regexes come with a number of constructs
that make it more convenient for
programmers to write regular expressions.
Depending on the types of constructs
the task of matching and lexing with them
will have different levels of complexity.
Some of those constructs are just syntactic sugar, that is,
shorthand notation
that saves programmers a few keystrokes.
These will not cause problems for regex libraries.
For example, the
non-binary alternative involving three or more choices just means:
\[
(a | b | c) \stackrel{\text{means}}{=} ((a + b)+ c)
\]
Similarly, the range operator is just a concise way of expressing
the alternative of all characters between its operands:
\[
[0-9]\stackrel{\text{means}}{=} (0 | 1 | \ldots | 9 ) \; \text{(all number digits)}
\]
The wildcard character $.$ is used to refer to any single character
except the newline:
\[
. \stackrel{\text{means}}{=} [0-9a-zA-Z+-()*\&\ldots]
\]
\subsection{Bounded Repetitions}
More interesting are bounded repetitions, which can
make the regular expressions much
more compact.
There are
$r^{\{n\}}$, $r^{\{\ldots m\}}$, $r^{\{n\ldots \}}$ and $r^{\{n\ldots m\}}$
(where $n$ and $m$ are constant natural numbers).
Like the star regular expressions, the set of strings or language
a bounded regular expression can match
is defined using the power operation on sets:
\begin{center}
\begin{tabular}{lcl}
$L \; r^{\{n\}}$ & $\dn$ & $(L \; r)^n$\\
$L \; r^{\{\ldots m\}}$ & $\dn$ & $\bigcup_{0 \leq i \leq m}. (L \; r)^i$\\
$L \; r^{\{n\ldots \}}$ & $\dn$ & $\bigcup_{n \leq i}. (L \; r)^i$\\
$L \; r^{\{n \ldots m\}}$ & $\dn$ & $\bigcup_{n \leq i \leq m}. (L \; r)^i$
\end{tabular}
\end{center}
The attraction of bounded repetitions is that they can be
used to avoid a blow-up in the size of the regular expression: for example $r^{\{n\}}$
is a shorthand for
\[
\underbrace{r\ldots r}_{\text{$n$ copies of $r$}}.
\]
%Therefore, a naive algorithm that simply unfolds
%them into their desugared forms
%will suffer from at least an exponential runtime increase.
The problem with matching
is that tools based on the classic notion of
automata need to expand $r^{\{n\}}$ into $n$ connected
copies of the automaton for $r$. This leads to very inefficient matching
algorithms or algorithms that consume large amounts of memory.
Implementations using $\DFA$s will
either become excruciatingly slow
(for example Verbatim++\cite{Verbatimpp}) or get
out of memory errors (for example $\mathit{LEX}$ and
$\mathit{JFLEX}$\footnote{which are lexer generators
in C and JAVA that generate $\mathit{DFA}$-based
lexers. The user provides a set of regular expressions
and configurations to them, and then
gets an output program encoding a minimized $\mathit{DFA}$
that can be compiled and run.
When given the above countdown regular expression,
a small $n$ (a few dozen) would result in a
determinised automaton
with millions of states.}) for large counters.
A classic example for this phenomenon is the regular expression $(a+b)^* a (a+b)^{\{n\}}$
where the minimal DFA requires at least $2^{n+1}$ states.
For example, when $n$ is equal to 2,
the corresponding $\mathit{NFA}$ looks like:
\begin{center}
% NFA with four states q_0 .. q_3 laid out left to right; q_1, q_2 and the
% accepting state q_3 are drawn in red.
\begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto]
\node[state,initial] (q_0) {$q_0$};
\node[state, red] (q_1) [right=of q_0] {$q_1$};
\node[state, red] (q_2) [right=of q_1] {$q_2$};
\node[state, accepting, red](q_3) [right=of q_2] {$q_3$};
% q_0 has an a-edge to q_1 and a self-loop on a,b (the second `edge' continues
% from q_0); q_1 and q_2 each move one state to the right on a,b.
\path[->]
(q_0) edge node {a} (q_1)
edge [loop below] node {a,b} ()
(q_1) edge node {a,b} (q_2)
(q_2) edge node {a,b} (q_3);
\end{tikzpicture}
\end{center}
When turned into a DFA by the subset construction, it
requires at least $2^3$ states.\footnote{The
red states are ``countdown states'' which count down
the number of characters needed in addition to the current
string to make a successful match.
For example, state $q_1$ indicates a match that has
gone past the $(a|b)^*$ part of $(a|b)^*a(a|b)^{\{2\}}$,
and just consumed the ``delimiter'' $a$ in the middle, and
needs to match 2 more iterations of $(a|b)$ to complete.
State $q_2$ on the other hand, can be viewed as a state
after $q_1$ has consumed 1 character, and just waits
for 1 more character to complete.
$q_3$ is the last state, requiring 0 more characters, and is accepting.
Depending on the suffix of the
input string up to the current read location,
the states $q_1$, $q_2$ and $q_3$
may or may
not be active, independently of each other.
A $\mathit{DFA}$ for such an $\mathit{NFA}$ would
contain at least $2^3$ non-equivalent states that cannot be merged,
because the subset construction during determinisation will generate
all the elements in the power set $\mathit{Pow}\{q_1, q_2, q_3\}$.
Generalizing this to regular expressions with larger
bounded repetition counters, we have that
regexes shaped like $r^*ar^{\{n\}}$ when converted to $\mathit{DFA}$s
would require at least $2^{n+1}$ states, if $r$ itself contains
more than 1 string.
This is to represent all the different
scenarios in which ``countdown'' states are active.}
Bounded repetitions are very important because they
tend to occur a lot in practical use,
for example in the regex library RegExLib,
the rules library of Snort \cite{Snort1999}\footnote{
Snort is a network intrusion detection (NID) tool
for monitoring network traffic.
The network security community curates a list
of malicious patterns written as regexes,
which is used by Snort's detection engine
to match against network traffic for any hostile
activities such as buffer overflow attacks.},
as well as in XML Schema definitions (XSDs).
According to Bj\"{o}rklund et al.~\cite{xml2015},
more than half of the
XSDs they found on the Maven.org central repository
have bounded regular expressions in them.
Often the counters are quite large, with the largest being
close to ten million.
An example XSD they gave
is:
\begin{verbatim}
<sequence minOccurs="0" maxOccurs="65535">
<element name="TimeIncr" type="mpeg7:MediaIncrDurationType"/>
<element name="MotionParams" type="float" minOccurs="2" maxOccurs="12"/>
</sequence>
\end{verbatim}
This can be seen as the expression
$(ab^{\{2\ldots 12\}})^{\{0 \ldots 65535\}}$, where $a$ and $b$ are themselves
regular expressions
satisfying certain constraints (such as
satisfying the floating point number format).
It is therefore quite unsatisfying that
some regular expression matching libraries
impose ad hoc limits
for bounded regular expressions:
for example, in the regular expression matching library in the Go
language the regular expression $a^{\{1001\}}$ is not permitted, because no counter
can be above 1000, and in the built-in Rust regular expression library
expressions such as $a^{\{1000\}\{100\}\{5\}}$ give an error message
for being too big.
As Becchi and Crawley~\cite{Becchi08} have pointed out,
the reason for these restrictions
is that they simulate a non-deterministic finite
automaton (NFA) with a breadth-first search.
This way the number of active states could
be equal to the counter number.
When the counters are large,
the memory requirement could become
infeasible, and a regex engine
like Go will reject this pattern straight away.
\begin{figure}[H]
% \centering instead of a center environment: inside a float the environment
% adds extra vertical space.
\centering
% Chain of states 0, 1, ..., n, n+1, n+2, n+3 (accepting), laid out left to
% right; the \ldots node stands for the states between 1 and n.
\begin{tikzpicture} [node distance = 2cm, on grid, auto]
\node (q0) [state, initial] {$0$};
\node (q1) [state, right = of q0] {$1$};
\node (qdots) [right = of q1] {$\ldots$};
\node (qn) [state, right = of qdots] {$n$};
\node (qn1) [state, right = of qn] {$n+1$};
\node (qn2) [state, right = of qn1] {$n+2$};
\node (qn3) [state, accepting, right = of qn2] {$n+3$};
% State 0 loops and also steps to state 1 on a; the following edges are
% labelled with the wildcard ".", and the last two with b and c.
\path [-stealth, thick]
(q0) edge [loop above] node {a} ()
(q0) edge node {a} (q1)
(q1) edge node {.} (qdots)
(qdots) edge node {.} (qn)
(qn) edge node {.} (qn1)
(qn1) edge node {b} (qn2)
(qn2) edge node {c} (qn3);
\end{tikzpicture}
\caption{The example given by Becchi and Crawley
that NFA simulation can consume large
amounts of memory: $.^*a.^{\{n\}}bc$ matching
strings of the form $aaa\ldots aaaabc$.
When traversing in a breadth-first manner,
all states from 0 till $n+1$ will become active.}
\end{figure}
%Languages like $\mathit{Go}$ and $\mathit{Rust}$ use this
%type of $\mathit{NFA}$ simulation and guarantees a linear runtime
%in terms of input string length.
%TODO:try out these lexers
These problems can of course be solved in matching algorithms where
automata go beyond the classic notion and for instance include explicit
counters \cite{Turo_ov__2020}.
These solutions can be quite efficient,
with the ability to process
gigabytes of strings input per second
even with large counters \cite{Becchi08}.
But formal reasoning about these automata, especially in Isabelle,
can be challenging
and unintuitive.
Therefore, we take correctness and runtime claims made about these solutions
with a grain of salt.
In the work reported in \cite{CSL2022} and here,
we add better support using derivatives
for bounded regular expressions $r^{\{n\}}$.
The results
extend straightforwardly to
repetitions with an interval such as
$r^{\{n\ldots m\}}$.
The merit of Brzozowski derivatives (more on this later)
for this problem is that
they can be naturally extended to support bounded repetitions.
Moreover these extensions are still made up of only
inductive datatypes and recursive functions,
making it handy to deal with using theorem provers.
%The point here is that Brzozowski derivatives and the algorithms by Sulzmann and Lu can be
%straightforwardly extended to deal with bounded regular expressions
%and moreover the resulting code still consists of only simple
%recursive functions and inductive datatypes.
Finally, bounded regular expressions do not destroy our finite
boundedness property, which we shall prove later on.
\subsection{Back-References}
The other way to simulate an $\mathit{NFA}$ for matching is choosing
a single transition each time, keeping all the other options in
a queue or stack, and backtracking if that choice eventually
fails. This method, often called a ``depth-first search'',
is efficient in a lot of cases, but could end up
with exponential run time.
The backtracking method is employed in regex libraries
that support \emph{back-references}, for example
in Java and Python.
%\section{Back-references and The Terminology Regex}
%When one constructs an $\NFA$ out of a regular expression
%there is often very little to be done in the first phase, one simply
%construct the $\NFA$ states based on the structure of the input regular expression.
%In the lexing phase, one can simulate the $\mathit{NFA}$ running in two ways:
%one by keeping track of all active states after consuming
%a character, and update that set of states iteratively.
%This can be viewed as a breadth-first-search of the $\mathit{NFA}$
%for a path terminating
%at an accepting state.
Given a regular expression like this (the sequence
operator is omitted for brevity):
\begin{center}
$r_1r_2r_3r_4$
\end{center}
one could label sub-expressions of interest
by parenthesizing them and giving
them a number by the order in which their opening parentheses appear.
One possible way of parenthesizing and labelling is given below:
\begin{center}
$\underset{1}{(}r_1\underset{2}{(}r_2\underset{3}{(}r_3)\underset{4}{(}r_4)))$
\end{center}
The sub-expressions
$r_1r_2r_3r_4$, $r_1r_2r_3$, $r_3$ and $r_4$ are labelled
by 1 to 4, and can be ``referred back'' by their respective numbers.
%These sub-expressions are called "capturing groups".
To do so, we use the syntax $\backslash i$
to denote that we want the sub-string
of the input just matched by the $i$-th
sub-expression to appear again,
exactly the same as it first appeared:
\begin{center}
$\ldots\underset{\text{i-th lparen}}{(}{r_i})\ldots
\underset{s_i \text{ which just matched} \;r_i}{\backslash i}$
\end{center}
%The backslash and number $i$ are the
%so-called "back-references".
%Let $e$ be an expression made of regular expressions
%and back-references. $e$ contains the expression $e_i$
%as its $i$-th capturing group.
%The semantics of back-reference can be recursively
%written as:
%\begin{center}
% \begin{tabular}{c}
% $L ( e \cdot \backslash i) = \{s @ s_i \mid s \in L (e)\quad s_i \in L(r_i)$\\
% $s_i\; \text{match of ($e$, $s$)'s $i$-th capturing group string}\}$
% \end{tabular}
%\end{center}
A concrete example
for back-references would be
\begin{center}
$(.^*)\backslash 1$,
\end{center}
which would match
strings that can be split into two identical halves,
for example $\mathit{foofoo}$, $\mathit{ww}$ and so on.
Note that this is different from
repeating the sub-expression verbatim like
\begin{center}
$(.^*)(.^*)$,
\end{center}
which does not impose any restrictions on what strings the second
sub-expression $.^*$
might match.
Another example of back-references would be
\begin{center}
$(.)(.)\backslash 2\backslash 1$
\end{center}
which expresses four-character palindromes
like $abba$, $x??x$ etc.
Back-references are a regex construct
that programmers find quite useful.
According to Becchi and Crawley~\cite{Becchi08},
6\% of Snort rules (up until 2008) include the use of them.
The most common use of back-references
would be expressing well-formed html files,
where back-references would be handy in expressing
a pair of opening and closing tags like
\begin{center}
$\langle html \rangle \ldots \langle / html \rangle$
\end{center}
A regex describing such a format
could be
\begin{center}
$\langle (.^+) \rangle \ldots \langle / \backslash 1 \rangle$
\end{center}
Despite being useful, back-references take the syntax and
expressive power of regexes
beyond the regular language hierarchy.
In fact, they allow the regex construct to express
languages that cannot be contained in context-free
languages either.
For example, the back-reference $(a^*)b\backslash1 b \backslash 1$
expresses the language $\{a^n b a^n b a^n\mid n \in \mathbb{N}\}$,
which cannot be expressed by context-free grammars\parencite{campeanu2003formal}.
Such a language is contained in the context-sensitive hierarchy
of formal languages.
Solving the back-reference expressions matching problem
is known to be NP-complete \parencite{alfred2014algorithms}.
A non-backtracking,
efficient solution is not known to exist.
Regex libraries supporting back-references such as
PCRE \cite{pcre} therefore have to
revert to a depth-first search algorithm which backtracks.
What is unexpected is that even in the cases
not involving back-references, there is still
a (non-negligible) chance they might backtrack super-linearly,
as shown in the graphs in Figure~\ref{fig:aStarStarb}.
\subsection{Summary of the Catastrophic Backtracking Problem}
Summing these up, we can categorise existing
practical regex libraries into two kinds:
(i) The ones with linear
time guarantees like Go and Rust. The cost with them is that
they impose restrictions
on the user input (not allowing back-references,
bounded repetitions cannot exceed a counter limit etc.).
(ii) Those
that allow large bounded regular expressions and back-references
at the expense of using a backtracking algorithm.
They could grind to a halt
on some very simple cases, posing a vulnerability of
a ReDoS attack.
We would like to have regex engines that can
deal with the regular part (e.g.
bounded repetitions) of regexes more
efficiently.
Also we want to make sure that they do it correctly.
It turns out that such an aim is not so easy to achieve.
%TODO: give examples such as RE2 GOLANG 1000 restriction, rust no repetitions
% For example, the Rust regex engine claims to be linear,
% but does not support lookarounds and back-references.
% The GoLang regex library does not support over 1000 repetitions.
% Java and Python both support back-references, but shows
%catastrophic backtracking behaviours on inputs without back-references(
%when the language is still regular).
%TODO: test performance of Rust on (((((a*a*)b*)b){20})*)c baabaabababaabaaaaaaaaababaaaababababaaaabaaabaaaaaabaabaabababaababaaaaaaaaababaaaababababaaaaaaaaaaaaac
%TODO: verify the fact Rust does not allow 1000+ reps
%The time cost of regex matching algorithms in general
%involve two different phases, and different things can go differently wrong on
%these phases.
%$\DFA$s usually have problems in the first (construction) phase
%, whereas $\NFA$s usually run into trouble
%on the second phase.
\section{Error-prone POSIX Implementations}
When there are multiple ways of matching a string
with a regular expression, a matcher needs to
disambiguate.
The standard for which particular match to pick
is called the disambiguation strategy.
The more intuitive strategy is called POSIX,
which always chooses the longest initial match.
An alternative strategy would be greedy matching,
which always ends a sub-match as early as possible.
The POSIX standard is widely adopted in many operating systems.
However, many implementations (including the C libraries
used by Linux and OS X distributions) contain bugs
or do not meet the specification they claim to adhere to.
In some cases, they either fail to generate a lexing
result when there exists a match,
or give results that are inconsistent with the $\POSIX$ standard.
A concrete example would be the regex given by Fowler~\cite{fowler2003}
\begin{center}
$(aba + ab + a)^*$ and the string $ababa$.
\end{center}
The correct $\POSIX$ match for the above would be
with the entire string $ababa$,
split into two Kleene star iterations, $[ab] [aba]$ at positions
$[0, 2), [2, 5)$
respectively.
But trying this out in regex101\parencite{regex101}
with different language engines would yield
the same two fragmented matches: $[aba]$ at $[0, 3)$
and $a$ at $[4, 5)$.
Fowler \cite{fowler2003} and Kuklewicz \cite{KuklewiczHaskell}
commented that most regex libraries are not
correctly implementing the POSIX (maximum-munch)
rule of regular expression matching.
As Grathwohl\parencite{grathwohl2014crash} wrote,
\begin{quote}
``The POSIX strategy is more complicated than the
greedy because of the dependence on information about
the length of matched strings in the various subexpressions.''
\end{quote}
%\noindent
The implementation complexity of POSIX rules also comes from
the specification not being very clear.
There are many informal summaries of this disambiguation
strategy, which are often quite long and delicate.
For example Kuklewicz \cite{KuklewiczHaskell}
described the POSIX rule as
\begin{quote}
``
\begin{itemize}
\item
regular expressions (REs) take the leftmost starting match, and the longest match starting there\\
\item
earlier subpatterns have leftmost-longest priority over later subpatterns\\
\item
higher-level subpatterns have leftmost-longest priority over their component subpatterns\\
\item
REs have right associative concatenation which can be changed with parenthesis\\
\item
parenthesized subexpressions return the match from their last usage\\
\item
text of component subexpressions must be contained in the text of the
higher-level subexpressions\\
\item
if "p" and "q" can never match the same text then "p|q" and "q|p" are equivalent, up to trivial renumbering of captured subexpressions\\
\item
if "p" in "p*" is used to capture non-empty text then additional repetitions of "p" will not capture an empty string\\''
\end{itemize}
\end{quote}
The text above
is trying to capture something very precise,
and is crying out for formalising.
Ausaf et al. \cite{AusafDyckhoffUrban2016}
are the first to fill the gap
by not just describing such a formalised POSIX
specification in Isabelle/HOL, but also proving
that their specification coincides with the
POSIX specification given by Okui and Suzuki \cite{Okui10}
which is a completely
different characterisation.
They then formally proved the correctness of
a lexing algorithm by Sulzmann and Lu \cite{Sulzmann2014}
based on that specification.
In the next section we will very briefly
introduce Brzozowski derivatives and Sulzmann
and Lu's algorithm, which this thesis builds on.
We give a taste of what they
are like and why they are suitable for regular expression
matching and lexing.
\section{Our Solution--Formal Specification of POSIX Matching
and Brzozowski Derivatives}
Now we start with the central topic of the thesis: Brzozowski derivatives.
Brzozowski \cite{Brzozowski1964} first introduced the
concept of the \emph{derivative} in the 1960s.
The derivative of a regular expression $r$
with respect to a character $c$, is written as $r \backslash c$.\footnote{
Despite having the same name, regular expression
derivatives bear little similarity with the mathematical definition
of derivatives on functions.
}
It tells us what $r$ would transform into
if we chop off the first character $c$
from all strings in the language of $r$ ($L \; r$).
To give a flavour of Brzozowski derivatives, we present
two straightforward clauses from it:
\begin{center}
\begin{tabular}{lcl}
$d \backslash c$ & $\dn$ &
$\mathit{if} \;c = d\;\mathit{then}\;\ONE\;\mathit{else}\;\ZERO$\\
$(r_1 + r_2)\backslash c$ & $\dn$ & $r_1 \backslash c \,+\, r_2 \backslash c$\\
\end{tabular}
\end{center}
\noindent
The first clause says that for the regular expression
denoting a singleton set consisting of a single-character string $\{ d \}$,
we check the derivative character $c$ against $d$,
returning a set containing only the empty string $\{ [] \}$
if $c$ and $d$ are equal, and the empty set $\varnothing$ otherwise.
The second clause states that to obtain the regular expression
representing the strings of $r_1 + r_2$ with their head character $c$
chopped off, one simply needs to recursively take the derivatives
of $r_1$ and $r_2$ and then put them together.
Thanks to the definition, derivatives have the nice property
that $s \in L \; (r\backslash c)$ if and only if
$c::s \in L \; r$.
%This property can be used on regular expressions
%matching and lexing--to test whether a string $s$ is in $L \; r$,
%one simply takes derivatives of $r$ successively with
%respect to the characters (in the correct order) in $s$,
%and then test whether the empty string is in the last regular expression.
Derivatives give a simple solution
to the problem of matching and lexing a string $s$ with a regular
expression $r$: if the derivative of $r$ w.r.t.\ (in
succession) all the characters of the string matches the empty string,
then $r$ matches $s$ (and {\em vice versa}).
This makes formal reasoning about properties such
as correctness and complexity smooth and intuitive.
In fact, there have already been several mechanised proofs about them,
for example the one by Owens and Slind \cite{Owens2008} in HOL4,
another one by Krauss and Nipkow \cite{Nipkow98} in Isabelle/HOL, and
yet another in Coq by Coquand and Siles \cite{Coquand2012}.
In addition, one can extend the clauses to bounded repetitions
``for free'':
\begin{center}
\begin{tabular}{lcl}
$r^{\{n\}} \backslash c$ & $\dn$ & $r \backslash c \cdot
r^{\{n-1\}}$\\
\end{tabular}
\end{center}
\noindent
And experimental results suggest that, unlike DFA-based solutions,
derivatives can support
bounded regular expressions with large counters
quite well.
There have also been
extensions to other constructs.
For example, Owens et al.\ include the derivatives
for \emph{NOT} regular expressions, which are
able to concisely express C-style comments of the form
$/* \ldots */$.
Another extension for derivatives would be
regular expressions with look-aheads, done
by Miyazaki and Minamide
\cite{Takayuki2019}.
%We therefore use Brzozowski derivatives on regular expressions
%lexing
Given the above definitions and properties of
Brzozowski derivatives, one quickly realises their potential
in generating a formally verified algorithm for lexing---the clauses and property
can be easily expressed in a functional programming language
or converted to theorem prover
code, with great extensibility.
Perhaps this is the reason why derivatives have sparked quite a bit of interest
in the functional programming and theorem prover communities in the last
fifteen or so years
(\cite{Almeidaetal10}, \cite{Berglund14}, \cite{Berglund18},
\cite{Chen12} and \cite{Coquand2012}
to name a few), despite being buried in the ``sands of time'' \cite{Owens2008}
after they were first published.
However, there are two difficulties with derivative-based matchers:
First, Brzozowski's original matcher only generates a yes/no answer
for whether a regular expression matches a string or not. This is too
little information in the context of lexing where separate tokens must
be identified and also classified (for example as keywords
or identifiers).
Second, derivative-based matchers need to be more efficient.
Elegant and beautiful
as many implementations are,
they can be excruciatingly slow.
For example, Sulzmann and Lu
claim a linear running time of their proposed algorithm,
but that was falsified by our experiments. The running time
is actually $\Omega(2^n)$ in the worst case.
A similar claim about a theoretical runtime of $O(n^2)$
is made for the Verbatim \cite{Verbatim}
%TODO: give references
lexer, which calculates POSIX matches and is based on derivatives.
They formalized the correctness of the lexer, but not the complexity.
In the performance evaluation section, they simply analyzed the run time
of matching $a$ with the string
\begin{center}
$\underbrace{a \ldots a}_{\text{$n$ $a$'s}}$
\end{center}
and concluded that the algorithm is quadratic in terms of input length.
When we tried out their extracted OCaml code with our example $(a+aa)^*$,
the time it took to lex only 40 $a$'s was 5 minutes.
\subsection{Sulzmann and Lu's Algorithm}
Sulzmann and Lu~\cite{Sulzmann2014} overcame the first
difficulty by cleverly extending Brzozowski's matching
algorithm. Their extended version generates additional information on
\emph{how} a regular expression matches a string following the POSIX
rules for regular expression matching. They achieve this by adding a
second ``phase'' to Brzozowski's algorithm involving an injection
function, together with a simplification of internal data structures
aimed at eliminating the exponential behaviours.
In an earlier work, Ausaf et al.\ provided the formal
specification of what POSIX matching means and proved in Isabelle/HOL
the correctness
of Sulzmann and Lu's extended algorithm accordingly
\cite{AusafDyckhoffUrban2016}.
The version of the algorithm proven correct
suffers from the
second difficulty though, where the internal derivatives can
grow to arbitrarily big sizes.
For example if we start with the
regular expression $(a+aa)^*$ and take
successive derivatives according to the character $a$, we end up with
a sequence of ever-growing derivatives like
\def\ll{\stackrel{\_\backslash{} a}{\longrightarrow}}
\begin{center}
\begin{tabular}{rll}
$(a + aa)^*$ & $\ll$ & $(\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & $(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* \;+\; (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & $(\ZERO + \ZERO{}a + \ZERO) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^* \;+\; $\\
& & $\qquad(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & \ldots \hspace{15mm}(regular expressions of sizes 98, 169, 283, 468, 767, \ldots)
\end{tabular}
\end{center}
\noindent where after around 35 steps we run out of memory on a
typical computer (we shall define in the next chapter
the precise details of our
regular expressions and the derivative operation). Clearly, the
notation involving $\ZERO$s and $\ONE$s already suggests
simplification rules that can be applied to regular
expressions, for example $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r
\Rightarrow r$, $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow
r$. While such simple-minded simplifications have been proved in our
earlier work to preserve the correctness of Sulzmann and Lu's
algorithm \cite{AusafDyckhoffUrban2016}, they unfortunately do
\emph{not} help with limiting the growth of the derivatives shown
above: the growth is slowed, but the derivatives can still grow rather
quickly beyond any finite bound.
Sulzmann and Lu overcome this ``growth problem'' in a second algorithm
\cite{Sulzmann2014} where they introduce bit-coded
regular expressions. In this version, POSIX values are
represented as bit sequences and such sequences are incrementally generated
when derivatives are calculated. The compact representation
of bit sequences and regular expressions allows them to define a more
``aggressive'' simplification method that keeps the size of the
derivatives finite no matter what the length of the string is.
They make some informal claims about the correctness and linear behaviour
of this version, but do not provide any supporting proof arguments, not
even ``pencil-and-paper'' arguments. They write about their bit-coded
\emph{incremental parsing method} (that is the algorithm to be formalised
in this dissertation)
\begin{quote}\it
``Correctness Claim: We further claim that the incremental parsing
method [..] in combination with the simplification steps [..]
yields POSIX parse trees. We have tested this claim
extensively [..] but yet
have to work out all proof details.'' \cite[Page 14]{Sulzmann2014}
\end{quote}
Ausaf and Urban were able to back this correctness claim with
a formal proof.
However, a faster formally verified
lexing program with the optimisations
mentioned in Sulzmann and Lu's second algorithm
is still missing.
As they stated,
\begin{quote}\it
``The next step would be to implement a more aggressive simplification procedure on annotated regular expressions and then prove the corresponding algorithm generates the same values as blexer. Alas due to time constraints we are unable to do so here.''
\end{quote}
This thesis implements the aggressive simplifications envisioned
by Ausaf and Urban,
together with a formal proof of the correctness with those simplifications.
One of the most recent works in the context of lexing
%with this issue
is the Verbatim lexer by Egolf, Lasser and Fisher~\cite{Verbatim}.
This is relevant work for us and we will compare it later with
our derivative-based matcher we are going to present.
There is also some newer work called
Verbatim++~\cite{Verbatimpp}, which does not use derivatives,
but deterministic finite automata instead.
%An example that gives problem to automaton approaches would be
%the regular expression $(a|b)^*a(a|b)^{\{n\}}$.
%It requires at least $2^{n+1}$ states to represent
%as a DFA.
%----------------------------------------------------------------------------------------
\section{Contribution}
In this thesis,
we propose a solution to catastrophic
backtracking and error-prone matchers: a formally verified
regular expression lexing algorithm
that is both fast
and correct by extending Ausaf et al.'s work.
The end result is %a regular expression lexing algorithm that comes with
\begin{itemize}
\item
an improved version of Sulzmann and Lu's bit-coded algorithm using
derivatives with simplifications,
accompanied by
a proven correctness theorem according to the POSIX specification
given by Ausaf et al. \cite{AusafDyckhoffUrban2016},
\item
a complexity-related property for that algorithm saying that the
internal data structure will
remain finite,
\item
and an extension to
the bounded repetitions construct with the correctness and finiteness property
maintained.
\end{itemize}
With a formal finiteness bound in place,
we can greatly reduce the attack surface of servers in terms of ReDoS attacks.
Further improvements to the algorithm with an even stronger version of
simplification are made.
Thanks to our theorem-prover-friendly approach,
we believe that
this finiteness bound can be improved to a bound
linear in the input size and
cubic in the regular expression size using a technique by
Antimirov~\cite{Antimirov95}.
Once formalised, this would be a guarantee for the absence of all super-linear behaviours.
We are working out the
details.
To the best of our knowledge, no lexing libraries using Brzozowski derivatives
have similar complexity-related bounds,
and claims about running time are usually speculative and backed by empirical
evidence on a few test cases.
If a matching or lexing algorithm
does not come with certain basic complexity-related
guarantees (for example that the internal data structure size
does not grow indefinitely),
then one cannot claim with confidence to have solved the problem
of catastrophic backtracking.
\section{Structure of the thesis}
In Chapter~\ref{Inj} we will introduce the concepts
and notations we
use for describing the lexing algorithm by Sulzmann and Lu,
and then give the lexing algorithm.
We will give its variant in Chapter~\ref{Bitcoded1}.
Then we illustrate in Chapter~\ref{Bitcoded2}
how the algorithm without bitcodes falls short for such aggressive
simplifications and therefore introduce our version of the
bit-coded algorithm and
its correctness proof.
In Chapter~\ref{Finite} we give the second guarantee
of our bitcoded algorithm, that is a finite bound on the size of any
regex's derivatives.
In Chapter~\ref{Cubic} we discuss stronger simplifications to improve the finite bound
in Chapter~\ref{Finite} to a polynomial one, and demonstrate how one can extend the
algorithm to include constructs such as bounded repetitions and negations.
%----------------------------------------------------------------------------------------
%----------------------------------------------------------------------------------------
%----------------------------------------------------------------------------------------
%----------------------------------------------------------------------------------------