
% Chapter 1

\chapter{Introduction} % Main chapter title

\label{Introduction} % For referencing the chapter elsewhere, use \ref{Chapter1} 

%----------------------------------------------------------------------------------------

% Define some commands to keep the formatting separated from the content 
\newcommand{\keyword}[1]{\textbf{#1}}
\newcommand{\tabhead}[1]{\textbf{#1}}
\newcommand{\code}[1]{\texttt{#1}}
\newcommand{\file}[1]{\texttt{\bfseries#1}}
\newcommand{\option}[1]{\texttt{\itshape#1}}

%boxes
\newcommand*{\mybox}[1]{\framebox{\strut #1}}

%\newcommand{\sflataux}[1]{\textit{sflat}\_\textit{aux} \, #1}
\newcommand\sflat[1]{\llparenthesis #1 \rrparenthesis }
\newcommand{\ASEQ}[3]{\textit{ASEQ}_{#1} \, #2 \, #3}
\newcommand{\bderssimp}[2]{#1 \backslash_{bsimps} #2}
\newcommand{\rderssimp}[2]{#1 \backslash_{rsimps} #2}
\def\derssimp{\textit{ders}\_\textit{simp}}
\def\rders{\textit{rders}}
\newcommand{\bders}[2]{#1 \backslash #2}
\newcommand{\bsimp}[1]{\textit{bsimp}(#1)}
\def\bsimps{\textit{bsimp}}
\newcommand{\rsimp}[1]{\textit{rsimp}\; #1}
\newcommand{\sflataux}[1]{\llparenthesis #1 \rrparenthesis'}
\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%
\newcommand{\denote}{\stackrel{\mbox{\scriptsize denote}}{=}}%
\newcommand{\ZERO}{\mbox{\bf 0}}
\newcommand{\ONE}{\mbox{\bf 1}}
\newcommand{\AALTS}[2]{\oplus {\scriptstyle #1}\, #2}
\newcommand{\rdistinct}[2]{\textit{rdistinct} \;\; #1 \;\; #2}
\def\rdistincts{\textit{rdistinct}}
\def\rDistinct{\textit{rdistinct}}
\newcommand\hflat[1]{\llparenthesis  #1 \rrparenthesis_*}
\newcommand\hflataux[1]{\llparenthesis #1 \rrparenthesis_*'}
\newcommand\createdByStar[1]{\textit{createdByStar}(#1)}
\def\cbn{\textit{createdByNtimes}}
\def\hpa{\textit{highestPowerAux}}
\def\hpower{\textit{highestPower}}
\def\ntset{\textit{ntset}}
\def\optermsimp{\textit{optermsimp}}
\def\optermOsimp{\textit{optermOsimp}}
\def\optermosimp{\textit{optermosimp}}
\def\opterm{\textit{opterm}}
\def\nString{\textit{nonemptyString}}

\newcommand\myequiv{\mathrel{\stackrel{\makebox[0pt]{\mbox{\normalfont\tiny equiv}}}{=}}}

\def\SEQ{\textit{SEQ}}
\def\SEQs{\textit{SEQs}}
\def\case{\textit{case}}
\def\sequal{\stackrel{\mbox{\scriptsize rsimp}}{=}}
\def\rsimpalts{\textit{rsimp}_{ALTS}}
\def\good{\textit{good}}
\def\btrue{\textit{true}}
\def\bfalse{\textit{false}}
\def\bnullable{\textit{bnullable}}
\def\bnullables{\textit{bnullables}}
\def\Some{\textit{Some}}
\def\None{\textit{None}}
\def\code{\textit{code}}
\def\decode{\textit{decode}}
\def\internalise{\textit{internalise}}
\def\lexer{\mathit{lexer}}
\def\mkeps{\textit{mkeps}}
\newcommand{\rder}[2]{#2 \backslash_r #1}

\def\rerases{\textit{rerase}}

\def\nonnested{\textit{nonnested}}
\def\AZERO{\textit{AZERO}}
\def\sizeNregex{\textit{sizeNregex}}
\def\AONE{\textit{AONE}}
\def\ACHAR{\textit{ACHAR}}

\def\simpsulz{\textit{simp}_{Sulz}}

\def\scfrewrites{\stackrel{*}{\rightsquigarrow_{scf}}}
\def\frewrite{\rightsquigarrow_f}
\def\hrewrite{\rightsquigarrow_h}
\def\grewrite{\rightsquigarrow_g}
\def\frewrites{\stackrel{*}{\rightsquigarrow_f}}
\def\hrewrites{\stackrel{*}{\rightsquigarrow_h}}
\def\grewrites{\stackrel{*}{\rightsquigarrow_g}}
\def\fuse{\textit{fuse}}
\def\bder{\textit{bder}}
\def\der{\textit{der}}
\def\POSIX{\textit{POSIX}}
\def\ALTS{\textit{ALTS}}
\def\ASTAR{\textit{ASTAR}}
\def\DFA{\textit{DFA}}
\def\NFA{\textit{NFA}}
\def\bmkeps{\textit{bmkeps}}
\def\bmkepss{\textit{bmkepss}}
\def\retrieve{\textit{retrieve}}
\def\blexer{\textit{blexer}}
\def\flex{\textit{flex}}
\def\inj{\textit{inj}}
\def\Empty{\textit{Empty}}
\def\Left{\textit{Left}}
\def\Right{\textit{Right}}
\def\Stars{\textit{Stars}}
\def\Char{\textit{Char}}
\def\Seq{\textit{Seq}}
\def\Der{\textit{Der}}
\def\Ders{\textit{Ders}}
\def\nullable{\mathit{nullable}}
\def\Z{\mathit{Z}}
\def\S{\mathit{S}}
\def\rup{r^\uparrow}
%\def\bderssimp{\mathit{bders}\_\mathit{simp}}
\def\distinctWith{\textit{distinctWith}}
\def\lf{\textit{lf}}
\def\PD{\textit{PD}}
\def\suffix{\textit{Suffix}}
\def\distinctBy{\textit{distinctBy}}
\def\starupdate{\textit{starUpdate}}
\def\starupdates{\textit{starUpdates}}
\def\nupdate{\textit{nupdate}}
\def\nupdates{\textit{nupdates}}


\def\size{\mathit{size}}
\def\rexp{\mathbf{rexp}}
\def\simp{\mathit{simp}}
\def\simpALTs{\mathit{simp}\_\mathit{ALTs}}
\def\map{\mathit{map}}
\def\distinct{\mathit{distinct}}
\def\blexersimp{\mathit{blexer}\_\mathit{simp}}
\def\blexerStrong{\textit{blexerStrong}}
\def\bsimpStrong{\textit{bsimpStrong}}
\def\bdersStrongs{\textit{bdersStrong}}
\newcommand{\bdersStrong}[2]{#1 \backslash_{bsimpStrongs} #2}

\def\map{\textit{map}}
\def\rrexp{\textit{rrexp}}
\newcommand\rnullable[1]{\textit{rnullable} \; #1 }
\newcommand\rsize[1]{\llbracket #1 \rrbracket_r}
\newcommand\asize[1]{\llbracket #1 \rrbracket}
\newcommand\rerase[1]{ (#1)_{\downarrow_r}}

\newcommand\ChristianComment[1]{\textcolor{blue}{#1}\\}


\def\rflts{\textit{rflts}}
\def\rrewrite{\textit{rrewrite}}
\def\bsimpalts{\textit{bsimp}_{ALTS}}
\def\bsimpaseq{\textit{bsimp}_{ASEQ}}
\def\rsimlalts{\textit{rsimp}_{ALTs}}
\def\rsimpseq{\textit{rsimp}_{SEQ}}

\def\erase{\textit{erase}}
\def\STAR{\textit{STAR}}
\def\flts{\textit{flts}}


\def\zeroable{\textit{zeroable}}
\def\nub{\textit{nub}}
\def\filter{\textit{filter}}
%\def\not{\textit{not}}



\def\RZERO{\mathbf{0}_r }
\def\RONE{\mathbf{1}_r}
\newcommand\RCHAR[1]{\mathbf{#1}_r}
\newcommand\RSEQ[2]{#1 \cdot #2}
\newcommand\RALTS[1]{\sum #1}
\newcommand\RSTAR[1]{#1^*}
\newcommand\vsuf[2]{\textit{Suffix} \;#1\;#2}




\lstdefinestyle{myScalastyle}{
  frame=tb,
  language=scala,
  aboveskip=3mm,
  belowskip=3mm,
  showstringspaces=false,
  columns=flexible,
  basicstyle={\small\ttfamily},
  numbers=none,
  numberstyle=\tiny\color{gray},
  keywordstyle=\color{blue},
  commentstyle=\color{dkgreen},
  stringstyle=\color{mauve},
  frame=single,
  breaklines=true,
  breakatwhitespace=true,
  tabsize=3,
}


%----------------------------------------------------------------------------------------
%This part is about regular expressions, Brzozowski derivatives,
%and a bit-coded lexing algorithm with proven correctness and time bounds.

%TODO: look up snort rules to use here--give readers idea of what regexes look like


Regular expressions, since their inception in the 1940s, 
have been subject to extensive study and implementation. 
Their primary application lies in lexing, 
a process whereby, guided by regular expressions, an unstructured input string 
is disassembled into a tree of tokens.
This is often used to match strings that consist of numerous fields, 
where certain fields may recur or be omitted. 
For example, the regular expression for recognising email addresses is
\begin{center}
\verb|^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$|.
\end{center}
Using this, regular expression matchers and lexers are able to extract, 
for example, the domain name by using the sub-expression \verb|[a-zA-Z0-9.-]+|. 
Consequently, they are an indispensable component of text-processing tools 
in software systems such as compilers, IDEs, and firewalls.

The study of regular expressions is ongoing due to an
issue known as catastrophic backtracking. 
This phenomenon refers to scenarios in which the regular expression 
matching or lexing engine exhibits a disproportionately long 
runtime despite the simplicity of the input and expression.

The origin of catastrophic backtracking is rooted in the 
ambiguities of lexing. 
In the process of matching a multi-character string with 
a regular expression that encompasses several sub-expressions, 
different positions can be designated to mark 
the boundaries of corresponding substrings of the sub-expressions. 
For instance, in matching the string $aaa$ with the 
regular expression $(a+aa)^*$, the divide between 
the initial match and the subsequent iteration could either be 
set between the first and second characters ($a | aa$) or between the second and third characters ($aa | a$). As both the length of the input string and the structural complexity of the regular expression increase, the number of potential delimiter combinations can grow exponentially, leading to a corresponding increase in complexity for algorithms that do not optimally navigate these possibilities.

Catastrophic backtracking facilitates a category of computationally inexpensive attacks known as Regular Expression Denial of Service (ReDoS) attacks. Here, an attacker merely dispatches a small attack string to a server, provoking high-complexity behaviours in the server's regular expression engine. Such attacks, whether intentional or accidental, have led to the failure of real-world systems (additional details can be found in the practical overview section in chapter \ref{Introduction}).

Various disambiguation strategies are employed to select sub-matches, notably including Greedy and POSIX strategies. POSIX, the strategy most widely adopted in practice, invariably opts for the longest possible sub-match. Kuklewicz \cite{KuklewiczHaskell}, for instance, offers a descriptive definition of the POSIX rule in section 1, last paragraph:


%Regular expressions 
%have been extensively studied and
%implemented since their invention in 1940s.
%It is primarily used in lexing, where an unstructured input string is broken
%down into a tree of tokens.
%That tree's construction is guided by the shape of the regular expression.
%This is particularly useful in expressing the shape of a string
%with many fields, where certain fields might repeat or be omitted.
%Regular expression matchers and Lexers allow us to 
%identify and delimit different subsections of a string and potentially 
%extract information from inputs, making them
%an indispensible component in modern software systems' text processing tasks
%such as compilers, IDEs, and firewalls.
%Research on them is far from over due to the well-known issue called catastrophic-backtracking,
%which means the regular expression matching or lexing engine takes an unproportional time to run 
%despite the input and the expression being relatively simple.
%
%Catastrophic backtracking stems from the ambiguities of lexing: 
%when matching a multiple-character string with a regular 
%exression that includes serveral sub-expressions, there might be different positions to set 
%the border between sub-expressions' corresponding sub-strings.
%For example, matching the string $aaa$ against the regular expression
%$(a+aa)^*$, the border between the initial match and the second iteration
%could be between the first and second character ($a | aa$) 
%or between the second and third character ($aa | a$).
%As the size of the input string and the structural complexity 
%of the regular expression increase,
%the number of different combinations of delimiters can grow exponentially, and
%algorithms that explore these possibilities unwisely will also see an exponential complexity.
%
%Catastrophic backtracking allow a class of computationally inexpensive attacks called
%Regular expression Denial of Service attacks (ReDoS), in which the hacker 
%simply sends out a small attack string to a server, 
%triggering high-complexity behaviours in its regular expression engine. 
%These attacks, be it deliberate or not, have caused real-world systems to go down (see more 
%details of this in the practical overview section in chapter \ref{Introduction}).
%There are different disambiguation strategies to select sub-matches, most notably Greedy and POSIX.
%The widely adopted strategy in practice is POSIX, which always go for the longest sub-match possible.
%There have been prose definitions like the following
%by Kuklewicz \cite{KuklewiczHaskell}: 
%described the POSIX rule as (section 1, last paragraph):
\begin{quote}
	\begin{itemize}
		\item
regular expressions (REs) take the leftmost starting match, and the longest match starting there\\
\item
earlier subpatterns have leftmost-longest priority over later subpatterns\\
\item
higher-level subpatterns have leftmost-longest priority over their component subpatterns\\
\item
REs have right associative concatenation which can be changed with parenthesis\\
\item
parenthesized subexpressions return the match from their last usage\\
\item
text of component subexpressions must be contained in the text of the 
higher-level subexpressions\\
\item
if "p" and "q" can never match the same text then "p|q" and "q|p" are equivalent, up to trivial renumbering of captured subexpressions\\
\item
if "p" in "p*" is used to capture non-empty text then additional repetitions of "p" will not capture an empty string\\
\end{itemize}
\end{quote}
However, the author noted that lexers are rarely correct according to this standard.
It would therefore be valuable to formally define and capture the idea of the POSIX rules
and of the matching/lexing algorithms, and to prove
the correctness of such algorithms against the POSIX semantics.

Formal proofs are machine-checked programs,
such as the ones written in Isabelle/HOL. They are a powerful means 
for computer scientists to be certain about the correctness of their algorithms and correctness properties.
This certainty is obtained by mechanically checking that the end goal is derived solely from a list of axioms, definitions
and proof rules that are known to be correct.
Regular expression lexing and catastrophic backtracking are suitable problems for such
methods, as their implementation and complexity analysis tend to be error-prone.

There have already been many interesting steps taken in the direction of formal proofs
about regular expression lexing and matching.




%Regular expressions are widely used in computer science: 
%be it in text-editors \parencite{atomEditor} with syntax highlighting and auto-completion;
%command-line tools like $\mathit{grep}$ that facilitate easy 
%text-processing \cite{grep}; network intrusion
%detection systems that inspect suspicious traffic; or compiler
%front ends.
%Given their usefulness and ubiquity, one would assume that
%modern regular expression matching implementations
%are mature and fully studied.
%Indeed, in many popular programming languages' regular expression engines, 
%one can supply a regular expression and a string, and in most cases get
%get the matching information in a very short time.
%Those engines can sometimes be blindingly fast--some 
%network intrusion detection systems
%use regular expression engines that are able to process 
%hundreds of megabytes or even gigabytes of data per second \parencite{Turo_ov__2020}.
%However, those engines can sometimes also exhibit a surprising security vulnerability
%under a certain class of inputs.
%However, , this is not the case for $\mathbf{all}$ inputs.
%TODO: get source for SNORT/BRO's regex matching engine/speed


Consider for example the regular expression $(a^*)^*\,b$ and 
strings of the form $aa..a$. These strings cannot be matched by this regular
expression: Obviously the expected $b$ in the last
position is missing. One would assume that modern regular expression
matching engines can find this out very quickly. Surprisingly, if one tries
this example in JavaScript, Python or Java 8, even with small strings, 
say of length around 30 $a$'s,
the decision takes an absurd amount of time to finish (see graphs in figure \ref{fig:aStarStarb}).
The algorithms clearly exhibit exponential behaviour, which, as can be seen,
is triggered by relatively simple regular expressions.
Java 9 and newer
versions improve this behaviour somewhat, but are still slow compared 
with the approach we are going to use in this thesis.
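This behaviour can be reproduced with a few lines of code. For instance, the
following small Scala sketch uses the JVM's built-in backtracking
\texttt{java.util.regex} engine (essentially the Java behaviour shown in
figure \ref{fig:aStarStarb}); the exact timings, and the value of $n$ at which
the blow-up becomes visible, depend on the JVM version and the machine:
\begin{lstlisting}[style=myScalastyle]
import java.util.regex.Pattern

// time how long the backtracking engine takes to decide that
// a^n does NOT match (a*)*b
object EvilRegexDemo extends App {
  val pattern = Pattern.compile("(a*)*b")
  for (n <- 20 to 32) {
    val input = "a" * n
    val start = System.nanoTime()
    val result = pattern.matcher(input).matches()   // always false here
    val secs = (System.nanoTime() - start) / 1e9
    println(f"n = $n%2d  matches = $result%-5s  time = $secs%.3f s")
  }
}
\end{lstlisting}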



This superlinear blowup in regular expression engines
has caused grief in ``real life'' where it is 
given the name ``catastrophic backtracking'' or ``evil'' regular expressions.
For example, on 20 July 2016 one evil
regular expression brought the webpage
\href{http://stackexchange.com}{Stack Exchange} to its
knees.\footnote{\url{https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}(Last accessed in 2019)}
In this instance, a regular expression intended to just trim white
spaces from the beginning and the end of a line actually consumed
massive amounts of CPU resources---causing the web servers to grind to a
halt. In this example, the time needed to process
the string was 
$O(n^2)$ with respect to the string length $n$. This
quadratic overhead was enough for the homepage of Stack Exchange to
respond so slowly that the load balancer assumed a $\mathit{DoS}$ 
attack and therefore stopped the servers from responding to any
requests. This made the whole site unavailable. 

\begin{figure}[p]
\begin{center}
\begin{tabular}{@{}c@{\hspace{0mm}}c@{}}
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    ylabel={time in secs},
    enlargelimits=false,
    xtick={0,5,...,30},
    xmax=33,
    ymax=35,
    ytick={0,5,...,30},
    scaled ticks=false,
    axis lines=left,
    width=5cm,
    height=4cm, 
    legend entries={JavaScript},  
    legend pos=north west,
    legend cell align=left]
\addplot[red,mark=*, mark options={fill=white}] table {re-js.data};
\end{axis}
\end{tikzpicture}
  &
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    %ylabel={time in secs},
    enlargelimits=false,
    xtick={0,5,...,30},
    xmax=33,
    ymax=35,
    ytick={0,5,...,30},
    scaled ticks=false,
    axis lines=left,
    width=5cm,
    height=4cm, 
    legend entries={Python},  
    legend pos=north west,
    legend cell align=left]
\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
\end{axis}
\end{tikzpicture}\\ 
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    ylabel={time in secs},
    enlargelimits=false,
    xtick={0,5,...,30},
    xmax=33,
    ymax=35,
    ytick={0,5,...,30},
    scaled ticks=false,
    axis lines=left,
    width=5cm,
    height=4cm, 
    legend entries={Java 8},  
    legend pos=north west,
    legend cell align=left]
\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
\end{axis}
\end{tikzpicture}
  &
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    %ylabel={time in secs},
    enlargelimits=false,
    xtick={0,5,...,30},
    xmax=33,
    ymax=35,
    ytick={0,5,...,30},
    scaled ticks=false,
    axis lines=left,
    width=5cm,
    height=4cm, 
    legend entries={Dart},  
    legend pos=north west,
    legend cell align=left]
\addplot[green,mark=*, mark options={fill=white}] table {re-dart.data};
\end{axis}
\end{tikzpicture}\\ 
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    ylabel={time in secs},
    enlargelimits=false,
    xtick={0,5,...,30},
    xmax=33,
    ymax=35,
    ytick={0,5,...,30},
    scaled ticks=false,
    axis lines=left,
    width=5cm,
    height=4cm, 
    legend entries={Swift},  
    legend pos=north west,
    legend cell align=left]
\addplot[purple,mark=*, mark options={fill=white}] table {re-swift.data};
\end{axis}
\end{tikzpicture}
  & 
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    %ylabel={time in secs},
    enlargelimits=true,
    %xtick={0,5000,...,40000},
    %xmax=40000,
    %ymax=35,
    restrict x to domain*=0:40000,
    restrict y to domain*=0:35,
    %ytick={0,5,...,30},
    %scaled ticks=false,
    axis lines=left,
    width=5cm,
    height=4cm, 
    legend entries={Java9+},  
    legend pos=north west,
    legend cell align=left]
\addplot[orange,mark=*, mark options={fill=white}] table {re-java9.data};
\end{axis}
\end{tikzpicture}\\ 
\multicolumn{2}{c}{Graphs}
\end{tabular}    
\end{center}
\caption{Graphs showing runtime for matching $(a^*)^*\,b$ with strings 
           of the form $\protect\underbrace{aa..a}_{n}$ in various existing regular expression libraries.
   The reason for their superlinear behaviour is that they do a depth-first search
   using NFAs.
   If the string does not match, the regular expression matching
   engine starts to explore all possibilities. 
}\label{fig:aStarStarb}
\end{figure}\afterpage{\clearpage}

A more recent example is a global outage of all Cloudflare servers on 2 July
2019. A poorly written regular expression exhibited catastrophic backtracking
and exhausted CPUs that serve HTTP traffic. Although the outage
had several causes, at the heart was a regular expression that
was used to monitor network
traffic.\footnote{\url{https://blog.cloudflare.com/details-of-the-cloudflare-outage-on-july-2-2019/}(Last accessed in 2022)}
These problems with regular expressions 
are not isolated events that happen
very rarely; 
%but actually widespread.
%They occur so often that they have a 
they actually occur often enough that they have a
name: Regular-Expression-Denial-of-Service (ReDoS)
attacks.
Davis et al. \cite{Davis18} detected more
than 1000 evil regular expressions
in Node.js, Python core libraries, npm and pypi. 
They therefore concluded that evil regular expressions
are a real problem rather than just ``a parlour trick''.

The work in this thesis aims to address this issue
with the help of formal proofs.
We describe a lexing algorithm based
on Brzozowski derivatives with verified correctness 
and a finiteness property for the size of derivatives
(all formalised in Isabelle/HOL).
Such properties %guarantee the absence of 
are an important step in preventing
catastrophic backtracking once and for all.
We will give more details in the next sections
on (i) why the slow cases in figure \ref{fig:aStarStarb}
can occur in traditional regular expression engines
and (ii) why we choose our 
approach based on Brzozowski derivatives and formal proofs.


\section{Preliminaries}%Regex, and the Problems with Regex Matchers}
Regular expressions and regular expression matchers 
have clearly been studied for many, many years.
Theoretical results in automata theory state 
that basic regular expression matching should be linear
w.r.t.\ the input.
This assumes that the regular expression
$r$ was pre-processed and turned into a
deterministic finite automaton (DFA) before matching \cite{Sakarovitch2009}.
By basic we mean textbook definitions such as the one
below, involving only regular expressions for characters, alternatives,
sequences, and Kleene stars:
\[
	r ::= c | r_1 + r_2 | r_1 \cdot r_2 | r^*
\]
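To make this definition concrete, below is a minimal sketch of a datatype
for such basic regular expressions in Scala (we also include constants for
$\ZERO$ and $\ONE$, the regular expressions for the empty language and the
empty string, which become necessary once derivatives are defined; the
constructor names are our own choice for this illustration):
\begin{lstlisting}[style=myScalastyle]
// basic regular expressions as an algebraic datatype
abstract class Rexp
case object ZERO extends Rexp                      // matches no string
case object ONE extends Rexp                       // matches only the empty string
case class CHAR(c: Char) extends Rexp              // matches the character c
case class ALT(r1: Rexp, r2: Rexp) extends Rexp    // alternative  r1 + r2
case class SEQ(r1: Rexp, r2: Rexp) extends Rexp    // sequence     r1 . r2
case class STAR(r: Rexp) extends Rexp              // Kleene star  r*
\end{lstlisting}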
Modern regular expression matchers used by programmers,
however,
support much richer constructs, such as bounded repetitions,
negations,
and back-references.
To differentiate, we use the word \emph{regex} to refer
to those expressions with richer constructs while reserving the
term \emph{regular expression}
for the more traditional meaning in formal languages theory.
We follow this convention 
in this thesis.
In the future, we aim to support all the popular features of regexes, 
but for this work we mainly look at basic regular expressions
and bounded repetitions.



%Most modern regex libraries
%the so-called PCRE standard (Peral Compatible Regular Expressions)
%has the back-references
Regexes come with a number of constructs
that make it more convenient for 
programmers to write regular expressions.
Depending on the type of construct,
the task of matching and lexing with them
has different levels of complexity.
Some of those constructs are merely syntactic sugar, that is,
shorthand notations
that save the programmer a few keystrokes.
These do not cause problems for regex libraries.
For example the
non-binary alternative involving three or more choices just means:
\[
	(a | b | c) \stackrel{means}{=} ((a + b) + c)
\]
Similarly, the range operator
%used to express the alternative
%of all characters between its operands, 
is just a concise way
of expressing an alternative of consecutive characters:
\[
	[0\text{-}9]\stackrel{means}{=} (0 | 1 | \ldots | 9 )  
\]
The
wildcard character '$.$' is used to refer to any single character,
\[
	. \stackrel{means}{=} [0-9a-zA-Z+-()*\&\ldots]
\]
except the newline.
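As an illustration of how such syntactic sugar can be translated away, the
following Scala sketch (reusing the hypothetical \texttt{Rexp} datatype from
above; the helper names are our own) desugars non-binary alternatives and
character ranges into basic regular expressions:
\begin{lstlisting}[style=myScalastyle]
// desugar a non-binary alternative (r1 | r2 | ... | rn) into nested ALTs
def ALTS(rs: List[Rexp]): Rexp = rs match {
  case Nil       => ZERO
  case r :: Nil  => r
  case r :: rest => ALT(r, ALTS(rest))
}

// desugar a character range such as [0-9] into an alternative of
// all characters between the two bounds
def RANGE(from: Char, to: Char): Rexp =
  ALTS((from to to).toList.map(CHAR))
\end{lstlisting}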

\subsection{Bounded Repetitions}
More interesting are bounded repetitions, which can 
make the regular expressions much
more compact.
Normally there are four kinds of bounded repetitions:
$r^{\{n\}}$, $r^{\{\ldots m\}}$, $r^{\{n\ldots \}}$ and $r^{\{n\ldots m\}}$
(where $n$ and $m$ are constant natural numbers).
Like the star regular expressions, the set of strings or language
a bounded regular expression can match
is defined using the power operation on sets:
\begin{center}
	\begin{tabular}{lcl}
		$L \; r^{\{n\}}$ & $\dn$ & $(L \; r)^n$\\
		$L \; r^{\{\ldots m\}}$ & $\dn$ & $\bigcup_{0 \leq i \leq m}. (L \; r)^i$\\
		$L \; r^{\{n\ldots \}}$ & $\dn$ & $\bigcup_{n \leq i}. (L \; r)^i$\\
		$L \; r^{\{n \ldots m\}}$ & $\dn$ & $\bigcup_{n \leq i \leq m}. (L \; r)^i$
	\end{tabular}
\end{center}
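For example, under these definitions $L \; (a^{\{2\ldots 3\}}) = \{aa,\, aaa\}$,
whereas $L \; (a^{\{2\}}) = \{aa\}$.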
The attraction of bounded repetitions is that they can be
used to avoid a size blow up: for example $r^{\{n\}}$
is a shorthand for
the much longer regular expression:
\[
	\underbrace{r\ldots r}_\text{n copies of r}.
\]
%Therefore, a naive algorithm that simply unfolds
%them into their desugared forms
%will suffer from at least an exponential runtime increase.
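A naive treatment would translate a bounded repetition away by literally
performing this unfolding; a sketch in Scala (using the hypothetical
\texttt{Rexp} datatype from before) is:
\begin{lstlisting}[style=myScalastyle]
// naive unfolding of r{n} into n sequenced copies of r
def unfold(r: Rexp, n: Int): Rexp =
  if (n == 0) ONE else SEQ(r, unfold(r, n - 1))
\end{lstlisting}
The size of the unfolded expression grows linearly with the counter $n$
(and multiplicatively when counters are nested), which is why
implementations try to avoid this expansion.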


The problem with matching 
such bounded repetitions
is that tools based on the classic notion of
automata need to expand $r^{\{n\}}$ into $n$ connected 
copies of the automaton for $r$. This leads to very inefficient matching
algorithms  or algorithms that consume large amounts of memory.
Implementations using $\DFA$s will
in such situations
either become excruciatingly slow 
(for example Verbatim++ \cite{Verbatimpp}) or run
out of memory (for example $\mathit{LEX}$ and 
$\mathit{JFLEX}$\footnote{LEX and JFLEX are lexer generators
in C and JAVA that generate $\mathit{DFA}$-based
lexers. The user provides a set of regular expressions
and configurations, and then 
gets an output program encoding a minimized $\mathit{DFA}$
that can be compiled and run. 
When given the above countdown regular expression,
a small $n$ (say 20) would result in a program representing a
DFA
with millions of states.}) for large counters.
A classic example for this phenomenon is the regular expression $(a+b)^*  a (a+b)^{n}$
where the minimal DFA requires at least $2^{n+1}$ states.
For example, when $n$ is equal to 2,
the corresponding $\mathit{NFA}$ looks like:
\vspace{6mm}
\begin{center}
\begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto] 
   \node[state,initial] (q_0)   {$q_0$}; 
   \node[state, red] (q_1) [right=of q_0] {$q_1$}; 
   \node[state, red] (q_2) [right=of q_1] {$q_2$}; 
   \node[state, accepting, red](q_3) [right=of q_2] {$q_3$};
    \path[->] 
    (q_0) edge  node {a} (q_1)
    	  edge [loop below] node {a,b} ()
    (q_1) edge  node  {a,b} (q_2)
    (q_2) edge  node  {a,b} (q_3);
\end{tikzpicture}
\end{center}
and when turned into a DFA by the subset construction
requires at least $2^3$ states.\footnote{The 
red states are ``countdown states'' which count down 
the number of characters needed in addition to the current
string to make a successful match.
For example, state $q_1$ indicates a match that has
gone past the $(a|b)^*$ part of $(a|b)^*a(a|b)^{\{2\}}$,
and has just consumed the ``delimiter'' $a$ in the middle, and 
needs to match 2 more iterations of $(a|b)$ to complete.
State $q_2$ on the other hand, can be viewed as a state
after $q_1$ has consumed 1 character, and just waits
for 1 more character to complete.
The state $q_3$ is the last (accepting) state, requiring 0 
more characters.
Depending on the suffix of the
input string up to the current read location,
the states $q_1$ and $q_2$, $q_3$
may or may
not be active.
A $\mathit{DFA}$ for such an $\mathit{NFA}$ would
contain at least $2^3$ non-equivalent states that cannot be merged, 
because the subset construction during determinisation will generate
all the elements in the power set $\mathit{Pow}\{q_1, q_2, q_3\}$.
Generalising this to regular expressions with larger
bounded repetition numbers, we have that
regexes shaped like $r^*ar^{\{n\}}$, when converted to $\mathit{DFA}$s,
would require at least $2^{n+1}$ states, if $r$ itself can match
more than one string.
This is to represent all different 
scenarios in which ``countdown'' states are active.}


Bounded repetitions are important because they
tend to occur frequently in practical use,
for example in the regex library RegExLib, in
the rules library of Snort \cite{Snort1999}\footnote{
Snort is a network intrusion detection (NID) tool
for monitoring network traffic.
The network security community curates a list
of malicious patterns written as regexes,
which is used by Snort's detection engine
to match against network traffic for any hostile
activities such as buffer overflow attacks.}, 
as well as in XML Schema definitions (XSDs).
According to Bj\"{o}rklund et al.\ \cite{xml2015},
more than half of the 
XSDs they found on the Maven.org central repository
have bounded regular expressions in them.
Often the counters are quite large, with the largest being
close to ten million. 
A smaller sample XSD they gave
is:
\lstset{
	basicstyle=\fontsize{8.5}{9}\ttfamily,
  language=XML,
  morekeywords={encoding,
    xs:schema,xs:element,xs:complexType,xs:sequence,xs:attribute}
}
\begin{lstlisting}
<sequence minOccurs="0" maxOccurs="65535">
	<element name="TimeIncr" type="mpeg7:MediaIncrDurationType"/>
 	<element name="MotionParams" type="float" minOccurs="2" maxOccurs="12"/>
</sequence>
\end{lstlisting}
This can be seen as the regex
$(ab^{2\ldots 12})^{0 \ldots 65535}$, where $a$ and $b$ are themselves
regular expressions 
satisfying certain constraints (such as 
conforming to the floating point number format).
It is therefore quite unsatisfying that 
some regular expression matching libraries
impose ad hoc limits
on bounded regular expressions:
for example, in the regular expression library of the Go
language the regular expression $a^{\{1001\}}$ is not permitted, because no counter
can be above 1000, and in the built-in Rust regular expression library
expressions such as $a^{\{1000\}\{100\}\{5\}}$ give an error message
for being too big. 
As Becchi and Crawley \cite{Becchi08} have pointed out,
the reason for these restrictions
is that such libraries simulate a non-deterministic finite
automaton (NFA) with a breadth-first search.
This way the number of active states can
be as large as the counter value.
When the counters are large, 
the memory requirements become
infeasible, and a regex engine
like Go's will reject such a pattern straight away.
\begin{figure}[H]
\begin{center}
\begin{tikzpicture} [node distance = 2cm, on grid, auto]
 
    	\node (q0) [state, initial] {$0$};
	\node (q1) [state, right = of q0] {$1$};
	%\node (q2) [state, right = of q1] {$2$};
	\node (qdots) [right = of q1] {$\ldots$};
	\node (qn) [state, right = of qdots] {$n$};
	\node (qn1) [state, right = of qn] {$n+1$};
	\node (qn2) [state, right = of qn1] {$n+2$};
	\node (qn3) [state, accepting, right = of qn2] {$n+3$}; 
 
\path [-stealth, thick]
	(q0) edge [loop above] node {a} ()
    (q0) edge node {a}   (q1) 
    %(q1) edge node {.}   (q2)
    (q1) edge node {.}   (qdots)
    (qdots) edge node {.} (qn)
    (qn) edge node {.} (qn1)
    (qn1) edge node {b} (qn2)
    (qn2) edge node {$c$} (qn3);
\end{tikzpicture}
%\begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto] 
%   \node[state,initial] (q_0)   {$0$}; 
%   \node[state, ] (q_1) [right=of q_0] {$1$}; 
%   \node[state, ] (q_2) [right=of q_1] {$2$}; 
%   \node[state,
%   \node[state, accepting, ](q_3) [right=of q_2] {$3$};
%    \path[->] 
%    (q_0) edge  node {a} (q_1)
%    	  edge [loop below] node {a,b} ()
%    (q_1) edge  node  {a,b} (q_2)
%    (q_2) edge  node  {a,b} (q_3);
%\end{tikzpicture}
\end{center}
\caption{The example given by Becchi and Crawley
	that NFA simulation can consume large
	amounts of memory: $.^*a.^{\{n\}}bc$ matching
	strings of the form $aaa\ldots aaaabc$.
	When traversing in a breadth-first manner,
all states from 0 till $n+1$ will become active.}
\end{figure}
%Languages like $\mathit{Go}$ and $\mathit{Rust}$ use this
%type of $\mathit{NFA}$ simulation and guarantees a linear runtime
%in terms of input string length.
%TODO:try out these lexers
These problems can of course be solved in matching algorithms where 
automata go beyond the classic notion and for instance include explicit
counters \cite{Turo_ov__2020}.
These solutions can be quite efficient,
with the ability to process
gigabits of input per second
even with large counters \cite{Becchi08}.
These practical solutions, however, do not come with
formal guarantees and, as pointed out by
Kuklewicz \cite{KuklewiczHaskell}, can be error-prone.
%But formal reasoning about these automata especially in Isabelle 
%can be challenging
%and un-intuitive. 
%Therefore, we take correctness and runtime claims made about these solutions
%with a grain of salt.

In the work reported in \cite{FoSSaCS2023} and here, 
we add better support, using derivatives,
for bounded regular expressions such as $r^{\{n\}}$.
Our results
extend straightforwardly to
repetitions with intervals such as 
$r^{\{n\ldots m\}}$.
The merit of Brzozowski derivatives (more on this later)
for this problem is that
they can be naturally extended to support bounded repetitions.
Moreover these extensions are still made up of only small
inductive datatypes and recursive functions,
making it handy to deal with them in theorem provers.
%The point here is that Brzozowski derivatives and the algorithms by Sulzmann and Lu can be
%straightforwardly extended to deal with bounded regular expressions
%and moreover the resulting code still consists of only simple
%recursive functions and inductive datatypes.
Finally, bounded regular expressions do not destroy our finite
boundedness property, which we shall prove later on.





\subsection{Back-References}
The other way to simulate an $\mathit{NFA}$ for matching is choosing  
a single transition each time, keeping all the other options in 
a queue or stack, and backtracking if that choice eventually 
fails. 
This method, often called a ``depth-first search'', 
is efficient in many cases, but can end up
with exponential run time.
The backtracking method is employed in regex libraries
that support \emph{back-references}, for example
in Java and Python.
%\section{Back-references and The Terminology Regex}

%When one constructs an $\NFA$ out of a regular expression
%there is often very little to be done in the first phase, one simply 
%construct the $\NFA$ states based on the structure of the input regular expression.

%In the lexing phase, one can simulate the $\mathit{NFA}$ running in two ways:
%one by keeping track of all active states after consuming 
%a character, and update that set of states iteratively.
%This can be viewed as a breadth-first-search of the $\mathit{NFA}$
%for a path terminating
%at an accepting state.



Consider the following regular expression where the sequence
operator is omitted for brevity:
\begin{center}
	$r_1r_2r_3r_4$
\end{center}
In this example,
one could label sub-expressions of interest 
by parenthesizing them and giving 
them a number in the order in which their opening parentheses appear.
One possible way of parenthesizing and labelling is: 
\begin{center}
	$\underset{1}{(}r_1\underset{2}{(}r_2\underset{3}{(}r_3)\underset{4}{(}r_4)))$
\end{center}
The sub-expressions
$r_1r_2r_3r_4$, $r_2r_3r_4$, $r_3$ and $r_4$ are labelled
by 1 to 4, and can be ``referred back'' to by their respective numbers. 
%These sub-expressions are called "capturing groups".
To do so, one uses the syntax $\backslash i$ 
to denote that we want the sub-string 
of the input matched by the $i$-th
sub-expression to appear again, 
exactly the same as it first appeared: 
\begin{center}
$\ldots\underset{\text{i-th lparen}}{(}{r_i})\ldots 
\underset{s_i \text{ which just matched} \;r_i}{\backslash i} \ldots$
\end{center}
Once the sub-string $s_i$ for the sub-expression $r_i$
has been fixed, there is no variability in what the back-reference
label $\backslash i$ can match---it is tied to $s_i$.
The overall string will look like $\ldots s_i \ldots s_i \ldots $
%The backslash and number $i$ are the
%so-called "back-references".
%Let $e$ be an expression made of regular expressions 
%and back-references. $e$ contains the expression $e_i$
%as its $i$-th capturing group.
%The semantics of back-reference can be recursively
%written as:
%\begin{center}
%	\begin{tabular}{c}
%		$L ( e \cdot \backslash i) = \{s @ s_i \mid s \in L (e)\quad s_i \in L(r_i)$\\
%		$s_i\; \text{match of ($e$, $s$)'s $i$-th capturing group string}\}$
%	\end{tabular}
%\end{center}
A concrete example
for back-references is
\begin{center}
$(.^*)\backslash 1$,
\end{center}
which matches
strings that can be split into two identical halves,
for example $\mathit{foofoo}$, $\mathit{ww}$ and so on.
Note that this is different from 
repeating the  sub-expression verbatim like
\begin{center}
	$(.^*)(.^*)$,
\end{center}
which does not impose any restrictions on what strings the second 
sub-expression $.^*$
might match.
Another example for back-references is
\begin{center}
$(.)(.)\backslash 2\backslash 1$
\end{center}
which matches four-character palindromes
like $abba$, $x??x$ and so on.
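These examples can be tried out directly with the JVM's
\texttt{java.util.regex} engine, which supports back-references; a small
hypothetical Scala snippet:
\begin{lstlisting}[style=myScalastyle]
import java.util.regex.Pattern

// (.*)\1 : strings consisting of two identical halves
val halves = Pattern.compile("(.*)\\1")
println(halves.matcher("foofoo").matches())   // true
println(halves.matcher("foobar").matches())   // false

// (.)(.)\2\1 : four-character palindromes
val palindrome = Pattern.compile("(.)(.)\\2\\1")
println(palindrome.matcher("abba").matches()) // true
println(palindrome.matcher("abab").matches()) // false
\end{lstlisting}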

Back-references are a regex construct 
that programmers find quite useful.
According to Becchi and Crawley \cite{Becchi08},
6\% of Snort rules (up until 2008) use them.
The most common use of back-references
is to check for well-formed HTML files,
where back-references are convenient for matching
opening and closing tags like 
\begin{center}
	$\langle html \rangle \ldots \langle / html \rangle$
\end{center}
A regex describing such a format
is
\begin{center}
	$\langle (.^+) \rangle \ldots \langle / \backslash 1 \rangle$
\end{center}
Despite being useful, back-references push the expressive power of regexes 
beyond regular languages.
In fact, they allow regexes to express 
languages that are not even context-free.
For example, the regex $(a^*)b\backslash 1 b \backslash 1$ with back-references
expresses the language $\{a^n b a^n b a^n\mid n \in \mathbb{N}\}$,
which cannot be expressed by 
context-free grammars \cite{campeanu2003formal}.
Such a language belongs to the context-sensitive level of the 
formal language hierarchy. 
Moreover, solving the matching problem for regexes with back-references
is known to be NP-complete \parencite{alfred2014algorithms}.
Regex libraries supporting back-references, such as 
PCRE \cite{pcre}, therefore have to
resort to a depth-first search algorithm with backtracking.
What is unexpected is that even in the cases 
not involving back-references, there is still
a (non-negligible) chance they might backtrack super-linearly,
as shown in the graphs in figure \ref{fig:aStarStarb}.

Summing up, we can categorise existing 
practical regex libraries into two kinds:
(i) those with linear-time
guarantees like Go and Rust; the downside is that
they impose restrictions
on the regular expressions (no back-references, 
bounded repetitions cannot exceed an ad hoc limit, etc.);
and (ii) those 
that allow large bounded regular expressions and back-references
at the expense of using backtracking algorithms.
The latter can potentially ``grind to a halt''
on some very simple cases, resulting in
ReDoS vulnerabilities if exposed to the internet.

The problems with both approaches are the motivation for us 
to look again at the regular expression matching problem. 
Another motivation is that regular expression matching algorithms
that follow the POSIX standard often contain errors and bugs 
as we shall explain next.

%We would like to have regex engines that can 
%deal with the regular part (e.g.
%bounded repetitions) of regexes more
%efficiently.
%Also we want to make sure that they do it correctly.
%It turns out that such aim is not so easy to achieve.
 %TODO: give examples such as RE2 GOLANG 1000 restriction, rust no repetitions 
% For example, the Rust regex engine claims to be linear, 
% but does not support lookarounds and back-references.
% The GoLang regex library does not support over 1000 repetitions.  
% Java and Python both support back-references, but shows
%catastrophic backtracking behaviours on inputs without back-references(
%when the language is still regular).
 %TODO: test performance of Rust on (((((a*a*)b*)b){20})*)c  baabaabababaabaaaaaaaaababaaaababababaaaabaaabaaaaaabaabaabababaababaaaaaaaaababaaaababababaaaaaaaaaaaaac
 %TODO: verify the fact Rust does not allow 1000+ reps




%The time cost of regex matching algorithms in general
%involve two different phases, and different things can go differently wrong on 
%these phases.
%$\DFA$s usually have problems in the first (construction) phase
%, whereas $\NFA$s usually run into trouble
%on the second phase.


\section{Error-prone POSIX Implementations}
Very often there are multiple ways of matching a string
with a regular expression.
In such cases the regular expression matcher needs to
disambiguate.
The more widely used strategy is called POSIX,
which roughly speaking always chooses the longest initial match.
The POSIX strategy is widely adopted in many regular expression matchers
because it is a natural strategy for lexers.
However, many implementations (including the C libraries
used by Linux and OS X distributions) contain bugs
or do not meet the specification they claim to adhere to.
Kuklewicz maintains a unit test repository which lists some
problems with existing regular expression engines \cite{KuklewiczHaskell}.
In some cases, they fail to generate a
result even though a match exists,
in others they give results that are inconsistent with the POSIX standard.
A concrete example is the regex:
\begin{center}
	$(aba + ab + a)^*$ \quad and the string \quad $ababa$
\end{center}
The correct POSIX match for the above
involves the entire string $ababa$, 
split into two Kleene star iterations, namely $[ab], [aba]$ at positions
$[0, 2), [2, 5)$
respectively.
But feeding this example to the different engines
listed at regex101 \footnote{
	regex101 is an online regular expression matcher which
	provides an API for trying out the regular expression
	engines of multiple popular programming languages like
Java, Python, Go, etc.} \parencite{regex101}
yields
only two incomplete matches: $[aba]$ at $[0, 3)$
and $a$ at $[4, 5)$.
Fowler \cite{fowler2003} and Kuklewicz \cite{KuklewiczHaskell} 
commented that most regex libraries do not
correctly implement the central POSIX
rule, the so-called maximum munch rule.
Grathwohl \parencite{grathwohl2014crash} wrote:
\begin{quote}\it
	``The POSIX strategy is more complicated than the 
	greedy because of the dependence on information about 
	the length of matched strings in the various subexpressions.''
\end{quote}
%\noindent
People have recognised that the implementation complexity of the POSIX rules also comes from
the specification not being very precise.
The developers of the regexp package of Go 
\footnote{\url{https://pkg.go.dev/regexp\#pkg-overview}}
commented that
\begin{quote}\it
``The POSIX rule is computationally prohibitive
and not even well-defined.''
\end{quote}
There are many informal summaries of this disambiguation
strategy, which are often quite long and delicate.
For example Kuklewicz \cite{KuklewiczHaskell} 
described the POSIX rule as (section 1, last paragraph):
\begin{quote}
	\begin{itemize}
		\item
regular expressions (REs) take the leftmost starting match, and the longest match starting there\\
\item
earlier subpatterns have leftmost-longest priority over later subpatterns\\
\item
higher-level subpatterns have leftmost-longest priority over their component subpatterns\\
\item
REs have right associative concatenation which can be changed with parenthesis\\
\item
parenthesized subexpressions return the match from their last usage\\
\item
text of component subexpressions must be contained in the text of the 
higher-level subexpressions\\
\item
if "p" and "q" can never match the same text then "p|q" and "q|p" are equivalent, up to trivial renumbering of captured subexpressions\\
\item
if "p" in "p*" is used to capture non-empty text then additional repetitions of "p" will not capture an empty string\\
\end{itemize}
\end{quote}
%The text above 
%is trying to capture something very precise,
%and is crying out for formalising.
Ribeiro and Du Bois \cite{RibeiroAgda2017} have 
formalised the notion of bit-coded regular expressions
and proved their relationship with simple regular expressions in
the dependently-typed proof assistant Agda.
They also proved the soundness and completeness of a matching algorithm
based on the bit-coded regular expressions.
Ausaf et al.\ \cite{AusafDyckhoffUrban2016}
were the first to
give a quite simple formalised POSIX
specification in Isabelle/HOL, and also prove 
that their specification coincides with an earlier (unformalised) 
POSIX specification given by Okui and Suzuki \cite{Okui10}.
They then formally proved the correctness of
a lexing algorithm by Sulzmann and Lu \cite{Sulzmann2014}
with regards to that specification.
They also found that the informal POSIX
specification by Sulzmann and Lu had to be substantially 
revised in order for the correctness proof to go through;
according to Ausaf et al., the original specification and proof 
could not be repaired.


In the next section, we will briefly
introduce Brzozowski derivatives and Sulzmann
and Lu's algorithm, on which the main part of this thesis builds.
%We give a taste of what they 
%are like and why they are suitable for regular expression
%matching and lexing.
\section{Formal Specification of POSIX Matching 
and Brzozowski Derivatives}
%Now we start with the central topic of the thesis: Brzozowski derivatives.
Brzozowski \cite{Brzozowski1964} first introduced the 
concept of a \emph{derivative} of a regular expression in 1964.
The derivative of a regular expression $r$
with respect to a character $c$, is written as $r \backslash c$.
This operation tells us what $r$ transforms into
if we ``chop'' off a particular character $c$ 
from all strings in the language of $r$ (defined
later as $L \; r$).
%To give a flavour of Brzozowski derivatives, we present
%two straightforward clauses from it:
%\begin{center}
%	\begin{tabular}{lcl}
%		$d \backslash c$     & $\dn$ & 
%		$\mathit{if} \;c = d\;\mathit{then}\;\ONE\;\mathit{else}\;\ZERO$\\
%$(r_1 + r_2)\backslash c$     & $\dn$ & $r_1 \backslash c \,+\, r_2 \backslash c$\\
%	\end{tabular}
%\end{center}
%\noindent
%The first clause says that for the regular expression
%denoting a singleton set consisting of a single-character string $\{ d \}$,
%we check the derivative character $c$ against $d$,
%returning a set containing only the empty string $\{ [] \}$
%if $c$ and $d$ are equal, and the empty set $\varnothing$ otherwise.
%The second clause states that to obtain the regular expression
%representing all strings' head character $c$ being chopped off
%from $r_1 + r_2$, one simply needs to recursively take derivative
%of $r_1$ and $r_2$ and then put them together.
Derivatives have the property
that $s \in L \; (r\backslash c)$ if and only if 
$c::s \in L \; r$ where $::$ stands for list prepending.
%This property can be used on regular expressions
%matching and lexing--to test whether a string $s$ is in $L \; r$,
%one simply takes derivatives of $r$ successively with
%respect to the characters (in the correct order) in $s$,
%and then test whether the empty string is in the last regular expression.
With this property, derivatives can give a simple solution
to the problem of matching a string $s$ with a regular
expression $r$: if the derivative of $r$ w.r.t.\ (in
succession) all the characters of the string matches the empty string,
then $r$ matches $s$ (and {\em vice versa}).  
%This makes formally reasoning about these properties such
%as correctness and complexity smooth and intuitive.
There are several mechanised proofs of this property in various theorem
provers,
for example one by Owens and Slind \cite{Owens2008} in HOL4,
another one by Krauss and Nipkow \cite{Nipkow98} in Isabelle/HOL, and
yet another in Coq by Coquand and Siles \cite{Coquand2012}.

In addition, one can extend derivatives to bounded repetitions
relatively straightforwardly. For example, the derivative of 
$r^{\{n\}}$ can be defined as:
\begin{center}
	\begin{tabular}{lcl}
		$r^{\{n\}} \backslash c$     & $\dn$ & $r \backslash c \cdot
		r^{\{n-1\}} \quad (\text{when } n > 0)$\\
	\end{tabular}
\end{center}
\noindent
Experimental results suggest that, unlike DFA-based solutions
for bounded regular expressions,
derivatives can cope with
large counters
quite well.
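To give a flavour of how compact such a matcher is, the following Scala
sketch (based on the hypothetical \texttt{Rexp} datatype sketched in the
Preliminaries section) implements \textit{nullable}, the derivative operation
and the resulting matcher, including a clause for bounded repetitions,
written here as \texttt{NTIMES(r, n)} for $r^{\{n\}}$:
\begin{lstlisting}[style=myScalastyle]
case class NTIMES(r: Rexp, n: Int) extends Rexp    // bounded repetition r{n}

// nullable(r) is true iff r can match the empty string
def nullable(r: Rexp): Boolean = r match {
  case ZERO          => false
  case ONE           => true
  case CHAR(_)       => false
  case ALT(r1, r2)   => nullable(r1) || nullable(r2)
  case SEQ(r1, r2)   => nullable(r1) && nullable(r2)
  case STAR(_)       => true
  case NTIMES(r1, n) => if (n == 0) true else nullable(r1)
}

// Brzozowski derivative of r with respect to the character c
def der(c: Char, r: Rexp): Rexp = r match {
  case ZERO          => ZERO
  case ONE           => ZERO
  case CHAR(d)       => if (c == d) ONE else ZERO
  case ALT(r1, r2)   => ALT(der(c, r1), der(c, r2))
  case SEQ(r1, r2)   =>
    if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
    else SEQ(der(c, r1), r2)
  case STAR(r1)      => SEQ(der(c, r1), STAR(r1))
  case NTIMES(r1, n) =>
    if (n == 0) ZERO else SEQ(der(c, r1), NTIMES(r1, n - 1))
}

// r matches s iff the derivative w.r.t. all characters of s is nullable
def matches(r: Rexp, s: String): Boolean =
  nullable(s.foldLeft(r)((r1, c) => der(c, r1)))
\end{lstlisting}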

There have also been 
extensions of derivatives to other regex constructs.
For example, Owens et al.\ include the derivatives
for the \emph{NOT} regular expression, which is
able to concisely express C-style comments of the form
$/* \ldots */$ (see \cite{Owens2008}).
Another extension for derivatives is
regular expressions with look-aheads, done
by Miyazaki and Minamide
\cite{Takayuki2019}.
%We therefore use Brzozowski derivatives on regular expressions 
%lexing 



Given the above definitions and properties of
Brzozowski derivatives, one quickly realises their potential
for obtaining a formally verified lexing algorithm: the clauses and the property
can be easily expressed in a functional programming language 
or converted to theorem prover
code.
Perhaps this is the reason why derivatives have sparked quite a bit of interest
in the functional programming and theorem prover communities in the last
fifteen or so years (
\cite{Almeidaetal10}, \cite{Berglund14}, \cite{Berglund18},
\cite{Chen12} and \cite{Coquand2012}
to name a few), despite being buried in the ``sands of time'' \cite{Owens2008}
after they were first published by Brzozowski.


However, there are two difficulties with derivative-based matchers:
First, Brzozowski's original matcher only generates a yes/no answer
for whether a regular expression matches a string or not.  This is too
little information in the context of lexing where separate tokens must
be identified and also classified (for example as keywords
or identifiers). 
Second, derivative-based matchers need to keep the sizes of the
derivatives under control in order to be efficient.
Elegant and beautiful
as many implementations are,
they can be excruciatingly slow. 
For example, Sulzmann and Lu
claim a linear running time of their proposed algorithm,
but that was falsified by our experiments. The running time 
is actually $\Omega(2^n)$ in the worst case.
A similar claim about a theoretical runtime of $O(n^2)$ 
is made for the Verbatim \cite{Verbatim}
%TODO: give references
lexer, which calculates POSIX matches and is based on derivatives.
They formalized the correctness of the lexer, but not their complexity result.
In the performance evaluation section, they analyzed the run time
of matching $a$ with the string 
\begin{center}
	$\underbrace{a \ldots a}_{\text{n a's}}$.
\end{center}
\noindent
They concluded that the algorithm is quadratic in terms of 
the length of the input string.
When we tried out their extracted OCaml code with the example $(a+aa)^*$,
the time it took to match a string of 40 $a$'s was approximately 5 minutes.


\subsection{Sulzmann and Lu's Algorithm}
Sulzmann and Lu~\cite{Sulzmann2014} overcame the first 
problem with the yes/no answer 
by cleverly extending Brzozowski's matching
algorithm. Their extended version generates additional information on
\emph{how} a regular expression matches a string following the POSIX
rules for regular expression matching. They achieve this by adding a
second ``phase'' to Brzozowski's algorithm involving an injection
function. This injection function in a sense undoes the ``damage''
of the derivatives chopping off characters.
In earlier work, Ausaf et al.\ provided the formal
specification of what POSIX matching means and proved in Isabelle/HOL
the correctness
of this extended algorithm accordingly
\cite{AusafDyckhoffUrban2016}.

The version of the algorithm proven correct, however,
suffers heavily from a 
second difficulty: the derivatives can
grow to arbitrarily big sizes. 
For example if we start with the
regular expression $(a+aa)^*$ and take
successive derivatives according to the character $a$, we end up with
a sequence of ever-growing derivatives like 

\def\ll{\stackrel{\_\backslash{} a}{\longrightarrow}}
\begin{center}
\begin{tabular}{rll}
$(a + aa)^*$ & $\ll$ & $(\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & $(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* \;+\; (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & $(\ZERO + \ZERO{}a + \ZERO) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^* \;+\; $\\
& & $\qquad(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & \ldots \hspace{15mm}(regular expressions of sizes 98, 169, 283, 468, 767, \ldots)
\end{tabular}
\end{center}
 
\noindent where after around 35 steps we usually run out of memory on a
typical computer.  Clearly, the
notation involving $\ZERO$s and $\ONE$s already suggests
simplification rules that can be applied to regular
expressions, for example $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r
\Rightarrow r$, $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow
r$. While such simple-minded simplifications have been proved in 
the work by Ausaf et al. to preserve the correctness of Sulzmann and Lu's
algorithm \cite{AusafDyckhoffUrban2016}, they unfortunately do
\emph{not} help with limiting the growth of the derivatives shown
above: the growth is slowed, but the derivatives can still grow rather
quickly beyond any finite bound.
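A sketch of these simple-minded simplifications in Scala, applied once after
each derivative step, might look as follows (the function name \texttt{simp}
and the bottom-up traversal are our own choices; the rules are the ones just
mentioned together with their symmetric variants):
\begin{lstlisting}[style=myScalastyle]
// one bottom-up pass applying  0.r => 0,  1.r => r,  0 + r => r,
// r + r => r  (and the symmetric variants)
def simp(r: Rexp): Rexp = r match {
  case SEQ(r1, r2) => (simp(r1), simp(r2)) match {
    case (ZERO, _)  => ZERO
    case (_, ZERO)  => ZERO
    case (ONE, s2)  => s2
    case (s1, ONE)  => s1
    case (s1, s2)   => SEQ(s1, s2)
  }
  case ALT(r1, r2) => (simp(r1), simp(r2)) match {
    case (ZERO, s2) => s2
    case (s1, ZERO) => s1
    case (s1, s2)   => if (s1 == s2) s1 else ALT(s1, s2)
  }
  case _ => r
}
\end{lstlisting}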

Therefore we want to look in this thesis at a second
algorithm by Sulzmann and Lu where they
overcame this ``growth problem'' \cite{Sulzmann2014}.
In this version, POSIX values are 
represented as bit sequences and such sequences are incrementally generated
when derivatives are calculated. The compact representation
of bit sequences and regular expressions allows them to define a more
``aggressive'' simplification method that keeps the size of the
derivatives finite no matter what the length of the string is.
They make some informal claims about the correctness and linear behaviour
of this version, but do not provide any supporting proof arguments, not
even ``pencil-and-paper'' arguments. They write about their bit-coded
\emph{incremental parsing method} (that is the algorithm to be formalised
in this dissertation)


  
  \begin{quote}\it
  ``Correctness Claim: We further claim that the incremental parsing
  method [..] in combination with the simplification steps [..]
  yields POSIX parse trees. We have tested this claim
  extensively [..] but yet
  have to work out all proof details.'' \cite[Page 14]{Sulzmann2014}
\end{quote}  
Ausaf and Urban made some initial progress towards the 
full correctness proof but still had to leave out the optimisation
Sulzmann and Lu proposed.
Ausaf  wrote \cite{Ausaf},
  \begin{quote}\it
``The next step would be to implement a more aggressive simplification procedure on annotated regular expressions and then prove the corresponding algorithm generates the same values as blexer. Alas due to time constraints we are unable to do so here.''
\end{quote}  
This thesis implements the aggressive simplifications envisioned
by Ausaf and Urban,
together with a formal proof of the correctness of those simplifications.


One of the most recent works in the context of lexing
%with this issue
is the Verbatim lexer by Egolf, Lasser and Fisher~\cite{Verbatim}.
This work is relevant for us and we will compare it later with 
the derivative-based matcher we are going to present.
There is also some newer work called
Verbatim++~\cite{Verbatimpp}, which does not use derivatives, 
but deterministic finite automata instead.
We will also study this work in a later section.
%An example that gives problem to automaton approaches would be
%the regular expression $(a|b)^*a(a|b)^{\{n\}}$.
%It requires at least $2^{n+1}$ states to represent
%as a DFA.


%----------------------------------------------------------------------------------------
\section{Contribution}

In this thesis,
we propose a solution to catastrophic
backtracking and error-prone matchers: a formally verified
regular expression lexing algorithm
that is both fast
and correct. 

In particular, the main problem we solved on top of previous work was
coming up with a formally verified algorithm called $\blexersimp$ based
on Brzozowski derivatives. It calculates a POSIX
lexical value from a string and a regular expression. This algorithm was originally
proposed by Sulzmann and Lu \cite{Sulzmann2014}, but we made the key observation that its $\textit{nub}$
function does not really simplify intermediate results where it needs to, and we improved the
algorithm accordingly.
We have proven that the internal data structures of $\blexersimp$ do not grow beyond a constant bound $N_r$
that depends only on the input regular expression $r$, thanks to the aggressive simplifications of $\blexersimp$:
\begin{theorem}
$|\blexersimp \; r \; s | \leq N_r$
\end{theorem}
The simplifications applied in each step of $\blexersimp$
keep the derivatives small, but present a challenge
when establishing a correctness theorem of the following form:
%TODO: change this to "theorem to prove"
\begin{theorem}
	If the POSIX lexical value of matching regular expression $r$ with string $s$ is $v$, 
	then $\blexersimp\; r \; s = \Some \; v$. Otherwise 
	$\blexersimp \; r \; s = \None$.
\end{theorem}




The result is %a regular expression lexing algorithm that comes with 
\begin{itemize}
\item
an improved version of  Sulzmann and Lu's bit-coded algorithm using 
derivatives with simplifications, 
accompanied by
a proven correctness theorem with respect to the POSIX specification 
given by Ausaf et al.\ \cite{AusafDyckhoffUrban2016}, 
\item 
a complexity-related property for that algorithm saying that the 
internal data structure will
remain below a finite bound,
\item
and an extension to
the bounded repetition constructs with the correctness and finiteness property
maintained.
\end{itemize}
\noindent
With a formal finiteness bound in place,
we can greatly reduce the attack surface of servers in terms of ReDoS attacks.
The Isabelle/HOL code for our formalisation can be 
found at
\begin{center}
\url{https://github.com/hellotommmy/posix}
\end{center}
Further improvements to the algorithm with an even stronger version of 
simplification are possible. We conjecture that the resulting size of derivatives
can be bounded cubically w.r.t.\ the size of the regular expression.
We will give the relevant code in Scala, 
but do not give a formal proof of this bound in Isabelle/HOL;
this is left as future work.


\section{Structure of the thesis}
In chapter \ref{Inj} we will introduce the concepts
and notations we 
use for describing regular expressions and derivatives,
and the first version of Sulzmann and Lu's lexing algorithm without bitcodes (including 
its correctness proof).
We will give their second lexing algorithm with bitcodes in chapter \ref{Bitcoded1},
together with the correctness proof by Ausaf and Urban.
Then we illustrate in chapter \ref{Bitcoded2}
how Sulzmann and Lu's
simplifications fail to simplify correctly. We therefore introduce our own version of the
algorithm with correct simplifications and 
their correctness proof.  
In chapter \ref{Finite} we give the second guarantee
of our bitcoded algorithm, that is a finite bound on the size of any 
regular expression's derivatives. 
We also show how one can extend the
algorithm to include bounded repetitions. 
In chapter \ref{Cubic} we discuss stronger simplification rules which 
improve the finite bound to a cubic bound. %and the NOT regular expression.
Chapter \ref{RelatedWork} introduces relevant work for this thesis.
Chapter \ref{Future} concludes and mentions avenues of future research.
 




%----------------------------------------------------------------------------------------


%----------------------------------------------------------------------------------------

%----------------------------------------------------------------------------------------

%----------------------------------------------------------------------------------------