% Chapter 1
\chapter{Introduction} % Main chapter title
\label{Introduction} % For referencing the chapter elsewhere, use \ref{Introduction}
%----------------------------------------------------------------------------------------
% Define some commands to keep the formatting separated from the content
\newcommand{\keyword}[1]{\textbf{#1}}
\newcommand{\tabhead}[1]{\textbf{#1}}
\newcommand{\code}[1]{\texttt{#1}}
\newcommand{\file}[1]{\texttt{\bfseries#1}}
\newcommand{\option}[1]{\texttt{\itshape#1}}
%boxes
\newcommand*{\mybox}[1]{\framebox{\strut #1}}
%\newcommand{\sflataux}[1]{\textit{sflat}\_\textit{aux} \, #1}
\newcommand\sflat[1]{\llparenthesis #1 \rrparenthesis }
\newcommand{\ASEQ}[3]{\textit{ASEQ}_{#1} \, #2 \, #3}
\newcommand{\bderssimp}[2]{#1 \backslash_{bsimps} #2}
\newcommand{\rderssimp}[2]{#1 \backslash_{rsimps} #2}
\def\derssimp{\textit{ders}\_\textit{simp}}
\def\rders{\textit{rders}}
\newcommand{\bders}[2]{#1 \backslash #2}
\newcommand{\bsimp}[1]{\textit{bsimp}(#1)}
\def\bsimps{\textit{bsimp}}
\newcommand{\rsimp}[1]{\textit{rsimp}\; #1}
\newcommand{\sflataux}[1]{\llparenthesis #1 \rrparenthesis'}
\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%
\newcommand{\denote}{\stackrel{\mbox{\scriptsize denote}}{=}}%
\newcommand{\ZERO}{\mbox{\bf 0}}
\newcommand{\ONE}{\mbox{\bf 1}}
\newcommand{\AALTS}[2]{\oplus {\scriptstyle #1}\, #2}
\newcommand{\rdistinct}[2]{\textit{rdistinct} \;\; #1 \;\; #2}
\def\rdistincts{\textit{rdistinct}}
\def\rDistinct{\textit{rdistinct}}
\newcommand\hflat[1]{\llparenthesis #1 \rrparenthesis_*}
\newcommand\hflataux[1]{\llparenthesis #1 \rrparenthesis_*'}
\newcommand\createdByStar[1]{\textit{createdByStar}(#1)}
\newcommand\myequiv{\mathrel{\stackrel{\makebox[0pt]{\mbox{\normalfont\tiny equiv}}}{=}}}
\def\SEQ{\textit{SEQ}}
\def\SEQs{\textit{SEQs}}
\def\case{\textit{case}}
\def\sequal{\stackrel{\mbox{\scriptsize rsimp}}{=}}
\def\rsimpalts{\textit{rsimp}_{ALTS}}
\def\good{\textit{good}}
\def\btrue{\textit{true}}
\def\bfalse{\textit{false}}
\def\bnullable{\textit{bnullable}}
\def\bnullables{\textit{bnullables}}
\def\Some{\textit{Some}}
\def\None{\textit{None}}
\def\code{\textit{code}}
\def\decode{\textit{decode}}
\def\internalise{\textit{internalise}}
\def\lexer{\mathit{lexer}}
\def\mkeps{\textit{mkeps}}
\newcommand{\rder}[2]{#2 \backslash_r #1}
\def\rerases{\textit{rerase}}
\def\nonnested{\textit{nonnested}}
\def\AZERO{\textit{AZERO}}
\def\sizeNregex{\textit{sizeNregex}}
\def\AONE{\textit{AONE}}
\def\ACHAR{\textit{ACHAR}}
\def\simpsulz{\textit{simp}_{Sulz}}
\def\scfrewrites{\stackrel{*}{\rightsquigarrow_{scf}}}
\def\frewrite{\rightsquigarrow_f}
\def\hrewrite{\rightsquigarrow_h}
\def\grewrite{\rightsquigarrow_g}
\def\frewrites{\stackrel{*}{\rightsquigarrow_f}}
\def\hrewrites{\stackrel{*}{\rightsquigarrow_h}}
\def\grewrites{\stackrel{*}{\rightsquigarrow_g}}
\def\fuse{\textit{fuse}}
\def\bder{\textit{bder}}
\def\der{\textit{der}}
\def\POSIX{\textit{POSIX}}
\def\ALTS{\textit{ALTS}}
\def\ASTAR{\textit{ASTAR}}
\def\DFA{\textit{DFA}}
\def\NFA{\textit{NFA}}
\def\bmkeps{\textit{bmkeps}}
\def\bmkepss{\textit{bmkepss}}
\def\retrieve{\textit{retrieve}}
\def\blexer{\textit{blexer}}
\def\flex{\textit{flex}}
\def\inj{\textit{inj}}
\def\Empty{\textit{Empty}}
\def\Left{\textit{Left}}
\def\Right{\textit{Right}}
\def\Stars{\textit{Stars}}
\def\Char{\textit{Char}}
\def\Seq{\textit{Seq}}
\def\Der{\textit{Der}}
\def\Ders{\textit{Ders}}
\def\nullable{\mathit{nullable}}
\def\Z{\mathit{Z}}
\def\S{\mathit{S}}
\def\rup{r^\uparrow}
%\def\bderssimp{\mathit{bders}\_\mathit{simp}}
\def\distinctWith{\textit{distinctWith}}
\def\lf{\textit{lf}}
\def\PD{\textit{PD}}
\def\suffix{\textit{Suffix}}
\def\distinctBy{\textit{distinctBy}}
\def\starupdate{\textit{starUpdate}}
\def\starupdates{\textit{starUpdates}}
\def\size{\mathit{size}}
\def\rexp{\mathbf{rexp}}
\def\simp{\mathit{simp}}
\def\simpALTs{\mathit{simp}\_\mathit{ALTs}}
\def\map{\mathit{map}}
\def\distinct{\mathit{distinct}}
\def\blexersimp{\mathit{blexer}\_\mathit{simp}}
\def\blexerStrong{\textit{blexerStrong}}
\def\bsimpStrong{\textit{bsimpStrong}}
\def\bdersStrongs{\textit{bdersStrong}}
\newcommand{\bdersStrong}[2]{#1 \backslash_{bsimpStrongs} #2}
\def\map{\textit{map}}
\def\rrexp{\textit{rrexp}}
\newcommand\rnullable[1]{\textit{rnullable} \; #1 }
\newcommand\rsize[1]{\llbracket #1 \rrbracket_r}
\newcommand\asize[1]{\llbracket #1 \rrbracket}
\newcommand\rerase[1]{ (#1)_{\downarrow_r}}
\newcommand\ChristianComment[1]{\textcolor{blue}{#1}\\}
\def\rflts{\textit{rflts}}
\def\rrewrite{\textit{rrewrite}}
\def\bsimpalts{\textit{bsimp}_{ALTS}}
\def\bsimpaseq{\textit{bsimp}_{ASEQ}}
\def\rsimlalts{\textit{rsimp}_{ALTs}}
\def\rsimpseq{\textit{rsimp}_{SEQ}}
\def\erase{\textit{erase}}
\def\STAR{\textit{STAR}}
\def\flts{\textit{flts}}
\def\zeroable{\textit{zeroable}}
\def\nub{\textit{nub}}
\def\filter{\textit{filter}}
%\def\not{\textit{not}}
\def\RZERO{\mathbf{0}_r }
\def\RONE{\mathbf{1}_r}
\newcommand\RCHAR[1]{\mathbf{#1}_r}
\newcommand\RSEQ[2]{#1 \cdot #2}
\newcommand\RALTS[1]{\sum #1}
\newcommand\RSTAR[1]{#1^*}
\newcommand\vsuf[2]{\textit{Suffix} \;#1\;#2}
\lstdefinestyle{myScalastyle}{
frame=tb,
language=scala,
aboveskip=3mm,
belowskip=3mm,
showstringspaces=false,
columns=flexible,
basicstyle={\small\ttfamily},
numbers=none,
numberstyle=\tiny\color{gray},
keywordstyle=\color{blue},
commentstyle=\color{dkgreen},
stringstyle=\color{mauve},
frame=single,
breaklines=true,
breakatwhitespace=true,
tabsize=3,
}
%----------------------------------------------------------------------------------------
%This part is about regular expressions, Brzozowski derivatives,
%and a bit-coded lexing algorithm with proven correctness and time bounds.
%TODO: look up snort rules to use here--give readers idea of what regexes look like
Regular expressions are widely used in computer science:
be it in text-editors \parencite{atomEditor} with syntax highlighting and auto-completion,
in command-line tools like $\mathit{grep}$ that facilitate easy
text-processing, in network intrusion
detection systems that reject suspicious traffic, or in compiler
front ends---the majority of the solutions to these tasks
involve lexing with regular
expressions.
Given their usefulness and ubiquity, one would imagine that
modern regular expression matching implementations
are mature and fully studied.
Indeed, given a regular expression and a string, the regex engines of
popular programming languages will in most cases
return the matching information in a very short time.
Those matchers can be blindingly fast---some
network intrusion detection systems
use regex engines that are able to process
megabytes or even gigabytes of data per second \parencite{Turo_ov__2020}.
However, those matchers can exhibit a surprising security vulnerability
under a certain class of inputs.
%However, , this is not the case for $\mathbf{all}$ inputs.
%TODO: get source for SNORT/BRO's regex matching engine/speed
Take $(a^*)^*\,b$ and ask whether
strings of the form $aa\ldots a$ match this regular
expression. Obviously this is not the case---the expected $b$ in the last
position is missing. One would assume that modern regular expression
matching engines can find this out very quickly. Alas, if one tries
this example in JavaScript, Python or Java 8, even with strings of a small
length, say around 30 $a$'s,
the decision takes an unreasonably long time to finish (see Figure~\ref{fig:aStarStarb}).
This is clearly exponential behaviour, and
is triggered by some relatively simple regex patterns.
Java 9 and newer
versions improve this behaviour, but are still slow compared
with the approach we are going to use.
This superlinear blowup in regular expression engines
has caused grief in ``real life'' so often that it
has been given a name---``catastrophic backtracking''.
For example, on 20 July 2016 one evil
regular expression brought the webpage
\href{http://stackexchange.com}{Stack Exchange} to its
knees.\footnote{\url{https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}(Last accessed in 2019)}
In this instance, a regular expression intended to just trim white
spaces from the beginning and the end of a line actually consumed
massive amounts of CPU resources---causing web servers to grind to a
halt. In this example, the time needed to process
the string was $O(n^2)$ with respect to the string length. This
quadratic overhead was enough for the homepage of Stack Exchange to
respond so slowly that the load balancer assumed a $\mathit{DoS}$
attack and therefore stopped the servers from responding to any
requests. This made the whole site unavailable.
\begin{figure}[p]
\begin{tabular}{@{}c@{\hspace{0mm}}c@{\hspace{0mm}}c@{}}
\begin{tikzpicture}
\begin{axis}[
xlabel={$n$},
x label style={at={(1.05,-0.05)}},
ylabel={time in secs},
enlargelimits=false,
xtick={0,5,...,30},
xmax=33,
ymax=35,
ytick={0,5,...,30},
scaled ticks=false,
axis lines=left,
width=5cm,
height=4cm,
legend entries={JavaScript},
legend pos=north west,
legend cell align=left]
\addplot[red,mark=*, mark options={fill=white}] table {re-js.data};
\end{axis}
\end{tikzpicture}
&
\begin{tikzpicture}
\begin{axis}[
xlabel={$n$},
x label style={at={(1.05,-0.05)}},
%ylabel={time in secs},
enlargelimits=false,
xtick={0,5,...,30},
xmax=33,
ymax=35,
ytick={0,5,...,30},
scaled ticks=false,
axis lines=left,
width=5cm,
height=4cm,
legend entries={Python},
legend pos=north west,
legend cell align=left]
\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
\end{axis}
\end{tikzpicture}
&
\begin{tikzpicture}
\begin{axis}[
xlabel={$n$},
x label style={at={(1.05,-0.05)}},
%ylabel={time in secs},
enlargelimits=false,
xtick={0,5,...,30},
xmax=33,
ymax=35,
ytick={0,5,...,30},
scaled ticks=false,
axis lines=left,
width=5cm,
height=4cm,
legend entries={Java 8},
legend pos=north west,
legend cell align=left]
\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
\end{axis}
\end{tikzpicture}\\
\begin{tikzpicture}
\begin{axis}[
xlabel={$n$},
x label style={at={(1.05,-0.05)}},
ylabel={time in secs},
enlargelimits=false,
xtick={0,5,...,30},
xmax=33,
ymax=35,
ytick={0,5,...,30},
scaled ticks=false,
axis lines=left,
width=5cm,
height=4cm,
legend entries={Dart},
legend pos=north west,
legend cell align=left]
\addplot[green,mark=*, mark options={fill=white}] table {re-dart.data};
\end{axis}
\end{tikzpicture}
&
\begin{tikzpicture}
\begin{axis}[
xlabel={$n$},
x label style={at={(1.05,-0.05)}},
%ylabel={time in secs},
enlargelimits=false,
xtick={0,5,...,30},
xmax=33,
ymax=35,
ytick={0,5,...,30},
scaled ticks=false,
axis lines=left,
width=5cm,
height=4cm,
legend entries={Swift},
legend pos=north west,
legend cell align=left]
\addplot[purple,mark=*, mark options={fill=white}] table {re-swift.data};
\end{axis}
\end{tikzpicture}
& \\
\multicolumn{3}{c}{Graphs}
\end{tabular}
\caption{Graphs showing runtime for matching $(a^*)^*\,b$ with strings
of the form $\protect\underbrace{aa\ldots a}_{n}$ in various existing regular expression libraries.
The reason for their superlinear behaviour is that they perform a backtracking depth-first search:
if the string does not match, the engine explores all possible ways of splitting the string before failing.
}\label{fig:aStarStarb}
\end{figure}\afterpage{\clearpage}
A more recent example is a global outage of all Cloudflare servers on 2 July
2019. A poorly written regular expression exhibited exponential
behaviour and exhausted CPUs that serve HTTP traffic. Although the outage
had several causes, at the heart was a regular expression that
was used to monitor network
traffic.\footnote{\url{https://blog.cloudflare.com/details-of-the-cloudflare-outage-on-july-2-2019/}(Last accessed in 2022)}
These problems with regular expressions
are not isolated events that happen
only very occasionally; they are actually widespread.
They occur so often that they have been given a
name---Regular-Expression-Denial-Of-Service (ReDoS)
attacks.
\citeauthor{Davis18} detected more
than 1000 super-linear (SL) regular expressions
in Node.js, Python core libraries, and npm and pypi.
They therefore concluded that evil regular expressions
are a problem that is ``more than a parlour trick'', and one that
requires
more research attention.
This work aims to address this issue
with the help of formal proofs.
We offer a lexing algorithm based
on Brzozowski derivatives with certified correctness (in
Isabelle/HOL)
and a finiteness property.
Such properties guarantee the absence of
catastrophic backtracking in most cases.
We will give more details in the next sections
on (i) why the slow cases in Figure~\ref{fig:aStarStarb}
can occur
and (ii) why we chose our
approach (Brzozowski derivatives and formal proofs).
\section{Terminology, and the Problem with Bounded Repetitions}
Regular expressions and regular expression matchers
have of course been studied for many, many years.
Theoretical results in automata theory say
that basic regular expression matching should be linear
with respect to the input, provided that the regular expression
$r$ has been pre-processed and turned into a
deterministic finite automaton (DFA).
By basic we mean textbook definitions such as the one
below, involving only characters, alternatives,
sequences, and Kleene stars:
\[
r ::= \ZERO \mid \ONE \mid c \mid r_1 + r_2 \mid r_1 \cdot r_2 \mid r^*
\]
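\noindent
To make the structure of these basic regular expressions concrete, the following is a minimal Scala sketch of the corresponding inductive datatype (the constructor names are ours and serve only as illustration; the formal definitions used in this thesis are given in later chapters):

\begin{lstlisting}[style=myScalastyle]
// basic regular expressions as an inductive datatype (names are ours)
abstract class Rexp
case object ZERO extends Rexp                    // matches nothing
case object ONE extends Rexp                     // matches only the empty string
case class CHAR(c: Char) extends Rexp            // matches the character c
case class ALT(r1: Rexp, r2: Rexp) extends Rexp  // alternative  r1 + r2
case class SEQ(r1: Rexp, r2: Rexp) extends Rexp  // sequence     r1 . r2
case class STAR(r: Rexp) extends Rexp            // Kleene star  r*
\end{lstlisting}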
Modern regular expression matchers used by programmers,
however,
support richer constructs such as bounded repetitions
and back-references.
The syntax and expressive power of those
matching engines
make ``regular expressions'' quite different from
their original meaning in the formal languages
theory.
To differentiate, people tend to use the word \emph{regex} to refer to
those expressions with richer constructs, and regular expressions
for the more traditional meaning.
For example, the PCRE standard (Perl Compatible Regular Expressions)
is such a regex syntax standard.
We follow this convention in this thesis.
We aim to support all the popular features of regexes in the future,
but for this work we mainly look at regular expressions.
\subsection{A Little Introduction to Regexes: Bounded Repetitions
and Back-references}
Regexes come with a lot of constructs
that make it more convenient for
programmers to write regular expressions.
Some of those constructs are syntactic sugar, that is,
shorthand notations
that save the programmer a few keystrokes,
for example the
non-binary alternative involving three or more choices:
\[
r = (a | b | c | \ldots | z)^*
\]
the range operator $-$, which stands for the alternative
of all characters between its operands:
\[
r = [0-9a-zA-Z] \; \text{(all alpha-numeric characters)}
\]
and the
wildcard character $.$, which stands for any character:
\[
. = [0-9a-zA-Z+-()*\&\ldots]
\]
Some of those constructs make the expressions much
more compact, but can also greatly increase the matching time.
For example, $r^{n}$ is exponentially more concise than
the unfolded expression $\underbrace{r\cdots r}_{\text{$n$ copies of $r$}}$,
and therefore a naive algorithm that simply unfolds
$r^{n}$ into $\underbrace{r\cdots r}_{\text{$n$ copies of $r$}}$
suffers an exponential increase in runtime.
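\noindent
As a small sketch of such a naive unfolding (using the datatype from above; the function name is ours and only for illustration):

\begin{lstlisting}[style=myScalastyle]
// naive unfolding of r{n} into n sequenced copies of r;
// the resulting expression grows linearly in n, that is,
// exponentially in the number of digits needed to write down n
def unfold(r: Rexp, n: Int): Rexp =
  if (n == 0) ONE else SEQ(r, unfold(r, n - 1))
\end{lstlisting}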
Some constructs can even raise the expressive
power beyond the regular languages, for example
back-references, which we discuss below.
Bounded repetitions, as we have seen, keep the language regular,
but they are a common trigger for the super-linear behaviour of
regex matching engines.
One of the most recent works in the context of lexing
is the Verbatim lexer by Egolf, Lasser and Fisher~\cite{Verbatim}.
This work is relevant and we will compare it later with
the derivative-based matcher we are going to present.
There is also some newer work called
Verbatim++~\cite{Verbatimpp}, which does not use derivatives, but automata instead.
For Verbatim++ the problem is dealing with bounded regular expressions of the form
$r^{n}$, where $n$ is a constant specifying that $r$ must repeat
exactly $n$ times. The Verbatim++ lexer becomes excruciatingly slow
on the bounded repetitions construct.
In the work reported in \cite{CSL2022} and here, we add better support
for them.
The other repetition constructs include
$r^{\ldots m}$, $r^{n\ldots}$ and $r^{n\ldots m}$ which specify
intervals for how many times $r$ should match.
$r^{\ldots m}$ means repeating
at most $m$ times, $r^{n\ldots}$ means repeating at least $n$ times and
$r^{n\ldots m}$ means repeating between $n$ and $m$ times.
The results presented in this thesis extend straightforwardly to them
too.
Their formal definitions will be given later.
Bounded repetitions are important because they
tend to occur often in practical use, for example in RegExLib,
Snort, as well as in XML Schema definitions (XSDs).
According to Bj\"{o}rklund et al.~\cite{xml2015},
bounded regular expressions occur frequently in the latter and can have
counters of up to ten million. An example XSD with a large counter they gave
is:
\begin{verbatim}
<sequence minOccurs="0" maxOccurs="65535">
<element name="TimeIncr" type="mpeg7:MediaIncrDurationType"/>
<element name="MotionParams" type="float" minOccurs="2" maxOccurs="12"/>
</sequence>
\end{verbatim}
This can be seen as the expression
$(ab^{2\ldots 12})^{0 \ldots 65535}$, where $a$ and $b$ are themselves
regular expressions
satisfying certain constraints (such as
satisfying the floating point number format).
The problem here is that tools based on the classic notion of
automata need to expand $r^{n}$ into $n$ connected
copies of the automaton for $r$. This leads to very inefficient matching
algorithms or algorithms that consume large amounts of memory.
A classic example is the regular expression $(a+b)^* a (a+b)^{n}$
where the minimal DFA requires at least $2^{n+1}$ states (more on this
later).
Therefore regular expression matching libraries that rely on the classic
notion of DFAs often impose ad hoc limits
on bounded regular expressions:
For example, in the regular expression matching library in the Go
language the regular expression $a^{1001}$ is not permitted, because no counter
can be above 1000, and in the built-in Rust regular expression library
expressions such as $a^{\{1000\}\{100\}\{5\}}$ give an error message
for being too big. These problems can of course be solved in matching algorithms where
automata go beyond the classic notion and for instance include explicit
counters \cite{Turo_ov__2020}.
The point here is that Brzozowski derivatives and the algorithms by Sulzmann and Lu can be
straightforwardly extended to deal with bounded regular expressions
and moreover the resulting code still consists of only simple
recursive functions and inductive datatypes.
Finally, bounded regular expressions do not destroy our finite
boundedness property, which we shall prove later on.
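\noindent
To give a flavour of why this extension is straightforward, the clauses for an exact repetition $r^{\{n\}}$ could look roughly as follows, writing $r\backslash c$ for the derivative of $r$ with respect to the character $c$ (this is only a sketch; the formal definitions and the corresponding proofs are given in later chapters):
\[
\begin{array}{lcl}
\nullable(r^{\{n\}}) & \dn & n = 0 \;\vee\; \nullable(r)\\
(r^{\{n\}})\backslash c & \dn & \textit{if}\; n = 0\; \textit{then}\; \ZERO\; \textit{else}\; (r\backslash c)\cdot r^{\{n-1\}}
\end{array}
\]
\noindent
The counter $n$ is kept as a number rather than being unfolded, so the size of the expression stays small.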
\section{Back-references and The Terminology Regex}
Bounded repetitions, usually written in the form
$r^{\{c\}}$ (where $c$ is a constant natural number),
denote a regular expression accepting strings
that can be divided into $c$ substrings, where each
substring is in the language of $r$.
For the regular expression $(a|b)^*a(a|b)^{\{2\}}$,
an $\mathit{NFA}$ describing it would look like:
\begin{center}
\begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto]
\node[state,initial] (q_0) {$q_0$};
\node[state, red] (q_1) [right=of q_0] {$q_1$};
\node[state, red] (q_2) [right=of q_1] {$q_2$};
\node[state, accepting, red](q_3) [right=of q_2] {$q_3$};
\path[->]
(q_0) edge node {a} (q_1)
edge [loop below] node {a,b} ()
(q_1) edge node {a,b} (q_2)
(q_2) edge node {a,b} (q_3);
\end{tikzpicture}
\end{center}
The red states are ``countdown states'', which count down
the number of characters still needed
to complete a successful match.
For example, state $q_1$ indicates a match that has
gone past the $(a|b)^*$ part of $(a|b)^*a(a|b)^{\{2\}}$,
has just consumed the ``delimiter'' $a$ in the middle, and
needs to match two more iterations of $(a|b)$ to complete.
State $q_2$, on the other hand, can be viewed as the state
after $q_1$ has consumed one character, and waits
for one more character to complete.
$q_3$ is the last state, requiring zero more characters, and is accepting.
Depending on the characters in the part of the
input string read so far,
the states $q_1$, $q_2$ and $q_3$
may or may
not be active, independently of each other.
A $\mathit{DFA}$ for such an $\mathit{NFA}$ would
contain at least $2^3$ non-equivalent states that cannot be merged,
because the subset construction during determinisation will generate
all the elements in the power set $\mathit{Pow}(\{q_1, q_2, q_3\})$.
Generalising this to regular expressions with larger
bounded repetition numbers, we have that
regexes shaped like $r^*ar^{\{n\}}$, when converted to $\mathit{DFA}$s,
require at least $2^{n+1}$ states if $r$ contains
more than one string.
This is because the $\mathit{DFA}$ has to remember, for each of the
``countdown'' states, whether it is currently active or not.
For those regexes, tools that use $\DFA$s will run
out of memory.
The time cost of regex matching algorithms in general
involves two different phases, and different things can go wrong in
each of them:
$\DFA$s usually have problems in the first (construction) phase,
whereas $\NFA$s usually run into trouble
in the second phase.
\section{Error-prone POSIX Implementations}
The problems with practical implementations
of regular expression matching are not limited to slowness on certain
cases.
Another problem with these libraries is that they
offer no correctness guarantee.
In some cases, they either fail to generate a lexing result when there exists a match,
or give results that are inconsistent with the $\POSIX$ standard.
A concrete example is matching the regex
\begin{center}
$(aba + ab + a)^*$ \quad against the string \quad $ababa$.
\end{center}
The correct $\POSIX$ match for the above is
the entire string $ababa$,
split into two Kleene star iterations, namely $[ab]$ and $[aba]$, at positions
$[0, 2)$ and $[2, 5)$
respectively.
But trying this out in regex101~\parencite{regex101}
with different language engines yields
the same two fragmented matches: $[aba]$ at $[0, 3)$
and $a$ at $[4, 5)$.
Kuklewicz\parencite{KuklewiczHaskell} commented that most regex libraries are not
correctly implementing the POSIX (maximum-munch)
rule of regular expression matching.
As Grathwohl\parencite{grathwohl2014crash} wrote,
\begin{quote}
The POSIX strategy is more complicated than the
greedy because of the dependence on information about
the length of matched strings in the various subexpressions.
\end{quote}
%\noindent
To summarise the above: regular expressions are important
and popular, and the corresponding library functions in programming languages
are very fast on non-catastrophic cases.
But there are problems with current practical implementations.
The first is that the running time might blow up.
The second is that they can be error-prone even on certain
very simple cases.
In the next part of the chapter, we will look into the reasons why
certain regex engines run horribly slowly on the ``catastrophic''
cases and propose a solution that addresses both of these problems,
based on the work of Brzozowski, and of Sulzmann and Lu.
\subsection{Different Phases of a Matching/Lexing Algorithm}
Most lexing algorithms can be roughly divided into
two phases.
The first phase is the ``construction'' phase,
in which the algorithm builds some
suitable data structure from the input regex $r$, so that
it can be easily operated on later.
We denote
the time cost of this phase by $P_1(r)$.
The second phase is the lexing phase, in which the input string
$s$ is read and the data structure
representing the regex $r$ is operated on.
We denote the time
this takes by $P_2(r, s)$.\\
For a $\mathit{DFA}$,
we have $P_2(r, s) = O(|s|)$,
because we take at most $|s|$ steps,
and each step takes
at most one transition---
a deterministic finite automaton
by definition has at most one state active and takes at most one
transition upon receiving an input symbol.
But unfortunately in the worst case
$P_1(r) = O(2^{|r|})$. An example will be given later.
For $\mathit{NFA}$s, we have $P_1(r) = O(|r|)$ if we do not unfold
expressions like $r^n$ into
\[
\underbrace{r \cdots r}_{\text{$n$ copies of $r$}}.
\]
Then $P_2(r, s)$ is bounded by $O(|r|\cdot|s|)$ if we do not backtrack.
On the other hand, if backtracking is used, the worst-case time bound bloats
to $O(|r| \cdot 2^{|s|})$.
%on the input
%And when calculating the time complexity of the matching algorithm,
%we are assuming that each input reading step requires constant time.
%which translates to that the number of
%states active and transitions taken each time is bounded by a
%constant $C$.
%But modern regex libraries in popular language engines
% often want to support much richer constructs than just
% sequences and Kleene stars,
%such as negation, intersection,
%bounded repetitions and back-references.
%And de-sugaring these "extended" regular expressions
%into basic ones might bloat the size exponentially.
%TODO: more reference for exponential size blowup on desugaring.
\subsection{Why $\mathit{DFA}$s can be slow in the first phase}
The good thing about $\mathit{DFA}$s is that once
generated, they are fast and stable, unlike
backtracking algorithms.
However, they do not scale well with bounded repetitions.
\subsubsection{Tools that use $\mathit{DFA}$s}
%TODO:more tools that use DFAs?
$\mathit{LEX}$ and $\mathit{JFLEX}$ are tools
in $C$ and $\mathit{JAVA}$, respectively, that generate $\mathit{DFA}$-based
lexers. The user provides a set of regular expressions
and configurations to such lexer generators, and then
gets an output program encoding a minimised $\mathit{DFA}$
that can be compiled and run.
When given the above countdown regular expression,
already a small number $n$ would result in a determinised automaton
with millions of states.
For this reason, regex libraries that support
bounded repetitions often choose to use the $\mathit{NFA}$
approach.
\subsection{Why $\mathit{NFA}$s can be slow in the second phase}
When one constructs an $\NFA$ out of a regular expression
there is often very little to be done in the first phase: one simply
constructs the $\NFA$ states based on the structure of the input regular expression.
In the lexing phase, one can simulate the $\mathit{NFA}$ in two ways.
The first is to keep track of all active states after consuming
a character, and to update that set of states iteratively.
This can be viewed as a breadth-first search of the $\mathit{NFA}$
for a path terminating
at an accepting state.
Languages like $\mathit{Go}$ and $\mathit{Rust}$ use this
type of $\mathit{NFA}$ simulation and guarantee a linear runtime
in terms of the input string length.
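\noindent
A minimal sketch of this kind of state-set simulation is shown below, assuming an $\mathit{NFA}$ given by a set of start states, a set of accepting states and a transition function (this representation and the names are ours, chosen only for illustration; $\epsilon$-transitions are omitted):

\begin{lstlisting}[style=myScalastyle]
// breadth-first NFA simulation: keep the set of currently active
// states and update it once for every input character
type State = Int
case class NFA(starts: Set[State],
               accepts: Set[State],
               delta: (State, Char) => Set[State])

def nfaMatches(nfa: NFA, s: String): Boolean = {
  val finals = s.foldLeft(nfa.starts) {
    (active, c) => active.flatMap(q => nfa.delta(q, c))
  }
  (finals intersect nfa.accepts).nonEmpty
}
\end{lstlisting}

\noindent
Since the set of active states can never be larger than the set of all states, each input character causes at most a linear amount of work, which gives the $O(|r|\cdot|s|)$ bound mentioned above.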
%TODO:try out these lexers
The other way to use an $\mathit{NFA}$ for matching is to choose
a single transition each time, keeping all the other options in
a queue or stack, and backtracking if that choice eventually
fails. This method, often called a ``depth-first search'',
is efficient in a lot of cases, but can end up
with exponential runtime.\\
%TODO:COMPARE java python lexer speed with Rust and Go
The reason why languages like
Java and Python use backtracking algorithms is that they support back-references.
\subsubsection{Back References}
If we have a regular expression like this (the sequence
operator is omitted for brevity):
\begin{center}
$r_1(r_2(r_3r_4))$
\end{center}
We could label sub-expressions of interest
by parenthesizing them and giving
them a number by the order in which their opening parentheses appear.
One possible way of parenthesizing and labelling is given below:
\begin{center}
$\underset{1}{(}r_1\underset{2}{(}r_2\underset{3}{(}r_3)\underset{4}{(}r_4)))$
\end{center}
The sub-expressions $r_1r_2r_3r_4$, $r_2r_3r_4$, $r_3$ and $r_4$ are labelled
by 1 to 4: label $1$ refers to the entire expression
$(r_1(r_2(r_3)(r_4)))$, label $2$ refers to $r_2(r_3)(r_4)$, and so on.
These sub-expressions are called ``capturing groups''.
We can use the following syntax to denote that we want a string just matched by a
sub-expression (capturing group) to appear at a certain location again,
exactly as it was:
\begin{center}
$\ldots\underset{\text{i-th lparen}}{(}{r_i})\ldots
\underset{s_i \text{ which just matched} \;r_i}{\backslash i}$
\end{center}
The backslash and the number $i$ are used to denote such
so-called ``back-references''.
Let $e$ be an expression made of regular expressions
and back-references, and suppose $e$ contains the expression $e_i$
as its $i$-th capturing group.
The semantics of a back-reference can be written
recursively as:
\begin{center}
\begin{tabular}{c}
$L(e \cdot \backslash i) = \{s \,@\, s_i \mid s \in L(e),\; s_i \in L(e_i),$\\
$s_i\; \text{is the string matched by the $i$-th capturing group of $e$ on $s$}\}$
\end{tabular}
\end{center}
For example, the concrete expression
$((a|b|c|\ldots|z)^*)\backslash 1$
matches strings like $\mathit{bobo}$ and $\mathit{weewee}$.\\
Back-references are a construct in the ``regex'' standard
that programmers find useful, but they are not
regular any more.
In fact, they allow regexes to express
languages that are not even context-free.
For example, the expression with back-references $(a^*)b\backslash 1 b \backslash 1$
expresses the language $\{a^n b a^n b a^n\mid n \in \mathbb{N}\}$,
which cannot be expressed by context-free grammars~\parencite{campeanu2003formal}.
Such a language lies in the context-sensitive level
of the formal language hierarchy.
Solving the matching problem for expressions with back-references
is NP-complete~\parencite{alfred2014algorithms}, and a non-backtracking,
efficient solution is not known to exist.
%TODO:read a bit more about back reference algorithms
It seems that languages like Java and Python made the trade-off
to support back-references at the expense of having to backtrack,
even in the case of regexes not involving back-references.\\
Summing this up, we can categorise existing
practical regex libraries into ones with linear
time guarantees, like Go and Rust, which impose restrictions
on the user input (no back-references,
bounded repetitions cannot exceed 1000, etc.), and ones
that allow the programmer much freedom, but grind to a halt
on some non-negligible portion of inputs.
%TODO: give examples such as RE2 GOLANG 1000 restriction, rust no repetitions
% For example, the Rust regex engine claims to be linear,
% but does not support lookarounds and back-references.
% The GoLang regex library does not support over 1000 repetitions.
% Java and Python both support back-references, but shows
%catastrophic backtracking behaviours on inputs without back-references(
%when the language is still regular).
%TODO: test performance of Rust on (((((a*a*)b*)b){20})*)c baabaabababaabaaaaaaaaababaaaababababaaaabaaabaaaaaabaabaabababaababaaaaaaaaababaaaababababaaaaaaaaaaaaac
%TODO: verify the fact Rust does not allow 1000+ reps
So we have practical implementations
of regular expression matching/lexing which are fast
but do not come with any guarantee that they will not grind to a halt
or give wrong answers.
Our goal is to have a regex lexing algorithm that comes with
\begin{itemize}
\item
proven correctness
\item
proven non-catastrophic properties
\item
easy extensions to
constructs like
bounded repetitions, negation, lookarounds, and even back-references.
\end{itemize}
\section{Our Solution---Formal Specification of POSIX and Brzozowski Derivatives}
We propose Brzozowski derivatives on regular expressions as
a solution to this.
In the last fifteen or so years, Brzozowski's derivatives of regular
expressions have sparked quite a bit of interest in the functional
programming and theorem prover communities.
\subsection{Motivation}
Derivatives give a simple solution
to the problem of matching a string $s$ with a regular
expression $r$: if the derivative of $r$ w.r.t.\ (in
succession) all the characters of the string matches the empty string,
then $r$ matches $s$ (and {\em vice versa}).
The beauty of
Brzozowski's derivatives \parencite{Brzozowski1964} is that they are neatly
expressible in any functional language, and easily definable and
reasoned about in theorem provers---the definitions just consist of
inductive datatypes and simple recursive functions.
Moreover, an algorithm based on them by
Sulzmann and Lu \parencite{Sulzmann2014} allows easy extension
to include extended regular expressions and
simplification of internal data structures,
eliminating the exponential behaviour.
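\noindent
To illustrate how neatly this can be expressed, below is a sketch in Scala over the datatype shown earlier, following the textbook definitions of $\nullable$ and the derivative (again, this is only an illustration of the style of definitions used throughout this thesis):

\begin{lstlisting}[style=myScalastyle]
// nullable(r): can r match the empty string?
def nullable(r: Rexp): Boolean = r match {
  case ZERO => false
  case ONE => true
  case CHAR(_) => false
  case ALT(r1, r2) => nullable(r1) || nullable(r2)
  case SEQ(r1, r2) => nullable(r1) && nullable(r2)
  case STAR(_) => true
}

// der(c, r): the Brzozowski derivative of r with respect to c
def der(c: Char, r: Rexp): Rexp = r match {
  case ZERO => ZERO
  case ONE => ZERO
  case CHAR(d) => if (c == d) ONE else ZERO
  case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))
  case SEQ(r1, r2) =>
    if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
    else SEQ(der(c, r1), r2)
  case STAR(r1) => SEQ(der(c, r1), STAR(r1))
}

// r matches s iff the derivative w.r.t. all characters of s is nullable
def matcher(r: Rexp, s: String): Boolean =
  nullable(s.foldLeft(r)((r1, c) => der(c, r1)))
\end{lstlisting}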
However, two difficulties with derivative-based matchers exist:
\subsubsection{Problems with Current Brzozowski Matchers}
First, Brzozowski's original matcher only generates a yes/no answer
for whether a regular expression matches a string or not. This is too
little information in the context of lexing where separate tokens must
be identified and also classified (for example as keywords
or identifiers). Sulzmann and Lu~\cite{Sulzmann2014} overcome this
difficulty by cleverly extending Brzozowski's matching
algorithm. Their extended version generates additional information on
\emph{how} a regular expression matches a string following the POSIX
rules for regular expression matching. They achieve this by adding a
second ``phase'' to Brzozowski's algorithm involving an injection
function. In our own earlier work, we provided the formal
specification of what POSIX matching means and proved in Isabelle/HOL
the correctness
of Sulzmann and Lu's extended algorithm accordingly
\cite{AusafDyckhoffUrban2016}.
The second difficulty is that Brzozowski's derivatives can
grow to arbitrarily big sizes. For example if we start with the
regular expression $(a+aa)^*$ and take
successive derivatives according to the character $a$, we end up with
a sequence of ever-growing derivatives like
\def\ll{\stackrel{\_\backslash{} a}{\longrightarrow}}
\begin{center}
\begin{tabular}{rll}
$(a + aa)^*$ & $\ll$ & $(\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & $(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* \;+\; (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & $(\ZERO + \ZERO{}a + \ZERO) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^* \;+\; $\\
& & $\qquad(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & \ldots \hspace{15mm}(regular expressions of sizes 98, 169, 283, 468, 767, \ldots)
\end{tabular}
\end{center}
\noindent where after around 35 steps we run out of memory on a
typical computer (we shall define shortly the precise details of our
regular expressions and the derivative operation). Clearly, the
notation involving $\ZERO$s and $\ONE$s already suggests
simplification rules that can be applied to regular
expressions, for example $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r
\Rightarrow r$, $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow
r$. While such simple-minded simplifications have been proved in our
earlier work to preserve the correctness of Sulzmann and Lu's
algorithm \cite{AusafDyckhoffUrban2016}, they unfortunately do
\emph{not} help with limiting the growth of the derivatives shown
above: the growth is slowed, but the derivatives can still grow rather
quickly beyond any finite bound.
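\noindent
As an illustration, these simple-minded rules can be implemented along the following lines, interleaving a simplification step with every derivative step (a sketch only; the simplification functions actually verified in this thesis are defined in later chapters):

\begin{lstlisting}[style=myScalastyle]
// simple-minded simplification: 0.r => 0, r.0 => 0, 1.r => r,
// r.1 => r, 0+r => r, r+0 => r and r+r => r
def simp(r: Rexp): Rexp = r match {
  case SEQ(r1, r2) => (simp(r1), simp(r2)) match {
    case (ZERO, _)   => ZERO
    case (_, ZERO)   => ZERO
    case (ONE, r2s)  => r2s
    case (r1s, ONE)  => r1s
    case (r1s, r2s)  => SEQ(r1s, r2s)
  }
  case ALT(r1, r2) => (simp(r1), simp(r2)) match {
    case (ZERO, r2s) => r2s
    case (r1s, ZERO) => r1s
    case (r1s, r2s)  => if (r1s == r2s) r1s else ALT(r1s, r2s)
  }
  case r => r
}

// derivatives interleaved with simplification after every step
def dersSimp(s: List[Char], r: Rexp): Rexp = s match {
  case Nil => r
  case c :: cs => dersSimp(cs, simp(der(c, r)))
}
\end{lstlisting}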
Sulzmann and Lu overcome this ``growth problem'' in a second algorithm
\cite{Sulzmann2014} where they introduce bit-coded
regular expressions. In this version, POSIX values are
represented as bit sequences and such sequences are incrementally generated
when derivatives are calculated. The compact representation
of bit sequences and regular expressions allows them to define a more
``aggressive'' simplification method that keeps the size of the
derivatives finite no matter what the length of the string is.
They make some informal claims about the correctness and linear behaviour
of this version, but do not provide any supporting proof arguments, not
even ``pencil-and-paper'' arguments. They write about their bit-coded
\emph{incremental parsing method} (that is the algorithm to be formalised
in this dissertation)
\begin{quote}\it
``Correctness Claim: We further claim that the incremental parsing
method [..] in combination with the simplification steps [..]
yields POSIX parse trees. We have tested this claim
extensively [..] but yet
have to work out all proof details.'' \cite[Page 14]{Sulzmann2014}
\end{quote}
Ausaf and Urban were able to back this correctness claim with
a formal proof.
But as they stated,
\begin{quote}\it
The next step would be to implement a more aggressive simplification procedure on annotated regular expressions and then prove the corresponding algorithm generates the same values as blexer. Alas due to time constraints we are unable to do so here.
\end{quote}
This thesis implements the aggressive simplifications envisioned
by Ausaf and Urban,
and gives a formal proof of the correctness with those simplifications.
%----------------------------------------------------------------------------------------
\section{Contribution}
This work addresses the vulnerability of super-linear and
buggy regex implementations by combining
Brzozowski's derivatives and interactive theorem proving.
We give an
improved version of Sulzmann and Lu's bit-coded algorithm using
derivatives, which comes with a formal guarantee in terms of correctness and
running time as an Isabelle/HOL proof.
A further improvement of the algorithm with an even stronger version of
simplification is also made.
We have not yet formalised a proof for this improved version, but believe that it leads to a
formalised proof with a time bound linear in the input and
cubic in the regular expression size, using a technique by
Antimirov~\cite{Antimirov}.
The main contributions of this thesis are
\begin{itemize}
\item
a proven correct lexing algorithm
\item
with formalized finite bounds on internal data structures' sizes.
\end{itemize}
To the best of our knowledge, no lexing libraries using Brzozowski derivatives
have a provable time guarantee,
and claims about running time are usually speculative and backed by thin empirical
evidence.
%TODO: give references
For example, Sulzmann and Lu proposed an algorithm for which they
claim a linear running time.
But that claim was falsified by our experiments: the running time
is actually $\Omega(2^n)$ in the worst case.
A similar claim about a theoretical runtime of $O(n^2)$ is made for the Verbatim
%TODO: give references
lexer, which calculates POSIX matches and is based on derivatives.
They formalized the correctness of the lexer, but not the complexity.
In the performance evaluation section, they simply analyzed the run time
of matching $a$ with the string $\underbrace{a \ldots a}_{\text{$n$ $a$'s}}$
and concluded that the algorithm is quadratic in terms of the input length.
When we tried out their extracted OCaml code with our example $(a+aa)^*$,
the time it took to lex only 40 $a$'s was 5 minutes.
\subsection{Related Work}
We are aware
of a mechanised correctness proof of Brzozowski's derivative-based matcher in HOL4 by
Owens and Slind~\parencite{Owens2008}. Another one in Isabelle/HOL is part
of the work by Krauss and Nipkow \parencite{Krauss2011}. And another one
in Coq is given by Coquand and Siles \parencite{Coquand2012}.
Also Ribeiro and Du Bois give one in Agda \parencite{RibeiroAgda2017}.
When a regular expression does not behave as intended,
people usually try to rewrite the regex into some equivalent form,
or they try to avoid the possibly problematic patterns completely,
for which many false positives exist~\parencite{Davis18}.
Animated tools to ``debug'' regular expressions, such as
\parencite{regexploit2021} and \parencite{regex101}, are also popular.
We are also aware of static analysis work on regular expressions that
aims to detect potentially exponential regex patterns. Rathnayake and Thielecke
\parencite{Rathnayake2014StaticAF} proposed an algorithm
that detects regular expressions triggering exponential
behaviour on backtracking matchers.
Weideman \parencite{Weideman2017Static} came up with
non-linear polynomial worst-time estimates
for regexes, attack strings that exploit the worst-case
scenario, and ``attack automata'' that generate
attack strings.
\section{Structure of the thesis}
In Chapter~\ref{Inj} we will introduce the concepts
and notations we
use for describing the lexing algorithm by Sulzmann and Lu,
and then give the lexing algorithm.
We will give a variant of it in Chapter~\ref{Bitcoded1}.
Then we illustrate in Chapter~\ref{Bitcoded2}
how the algorithm without bitcodes falls short for aggressive
simplifications, and therefore introduce our version of the
bit-coded algorithm and
its correctness proof.
In Chapter~\ref{Finite} we give the second guarantee
of our bitcoded algorithm, namely a finite bound on the size of any
regex's derivatives.
In Chapter~\ref{Cubic} we discuss stronger simplifications that improve the finite bound
of Chapter~\ref{Finite} to a polynomial one, and demonstrate how one can extend the
algorithm to include constructs such as bounded repetitions and negations.
%----------------------------------------------------------------------------------------
%----------------------------------------------------------------------------------------
%----------------------------------------------------------------------------------------
%----------------------------------------------------------------------------------------