% Chapter 1\chapter{Introduction} % Main chapter title\label{Introduction} % For referencing the chapter elsewhere, use \ref{Chapter1} %----------------------------------------------------------------------------------------% Define some commands to keep the formatting separated from the content \newcommand{\keyword}[1]{\textbf{#1}}\newcommand{\tabhead}[1]{\textbf{#1}}\newcommand{\code}[1]{\texttt{#1}}\newcommand{\file}[1]{\texttt{\bfseries#1}}\newcommand{\option}[1]{\texttt{\itshape#1}}%boxes\newcommand*{\mybox}[1]{\framebox{\strut #1}}%\newcommand{\sflataux}[1]{\textit{sflat}\_\textit{aux} \, #1}\newcommand\sflat[1]{\llparenthesis #1 \rrparenthesis }\newcommand{\ASEQ}[3]{\textit{ASEQ}_{#1} \, #2 \, #3}\newcommand{\bderssimp}[2]{#1 \backslash_{bsimps} #2}\newcommand{\rderssimp}[2]{#1 \backslash_{rsimps} #2}\def\derssimp{\textit{ders}\_\textit{simp}}\def\rders{\textit{rders}}\newcommand{\bders}[2]{#1 \backslash #2}\newcommand{\bsimp}[1]{\textit{bsimp}(#1)}\def\bsimps{\textit{bsimp}}\newcommand{\rsimp}[1]{\textit{rsimp}\; #1}\newcommand{\sflataux}[1]{\llparenthesis #1 \rrparenthesis'}\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%\newcommand{\denote}{\stackrel{\mbox{\scriptsize denote}}{=}}%\newcommand{\ZERO}{\mbox{\bf 0}}\newcommand{\ONE}{\mbox{\bf 1}}\newcommand{\AALTS}[2]{\oplus {\scriptstyle #1}\, #2}\newcommand{\rdistinct}[2]{\textit{rdistinct} \;\; #1 \;\; #2}\def\rdistincts{\textit{rdistinct}}\def\rDistinct{\textit{rdistinct}}\newcommand\hflat[1]{\llparenthesis #1 \rrparenthesis_*}\newcommand\hflataux[1]{\llparenthesis #1 \rrparenthesis_*'}\newcommand\createdByStar[1]{\textit{createdByStar}(#1)}\def\cbn{\textit{createdByNtimes}}\def\hpa{\textit{highestPowerAux}}\def\hpower{\textit{highestPower}}\def\ntset{\textit{ntset}}\def\optermsimp{\textit{optermsimp}}\def\optermOsimp{\textit{optermOsimp}}\def\optermosimp{\textit{optermosimp}}\def\opterm{\textit{opterm}}\def\nString{\textit{nonemptyString}}\newcommand\myequiv{\mathrel{\stackrel{\makebox[0pt]{\mbox{\normalfont\tiny equiv}}}{=}}}\def\SEQ{\textit{SEQ}}\def\SEQs{\textit{SEQs}}\def\case{\textit{case}}\def\sequal{\stackrel{\mbox{\scriptsize rsimp}}{=}}\def\rsimpalts{\textit{rsimp}_{ALTS}}\def\good{\textit{good}}\def\btrue{\textit{true}}\def\bfalse{\textit{false}}\def\bnullable{\textit{bnullable}}\def\bnullables{\textit{bnullables}}\def\Some{\textit{Some}}\def\None{\textit{None}}\def\code{\textit{code}}\def\decode{\textit{decode}}\def\internalise{\textit{internalise}}\def\lexer{\mathit{lexer}}\def\mkeps{\textit{mkeps}}\newcommand{\rder}[2]{#2 \backslash_r 
#1}\def\rerases{\textit{rerase}}\def\nonnested{\textit{nonnested}}\def\AZERO{\textit{AZERO}}\def\sizeNregex{\textit{sizeNregex}}\def\AONE{\textit{AONE}}\def\ACHAR{\textit{ACHAR}}\def\simpsulz{\textit{simp}_{Sulz}}\def\scfrewrites{\stackrel{*}{\rightsquigarrow_{scf}}}\def\frewrite{\rightsquigarrow_f}\def\hrewrite{\rightsquigarrow_h}\def\grewrite{\rightsquigarrow_g}\def\frewrites{\stackrel{*}{\rightsquigarrow_f}}\def\hrewrites{\stackrel{*}{\rightsquigarrow_h}}\def\grewrites{\stackrel{*}{\rightsquigarrow_g}}\def\fuse{\textit{fuse}}\def\bder{\textit{bder}}\def\der{\textit{der}}\def\POSIX{\textit{POSIX}}\def\ALTS{\textit{ALTS}}\def\ASTAR{\textit{ASTAR}}\def\DFA{\textit{DFA}}\def\NFA{\textit{NFA}}\def\bmkeps{\textit{bmkeps}}\def\bmkepss{\textit{bmkepss}}\def\retrieve{\textit{retrieve}}\def\blexer{\textit{blexer}}\def\flex{\textit{flex}}\def\inj{\textit{inj}}\def\Empty{\textit{Empty}}\def\Left{\textit{Left}}\def\Right{\textit{Right}}\def\Stars{\textit{Stars}}\def\Char{\textit{Char}}\def\Seq{\textit{Seq}}\def\Der{\textit{Der}}\def\Ders{\textit{Ders}}\def\nullable{\mathit{nullable}}\def\Z{\mathit{Z}}\def\S{\mathit{S}}\def\rup{r^\uparrow}%\def\bderssimp{\mathit{bders}\_\mathit{simp}}\def\distinctWith{\textit{distinctWith}}\def\lf{\textit{lf}}\def\PD{\textit{PD}}\def\suffix{\textit{Suffix}}\def\distinctBy{\textit{distinctBy}}\def\starupdate{\textit{starUpdate}}\def\starupdates{\textit{starUpdates}}\def\nupdate{\textit{nupdate}}\def\nupdates{\textit{nupdates}}\def\size{\mathit{size}}\def\rexp{\mathbf{rexp}}\def\simp{\mathit{simp}}\def\simpALTs{\mathit{simp}\_\mathit{ALTs}}\def\map{\mathit{map}}\def\distinct{\mathit{distinct}}\def\blexersimp{\mathit{blexer}\_\mathit{simp}}\def\blexerStrong{\textit{blexerStrong}}\def\bsimpStrong{\textit{bsimpStrong}}\def\bdersStrongs{\textit{bdersStrong}}\newcommand{\bdersStrong}[2]{#1 \backslash_{bsimpStrongs} #2}\def\map{\textit{map}}\def\rrexp{\textit{rrexp}}\newcommand\rnullable[1]{\textit{rnullable} \; #1 }\newcommand\rsize[1]{\llbracket #1 \rrbracket_r}\newcommand\asize[1]{\llbracket #1 \rrbracket}\newcommand\rerase[1]{ (#1)_{\downarrow_r}}\newcommand\ChristianComment[1]{\textcolor{blue}{#1}\\}\def\rflts{\textit{rflts}}\def\rrewrite{\textit{rrewrite}}\def\bsimpalts{\textit{bsimp}_{ALTS}}\def\bsimpaseq{\textit{bsimp}_{ASEQ}}\def\rsimlalts{\textit{rsimp}_{ALTs}}\def\rsimpseq{\textit{rsimp}_{SEQ}}\def\erase{\textit{erase}}\def\STAR{\textit{STAR}}\def\flts{\textit{flts}}\def\zeroable{\textit{zeroable}}\def\nub{\textit{nub}}\def\filter{\textit{filter}}%\def\not{\textit{not}}\def\RZERO{\mathbf{0}_r }\def\RONE{\mathbf{1}_r}\newcommand\RCHAR[1]{\mathbf{#1}_r}\newcommand\RSEQ[2]{#1 \cdot #2}\newcommand\RALTS[1]{\sum #1}\newcommand\RSTAR[1]{#1^*}\newcommand\vsuf[2]{\textit{Suffix} \;#1\;#2}\lstdefinestyle{myScalastyle}{ frame=tb, language=scala, aboveskip=3mm, belowskip=3mm, showstringspaces=false, columns=flexible, basicstyle={\small\ttfamily}, numbers=none, numberstyle=\tiny\color{gray}, keywordstyle=\color{blue}, commentstyle=\color{dkgreen}, stringstyle=\color{mauve}, frame=single, breaklines=true, breakatwhitespace=true, tabsize=3,}%----------------------------------------------------------------------------------------%This part is about regular expressions, Brzozowski derivatives,%and a bit-coded lexing algorithm with proven correctness and time bounds.%TODO: look up snort rules to use here--give readers idea of what regexes look likeRegular expressions are widely used in computer science: be it in text-editors \parencite{atomEditor} with syntax highlighting and 
auto-completion; command-line tools like $\mathit{grep}$ that facilitate easy text-processing; network intrusion
detection systems that inspect suspicious traffic; or compiler front ends.
Given their usefulness and ubiquity, one would assume that
modern regular expression matching implementations
are mature and fully studied.
Indeed, if one supplies a popular programming language's regex engine with regular expressions and strings,
in most cases one gets the matching information in a very short time.
Those matchers can be blindingly fast---some network intrusion detection systems
use regex engines that are able to process megabytes or even gigabytes of data per second \parencite{Turo_ov__2020}.
However, those matchers can exhibit a surprising security vulnerability
under a certain class of inputs.
%However, this is not the case for $\mathbf{all}$ inputs.

Consider $(a^*)^*\,b$ and ask whether
strings of the form $aa..a$ can be matched by this regular
expression. Obviously this is not the case---the expected $b$ in the last
position is missing. One would expect that modern regular expression
matching engines can find this out very quickly. Surprisingly, if one tries
this example in JavaScript, Python or Java 8, even with small strings, say of length around 30 $a$'s,
the decision takes an absurd amount of time to finish (see graphs in figure~\ref{fig:aStarStarb}).
This is clearly exponential behaviour, and is triggered by some relatively simple regular expressions.
Java 9 and newer
versions improve this behaviour somewhat, but they are still slow compared with the approach we are going to use in this thesis.

This superlinear blowup in regular expression engines
has repeatedly caused grief in ``real life'', where it is given the name ``catastrophic backtracking'' or ``evil'' regular expressions.
For example, on 20 July 2016 one evil
regular expression brought the webpage
\href{http://stackexchange.com}{Stack Exchange} to its
knees.\footnote{\url{https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}
(Last accessed in 2019)}
In this instance, a regular expression intended to just trim whitespace from the beginning and the end of a line actually consumed
massive amounts of CPU resources---causing web servers to grind to a
halt. In this example, the time needed to process
the string was $O(n^2)$ with respect to the string length. This
quadratic overhead was enough for the homepage of Stack Exchange to
respond so slowly that the load balancer assumed a $\mathit{DoS}$
attack and therefore stopped the servers from responding to any
requests. This made the whole site unavailable.
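The slowdown is easy to reproduce.  The following minimal Scala sketch (our own
illustration, not the benchmark code behind figure~\ref{fig:aStarStarb}) feeds
$(a^*)^*\,b$ and strings of $n$ $a$'s to the JVM's backtracking regex engine; already for
$n$ of around 30 a single call can take tens of seconds (compare the Java~8 graph in
figure~\ref{fig:aStarStarb}), because the engine explores exponentially many ways of
splitting the $a$'s between the two stars before it can report failure.

\begin{lstlisting}[style=myScalastyle]
// Minimal sketch: matching the "evil" regular expression (a*)*b against
// strings of n a's using the JVM's backtracking java.util.regex engine.
object EvilRegexDemo extends App {
  val evil = "(a*)*b"
  for (n <- 20 to 30) {
    val input = "a" * n                 // the string aa...a with n copies of a
    val start = System.nanoTime()
    val result = input.matches(evil)    // backtracks exponentially before failing
    val secs = (System.nanoTime() - start) / 1e9
    println(f"n = $n%2d  matched = $result  time = $secs%.2f s")
  }
}
\end{lstlisting}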
\begin{figure}[p]
\begin{tabular}{@{}c@{\hspace{0mm}}c@{}}
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    ylabel={time in secs},
    enlargelimits=false,
    xtick={0,5,...,30},
    xmax=33,
    ymax=35,
    ytick={0,5,...,30},
    scaled ticks=false,
    axis lines=left,
    width=5cm,
    height=4cm,
    legend entries={JavaScript},
    legend pos=north west,
    legend cell align=left]
\addplot[red,mark=*, mark options={fill=white}] table {re-js.data};
\end{axis}
\end{tikzpicture}
  &
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    enlargelimits=false,
    xtick={0,5,...,30},
    xmax=33,
    ymax=35,
    ytick={0,5,...,30},
    scaled ticks=false,
    axis lines=left,
    width=5cm,
    height=4cm,
    legend entries={Python},
    legend pos=north west,
    legend cell align=left]
\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
\end{axis}
\end{tikzpicture}\\
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    ylabel={time in secs},
    enlargelimits=false,
    xtick={0,5,...,30},
    xmax=33,
    ymax=35,
    ytick={0,5,...,30},
    scaled ticks=false,
    axis lines=left,
    width=5cm,
    height=4cm,
    legend entries={Java 8},
    legend pos=north west,
    legend cell align=left]
\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
\end{axis}
\end{tikzpicture}
  &
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    enlargelimits=false,
    xtick={0,5,...,30},
    xmax=33,
    ymax=35,
    ytick={0,5,...,30},
    scaled ticks=false,
    axis lines=left,
    width=5cm,
    height=4cm,
    legend entries={Dart},
    legend pos=north west,
    legend cell align=left]
\addplot[green,mark=*, mark options={fill=white}] table {re-dart.data};
\end{axis}
\end{tikzpicture}\\
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    ylabel={time in secs},
    enlargelimits=false,
    xtick={0,5,...,30},
    xmax=33,
    ymax=35,
    ytick={0,5,...,30},
    scaled ticks=false,
    axis lines=left,
    width=5cm,
    height=4cm,
    legend entries={Swift},
    legend pos=north west,
    legend cell align=left]
\addplot[purple,mark=*, mark options={fill=white}] table {re-swift.data};
\end{axis}
\end{tikzpicture}
  &
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    enlargelimits=true,
    restrict x to domain*=0:40000,
    restrict y to domain*=0:35,
    axis lines=left,
    width=5cm,
    height=4cm,
    legend entries={Java9+},
    legend pos=north west,
    legend cell align=left]
\addplot[orange,mark=*, mark options={fill=white}] table {re-java9.data};
\end{axis}
\end{tikzpicture}\\
\multicolumn{2}{c}{Graphs}
\end{tabular}
\caption{Graphs showing runtime for matching $(a^*)^*\,b$ with strings
           of the form $\protect\underbrace{aa..a}_{n}$ in various existing regular expression libraries.
   The reason for their superlinear behaviour is that they do a depth-first-search
   using NFAs.
   If the string does not match, the regular expression matching
   engine starts to explore all possibilities.}\label{fig:aStarStarb}
\end{figure}
\afterpage{\clearpage}
A more recent example is a global outage of all Cloudflare servers on 2 July
2019. A poorly written regular expression exhibited catastrophic backtracking
and exhausted CPUs that serve HTTP traffic.
Although the outage
had several causes, at the heart was a regular expression that
was used to monitor network
traffic.\footnote{\url{https://blog.cloudflare.com/details-of-the-cloudflare-outage-on-july-2-2019/}
(Last accessed in 2022)}
These problems with regular expressions are not isolated events that happen
very occasionally,
but are actually widespread.
They occur so often that they have a name: Regular-Expression-Denial-Of-Service (ReDoS)
attacks.
\citeauthor{Davis18} detected more
than 1000 evil regular expressions
in Node.js, Python core libraries, npm and pypi.
They therefore concluded that evil regular expressions
are real problems rather than ``a parlour trick''.

This work aims to address this issue
with the help of formal proofs.
We describe a lexing algorithm based
on Brzozowski derivatives with verified correctness (in Isabelle/HOL)
and a finiteness property.
Such properties are an important step in preventing
catastrophic backtracking once and for all.
We will give more details in the next sections
on (i) why the slow cases in figure~\ref{fig:aStarStarb}
can occur in traditional regular expression engines
and (ii) why we choose our approach based on Brzozowski derivatives and formal proofs.

\section{Preliminaries}
Regular expressions and regular expression matchers
have of course been studied for many, many years.
Theoretical results in automata theory state
that basic regular expression matching should be linear
w.r.t.\ the input.
This assumes that the regular expression
$r$ was pre-processed and turned into a
deterministic finite automaton (DFA) before matching \cite{Sakarovitch2009}.
By basic we mean textbook definitions such as the one
below, involving only regular expressions for characters, alternatives,
sequences, and Kleene stars:
\[
	r ::= c | r_1 + r_2 | r_1 \cdot r_2 | r^*
\]
Modern regular expression matchers used by programmers,
however,
support much richer constructs, such as bounded repetitions
and back-references.
To differentiate, we use the word \emph{regex} to refer
to those expressions with richer constructs while reserving the
term \emph{regular expression}
for the more traditional meaning in formal languages theory.
We follow this convention in this thesis.
In the future, we aim to support all the popular features of regexes,
but for this work we mainly look at basic regular expressions
and bounded repetitions.

Regexes come with a number of constructs
that make it more convenient for programmers to write regular expressions.
Depending on the types of constructs,
the task of matching and lexing with them
will have different levels of complexity.
Some of those constructs are just syntactic sugar, that is,
short-hand notations
that save the programmers a few keystrokes.
These will not cause problems for regex libraries.
For example, the
non-binary alternative involving three or more choices just means:
\[
	(a | b | c) \stackrel{means}{=} ((a + b) + c)
\]
Similarly, the range operator, which expresses the alternative
of all characters between its operands, is just a concise way of writing an alternative:
\[
	[0~-9]\stackrel{means}{=} (0 | 1 | \ldots | 9 ) \; \text{(all digits)}
\]
The
wildcard character $.$ is used to refer to any single character,
\[
	.
\stackrel{means}{=} [0-9a-zA-Z+-()*\&\ldots]
\]
except the newline.

\subsection{Bounded Repetitions}
More interesting are bounded repetitions, which can make the regular expressions much
more compact.
There are
$r^{\{n\}}$, $r^{\{\ldots m\}}$, $r^{\{n\ldots \}}$ and $r^{\{n\ldots m\}}$
(where $n$ and $m$ are constant natural numbers).
Like the star regular expression, the set of strings or language
a bounded regular expression can match
is defined using the power operation on sets:
\begin{center}
	\begin{tabular}{lcl}
		$L \; r^{\{n\}}$ & $\dn$ & $(L \; r)^n$\\
		$L \; r^{\{\ldots m\}}$ & $\dn$ & $\bigcup_{0 \leq i \leq m}. (L \; r)^i$\\
		$L \; r^{\{n\ldots \}}$ & $\dn$ & $\bigcup_{n \leq i}. (L \; r)^i$\\
		$L \; r^{\{n \ldots m\}}$ & $\dn$ & $\bigcup_{n \leq i \leq m}. (L \; r)^i$
	\end{tabular}
\end{center}
The attraction of bounded repetitions is that they can be
used to avoid a blow up: for example $r^{\{n\}}$
is a shorthand for
\[
	\underbrace{r\ldots r}_\text{n copies of r}.
\]
The problem with matching is that tools based on the classic notion of
automata need to expand $r^{\{n\}}$ into $n$ connected
copies of the automaton for $r$. This leads to very inefficient matching
algorithms or algorithms that consume large amounts of memory.
Implementations using $\DFA$s will
either become excruciatingly slow
(for example Verbatim++ \cite{Verbatimpp}) or get
out-of-memory errors (for example $\mathit{LEX}$ and
$\mathit{JFLEX}$\footnote{which are lexer generators
in C and Java that generate $\mathit{DFA}$-based
lexers. The user provides a set of regular expressions
and configurations, and then gets an output program encoding a minimized $\mathit{DFA}$
that can be compiled and run.
When given the above countdown regular expression,
already a small $n$ (a few dozen) would result in a determinised automaton
with millions of states.}) for large counters.
A classic example of this phenomenon is the regular expression $(a+b)^* a (a+b)^{n}$,
whose minimal DFA requires at least $2^{n+1}$ states.
For example, when $n$ is equal to 2,
the corresponding $\mathit{NFA}$ looks like:
\begin{center}
\begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto]
   \node[state,initial] (q_0)   {$q_0$};
   \node[state, red] (q_1) [right=of q_0] {$q_1$};
   \node[state, red] (q_2) [right=of q_1] {$q_2$};
   \node[state, accepting, red](q_3) [right=of q_2] {$q_3$};
    \path[->]
    (q_0) edge  node {a} (q_1)
          edge [loop below] node {a,b} ()
    (q_1) edge  node  {a,b} (q_2)
    (q_2) edge  node  {a,b} (q_3);
\end{tikzpicture}
\end{center}
which, when turned into a DFA by the subset construction,
requires at least $2^3$ states.\footnote{The red states are ``countdown states'', which count down
the number of characters needed in addition to the current
string to make a successful match.
For example, state $q_1$ indicates a match that has
gone past the $(a|b)^*$ part of $(a|b)^*a(a|b)^{\{2\}}$,
and has just consumed the ``delimiter'' $a$ in the middle; it needs to match two more iterations of $(a|b)$ to complete.
State $q_2$, on the other hand, can be viewed as a state
after $q_1$ has consumed one character, and just waits
for one more character to complete.
$q_3$ is the last state, requiring no more characters, and is accepting.
Depending on the suffix of the
input string up to the current read location,
the states $q_1$, $q_2$ and $q_3$
may or may not be active, independently of each other.
A $\mathit{DFA}$ for such an $\mathit{NFA}$ would
contain at least $2^3$ non-equivalent states that cannot be merged,
because the subset construction during determinisation will generate
all the elements in the power set $\mathit{Pow}\{q_1, q_2, q_3\}$.
Generalizing this to regular expressions with larger
bounded repetitions, we have that
regexes shaped like $r^*ar^{\{n\}}$, when converted to $\mathit{DFA}$s,
would require at least $2^{n+1}$ states if $r$ itself contains
more than one string.
This is to represent all the different scenarios in which ``countdown'' states are active.}
Bounded repetitions are very important because they
tend to occur often in practical use,
for example in the regex library RegExLib, in
the rules library of Snort \cite{Snort1999}\footnote{
Snort is a network intrusion detection (NID) tool
for monitoring network traffic.
The network security community curates a list
of malicious patterns written as regexes,
which is used by Snort's detection engine
to match against network traffic for any hostile
activities such as buffer overflow attacks.},
as well as in XML Schema definitions (XSDs).
According to Bj\"{o}rklund et al.\ \cite{xml2015},
more than half of the XSDs they found on the Maven.org central repository
have bounded regular expressions in them.
Often the counters are quite large, with the largest being
approximately ten million.
An example XSD they gave
is:
\begin{verbatim}
<sequence minOccurs="0" maxOccurs="65535">
 <element name="TimeIncr" type="mpeg7:MediaIncrDurationType"/>
 <element name="MotionParams" type="float" minOccurs="2" maxOccurs="12"/>
</sequence>
\end{verbatim}
This can be seen as the expression
$(ab^{\{2\ldots 12\}})^{\{0 \ldots 65535\}}$, where $a$ and $b$ are themselves
regular expressions
satisfying certain constraints (such as
the floating point number format).

It is therefore quite unsatisfying that
some regular expression matching libraries
impose ad hoc limits
on bounded regular expressions:
for example, in the regular expression matching library of the Go
language the regular expression $a^{\{1001\}}$ is not permitted, because no counter
can be above 1000; and in Rust's regular expression library,
expressions such as $a^{\{1000\}\{100\}\{5\}}$ give an error message
for being too big.
As Becchi and Crawley \cite{Becchi08} have pointed out,
the reason for these restrictions
is that they simulate a non-deterministic finite
automaton (NFA) with a breadth-first search.
This way the number of active states can
be as large as the counter value.
When the counters are large, the memory requirement can become
infeasible, and a regex engine
like Go will reject such a pattern straight away.
\begin{figure}[H]
\begin{center}
\begin{tikzpicture} [node distance = 2cm, on grid, auto]
	\node (q0) [state, initial] {$0$};
	\node (q1) [state, right = of q0] {$1$};
	\node (qdots) [right = of q1] {$\ldots$};
	\node (qn) [state, right = of qdots] {$n$};
	\node (qn1) [state, right = of qn] {$n+1$};
	\node (qn2) [state, right = of qn1] {$n+2$};
	\node (qn3) [state, accepting, right = of qn2] {$n+3$};
\path [-stealth, thick]
	(q0) edge [loop above] node {a} ()
	(q0) edge node {a} (q1)
	(q1) edge node {.} (qdots)
	(qdots) edge node {.} (qn)
	(qn) edge node {.} (qn1)
	(qn1) edge node {b} (qn2)
	(qn2) edge node {$c$} (qn3);
\end{tikzpicture}
\end{center}
\caption{The example given by Becchi and Crawley showing that NFA simulation can consume
large amounts of memory: $.^*a.^{\{n\}}bc$ matching strings of the form
$aaa\ldots aaaabc$. When traversing in a breadth-first manner, all states from $0$ up to
$n+1$ will become active.}
\end{figure}
These problems can of course be solved in matching algorithms where
automata go beyond the classic notion
and, for instance, include explicit
counters \cite{Turo_ov__2020}.
These solutions can be quite efficient,
with the ability to process
gigabytes of input strings per second
even with large counters \cite{Becchi08}.
But formal reasoning about these automata, especially in Isabelle,
can be challenging
and un-intuitive.
Therefore, we take correctness and runtime claims made about these solutions
with a grain of salt.

In the work reported in \cite{CSL2022} and here,
we add better support using derivatives
for bounded regular expressions $r^{\{n\}}$.
The results
extend straightforwardly to
repetitions with an interval such as $r^{\{n\ldots m\}}$.
The merit of Brzozowski derivatives (more on this later)
on this problem is that
they can be naturally extended to support bounded repetitions.
Moreover these extensions are still made up of only
inductive datatypes and recursive functions,
making them handy to deal with in theorem provers.
Finally, bounded regular expressions do not destroy our finite
boundedness property, which we shall prove later on.

\subsection{Back-References}
The other way to simulate an $\mathit{NFA}$ for matching is to choose
a single transition each time, keep all the other options in a queue
or stack, and backtrack if that choice eventually fails.
This method, often called a ``depth-first search'', is efficient in many cases, but can end up
with exponential run time.
The backtracking method is employed in regex libraries
that support \emph{back-references}, for example
in Java and Python.
Given a regular expression like this (the sequence
operator is omitted for brevity):
\begin{center}
	$r_1r_2r_3r_4$
\end{center}
one could label the sub-expressions of interest
by parenthesizing them and giving them a number in the order in which their opening parentheses appear.
One possible way of parenthesizing and labelling is:
\begin{center}
	$\underset{1}{(}r_1\underset{2}{(}r_2\underset{3}{(}r_3)\underset{4}{(}r_4)))$
\end{center}
The sub-expressions
$r_1r_2r_3r_4$, $r_2r_3r_4$, $r_3$ and $r_4$ are labelled
by 1 to 4, and can be ``referred back'' to by their respective numbers.
%These sub-expressions are called "capturing groups".
To do so, we use the syntax $\backslash i$
to denote that we want the sub-string
of the input just matched by the $i$-th
sub-expression to appear again,
exactly the same as it first appeared:
\begin{center}
$\ldots\underset{\text{i-th lparen}}{(}{r_i})\ldots
\underset{s_i \text{ which just matched} \;r_i}{\backslash i}$
\end{center}
A concrete example
of a back-reference is
\begin{center}
	$(.^*)\backslash 1$,
\end{center}
which matches
strings that can be split into two identical halves,
for example $\mathit{foofoo}$, $\mathit{ww}$ and so on.
Note that this is different from repeating the sub-expression verbatim like
\begin{center}
	$(.^*)(.^*)$,
\end{center}
which does not impose any restrictions on what strings the second sub-expression $.^*$
might match.
Another example of back-references is
\begin{center}
	$(.)(.)\backslash 2\backslash 1$,
\end{center}
which matches four-character palindromes
like $abba$, $x??x$ and so on.

Back-references are a regex construct that programmers find quite useful.
According to Becchi and Crawley \cite{Becchi08},
6\% of Snort rules (up until 2008) use them.
The most common use of back-references
is to express well-formed HTML files,
where back-references are convenient for matching
opening and closing tags like
\begin{center}
	$\langle html \rangle \ldots \langle / html \rangle$
\end{center}
A regex describing such a format
is
\begin{center}
	$\langle (.^+) \rangle \ldots \langle / \backslash 1 \rangle$
\end{center}
Despite being useful, the expressive power of regexes
goes beyond the regular language hierarchy
once back-references are included.
In fact, back-references allow regexes to express languages that cannot
even be contained in the context-free
languages.
For example, the back-reference $(a^*)b\backslash1 b \backslash 1$
expresses the language $\{a^n b a^n b a^n\mid n \in \mathbb{N}\}$,
which cannot be expressed by context-free grammars \parencite{campeanu2003formal}.
Such a language sits in the context-sensitive level
of the formal language hierarchy.
Moreover, the matching problem involving back-references
is known to be NP-complete \parencite{alfred2014algorithms}.
Regex libraries supporting back-references such as PCRE \cite{pcre} therefore have to
revert to a depth-first search algorithm that backtracks.
What is unexpected is that even in cases not involving back-references, there is still
a (non-negligible) chance they might backtrack super-linearly,
as shown in the graphs in figure~\ref{fig:aStarStarb}.

Summing up, we can categorise existing practical regex libraries into two kinds:
(i) those with linear-time guarantees like Go and Rust; the downside is that
they impose restrictions
on the regular expressions (not allowing back-references, bounded repetitions cannot exceed an ad hoc limit, etc.);
and (ii) those that allow large bounded regular expressions and back-references
at the expense of using backtracking algorithms.
The latter can potentially ``grind to a halt''
on some very simple cases, resulting in ReDoS attacks.

The problem with both approaches is the motivation for us to look again at the regular expression matching problem.
Another motivation is that regular expression matching algorithms
that follow the POSIX standard often contain errors and bugs, as we shall explain next.
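Before turning to POSIX matching, we note that the back-reference constructs from the
previous subsection are easy to try out on the JVM, whose backtracking engine supports
them via the $\backslash 1, \backslash 2, \ldots$ syntax. The following small Scala
snippet (ours, purely illustrative) runs the two example regexes from above:

\begin{lstlisting}[style=myScalastyle]
// Illustrative only: the JVM's backtracking regex engine supports
// back-references written \1, \2, ... as described above.
object BackrefDemo extends App {
  val halves = "(.*)\\1"            // strings made of two identical halves
  println("foofoo".matches(halves))        // true
  println("foobar".matches(halves))        // false

  val palindrome4 = "(.)(.)\\2\\1"  // four-character palindromes
  println("abba".matches(palindrome4))     // true
  println("abab".matches(palindrome4))     // false
}
\end{lstlisting}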
\section{Error-prone POSIX Implementations}
Very often there are multiple ways of matching a string
with a regular expression.
In such cases the regular expression matcher needs to
disambiguate.
The more widely used strategy is called POSIX,
which roughly speaking always chooses the longest initial match.
The POSIX strategy is widely adopted in many regular expression matchers.
However, many implementations (including the C libraries
used by Linux and OS X distributions) contain bugs
or do not meet the specification they claim to adhere to.
Kuklewicz maintains a unit test repository which lists some
problems with existing regular expression engines \cite{KuklewiczHaskell}.
In some cases, they either fail to generate a
result when there exists a match,
or give results that are inconsistent with the POSIX standard.
A concrete example is the regex
\begin{center}
	$(aba + ab + a)^*$
\end{center}
matched against the string $ababa$.
The correct POSIX match for the above
is the entire string $ababa$,
split into two Kleene star iterations, namely $[ab]$ and $[aba]$ at positions
$[0, 2)$ and $[2, 5)$
respectively.
But trying this out in regex101 \parencite{regex101}\footnote{
	regex101 is an online regular expression matcher which
	provides an API for trying out regular expression
	engines of multiple popular programming languages like
	Java, Python, Go, etc.} with different engines
always yields the matches $[aba]$ at position $[0, 3)$
and $a$ at position $[4, 5)$.
Fowler \cite{fowler2003} and Kuklewicz \cite{KuklewiczHaskell} commented that most regex libraries do not
correctly implement the central POSIX
rule, called the maximum munch rule.
Grathwohl \parencite{grathwohl2014crash} wrote:
\begin{quote}\it
	``The POSIX strategy is more complicated than the
	greedy because of the dependence on information about
	the length of matched strings in the various
	subexpressions.''
\end{quote}
\noindent
We think the implementation complexity of the POSIX rules also comes from
the specification not being very precise.
There are many informal summaries of this disambiguation
strategy, which are often quite long and delicate.
For example Kuklewicz \cite{KuklewiczHaskell}
described the POSIX rule as (section 1, last paragraph):
\begin{quote}
	\begin{itemize}
		\item
regular expressions (REs) take the leftmost starting match, and the longest match starting there;
earlier subpatterns have leftmost-longest priority over later subpatterns\\
\item
higher-level subpatterns have leftmost-longest priority over their component subpatterns\\
\item
REs have right associative concatenation which can be changed with parenthesis\\
\item
parenthesized subexpressions return the match from their last usage\\
\item
text of
component subexpressions must be contained in the text of the higher-level subexpressions\\
\item
if ``p'' and ``q'' can never match the same text then ``p|q'' and ``q|p'' are equivalent, up to trivial renumbering of captured subexpressions\\
\item
if ``p'' in ``p*'' is used to capture non-empty text then additional repetitions of ``p'' will not capture an empty string\\
\end{itemize}
\end{quote}
Ausaf et al.\ \cite{AusafDyckhoffUrban2016}
were the first to
give a quite simple formalised POSIX
specification in Isabelle/HOL, and also prove
that their specification coincides with the
POSIX specification given by Okui and Suzuki \cite{Okui10}.
They then formally proved the correctness of
a lexing algorithm by Sulzmann and Lu \cite{Sulzmann2014}
with regard to that specification.
They also found that the informal POSIX
specification by Sulzmann and Lu cannot be used for the correctness proof.
In the next section, we will briefly
introduce Brzozowski derivatives and Sulzmann
and Lu's algorithm, on which the main contribution of this thesis builds.

\section{Formal Specification of POSIX Matching and Brzozowski Derivatives}
Brzozowski \cite{Brzozowski1964} first introduced the
concept of a \emph{derivative} of a regular expression in 1964.
The derivative of a regular expression $r$
with respect to a character $c$ is written as $r \backslash c$.
This operation tells us what $r$ transforms into
if we ``chop'' off the first character $c$
from all strings in the language of $r$ (defined
later as $L \; r$).
Derivatives have the property
that $s \in L \; (r\backslash c)$ if and only if $c::s \in L \; r$,
where $::$ stands for list prepending.
With this property, derivatives give a simple solution
to the problem of matching a string $s$ with a regular
expression $r$: if the derivative of $r$ w.r.t.\ (in
succession) all the characters of the string matches the empty string,
then $r$ matches $s$ (and {\em vice versa}).
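This matching procedure is short enough to spell out completely. The following Scala
sketch is our own rendering of the standard textbook clauses for the basic regular
expressions from the Preliminaries section (it is not code taken from any of the cited
formalisations): $\textit{nullable}$ tests whether a regular expression can match the
empty string, $\textit{der}$ builds the derivative w.r.t.\ one character, and the matcher
folds $\textit{der}$ over the input string and checks nullability at the end.

\begin{lstlisting}[style=myScalastyle]
// Basic regular expressions: characters, alternatives, sequences, stars
abstract class Rexp
case object ZERO extends Rexp                      // matches nothing
case object ONE extends Rexp                       // matches only the empty string
case class CHAR(c: Char) extends Rexp
case class ALT(r1: Rexp, r2: Rexp) extends Rexp
case class SEQ(r1: Rexp, r2: Rexp) extends Rexp
case class STAR(r: Rexp) extends Rexp

// can r match the empty string?
def nullable(r: Rexp): Boolean = r match {
  case ZERO        => false
  case ONE         => true
  case CHAR(_)     => false
  case ALT(r1, r2) => nullable(r1) || nullable(r2)
  case SEQ(r1, r2) => nullable(r1) && nullable(r2)
  case STAR(_)     => true
}

// Brzozowski derivative of r w.r.t. the character c
def der(c: Char, r: Rexp): Rexp = r match {
  case ZERO        => ZERO
  case ONE         => ZERO
  case CHAR(d)     => if (c == d) ONE else ZERO
  case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))
  case SEQ(r1, r2) =>
    if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
    else SEQ(der(c, r1), r2)
  case STAR(r1)    => SEQ(der(c, r1), STAR(r1))
}

// s is in L(r) iff the derivative of r w.r.t. all of s is nullable
def matcher(r: Rexp, s: String): Boolean =
  nullable(s.foldLeft(r)((r1, c) => der(c, r1)))
\end{lstlisting}

\noindent
For example, \texttt{matcher(SEQ(CHAR('a'), STAR(CHAR('b'))), "abb")} evaluates to
\texttt{true}, whereas feeding the same regular expression the string \texttt{"ba"}
yields \texttt{false}.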
There have been several mechanised proofs of this property in various theorem
provers,
for example one by Owens and Slind \cite{Owens2008} in HOL4,
another one by Krauss and Nipkow \cite{Nipkow98} in Isabelle/HOL, and
yet another in Coq by Coquand and Siles \cite{Coquand2012}.

In addition, one can extend derivatives to bounded repetitions
relatively straightforwardly. For example, the derivative of a bounded repetition can be defined as (for $n > 0$):
\begin{center}
	\begin{tabular}{lcl}
		$r^{\{n\}} \backslash c$     & $\dn$ & $r \backslash c \cdot r^{\{n-1\}}$\\
	\end{tabular}
\end{center}
\noindent
Experimental results suggest that, unlike DFA-based solutions
for bounded regular expressions,
derivatives can cope with
large counters
quite well.

There have also been extensions of derivatives to other constructs.
For example, Owens et al.\ include the derivative
for the \emph{NOT} regular expression, which is
able to concisely express C-style comments of the form
$/* \ldots */$ (see \cite{Owens2008}).
Another extension is
regular expressions with look-aheads, done
by Miyazaki and Minamide \cite{Takayuki2019}.

Given the above definitions and properties of
Brzozowski derivatives, one quickly realises their potential
in generating a formally verified algorithm for lexing: the clauses and the property
can be easily expressed in a functional programming language or converted to theorem prover
code, and they are easy to extend.
Perhaps this is the reason why derivatives have sparked quite a bit of interest
in the functional programming and theorem prover communities in the last
fifteen or so years
(\cite{Almeidaetal10}, \cite{Berglund14}, \cite{Berglund18},
\cite{Chen12} and \cite{Coquand2012}
to name a few), despite being buried in the ``sands of time'' \cite{Owens2008}
after they were first published by Brzozowski.

However, there are two difficulties with derivative-based matchers:
First, Brzozowski's original matcher only generates a yes/no answer
for whether a regular expression matches a string or not.  This is too
little information in the context of lexing where separate tokens must
be identified and also classified (for example as keywords
or identifiers).
Second, derivative-based matchers need to be more efficient.
Elegant and beautiful
as many implementations are,
they can be excruciatingly slow.
For example, Sulzmann and Lu
claim a linear running time for their proposed algorithm,
but that was falsified by our experiments. The running time
is actually $\Omega(2^n)$ in the worst case.
A similar claim about a theoretical runtime of $O(n^2)$
is made for the Verbatim \cite{Verbatim}
lexer, which calculates POSIX matches and is based on derivatives.
They formalized the correctness of the lexer, but not their complexity result.
In the performance evaluation section, they analyzed the run time
of matching $a$ with the string
\begin{center}
	$\underbrace{a \ldots a}_{\text{n a's}}$.
\end{center}
\noindent
They concluded that the algorithm is quadratic in terms of
the length of the input string.
When we tried out their extracted OCaml code with our example $(a+aa)^*$,
the time it took to match a string of 40 $a$'s was approximately 5 minutes.

\subsection{Sulzmann and Lu's Algorithm}
Sulzmann and Lu~\cite{Sulzmann2014} overcame the first
problem with the yes/no answer
by cleverly extending Brzozowski's matching
algorithm.
Their extended version generates additional information on
\emph{how} a regular expression matches a string following the POSIX
rules for regular expression matching. They achieve this by adding a
second ``phase'' to Brzozowski's algorithm involving an injection
function.
In earlier work, Ausaf et al.\ provided the formal
specification of what POSIX matching means and proved in Isabelle/HOL
the correctness
of this extended algorithm accordingly
\cite{AusafDyckhoffUrban2016}.

The version of the algorithm proven correct
suffers heavily from a
second difficulty, where the internal derivatives can
grow to arbitrarily big sizes.
For example, if we start with the
regular expression $(a+aa)^*$ and take
successive derivatives according to the character $a$, we end up with
a sequence of ever-growing derivatives like

\def\ll{\stackrel{\_\backslash{} a}{\longrightarrow}}
\begin{center}
\begin{tabular}{rll}
$(a + aa)^*$ & $\ll$ & $(\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & $(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* \;+\; (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & $(\ZERO + \ZERO{}a + \ZERO) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^* \;+\; $\\
& & $\qquad(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & \ldots \hspace{15mm}(regular expressions of sizes 98, 169, 283, 468, 767, \ldots)
\end{tabular}
\end{center}

\noindent where after around 35 steps we run out of memory on a
typical computer.  Clearly, the
notation involving $\ZERO$s and $\ONE$s already suggests
simplification rules that can be applied to regular
expressions, for example $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r
\Rightarrow r$, $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow
r$. While such simple-minded simplifications have been proved in
the work by Ausaf et al.\ to preserve the correctness of Sulzmann and Lu's
algorithm \cite{AusafDyckhoffUrban2016}, they unfortunately do
\emph{not} help with limiting the growth of the derivatives shown
above: the growth is slowed, but the derivatives can still grow rather
quickly beyond any finite bound.

Therefore we want to look in this thesis at a second
algorithm by Sulzmann and Lu where they
overcame this ``growth problem'' \cite{Sulzmann2014}.
In this version, POSIX values are
represented as bit sequences and such sequences are incrementally generated
when derivatives are calculated. The compact representation
of bit sequences and regular expressions allows them to define a more
``aggressive'' simplification method that keeps the size of the
derivatives finite no matter what the length of the string is.
They make some informal claims about the correctness and linear behaviour
of this version, but do not provide any supporting proof arguments, not
even ``pencil-and-paper'' arguments. They write about their bit-coded
\emph{incremental parsing method} (that is the algorithm to be formalised
in this dissertation):

\begin{quote}\it
  ``Correctness Claim: We further claim that the incremental parsing
  method [..] in combination with the simplification steps [..]
  yields POSIX parse trees. We have tested this claim
  extensively [..]
but yet have to work out all proof details.'' \cite[Page 14]{Sulzmann2014}
\end{quote}

\noindent
Ausaf and Urban made some initial progress towards the
full correctness proof but still had to leave out the optimisation
Sulzmann and Lu proposed.
Ausaf wrote \cite{Ausaf}:

\begin{quote}\it
``The next step would be to implement a more aggressive simplification procedure on annotated regular expressions and then prove the corresponding algorithm generates the same values as blexer. Alas due to time constraints we are unable to do so here.''
\end{quote}

\noindent
This thesis implements the aggressive simplifications envisioned
by Ausaf and Urban,
together with a formal proof of the correctness of those simplifications.

One of the most recent pieces of work in the context of lexing
is the Verbatim lexer by Egolf, Lasser and Fisher \cite{Verbatim}.
This is relevant work for us and we will compare it later with
the derivative-based matcher we are going to present.
There is also some newer work called
Verbatim++ \cite{Verbatimpp}, which does not use derivatives,
but deterministic finite automata instead.

%----------------------------------------------------------------------------------------
\section{Contribution}
In this thesis,
we propose a solution to catastrophic
backtracking and error-prone matchers: a formally verified
regular expression lexing algorithm
that is both fast
and correct, obtained by extending Ausaf et al.'s work.
The end result is a regular expression lexing algorithm that comes with
\begin{itemize}
\item
an improved version of Sulzmann and Lu's bit-coded algorithm using
derivatives with simplifications,
accompanied by
a proven correctness theorem according to the POSIX specification
given by Ausaf et al.\ \cite{AusafDyckhoffUrban2016},
\item
a complexity-related property for that algorithm saying that the
internal data structure will
remain finite,
\item
and an extension to
the bounded repetition construct with the correctness and finiteness properties
maintained.
\end{itemize}
\noindent
With a formal finiteness bound in place,
we can greatly reduce the attack surface of servers in terms of ReDoS attacks.
Further improvements to the algorithm with an even stronger version of
simplification can be made.

\section{Structure of the thesis}
In chapter \ref{Inj} we will introduce the concepts
and notations we
use for describing regular expressions and derivatives,
and the first version of Sulzmann and Lu's lexing algorithm without bitcodes (including
its correctness proof).
We will give their second lexing algorithm with bitcodes in chapter \ref{Bitcoded1}
together with the correctness proof by Ausaf and Urban.
Then we illustrate in chapter \ref{Bitcoded2}
how Sulzmann and Lu's
simplifications fail to simplify derivatives sufficiently. We therefore introduce our own version of the
algorithm with simplification and
its correctness proof.
In chapter \ref{Finite} we give the second guarantee
of our bitcoded algorithm, that is, a finite bound on the size of any
regular expression's derivatives.
In chapter \ref{Cubic} we discuss stronger simplification rules which
improve the finite bound to a polynomial bound, and also show how one can extend the
algorithm to include bounded repetitions.
%and the NOT regular expression.%----------------------------------------------------------------------------------------%----------------------------------------------------------------------------------------%----------------------------------------------------------------------------------------%----------------------------------------------------------------------------------------