% Chapter 1\chapter{Introduction} % Main chapter title\label{Introduction} % For referencing the chapter elsewhere, use \ref{Chapter1} %----------------------------------------------------------------------------------------% Define some commands to keep the formatting separated from the content \newcommand{\keyword}[1]{\textbf{#1}}\newcommand{\tabhead}[1]{\textbf{#1}}\newcommand{\code}[1]{\texttt{#1}}\newcommand{\file}[1]{\texttt{\bfseries#1}}\newcommand{\option}[1]{\texttt{\itshape#1}}%boxes\newcommand*{\mybox}[1]{\framebox{\strut #1}}%\newcommand{\sflataux}[1]{\textit{sflat}\_\textit{aux} \, #1}\newcommand\sflat[1]{\llparenthesis #1 \rrparenthesis }\newcommand{\ASEQ}[3]{\textit{ASEQ}_{#1} \, #2 \, #3}\newcommand{\bderssimp}[2]{#1 \backslash_{bsimps} #2}\newcommand{\rderssimp}[2]{#1 \backslash_{rsimps} #2}\def\derssimp{\textit{ders}\_\textit{simp}}\def\rders{\textit{rders}}\newcommand{\bders}[2]{#1 \backslash #2}\newcommand{\bsimp}[1]{\textit{bsimp}(#1)}\def\bsimps{\textit{bsimp}}\newcommand{\rsimp}[1]{\textit{rsimp}\; #1}\newcommand{\sflataux}[1]{\llparenthesis #1 \rrparenthesis'}\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%\newcommand{\denote}{\stackrel{\mbox{\scriptsize denote}}{=}}%\newcommand{\ZERO}{\mbox{\bf 0}}\newcommand{\ONE}{\mbox{\bf 1}}\newcommand{\AALTS}[2]{\oplus {\scriptstyle #1}\, #2}\newcommand{\rdistinct}[2]{\textit{rdistinct} \;\; #1 \;\; #2}\def\rdistincts{\textit{rdistinct}}\def\rDistinct{\textit{rdistinct}}\newcommand\hflat[1]{\llparenthesis #1 \rrparenthesis_*}\newcommand\hflataux[1]{\llparenthesis #1 \rrparenthesis_*'}\newcommand\createdByStar[1]{\textit{createdByStar}(#1)}\newcommand\myequiv{\mathrel{\stackrel{\makebox[0pt]{\mbox{\normalfont\tiny equiv}}}{=}}}\def\SEQ{\textit{SEQ}}\def\SEQs{\textit{SEQs}}\def\case{\textit{case}}\def\sequal{\stackrel{\mbox{\scriptsize 
rsimp}}{=}}\def\rsimpalts{\textit{rsimp}_{ALTS}}\def\good{\textit{good}}\def\btrue{\textit{true}}\def\bfalse{\textit{false}}\def\bnullable{\textit{bnullable}}\def\bnullables{\textit{bnullables}}\def\Some{\textit{Some}}\def\None{\textit{None}}\def\code{\textit{code}}\def\decode{\textit{decode}}\def\internalise{\textit{internalise}}\def\lexer{\mathit{lexer}}\def\mkeps{\textit{mkeps}}\newcommand{\rder}[2]{#2 \backslash_r #1}\def\rerases{\textit{rerase}}\def\nonnested{\textit{nonnested}}\def\AZERO{\textit{AZERO}}\def\sizeNregex{\textit{sizeNregex}}\def\AONE{\textit{AONE}}\def\ACHAR{\textit{ACHAR}}\def\simpsulz{\textit{simp}_{Sulz}}\def\scfrewrites{\stackrel{*}{\rightsquigarrow_{scf}}}\def\frewrite{\rightsquigarrow_f}\def\hrewrite{\rightsquigarrow_h}\def\grewrite{\rightsquigarrow_g}\def\frewrites{\stackrel{*}{\rightsquigarrow_f}}\def\hrewrites{\stackrel{*}{\rightsquigarrow_h}}\def\grewrites{\stackrel{*}{\rightsquigarrow_g}}\def\fuse{\textit{fuse}}\def\bder{\textit{bder}}\def\der{\textit{der}}\def\POSIX{\textit{POSIX}}\def\ALTS{\textit{ALTS}}\def\ASTAR{\textit{ASTAR}}\def\DFA{\textit{DFA}}\def\NFA{\textit{NFA}}\def\bmkeps{\textit{bmkeps}}\def\bmkepss{\textit{bmkepss}}\def\retrieve{\textit{retrieve}}\def\blexer{\textit{blexer}}\def\flex{\textit{flex}}\def\inj{\textit{inj}}\def\Empty{\textit{Empty}}\def\Left{\textit{Left}}\def\Right{\textit{Right}}\def\Stars{\textit{Stars}}\def\Char{\textit{Char}}\def\Seq{\textit{Seq}}\def\Der{\textit{Der}}\def\Ders{\textit{Ders}}\def\nullable{\mathit{nullable}}\def\Z{\mathit{Z}}\def\S{\mathit{S}}\def\rup{r^\uparrow}%\def\bderssimp{\mathit{bders}\_\mathit{simp}}\def\distinctWith{\textit{distinctWith}}\def\lf{\textit{lf}}\def\PD{\textit{PD}}\def\suffix{\textit{Suffix}}\def\distinctBy{\textit{distinctBy}}\def\starupdate{\textit{starUpdate}}\def\starupdates{\textit{starUpdates}}\def\size{\mathit{size}}\def\rexp{\mathbf{rexp}}\def\simp{\mathit{simp}}\def\simpALTs{\mathit{simp}\_\mathit{ALTs}}\def\map{\mathit{map}}\def\distinct{\mathit{distinct}
}\def\blexersimp{\mathit{blexer}\_\mathit{simp}}\def\blexerStrong{\textit{blexerStrong}}\def\bsimpStrong{\textit{bsimpStrong}}\def\bdersStrongs{\textit{bdersStrong}}\newcommand{\bdersStrong}[2]{#1 \backslash_{bsimpStrongs} #2}\def\map{\textit{map}}\def\rrexp{\textit{rrexp}}\newcommand\rnullable[1]{\textit{rnullable} \; #1 }\newcommand\rsize[1]{\llbracket #1 \rrbracket_r}\newcommand\asize[1]{\llbracket #1 \rrbracket}\newcommand\rerase[1]{ (#1)_{\downarrow_r}}\newcommand\ChristianComment[1]{\textcolor{blue}{#1}\\}\def\rflts{\textit{rflts}}\def\rrewrite{\textit{rrewrite}}\def\bsimpalts{\textit{bsimp}_{ALTS}}\def\bsimpaseq{\textit{bsimp}_{ASEQ}}\def\rsimlalts{\textit{rsimp}_{ALTs}}\def\rsimpseq{\textit{rsimp}_{SEQ}}\def\erase{\textit{erase}}\def\STAR{\textit{STAR}}\def\flts{\textit{flts}}\def\zeroable{\textit{zeroable}}\def\nub{\textit{nub}}\def\filter{\textit{filter}}%\def\not{\textit{not}}\def\RZERO{\mathbf{0}_r }\def\RONE{\mathbf{1}_r}\newcommand\RCHAR[1]{\mathbf{#1}_r}\newcommand\RSEQ[2]{#1 \cdot #2}\newcommand\RALTS[1]{\sum #1}\newcommand\RSTAR[1]{#1^*}\newcommand\vsuf[2]{\textit{Suffix} \;#1\;#2}\lstdefinestyle{myScalastyle}{ frame=tb, language=scala, aboveskip=3mm, belowskip=3mm, showstringspaces=false, columns=flexible, basicstyle={\small\ttfamily}, numbers=none, numberstyle=\tiny\color{gray}, keywordstyle=\color{blue}, commentstyle=\color{dkgreen}, stringstyle=\color{mauve}, frame=single, breaklines=true, breakatwhitespace=true, tabsize=3,}%----------------------------------------------------------------------------------------%This part is about regular expressions, Brzozowski derivatives,%and a bit-coded lexing algorithm with proven correctness and time bounds.%TODO: look up snort rules to use here--give readers idea of what regexes look likeRegular expressions are widely used in computer science: be it in text-editors \parencite{atomEditor} with syntax highlighting and auto-completion;command-line tools like $\mathit{grep}$ that facilitate easy 
text-processing; network intrusiondetection systems that reject suspicious traffic; or compilerfront ends--the majority of the solutions to these tasks involve lexing with regular expressions.Given its usefulness and ubiquity, one would imagine thatmodern regular expression matching implementationsare mature and fully studied.Indeed, in a popular programming language's regex engine, supplying it with regular expressions and strings,in most cases one canget the matching information in a very short time.Those matchers can be blindingly fast--some network intrusion detection systemsuse regex engines that are able to process megabytes or even gigabytes of data per second \parencite{Turo_ov__2020}.However, those matchers can exhibit a surprising security vulnerabilityunder a certain class of inputs.%However, , this is not the case for $\mathbf{all}$ inputs.%TODO: get source for SNORT/BRO's regex matching engine/speed\begin{figure}[p]\begin{tabular}{@{}c@{\hspace{0mm}}c@{\hspace{0mm}}c@{}}\begin{tikzpicture}\begin{axis}[ xlabel={$n$}, x label style={at={(1.05,-0.05)}}, ylabel={time in secs}, enlargelimits=false, xtick={0,5,...,30}, xmax=33, ymax=35, ytick={0,5,...,30}, scaled ticks=false, axis lines=left, width=5cm, height=4cm, legend entries={JavaScript}, legend pos=north west, legend cell align=left]\addplot[red,mark=*, mark options={fill=white}] table {re-js.data};\end{axis}\end{tikzpicture} &\begin{tikzpicture}\begin{axis}[ xlabel={$n$}, x label style={at={(1.05,-0.05)}}, %ylabel={time in secs}, enlargelimits=false, xtick={0,5,...,30}, xmax=33, ymax=35, ytick={0,5,...,30}, scaled ticks=false, axis lines=left, width=5cm, height=4cm, legend entries={Python}, legend pos=north west, legend cell align=left]\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};\end{axis}\end{tikzpicture} &\begin{tikzpicture}\begin{axis}[ xlabel={$n$}, x label style={at={(1.05,-0.05)}}, %ylabel={time in secs}, enlargelimits=false, xtick={0,5,...,30}, xmax=33, ymax=35, 
ytick={0,5,...,30}, scaled ticks=false, axis lines=left, width=5cm, height=4cm, legend entries={Java 8}, legend pos=north west, legend cell align=left]\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};\end{axis}\end{tikzpicture}\\\begin{tikzpicture}\begin{axis}[ xlabel={$n$}, x label style={at={(1.05,-0.05)}}, ylabel={time in secs}, enlargelimits=false, xtick={0,5,...,30}, xmax=33, ymax=35, ytick={0,5,...,30}, scaled ticks=false, axis lines=left, width=5cm, height=4cm, legend entries={Dart}, legend pos=north west, legend cell align=left]\addplot[green,mark=*, mark options={fill=white}] table {re-dart.data};\end{axis}\end{tikzpicture} &\begin{tikzpicture}\begin{axis}[ xlabel={$n$}, x label style={at={(1.05,-0.05)}}, %ylabel={time in secs}, enlargelimits=false, xtick={0,5,...,30}, xmax=33, ymax=35, ytick={0,5,...,30}, scaled ticks=false, axis lines=left, width=5cm, height=4cm, legend entries={Swift}, legend pos=north west, legend cell align=left]\addplot[purple,mark=*, mark options={fill=white}] table {re-swift.data};\end{axis}\end{tikzpicture} & \\\multicolumn{3}{c}{Graphs}\end{tabular} \caption{Graphs showing runtime for matching $(a^*)^*\,b$ with strings of the form $\protect\underbrace{aa..a}_{n}$ in various existing regular expression libraries. The reason for their superlinear behaviour is that they do a depth-first-search. If the string does not match, the engine starts to explore all possibilities. }\label{fig:aStarStarb}\end{figure}\afterpage{\clearpage}Take $(a^*)^*\,b$ and ask whetherstrings of the form $aa..a$ match this regularexpression. Obviously this is not the case---the expected $b$ in the lastposition is missing. One would expect that modern regular expressionmatching engines can find this out very quickly. 
Alas, if one tries
this example in JavaScript, Python or Java 8, even with strings of a small
length, say around 30 $a$'s, one discovers that this decision takes an
unexpectedly long time to finish given the simplicity of the problem.
This is clearly exponential behaviour, and is triggered by some relatively
simple regex patterns, as the graphs in Figure~\ref{fig:aStarStarb} show.
Java 9 and newer
versions improve this behaviour, but are still slow compared
with the approach we are going to use.
This superlinear blowup in regular expression engines
has repeatedly caused grief in real life.
For example, on 20 July 2016 one evil
regular expression brought the webpage
\href{http://stackexchange.com}{Stack Exchange} to its
knees.\footnote{\url{https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}
(Last accessed in 2019)}
In this instance, a regular expression intended to just trim white
spaces from the beginning and the end of a line actually consumed
massive amounts of CPU resources---causing web servers to grind to a
halt. In this example, the time needed to process
the string was $O(n^2)$ with respect to the string length. This
quadratic overhead was enough for the homepage of Stack Exchange to
respond so slowly that the load balancer assumed a $\mathit{DoS}$
attack and therefore stopped the servers from responding to any
requests. This made the whole site unavailable.
A more recent example is a global outage of all Cloudflare servers on 2 July
2019. A poorly written regular expression exhibited exponential
behaviour and exhausted CPUs that serve HTTP traffic.
Although the outage
had several causes, at the heart was a regular expression that
was used to monitor network
traffic.\footnote{\url{https://blog.cloudflare.com/details-of-the-cloudflare-outage-on-july-2-2019/}
(Last accessed in 2022)}
These problems with regular expressions are not isolated events that happen
very occasionally, but are actually widespread.
They occur so often that they have been given a
name---Regular-Expression-Denial-Of-Service (ReDoS)
attacks.
\citeauthor{Davis18} detected more
than 1000 super-linear (SL) regular expressions
in Node.js, Python core libraries, and npm and pypi.
They therefore concluded that evil regular expressions
are problems ``more than a parlour trick'', but one that
requires
more research attention.
\ChristianComment{I am not totally sure where this sentence should be
put, seems a little out-standing here.}
Regular expressions and regular expression matchers
have of course been studied for many, many years.
One of the most recent works in the context of lexing
is the Verbatim lexer by Egolf, Lasser and Fisher~\cite{Verbatim}.
This is relevant work and we will later compare it with
the derivative-based matcher we are going to present.
There is also some newer work called
Verbatim++~\cite{Verbatimpp}, which does not use derivatives, but automata instead.
For that approach the problem is dealing with bounded regular expressions of the form
$r^{n}$ where $n$ is a constant specifying that $r$ must repeat
exactly $n$ times.
The other repetition constructs include
$r^{\ldots m}$, $r^{n\ldots}$ and $r^{n\ldots m}$ which respectively mean repeating
at most $m$ times, repeating at least $n$ times and repeating between $n$ and $m$ times.
Their formal definitions will be given later.
Bounded repetitions are important because they
tend to occur often in practical use~\cite{xml2015}, for example in RegExLib,
Snort, as well as in XML Schema definitions (XSDs).
One XSD that seems to be related to the MPEG-7 standard involves
the regular expression below:
\begin{verbatim}
<sequence minOccurs="0"
maxOccurs="65535">
 <element name="TimeIncr" type="mpeg7:MediaIncrDurationType"/>
 <element name="MotionParams" type="float" minOccurs="2" maxOccurs="12"/>
</sequence>
\end{verbatim}
This is just a fancy way of writing the regular expression
$(ab^{2\ldots 12})^{0 \ldots 65535}$, where $a$ and $b$ are themselves
regular expressions satisfying certain constraints such as the floating
point number format.
The problems are not limited to slowness on certain cases.
Another thing about these libraries is that there
is no correctness guarantee.
In some cases, they either fail to generate a lexing result when there exists a match,
or give results that are inconsistent with the $\POSIX$ standard.
A concrete example would be the regex
\begin{center}
$(aba + ab + a)^*$ and the string $ababa$
\end{center}
The correct $\POSIX$ match for the above would be
with the entire string $ababa$,
split into two Kleene star iterations, $[ab] [aba]$ at positions
$[0, 2), [2, 5)$
respectively.
But trying this out in regex101~\parencite{regex101}
with different language engines would yield
the same two fragmented matches: $[aba]$ at $[0, 3)$
and $a$ at $[4, 5)$.
Kuklewicz~\parencite{KuklewiczHaskell} commented that most regex libraries are not
correctly implementing the POSIX (maximum-munch)
rule of regular expression matching.
As Grathwohl~\parencite{grathwohl2014crash} wrote,
\begin{quote}
The POSIX strategy is more complicated than the greedy because of
the dependence on information about the length of matched strings in the
various subexpressions.
\end{quote}
%\noindent
To summarise the above, regular expressions are important.
They are popular and programming languages' library functions
for them are very fast on non-catastrophic cases.
But there are problems with current practical implementations.
The first problem is that the running time might blow up.
The second problem is that they might be error-prone on certain
very simple cases.
In the next part of the chapter, we will look into reasons why certain regex engines are running
horribly slow on the "catastrophic"cases and propose a solution that addresses both of these problemsbased on Brzozowski and Sulzmann and Lu's work. \section{Why are current regex engines slow?}%find literature/find out for yourself that REGEX->DFA on basic regexes%does not blow up the sizeShouldn't regular expression matching be linear?How can one explain the super-linear behaviour of the regex matching engines we have?The time cost of regex matching algorithms in generalinvolve two different phases, and different things can go differently wrong on these phases.$\DFA$s usually have problems in the first (construction) phase, whereas $\NFA$s usually run into troubleon the second phase.\subsection{Different Phases of a Matching/Lexing Algorithm}Most lexing algorithms can be roughly divided into two phases during its run.The first phase is the "construction" phase,in which the algorithm builds some suitable data structure from the input regex $r$, so thatit can be easily operated on later.We denotethe time cost for such a phase by $P_1(r)$.The second phase is the lexing phase, when the input string $s$ is read and the data structurerepresenting that regex $r$ is being operated on. We represent the timeit takes by $P_2(r, s)$.\\For $\mathit{DFA}$,we have $P_2(r, s) = O( |s| )$,because we take at most $|s|$ steps, and each step takesat most one transition--a deterministic-finite-automataby definition has at most one state active and at most onetransition upon receiving an input symbol.But unfortunately in the worst case$P_1(r) = O(exp^{|r|})$. An example will be given later. 
For $\mathit{NFA}$s, we have $P_1(r) = O(|r|)$ if we do not unfold expressions like $r^n$ into \[ \underbrace{r \cdots r}_{\text{n copies of r}}.\]The $P_2(r, s)$ is bounded by $|r|\cdot|s|$, if we do not backtrack.On the other hand, if backtracking is used, the worst-case time bound bloatsto $|r| * 2^{|s|}$.%on the input%And when calculating the time complexity of the matching algorithm,%we are assuming that each input reading step requires constant time.%which translates to that the number of %states active and transitions taken each time is bounded by a%constant $C$.%But modern regex libraries in popular language engines% often want to support much richer constructs than just% sequences and Kleene stars,%such as negation, intersection, %bounded repetitions and back-references.%And de-sugaring these "extended" regular expressions %into basic ones might bloat the size exponentially.%TODO: more reference for exponential size blowup on desugaring. \subsection{Why $\mathit{DFA}s$ can be slow in the first phase}The good things about $\mathit{DFA}$s is that oncegenerated, they are fast and stable, unlikebacktracking algorithms. However, they do not scale well with bounded repetitions.\subsubsection{Problems with Bounded Repetitions}Bounded repetitions, usually written in the form$r^{\{c\}}$ (where $c$ is a constant natural number),denotes a regular expression accepting stringsthat can be divided into $c$ substrings, where each substring is in $r$. 
For the regular expression $(a|b)^*a(a|b)^{\{2\}}$,an $\mathit{NFA}$ describing it would look like:\begin{center}\begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto] \node[state,initial] (q_0) {$q_0$}; \node[state, red] (q_1) [right=of q_0] {$q_1$}; \node[state, red] (q_2) [right=of q_1] {$q_2$}; \node[state, accepting, red](q_3) [right=of q_2] {$q_3$}; \path[->] (q_0) edge node {a} (q_1) edge [loop below] node {a,b} () (q_1) edge node {a,b} (q_2) (q_2) edge node {a,b} (q_3);\end{tikzpicture}\end{center}The red states are "countdown states" which counts down the number of characters needed in addition to the currentstring to make a successful match.For example, state $q_1$ indicates a match that hasgone past the $(a|b)^*$ part of $(a|b)^*a(a|b)^{\{2\}}$,and just consumed the "delimiter" $a$ in the middle, and need to match 2 more iterations of $(a|b)$ to complete.State $q_2$ on the other hand, can be viewed as a stateafter $q_1$ has consumed 1 character, and just waitsfor 1 more character to complete.$q_3$ is the last state, requiring 0 more character and is accepting.Depending on the suffix of theinput string up to the current read location,the states $q_1$ and $q_2$, $q_3$may or maynot be active, independent from each other.A $\mathit{DFA}$ for such an $\mathit{NFA}$ wouldcontain at least $2^3$ non-equivalent states that cannot be merged, because the subset construction during determinisation will generateall the elements in the power set $\mathit{Pow}\{q_1, q_2, q_3\}$.Generalizing this to regular expressions with largerbounded repetitions number, we have thatregexes shaped like $r^*ar^{\{n\}}$ when converted to $\mathit{DFA}$swould require at least $2^{n+1}$ states, if $r$ containsmore than 1 string.This is to represent all different scenarios which "countdown" states are active.For those regexes, tools that uses $\DFA$s will getout of memory errors.\subsubsection{Tools that uses $\mathit{DFA}$s}%TODO:more tools that use DFAs?$\mathit{LEX}$ and 
$\mathit{JFLEX}$ are tools
in $C$ and $\mathit{JAVA}$ that generate $\mathit{DFA}$-based
lexers. The user provides a set of regular expressions
and configurations to such lexer generators, and then gets an output program
encoding a minimized $\mathit{DFA}$
that can be compiled and run.
When given the above countdown regular expression,
a small number $n$ would result in a determinised automaton
with millions of states.
For this reason, regex libraries that support
bounded repetitions often choose to use the $\mathit{NFA}$ approach.
\subsection{Why $\mathit{NFA}$s can be slow in the second phase}
When one constructs an $\NFA$ out of a regular expression
there is often very little to be done in the first phase, one simply
constructs the $\NFA$ states based on the structure of the input regular expression.
In the lexing phase, one can simulate the $\mathit{NFA}$ running in two ways:
one by keeping track of all active states after consuming
a character, and updating that set of states iteratively.
This can be viewed as a breadth-first-search of the $\mathit{NFA}$
for a path terminating
at an accepting state.
Languages like $\mathit{Go}$ and $\mathit{Rust}$ use this
type of $\mathit{NFA}$ simulation and guarantee a linear runtime
in terms of input string length.
%TODO:try out these lexers
The other way to use $\mathit{NFA}$ for matching is choosing
a single transition each time, keeping all the other options in
a queue or stack, and backtracking if that choice eventually fails.
This method, often called a "depth-first-search", is efficient in a lot of cases, but could end upwith exponential run time.\\%TODO:COMPARE java python lexer speed with Rust and GoThe reason behind backtracking algorithms in languages likeJava and Python is that they support back-references.\subsubsection{Back References}If we have a regular expression like this (the sequenceoperator is omitted for brevity):\begin{center} $r_1(r_2(r_3r_4))$\end{center}We could label sub-expressions of interest by parenthesizing them and giving them a number by the order in which their opening parentheses appear.One possible way of parenthesizing and labelling is given below:\begin{center} $\underset{1}{(}r_1\underset{2}{(}r_2\underset{3}{(}r_3)\underset{4}{(}r_4)))$\end{center}$r_1r_2r_3r_4$, $r_1r_2r_3$, $r_3$, $r_4$ are labelledby 1 to 4. $1$ would refer to the entire expression $(r_1(r_2(r_3)(r_4)))$, $2$ referring to $r_2(r_3)(r_4)$, etc.These sub-expressions are called "capturing groups".We can use the following syntax to denote that we want a string just matched by a sub-expression (capturing group) to appear at a certain location again, exactly as it was:\begin{center}$\ldots\underset{\text{i-th lparen}}{(}{r_i})\ldots \underset{s_i \text{ which just matched} \;r_i}{\backslash i}$\end{center}The backslash and number $i$ are used to denote such so-called "back-references".Let $e$ be an expression made of regular expressions and back-references. 
$e$ contains the expression $e_i$
as its $i$-th capturing group.
The semantics of back-references can be recursively
written as:
\begin{center}
	\begin{tabular}{c}
		$L ( e \cdot \backslash i) = \{s @ s_i \mid s \in L (e)\quad s_i \in L(r_i)$\\
		$s_i\; \text{match of ($e$, $s$)'s $i$-th capturing group string}\}$
	\end{tabular}
\end{center}
The concrete example
$((a|b|c|\ldots|z)^*)\backslash 1$
would match strings like $\mathit{bobo}$, $\mathit{weewee}$, etc.\\
Back-references are a construct in the ``regex'' standard
that programmers found useful, but not exactly regular any more.
In fact, they allow the regex construct to express
languages that cannot be contained in context-free
languages either.
For example, the back-reference expression $(a^*)b\backslash 1 b \backslash 1$
expresses the language $\{a^n b a^n b a^n\mid n \in \mathbb{N}\}$,
which cannot be expressed by context-free grammars~\parencite{campeanu2003formal}.
Such a language is contained in the context-sensitive hierarchy
of formal languages.
Solving the back-reference expression matching problem
is NP-complete~\parencite{alfred2014algorithms} and a non-backtracking,
efficient solution is not known to exist.
%TODO:read a bit more about back reference algorithms
It seems that languages like Java and Python made the trade-off
to support back-references at the expense of having to backtrack,
even in the case of regexes not involving back-references.\\
Summing these up, we can categorise existing
practical regex libraries into the ones with linear
time guarantees like Go and Rust, which impose restrictions
on the user input (not allowing back-references,
bounded repetitions cannot exceed 1000 etc.), and ones
that allow the programmer much freedom, but grind to a halt
in some non-negligible portion of cases.
%TODO: give examples such as RE2 GOLANG 1000 restriction, rust no repetitions % For example, the Rust regex engine claims to be linear, % but does not support lookarounds and back-references.% The GoLang regex library does not support over 1000 repetitions. % Java and Python both support back-references, but shows%catastrophic backtracking behaviours on inputs without back-references(%when the language is still regular). %TODO: test performance of Rust on (((((a*a*)b*)b){20})*)c baabaabababaabaaaaaaaaababaaaababababaaaabaaabaaaaaabaabaabababaababaaaaaaaaababaaaababababaaaaaaaaaaaaac %TODO: verify the fact Rust does not allow 1000+ repsSo we have practical implementations on regular expression matching/lexing which are fastbut do not come with any guarantees that it will not grind to a haltor give wrong answers.Our goal is to have a regex lexing algorithm that comes with \begin{itemize}\itemproven correctness \item proven non-catastrophic properties\itemeasy extensions toconstructs like bounded repetitions, negation, lookarounds, and even back-references. \end{itemize}\section{Our Solution--Formal Specification of POSIX and Brzozowski Derivatives}We propose Brzozowski derivatives on regular expressions as a solution to this.In the last fifteen or so years, Brzozowski's derivatives of regularexpressions have sparked quite a bit of interest in the functionalprogramming and theorem prover communities. \subsection{Motivation}Derivatives give a simple solutionto the problem of matching a string $s$ with a regularexpression $r$: if the derivative of $r$ w.r.t.\ (insuccession) all the characters of the string matches the empty string,then $r$ matches $s$ (and {\em vice versa}). The beauty ofBrzozowski's derivatives \parencite{Brzozowski1964} is that they are neatlyexpressible in any functional language, and easily definable andreasoned about in theorem provers---the definitions just consist ofinductive datatypes and simple recursive functions. 
And an algorithm based on it by Sulzmann and Lu \parencite{Sulzmann2014}
allows an easy extension
to include extended regular expressions and
simplification of internal data structures,
eliminating the exponential behaviours.
However, two difficulties with derivative-based matchers exist:
\subsubsection{Problems with Current Brzozowski Matchers}
First, Brzozowski's original matcher only generates a yes/no answer
for whether a regular expression matches a string or not.  This is too
little information in the context of lexing where separate tokens must
be identified and also classified (for example as keywords
or identifiers).  Sulzmann and Lu~\cite{Sulzmann2014} overcome this
difficulty by cleverly extending Brzozowski's matching
algorithm. Their extended version generates additional information on
\emph{how} a regular expression matches a string following the POSIX
rules for regular expression matching. They achieve this by adding a
second ``phase'' to Brzozowski's algorithm involving an injection
function.  In our own earlier work, we provided the formal
specification of what POSIX matching means and proved in Isabelle/HOL
the correctness
of Sulzmann and Lu's extended algorithm accordingly~\cite{AusafDyckhoffUrban2016}.
The second difficulty is that Brzozowski's derivatives can
grow to arbitrarily big sizes.
For example if we start with theregular expression $(a+aa)^*$ and takesuccessive derivatives according to the character $a$, we end up witha sequence of ever-growing derivatives like \def\ll{\stackrel{\_\backslash{} a}{\longrightarrow}}\begin{center}\begin{tabular}{rll}$(a + aa)^*$ & $\ll$ & $(\ONE + \ONE{}a) \cdot (a + aa)^*$\\& $\ll$ & $(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* \;+\; (\ONE + \ONE{}a) \cdot (a + aa)^*$\\& $\ll$ & $(\ZERO + \ZERO{}a + \ZERO) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^* \;+\; $\\& & $\qquad(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^*$\\& $\ll$ & \ldots \hspace{15mm}(regular expressions of sizes 98, 169, 283, 468, 767, \ldots)\end{tabular}\end{center}\noindent where after around 35 steps we run out of memory on atypical computer (we shall define shortly the precise details of ourregular expressions and the derivative operation). Clearly, thenotation involving $\ZERO$s and $\ONE$s already suggestssimplification rules that can be applied to regular regularexpressions, for example $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r\Rightarrow r$, $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrowr$. While such simple-minded simplifications have been proved in ourearlier work to preserve the correctness of Sulzmann and Lu'salgorithm \cite{AusafDyckhoffUrban2016}, they unfortunately do\emph{not} help with limiting the growth of the derivatives shownabove: the growth is slowed, but the derivatives can still grow ratherquickly beyond any finite bound.Sulzmann and Lu overcome this ``growth problem'' in a second algorithm\cite{Sulzmann2014} where they introduce bit-codedregular expressions. In this version, POSIX values arerepresented as bit sequences and such sequences are incrementally generatedwhen derivatives are calculated. 
The compact representationof bit sequences and regular expressions allows them to define a more``aggressive'' simplification method that keeps the size of thederivatives finite no matter what the length of the string is.They make some informal claims about the correctness and linear behaviourof this version, but do not provide any supporting proof arguments, noteven ``pencil-and-paper'' arguments. They write about their bit-coded\emph{incremental parsing method} (that is the algorithm to be formalisedin this dissertation) \begin{quote}\it ``Correctness Claim: We further claim that the incremental parsing method [..] in combination with the simplification steps [..] yields POSIX parse trees. We have tested this claim extensively [..] but yet have to work out all proof details.'' \cite[Page 14]{Sulzmann2014}\end{quote} Ausaf and Urban were able to back this correctness claim witha formal proof.But as they stated, \begin{quote}\itThe next step would be to implement a more aggressive simplification procedure on annotated regular expressions and then prove the corresponding algorithm generates the same values as blexer. Alas due to time constraints we are unable to do so here.\end{quote} This thesis implements the aggressive simplifications envisionedby Ausaf and Urban,and gives a formal proof of the correctness with those simplifications.%----------------------------------------------------------------------------------------\section{Contribution}This work addresses the vulnerability of super-linear andbuggy regex implementations by the combinationof Brzozowski's derivatives and interactive theorem proving. 
We give an improved version of Sulzmann and Lu's bit-coded algorithm using derivatives, which come with a formal guarantee in terms of correctness and running time as an Isabelle/HOL proof.Further improvements to the algorithm with an even stronger version of simplification is made.We have not yet come up with one, but believe that it leads to a formalised proof with a time bound linear to input andcubic to regular expression size using a technique byAntimirov\cite{Antimirov}.The main contribution of this thesis is \begin{itemize}\itema proven correct lexing algorithm\itemwith formalized finite bounds on internal data structures' sizes.\end{itemize}To our best knowledge, no lexing libraries using Brzozowski derivativeshave a provable time guarantee, and claims about running time are usually speculative and backed by thin empiricalevidence.%TODO: give referencesFor example, Sulzmann and Lu had proposed an algorithm in which theyclaim a linear running time.But that was falsified by our experiments and the running time is actually $\Omega(2^n)$ in the worst case.A similar claim about a theoretical runtime of $O(n^2)$ is made for the Verbatim%TODO: give referenceslexer, which calculates POSIX matches and is based on derivatives.They formalized the correctness of the lexer, but not the complexity.In the performance evaluation section, they simply analyzed the run timeof matching $a$ with the string $\underbrace{a \ldots a}_{\text{n a's}}$and concluded that the algorithm is quadratic in terms of input length.When we tried out their extracted OCaml code with our example $(a+aa)^*$,the time it took to lex only 40 $a$'s was 5 minutes.\subsection{Related Work}We are awareof a mechanised correctness proof of Brzozowski's derivative-based matcher in HOL4 byOwens and Slind~\parencite{Owens2008}. Another one in Isabelle/HOL is partof the work by Krauss and Nipkow \parencite{Krauss2011}. 
And another one
in Coq is given by Coquand and Siles \parencite{Coquand2012}.
Also Ribeiro and Du Bois give one in Agda \parencite{RibeiroAgda2017}.
When a regular expression does not behave as intended,
people usually try to rewrite the regex to some equivalent form
or they try to avoid the possibly problematic patterns completely,
for which many false positives exist~\parencite{Davis18}.
Animated tools to ``debug'' regular expressions such as
\parencite{regexploit2021} \parencite{regex101} are also popular.
We are also aware of static analysis work on regular expressions that
aims to detect potentially exponential regex patterns.
Rathnayake and Thielecke \parencite{Rathnayake2014StaticAF} proposed an algorithm
that detects regular expressions triggering exponential
behaviour on backtracking matchers.
Weideman \parencite{Weideman2017Static} came up with
non-linear polynomial worst-time estimates
for regexes, attack strings that exploit the worst-time
scenario, and ``attack automata'' that generate
attack strings.
\section{Structure of the thesis}
In Chapter~\ref{Inj} we will introduce the concepts
and notations we use for describing the lexing algorithm by Sulzmann and Lu,
and then give the lexing algorithm.
We will give its variant in Chapter~\ref{Bitcoded1}.
Then we illustrate in Chapter~\ref{Bitcoded2}
how the algorithm without bitcodes falls short for such aggressive
simplifications and therefore introduce our version of the
bit-coded algorithm and
its correctness proof.
In Chapter~\ref{Finite} we give the second guarantee
of our bitcoded algorithm, that is, a finite bound on the size of
any regex's derivatives.
In Chapter~\ref{Cubic} we discuss stronger simplifications to improve the finite bound
in Chapter~\ref{Finite} to a polynomial one, and demonstrate how one can extend the
algorithm to include constructs such as bounded repetitions and negations.
%----------------------------------------------------------------------------------------
%----------------------------------------------------------------------------------------
%----------------------------------------------------------------------------------------
%----------------------------------------------------------------------------------------