       
% Chapter 1

\chapter{Introduction} % Main chapter title

\label{Introduction} % For referencing the chapter elsewhere, use \ref{Chapter1} 

%----------------------------------------------------------------------------------------

% Define some commands to keep the formatting separated from the content 
\newcommand{\keyword}[1]{\textbf{#1}}
\newcommand{\tabhead}[1]{\textbf{#1}}
\newcommand{\code}[1]{\texttt{#1}}
\newcommand{\file}[1]{\texttt{\bfseries#1}}
\newcommand{\option}[1]{\texttt{\itshape#1}}

%boxes
\newcommand*{\mybox}[1]{\framebox{\strut #1}}

%\newcommand{\sflataux}[1]{\textit{sflat}\_\textit{aux} \, #1}
\newcommand\sflat[1]{\llparenthesis #1 \rrparenthesis }
\newcommand{\ASEQ}[3]{\textit{ASEQ}_{#1} \, #2 \, #3}
\newcommand{\bderssimp}[2]{#1 \backslash_{bsimp} #2}
\newcommand{\rderssimp}[2]{#1 \backslash_{rsimp} #2}
\newcommand{\bders}[2]{#1 \backslash #2}
\newcommand{\bsimp}[1]{\textit{bsimp}(#1)}
\newcommand{\rsimp}[1]{\textit{rsimp}(#1)}
\newcommand{\sflataux}[1]{\llparenthesis #1 \rrparenthesis'}
\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%
\newcommand{\denote}{\stackrel{\mbox{\scriptsize denote}}{=}}%
\newcommand{\ZERO}{\mbox{\bf 0}}
\newcommand{\ONE}{\mbox{\bf 1}}
\newcommand{\AALTS}[2]{\oplus {\scriptstyle #1}\, #2}
\newcommand{\rdistinct}[2]{\textit{distinct} \; \textit{#1} \; #2}
\newcommand\hflat[1]{\llparenthesis  #1 \rrparenthesis_*}
\newcommand\hflataux[1]{\llparenthesis #1 \rrparenthesis_*'}
\newcommand\createdByStar[1]{\textit{createdByStar}(#1)}

\newcommand\myequiv{\mathrel{\stackrel{\makebox[0pt]{\mbox{\normalfont\tiny equiv}}}{=}}}

\def\decode{\textit{decode}}
\def\internalise{\textit{internalise}}
\def\lexer{\mathit{lexer}}
\def\mkeps{\textit{mkeps}}
\newcommand{\rder}[2]{#2 \backslash #1}

\def\AZERO{\textit{AZERO}}
\def\AONE{\textit{AONE}}
\def\ACHAR{\textit{ACHAR}}

\def\POSIX{\textit{POSIX}}
\def\ALTS{\textit{ALTS}}
\def\ASTAR{\textit{ASTAR}}
\def\DFA{\textit{DFA}}
\def\bmkeps{\textit{bmkeps}}
\def\retrieve{\textit{retrieve}}
\def\blexer{\textit{blexer}}
\def\flex{\textit{flex}}
\def\inj{\mathit{inj}}
\def\Empty{\mathit{Empty}}
\def\Left{\mathit{Left}}
\def\Right{\mathit{Right}}
\def\Stars{\mathit{Stars}}
\def\Char{\mathit{Char}}
\def\Seq{\mathit{Seq}}
\def\Der{\textit{Der}}
\def\Ders{\textit{Ders}}
\def\nullable{\mathit{nullable}}
\def\Z{\mathit{Z}}
\def\S{\mathit{S}}
\def\rup{r^\uparrow}
%\def\bderssimp{\mathit{bders}\_\mathit{simp}}
\def\distinctWith{\textit{distinctWith}}
\def\lf{\textit{lf}}
\def\PD{\textit{PD}}
\def\suffix{\textit{Suffix}}


\def\size{\mathit{size}}
\def\rexp{\mathbf{rexp}}
\def\simp{\mathit{simp}}
\def\simpALTs{\mathit{simp}\_\mathit{ALTs}}
\def\map{\textit{map}}
\def\distinct{\mathit{distinct}}
\def\blexersimp{\mathit{blexer}\_\mathit{simp}}
%\def\vsuf{\textit{vsuf}}
%\def\sflataux{\textit{sflat}\_\textit{aux}}
\def\rrexp{\textit{rrexp}}
\newcommand\rnullable[1]{\textit{rnullable}(#1)}
\newcommand\rsize[1]{\llbracket #1 \rrbracket_r}
\newcommand\asize[1]{\llbracket #1 \rrbracket}
\newcommand\rerase[1]{ (#1)\downarrow_r}


\def\erase{\textit{erase}}
\def\STAR{\textit{STAR}}
\def\flts{\textit{flts}}


\def\RZERO{\mathbf{0}_r }
\def\RONE{\mathbf{1}_r}
\newcommand\RCHAR[1]{\mathbf{#1}_r}
\newcommand\RSEQ[2]{#1 \cdot #2}
\newcommand\RALTS[1]{\oplus #1}
\newcommand\RSTAR[1]{#1^*}
\newcommand\vsuf[2]{\textit{vsuf} \;#1\;#2}
       

%----------------------------------------------------------------------------------------
%This part is about regular expressions, Brzozowski derivatives,
%and a bit-coded lexing algorithm with proven correctness and time bounds.

%TODO: look up snort rules to use here--give readers idea of what regexes look like

       
Regular expressions are widely used in computer science: 
be it in text editors\parencite{atomEditor} with syntax highlighting and auto-completion, 
command-line tools like $\mathit{grep}$ that facilitate easy 
text processing, network intrusion
detection systems that reject suspicious traffic, or compiler
front ends---the majority of the solutions to these tasks 
involve lexing with regular 
expressions.
Given their usefulness and ubiquity, one would imagine that
modern regular expression matching implementations
are mature and fully studied.
Indeed, if one supplies the regex engine of a popular programming
language with a regular expression and a string, one usually
gets rich matching information back in a very short time.
Some network intrusion detection systems
use regex engines that are able to process 
megabytes or even gigabytes of data per second\parencite{Turo_ov__2020}.
Unfortunately, this is not the case for \emph{all} inputs.
%TODO: get source for SNORT/BRO's regex matching engine/speed


Take $(a^*)^*\,b$ and ask whether
strings of the form $aa\ldots a$ match this regular
expression. Obviously this is not the case---the expected $b$ in the last
position is missing. One would expect that modern regular expression
matching engines can find this out very quickly. Alas, if one tries
this example in JavaScript, Python or Java 8, even with strings of a small
length, say around 30 $a$'s, one discovers that 
this decision takes an inordinate amount of time, given the simplicity of the problem.
This is clearly exponential behaviour, and 
is triggered by some relatively simple regex patterns, as the graphs
below show:
       

\begin{figure}
\centering
\begin{tabular}{@{}c@{\hspace{0mm}}c@{\hspace{0mm}}c@{}}
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    ylabel={time in secs},
    enlargelimits=false,
    xtick={0,5,...,30},
    xmax=33,
    ymax=35,
    ytick={0,5,...,30},
    scaled ticks=false,
    axis lines=left,
    width=5cm,
    height=4cm, 
    legend entries={JavaScript},  
    legend pos=north west,
    legend cell align=left]
\addplot[red,mark=*, mark options={fill=white}] table {re-js.data};
\end{axis}
\end{tikzpicture}
  &
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    %ylabel={time in secs},
    enlargelimits=false,
    xtick={0,5,...,30},
    xmax=33,
    ymax=35,
    ytick={0,5,...,30},
    scaled ticks=false,
    axis lines=left,
    width=5cm,
    height=4cm, 
    legend entries={Python},  
    legend pos=north west,
    legend cell align=left]
\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
\end{axis}
\end{tikzpicture}
  &
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    %ylabel={time in secs},
    enlargelimits=false,
    xtick={0,5,...,30},
    xmax=33,
    ymax=35,
    ytick={0,5,...,30},
    scaled ticks=false,
    axis lines=left,
    width=5cm,
    height=4cm, 
    legend entries={Java 8},  
    legend pos=north west,
    legend cell align=left]
\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
\end{axis}
\end{tikzpicture}\\
\multicolumn{3}{c}{Graphs: Runtime for matching $(a^*)^*\,b$ with strings 
           of the form $\underbrace{aa\ldots a}_{n}$.}
\end{tabular}    
\caption{aStarStarb} \label{fig:aStarStarb}
\end{figure}
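
\noindent
This blowup is easy to reproduce on the JVM. The following Scala sketch
(a rough benchmark only; Scala shares the backtracking
\texttt{java.util.regex} engine with Java, and the loop bounds are
merely illustrative) measures how long rejecting strings of $a$'s
against $(a^*)^*\,b$ takes:
\begin{verbatim}
import java.util.regex.Pattern

object AStarStarB {
  def main(args: Array[String]): Unit = {
    val p = Pattern.compile("(a*)*b")    // the regex from the graphs above
    for (n <- 5 to 30 by 5) {
      val s = "a" * n                    // aa...a, the expected b is missing
      val t0 = System.nanoTime()
      val res = p.matcher(s).matches()   // triggers exponential backtracking
      val secs = (System.nanoTime() - t0) / 1e9
      println(f"n = $n%2d  matched = $res  time = $secs%.3f s")
    }
  }
}
\end{verbatim}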
       


This superlinear blowup in matching algorithms sometimes causes
considerable grief in real life: for example on 20 July 2016 one evil
regular expression brought the webpage
\href{http://stackexchange.com}{Stack Exchange} to its
knees.\footnote{\url{https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}}
In this instance, a regular expression intended to just trim white
spaces from the beginning and the end of a line actually consumed
massive amounts of CPU resources---causing web servers to grind to a
halt. This happened when a post with 20,000 white spaces was submitted,
but importantly the white spaces were neither at the beginning nor at
the end. As a result, the regular expression matching engine needed to
backtrack over many choices. In this example, the time needed to process
the string was $O(n^2)$ with respect to the string length. This
quadratic overhead was enough for the homepage of Stack Exchange to
respond so slowly that the load balancer assumed a $\mathit{DoS}$ 
attack and therefore stopped the servers from responding to any
requests. This made the whole site become unavailable. 
A more recent example is a global outage of all Cloudflare servers on 2 July
2019. A poorly written regular expression exhibited exponential
behaviour and exhausted CPUs that serve HTTP traffic. Although the outage
had several causes, at the heart was a regular expression that
was used to monitor network
traffic.\footnote{\url{https://blog.cloudflare.com/details-of-the-cloudflare-outage-on-july-2-2019/}}
%TODO: data points for some new versions of languages
These problems with regular expressions 
are not isolated events that happen
very occasionally, but are actually widespread.
They occur so often that they have been given a 
name: Regular-expression Denial-of-Service (ReDoS)
attacks.
Davis et al.~\parencite{Davis18} detected more
than 1000 super-linear (SL) regular expressions
in Node.js, Python core libraries, and npm and pypi. 
They therefore concluded that evil regular expressions
are more than ``a parlour trick'': they are a problem that
requires more research attention.
       

\section{The Problem Behind Slow Cases}

%find literature/find out for yourself that REGEX->DFA on basic regexes
%does not blow up the size
Shouldn't regular expression matching be linear?
How can one explain the super-linear behaviour of the 
regex matching engines we have?
The time cost of regex matching algorithms in general
involves two phases: 
a construction phase, in which the algorithm builds a  
suitable data structure from the input regex $r$ (we denote
this time cost by $P_1(r)$), and
a lexing
phase, in which the input string $s$ is read and the data structure
representing the regex $r$ is operated on (we denote the time
this takes by $P_2(r, s)$).\\
In the case of a $\mathit{DFA}$,
we have $P_2(r, s) = O(|s|)$,
because we take at most $|s|$ steps, 
and each step takes
at most one transition---
a deterministic finite automaton
by definition has at most one active state and takes at most one
transition upon receiving an input symbol.
But unfortunately in the worst case
$P_1(r) = O(2^{|r|})$. An example will be given later. \\
For $\mathit{NFA}$s, we have $P_1(r) = O(|r|)$ if we do not unfold 
expressions like $r^n$ into $\underbrace{r \cdots r}_{\text{$n$ copies of $r$}}$.
The cost $P_2(r, s)$ is bounded by $|r|\cdot|s|$ if we do not backtrack.
On the other hand, if backtracking is used, the worst-case time bound bloats
to $|r| \cdot 2^{|s|}$.
%on the input
%And when calculating the time complexity of the matching algorithm,
%we are assuming that each input reading step requires constant time.
%which translates to that the number of 
%states active and transitions taken each time is bounded by a
%constant $C$.
%But modern  regex libraries in popular language engines
% often want to support much richer constructs than just
% sequences and Kleene stars,
%such as negation, intersection, 
%bounded repetitions and back-references.
%And de-sugaring these "extended" regular expressions 
%into basic ones might bloat the size exponentially.
%TODO: more reference for exponential size blowup on desugaring. 
       
\subsection{Tools that use $\mathit{DFA}$s}
%TODO:more tools that use DFAs?
$\mathit{LEX}$ and $\mathit{JFLEX}$ are tools
in $C$ and $\mathit{JAVA}$ that generate $\mathit{DFA}$-based
lexers. The user provides a set of regular expressions
and configurations to such a lexer generator, and then 
gets an output program encoding a minimized $\mathit{DFA}$
that can be compiled and run. 
The good thing about $\mathit{DFA}$s is that once
generated, they are fast and stable, unlike
backtracking algorithms. 
However, they do not scale well with bounded repetitions.\\
       



Bounded repetitions, usually written in the form
$r^{\{c\}}$ (where $c$ is a constant natural number),
denote regular expressions accepting strings
that can be divided into $c$ substrings, where each 
substring is in $r$. 
For the regular expression $(a|b)^*a(a|b)^{\{2\}}$,
an $\mathit{NFA}$ describing it would look like:
       
\begin{center}
\begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto] 
   \node[state,initial] (q_0)   {$q_0$}; 
   \node[state, red] (q_1) [right=of q_0] {$q_1$}; 
   \node[state, red] (q_2) [right=of q_1] {$q_2$}; 
   \node[state, accepting, red](q_3) [right=of q_2] {$q_3$};
    \path[->] 
    (q_0) edge  node {a} (q_1)
    	  edge [loop below] node {a,b} ()
    (q_1) edge  node  {a,b} (q_2)
    (q_2) edge  node  {a,b} (q_3);
\end{tikzpicture}
\end{center}
       
The red states are ``countdown states'' which count down 
the number of characters needed in addition to the current
string to make a successful match.
For example, state $q_1$ indicates a match that has
gone past the $(a|b)^*$ part of $(a|b)^*a(a|b)^{\{2\}}$,
and has just consumed the ``delimiter'' $a$ in the middle; it 
needs to match two more iterations of $(a|b)$ to complete.
State $q_2$, on the other hand, can be viewed as a state
after $q_1$ has consumed one character, and it just waits
for one more character to complete.
$q_3$ is the last state, requiring no more characters, and is accepting.
Depending on the suffix of the
input string read so far,
the states $q_1$, $q_2$ and $q_3$
may or may
not be active, independently of each other.
A $\mathit{DFA}$ for such an $\mathit{NFA}$ would
contain at least $2^3$ non-equivalent states that cannot be merged, 
because the subset construction during determinisation will generate
all the elements in the power set $\mathit{Pow}(\{q_1, q_2, q_3\})$.
Generalizing this to regular expressions with larger
bounded repetition numbers, we have that
regexes shaped like $r^*ar^{\{n\}}$, when converted to $\mathit{DFA}$s,
would require at least $2^{n+1}$ states if $r$ contains
more than one string.
This is to represent all the different 
scenarios in which ``countdown'' states are active.
For those regexes, tools such as $\mathit{JFLEX}$ 
would generate gigantic $\mathit{DFA}$s or
run out of memory.
For this reason, regex libraries that support 
bounded repetitions often choose to use the $\mathit{NFA}$ 
approach.
       
\subsection{The $\mathit{NFA}$ approach to regex matching}
One can simulate an $\mathit{NFA}$ run in two ways:
the first is by keeping track of all active states after consuming 
a character, and updating that set of states iteratively.
This can be viewed as a breadth-first search of the $\mathit{NFA}$
for a path terminating
at an accepting state.
Languages like $\mathit{Go}$ and $\mathit{Rust}$ use this
type of $\mathit{NFA}$ simulation, and guarantee a linear runtime
in terms of input string length.
%TODO:try out these lexers
The other way to use an $\mathit{NFA}$ for matching is to choose  
a single transition each time, keeping all the other options in 
a queue or stack, and backtracking if that choice eventually 
fails. This method, often called a ``depth-first search'', 
is efficient in many cases, but can end up
with exponential run time.\\
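
\noindent
To make the first, breadth-first style of simulation concrete, here is a
minimal Scala sketch (the representation of an $\mathit{NFA}$ by a start
set, a transition function and an acceptance predicate is purely
illustrative):
\begin{verbatim}
// set-of-states simulation: after each character we keep the
// *set* of all states the NFA could currently be in
case class NFA[Q](starts: Set[Q],
                  delta: (Q, Char) => Set[Q],
                  accepts: Q => Boolean) {
  def matches(s: String): Boolean = {
    val finals = s.foldLeft(starts) {
      (qs, c) => qs.flatMap(q => delta(q, c))
    }
    finals.exists(accepts)
  }
}
\end{verbatim}
Each input character touches every active state at most once, which is
where the $|r|\cdot|s|$ bound mentioned above comes from (ignoring the
cost of the set operations).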
       
%TODO:COMPARE java python lexer speed with Rust and Go
The reason for using backtracking algorithms in languages like
Java and Python is that they support back-references.
       
\subsection{Back-References in Regexes---the Non-Regular Part}
If we have a regular expression like this (the sequence
operator is omitted for brevity):
\begin{center}
	$r_1(r_2(r_3r_4))$
\end{center}
we could label sub-expressions of interest 
by parenthesizing them and giving 
them a number in the order in which their opening parentheses appear.
One possible way of parenthesizing and labelling is given below:
\begin{center}
	$\underset{1}{(}r_1\underset{2}{(}r_2\underset{3}{(}r_3)\underset{4}{(}r_4)))$
\end{center}
Here $r_1r_2r_3r_4$, $r_2r_3r_4$, $r_3$ and $r_4$ are labelled
by 1 to 4: $1$ refers to the entire expression 
$(r_1(r_2(r_3)(r_4)))$, $2$ refers to $r_2(r_3)(r_4)$, etc.
These sub-expressions are called ``capturing groups''.
We can use the following syntax to denote that we want a string just matched by a 
sub-expression (capturing group) to appear at a certain location again, 
exactly as it was:
\begin{center}
$\ldots\underset{\text{$i$-th lparen}}{(}{r_i})\ldots 
\underset{s_i \text{ which just matched} \;r_i}{\backslash i}$
\end{center}
The backslash and number $i$ are used to denote such 
so-called ``back-references''.
Let $e$ be an expression made of regular expressions 
and back-references, where $e$ contains the expression $e_i$
as its $i$-th capturing group.
The semantics of back-references can be recursively
written as:
\begin{center}
	\begin{tabular}{c}
		$L(e \cdot \backslash i) = \{s \,@\, s_i \mid s \in L(e),\; s_i \in L(e_i),$\\
		$s_i\; \text{is the string matched by the $i$-th capturing group of $e$ when matching $s$}\}$
	\end{tabular}
\end{center}
       
As a concrete example,
$((a|b|c|\ldots|z)^*)\backslash 1$
would match strings like $\mathit{bobo}$ and $\mathit{weewee}$.\\
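
\noindent
On the JVM one can observe this behaviour directly with a few lines of
Scala (a sketch; the character class \texttt{[a-z]} abbreviates
$(a|b|\ldots|z)$):
\begin{verbatim}
val squares = "([a-z]*)\\1"        // group 1 must be repeated verbatim
assert("bobo".matches(squares))    // "bo"  ++ "bo"
assert("weewee".matches(squares))  // "wee" ++ "wee"
assert(!"bob".matches(squares))    // cannot be split into w ++ w
\end{verbatim}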
       
Back-references are a construct in the ``regex'' standard
that programmers find useful, but they are not exactly 
regular any more.
In fact, they allow the regex constructs to express 
languages that cannot be contained in the context-free
languages either.
For example, the back-reference expression $(a^*)b\backslash 1 b \backslash 1$
expresses the language $\{a^n b a^n b a^n\mid n \in \mathbb{N}\}$,
which cannot be expressed by context-free grammars\parencite{campeanu2003formal}.
Such a language is contained in the context-sensitive hierarchy
of formal languages. 
Solving the matching problem for back-reference expressions
is NP-complete\parencite{alfred2014algorithms}, and a non-backtracking,
efficient solution is not known to exist.
%TODO:read a bit more about back reference algorithms
It seems that languages like Java and Python made the trade-off
to support back-references at the expense of having to backtrack,
even in the case of regexes not involving back-references.\\
Summing these up, we can categorise existing 
practical regex libraries into ones with linear
time guarantees, like Go and Rust, which impose restrictions
on the user input (not allowing back-references, 
bounded repetitions cannot exceed 1000, etc.), and ones  
that allow the programmer much freedom, but grind to a halt
in some non-negligible portion of cases.
%TODO: give examples such as RE2 GOLANG 1000 restriction, rust no repetitions 
% For example, the Rust regex engine claims to be linear, 
% but does not support lookarounds and back-references.
% The GoLang regex library does not support over 1000 repetitions.  
% Java and Python both support back-references, but shows
%catastrophic backtracking behaviours on inputs without back-references(
%when the language is still regular).
%TODO: test performance of Rust on (((((a*a*)b*)b){20})*)c  baabaabababaabaaaaaaaaababaaaababababaaaabaaabaaaaaabaabaabababaababaaaaaaaaababaaaababababaaaaaaaaaaaaac
%TODO: verify the fact Rust does not allow 1000+ reps
%TODO: Java 17 updated graphs? Is it ok to still use Java 8 graphs?
       
\section{Buggy Regex Engines} 


Another problem with these libraries is that they
offer no correctness guarantee.
In some cases, they either fail to generate a lexing result when there exists a match,
or give a wrong way of matching.


It turns out that regex libraries not only suffer from 
exponential backtracking problems, 
but also produce undesired (or even buggy) outputs.
%TODO: comment from who
Kuklewicz\parencite{KuklewiczHaskell} commented that most regex libraries are not
correctly implementing the POSIX (maximum-munch)
rule of regular expression matching.
This experience is echoed by the author's
tryout of a few online regex testers.
A concrete example is 
the regex
\begin{verbatim}
(((((a*a*)b*)b){20})*)c
\end{verbatim}
and the string
\begin{verbatim}
baabaabababaabaaaaaaaaababaa
aababababaaaabaaabaaaaaabaab
aabababaababaaaaaaaaababaaaa
babababaaaaaaaaaaaaac
\end{verbatim}

This seemingly complex regex simply says ``some $a$'s
followed by some $b$'s then followed by one single $b$,
with this iterated 20 times, and finally followed by a $c$''.
A POSIX match would involve the entire string, ``eating up''
all the $b$'s in it.
%TODO: give a coloured example of how this matches POSIXly

This regex triggers catastrophic backtracking in 
languages like Python and Java,
whereas it gives a non-POSIX and uninformative 
match in languages like Go or .NET---a match consisting of only the
character $c$.

As Grathwohl\parencite{grathwohl2014crash} commented,
\begin{center}
	``The POSIX strategy is more complicated than the greedy because of the dependence on information about the length of matched strings in the various subexpressions.''
\end{center}
       

%\section{How people solve problems with regexes}


When a regular expression does not behave as intended,
people usually try to rewrite the regex to some equivalent form,
or they try to avoid the possibly problematic patterns completely,
for which many false positives exist\parencite{Davis18}.
Animated tools to ``debug'' regular expressions
are also popular: regexploit\parencite{regexploit2021} and regex101\parencite{regex101}, 
to name a few.
We are also aware of static analysis work on regular expressions that
aims to detect potentially exponential regex patterns. Rathnayake and Thielecke 
\parencite{Rathnayake2014StaticAF} proposed an algorithm
that detects regular expressions triggering exponential
behaviour on backtracking matchers.
Weideman \parencite{Weideman2017Static} came up with 
non-linear polynomial worst-time estimates
for regexes, attack strings that exploit the worst-time 
scenario, and ``attack automata'' that generate
attack strings.
%Arguably these methods limits the programmers' freedom
%or productivity when all they want is to come up with a regex
%that solves the text processing problem.

%TODO:also the regex101 debugger
       
\section{Our Solution---Formal Specification of POSIX Matching and Brzozowski Derivatives}
Is it possible to have a regex lexing algorithm with proven correctness and 
time complexity, which allows easy extensions to
constructs like 
bounded repetitions, negation, lookarounds, and even back-references? 

We propose Brzozowski derivatives on regular expressions as
a solution to this.

In the last fifteen or so years, Brzozowski's derivatives of regular
expressions have sparked quite a bit of interest in the functional
programming and theorem prover communities.  The beauty of
Brzozowski's derivatives \parencite{Brzozowski1964} is that they are neatly
expressible in any functional language, and easily definable and
reasoned about in theorem provers---the definitions just consist of
inductive datatypes and simple recursive functions. 
Moreover, an algorithm based on them by 
Sulzmann and Lu \parencite{Sulzmann2014} allows easy extension
to include extended regular expressions and 
simplification of internal data structures, 
eliminating the exponential behaviours.
       
\section{Motivation}

Derivatives give a simple solution
to the problem of matching a string $s$ with a regular
expression $r$: if the derivative of $r$ w.r.t.\ (in
succession) all the characters of the string matches the empty string,
then $r$ matches $s$ (and {\em vice versa}).  
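
\noindent
A complete matcher in this style fits into a few lines of Scala. The
sketch below contains just the textbook definitions of nullability and
the derivative (not yet the bitcoded algorithm studied in this thesis):
\begin{verbatim}
abstract class Rexp
case object ZERO extends Rexp                    // matches nothing
case object ONE extends Rexp                     // matches the empty string
case class CHAR(c: Char) extends Rexp
case class ALT(r1: Rexp, r2: Rexp) extends Rexp
case class SEQ(r1: Rexp, r2: Rexp) extends Rexp
case class STAR(r: Rexp) extends Rexp

// can r match the empty string?
def nullable(r: Rexp): Boolean = r match {
  case ZERO => false
  case ONE => true
  case CHAR(_) => false
  case ALT(r1, r2) => nullable(r1) || nullable(r2)
  case SEQ(r1, r2) => nullable(r1) && nullable(r2)
  case STAR(_) => true
}

// the Brzozowski derivative of r with respect to c
def der(c: Char, r: Rexp): Rexp = r match {
  case ZERO => ZERO
  case ONE => ZERO
  case CHAR(d) => if (c == d) ONE else ZERO
  case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))
  case SEQ(r1, r2) =>
    if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
    else SEQ(der(c, r1), r2)
  case STAR(r1) => SEQ(der(c, r1), STAR(r1))
}

def matches(r: Rexp, s: String): Boolean =
  nullable(s.foldLeft(r)((r1, c) => der(c, r1)))
\end{verbatim}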
       



However, two difficulties with derivative-based matchers exist:
First, Brzozowski's original matcher only generates a yes/no answer
for whether a regular expression matches a string or not.  This is too
little information in the context of lexing where separate tokens must
be identified and also classified (for example as keywords
or identifiers).  Sulzmann and Lu~\cite{Sulzmann2014} overcome this
difficulty by cleverly extending Brzozowski's matching
algorithm. Their extended version generates additional information on
\emph{how} a regular expression matches a string following the POSIX
rules for regular expression matching. They achieve this by adding a
second ``phase'' to Brzozowski's algorithm involving an injection
function.  In our own earlier work we provided the formal
specification of what POSIX matching means and proved in Isabelle/HOL
the correctness
of Sulzmann and Lu's extended algorithm accordingly
\cite{AusafDyckhoffUrban2016}.

       
The second difficulty is that Brzozowski's derivatives can 
grow to arbitrarily big sizes. For example if we start with the
regular expression $(a+aa)^*$ and take
successive derivatives according to the character $a$, we end up with
a sequence of ever-growing derivatives like 

\def\ll{\stackrel{\_\backslash{} a}{\longrightarrow}}
\begin{center}
\begin{tabular}{rll}
$(a + aa)^*$ & $\ll$ & $(\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & $(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* \;+\; (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & $(\ZERO + \ZERO{}a + \ZERO) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^* \;+\; $\\
& & $\qquad(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & \ldots \hspace{15mm}(regular expressions of sizes 98, 169, 283, 468, 767, \ldots)
\end{tabular}
\end{center}
 
       
\noindent where after around 35 steps we run out of memory on a
typical computer (we shall define shortly the precise details of our
regular expressions and the derivative operation).  Clearly, the
notation involving $\ZERO$s and $\ONE$s already suggests
simplification rules that can be applied to regular
expressions, for example $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r
\Rightarrow r$, $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow
r$. While such simple-minded simplifications have been proved in our
earlier work to preserve the correctness of Sulzmann and Lu's
algorithm \cite{AusafDyckhoffUrban2016}, they unfortunately do
\emph{not} help with limiting the growth of the derivatives shown
above: the growth is slowed, but the derivatives can still grow rather
quickly beyond any finite bound.
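
\noindent
For illustration, these rules (together with their symmetric variants)
can be folded into the Scala sketch from above by simplifying after
every derivative step:
\begin{verbatim}
// bottom-up application of the simplification rules above
def simp(r: Rexp): Rexp = r match {
  case SEQ(r1, r2) => (simp(r1), simp(r2)) match {
    case (ZERO, _)  => ZERO                 // 0 . r => 0
    case (_, ZERO)  => ZERO                 // r . 0 => 0
    case (ONE, s2)  => s2                   // 1 . r => r
    case (s1, ONE)  => s1                   // r . 1 => r
    case (s1, s2)   => SEQ(s1, s2)
  }
  case ALT(r1, r2) => (simp(r1), simp(r2)) match {
    case (ZERO, s2) => s2                   // 0 + r => r
    case (s1, ZERO) => s1                   // r + 0 => r
    case (s1, s2)   => if (s1 == s2) s1 else ALT(s1, s2)  // r + r => r
  }
  case _ => r
}

def matchesSimp(r: Rexp, s: String): Boolean =
  nullable(s.foldLeft(r)((r1, c) => simp(der(c, r1))))
\end{verbatim}
Running this on $(a+aa)^*$ confirms the point: duplicate summands that
end up nested inside different alternatives, such as $r_1 + (r_2 + r_1)$,
escape the $r + r \Rightarrow r$ rule, so the derivatives still grow
beyond any finite bound.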
       


Sulzmann and Lu overcome this ``growth problem'' in a second algorithm
\cite{Sulzmann2014} where they introduce bitcoded
regular expressions. In this version, POSIX values are
represented as bitsequences and such sequences are incrementally generated
when derivatives are calculated. The compact representation
of bitsequences and regular expressions allows them to define a more
``aggressive'' simplification method that keeps the size of the
derivatives finite no matter what the length of the string is.
They make some informal claims about the correctness and linear behaviour
of this version, but do not provide any supporting proof arguments, not
even ``pencil-and-paper'' arguments. They write about their bitcoded
\emph{incremental parsing method} (that is the algorithm to be formalised
in this thesis):


  \begin{quote}\it
  ``Correctness Claim: We further claim that the incremental parsing
  method [..] in combination with the simplification steps [..]
  yields POSIX parse trees. We have tested this claim
  extensively [..] but yet
  have to work out all proof details.'' \cite[Page 14]{Sulzmann2014}
\end{quote}  
       

Ausaf and Urban were able to back this correctness claim with
a formal proof.

However, as they stated,
  \begin{quote}\it
The next step would be to implement a more aggressive simplification procedure on annotated regular expressions and then prove the corresponding algorithm generates the same values as blexer. Alas due to time constraints we are unable to do so here.
\end{quote}  

This thesis implements the aggressive simplifications envisioned
by Ausaf and Urban,
and gives a formal proof of the correctness of the algorithm with those simplifications.
       




%----------------------------------------------------------------------------------------

\section{Contribution}



This work addresses the vulnerability of super-linear and
buggy regex implementations by the combination
of Brzozowski's derivatives and interactive theorem proving. 
We give an 
improved version of Sulzmann and Lu's bit-coded algorithm using 
derivatives, which comes with a formal guarantee in terms of correctness and 
running time as an Isabelle/HOL proof.
Then we improve the algorithm with an even stronger version of 
simplification, and prove a time bound linear in the input size and
cubic in the regular expression size, using a technique by
Antimirov.

 
The main contribution of this thesis is a proven correct lexing algorithm
with formalized time bounds.
To the best of our knowledge, no lexing libraries using Brzozowski derivatives
have a provable time guarantee, 
and claims about running time are usually speculative and backed by thin empirical
evidence.
%TODO: give references
For example, Sulzmann and Lu had proposed an algorithm in which they
claim a linear running time.
But that claim was falsified by our experiments: the running time 
is actually $\Omega(2^n)$ in the worst case.
A similar claim about a theoretical runtime of $O(n^2)$ is made for the Verbatim
%TODO: give references
lexer, which calculates POSIX matches and is based on derivatives.
They formalized the correctness of the lexer, but not the complexity.
In the performance evaluation section, they simply analyzed the run time
of matching $a$ with the string $\underbrace{a \ldots a}_{\text{$n$ $a$'s}}$
and concluded that the algorithm is quadratic in terms of input length.
When we tried out their extracted OCaml code with our example $(a+aa)^*$,
the time it took to lex only 40 $a$'s was 5 minutes.

We believe our result---a proof of performance on general
inputs rather than specific examples---is a novel contribution.\\
       


\subsection{Related Work}
We are aware
of a mechanised correctness proof of Brzozowski's derivative-based matcher in HOL4 by
Owens and Slind~\parencite{Owens2008}. Another one in Isabelle/HOL is part
of the work by Krauss and Nipkow~\parencite{Krauss2011}. And another one
in Coq is given by Coquand and Siles~\parencite{Coquand2012}.
Also Ribeiro and Du Bois give one in Agda~\parencite{RibeiroAgda2017}.
 
%We propose Brzozowski's derivatives as a solution to this problem.
% about Lexing Using Brzozowski derivatives
       


\section{Structure of the thesis}
In Chapter~\ref{Chapter2} we will introduce the concepts
and notations we 
use for describing the lexing algorithm by Sulzmann and Lu,
and then give the algorithm and its variant, and discuss
why more aggressive simplifications are needed. 
Then we illustrate in Chapter~\ref{Chapter3}
how the algorithm without bitcodes falls short for such aggressive 
simplifications, and therefore introduce our version of the
bitcoded algorithm and 
its correctness proof.  
In Chapter~\ref{Chapter4} we give the second guarantee
of our bitcoded algorithm, that is a finite bound on the size of any 
regex's derivatives.
In Chapter~\ref{Chapter5} we discuss stronger simplifications to improve the finite bound
in Chapter~\ref{Chapter4} to a polynomial one, and demonstrate how one can extend the
algorithm to include constructs such as bounded repetitions and negations.
 

       
%----------------------------------------------------------------------------------------


%----------------------------------------------------------------------------------------

%----------------------------------------------------------------------------------------

%----------------------------------------------------------------------------------------