ChengsongTanPhdThesis/Chapters/Chapter1.tex
changeset 565 0497408a3598
parent 564 3cbcd7cda0a9
child 566 94604a5fd271
equal deleted inserted replaced
564:3cbcd7cda0a9 565:0497408a3598
     1 % Chapter 1
       
     2 
       
     3 \chapter{Introduction} % Main chapter title
       
     4 
       
     5 \label{Chapter1} % For referencing the chapter elsewhere, use \ref{Chapter1} 
       
     6 
       
     7 %----------------------------------------------------------------------------------------
       
     8 
       
     9 % Define some commands to keep the formatting separated from the content 
       
    10 \newcommand{\keyword}[1]{\textbf{#1}}
       
    11 \newcommand{\tabhead}[1]{\textbf{#1}}
       
    12 \newcommand{\code}[1]{\texttt{#1}}
       
    13 \newcommand{\file}[1]{\texttt{\bfseries#1}}
       
    14 \newcommand{\option}[1]{\texttt{\itshape#1}}
       
    15 
       
    16 %\newcommand{\sflataux}[1]{\textit{sflat}\_\textit{aux} \, #1}
       
    17 \newcommand\sflat[1]{\llparenthesis #1 \rrparenthesis }
       
    18 \newcommand{\ASEQ}[3]{\textit{ASEQ}_{#1} \, #2 \, #3}
       
    19 \newcommand{\bderssimp}[2]{#1 \backslash_{bsimp} #2}
       
    20 \newcommand{\rderssimp}[2]{#1 \backslash_{rsimp} #2}
       
    21 \newcommand{\bders}[2]{#1 \backslash #2}
       
    22 \newcommand{\bsimp}[1]{\textit{bsimp}(#1)}
       
    23 \newcommand{\rsimp}[1]{\textit{rsimp}(#1)}
       
    24 \newcommand{\sflataux}[1]{\llparenthesis #1 \rrparenthesis'}
       
    25 \newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%
       
    26 \newcommand{\denote}{\stackrel{\mbox{\scriptsize denote}}{=}}%
       
    27 \newcommand{\ZERO}{\mbox{\bf 0}}
       
    28 \newcommand{\ONE}{\mbox{\bf 1}}
       
    29 \newcommand{\AALTS}[2]{\oplus {\scriptstyle #1}\, #2}
       
    30 \newcommand{\rdistinct}[2]{\textit{distinct} \; \textit{#1} \; #2}
       
    31 \newcommand\hflat[1]{\llparenthesis  #1 \rrparenthesis_*}
       
    32 \newcommand\hflataux[1]{\llparenthesis #1 \rrparenthesis_*'}
       
    33 \newcommand\createdByStar[1]{\textit{createdByStar}(#1)}
       
    34 
       
    35 \newcommand\myequiv{\mathrel{\stackrel{\makebox[0pt]{\mbox{\normalfont\tiny equiv}}}{=}}}
       
    36 
       
    37 \def\decode{\textit{decode}}
       
    38 \def\internalise{\textit{internalise}}
       
    39 \def\lexer{\mathit{lexer}}
       
    40 \def\mkeps{\textit{mkeps}}
       
    41 \newcommand{\rder}[2]{#2 \backslash #1}
       
    42 
       
    43 \def\AZERO{\textit{AZERO}}
       
    44 \def\AONE{\textit{AONE}}
       
    45 \def\ACHAR{\textit{ACHAR}}
       
    46 
       
    47 \def\POSIX{\textit{POSIX}}
       
    48 \def\ALTS{\textit{ALTS}}
       
    49 \def\ASTAR{\textit{ASTAR}}
       
    50 \def\DFA{\textit{DFA}}
       
    51 \def\bmkeps{\textit{bmkeps}}
       
    52 \def\retrieve{\textit{retrieve}}
       
    53 \def\blexer{\textit{blexer}}
       
    54 \def\flex{\textit{flex}}
       
    55 \def\inj{\mathit{inj}}
       
    56 \def\Empty{\mathit{Empty}}
       
    57 \def\Left{\mathit{Left}}
       
    58 \def\Right{\mathit{Right}}
       
    59 \def\Stars{\mathit{Stars}}
       
    60 \def\Char{\mathit{Char}}
       
    61 \def\Seq{\mathit{Seq}}
       
    62 \def\Der{\textit{Der}}
       
    63 \def\Ders{\textit{Ders}}
       
    64 \def\nullable{\mathit{nullable}}
       
    65 \def\Z{\mathit{Z}}
       
    66 \def\S{\mathit{S}}
       
    67 \def\rup{r^\uparrow}
       
    68 %\def\bderssimp{\mathit{bders}\_\mathit{simp}}
       
    69 \def\distinctWith{\textit{distinctWith}}
       
    70 \def\lf{\textit{lf}}
       
    71 \def\PD{\textit{PD}}
       
    72 \def\suffix{\textit{Suffix}}
       
    73 
       
    74 
       
    75 \def\size{\mathit{size}}
       
    76 \def\rexp{\mathbf{rexp}}
       
    77 \def\simp{\mathit{simp}}
       
    78 \def\simpALTs{\mathit{simp}\_\mathit{ALTs}}
       
    79 \def\map{\mathit{map}}
       
    80 \def\distinct{\mathit{distinct}}
       
    81 \def\blexersimp{\mathit{blexer}\_\mathit{simp}}
       
    82 \def\map{\textit{map}}
       
    83 %\def\vsuf{\textit{vsuf}}
       
    84 %\def\sflataux{\textit{sflat}\_\textit{aux}}
       
    85 \def\rrexp{\textit{rrexp}}
       
    86 \newcommand\rnullable[1]{\textit{rnullable}(#1)}
       
    87 \newcommand\rsize[1]{\llbracket #1 \rrbracket_r}
       
    88 \newcommand\asize[1]{\llbracket #1 \rrbracket}
       
    89 \newcommand\rerase[1]{ (#1)\downarrow_r}
       
    90 
       
    91 
       
    92 \def\erase{\textit{erase}}
       
    93 \def\STAR{\textit{STAR}}
       
    94 \def\flts{\textit{flts}}
       
    95 
       
    96 
       
    97 \def\RZERO{\mathbf{0}_r }
       
    98 \def\RONE{\mathbf{1}_r}
       
    99 \newcommand\RCHAR[1]{\mathbf{#1}_r}
       
   100 \newcommand\RSEQ[2]{#1 \cdot #2}
       
   101 \newcommand\RALTS[1]{\oplus #1}
       
   102 \newcommand\RSTAR[1]{#1^*}
       
   103 \newcommand\vsuf[2]{\textit{vsuf} \;#1\;#2}
       
   104 
       
   105 %----------------------------------------------------------------------------------------
       
   106 %This part is about regular expressions, Brzozowski derivatives,
       
   107 %and a bit-coded lexing algorithm with proven correctness and time bounds.
       
   108 
       
   109 %TODO: look up snort rules to use here--give readers idea of what regexes look like
       
   110 
       
   111 
       
   112 Regular expressions are widely used in computer science: 
       
   113 be it in text-editors\parencite{atomEditor} with syntax highlighting and auto-completion, 
       
   114 command-line tools like $\mathit{grep}$ that facilitate easy 
       
   115 text processing, network intrusion
       
   116 detection systems that reject suspicious traffic, or compiler
       
    117 front ends---the majority of the solutions to these tasks 
       
   118 involve lexing with regular 
       
   119 expressions.
       
   120 Given its usefulness and ubiquity, one would imagine that
       
   121 modern regular expression matching implementations
       
   122 are mature and fully studied.
       
    123 Indeed, in a popular programming language's regex engine, 
       
   124 supplying it with regular expressions and strings, one can
       
   125 get rich matching information in a very short time.
       
   126 Some network intrusion detection systems
       
   127 use regex engines that are able to process 
       
   128 megabytes or even gigabytes of data per second\parencite{Turo_ov__2020}.
       
    129 Unfortunately, this is not the case for \emph{all} inputs.
       
   130 %TODO: get source for SNORT/BRO's regex matching engine/speed
       
   131 
       
   132 
       
   133 Take $(a^*)^*\,b$ and ask whether
       
   134 strings of the form $aa..a$ match this regular
       
   135 expression. Obviously this is not the case---the expected $b$ in the last
       
   136 position is missing. One would expect that modern regular expression
       
   137 matching engines can find this out very quickly. Alas, if one tries
       
   138 this example in JavaScript, Python or Java 8, even with strings of a small
       
   139 length, say around 30 $a$'s, one discovers that 
       
    140 this decision takes an unreasonably long time to finish given the simplicity of the problem.
       
   141 This is clearly exponential behaviour, and 
       
   142 is triggered by some relatively simple regex patterns, as the graphs
       
   143 below show:
       
   144 
       
   145 \begin{figure}
       
   146 \centering
       
   147 \begin{tabular}{@{}c@{\hspace{0mm}}c@{\hspace{0mm}}c@{}}
       
   148 \begin{tikzpicture}
       
   149 \begin{axis}[
       
   150     xlabel={$n$},
       
   151     x label style={at={(1.05,-0.05)}},
       
   152     ylabel={time in secs},
       
   153     enlargelimits=false,
       
   154     xtick={0,5,...,30},
       
   155     xmax=33,
       
   156     ymax=35,
       
   157     ytick={0,5,...,30},
       
   158     scaled ticks=false,
       
   159     axis lines=left,
       
   160     width=5cm,
       
   161     height=4cm, 
       
   162     legend entries={JavaScript},  
       
   163     legend pos=north west,
       
   164     legend cell align=left]
       
   165 \addplot[red,mark=*, mark options={fill=white}] table {re-js.data};
       
   166 \end{axis}
       
   167 \end{tikzpicture}
       
   168   &
       
   169 \begin{tikzpicture}
       
   170 \begin{axis}[
       
   171     xlabel={$n$},
       
   172     x label style={at={(1.05,-0.05)}},
       
   173     %ylabel={time in secs},
       
   174     enlargelimits=false,
       
   175     xtick={0,5,...,30},
       
   176     xmax=33,
       
   177     ymax=35,
       
   178     ytick={0,5,...,30},
       
   179     scaled ticks=false,
       
   180     axis lines=left,
       
   181     width=5cm,
       
   182     height=4cm, 
       
   183     legend entries={Python},  
       
   184     legend pos=north west,
       
   185     legend cell align=left]
       
   186 \addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
       
   187 \end{axis}
       
   188 \end{tikzpicture}
       
   189   &
       
   190 \begin{tikzpicture}
       
   191 \begin{axis}[
       
   192     xlabel={$n$},
       
   193     x label style={at={(1.05,-0.05)}},
       
   194     %ylabel={time in secs},
       
   195     enlargelimits=false,
       
   196     xtick={0,5,...,30},
       
   197     xmax=33,
       
   198     ymax=35,
       
   199     ytick={0,5,...,30},
       
   200     scaled ticks=false,
       
   201     axis lines=left,
       
   202     width=5cm,
       
   203     height=4cm, 
       
   204     legend entries={Java 8},  
       
   205     legend pos=north west,
       
   206     legend cell align=left]
       
   207 \addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
       
   208 \end{axis}
       
   209 \end{tikzpicture}\\
       
   210 \multicolumn{3}{c}{Graphs: Runtime for matching $(a^*)^*\,b$ with strings 
       
   211            of the form $\underbrace{aa..a}_{n}$.}
       
   212 \end{tabular}    
       
   213 \caption{aStarStarb} \label{fig:aStarStarb}
       
   214 \end{figure}
       
   215 
       
   216 
       
   217 
       
   218 
       
    219 This superlinear blowup in matching algorithms sometimes causes
       
   220 considerable grief in real life: for example on 20 July 2016 one evil
       
   221 regular expression brought the webpage
       
   222 \href{http://stackexchange.com}{Stack Exchange} to its
       
   223 knees.\footnote{\url{https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}}
       
   224 In this instance, a regular expression intended to just trim white
       
   225 spaces from the beginning and the end of a line actually consumed
       
   226 massive amounts of CPU-resources---causing web servers to grind to a
       
   227 halt. This happened when a post with 20,000 white spaces was submitted,
       
   228 but importantly the white spaces were neither at the beginning nor at
       
   229 the end. As a result, the regular expression matching engine needed to
       
   230 backtrack over many choices. In this example, the time needed to process
       
   231 the string was $O(n^2)$ with respect to the string length. This
       
   232 quadratic overhead was enough for the homepage of Stack Exchange to
       
   233 respond so slowly that the load balancer assumed a $\mathit{DoS}$ 
       
   234 attack and therefore stopped the servers from responding to any
       
   235 requests. This made the whole site become unavailable. 
       
   236 A more recent example is a global outage of all Cloudflare servers on 2 July
       
   237 2019. A poorly written regular expression exhibited exponential
       
   238 behaviour and exhausted CPUs that serve HTTP traffic. Although the outage
       
   239 had several causes, at the heart was a regular expression that
       
   240 was used to monitor network
       
   241 traffic.\footnote{\url{https://blog.cloudflare.com/details-of-the-cloudflare-outage-on-july-2-2019/}}
       
   242 %TODO: data points for some new versions of languages
       
   243 These problems with regular expressions 
       
   244 are not isolated events that happen
       
   245 very occasionally, but actually widespread.
       
   246 They occur so often that they get a 
       
    247 name---Regular-Expression-Denial-Of-Service (ReDoS)
       
   248 attack.
       
   249 Davis et al. \parencite{Davis18} detected more
       
   250 than 1000 super-linear (SL) regular expressions
       
   251 in Node.js, Python core libraries, and npm and pypi. 
       
   252 They therefore concluded that evil regular expressions
       
    253 are more than ``a parlour trick'', but a problem that
       
   254 requires
       
   255 more research attention.
       
   256 
       
   257  \section{The Problem Behind Slow Cases}
       
   258 
       
   259 %find literature/find out for yourself that REGEX->DFA on basic regexes
       
   260 %does not blow up the size
       
   261 Shouldn't regular expression matching be linear?
       
   262 How can one explain the super-linear behaviour of the 
       
   263 regex matching engines we have?
       
   264 The time cost of regex matching algorithms in general
       
   265 involve two phases: 
       
   266 the construction phase, in which the algorithm builds some  
       
   267 suitable data structure from the input regex $r$, we denote
       
   268 the time cost by $P_1(r)$.
       
   269 The lexing
       
   270 phase, when the input string $s$ is read and the data structure
       
   271 representing that regex $r$ is being operated on. We represent the time
       
   272 it takes by $P_2(r, s)$.\\
       
   273 In the case of a $\mathit{DFA}$,
       
   274 we have $P_2(r, s) = O( |s| )$,
       
   275 because we take at most $|s|$ steps, 
       
   276 and each step takes
       
    277 at most one transition---
        
    278 a deterministic finite automaton
       
   279 by definition has at most one state active and at most one
       
   280 transition upon receiving an input symbol.
       
   281 But unfortunately in the  worst case
       
    282 $P_1(r) = O(2^{|r|})$. An example will be given later. \\
       
   283 For $\mathit{NFA}$s, we have $P_1(r) = O(|r|)$ if we do not unfold 
       
   284 expressions like $r^n$ into $\underbrace{r \cdots r}_{\text{n copies of r}}$.
       
   285 The $P_2(r, s)$ is bounded by $|r|\cdot|s|$, if we do not backtrack.
       
   286 On the other hand, if backtracking is used, the worst-case time bound bloats
       
    287 to $|r| \cdot 2^{|s|}$.
       
   288 %on the input
       
   289 %And when calculating the time complexity of the matching algorithm,
       
   290 %we are assuming that each input reading step requires constant time.
       
   291 %which translates to that the number of 
       
   292 %states active and transitions taken each time is bounded by a
       
   293 %constant $C$.
       
   294 %But modern  regex libraries in popular language engines
       
   295 % often want to support much richer constructs than just
       
   296 % sequences and Kleene stars,
       
   297 %such as negation, intersection, 
       
   298 %bounded repetitions and back-references.
       
   299 %And de-sugaring these "extended" regular expressions 
       
   300 %into basic ones might bloat the size exponentially.
       
   301 %TODO: more reference for exponential size blowup on desugaring. 
       
    302 \subsection{Tools that use $\mathit{DFA}$s}
       
   303 %TODO:more tools that use DFAs?
       
   304 $\mathit{LEX}$ and $\mathit{JFLEX}$ are tools
       
    305 in $C$ and $\mathit{JAVA}$ that generate $\mathit{DFA}$-based
       
   306 lexers. The user provides a set of regular expressions
       
   307 and configurations to such lexer generators, and then 
       
   308 gets an output program encoding a minimized $\mathit{DFA}$
       
   309 that can be compiled and run. 
       
    310 The good thing about $\mathit{DFA}$s is that once
       
   311 generated, they are fast and stable, unlike
       
   312 backtracking algorithms. 
       
   313 However, they do not scale well with bounded repetitions.\\
       
   314 
       
   315 
       
   316 
       
   317 
       
   318 Bounded repetitions, usually written in the form
       
   319 $r^{\{c\}}$ (where $c$ is a constant natural number),
       
   320 denotes a regular expression accepting strings
       
   321 that can be divided into $c$ substrings, where each 
       
   322 substring is in $r$. 
       
   323 For the regular expression $(a|b)^*a(a|b)^{\{2\}}$,
       
   324 an $\mathit{NFA}$ describing it would look like:
       
   325 \begin{center}
       
   326 \begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto] 
       
   327    \node[state,initial] (q_0)   {$q_0$}; 
       
   328    \node[state, red] (q_1) [right=of q_0] {$q_1$}; 
       
   329    \node[state, red] (q_2) [right=of q_1] {$q_2$}; 
       
   330    \node[state, accepting, red](q_3) [right=of q_2] {$q_3$};
       
   331     \path[->] 
       
   332     (q_0) edge  node {a} (q_1)
       
   333     	  edge [loop below] node {a,b} ()
       
   334     (q_1) edge  node  {a,b} (q_2)
       
   335     (q_2) edge  node  {a,b} (q_3);
       
   336 \end{tikzpicture}
       
   337 \end{center}
       
    338 The red states are ``countdown states'' which count down 
       
   339 the number of characters needed in addition to the current
       
   340 string to make a successful match.
       
   341 For example, state $q_1$ indicates a match that has
       
   342 gone past the $(a|b)^*$ part of $(a|b)^*a(a|b)^{\{2\}}$,
       
   343 and just consumed the "delimiter" $a$ in the middle, and 
       
    344 needs to match 2 more iterations of $(a|b)$ to complete.
       
   345 State $q_2$ on the other hand, can be viewed as a state
       
   346 after $q_1$ has consumed 1 character, and just waits
       
   347 for 1 more character to complete.
       
    348 $q_3$ is the last state, requiring 0 more characters and is accepting.
       
   349 Depending on the suffix of the
       
   350 input string up to the current read location,
       
   351 the states $q_1$ and $q_2$, $q_3$
       
   352 may or may
       
   353 not be active, independent from each other.
       
   354 A $\mathit{DFA}$ for such an $\mathit{NFA}$ would
       
   355 contain at least $2^3$ non-equivalent states that cannot be merged, 
       
   356 because the subset construction during determinisation will generate
       
   357 all the elements in the power set $\mathit{Pow}\{q_1, q_2, q_3\}$.
       
   358 Generalizing this to regular expressions with larger
       
   359 bounded repetitions number, we have that
       
   360 regexes shaped like $r^*ar^{\{n\}}$ when converted to $\mathit{DFA}$s
       
   361 would require at least $2^{n+1}$ states, if $r$ contains
       
   362 more than 1 string.
       
   363 This is to represent all different 
       
    364 scenarios in which ``countdown'' states are active.
       
   365 For those regexes, tools such as $\mathit{JFLEX}$ 
       
   366 would generate gigantic $\mathit{DFA}$'s or
       
   367 out of memory errors.
       
   368 For this reason, regex libraries that support 
       
   369 bounded repetitions often choose to use the $\mathit{NFA}$ 
       
   370 approach.
       
   371 \subsection{The $\mathit{NFA}$ approach to regex matching}
       
   372 One can simulate the $\mathit{NFA}$ running in two ways:
       
   373 one by keeping track of all active states after consuming 
       
   374 a character, and update that set of states iteratively.
       
   375 This can be viewed as a breadth-first-search of the $\mathit{NFA}$
       
   376 for a path terminating
       
   377 at an accepting state.
       
   378 Languages like $\mathit{Go}$ and $\mathit{Rust}$ use this
       
   379 type of $\mathit{NFA}$ simulation, and guarantees a linear runtime
       
   380 in terms of input string length.
       
   381 %TODO:try out these lexers
       
   382 The other way to use $\mathit{NFA}$ for matching is choosing  
       
   383 a single transition each time, keeping all the other options in 
       
   384 a queue or stack, and backtracking if that choice eventually 
       
   385 fails. This method, often called a  "depth-first-search", 
       
   386 is efficient in a lot of cases, but could end up
       
   387 with exponential run time.\\
       
   388 %TODO:COMPARE java python lexer speed with Rust and Go
       
   389 The reason behind backtracking algorithms in languages like
       
   390 Java and Python is that they support back-references.
       
   391 \subsection{Back References in Regex--Non-Regular part}
       
   392 If we have a regular expression like this (the sequence
       
   393 operator is omitted for brevity):
       
   394 \begin{center}
       
   395 	$r_1(r_2(r_3r_4))$
       
   396 \end{center}
       
   397 We could label sub-expressions of interest 
       
   398 by parenthesizing them and giving 
       
   399 them a number by the order in which their opening parentheses appear.
       
   400 One possible way of parenthesizing and labelling is given below:
       
   401 \begin{center}
       
   402 	$\underset{1}{(}r_1\underset{2}{(}r_2\underset{3}{(}r_3)\underset{4}{(}r_4)))$
       
   403 \end{center}
       
   404 $r_1r_2r_3r_4$, $r_1r_2r_3$, $r_3$, $r_4$ are labelled
       
   405 by 1 to 4. $1$ would refer to the entire expression 
       
   406 $(r_1(r_2(r_3)(r_4)))$, $2$ referring to $r_2(r_3)(r_4)$, etc.
       
   407 These sub-expressions are called "capturing groups".
       
   408 We can use the following syntax to denote that we want a string just matched by a 
       
   409 sub-expression (capturing group) to appear at a certain location again, 
       
   410 exactly as it was:
       
   411 \begin{center}
       
   412 $\ldots\underset{\text{i-th lparen}}{(}{r_i})\ldots 
       
   413 \underset{s_i \text{ which just matched} \;r_i}{\backslash i}$
       
   414 \end{center}
       
   415 The backslash and number $i$ are used to denote such 
       
   416 so-called "back-references".
       
   417 Let $e$ be an expression made of regular expressions 
       
   418 and back-references. $e$ contains the expression $e_i$
       
   419 as its $i$-th capturing group.
       
   420 The semantics of back-reference can be recursively
       
   421 written as:
       
   422 \begin{center}
       
   423 	\begin{tabular}{c}
       
    424 		$L ( e \cdot \backslash i) = \{s @ s_i \mid s \in L (e)\quad s_i \in L(e_i)$\\
       
   425 		$s_i\; \text{match of ($e$, $s$)'s $i$-th capturing group string}\}$
       
   426 	\end{tabular}
       
   427 \end{center}
       
   428 The concrete example
       
   429 $((a|b|c|\ldots|z)^*)\backslash 1$
       
    430 would match strings like $\mathit{bobo}$, $\mathit{weewee}$, etc.\\
       
   431 Back-reference is a construct in the "regex" standard
       
   432 that programmers found useful, but not exactly 
       
   433 regular any more.
       
   434 In fact, that allows the regex construct to express 
       
   435 languages that cannot be contained in context-free
       
   436 languages either.
       
    437 For example, the back-referenced regex $(a^*)b\backslash 1 b \backslash 1$
       
   438 expresses the language $\{a^n b a^n b a^n\mid n \in \mathbb{N}\}$,
       
   439 which cannot be expressed by context-free grammars\parencite{campeanu2003formal}.
       
   440 Such a language is contained in the context-sensitive hierarchy
       
   441 of formal languages. 
       
   442 Solving the back-reference expressions matching problem
       
    443 is NP-complete\parencite{alfred2014algorithms} and a non-backtracking,
       
   444 efficient solution is not known to exist.
       
   445 %TODO:read a bit more about back reference algorithms
       
   446 It seems that languages like Java and Python made the trade-off
       
   447 to support back-references at the expense of having to backtrack,
       
   448 even in the case of regexes not involving back-references.\\
       
   449 Summing these up, we can categorise existing 
       
   450 practical regex libraries into the ones  with  linear
       
   451 time guarantees like Go and Rust, which impose restrictions
       
   452 on the user input (not allowing back-references, 
       
    453 bounded repetitions cannot exceed 1000 etc.), and ones  
       
    454  that allow the programmer much freedom, but grind to a halt
       
   455  in some non-negligible portion of cases.
       
   456  %TODO: give examples such as RE2 GOLANG 1000 restriction, rust no repetitions 
       
   457 % For example, the Rust regex engine claims to be linear, 
       
   458 % but does not support lookarounds and back-references.
       
   459 % The GoLang regex library does not support over 1000 repetitions.  
       
   460 % Java and Python both support back-references, but shows
       
   461 %catastrophic backtracking behaviours on inputs without back-references(
       
   462 %when the language is still regular).
       
   463  %TODO: test performance of Rust on (((((a*a*)b*)b){20})*)c  baabaabababaabaaaaaaaaababaaaababababaaaabaaabaaaaaabaabaabababaababaaaaaaaaababaaaababababaaaaaaaaaaaaac
       
   464  %TODO: verify the fact Rust does not allow 1000+ reps
       
   465  %TODO: Java 17 updated graphs? Is it ok to still use Java 8 graphs?
       
   466 \section{Buggy Regex Engines} 
       
   467 
       
   468 
       
   469  Another thing about these libraries is that there
       
   470  is no correctness guarantee.
       
   471  In some cases, they either fail to generate a lexing result when there exists a match,
       
    472  or produce an incorrect match.
       
   473  
       
   474 
       
   475 It turns out that regex libraries not only suffer from 
       
   476 exponential backtracking problems, 
       
   477 but also undesired (or even buggy) outputs.
       
   478 %TODO: comment from who
       
   479 Kuklewicz\parencite{KuklewiczHaskell} commented that most regex libraries are not
       
   480 correctly implementing the POSIX (maximum-munch)
       
   481 rule of regular expression matching.
       
   482 This experience is echoed by the writer's
       
   483 tryout of a few online regex testers:
       
   484 A concrete example would be 
       
   485 the regex
       
   486 \begin{verbatim}
       
   487 (((((a*a*)b*)b){20})*)c
       
   488 \end{verbatim}
       
   489 and the string
       
   490 \begin{verbatim}
       
   491 baabaabababaabaaaaaaaaababaa
       
   492 aababababaaaabaaabaaaaaabaab
       
   493 aabababaababaaaaaaaaababaaaa
       
   494 babababaaaaaaaaaaaaac
       
   495 \end{verbatim}
       
   496 
       
    497 This seemingly complex regex simply says ``some $a$'s
        
    498 followed by some $b$'s then followed by 1 single $b$,
        
    499 and this iterates 20 times, finally followed by a $c$''.
        
    500 And a POSIX match would involve the entire string, ``eating up''
       
   501 all the $b$'s in it.
       
   502 %TODO: give a coloured example of how this matches POSIXly
       
   503 
       
   504 This regex would trigger catastrophic backtracking in 
       
   505 languages like Python and Java,
       
   506 whereas it gives a non-POSIX  and uninformative 
       
    507 match in languages like Go or .NET---the match with only 
       
   508 character $c$.
       
   509 
       
   510 As Grathwohl\parencite{grathwohl2014crash} commented,
       
   511 \begin{center}
       
   512 	``The POSIX strategy is more complicated than the greedy because of the dependence on information about the length of matched strings in the various subexpressions.''
       
   513 \end{center}
       
   514 
       
   515 %\section{How people solve problems with regexes}
       
   516 
       
   517 
       
   518 When a regular expression does not behave as intended,
       
   519 people usually try to rewrite the regex to some equivalent form
       
   520 or they try to avoid the possibly problematic patterns completely,
       
   521 for which many false positives exist\parencite{Davis18}.
       
   522 Animated tools to "debug" regular expressions
       
   523 are also popular, regexploit\parencite{regexploit2021}, regex101\parencite{regex101} 
       
   524 to name a few.
       
   525 We are also aware of static analysis work on regular expressions that
       
    526 aims to detect potentially exponential regex patterns. Rathnayake and Thielecke 
       
   527 \parencite{Rathnayake2014StaticAF} proposed an algorithm
       
   528 that detects regular expressions triggering exponential
       
    529 behaviours on backtracking matchers.
       
   530 Weideman \parencite{Weideman2017Static} came up with 
       
   531 non-linear polynomial worst-time estimates
       
    532 for regexes, attack strings that exploit the worst-time 
       
    533 scenario, and ``attack automata'' that generate
       
   534 attack strings.
       
   535 %Arguably these methods limits the programmers' freedom
       
   536 %or productivity when all they want is to come up with a regex
       
   537 %that solves the text processing problem.
       
   538 
       
   539 %TODO:also the regex101 debugger
       
   540 \section{Our Solution--Formal Specification of POSIX and Brzozowski Derivatives}
       
   541  Is it possible to have a regex lexing algorithm with proven correctness and 
       
   542  time complexity, which allows easy extensions to
       
   543   constructs like 
       
   544  bounded repetitions, negation,  lookarounds, and even back-references? 
       
   545   
       
   546   We propose Brzozowski derivatives on regular expressions as
       
   547   a solution to this.
       
   548   
       
   549   In the last fifteen or so years, Brzozowski's derivatives of regular
       
   550 expressions have sparked quite a bit of interest in the functional
       
   551 programming and theorem prover communities.  The beauty of
       
   552 Brzozowski's derivatives \parencite{Brzozowski1964} is that they are neatly
       
   553 expressible in any functional language, and easily definable and
       
   554 reasoned about in theorem provers---the definitions just consist of
       
   555 inductive datatypes and simple recursive functions. 
       
    556 And an algorithm based on it by 
        
    557 Sulzmann and Lu \parencite{Sulzmann2014} allows easy extension
       
   558 to include  extended regular expressions and 
       
   559  simplification of internal data structures 
       
   560  eliminating the exponential behaviours.
       
   561  
       
   562 
       
   563   \section{Motivation}
       
   564   
       
   565 Derivatives give a simple solution
       
   566 to the problem of matching a string $s$ with a regular
       
   567 expression $r$: if the derivative of $r$ w.r.t.\ (in
       
   568 succession) all the characters of the string matches the empty string,
       
   569 then $r$ matches $s$ (and {\em vice versa}).  
       
   570 
       
   571 
       
   572 
       
   573 However, two difficulties with derivative-based matchers exist:
       
   574 First, Brzozowski's original matcher only generates a yes/no answer
       
   575 for whether a regular expression matches a string or not.  This is too
       
   576 little information in the context of lexing where separate tokens must
       
   577 be identified and also classified (for example as keywords
       
   578 or identifiers).  Sulzmann and Lu~\cite{Sulzmann2014} overcome this
       
   579 difficulty by cleverly extending Brzozowski's matching
       
   580 algorithm. Their extended version generates additional information on
       
   581 \emph{how} a regular expression matches a string following the POSIX
       
   582 rules for regular expression matching. They achieve this by adding a
       
   583 second ``phase'' to Brzozowski's algorithm involving an injection
       
   584 function.  In our own earlier work we provided the formal
       
   585 specification of what POSIX matching means and proved in Isabelle/HOL
       
   586 the correctness
       
   587 of Sulzmann and Lu's extended algorithm accordingly
       
   588 \cite{AusafDyckhoffUrban2016}.
       
   589 
       
   590 The second difficulty is that Brzozowski's derivatives can 
       
   591 grow to arbitrarily big sizes. For example if we start with the
       
   592 regular expression $(a+aa)^*$ and take
       
   593 successive derivatives according to the character $a$, we end up with
       
   594 a sequence of ever-growing derivatives like 
       
   595 
       
   596 \def\ll{\stackrel{\_\backslash{} a}{\longrightarrow}}
       
   597 \begin{center}
       
   598 \begin{tabular}{rll}
       
   599 $(a + aa)^*$ & $\ll$ & $(\ONE + \ONE{}a) \cdot (a + aa)^*$\\
       
   600 & $\ll$ & $(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* \;+\; (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
       
   601 & $\ll$ & $(\ZERO + \ZERO{}a + \ZERO) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^* \;+\; $\\
       
   602 & & $\qquad(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
       
   603 & $\ll$ & \ldots \hspace{15mm}(regular expressions of sizes 98, 169, 283, 468, 767, \ldots)
       
   604 \end{tabular}
       
   605 \end{center}
       
   606  
       
   607 \noindent where after around 35 steps we run out of memory on a
       
   608 typical computer (we shall define shortly the precise details of our
       
   609 regular expressions and the derivative operation).  Clearly, the
       
   610 notation involving $\ZERO$s and $\ONE$s already suggests
       
   611 simplification rules that can be applied to regular
       
   612 expressions, for example $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r
       
   613 \Rightarrow r$, $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow
       
   614 r$. While such simple-minded simplifications have been proved in our
       
   615 earlier work to preserve the correctness of Sulzmann and Lu's
       
   616 algorithm \cite{AusafDyckhoffUrban2016}, they unfortunately do
       
   617 \emph{not} help with limiting the growth of the derivatives shown
       
   618 above: the growth is slowed, but the derivatives can still grow rather
       
   619 quickly beyond any finite bound.
       
   620 
       
   621 
       
   622 Sulzmann and Lu overcome this ``growth problem'' in a second algorithm
       
   623 \cite{Sulzmann2014} where they introduce bitcoded
       
   624 regular expressions. In this version, POSIX values are
       
   625 represented as bitsequences and such sequences are incrementally generated
       
   626 when derivatives are calculated. The compact representation
       
   627 of bitsequences and regular expressions allows them to define a more
       
   628 ``aggressive'' simplification method that keeps the size of the
       
   629 derivatives finite no matter what the length of the string is.
       
   630 They make some informal claims about the correctness and linear behaviour
       
   631 of this version, but do not provide any supporting proof arguments, not
       
   632 even ``pencil-and-paper'' arguments. They write about their bitcoded
       
   633 \emph{incremental parsing method} (that is the algorithm to be formalised
       
   634 in this paper):
       
   635 
       
   636 
       
   637   
       
   638   \begin{quote}\it
       
   639   ``Correctness Claim: We further claim that the incremental parsing
       
   640   method [..] in combination with the simplification steps [..]
       
   641   yields POSIX parse trees. We have tested this claim
       
   642   extensively [..] but yet
       
   643   have to work out all proof details.'' \cite[Page 14]{Sulzmann2014}
       
   644 \end{quote}  
       
   645 
       
   646 Ausaf and Urban were able to back this correctness claim with
       
   647 a formal proof.
       
   648 
       
   649 But as they stated,
       
   650   \begin{quote}\it
       
   651 The next step would be to implement a more aggressive simplification procedure on annotated regular expressions and then prove the corresponding algorithm generates the same values as blexer. Alas due to time constraints we are unable to do so here.
       
   652 \end{quote}  
       
   653 
       
   654 This thesis implements the aggressive simplifications envisioned
       
   655 by Ausaf and Urban,
       
   656 and gives a formal proof of the correctness with those simplifications.
       
   657 
       
   658 
       
   659  
       
   660 
       
   661 
       
   662 
       
   663 %----------------------------------------------------------------------------------------
       
   664 
       
   665 \section{Contribution}
       
   666 
       
   667 
       
   668 
       
   669 This work addresses the vulnerability of super-linear and
       
   670 buggy regex implementations by the combination
       
   671 of Brzozowski's derivatives and interactive theorem proving. 
       
   672 We give an 
       
   673 improved version of  Sulzmann and Lu's bit-coded algorithm using 
       
   674 derivatives, which come with a formal guarantee in terms of correctness and 
       
   675 running time as an Isabelle/HOL proof.
       
   676 Then we improve the algorithm with an even stronger version of 
       
   677 simplification, and prove a time bound linear in the input size and

   678 cubic in the regular expression size using a technique by
       
   679 Antimirov.
       
   680 
       
   681  
       
   682 The main contribution of this thesis is a proven correct lexing algorithm
       
   683 with formalized time bounds.
       
   684 To the best of our knowledge, no lexing libraries using Brzozowski derivatives
       
   685 have a provable time guarantee, 
       
   686 and claims about running time are usually speculative and backed by thin empirical
       
   687 evidence.
       
   688 %TODO: give references
       
   689 For example, Sulzmann and Lu had proposed an algorithm  in which they
       
   690 claim a linear running time.
       
   691 But that was falsified by our experiments and the running time 
       
   692 is actually $\Omega(2^n)$ in the worst case.
       
   693 A similar claim about a theoretical runtime of $O(n^2)$ is made for the Verbatim
       
   694 %TODO: give references
       
   695 lexer, which calculates POSIX matches and is based on derivatives.
       
   696 They formalized the correctness of the lexer, but not the complexity.
       
   697 In the performance evaluation section, they simply analyzed the run time
       
   698 of matching $a$ with the string $\underbrace{a \ldots a}_{\text{n a's}}$
       
   699 and concluded that the algorithm is quadratic in terms of input length.
       
   700 When we tried out their extracted OCaml code with our example $(a+aa)^*$,
       
   701 the time it took to lex only 40 $a$'s was 5 minutes.
       
   702 
       
   703 We  believe our results of a proof of performance on general
       
   704 inputs rather than specific examples are a novel contribution.
       
   705 
       
   706 
       
   707 \subsection{Related Work}
       
   708 We are aware
       
   709 of a mechanised correctness proof of Brzozowski's derivative-based matcher in HOL4 by
       
   710 Owens and Slind~\parencite{Owens2008}. Another one in Isabelle/HOL is part
       
   711 of the work by Krauss and Nipkow \parencite{Krauss2011}.  And another one
       
   712 in Coq is given by Coquand and Siles \parencite{Coquand2012}.
       
   713 Also Ribeiro and Du Bois give one in Agda \parencite{RibeiroAgda2017}.
       
   714  
       
   715  %We propose Brzozowski's derivatives as a solution to this problem.
       
   716 % about Lexing Using Brzozowski derivatives
       
   717 
       
   718 
       
   719 \section{Structure of the thesis}
       
   720 In Chapter~\ref{Chapter2} we will introduce the concepts
       
   721 and notations we 
       
   722 use for describing the lexing algorithm by Sulzmann and Lu,
       
   723 and then give the algorithm and its variant, and discuss
       
   724 why more aggressive simplifications are needed. 
       
   725 Then we illustrate in Chapter~\ref{Chapter3}
       
   726 how the algorithm without bitcodes falls short for such aggressive 
       
   727 simplifications and therefore introduce our version of the
       
   728  bitcoded algorithm and 
       
   729 its correctness proof.
       
   730 In Chapter~\ref{Chapter4} we give the second guarantee
       
   731 of our bitcoded algorithm, that is a finite bound on the size of any 
       
   732 regex's derivatives.
       
   733 In Chapter~\ref{Chapter5} we discuss stronger simplifications to improve the finite bound
       
   734 in Chapter 4 to a polynomial one, and demonstrate how one can extend the
       
   735 algorithm to include constructs such as bounded repetitions and negations.
       
   736  
       
   737 
       
   738 
       
   739 
       
   740 
       
   741 %----------------------------------------------------------------------------------------
       
   742 
       
   743 
       
   744 %----------------------------------------------------------------------------------------
       
   745 
       
   746 %----------------------------------------------------------------------------------------
       
   747 
       
   748 %----------------------------------------------------------------------------------------
       
   749 
       
   750