ChengsongTanPhdThesis/Chapters/Chapter1.tex
author Chengsong
Fri, 24 Jun 2022 21:49:23 +0100
changeset 553 0f00d440f484
parent 530 823d9b19d21c
permissions -rwxr-xr-x
more changes
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
     1
% Chapter 1
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
     2
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
     3
\chapter{Introduction} % Main chapter title
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
     4
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
     5
\label{Chapter1} % For referencing the chapter elsewhere, use \ref{Chapter1} 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
     6
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
     7
%----------------------------------------------------------------------------------------
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
     8
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
     9
% Define some commands to keep the formatting separated from the content 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    10
\newcommand{\keyword}[1]{\textbf{#1}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    11
\newcommand{\tabhead}[1]{\textbf{#1}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    12
\newcommand{\code}[1]{\texttt{#1}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    13
\newcommand{\file}[1]{\texttt{\bfseries#1}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    14
\newcommand{\option}[1]{\texttt{\itshape#1}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    15
503
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    16
%\newcommand{\sflataux}[1]{\textit{sflat}\_\textit{aux} \, #1}
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    17
\newcommand\sflat[1]{\llparenthesis #1 \rrparenthesis }
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    18
\newcommand{\ASEQ}[3]{\textit{ASEQ}_{#1} \, #2 \, #3}
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    19
\newcommand{\bderssimp}[2]{#1 \backslash_{bsimp} #2}
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    20
\newcommand{\rderssimp}[2]{#1 \backslash_{rsimp} #2}
506
69ad05398894 thesis chapter 2 section 2.4 2.5
Chengsong
parents: 505
diff changeset
    21
\newcommand{\bders}[2]{#1 \backslash #2}
503
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    22
\newcommand{\bsimp}[1]{\textit{bsimp}(#1)}
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    23
\newcommand{\rsimp}[1]{\textit{rsimp}(#1)}
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    24
\newcommand{\sflataux}[1]{\llparenthesis #1 \rrparenthesis'}
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    25
\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%
529
Chengsong
parents: 528
diff changeset
    26
\newcommand{\denote}{\stackrel{\mbox{\scriptsize denote}}{=}}%
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    27
\newcommand{\ZERO}{\mbox{\bf 0}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    28
\newcommand{\ONE}{\mbox{\bf 1}}
503
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    29
\newcommand{\AALTS}[2]{\oplus {\scriptstyle #1}\, #2}
505
5ce3bd8e5696 thesis chapter2 section 2.4
Chengsong
parents: 503
diff changeset
    30
\newcommand{\rdistinct}[2]{\textit{distinct} \; \textit{#1} \; #2}
5ce3bd8e5696 thesis chapter2 section 2.4
Chengsong
parents: 503
diff changeset
    31
\newcommand\hflat[1]{\llparenthesis  #1 \rrparenthesis_*}
5ce3bd8e5696 thesis chapter2 section 2.4
Chengsong
parents: 503
diff changeset
    32
\newcommand\hflataux[1]{\llparenthesis #1 \rrparenthesis_*'}
506
69ad05398894 thesis chapter 2 section 2.4 2.5
Chengsong
parents: 505
diff changeset
    33
\newcommand\createdByStar[1]{\textit{createdByStar}(#1)}
503
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    34
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    35
\newcommand\myequiv{\mathrel{\stackrel{\makebox[0pt]{\mbox{\normalfont\tiny equiv}}}{=}}}
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    36
519
Chengsong
parents: 518
diff changeset
    37
\def\decode{\textit{decode}}
Chengsong
parents: 518
diff changeset
    38
\def\internalise{\textit{internalise}}
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    39
\def\lexer{\mathit{lexer}}
519
Chengsong
parents: 518
diff changeset
    40
\def\mkeps{\textit{mkeps}}
505
5ce3bd8e5696 thesis chapter2 section 2.4
Chengsong
parents: 503
diff changeset
    41
\newcommand{\rder}[2]{#2 \backslash #1}
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    42
500
Chengsong
parents: 472
diff changeset
    43
\def\AZERO{\textit{AZERO}}
Chengsong
parents: 472
diff changeset
    44
\def\AONE{\textit{AONE}}
Chengsong
parents: 472
diff changeset
    45
\def\ACHAR{\textit{ACHAR}}
503
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    46
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
    47
\def\POSIX{\textit{POSIX}}
503
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    48
\def\ALTS{\textit{ALTS}}
500
Chengsong
parents: 472
diff changeset
    49
\def\ASTAR{\textit{ASTAR}}
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    50
\def\DFA{\textit{DFA}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    51
\def\bmkeps{\textit{bmkeps}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    52
\def\retrieve{\textit{retrieve}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    53
\def\blexer{\textit{blexer}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    54
\def\flex{\textit{flex}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    55
\def\inj{\mathit{inj}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    56
\def\Empty{\mathit{Empty}}
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
    57
\def\Left{\mathit{Left}}
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    58
\def\Right{\mathit{Right}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    59
\def\Stars{\mathit{Stars}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    60
\def\Char{\mathit{Char}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    61
\def\Seq{\mathit{Seq}}
529
Chengsong
parents: 528
diff changeset
    62
\def\Der{\textit{Der}}
Chengsong
parents: 528
diff changeset
    63
\def\Ders{\textit{Ders}}
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    64
\def\nullable{\mathit{nullable}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    65
\def\Z{\mathit{Z}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    66
\def\S{\mathit{S}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    67
\def\rup{r^\uparrow}
500
Chengsong
parents: 472
diff changeset
    68
%\def\bderssimp{\mathit{bders}\_\mathit{simp}}
Chengsong
parents: 472
diff changeset
    69
\def\distinctWith{\textit{distinctWith}}
528
28751de4b4ba revised according to comments
Chengsong
parents: 519
diff changeset
    70
\def\lf{\textit{lf}}
28751de4b4ba revised according to comments
Chengsong
parents: 519
diff changeset
    71
\def\PD{\textit{PD}}
530
823d9b19d21c all comments addressed
Chengsong
parents: 529
diff changeset
    72
\def\suffix{\textit{Suffix}}
528
28751de4b4ba revised according to comments
Chengsong
parents: 519
diff changeset
    73
500
Chengsong
parents: 472
diff changeset
    74
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
    75
\def\size{\mathit{size}}
503
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    76
\def\rexp{\mathbf{rexp}}
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    77
\def\simp{\mathit{simp}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    78
\def\simpALTs{\mathit{simp}\_\mathit{ALTs}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    79
\def\map{\mathit{map}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    80
\def\distinct{\mathit{distinct}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    81
\def\blexersimp{\mathit{blexer}\_\mathit{simp}}
500
Chengsong
parents: 472
diff changeset
    82
\def\map{\textit{map}}
503
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    83
%\def\vsuf{\textit{vsuf}}
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    84
%\def\sflataux{\textit{sflat}\_\textit{aux}}
500
Chengsong
parents: 472
diff changeset
    85
\def\rrexp{\textit{rrexp}}
503
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    86
\newcommand\rnullable[1]{\textit{rnullable}(#1)}
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    87
\newcommand\rsize[1]{\llbracket #1 \rrbracket_r}
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    88
\newcommand\asize[1]{\llbracket #1 \rrbracket}
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    89
\newcommand\rerase[1]{ (#1)\downarrow_r}
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    90
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
    91
500
Chengsong
parents: 472
diff changeset
    92
\def\erase{\textit{erase}}
Chengsong
parents: 472
diff changeset
    93
\def\STAR{\textit{STAR}}
Chengsong
parents: 472
diff changeset
    94
\def\flts{\textit{flts}}
Chengsong
parents: 472
diff changeset
    95
503
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    96
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    97
\def\RZERO{\mathbf{0}_r }
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    98
\def\RONE{\mathbf{1}_r}
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    99
\newcommand\RCHAR[1]{\mathbf{#1}_r}
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
   100
\newcommand\RSEQ[2]{#1 \cdot #2}
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
   101
\newcommand\RALTS[1]{\oplus #1}
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
   102
\newcommand\RSTAR[1]{#1^*}
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
   103
\newcommand\vsuf[2]{\textit{vsuf} \;#1\;#2}
505
5ce3bd8e5696 thesis chapter2 section 2.4
Chengsong
parents: 503
diff changeset
   104
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   105
%----------------------------------------------------------------------------------------
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   106
%This part is about regular expressions, Brzozowski derivatives,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   107
%and a bit-coded lexing algorithm with proven correctness and time bounds.
472
Chengsong
parents: 471
diff changeset
   108
Chengsong
parents: 471
diff changeset
   109
%TODO: look up snort rules to use here--give readers idea of what regexes look like
Chengsong
parents: 471
diff changeset
   110
Chengsong
parents: 471
diff changeset
   111
471
Chengsong
parents: 469
diff changeset
   112
Regular expressions are widely used in computer science: 
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   113
be it in text-editors\parencite{atomEditor} with syntax highlighting and auto-completion, 
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   114
command-line tools like $\mathit{grep}$ that facilitate easy 
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   115
text processing, network intrusion
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   116
detection systems that reject suspicious traffic, or compiler
500
Chengsong
parents: 472
diff changeset
   117
front ends--the majority of the solutions to these tasks 
Chengsong
parents: 472
diff changeset
   118
involve lexing with regular 
Chengsong
parents: 472
diff changeset
   119
expressions.
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   120
Given its usefulness and ubiquity, one would imagine that
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   121
modern regular expression matching implementations
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   122
are mature and fully studied.
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   123
Indeed, in a popular programming language' regex engine, 
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   124
supplying it with regular expressions and strings, one can
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   125
get rich matching information in a very short time.
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   126
Some network intrusion detection systems
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   127
use regex engines that are able to process 
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   128
megabytes or even gigabytes of data per second\parencite{Turo_ov__2020}.
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   129
Unfortunately, this is not the case for $\mathbf{all}$ inputs.
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   130
%TODO: get source for SNORT/BRO's regex matching engine/speed
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   131
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   132
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   133
Take $(a^*)^*\,b$ and ask whether
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   134
strings of the form $aa..a$ match this regular
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   135
expression. Obviously this is not the case---the expected $b$ in the last
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   136
position is missing. One would expect that modern regular expression
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   137
matching engines can find this out very quickly. Alas, if one tries
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   138
this example in JavaScript, Python or Java 8, even with strings of a small
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   139
length, say around 30 $a$'s, one discovers that 
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   140
this decision takes crazy time to finish given the simplicity of the problem.
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   141
This is clearly exponential behaviour, and 
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   142
is triggered by some relatively simple regex patterns, as the graphs
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   143
below show:
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   144
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   145
\begin{figure}
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   146
\centering
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   147
\begin{tabular}{@{}c@{\hspace{0mm}}c@{\hspace{0mm}}c@{}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   148
\begin{tikzpicture}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   149
\begin{axis}[
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   150
    xlabel={$n$},
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   151
    x label style={at={(1.05,-0.05)}},
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   152
    ylabel={time in secs},
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   153
    enlargelimits=false,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   154
    xtick={0,5,...,30},
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   155
    xmax=33,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   156
    ymax=35,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   157
    ytick={0,5,...,30},
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   158
    scaled ticks=false,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   159
    axis lines=left,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   160
    width=5cm,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   161
    height=4cm, 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   162
    legend entries={JavaScript},  
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   163
    legend pos=north west,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   164
    legend cell align=left]
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   165
\addplot[red,mark=*, mark options={fill=white}] table {re-js.data};
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   166
\end{axis}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   167
\end{tikzpicture}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   168
  &
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   169
\begin{tikzpicture}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   170
\begin{axis}[
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   171
    xlabel={$n$},
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   172
    x label style={at={(1.05,-0.05)}},
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   173
    %ylabel={time in secs},
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   174
    enlargelimits=false,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   175
    xtick={0,5,...,30},
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   176
    xmax=33,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   177
    ymax=35,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   178
    ytick={0,5,...,30},
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   179
    scaled ticks=false,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   180
    axis lines=left,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   181
    width=5cm,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   182
    height=4cm, 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   183
    legend entries={Python},  
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   184
    legend pos=north west,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   185
    legend cell align=left]
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   186
\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   187
\end{axis}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   188
\end{tikzpicture}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   189
  &
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   190
\begin{tikzpicture}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   191
\begin{axis}[
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   192
    xlabel={$n$},
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   193
    x label style={at={(1.05,-0.05)}},
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   194
    %ylabel={time in secs},
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   195
    enlargelimits=false,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   196
    xtick={0,5,...,30},
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   197
    xmax=33,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   198
    ymax=35,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   199
    ytick={0,5,...,30},
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   200
    scaled ticks=false,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   201
    axis lines=left,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   202
    width=5cm,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   203
    height=4cm, 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   204
    legend entries={Java 8},  
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   205
    legend pos=north west,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   206
    legend cell align=left]
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   207
\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   208
\end{axis}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   209
\end{tikzpicture}\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   210
\multicolumn{3}{c}{Graphs: Runtime for matching $(a^*)^*\,b$ with strings 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   211
           of the form $\underbrace{aa..a}_{n}$.}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   212
\end{tabular}    
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   213
\caption{aStarStarb} \label{fig:aStarStarb}
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   214
\end{figure}
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   215
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   216
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   217
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   218
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   219
This superlinear blowup in matching algorithms sometimes cause
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   220
considerable grief in real life: for example on 20 July 2016 one evil
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   221
regular expression brought the webpage
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   222
\href{http://stackexchange.com}{Stack Exchange} to its
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   223
knees.\footnote{\url{https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   224
In this instance, a regular expression intended to just trim white
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   225
spaces from the beginning and the end of a line actually consumed
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   226
massive amounts of CPU-resources---causing web servers to grind to a
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   227
halt. This happened when a post with 20,000 white spaces was submitted,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   228
but importantly the white spaces were neither at the beginning nor at
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   229
the end. As a result, the regular expression matching engine needed to
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   230
backtrack over many choices. In this example, the time needed to process
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   231
the string was $O(n^2)$ with respect to the string length. This
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   232
quadratic overhead was enough for the homepage of Stack Exchange to
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   233
respond so slowly that the load balancer assumed a $\mathit{DoS}$ 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   234
attack and therefore stopped the servers from responding to any
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   235
requests. This made the whole site become unavailable. 
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   236
A more recent example is a global outage of all Cloudflare servers on 2 July
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   237
2019. A poorly written regular expression exhibited exponential
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   238
behaviour and exhausted CPUs that serve HTTP traffic. Although the outage
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   239
had several causes, at the heart was a regular expression that
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   240
was used to monitor network
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   241
traffic.\footnote{\url{https://blog.cloudflare.com/details-of-the-cloudflare-outage-on-july-2-2019/}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   242
%TODO: data points for some new versions of languages
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   243
These problems with regular expressions 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   244
are not isolated events that happen
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   245
very occasionally, but actually widespread.
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   246
They occur so often that they get a 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   247
name--Regular-Expression-Denial-Of-Service (ReDoS)
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   248
attack.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   249
Davis et al. \parencite{Davis18} detected more
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   250
than 1000 super-linear (SL) regular expressions
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   251
in Node.js, Python core libraries, and npm and pypi. 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   252
They therefore concluded that evil regular expressions
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   253
are problems more than "a parlour trick", but one that
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   254
requires
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   255
more research attention.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   256
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   257
 \section{The Problem Behind Slow Cases}
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   258
471
Chengsong
parents: 469
diff changeset
   259
%find literature/find out for yourself that REGEX->DFA on basic regexes
Chengsong
parents: 469
diff changeset
   260
%does not blow up the size
Chengsong
parents: 469
diff changeset
   261
Shouldn't regular expression matching be linear?
Chengsong
parents: 469
diff changeset
   262
How can one explain the super-linear behaviour of the 
Chengsong
parents: 469
diff changeset
   263
regex matching engines we have?
Chengsong
parents: 469
diff changeset
   264
The time cost of regex matching algorithms in general
Chengsong
parents: 469
diff changeset
   265
involve two phases: 
Chengsong
parents: 469
diff changeset
   266
the construction phase, in which the algorithm builds some  
Chengsong
parents: 469
diff changeset
   267
suitable data structure from the input regex $r$, we denote
Chengsong
parents: 469
diff changeset
   268
the time cost by $P_1(r)$.
Chengsong
parents: 469
diff changeset
   269
The lexing
Chengsong
parents: 469
diff changeset
   270
phase, when the input string $s$ is read and the data structure
Chengsong
parents: 469
diff changeset
   271
representing that regex $r$ is being operated on. We represent the time
Chengsong
parents: 469
diff changeset
   272
it takes by $P_2(r, s)$.\\
Chengsong
parents: 469
diff changeset
   273
In the case of a $\mathit{DFA}$,
Chengsong
parents: 469
diff changeset
   274
we have $P_2(r, s) = O( |s| )$,
Chengsong
parents: 469
diff changeset
   275
because we take at most $|s|$ steps, 
Chengsong
parents: 469
diff changeset
   276
and each step takes
Chengsong
parents: 469
diff changeset
   277
at most one transition--
Chengsong
parents: 469
diff changeset
   278
a deterministic-finite-automata
Chengsong
parents: 469
diff changeset
   279
by definition has at most one state active and at most one
Chengsong
parents: 469
diff changeset
   280
transition upon receiving an input symbol.
Chengsong
parents: 469
diff changeset
   281
But unfortunately in the  worst case
Chengsong
parents: 469
diff changeset
   282
$P_1(r) = O(exp^{|r|})$. An example will be given later. \\
Chengsong
parents: 469
diff changeset
   283
For $\mathit{NFA}$s, we have $P_1(r) = O(|r|)$ if we do not unfold 
Chengsong
parents: 469
diff changeset
   284
expressions like $r^n$ into $\underbrace{r \cdots r}_{\text{n copies of r}}$.
Chengsong
parents: 469
diff changeset
   285
The $P_2(r, s)$ is bounded by $|r|\cdot|s|$, if we do not backtrack.
Chengsong
parents: 469
diff changeset
   286
On the other hand, if backtracking is used, the worst-case time bound bloats
Chengsong
parents: 469
diff changeset
   287
to $|r| * 2^|s|$ .
Chengsong
parents: 469
diff changeset
   288
%on the input
Chengsong
parents: 469
diff changeset
   289
%And when calculating the time complexity of the matching algorithm,
Chengsong
parents: 469
diff changeset
   290
%we are assuming that each input reading step requires constant time.
Chengsong
parents: 469
diff changeset
   291
%which translates to that the number of 
Chengsong
parents: 469
diff changeset
   292
%states active and transitions taken each time is bounded by a
Chengsong
parents: 469
diff changeset
   293
%constant $C$.
Chengsong
parents: 469
diff changeset
   294
%But modern  regex libraries in popular language engines
Chengsong
parents: 469
diff changeset
   295
% often want to support much richer constructs than just
Chengsong
parents: 469
diff changeset
   296
% sequences and Kleene stars,
Chengsong
parents: 469
diff changeset
   297
%such as negation, intersection, 
Chengsong
parents: 469
diff changeset
   298
%bounded repetitions and back-references.
Chengsong
parents: 469
diff changeset
   299
%And de-sugaring these "extended" regular expressions 
Chengsong
parents: 469
diff changeset
   300
%into basic ones might bloat the size exponentially.
Chengsong
parents: 469
diff changeset
   301
%TODO: more reference for exponential size blowup on desugaring. 
Chengsong
parents: 469
diff changeset
   302
\subsection{Tools that uses $\mathit{DFA}$s}
Chengsong
parents: 469
diff changeset
   303
%TODO:more tools that use DFAs?
Chengsong
parents: 469
diff changeset
   304
$\mathit{LEX}$ and $\mathit{JFLEX}$ are tools
Chengsong
parents: 469
diff changeset
   305
in $C$ and $\mathit{JAVA}$ that generates $\mathit{DFA}$-based
Chengsong
parents: 469
diff changeset
   306
lexers. The user provides a set of regular expressions
Chengsong
parents: 469
diff changeset
   307
and configurations to such lexer generators, and then 
Chengsong
parents: 469
diff changeset
   308
gets an output program encoding a minimized $\mathit{DFA}$
Chengsong
parents: 469
diff changeset
   309
that can be compiled and run. 
Chengsong
parents: 469
diff changeset
   310
The good things about $\mathit{DFA}$s is that once
Chengsong
parents: 469
diff changeset
   311
generated, they are fast and stable, unlike
Chengsong
parents: 469
diff changeset
   312
backtracking algorithms. 
Chengsong
parents: 469
diff changeset
   313
However, they do not scale well with bounded repetitions.\\
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   314
516
6fecb7fe8cd0 blexer2: modified for plotting
Chengsong
parents: 506
diff changeset
   315
6fecb7fe8cd0 blexer2: modified for plotting
Chengsong
parents: 506
diff changeset
   316
6fecb7fe8cd0 blexer2: modified for plotting
Chengsong
parents: 506
diff changeset
   317
471
Chengsong
parents: 469
diff changeset
   318
Bounded repetitions, usually written in the form
Chengsong
parents: 469
diff changeset
   319
$r^{\{c\}}$ (where $c$ is a constant natural number),
Chengsong
parents: 469
diff changeset
   320
denotes a regular expression accepting strings
Chengsong
parents: 469
diff changeset
   321
that can be divided into $c$ substrings, where each 
Chengsong
parents: 469
diff changeset
   322
substring is in $r$. 
Chengsong
parents: 469
diff changeset
   323
For the regular expression $(a|b)^*a(a|b)^{\{2\}}$,
Chengsong
parents: 469
diff changeset
   324
an $\mathit{NFA}$ describing it would look like:
Chengsong
parents: 469
diff changeset
   325
\begin{center}
Chengsong
parents: 469
diff changeset
   326
\begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto] 
Chengsong
parents: 469
diff changeset
   327
   \node[state,initial] (q_0)   {$q_0$}; 
Chengsong
parents: 469
diff changeset
   328
   \node[state, red] (q_1) [right=of q_0] {$q_1$}; 
Chengsong
parents: 469
diff changeset
   329
   \node[state, red] (q_2) [right=of q_1] {$q_2$}; 
Chengsong
parents: 469
diff changeset
   330
   \node[state, accepting, red](q_3) [right=of q_2] {$q_3$};
Chengsong
parents: 469
diff changeset
   331
    \path[->] 
Chengsong
parents: 469
diff changeset
   332
    (q_0) edge  node {a} (q_1)
Chengsong
parents: 469
diff changeset
   333
    	  edge [loop below] node {a,b} ()
Chengsong
parents: 469
diff changeset
   334
    (q_1) edge  node  {a,b} (q_2)
Chengsong
parents: 469
diff changeset
   335
    (q_2) edge  node  {a,b} (q_3);
Chengsong
parents: 469
diff changeset
   336
\end{tikzpicture}
Chengsong
parents: 469
diff changeset
   337
\end{center}
Chengsong
parents: 469
diff changeset
   338
The red states are "countdown states" which counts down 
Chengsong
parents: 469
diff changeset
   339
the number of characters needed in addition to the current
Chengsong
parents: 469
diff changeset
   340
string to make a successful match.
Chengsong
parents: 469
diff changeset
   341
For example, state $q_1$ indicates a match that has
Chengsong
parents: 469
diff changeset
   342
gone past the $(a|b)^*$ part of $(a|b)^*a(a|b)^{\{2\}}$,
Chengsong
parents: 469
diff changeset
   343
and just consumed the "delimiter" $a$ in the middle, and 
Chengsong
parents: 469
diff changeset
   344
need to match 2 more iterations of $(a|b)$ to complete.
Chengsong
parents: 469
diff changeset
   345
State $q_2$ on the other hand, can be viewed as a state
Chengsong
parents: 469
diff changeset
   346
after $q_1$ has consumed 1 character, and just waits
Chengsong
parents: 469
diff changeset
   347
for 1 more character to complete.
Chengsong
parents: 469
diff changeset
   348
$q_3$ is the last state, requiring 0 more character and is accepting.
Chengsong
parents: 469
diff changeset
   349
Depending on the suffix of the
Chengsong
parents: 469
diff changeset
   350
input string up to the current read location,
Chengsong
parents: 469
diff changeset
   351
the states $q_1$ and $q_2$, $q_3$
Chengsong
parents: 469
diff changeset
   352
may or may
Chengsong
parents: 469
diff changeset
   353
not be active, independent from each other.
Chengsong
parents: 469
diff changeset
   354
A $\mathit{DFA}$ for such an $\mathit{NFA}$ would
Chengsong
parents: 469
diff changeset
   355
contain at least $2^3$ non-equivalent states that cannot be merged, 
Chengsong
parents: 469
diff changeset
   356
because the subset construction during determinisation will generate
Chengsong
parents: 469
diff changeset
   357
all the elements in the power set $\mathit{Pow}\{q_1, q_2, q_3\}$.
Chengsong
parents: 469
diff changeset
   358
Generalizing this to regular expressions with larger
Chengsong
parents: 469
diff changeset
   359
bounded repetitions number, we have that
Chengsong
parents: 469
diff changeset
   360
regexes shaped like $r^*ar^{\{n\}}$ when converted to $\mathit{DFA}$s
Chengsong
parents: 469
diff changeset
   361
would require at least $2^{n+1}$ states, if $r$ contains
Chengsong
parents: 469
diff changeset
   362
more than 1 string.
Chengsong
parents: 469
diff changeset
   363
This is to represent all different 
Chengsong
parents: 469
diff changeset
   364
scenarios which "countdown" states are active.
Chengsong
parents: 469
diff changeset
   365
For those regexes, tools such as $\mathit{JFLEX}$ 
Chengsong
parents: 469
diff changeset
   366
would generate gigantic $\mathit{DFA}$'s or
Chengsong
parents: 469
diff changeset
   367
out of memory errors.
Chengsong
parents: 469
diff changeset
   368
For this reason, regex libraries that support 
Chengsong
parents: 469
diff changeset
   369
bounded repetitions often choose to use the $\mathit{NFA}$ 
Chengsong
parents: 469
diff changeset
   370
approach.
Chengsong
parents: 469
diff changeset
   371
\subsection{The $\mathit{NFA}$ approach to regex matching}
Chengsong
parents: 469
diff changeset
   372
One can simulate the $\mathit{NFA}$ running in two ways:
Chengsong
parents: 469
diff changeset
   373
one by keeping track of all active states after consuming 
Chengsong
parents: 469
diff changeset
   374
a character, and update that set of states iteratively.
Chengsong
parents: 469
diff changeset
   375
This can be viewed as a breadth-first-search of the $\mathit{NFA}$
Chengsong
parents: 469
diff changeset
   376
for a path terminating
Chengsong
parents: 469
diff changeset
   377
at an accepting state.
Chengsong
parents: 469
diff changeset
   378
Languages like $\mathit{Go}$ and $\mathit{Rust}$ use this
Chengsong
parents: 469
diff changeset
   379
type of $\mathit{NFA}$ simulation, and guarantees a linear runtime
Chengsong
parents: 469
diff changeset
   380
in terms of input string length.
Chengsong
parents: 469
diff changeset
   381
%TODO:try out these lexers
Chengsong
parents: 469
diff changeset
   382
The other way to use $\mathit{NFA}$ for matching is choosing  
Chengsong
parents: 469
diff changeset
   383
a single transition each time, keeping all the other options in 
Chengsong
parents: 469
diff changeset
   384
a queue or stack, and backtracking if that choice eventually 
Chengsong
parents: 469
diff changeset
   385
fails. This method, often called a  "depth-first-search", 
Chengsong
parents: 469
diff changeset
   386
is efficient in a lot of cases, but could end up
Chengsong
parents: 469
diff changeset
   387
with exponential run time.\\
Chengsong
parents: 469
diff changeset
   388
%TODO:COMPARE java python lexer speed with Rust and Go
Chengsong
parents: 469
diff changeset
   389
The reason behind backtracking algorithms in languages like
Chengsong
parents: 469
diff changeset
   390
Java and Python is that they support back-references.
Chengsong
parents: 469
diff changeset
   391
\subsection{Back References in Regex--Non-Regular part}
Chengsong
parents: 469
diff changeset
   392
If we have a regular expression like this (the sequence
Chengsong
parents: 469
diff changeset
   393
operator is omitted for brevity):
Chengsong
parents: 469
diff changeset
   394
\begin{center}
Chengsong
parents: 469
diff changeset
   395
	$r_1(r_2(r_3r_4))$
Chengsong
parents: 469
diff changeset
   396
\end{center}
Chengsong
parents: 469
diff changeset
   397
We could label sub-expressions of interest 
Chengsong
parents: 469
diff changeset
   398
by parenthesizing them and giving 
Chengsong
parents: 469
diff changeset
   399
them a number by the order in which their opening parentheses appear.
Chengsong
parents: 469
diff changeset
   400
One possible way of parenthesizing and labelling is given below:
Chengsong
parents: 469
diff changeset
   401
\begin{center}
Chengsong
parents: 469
diff changeset
   402
	$\underset{1}{(}r_1\underset{2}{(}r_2\underset{3}{(}r_3)\underset{4}{(}r_4)))$
Chengsong
parents: 469
diff changeset
   403
\end{center}
Chengsong
parents: 469
diff changeset
   404
$r_1r_2r_3r_4$, $r_1r_2r_3$, $r_3$, $r_4$ are labelled
Chengsong
parents: 469
diff changeset
   405
by 1 to 4. $1$ would refer to the entire expression 
Chengsong
parents: 469
diff changeset
   406
$(r_1(r_2(r_3)(r_4)))$, $2$ referring to $r_2(r_3)(r_4)$, etc.
Chengsong
parents: 469
diff changeset
   407
These sub-expressions are called "capturing groups".
Chengsong
parents: 469
diff changeset
   408
We can use the following syntax to denote that we want a string just matched by a 
Chengsong
parents: 469
diff changeset
   409
sub-expression (capturing group) to appear at a certain location again, 
Chengsong
parents: 469
diff changeset
   410
exactly as it was:
Chengsong
parents: 469
diff changeset
   411
\begin{center}
Chengsong
parents: 469
diff changeset
   412
$\ldots\underset{\text{i-th lparen}}{(}{r_i})\ldots 
Chengsong
parents: 469
diff changeset
   413
\underset{s_i \text{ which just matched} \;r_i}{\backslash i}$
Chengsong
parents: 469
diff changeset
   414
\end{center}
Chengsong
parents: 469
diff changeset
   415
The backslash and number $i$ are used to denote such 
Chengsong
parents: 469
diff changeset
   416
so-called "back-references".
Chengsong
parents: 469
diff changeset
   417
Let $e$ be an expression made of regular expressions 
Chengsong
parents: 469
diff changeset
   418
and back-references. $e$ contains the expression $e_i$
Chengsong
parents: 469
diff changeset
   419
as its $i$-th capturing group.
Chengsong
parents: 469
diff changeset
   420
The semantics of back-reference can be recursively
Chengsong
parents: 469
diff changeset
   421
written as:
Chengsong
parents: 469
diff changeset
   422
\begin{center}
Chengsong
parents: 469
diff changeset
   423
	\begin{tabular}{c}
Chengsong
parents: 469
diff changeset
   424
		$L ( e \cdot \backslash i) = \{s @ s_i \mid s \in L (e)\quad s_i \in L(r_i)$\\
Chengsong
parents: 469
diff changeset
   425
		$s_i\; \text{match of ($e$, $s$)'s $i$-th capturing group string}\}$
Chengsong
parents: 469
diff changeset
   426
	\end{tabular}
Chengsong
parents: 469
diff changeset
   427
\end{center}
Chengsong
parents: 469
diff changeset
   428
The concrete example
Chengsong
parents: 469
diff changeset
   429
$((a|b|c|\ldots|z)^*)\backslash 1$
Chengsong
parents: 469
diff changeset
   430
would match the string like $\mathit{bobo}$, $\mathit{weewee}$ and etc.\\
Chengsong
parents: 469
diff changeset
   431
Back-reference is a construct in the "regex" standard
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   432
that programmers found useful, but not exactly 
471
Chengsong
parents: 469
diff changeset
   433
regular any more.
Chengsong
parents: 469
diff changeset
   434
In fact, that allows the regex construct to express 
Chengsong
parents: 469
diff changeset
   435
languages that cannot be contained in context-free
Chengsong
parents: 469
diff changeset
   436
languages either.
Chengsong
parents: 469
diff changeset
   437
For example, the back-reference $((a^*)b\backslash1 b \backslash 1$
Chengsong
parents: 469
diff changeset
   438
expresses the language $\{a^n b a^n b a^n\mid n \in \mathbb{N}\}$,
Chengsong
parents: 469
diff changeset
   439
which cannot be expressed by context-free grammars\parencite{campeanu2003formal}.
Chengsong
parents: 469
diff changeset
   440
Such a language is contained in the context-sensitive hierarchy
Chengsong
parents: 469
diff changeset
   441
of formal languages. 
Chengsong
parents: 469
diff changeset
   442
Solving the back-reference expressions matching problem
Chengsong
parents: 469
diff changeset
   443
is NP-complete\parencite{alfred2014algorithms} and a non-bactracking,
Chengsong
parents: 469
diff changeset
   444
efficient solution is not known to exist.
Chengsong
parents: 469
diff changeset
   445
%TODO:read a bit more about back reference algorithms
Chengsong
parents: 469
diff changeset
   446
It seems that languages like Java and Python made the trade-off
Chengsong
parents: 469
diff changeset
   447
to support back-references at the expense of having to backtrack,
Chengsong
parents: 469
diff changeset
   448
even in the case of regexes not involving back-references.\\
Chengsong
parents: 469
diff changeset
   449
Summing these up, we can categorise existing 
Chengsong
parents: 469
diff changeset
   450
practical regex libraries into the ones  with  linear
Chengsong
parents: 469
diff changeset
   451
time guarantees like Go and Rust, which impose restrictions
Chengsong
parents: 469
diff changeset
   452
on the user input (not allowing back-references, 
Chengsong
parents: 469
diff changeset
   453
bounded repetitions canno exceed 1000 etc.), and ones  
Chengsong
parents: 469
diff changeset
   454
 that allows the programmer much freedom, but grinds to a halt
Chengsong
parents: 469
diff changeset
   455
 in some non-negligible portion of cases.
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   456
 %TODO: give examples such as RE2 GOLANG 1000 restriction, rust no repetitions 
471
Chengsong
parents: 469
diff changeset
   457
% For example, the Rust regex engine claims to be linear, 
Chengsong
parents: 469
diff changeset
   458
% but does not support lookarounds and back-references.
Chengsong
parents: 469
diff changeset
   459
% The GoLang regex library does not support over 1000 repetitions.  
Chengsong
parents: 469
diff changeset
   460
% Java and Python both support back-references, but shows
Chengsong
parents: 469
diff changeset
   461
%catastrophic backtracking behaviours on inputs without back-references(
Chengsong
parents: 469
diff changeset
   462
%when the language is still regular).
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   463
 %TODO: test performance of Rust on (((((a*a*)b*)b){20})*)c  baabaabababaabaaaaaaaaababaaaababababaaaabaaabaaaaaabaabaabababaababaaaaaaaaababaaaababababaaaaaaaaaaaaac
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   464
 %TODO: verify the fact Rust does not allow 1000+ reps
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   465
 %TODO: Java 17 updated graphs? Is it ok to still use Java 8 graphs?
471
Chengsong
parents: 469
diff changeset
   466
\section{Buggy Regex Engines} 
Chengsong
parents: 469
diff changeset
   467
Chengsong
parents: 469
diff changeset
   468
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   469
 Another thing about these libraries is that there
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   470
 is no correctness guarantee.
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   471
 In some cases, they either fail to generate a lexing result when there exists a match,
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   472
 or give the wrong way of matching.
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   473
 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   474
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   475
It turns out that regex libraries not only suffer from 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   476
exponential backtracking problems, 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   477
but also undesired (or even buggy) outputs.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   478
%TODO: comment from who
471
Chengsong
parents: 469
diff changeset
   479
Kuklewicz\parencite{KuklewiczHaskell} commented that most regex libraries are not
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   480
correctly implementing the POSIX (maximum-munch)
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   481
rule of regular expression matching.
471
Chengsong
parents: 469
diff changeset
   482
This experience is echoed by the writer's
Chengsong
parents: 469
diff changeset
   483
tryout of a few online regex testers:
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   484
A concrete example would be 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   485
the regex
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   486
\begin{verbatim}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   487
(((((a*a*)b*)b){20})*)c
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   488
\end{verbatim}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   489
and the string
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   490
\begin{verbatim}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   491
baabaabababaabaaaaaaaaababaa
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   492
aababababaaaabaaabaaaaaabaab
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   493
aabababaababaaaaaaaaababaaaa
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   494
babababaaaaaaaaaaaaac
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   495
\end{verbatim}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   496
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   497
This seemingly complex regex simply says "some $a$'s
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   498
followed by some $b$'s then followed by 1 single $b$,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   499
and this iterates 20 times, finally followed by a $c$.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   500
And a POSIX match would involve the entire string,"eating up"
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   501
all the $b$'s in it.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   502
%TODO: give a coloured example of how this matches POSIXly
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   503
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   504
This regex would trigger catastrophic backtracking in 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   505
languages like Python and Java,
471
Chengsong
parents: 469
diff changeset
   506
whereas it gives a non-POSIX  and uninformative 
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   507
match in languages like Go or .NET--The match with only 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   508
character $c$.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   509
471
Chengsong
parents: 469
diff changeset
   510
As Grathwohl\parencite{grathwohl2014crash} commented,
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   511
\begin{center}
471
Chengsong
parents: 469
diff changeset
   512
	``The POSIX strategy is more complicated than the greedy because of the dependence on information about the length of matched strings in the various subexpressions.''
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   513
\end{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   514
472
Chengsong
parents: 471
diff changeset
   515
%\section{How people solve problems with regexes}
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   516
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   517
472
Chengsong
parents: 471
diff changeset
   518
When a regular expression does not behave as intended,
Chengsong
parents: 471
diff changeset
   519
people usually try to rewrite the regex to some equivalent form
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   520
or they try to avoid the possibly problematic patterns completely,
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   521
for which many false positives exist\parencite{Davis18}.
472
Chengsong
parents: 471
diff changeset
   522
Animated tools to "debug" regular expressions
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   523
are also popular, regexploit\parencite{regexploit2021}, regex101\parencite{regex101} 
472
Chengsong
parents: 471
diff changeset
   524
to name a few.
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   525
We are also aware of static analysis work on regular expressions that
472
Chengsong
parents: 471
diff changeset
   526
aims to detect potentially expoential regex patterns. Rathnayake and Thielecke 
471
Chengsong
parents: 469
diff changeset
   527
\parencite{Rathnayake2014StaticAF} proposed an algorithm
Chengsong
parents: 469
diff changeset
   528
that detects regular expressions triggering exponential
Chengsong
parents: 469
diff changeset
   529
behavious on backtracking matchers.
472
Chengsong
parents: 471
diff changeset
   530
Weideman \parencite{Weideman2017Static} came up with 
Chengsong
parents: 471
diff changeset
   531
non-linear polynomial worst-time estimates
471
Chengsong
parents: 469
diff changeset
   532
for regexes, attack string that exploit the worst-time 
Chengsong
parents: 469
diff changeset
   533
scenario, and "attack automata" that generates
472
Chengsong
parents: 471
diff changeset
   534
attack strings.
Chengsong
parents: 471
diff changeset
   535
%Arguably these methods limits the programmers' freedom
Chengsong
parents: 471
diff changeset
   536
%or productivity when all they want is to come up with a regex
Chengsong
parents: 471
diff changeset
   537
%that solves the text processing problem.
Chengsong
parents: 471
diff changeset
   538
471
Chengsong
parents: 469
diff changeset
   539
%TODO:also the regex101 debugger
Chengsong
parents: 469
diff changeset
   540
\section{Our Solution--Formal Specification of POSIX and Brzozowski Derivatives}
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   541
 Is it possible to have a regex lexing algorithm with proven correctness and 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   542
 time complexity, which allows easy extensions to
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   543
  constructs like 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   544
 bounded repetitions, negation,  lookarounds, and even back-references? 
472
Chengsong
parents: 471
diff changeset
   545
  
Chengsong
parents: 471
diff changeset
   546
  We propose Brzozowski derivatives on regular expressions as
Chengsong
parents: 471
diff changeset
   547
  a solution to this.
Chengsong
parents: 471
diff changeset
   548
  
Chengsong
parents: 471
diff changeset
   549
  In the last fifteen or so years, Brzozowski's derivatives of regular
Chengsong
parents: 471
diff changeset
   550
expressions have sparked quite a bit of interest in the functional
Chengsong
parents: 471
diff changeset
   551
programming and theorem prover communities.  The beauty of
Chengsong
parents: 471
diff changeset
   552
Brzozowski's derivatives \parencite{Brzozowski1964} is that they are neatly
Chengsong
parents: 471
diff changeset
   553
expressible in any functional language, and easily definable and
Chengsong
parents: 471
diff changeset
   554
reasoned about in theorem provers---the definitions just consist of
Chengsong
parents: 471
diff changeset
   555
inductive datatypes and simple recursive functions. 
Chengsong
parents: 471
diff changeset
   556
And an algorithms based on it by 
Chengsong
parents: 471
diff changeset
   557
Suzmann and Lu  \parencite{Sulzmann2014} allows easy extension
Chengsong
parents: 471
diff changeset
   558
to include  extended regular expressions and 
Chengsong
parents: 471
diff changeset
   559
 simplification of internal data structures 
Chengsong
parents: 471
diff changeset
   560
 eliminating the exponential behaviours.
Chengsong
parents: 471
diff changeset
   561
 
Chengsong
parents: 471
diff changeset
   562
519
Chengsong
parents: 518
diff changeset
   563
  \section{Motivation}
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   564
  
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   565
Derivatives give a simple solution
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   566
to the problem of matching a string $s$ with a regular
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   567
expression $r$: if the derivative of $r$ w.r.t.\ (in
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   568
succession) all the characters of the string matches the empty string,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   569
then $r$ matches $s$ (and {\em vice versa}).  
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   570
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   571
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   572
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   573
However, two difficulties with derivative-based matchers exist:
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   574
First, Brzozowski's original matcher only generates a yes/no answer
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   575
for whether a regular expression matches a string or not.  This is too
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   576
little information in the context of lexing where separate tokens must
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   577
be identified and also classified (for example as keywords
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   578
or identifiers).  Sulzmann and Lu~\cite{Sulzmann2014} overcome this
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   579
difficulty by cleverly extending Brzozowski's matching
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   580
algorithm. Their extended version generates additional information on
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   581
\emph{how} a regular expression matches a string following the POSIX
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   582
rules for regular expression matching. They achieve this by adding a
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   583
second ``phase'' to Brzozowski's algorithm involving an injection
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   584
function.  In our own earlier work we provided the formal
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   585
specification of what POSIX matching means and proved in Isabelle/HOL
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   586
the correctness
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   587
of Sulzmann and Lu's extended algorithm accordingly
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   588
\cite{AusafDyckhoffUrban2016}.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   589
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   590
The second difficulty is that Brzozowski's derivatives can 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   591
grow to arbitrarily big sizes. For example if we start with the
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   592
regular expression $(a+aa)^*$ and take
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   593
successive derivatives according to the character $a$, we end up with
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   594
a sequence of ever-growing derivatives like 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   595
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   596
\def\ll{\stackrel{\_\backslash{} a}{\longrightarrow}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   597
\begin{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   598
\begin{tabular}{rll}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   599
$(a + aa)^*$ & $\ll$ & $(\ONE + \ONE{}a) \cdot (a + aa)^*$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   600
& $\ll$ & $(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* \;+\; (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   601
& $\ll$ & $(\ZERO + \ZERO{}a + \ZERO) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^* \;+\; $\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   602
& & $\qquad(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   603
& $\ll$ & \ldots \hspace{15mm}(regular expressions of sizes 98, 169, 283, 468, 767, \ldots)
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   604
\end{tabular}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   605
\end{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   606
 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   607
\noindent where after around 35 steps we run out of memory on a
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   608
typical computer (we shall define shortly the precise details of our
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   609
regular expressions and the derivative operation).  Clearly, the
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   610
notation involving $\ZERO$s and $\ONE$s already suggests
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   611
simplification rules that can be applied to regular regular
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   612
expressions, for example $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   613
\Rightarrow r$, $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   614
r$. While such simple-minded simplifications have been proved in our
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   615
earlier work to preserve the correctness of Sulzmann and Lu's
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   616
algorithm \cite{AusafDyckhoffUrban2016}, they unfortunately do
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   617
\emph{not} help with limiting the growth of the derivatives shown
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   618
above: the growth is slowed, but the derivatives can still grow rather
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   619
quickly beyond any finite bound.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   620
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   621
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   622
Sulzmann and Lu overcome this ``growth problem'' in a second algorithm
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   623
\cite{Sulzmann2014} where they introduce bitcoded
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   624
regular expressions. In this version, POSIX values are
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   625
represented as bitsequences and such sequences are incrementally generated
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   626
when derivatives are calculated. The compact representation
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   627
of bitsequences and regular expressions allows them to define a more
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   628
``aggressive'' simplification method that keeps the size of the
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   629
derivatives finite no matter what the length of the string is.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   630
They make some informal claims about the correctness and linear behaviour
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   631
of this version, but do not provide any supporting proof arguments, not
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   632
even ``pencil-and-paper'' arguments. They write about their bitcoded
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   633
\emph{incremental parsing method} (that is the algorithm to be formalised
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   634
in this paper):
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   635
519
Chengsong
parents: 518
diff changeset
   636
Chengsong
parents: 518
diff changeset
   637
  
Chengsong
parents: 518
diff changeset
   638
  \begin{quote}\it
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   639
  ``Correctness Claim: We further claim that the incremental parsing
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   640
  method [..] in combination with the simplification steps [..]
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   641
  yields POSIX parse trees. We have tested this claim
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   642
  extensively [..] but yet
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   643
  have to work out all proof details.'' \cite[Page 14]{Sulzmann2014}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   644
\end{quote}  
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   645
519
Chengsong
parents: 518
diff changeset
   646
Ausaf and Urban were able to back this correctness claim with
Chengsong
parents: 518
diff changeset
   647
a formal proof.
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   648
519
Chengsong
parents: 518
diff changeset
   649
But as they stated,
Chengsong
parents: 518
diff changeset
   650
  \begin{quote}\it
Chengsong
parents: 518
diff changeset
   651
The next step would be to implement a more aggressive simplification procedure on annotated regular expressions and then prove the corresponding algorithm generates the same values as blexer. Alas due to time constraints we are unable to do so here.
Chengsong
parents: 518
diff changeset
   652
\end{quote}  
Chengsong
parents: 518
diff changeset
   653
Chengsong
parents: 518
diff changeset
   654
This thesis implements the aggressive simplifications envisioned
Chengsong
parents: 518
diff changeset
   655
by Ausaf and Urban,
Chengsong
parents: 518
diff changeset
   656
and gives a formal proof of the correctness with those simplifications.
Chengsong
parents: 518
diff changeset
   657
Chengsong
parents: 518
diff changeset
   658
Chengsong
parents: 518
diff changeset
   659
 
Chengsong
parents: 518
diff changeset
   660
Chengsong
parents: 518
diff changeset
   661
Chengsong
parents: 518
diff changeset
   662
Chengsong
parents: 518
diff changeset
   663
%----------------------------------------------------------------------------------------
Chengsong
parents: 518
diff changeset
   664
Chengsong
parents: 518
diff changeset
   665
\section{Contribution}
Chengsong
parents: 518
diff changeset
   666
Chengsong
parents: 518
diff changeset
   667
Chengsong
parents: 518
diff changeset
   668
Chengsong
parents: 518
diff changeset
   669
This work addresses the vulnerability of super-linear and
Chengsong
parents: 518
diff changeset
   670
buggy regex implementations by the combination
Chengsong
parents: 518
diff changeset
   671
of Brzozowski's derivatives and interactive theorem proving. 
Chengsong
parents: 518
diff changeset
   672
We give an 
Chengsong
parents: 518
diff changeset
   673
improved version of  Sulzmann and Lu's bit-coded algorithm using 
Chengsong
parents: 518
diff changeset
   674
derivatives, which come with a formal guarantee in terms of correctness and 
Chengsong
parents: 518
diff changeset
   675
running time as an Isabelle/HOL proof.
Chengsong
parents: 518
diff changeset
   676
Then we improve the algorithm with an even stronger version of 
Chengsong
parents: 518
diff changeset
   677
simplification, and prove a time bound linear to input and
Chengsong
parents: 518
diff changeset
   678
cubic to regular expression size using a technique by
Chengsong
parents: 518
diff changeset
   679
Antimirov.
Chengsong
parents: 518
diff changeset
   680
Chengsong
parents: 518
diff changeset
   681
 
Chengsong
parents: 518
diff changeset
   682
The main contribution of this thesis is a proven correct lexing algorithm
Chengsong
parents: 518
diff changeset
   683
with formalized time bounds.
Chengsong
parents: 518
diff changeset
   684
To our best knowledge, no lexing libraries using Brzozowski derivatives
Chengsong
parents: 518
diff changeset
   685
have a provable time guarantee, 
Chengsong
parents: 518
diff changeset
   686
and claims about running time are usually speculative and backed by thin empirical
Chengsong
parents: 518
diff changeset
   687
evidence.
Chengsong
parents: 518
diff changeset
   688
%TODO: give references
Chengsong
parents: 518
diff changeset
   689
For example, Sulzmann and Lu had proposed an algorithm  in which they
Chengsong
parents: 518
diff changeset
   690
claim a linear running time.
Chengsong
parents: 518
diff changeset
   691
But that was falsified by our experiments and the running time 
Chengsong
parents: 518
diff changeset
   692
is actually $\Omega(2^n)$ in the worst case.
Chengsong
parents: 518
diff changeset
   693
A similar claim about a theoretical runtime of $O(n^2)$ is made for the Verbatim
Chengsong
parents: 518
diff changeset
   694
%TODO: give references
Chengsong
parents: 518
diff changeset
   695
lexer, which calculates POSIX matches and is based on derivatives.
Chengsong
parents: 518
diff changeset
   696
They formalized the correctness of the lexer, but not the complexity.
Chengsong
parents: 518
diff changeset
   697
In the performance evaluation section, they simply analyzed the run time
Chengsong
parents: 518
diff changeset
   698
of matching $a$ with the string $\underbrace{a \ldots a}_{\text{n a's}}$
Chengsong
parents: 518
diff changeset
   699
and concluded that the algorithm is quadratic in terms of input length.
Chengsong
parents: 518
diff changeset
   700
When we tried out their extracted OCaml code with our example $(a+aa)^*$,
Chengsong
parents: 518
diff changeset
   701
the time it took to lex only 40 $a$'s was 5 minutes.
Chengsong
parents: 518
diff changeset
   702
Chengsong
parents: 518
diff changeset
   703
We  believe our results of a proof of performance on general
Chengsong
parents: 518
diff changeset
   704
inputs rather than specific examples a novel contribution.\\
Chengsong
parents: 518
diff changeset
   705
Chengsong
parents: 518
diff changeset
   706
Chengsong
parents: 518
diff changeset
   707
\subsection{Related Work}
Chengsong
parents: 518
diff changeset
   708
We are aware
Chengsong
parents: 518
diff changeset
   709
of a mechanised correctness proof of Brzozowski's derivative-based matcher in HOL4 by
Chengsong
parents: 518
diff changeset
   710
Owens and Slind~\parencite{Owens2008}. Another one in Isabelle/HOL is part
Chengsong
parents: 518
diff changeset
   711
of the work by Krauss and Nipkow \parencite{Krauss2011}.  And another one
Chengsong
parents: 518
diff changeset
   712
in Coq is given by Coquand and Siles \parencite{Coquand2012}.
Chengsong
parents: 518
diff changeset
   713
Also Ribeiro and Du Bois give one in Agda \parencite{RibeiroAgda2017}.
Chengsong
parents: 518
diff changeset
   714
 
Chengsong
parents: 518
diff changeset
   715
 %We propose Brzozowski's derivatives as a solution to this problem.
Chengsong
parents: 518
diff changeset
   716
% about Lexing Using Brzozowski derivatives
Chengsong
parents: 518
diff changeset
   717
Chengsong
parents: 518
diff changeset
   718
Chengsong
parents: 518
diff changeset
   719
\section{Structure of the thesis}
Chengsong
parents: 518
diff changeset
   720
In chapter 2 \ref{Chapter2} we will introduce the concepts
Chengsong
parents: 518
diff changeset
   721
and notations we 
Chengsong
parents: 518
diff changeset
   722
use for describing the lexing algorithm by Sulzmann and Lu,
Chengsong
parents: 518
diff changeset
   723
and then give the algorithm and its variant, and discuss
Chengsong
parents: 518
diff changeset
   724
why more aggressive simplifications are needed. 
Chengsong
parents: 518
diff changeset
   725
Then we illustrate in Chapter 3\ref{Chapter3}
Chengsong
parents: 518
diff changeset
   726
how the algorithm without bitcodes falls short for such aggressive 
Chengsong
parents: 518
diff changeset
   727
simplifications and therefore introduce our version of the
Chengsong
parents: 518
diff changeset
   728
 bitcoded algorithm and 
Chengsong
parents: 518
diff changeset
   729
its correctness proof .  
Chengsong
parents: 518
diff changeset
   730
In Chapter 4 \ref{Chapter4} we give the second guarantee
Chengsong
parents: 518
diff changeset
   731
of our bitcoded algorithm, that is a finite bound on the size of any 
Chengsong
parents: 518
diff changeset
   732
regex's derivatives.
Chengsong
parents: 518
diff changeset
   733
In Chapter 5\ref{Chapter5} we discuss stronger simplifications to improve the finite bound
Chengsong
parents: 518
diff changeset
   734
in Chapter 4 to a polynomial one, and demonstrate how one can extend the
Chengsong
parents: 518
diff changeset
   735
algorithm to include constructs such as bounded repetitions and negations.
Chengsong
parents: 518
diff changeset
   736
 
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   737
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   738
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   739
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   740
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   741
%----------------------------------------------------------------------------------------
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   742
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   743
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   744
%----------------------------------------------------------------------------------------
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   745
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   746
%----------------------------------------------------------------------------------------
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   747
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   748
%----------------------------------------------------------------------------------------
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   749
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   750