ChengsongTanPhdThesis/Chapters/Introduction.tex
author Chengsong
Mon, 06 Jun 2022 23:17:45 +0100
changeset 537 50e590823220
parent 532 cc54ce075db5
child 538 8016a2480704
permissions -rwxr-xr-x
more
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     1
% Chapter 1
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     2
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     3
\chapter{Introduction} % Main chapter title
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     4
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     5
\label{Introduction} % For referencing the chapter elsewhere, use \ref{Chapter1} 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     6
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     7
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     8
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     9
% Define some commands to keep the formatting separated from the content 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    10
\newcommand{\keyword}[1]{\textbf{#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    11
\newcommand{\tabhead}[1]{\textbf{#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    12
\newcommand{\code}[1]{\texttt{#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    13
\newcommand{\file}[1]{\texttt{\bfseries#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    14
\newcommand{\option}[1]{\texttt{\itshape#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    15
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    16
%boxes
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    17
\newcommand*{\mybox}[1]{\framebox{\strut #1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    18
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    19
%\newcommand{\sflataux}[1]{\textit{sflat}\_\textit{aux} \, #1}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    20
\newcommand\sflat[1]{\llparenthesis #1 \rrparenthesis }
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    21
\newcommand{\ASEQ}[3]{\textit{ASEQ}_{#1} \, #2 \, #3}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    22
\newcommand{\bderssimp}[2]{#1 \backslash_{bsimp} #2}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    23
\newcommand{\rderssimp}[2]{#1 \backslash_{rsimp} #2}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    24
\newcommand{\bders}[2]{#1 \backslash #2}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    25
\newcommand{\bsimp}[1]{\textit{bsimp}(#1)}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    26
\newcommand{\rsimp}[1]{\textit{rsimp}(#1)}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    27
\newcommand{\sflataux}[1]{\llparenthesis #1 \rrparenthesis'}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    28
\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    29
\newcommand{\denote}{\stackrel{\mbox{\scriptsize denote}}{=}}%
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    30
\newcommand{\ZERO}{\mbox{\bf 0}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    31
\newcommand{\ONE}{\mbox{\bf 1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    32
\newcommand{\AALTS}[2]{\oplus {\scriptstyle #1}\, #2}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    33
\newcommand{\rdistinct}[2]{\textit{distinct} \; \textit{#1} \; #2}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    34
\newcommand\hflat[1]{\llparenthesis  #1 \rrparenthesis_*}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    35
\newcommand\hflataux[1]{\llparenthesis #1 \rrparenthesis_*'}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    36
\newcommand\createdByStar[1]{\textit{createdByStar}(#1)}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    37
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    38
\newcommand\myequiv{\mathrel{\stackrel{\makebox[0pt]{\mbox{\normalfont\tiny equiv}}}{=}}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    39
537
Chengsong
parents: 532
diff changeset
    40
\def\code{\textit{code}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    41
\def\decode{\textit{decode}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    42
\def\internalise{\textit{internalise}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    43
\def\lexer{\mathit{lexer}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    44
\def\mkeps{\textit{mkeps}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    45
\newcommand{\rder}[2]{#2 \backslash #1}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    46
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    47
\def\AZERO{\textit{AZERO}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    48
\def\AONE{\textit{AONE}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    49
\def\ACHAR{\textit{ACHAR}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    50
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    51
\def\POSIX{\textit{POSIX}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    52
\def\ALTS{\textit{ALTS}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    53
\def\ASTAR{\textit{ASTAR}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    54
\def\DFA{\textit{DFA}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    55
\def\bmkeps{\textit{bmkeps}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    56
\def\retrieve{\textit{retrieve}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    57
\def\blexer{\textit{blexer}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    58
\def\flex{\textit{flex}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    59
\def\inj{\mathit{inj}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    60
\def\Empty{\mathit{Empty}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    61
\def\Left{\mathit{Left}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    62
\def\Right{\mathit{Right}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    63
\def\Stars{\mathit{Stars}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    64
\def\Char{\mathit{Char}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    65
\def\Seq{\mathit{Seq}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    66
\def\Der{\textit{Der}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    67
\def\Ders{\textit{Ders}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    68
\def\nullable{\mathit{nullable}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    69
\def\Z{\mathit{Z}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    70
\def\S{\mathit{S}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    71
\def\rup{r^\uparrow}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    72
%\def\bderssimp{\mathit{bders}\_\mathit{simp}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    73
\def\distinctWith{\textit{distinctWith}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    74
\def\lf{\textit{lf}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    75
\def\PD{\textit{PD}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    76
\def\suffix{\textit{Suffix}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    77
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    78
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    79
\def\size{\mathit{size}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    80
\def\rexp{\mathbf{rexp}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    81
\def\simp{\mathit{simp}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    82
\def\simpALTs{\mathit{simp}\_\mathit{ALTs}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    83
\def\map{\mathit{map}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    84
\def\distinct{\mathit{distinct}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    85
\def\blexersimp{\mathit{blexer}\_\mathit{simp}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    86
\def\map{\textit{map}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    87
%\def\vsuf{\textit{vsuf}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    88
%\def\sflataux{\textit{sflat}\_\textit{aux}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    89
\def\rrexp{\textit{rrexp}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    90
\newcommand\rnullable[1]{\textit{rnullable}(#1)}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    91
\newcommand\rsize[1]{\llbracket #1 \rrbracket_r}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    92
\newcommand\asize[1]{\llbracket #1 \rrbracket}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    93
\newcommand\rerase[1]{ (#1)\downarrow_r}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    94
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    95
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    96
\def\erase{\textit{erase}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    97
\def\STAR{\textit{STAR}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    98
\def\flts{\textit{flts}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    99
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   100
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   101
\def\RZERO{\mathbf{0}_r }
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   102
\def\RONE{\mathbf{1}_r}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   103
\newcommand\RCHAR[1]{\mathbf{#1}_r}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   104
\newcommand\RSEQ[2]{#1 \cdot #2}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   105
\newcommand\RALTS[1]{\oplus #1}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   106
\newcommand\RSTAR[1]{#1^*}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   107
\newcommand\vsuf[2]{\textit{vsuf} \;#1\;#2}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   108
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   109
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   110
%This part is about regular expressions, Brzozowski derivatives,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   111
%and a bit-coded lexing algorithm with proven correctness and time bounds.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   112
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   113
%TODO: look up snort rules to use here--give readers idea of what regexes look like
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   114
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   115
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   116
Regular expressions are widely used in computer science: 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   117
be it in text-editors\parencite{atomEditor} with syntax highlighting and auto-completion, 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   118
command-line tools like $\mathit{grep}$ that facilitate easy 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   119
text processing, network intrusion
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   120
detection systems that reject suspicious traffic, or compiler
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   121
front ends--the majority of the solutions to these tasks 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   122
involve lexing with regular 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   123
expressions.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   124
Given its usefulness and ubiquity, one would imagine that
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   125
modern regular expression matching implementations
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   126
are mature and fully studied.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   127
Indeed, in a popular programming language' regex engine, 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   128
supplying it with regular expressions and strings, one can
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   129
get rich matching information in a very short time.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   130
Some network intrusion detection systems
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   131
use regex engines that are able to process 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   132
megabytes or even gigabytes of data per second\parencite{Turo_ov__2020}.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   133
Unfortunately, this is not the case for $\mathbf{all}$ inputs.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   134
%TODO: get source for SNORT/BRO's regex matching engine/speed
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   135
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   136
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   137
Take $(a^*)^*\,b$ and ask whether
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   138
strings of the form $aa..a$ match this regular
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   139
expression. Obviously this is not the case---the expected $b$ in the last
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   140
position is missing. One would expect that modern regular expression
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   141
matching engines can find this out very quickly. Alas, if one tries
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   142
this example in JavaScript, Python or Java 8, even with strings of a small
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   143
length, say around 30 $a$'s, one discovers that 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   144
this decision takes crazy time to finish given the simplicity of the problem.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   145
This is clearly exponential behaviour, and 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   146
is triggered by some relatively simple regex patterns, as the graphs
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   147
below show:
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   148
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   149
\begin{figure}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   150
\centering
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   151
\begin{tabular}{@{}c@{\hspace{0mm}}c@{\hspace{0mm}}c@{}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   152
\begin{tikzpicture}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   153
\begin{axis}[
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   154
    xlabel={$n$},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   155
    x label style={at={(1.05,-0.05)}},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   156
    ylabel={time in secs},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   157
    enlargelimits=false,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   158
    xtick={0,5,...,30},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   159
    xmax=33,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   160
    ymax=35,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   161
    ytick={0,5,...,30},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   162
    scaled ticks=false,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   163
    axis lines=left,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   164
    width=5cm,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   165
    height=4cm, 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   166
    legend entries={JavaScript},  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   167
    legend pos=north west,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   168
    legend cell align=left]
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   169
\addplot[red,mark=*, mark options={fill=white}] table {re-js.data};
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   170
\end{axis}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   171
\end{tikzpicture}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   172
  &
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   173
\begin{tikzpicture}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   174
\begin{axis}[
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   175
    xlabel={$n$},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   176
    x label style={at={(1.05,-0.05)}},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   177
    %ylabel={time in secs},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   178
    enlargelimits=false,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   179
    xtick={0,5,...,30},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   180
    xmax=33,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   181
    ymax=35,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   182
    ytick={0,5,...,30},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   183
    scaled ticks=false,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   184
    axis lines=left,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   185
    width=5cm,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   186
    height=4cm, 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   187
    legend entries={Python},  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   188
    legend pos=north west,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   189
    legend cell align=left]
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   190
\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   191
\end{axis}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   192
\end{tikzpicture}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   193
  &
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   194
\begin{tikzpicture}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   195
\begin{axis}[
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   196
    xlabel={$n$},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   197
    x label style={at={(1.05,-0.05)}},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   198
    %ylabel={time in secs},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   199
    enlargelimits=false,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   200
    xtick={0,5,...,30},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   201
    xmax=33,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   202
    ymax=35,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   203
    ytick={0,5,...,30},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   204
    scaled ticks=false,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   205
    axis lines=left,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   206
    width=5cm,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   207
    height=4cm, 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   208
    legend entries={Java 8},  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   209
    legend pos=north west,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   210
    legend cell align=left]
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   211
\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   212
\end{axis}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   213
\end{tikzpicture}\\
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   214
\multicolumn{3}{c}{Graphs: Runtime for matching $(a^*)^*\,b$ with strings 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   215
           of the form $\underbrace{aa..a}_{n}$.}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   216
\end{tabular}    
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   217
\caption{aStarStarb} \label{fig:aStarStarb}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   218
\end{figure}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   219
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   220
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   221
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   222
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   223
This superlinear blowup in matching algorithms sometimes cause
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   224
considerable grief in real life: for example on 20 July 2016 one evil
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   225
regular expression brought the webpage
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   226
\href{http://stackexchange.com}{Stack Exchange} to its
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   227
knees.\footnote{\url{https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   228
In this instance, a regular expression intended to just trim white
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   229
spaces from the beginning and the end of a line actually consumed
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   230
massive amounts of CPU-resources---causing web servers to grind to a
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   231
halt. This happened when a post with 20,000 white spaces was submitted,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   232
but importantly the white spaces were neither at the beginning nor at
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   233
the end. As a result, the regular expression matching engine needed to
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   234
backtrack over many choices. In this example, the time needed to process
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   235
the string was $O(n^2)$ with respect to the string length. This
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   236
quadratic overhead was enough for the homepage of Stack Exchange to
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   237
respond so slowly that the load balancer assumed a $\mathit{DoS}$ 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   238
attack and therefore stopped the servers from responding to any
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   239
requests. This made the whole site become unavailable. 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   240
A more recent example is a global outage of all Cloudflare servers on 2 July
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   241
2019. A poorly written regular expression exhibited exponential
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   242
behaviour and exhausted CPUs that serve HTTP traffic. Although the outage
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   243
had several causes, at the heart was a regular expression that
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   244
was used to monitor network
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   245
traffic.\footnote{\url{https://blog.cloudflare.com/details-of-the-cloudflare-outage-on-july-2-2019/}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   246
%TODO: data points for some new versions of languages
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   247
These problems with regular expressions 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   248
are not isolated events that happen
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   249
very occasionally, but actually widespread.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   250
They occur so often that they get a 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   251
name--Regular-Expression-Denial-Of-Service (ReDoS)
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   252
attack.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   253
Davis et al. \parencite{Davis18} detected more
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   254
than 1000 super-linear (SL) regular expressions
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   255
in Node.js, Python core libraries, and npm and pypi. 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   256
They therefore concluded that evil regular expressions
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   257
are problems more than "a parlour trick", but one that
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   258
requires
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   259
more research attention.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   260
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   261
 \section{The Problem Behind Slow Cases}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   262
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   263
%find literature/find out for yourself that REGEX->DFA on basic regexes
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   264
%does not blow up the size
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   265
Shouldn't regular expression matching be linear?
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   266
How can one explain the super-linear behaviour of the 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   267
regex matching engines we have?
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   268
The time cost of regex matching algorithms in general
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   269
involve two phases: 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   270
the construction phase, in which the algorithm builds some  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   271
suitable data structure from the input regex $r$, we denote
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   272
the time cost by $P_1(r)$.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   273
The lexing
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   274
phase, when the input string $s$ is read and the data structure
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   275
representing that regex $r$ is being operated on. We represent the time
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   276
it takes by $P_2(r, s)$.\\
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   277
In the case of a $\mathit{DFA}$,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   278
we have $P_2(r, s) = O( |s| )$,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   279
because we take at most $|s|$ steps, 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   280
and each step takes
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   281
at most one transition--
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   282
a deterministic-finite-automata
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   283
by definition has at most one state active and at most one
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   284
transition upon receiving an input symbol.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   285
But unfortunately in the  worst case
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   286
$P_1(r) = O(exp^{|r|})$. An example will be given later. \\
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   287
For $\mathit{NFA}$s, we have $P_1(r) = O(|r|)$ if we do not unfold 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   288
expressions like $r^n$ into $\underbrace{r \cdots r}_{\text{n copies of r}}$.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   289
The $P_2(r, s)$ is bounded by $|r|\cdot|s|$, if we do not backtrack.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   290
On the other hand, if backtracking is used, the worst-case time bound bloats
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   291
to $|r| * 2^|s|$ .
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   292
%on the input
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   293
%And when calculating the time complexity of the matching algorithm,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   294
%we are assuming that each input reading step requires constant time.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   295
%which translates to that the number of 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   296
%states active and transitions taken each time is bounded by a
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   297
%constant $C$.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   298
%But modern  regex libraries in popular language engines
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   299
% often want to support much richer constructs than just
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   300
% sequences and Kleene stars,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   301
%such as negation, intersection, 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   302
%bounded repetitions and back-references.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   303
%And de-sugaring these "extended" regular expressions 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   304
%into basic ones might bloat the size exponentially.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   305
%TODO: more reference for exponential size blowup on desugaring. 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   306
\subsection{Tools that uses $\mathit{DFA}$s}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   307
%TODO:more tools that use DFAs?
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   308
$\mathit{LEX}$ and $\mathit{JFLEX}$ are tools
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   309
in $C$ and $\mathit{JAVA}$ that generates $\mathit{DFA}$-based
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   310
lexers. The user provides a set of regular expressions
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   311
and configurations to such lexer generators, and then 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   312
gets an output program encoding a minimized $\mathit{DFA}$
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   313
that can be compiled and run. 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   314
The good things about $\mathit{DFA}$s is that once
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   315
generated, they are fast and stable, unlike
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   316
backtracking algorithms. 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   317
However, they do not scale well with bounded repetitions.\\
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   318
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   319
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   320
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   321
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   322
Bounded repetitions, usually written in the form
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   323
$r^{\{c\}}$ (where $c$ is a constant natural number),
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   324
denotes a regular expression accepting strings
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   325
that can be divided into $c$ substrings, where each 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   326
substring is in $r$. 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   327
For the regular expression $(a|b)^*a(a|b)^{\{2\}}$,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   328
an $\mathit{NFA}$ describing it would look like:
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   329
\begin{center}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   330
\begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto] 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   331
   \node[state,initial] (q_0)   {$q_0$}; 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   332
   \node[state, red] (q_1) [right=of q_0] {$q_1$}; 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   333
   \node[state, red] (q_2) [right=of q_1] {$q_2$}; 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   334
   \node[state, accepting, red](q_3) [right=of q_2] {$q_3$};
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   335
    \path[->] 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   336
    (q_0) edge  node {a} (q_1)
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   337
    	  edge [loop below] node {a,b} ()
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   338
    (q_1) edge  node  {a,b} (q_2)
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   339
    (q_2) edge  node  {a,b} (q_3);
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   340
\end{tikzpicture}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   341
\end{center}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   342
The red states are "countdown states" which counts down 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   343
the number of characters needed in addition to the current
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   344
string to make a successful match.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   345
For example, state $q_1$ indicates a match that has
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   346
gone past the $(a|b)^*$ part of $(a|b)^*a(a|b)^{\{2\}}$,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   347
and just consumed the "delimiter" $a$ in the middle, and 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   348
need to match 2 more iterations of $(a|b)$ to complete.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   349
State $q_2$ on the other hand, can be viewed as a state
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   350
after $q_1$ has consumed 1 character, and just waits
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   351
for 1 more character to complete.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   352
$q_3$ is the last state, requiring 0 more character and is accepting.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   353
Depending on the suffix of the
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   354
input string up to the current read location,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   355
the states $q_1$ and $q_2$, $q_3$
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   356
may or may
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   357
not be active, independent from each other.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   358
A $\mathit{DFA}$ for such an $\mathit{NFA}$ would
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   359
contain at least $2^3$ non-equivalent states that cannot be merged, 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   360
because the subset construction during determinisation will generate
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   361
all the elements in the power set $\mathit{Pow}\{q_1, q_2, q_3\}$.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   362
Generalizing this to regular expressions with larger
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   363
bounded repetitions number, we have that
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   364
regexes shaped like $r^*ar^{\{n\}}$ when converted to $\mathit{DFA}$s
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   365
would require at least $2^{n+1}$ states, if $r$ contains
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   366
more than 1 string.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   367
This is to represent all different 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   368
scenarios which "countdown" states are active.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   369
For those regexes, tools such as $\mathit{JFLEX}$ 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   370
would generate gigantic $\mathit{DFA}$'s or
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   371
out of memory errors.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   372
For this reason, regex libraries that support 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   373
bounded repetitions often choose to use the $\mathit{NFA}$ 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   374
approach.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   375
\subsection{The $\mathit{NFA}$ approach to regex matching}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   376
One can simulate the $\mathit{NFA}$ running in two ways:
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   377
one by keeping track of all active states after consuming 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   378
a character, and update that set of states iteratively.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   379
This can be viewed as a breadth-first-search of the $\mathit{NFA}$
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   380
for a path terminating
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   381
at an accepting state.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   382
Languages like $\mathit{Go}$ and $\mathit{Rust}$ use this
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   383
type of $\mathit{NFA}$ simulation, and guarantees a linear runtime
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   384
in terms of input string length.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   385
%TODO:try out these lexers
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   386
The other way to use $\mathit{NFA}$ for matching is choosing  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   387
a single transition each time, keeping all the other options in 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   388
a queue or stack, and backtracking if that choice eventually 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   389
fails. This method, often called a  "depth-first-search", 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   390
is efficient in a lot of cases, but could end up
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   391
with exponential run time.\\
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   392
%TODO:COMPARE java python lexer speed with Rust and Go
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   393
The reason behind backtracking algorithms in languages like
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   394
Java and Python is that they support back-references.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   395
\subsection{Back References in Regex--Non-Regular part}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   396
If we have a regular expression like this (the sequence
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   397
operator is omitted for brevity):
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   398
\begin{center}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   399
	$r_1(r_2(r_3r_4))$
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   400
\end{center}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   401
We could label sub-expressions of interest 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   402
by parenthesizing them and giving 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   403
them a number by the order in which their opening parentheses appear.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   404
One possible way of parenthesizing and labelling is given below:
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   405
\begin{center}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   406
	$\underset{1}{(}r_1\underset{2}{(}r_2\underset{3}{(}r_3)\underset{4}{(}r_4)))$
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   407
\end{center}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   408
$r_1r_2r_3r_4$, $r_1r_2r_3$, $r_3$, $r_4$ are labelled
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   409
by 1 to 4. $1$ would refer to the entire expression 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   410
$(r_1(r_2(r_3)(r_4)))$, $2$ referring to $r_2(r_3)(r_4)$, etc.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   411
These sub-expressions are called "capturing groups".
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   412
We can use the following syntax to denote that we want a string just matched by a 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   413
sub-expression (capturing group) to appear at a certain location again, 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   414
exactly as it was:
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   415
\begin{center}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   416
$\ldots\underset{\text{i-th lparen}}{(}{r_i})\ldots 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   417
\underset{s_i \text{ which just matched} \;r_i}{\backslash i}$
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   418
\end{center}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   419
The backslash and number $i$ are used to denote such 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   420
so-called "back-references".
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   421
Let $e$ be an expression made of regular expressions 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   422
and back-references. $e$ contains the expression $e_i$
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   423
as its $i$-th capturing group.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   424
The semantics of back-reference can be recursively
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   425
written as:
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   426
\begin{center}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   427
	\begin{tabular}{c}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   428
		$L ( e \cdot \backslash i) = \{s @ s_i \mid s \in L (e)\quad s_i \in L(r_i)$\\
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   429
		$s_i\; \text{match of ($e$, $s$)'s $i$-th capturing group string}\}$
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   430
	\end{tabular}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   431
\end{center}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   432
The concrete example
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   433
$((a|b|c|\ldots|z)^*)\backslash 1$
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   434
would match the string like $\mathit{bobo}$, $\mathit{weewee}$ and etc.\\
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   435
Back-reference is a construct in the "regex" standard
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   436
that programmers found useful, but not exactly 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   437
regular any more.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   438
In fact, that allows the regex construct to express 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   439
languages that cannot be contained in context-free
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   440
languages either.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   441
For example, the back-reference $((a^*)b\backslash1 b \backslash 1$
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   442
expresses the language $\{a^n b a^n b a^n\mid n \in \mathbb{N}\}$,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   443
which cannot be expressed by context-free grammars\parencite{campeanu2003formal}.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   444
Such a language is contained in the context-sensitive hierarchy
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   445
of formal languages. 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   446
Solving the back-reference expressions matching problem
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   447
is NP-complete\parencite{alfred2014algorithms} and a non-bactracking,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   448
efficient solution is not known to exist.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   449
%TODO:read a bit more about back reference algorithms
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   450
It seems that languages like Java and Python made the trade-off
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   451
to support back-references at the expense of having to backtrack,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   452
even in the case of regexes not involving back-references.\\
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   453
Summing these up, we can categorise existing 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   454
practical regex libraries into the ones  with  linear
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   455
time guarantees like Go and Rust, which impose restrictions
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   456
on the user input (not allowing back-references, 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   457
bounded repetitions canno exceed 1000 etc.), and ones  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   458
 that allows the programmer much freedom, but grinds to a halt
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   459
 in some non-negligible portion of cases.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   460
 %TODO: give examples such as RE2 GOLANG 1000 restriction, rust no repetitions 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   461
% For example, the Rust regex engine claims to be linear, 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   462
% but does not support lookarounds and back-references.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   463
% The GoLang regex library does not support over 1000 repetitions.  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   464
% Java and Python both support back-references, but shows
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   465
%catastrophic backtracking behaviours on inputs without back-references(
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   466
%when the language is still regular).
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   467
 %TODO: test performance of Rust on (((((a*a*)b*)b){20})*)c  baabaabababaabaaaaaaaaababaaaababababaaaabaaabaaaaaabaabaabababaababaaaaaaaaababaaaababababaaaaaaaaaaaaac
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   468
 %TODO: verify the fact Rust does not allow 1000+ reps
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   469
 %TODO: Java 17 updated graphs? Is it ok to still use Java 8 graphs?
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   470
\section{Buggy Regex Engines} 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   471
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   472
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   473
 Another thing about these libraries is that there
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   474
 is no correctness guarantee.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   475
 In some cases, they either fail to generate a lexing result when there exists a match,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   476
 or give the wrong way of matching.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   477
 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   478
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   479
It turns out that regex libraries not only suffer from 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   480
exponential backtracking problems, 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   481
but also undesired (or even buggy) outputs.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   482
%TODO: comment from who
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   483
Kuklewicz\parencite{KuklewiczHaskell} commented that most regex libraries are not
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   484
correctly implementing the POSIX (maximum-munch)
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   485
rule of regular expression matching.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   486
This experience is echoed by the writer's
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   487
tryout of a few online regex testers:
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   488
A concrete example would be 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   489
the regex
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   490
\begin{verbatim}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   491
(((((a*a*)b*)b){20})*)c
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   492
\end{verbatim}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   493
and the string
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   494
\begin{verbatim}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   495
baabaabababaabaaaaaaaaababaa
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   496
aababababaaaabaaabaaaaaabaab
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   497
aabababaababaaaaaaaaababaaaa
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   498
babababaaaaaaaaaaaaac
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   499
\end{verbatim}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   500
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   501
This seemingly complex regex simply says "some $a$'s
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   502
followed by some $b$'s then followed by 1 single $b$,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   503
and this iterates 20 times, finally followed by a $c$.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   504
And a POSIX match would involve the entire string,"eating up"
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   505
all the $b$'s in it.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   506
%TODO: give a coloured example of how this matches POSIXly
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   507
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   508
This regex would trigger catastrophic backtracking in 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   509
languages like Python and Java,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   510
whereas it gives a non-POSIX  and uninformative 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   511
match in languages like Go or .NET--The match with only 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   512
character $c$.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   513
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   514
As Grathwohl\parencite{grathwohl2014crash} commented,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   515
\begin{center}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   516
	``The POSIX strategy is more complicated than the greedy because of the dependence on information about the length of matched strings in the various subexpressions.''
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   517
\end{center}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   518
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   519
%\section{How people solve problems with regexes}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   520
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   521
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   522
When a regular expression does not behave as intended,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   523
people usually try to rewrite the regex to some equivalent form
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   524
or they try to avoid the possibly problematic patterns completely,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   525
for which many false positives exist\parencite{Davis18}.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   526
Animated tools to "debug" regular expressions
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   527
are also popular, regexploit\parencite{regexploit2021}, regex101\parencite{regex101} 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   528
to name a few.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   529
We are also aware of static analysis work on regular expressions that
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   530
aims to detect potentially expoential regex patterns. Rathnayake and Thielecke 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   531
\parencite{Rathnayake2014StaticAF} proposed an algorithm
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   532
that detects regular expressions triggering exponential
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   533
behavious on backtracking matchers.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   534
Weideman \parencite{Weideman2017Static} came up with 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   535
non-linear polynomial worst-time estimates
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   536
for regexes, attack string that exploit the worst-time 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   537
scenario, and "attack automata" that generates
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   538
attack strings.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   539
%Arguably these methods limits the programmers' freedom
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   540
%or productivity when all they want is to come up with a regex
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   541
%that solves the text processing problem.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   542
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   543
%TODO:also the regex101 debugger
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   544
\section{Our Solution--Formal Specification of POSIX and Brzozowski Derivatives}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   545
 Is it possible to have a regex lexing algorithm with proven correctness and 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   546
 time complexity, which allows easy extensions to
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   547
  constructs like 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   548
 bounded repetitions, negation,  lookarounds, and even back-references? 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   549
  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   550
  We propose Brzozowski derivatives on regular expressions as
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   551
  a solution to this.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   552
  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   553
  In the last fifteen or so years, Brzozowski's derivatives of regular
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   554
expressions have sparked quite a bit of interest in the functional
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   555
programming and theorem prover communities.  The beauty of
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   556
Brzozowski's derivatives \parencite{Brzozowski1964} is that they are neatly
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   557
expressible in any functional language, and easily definable and
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   558
reasoned about in theorem provers---the definitions just consist of
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   559
inductive datatypes and simple recursive functions. 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   560
And an algorithms based on it by 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   561
Suzmann and Lu  \parencite{Sulzmann2014} allows easy extension
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   562
to include  extended regular expressions and 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   563
 simplification of internal data structures 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   564
 eliminating the exponential behaviours.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   565
 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   566
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   567
  \section{Motivation}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   568
  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   569
Derivatives give a simple solution
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   570
to the problem of matching a string $s$ with a regular
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   571
expression $r$: if the derivative of $r$ w.r.t.\ (in
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   572
succession) all the characters of the string matches the empty string,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   573
then $r$ matches $s$ (and {\em vice versa}).  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   574
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   575
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   576
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   577
However, two difficulties with derivative-based matchers exist:
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   578
First, Brzozowski's original matcher only generates a yes/no answer
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   579
for whether a regular expression matches a string or not.  This is too
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   580
little information in the context of lexing where separate tokens must
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   581
be identified and also classified (for example as keywords
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   582
or identifiers).  Sulzmann and Lu~\cite{Sulzmann2014} overcome this
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   583
difficulty by cleverly extending Brzozowski's matching
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   584
algorithm. Their extended version generates additional information on
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   585
\emph{how} a regular expression matches a string following the POSIX
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   586
rules for regular expression matching. They achieve this by adding a
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   587
second ``phase'' to Brzozowski's algorithm involving an injection
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   588
function.  In our own earlier work we provided the formal
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   589
specification of what POSIX matching means and proved in Isabelle/HOL
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   590
the correctness
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   591
of Sulzmann and Lu's extended algorithm accordingly
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   592
\cite{AusafDyckhoffUrban2016}.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   593
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   594
The second difficulty is that Brzozowski's derivatives can 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   595
grow to arbitrarily big sizes. For example if we start with the
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   596
regular expression $(a+aa)^*$ and take
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   597
successive derivatives according to the character $a$, we end up with
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   598
a sequence of ever-growing derivatives like 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   599
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   600
\def\ll{\stackrel{\_\backslash{} a}{\longrightarrow}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   601
\begin{center}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   602
\begin{tabular}{rll}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   603
$(a + aa)^*$ & $\ll$ & $(\ONE + \ONE{}a) \cdot (a + aa)^*$\\
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   604
& $\ll$ & $(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* \;+\; (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   605
& $\ll$ & $(\ZERO + \ZERO{}a + \ZERO) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^* \;+\; $\\
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   606
& & $\qquad(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   607
& $\ll$ & \ldots \hspace{15mm}(regular expressions of sizes 98, 169, 283, 468, 767, \ldots)
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   608
\end{tabular}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   609
\end{center}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   610
 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   611
\noindent where after around 35 steps we run out of memory on a
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   612
typical computer (we shall define shortly the precise details of our
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   613
regular expressions and the derivative operation).  Clearly, the
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   614
notation involving $\ZERO$s and $\ONE$s already suggests
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   615
simplification rules that can be applied to regular regular
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   616
expressions, for example $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   617
\Rightarrow r$, $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   618
r$. While such simple-minded simplifications have been proved in our
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   619
earlier work to preserve the correctness of Sulzmann and Lu's
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   620
algorithm \cite{AusafDyckhoffUrban2016}, they unfortunately do
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   621
\emph{not} help with limiting the growth of the derivatives shown
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   622
above: the growth is slowed, but the derivatives can still grow rather
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   623
quickly beyond any finite bound.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   624
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   625
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   626
Sulzmann and Lu overcome this ``growth problem'' in a second algorithm
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   627
\cite{Sulzmann2014} where they introduce bitcoded
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   628
regular expressions. In this version, POSIX values are
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   629
represented as bitsequences and such sequences are incrementally generated
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   630
when derivatives are calculated. The compact representation
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   631
of bitsequences and regular expressions allows them to define a more
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   632
``aggressive'' simplification method that keeps the size of the
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   633
derivatives finite no matter what the length of the string is.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   634
They make some informal claims about the correctness and linear behaviour
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   635
of this version, but do not provide any supporting proof arguments, not
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   636
even ``pencil-and-paper'' arguments. They write about their bitcoded
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   637
\emph{incremental parsing method} (that is the algorithm to be formalised
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   638
in this paper):
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   639
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   640
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   641
  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   642
  \begin{quote}\it
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   643
  ``Correctness Claim: We further claim that the incremental parsing
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   644
  method [..] in combination with the simplification steps [..]
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   645
  yields POSIX parse trees. We have tested this claim
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   646
  extensively [..] but yet
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   647
  have to work out all proof details.'' \cite[Page 14]{Sulzmann2014}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   648
\end{quote}  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   649
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   650
Ausaf and Urban were able to back this correctness claim with
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   651
a formal proof.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   652
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   653
But as they stated,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   654
  \begin{quote}\it
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   655
The next step would be to implement a more aggressive simplification procedure on annotated regular expressions and then prove the corresponding algorithm generates the same values as blexer. Alas due to time constraints we are unable to do so here.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   656
\end{quote}  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   657
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   658
This thesis implements the aggressive simplifications envisioned
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   659
by Ausaf and Urban,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   660
and gives a formal proof of the correctness with those simplifications.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   661
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   662
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   663
 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   664
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   665
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   666
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   667
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   668
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   669
\section{Contribution}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   670
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   671
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   672
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   673
This work addresses the vulnerability of super-linear and
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   674
buggy regex implementations by the combination
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   675
of Brzozowski's derivatives and interactive theorem proving. 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   676
We give an 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   677
improved version of  Sulzmann and Lu's bit-coded algorithm using 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   678
derivatives, which come with a formal guarantee in terms of correctness and 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   679
running time as an Isabelle/HOL proof.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   680
Then we improve the algorithm with an even stronger version of 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   681
simplification, and prove a time bound linear to input and
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   682
cubic to regular expression size using a technique by
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   683
Antimirov.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   684
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   685
 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   686
The main contribution of this thesis is a proven correct lexing algorithm
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   687
with formalized time bounds.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   688
To our best knowledge, no lexing libraries using Brzozowski derivatives
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   689
have a provable time guarantee, 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   690
and claims about running time are usually speculative and backed by thin empirical
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   691
evidence.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   692
%TODO: give references
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   693
For example, Sulzmann and Lu had proposed an algorithm  in which they
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   694
claim a linear running time.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   695
But that was falsified by our experiments and the running time 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   696
is actually $\Omega(2^n)$ in the worst case.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   697
A similar claim about a theoretical runtime of $O(n^2)$ is made for the Verbatim
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   698
%TODO: give references
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   699
lexer, which calculates POSIX matches and is based on derivatives.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   700
They formalized the correctness of the lexer, but not the complexity.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   701
In the performance evaluation section, they simply analyzed the run time
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   702
of matching $a$ with the string $\underbrace{a \ldots a}_{\text{n a's}}$
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   703
and concluded that the algorithm is quadratic in terms of input length.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   704
When we tried out their extracted OCaml code with our example $(a+aa)^*$,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   705
the time it took to lex only 40 $a$'s was 5 minutes.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   706
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   707
We  believe our results of a proof of performance on general
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   708
inputs rather than specific examples a novel contribution.\\
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   709
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   710
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   711
\subsection{Related Work}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   712
We are aware
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   713
of a mechanised correctness proof of Brzozowski's derivative-based matcher in HOL4 by
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   714
Owens and Slind~\parencite{Owens2008}. Another one in Isabelle/HOL is part
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   715
of the work by Krauss and Nipkow \parencite{Krauss2011}.  And another one
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   716
in Coq is given by Coquand and Siles \parencite{Coquand2012}.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   717
Also Ribeiro and Du Bois give one in Agda \parencite{RibeiroAgda2017}.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   718
 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   719
 %We propose Brzozowski's derivatives as a solution to this problem.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   720
% about Lexing Using Brzozowski derivatives
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   721
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   722
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   723
\section{Structure of the thesis}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   724
In chapter 2 \ref{Chapter2} we will introduce the concepts
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   725
and notations we 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   726
use for describing the lexing algorithm by Sulzmann and Lu,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   727
and then give the algorithm and its variant, and discuss
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   728
why more aggressive simplifications are needed. 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   729
Then we illustrate in Chapter 3\ref{Chapter3}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   730
how the algorithm without bitcodes falls short for such aggressive 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   731
simplifications and therefore introduce our version of the
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   732
 bitcoded algorithm and 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   733
its correctness proof .  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   734
In Chapter 4 \ref{Chapter4} we give the second guarantee
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   735
of our bitcoded algorithm, that is a finite bound on the size of any 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   736
regex's derivatives.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   737
In Chapter 5\ref{Chapter5} we discuss stronger simplifications to improve the finite bound
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   738
in Chapter 4 to a polynomial one, and demonstrate how one can extend the
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   739
algorithm to include constructs such as bounded repetitions and negations.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   740
 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   741
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   742
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   743
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   744
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   745
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   746
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   747
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   748
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   749
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   750
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   751
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   752
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   753
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   754