ChengsongTanPhdThesis/Chapters/Introduction.tex
author Chengsong
Sat, 03 Sep 2022 00:14:20 +0100
changeset 596 b306628a0eab
parent 594 62f8fa03863e
child 600 fd068f39ac23
permissions -rwxr-xr-x
more chap 56
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     1
% Chapter 1
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     2
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     3
\chapter{Introduction} % Main chapter title
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     4
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     5
\label{Introduction} % For referencing the chapter elsewhere, use \ref{Chapter1} 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     6
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     7
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     8
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     9
% Define some commands to keep the formatting separated from the content 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    10
\newcommand{\keyword}[1]{\textbf{#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    11
\newcommand{\tabhead}[1]{\textbf{#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    12
\newcommand{\code}[1]{\texttt{#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    13
\newcommand{\file}[1]{\texttt{\bfseries#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    14
\newcommand{\option}[1]{\texttt{\itshape#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    15
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    16
%boxes
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    17
\newcommand*{\mybox}[1]{\framebox{\strut #1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    18
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    19
%\newcommand{\sflataux}[1]{\textit{sflat}\_\textit{aux} \, #1}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    20
\newcommand\sflat[1]{\llparenthesis #1 \rrparenthesis }
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    21
\newcommand{\ASEQ}[3]{\textit{ASEQ}_{#1} \, #2 \, #3}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
    22
\newcommand{\bderssimp}[2]{#1 \backslash_{bsimps} #2}
596
b306628a0eab more chap 56
Chengsong
parents: 594
diff changeset
    23
\newcommand{\rderssimp}[2]{#1 \backslash_{rsimps} #2}
564
Chengsong
parents: 558
diff changeset
    24
\def\derssimp{\textit{ders}\_\textit{simp}}
557
812e5d112f49 more changes
Chengsong
parents: 556
diff changeset
    25
\def\rders{\textit{rders}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    26
\newcommand{\bders}[2]{#1 \backslash #2}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    27
\newcommand{\bsimp}[1]{\textit{bsimp}(#1)}
591
b2d0de6aee18 more polishing integrated comments chap2
Chengsong
parents: 590
diff changeset
    28
\def\bsimps{\textit{bsimp}}
554
Chengsong
parents: 543
diff changeset
    29
\newcommand{\rsimp}[1]{\textit{rsimp}\; #1}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    30
\newcommand{\sflataux}[1]{\llparenthesis #1 \rrparenthesis'}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    31
\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    32
\newcommand{\denote}{\stackrel{\mbox{\scriptsize denote}}{=}}%
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    33
\newcommand{\ZERO}{\mbox{\bf 0}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    34
\newcommand{\ONE}{\mbox{\bf 1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    35
\newcommand{\AALTS}[2]{\oplus {\scriptstyle #1}\, #2}
555
Chengsong
parents: 554
diff changeset
    36
\newcommand{\rdistinct}[2]{\textit{rdistinct} \;\; #1 \;\; #2}
594
Chengsong
parents: 591
diff changeset
    37
\def\rdistincts{\textit{rdistinct}}
556
Chengsong
parents: 555
diff changeset
    38
\def\rDistinct{\textit{rdistinct}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    39
\newcommand\hflat[1]{\llparenthesis  #1 \rrparenthesis_*}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    40
\newcommand\hflataux[1]{\llparenthesis #1 \rrparenthesis_*'}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    41
\newcommand\createdByStar[1]{\textit{createdByStar}(#1)}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    42
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    43
\newcommand\myequiv{\mathrel{\stackrel{\makebox[0pt]{\mbox{\normalfont\tiny equiv}}}{=}}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    44
564
Chengsong
parents: 558
diff changeset
    45
\def\case{\textit{case}}
554
Chengsong
parents: 543
diff changeset
    46
\def\sequal{\stackrel{\mbox{\scriptsize rsimp}}{=}}
Chengsong
parents: 543
diff changeset
    47
\def\rsimpalts{\textit{rsimp}_{ALTS}}
Chengsong
parents: 543
diff changeset
    48
\def\good{\textit{good}}
Chengsong
parents: 543
diff changeset
    49
\def\btrue{\textit{true}}
Chengsong
parents: 543
diff changeset
    50
\def\bfalse{\textit{false}}
542
a7344c9afbaf chapter3 finished
Chengsong
parents: 538
diff changeset
    51
\def\bnullable{\textit{bnullable}}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
    52
\def\bnullables{\textit{bnullables}}
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
    53
\def\Some{\textit{Some}}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
    54
\def\None{\textit{None}}
537
Chengsong
parents: 532
diff changeset
    55
\def\code{\textit{code}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    56
\def\decode{\textit{decode}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    57
\def\internalise{\textit{internalise}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    58
\def\lexer{\mathit{lexer}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    59
\def\mkeps{\textit{mkeps}}
557
812e5d112f49 more changes
Chengsong
parents: 556
diff changeset
    60
\newcommand{\rder}[2]{#2 \backslash_r #1}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    61
585
4969ef817d92 chap4 more
Chengsong
parents: 579
diff changeset
    62
\def\rerases{\textit{rerase}}
4969ef817d92 chap4 more
Chengsong
parents: 579
diff changeset
    63
554
Chengsong
parents: 543
diff changeset
    64
\def\nonnested{\textit{nonnested}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    65
\def\AZERO{\textit{AZERO}}
558
Chengsong
parents: 557
diff changeset
    66
\def\sizeNregex{\textit{sizeNregex}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    67
\def\AONE{\textit{AONE}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    68
\def\ACHAR{\textit{ACHAR}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    69
585
4969ef817d92 chap4 more
Chengsong
parents: 579
diff changeset
    70
\def\simpsulz{\textit{simp}_{Sulz}}
4969ef817d92 chap4 more
Chengsong
parents: 579
diff changeset
    71
557
812e5d112f49 more changes
Chengsong
parents: 556
diff changeset
    72
\def\scfrewrites{\stackrel{*}{\rightsquigarrow_{scf}}}
555
Chengsong
parents: 554
diff changeset
    73
\def\frewrite{\rightsquigarrow_f}
Chengsong
parents: 554
diff changeset
    74
\def\hrewrite{\rightsquigarrow_h}
Chengsong
parents: 554
diff changeset
    75
\def\grewrite{\rightsquigarrow_g}
Chengsong
parents: 554
diff changeset
    76
\def\frewrites{\stackrel{*}{\rightsquigarrow_f}}
Chengsong
parents: 554
diff changeset
    77
\def\hrewrites{\stackrel{*}{\rightsquigarrow_h}}
Chengsong
parents: 554
diff changeset
    78
\def\grewrites{\stackrel{*}{\rightsquigarrow_g}}
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
    79
\def\fuse{\textit{fuse}}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
    80
\def\bder{\textit{bder}}
542
a7344c9afbaf chapter3 finished
Chengsong
parents: 538
diff changeset
    81
\def\der{\textit{der}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    82
\def\POSIX{\textit{POSIX}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    83
\def\ALTS{\textit{ALTS}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    84
\def\ASTAR{\textit{ASTAR}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    85
\def\DFA{\textit{DFA}}
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
    86
\def\NFA{\textit{NFA}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    87
\def\bmkeps{\textit{bmkeps}}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
    88
\def\bmkepss{\textit{bmkepss}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    89
\def\retrieve{\textit{retrieve}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    90
\def\blexer{\textit{blexer}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    91
\def\flex{\textit{flex}}
573
454ced557605 chapter2 finished polishing
Chengsong
parents: 567
diff changeset
    92
\def\inj{\textit{inj}}
564
Chengsong
parents: 558
diff changeset
    93
\def\Empty{\textit{Empty}}
567
28cb8089ec36 more updaates
Chengsong
parents: 564
diff changeset
    94
\def\Left{\textit{Left}}
28cb8089ec36 more updaates
Chengsong
parents: 564
diff changeset
    95
\def\Right{\textit{Right}}
573
454ced557605 chapter2 finished polishing
Chengsong
parents: 567
diff changeset
    96
\def\Stars{\textit{Stars}}
454ced557605 chapter2 finished polishing
Chengsong
parents: 567
diff changeset
    97
\def\Char{\textit{Char}}
454ced557605 chapter2 finished polishing
Chengsong
parents: 567
diff changeset
    98
\def\Seq{\textit{Seq}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    99
\def\Der{\textit{Der}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   100
\def\Ders{\textit{Ders}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   101
\def\nullable{\mathit{nullable}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   102
\def\Z{\mathit{Z}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   103
\def\S{\mathit{S}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   104
\def\rup{r^\uparrow}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   105
%\def\bderssimp{\mathit{bders}\_\mathit{simp}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   106
\def\distinctWith{\textit{distinctWith}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   107
\def\lf{\textit{lf}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   108
\def\PD{\textit{PD}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   109
\def\suffix{\textit{Suffix}}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   110
\def\distinctBy{\textit{distinctBy}}
558
Chengsong
parents: 557
diff changeset
   111
\def\starupdate{\textit{starUpdate}}
Chengsong
parents: 557
diff changeset
   112
\def\starupdates{\textit{starUpdates}}
Chengsong
parents: 557
diff changeset
   113
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   114
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   115
\def\size{\mathit{size}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   116
\def\rexp{\mathbf{rexp}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   117
\def\simp{\mathit{simp}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   118
\def\simpALTs{\mathit{simp}\_\mathit{ALTs}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   119
\def\map{\mathit{map}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   120
\def\distinct{\mathit{distinct}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   121
\def\blexersimp{\mathit{blexer}\_\mathit{simp}}
590
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   122
\def\blexerStrong{\textit{blexerStrong}}
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   123
\def\bsimpStrong{\textit{bsimpStrong}}
591
b2d0de6aee18 more polishing integrated comments chap2
Chengsong
parents: 590
diff changeset
   124
\def\bdersStrongs{\textit{bdersStrong}}
590
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   125
\newcommand{\bdersStrong}[2]{#1 \backslash_{bsimpStrongs} #2}
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   126
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   127
\def\map{\textit{map}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   128
\def\rrexp{\textit{rrexp}}
554
Chengsong
parents: 543
diff changeset
   129
\newcommand\rnullable[1]{\textit{rnullable} \; #1 }
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   130
\newcommand\rsize[1]{\llbracket #1 \rrbracket_r}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   131
\newcommand\asize[1]{\llbracket #1 \rrbracket}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   132
\newcommand\rerase[1]{ (#1)_{\downarrow_r}}
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   133
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   134
\newcommand\ChristianComment[1]{\textcolor{blue}{#1}\\}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   135
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   136
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   137
\def\rflts{\textit{rflts}}
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   138
\def\rrewrite{\textit{rrewrite}}
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   139
\def\bsimpalts{\textit{bsimp}_{ALTS}}
596
b306628a0eab more chap 56
Chengsong
parents: 594
diff changeset
   140
\def\bsimpaseq{\textit{bsimp}_{ASEQ}}
b306628a0eab more chap 56
Chengsong
parents: 594
diff changeset
   141
\def\rsimlalts{\textit{rsimp}_{ALTs}}
b306628a0eab more chap 56
Chengsong
parents: 594
diff changeset
   142
\def\rsimpseq{\textit{rsimp}_{SEQ}}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   143
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   144
\def\erase{\textit{erase}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   145
\def\STAR{\textit{STAR}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   146
\def\flts{\textit{flts}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   147
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   148
579
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   149
\def\zeroable{\textit{zeroable}}
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   150
\def\nub{\textit{nub}}
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   151
\def\filter{\textit{filter}}
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   152
\def\not{\textit{not}}
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   153
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   154
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   155
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   156
\def\RZERO{\mathbf{0}_r }
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   157
\def\RONE{\mathbf{1}_r}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   158
\newcommand\RCHAR[1]{\mathbf{#1}_r}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   159
\newcommand\RSEQ[2]{#1 \cdot #2}
558
Chengsong
parents: 557
diff changeset
   160
\newcommand\RALTS[1]{\sum #1}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   161
\newcommand\RSTAR[1]{#1^*}
558
Chengsong
parents: 557
diff changeset
   162
\newcommand\vsuf[2]{\textit{Suffix} \;#1\;#2}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   163
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   164
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   165
590
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   166
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   167
\lstdefinestyle{myScalastyle}{
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   168
  frame=tb,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   169
  language=scala,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   170
  aboveskip=3mm,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   171
  belowskip=3mm,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   172
  showstringspaces=false,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   173
  columns=flexible,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   174
  basicstyle={\small\ttfamily},
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   175
  numbers=none,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   176
  numberstyle=\tiny\color{gray},
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   177
  keywordstyle=\color{blue},
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   178
  commentstyle=\color{dkgreen},
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   179
  stringstyle=\color{mauve},
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   180
  frame=single,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   181
  breaklines=true,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   182
  breakatwhitespace=true,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   183
  tabsize=3,
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   184
}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   185
590
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   186
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   187
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   188
%This part is about regular expressions, Brzozowski derivatives,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   189
%and a bit-coded lexing algorithm with proven correctness and time bounds.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   190
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   191
%TODO: look up snort rules to use here--give readers idea of what regexes look like
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   192
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   193
\begin{figure}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   194
\centering
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   195
\begin{tabular}{@{}c@{\hspace{0mm}}c@{\hspace{0mm}}c@{}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   196
\begin{tikzpicture}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   197
\begin{axis}[
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   198
    xlabel={$n$},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   199
    x label style={at={(1.05,-0.05)}},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   200
    ylabel={time in secs},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   201
    enlargelimits=false,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   202
    xtick={0,5,...,30},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   203
    xmax=33,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   204
    ymax=35,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   205
    ytick={0,5,...,30},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   206
    scaled ticks=false,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   207
    axis lines=left,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   208
    width=5cm,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   209
    height=4cm, 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   210
    legend entries={JavaScript},  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   211
    legend pos=north west,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   212
    legend cell align=left]
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   213
\addplot[red,mark=*, mark options={fill=white}] table {re-js.data};
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   214
\end{axis}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   215
\end{tikzpicture}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   216
  &
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   217
\begin{tikzpicture}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   218
\begin{axis}[
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   219
    xlabel={$n$},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   220
    x label style={at={(1.05,-0.05)}},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   221
    %ylabel={time in secs},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   222
    enlargelimits=false,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   223
    xtick={0,5,...,30},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   224
    xmax=33,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   225
    ymax=35,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   226
    ytick={0,5,...,30},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   227
    scaled ticks=false,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   228
    axis lines=left,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   229
    width=5cm,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   230
    height=4cm, 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   231
    legend entries={Python},  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   232
    legend pos=north west,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   233
    legend cell align=left]
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   234
\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   235
\end{axis}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   236
\end{tikzpicture}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   237
  &
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   238
\begin{tikzpicture}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   239
\begin{axis}[
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   240
    xlabel={$n$},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   241
    x label style={at={(1.05,-0.05)}},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   242
    %ylabel={time in secs},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   243
    enlargelimits=false,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   244
    xtick={0,5,...,30},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   245
    xmax=33,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   246
    ymax=35,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   247
    ytick={0,5,...,30},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   248
    scaled ticks=false,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   249
    axis lines=left,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   250
    width=5cm,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   251
    height=4cm, 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   252
    legend entries={Java 8},  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   253
    legend pos=north west,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   254
    legend cell align=left]
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   255
\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   256
\end{axis}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   257
\end{tikzpicture}\\
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   258
\multicolumn{3}{c}{Graphs: Runtime for matching $(a^*)^*\,b$ with strings 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   259
           of the form $\underbrace{aa..a}_{n}$.}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   260
\end{tabular}    
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   261
\caption{aStarStarb} \label{fig:aStarStarb}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   262
\end{figure}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   263
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   264
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   265
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   266
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   267
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   268
Regular expressions are widely used in computer science: 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   269
be it in text-editors \parencite{atomEditor} with syntax highlighting and auto-completion;
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   270
command-line tools like $\mathit{grep}$ that facilitate easy 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   271
text-processing; network intrusion
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   272
detection systems that reject suspicious traffic; or compiler
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   273
front ends--the majority of the solutions to these tasks 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   274
involve lexing with regular 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   275
expressions.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   276
Given its usefulness and ubiquity, one would imagine that
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   277
modern regular expression matching implementations
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   278
are mature and fully studied.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   279
Indeed, in a popular programming language' regex engine, 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   280
supplying it with regular expressions and strings, one can
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   281
get rich matching information in a very short time.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   282
Some network intrusion detection systems
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   283
use regex engines that are able to process 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   284
megabytes or even gigabytes of data per second \parencite{Turo_ov__2020}.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   285
Unfortunately, this is not the case for $\mathbf{all}$ inputs.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   286
%TODO: get source for SNORT/BRO's regex matching engine/speed
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   287
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   288
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   289
Take $(a^*)^*\,b$ and ask whether
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   290
strings of the form $aa..a$ match this regular
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   291
expression. Obviously this is not the case---the expected $b$ in the last
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   292
position is missing. One would expect that modern regular expression
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   293
matching engines can find this out very quickly. Alas, if one tries
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   294
this example in JavaScript, Python or Java 8, even with strings of a small
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   295
length, say around 30 $a$'s, one discovers that 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   296
this decision takes crazy time to finish given the simplicity of the problem.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   297
This is clearly exponential behaviour, and 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   298
is triggered by some relatively simple regex patterns, as the graphs
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   299
 in \ref{fig:aStarStarb} show.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   300
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   301
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   302
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   303
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   304
\ChristianComment{Superlinear I just leave out the explanation 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   305
which I find once used would distract the flow. Plus if i just say exponential
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   306
here the 2016 event in StackExchange was not exponential, but just quardratic so would be 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   307
in accurate}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   308
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   309
This superlinear blowup in regular expression engines
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   310
had repeatedly caused grief in real life.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   311
For example, on 20 July 2016 one evil
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   312
regular expression brought the webpage
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   313
\href{http://stackexchange.com}{Stack Exchange} to its
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   314
knees.\footnote{\url{https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}(Last accessed in 2019)}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   315
In this instance, a regular expression intended to just trim white
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   316
spaces from the beginning and the end of a line actually consumed
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   317
massive amounts of CPU resources---causing web servers to grind to a
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   318
halt. In this example, the time needed to process
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   319
the string was $O(n^2)$ with respect to the string length. This
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   320
quadratic overhead was enough for the homepage of Stack Exchange to
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   321
respond so slowly that the load balancer assumed a $\mathit{DoS}$ 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   322
attack and therefore stopped the servers from responding to any
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   323
requests. This made the whole site become unavailable. 
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   324
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   325
A more recent example is a global outage of all Cloudflare servers on 2 July
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   326
2019. A poorly written regular expression exhibited exponential
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   327
behaviour and exhausted CPUs that serve HTTP traffic. Although the outage
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   328
had several causes, at the heart was a regular expression that
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   329
was used to monitor network
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   330
traffic.\footnote{\url{https://blog.cloudflare.com/details-of-the-cloudflare-outage-on-july-2-2019/}(Last accessed in 2022)}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   331
%TODO: data points for some new versions of languages
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   332
These problems with regular expressions 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   333
are not isolated events that happen
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   334
very occasionally, but actually widespread.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   335
They occur so often that they get a 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   336
name--Regular-Expression-Denial-Of-Service (ReDoS)
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   337
attack.
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   338
\citeauthor{Davis18} detected more
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   339
than 1000 super-linear (SL) regular expressions
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   340
in Node.js, Python core libraries, and npm and pypi. 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   341
They therefore concluded that evil regular expressions
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   342
are problems "more than a parlour trick", but one that
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   343
requires
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   344
more research attention.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   345
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   346
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   347
But the problems are not limited to slowness on certain 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   348
cases. 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   349
Another thing about these libraries is that there
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   350
is no correctness guarantee.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   351
In some cases, they either fail to generate a lexing result when there exists a match,
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   352
or give results that are inconsistent with the $\POSIX$ standard.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   353
A concrete example would be 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   354
the regex
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   355
\begin{verbatim}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   356
(aba|ab|a)*
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   357
\end{verbatim}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   358
and the string
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   359
\begin{verbatim}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   360
ababa
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   361
\end{verbatim}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   362
The correct $\POSIX$ match for the above would be 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   363
with the entire string $ababa$, 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   364
split into two Kleene star iterations, $[ab] [aba]$ at positions
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   365
$[0, 2), [2, 5)$
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   366
respectively.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   367
But trying this out in regex101\parencite{regex101}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   368
with different language engines would yield 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   369
the same two fragmented matches: $[aba]$ at $[0, 3)$
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   370
and $a$ at $[4, 5)$.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   371
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   372
Kuklewicz\parencite{KuklewiczHaskell} commented that most regex libraries are not
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   373
correctly implementing the POSIX (maximum-munch)
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   374
rule of regular expression matching.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   375
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   376
As Grathwohl\parencite{grathwohl2014crash} commented,
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   377
\begin{center}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   378
	``The POSIX strategy is more complicated than the greedy because of the dependence on information about the length of matched strings in the various subexpressions.''
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   379
\end{center}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   380
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   381
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   382
To summarise the above, regular expressions are important.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   383
They are popular and programming languages' library functions
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   384
for them are very fast on non-catastrophic cases.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   385
But there are problems with current practical implementations.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   386
First thing is that the running time might blow up.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   387
The second problem is that they might be error-prone on certain
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   388
very simple cases.
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   389
In the next part of the chapter, we will look into reasons why 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   390
certain regex engines are running horribly slow on the "catastrophic"
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   391
cases and propose a solution that addresses both of these problems
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   392
based on Brzozowski and Sulzmann and Lu's work.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   393
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   394
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   395
 \section{Why are current regex engines slow?}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   396
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   397
%find literature/find out for yourself that REGEX->DFA on basic regexes
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   398
%does not blow up the size
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   399
Shouldn't regular expression matching be linear?
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   400
How can one explain the super-linear behaviour of the 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   401
regex matching engines we have?
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   402
The time cost of regex matching algorithms in general
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   403
involve two different phases, and different things can go differently wrong on 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   404
these phases.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   405
$\DFA$s usually have problems in the first (construction) phase
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   406
, whereas $\NFA$s usually run into trouble
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   407
on the second phase.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   408
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   409
\subsection{Different Phases of a Matching/Lexing Algorithm}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   410
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   411
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   412
Most lexing algorithms can be roughly divided into 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   413
two phases during its run.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   414
The first phase is the "construction" phase,
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   415
in which the algorithm builds some  
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   416
suitable data structure from the input regex $r$, so that
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   417
it can be easily operated on later.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   418
We denote
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   419
the time cost for such a phase by $P_1(r)$.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   420
The second phase is the lexing phase, when the input string 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   421
$s$ is read and the data structure
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   422
representing that regex $r$ is being operated on. 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   423
We represent the time
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   424
it takes by $P_2(r, s)$.\\
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   425
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   426
For $\mathit{DFA}$,
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   427
we have $P_2(r, s) = O( |s| )$,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   428
because we take at most $|s|$ steps, 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   429
and each step takes
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   430
at most one transition--
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   431
a deterministic-finite-automata
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   432
by definition has at most one state active and at most one
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   433
transition upon receiving an input symbol.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   434
But unfortunately in the  worst case
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   435
$P_1(r) = O(exp^{|r|})$. An example will be given later. 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   436
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   437
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   438
For $\mathit{NFA}$s, we have $P_1(r) = O(|r|)$ if we do not unfold 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   439
expressions like $r^n$ into $\underbrace{r \cdots r}_{\text{n copies of r}}$.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   440
The $P_2(r, s)$ is bounded by $|r|\cdot|s|$, if we do not backtrack.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   441
On the other hand, if backtracking is used, the worst-case time bound bloats
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   442
to $|r| * 2^|s|$.
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   443
%on the input
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   444
%And when calculating the time complexity of the matching algorithm,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   445
%we are assuming that each input reading step requires constant time.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   446
%which translates to that the number of 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   447
%states active and transitions taken each time is bounded by a
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   448
%constant $C$.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   449
%But modern  regex libraries in popular language engines
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   450
% often want to support much richer constructs than just
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   451
% sequences and Kleene stars,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   452
%such as negation, intersection, 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   453
%bounded repetitions and back-references.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   454
%And de-sugaring these "extended" regular expressions 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   455
%into basic ones might bloat the size exponentially.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   456
%TODO: more reference for exponential size blowup on desugaring. 
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   457
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   458
\subsection{Why $\mathit{DFA}s$ can be slow in the first phase}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   459
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   460
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   461
The good things about $\mathit{DFA}$s is that once
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   462
generated, they are fast and stable, unlike
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   463
backtracking algorithms. 
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   464
However, they do not scale well with bounded repetitions.
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   465
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   466
\subsubsection{Problems with Bounded Repetitions}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   467
Bounded repetitions, usually written in the form
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   468
$r^{\{c\}}$ (where $c$ is a constant natural number),
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   469
denotes a regular expression accepting strings
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   470
that can be divided into $c$ substrings, where each 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   471
substring is in $r$. 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   472
For the regular expression $(a|b)^*a(a|b)^{\{2\}}$,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   473
an $\mathit{NFA}$ describing it would look like:
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   474
\begin{center}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   475
\begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto] 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   476
   \node[state,initial] (q_0)   {$q_0$}; 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   477
   \node[state, red] (q_1) [right=of q_0] {$q_1$}; 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   478
   \node[state, red] (q_2) [right=of q_1] {$q_2$}; 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   479
   \node[state, accepting, red](q_3) [right=of q_2] {$q_3$};
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   480
    \path[->] 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   481
    (q_0) edge  node {a} (q_1)
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   482
    	  edge [loop below] node {a,b} ()
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   483
    (q_1) edge  node  {a,b} (q_2)
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   484
    (q_2) edge  node  {a,b} (q_3);
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   485
\end{tikzpicture}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   486
\end{center}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   487
The red states are "countdown states" which counts down 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   488
the number of characters needed in addition to the current
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   489
string to make a successful match.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   490
For example, state $q_1$ indicates a match that has
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   491
gone past the $(a|b)^*$ part of $(a|b)^*a(a|b)^{\{2\}}$,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   492
and just consumed the "delimiter" $a$ in the middle, and 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   493
need to match 2 more iterations of $(a|b)$ to complete.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   494
State $q_2$ on the other hand, can be viewed as a state
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   495
after $q_1$ has consumed 1 character, and just waits
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   496
for 1 more character to complete.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   497
$q_3$ is the last state, requiring 0 more character and is accepting.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   498
Depending on the suffix of the
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   499
input string up to the current read location,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   500
the states $q_1$ and $q_2$, $q_3$
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   501
may or may
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   502
not be active, independent from each other.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   503
A $\mathit{DFA}$ for such an $\mathit{NFA}$ would
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   504
contain at least $2^3$ non-equivalent states that cannot be merged, 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   505
because the subset construction during determinisation will generate
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   506
all the elements in the power set $\mathit{Pow}\{q_1, q_2, q_3\}$.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   507
Generalizing this to regular expressions with larger
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   508
bounded repetitions number, we have that
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   509
regexes shaped like $r^*ar^{\{n\}}$ when converted to $\mathit{DFA}$s
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   510
would require at least $2^{n+1}$ states, if $r$ contains
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   511
more than 1 string.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   512
This is to represent all different 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   513
scenarios which "countdown" states are active.
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   514
For those regexes, tools that uses $\DFA$s will get
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   515
out of memory errors.
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   516
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   517
\subsubsection{Tools that uses $\mathit{DFA}$s}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   518
%TODO:more tools that use DFAs?
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   519
$\mathit{LEX}$ and $\mathit{JFLEX}$ are tools
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   520
in $C$ and $\mathit{JAVA}$ that generates $\mathit{DFA}$-based
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   521
lexers. The user provides a set of regular expressions
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   522
and configurations to such lexer generators, and then 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   523
gets an output program encoding a minimized $\mathit{DFA}$
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   524
that can be compiled and run. 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   525
When given the above countdown regular expression,
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   526
a small number $n$ would result in a determinised automata
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   527
with millions of states.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   528
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   529
For this reason, regex libraries that support 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   530
bounded repetitions often choose to use the $\mathit{NFA}$ 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   531
approach.
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   532
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   533
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   534
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   535
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   536
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   537
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   539
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   540
\subsection{Why $\mathit{NFA}$s can be slow in the second phase}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   541
When one constructs an $\NFA$ out of a regular expression
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   542
there is often very little to be done in the first phase, one simply 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   543
construct the $\NFA$ states based on the structure of the input regular expression.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   544
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   545
In the lexing phase, one can simulate the $\mathit{NFA}$ running in two ways:
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   546
one by keeping track of all active states after consuming 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   547
a character, and update that set of states iteratively.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   548
This can be viewed as a breadth-first-search of the $\mathit{NFA}$
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   549
for a path terminating
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   550
at an accepting state.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   551
Languages like $\mathit{Go}$ and $\mathit{Rust}$ use this
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   552
type of $\mathit{NFA}$ simulation and guarantees a linear runtime
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   553
in terms of input string length.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   554
%TODO:try out these lexers
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   555
The other way to use $\mathit{NFA}$ for matching is choosing  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   556
a single transition each time, keeping all the other options in 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   557
a queue or stack, and backtracking if that choice eventually 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   558
fails. This method, often called a  "depth-first-search", 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   559
is efficient in a lot of cases, but could end up
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   560
with exponential run time.\\
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   561
%TODO:COMPARE java python lexer speed with Rust and Go
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   562
The reason behind backtracking algorithms in languages like
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   563
Java and Python is that they support back-references.
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   564
\subsubsection{Back References}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   565
If we have a regular expression like this (the sequence
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   566
operator is omitted for brevity):
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   567
\begin{center}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   568
	$r_1(r_2(r_3r_4))$
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   569
\end{center}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   570
We could label sub-expressions of interest 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   571
by parenthesizing them and giving 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   572
them a number by the order in which their opening parentheses appear.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   573
One possible way of parenthesizing and labelling is given below:
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   574
\begin{center}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   575
	$\underset{1}{(}r_1\underset{2}{(}r_2\underset{3}{(}r_3)\underset{4}{(}r_4)))$
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   576
\end{center}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   577
$r_1r_2r_3r_4$, $r_1r_2r_3$, $r_3$, $r_4$ are labelled
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   578
by 1 to 4. $1$ would refer to the entire expression 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   579
$(r_1(r_2(r_3)(r_4)))$, $2$ referring to $r_2(r_3)(r_4)$, etc.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   580
These sub-expressions are called "capturing groups".
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   581
We can use the following syntax to denote that we want a string just matched by a 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   582
sub-expression (capturing group) to appear at a certain location again, 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   583
exactly as it was:
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   584
\begin{center}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   585
$\ldots\underset{\text{i-th lparen}}{(}{r_i})\ldots 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   586
\underset{s_i \text{ which just matched} \;r_i}{\backslash i}$
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   587
\end{center}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   588
The backslash and number $i$ are used to denote such 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   589
so-called "back-references".
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   590
Let $e$ be an expression made of regular expressions 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   591
and back-references. $e$ contains the expression $e_i$
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   592
as its $i$-th capturing group.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   593
The semantics of back-reference can be recursively
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   594
written as:
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   595
\begin{center}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   596
	\begin{tabular}{c}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   597
		$L ( e \cdot \backslash i) = \{s @ s_i \mid s \in L (e)\quad s_i \in L(r_i)$\\
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   598
		$s_i\; \text{match of ($e$, $s$)'s $i$-th capturing group string}\}$
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   599
	\end{tabular}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   600
\end{center}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   601
The concrete example
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   602
$((a|b|c|\ldots|z)^*)\backslash 1$
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   603
would match the string like $\mathit{bobo}$, $\mathit{weewee}$ and etc.\\
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   604
Back-reference is a construct in the "regex" standard
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   605
that programmers found useful, but not exactly 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   606
regular any more.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   607
In fact, that allows the regex construct to express 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   608
languages that cannot be contained in context-free
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   609
languages either.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   610
For example, the back-reference $((a^*)b\backslash1 b \backslash 1$
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   611
expresses the language $\{a^n b a^n b a^n\mid n \in \mathbb{N}\}$,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   612
which cannot be expressed by context-free grammars\parencite{campeanu2003formal}.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   613
Such a language is contained in the context-sensitive hierarchy
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   614
of formal languages. 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   615
Solving the back-reference expressions matching problem
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   616
is NP-complete\parencite{alfred2014algorithms} and a non-bactracking,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   617
efficient solution is not known to exist.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   618
%TODO:read a bit more about back reference algorithms
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   619
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   620
It seems that languages like Java and Python made the trade-off
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   621
to support back-references at the expense of having to backtrack,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   622
even in the case of regexes not involving back-references.\\
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   623
Summing these up, we can categorise existing 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   624
practical regex libraries into the ones  with  linear
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   625
time guarantees like Go and Rust, which impose restrictions
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   626
on the user input (not allowing back-references, 
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   627
bounded repetitions cannot exceed 1000 etc.), and ones  
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   628
 that allows the programmer much freedom, but grinds to a halt
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   629
 in some non-negligible portion of cases.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   630
 %TODO: give examples such as RE2 GOLANG 1000 restriction, rust no repetitions 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   631
% For example, the Rust regex engine claims to be linear, 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   632
% but does not support lookarounds and back-references.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   633
% The GoLang regex library does not support over 1000 repetitions.  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   634
% Java and Python both support back-references, but shows
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   635
%catastrophic backtracking behaviours on inputs without back-references(
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   636
%when the language is still regular).
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   637
 %TODO: test performance of Rust on (((((a*a*)b*)b){20})*)c  baabaabababaabaaaaaaaaababaaaababababaaaabaaabaaaaaabaabaabababaababaaaaaaaaababaaaababababaaaaaaaaaaaaac
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   638
 %TODO: verify the fact Rust does not allow 1000+ reps
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   639
\ChristianComment{Comment required: Java 17 updated graphs? Is it ok to still use Java 8 graphs?}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   640
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   641
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   642
So we have practical implementations 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   643
on regular expression matching/lexing which are fast
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   644
but do not come with any guarantees that it will not grind to a halt
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   645
or give wrong answers.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   646
Our goal is to have a regex lexing algorithm that comes with 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   647
\begin{itemize}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   648
\item
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   649
proven correctness 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   650
\item 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   651
proven non-catastrophic properties
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   652
\item
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   653
easy extensions to
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   654
constructs like 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   655
 bounded repetitions, negation,  lookarounds, and even back-references.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   656
 \end{itemize}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   657
 
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   658
\section{Our Solution--Formal Specification of POSIX and Brzozowski Derivatives}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   659
We propose Brzozowski derivatives on regular expressions as
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   660
  a solution to this.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   661
In the last fifteen or so years, Brzozowski's derivatives of regular
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   662
expressions have sparked quite a bit of interest in the functional
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   663
programming and theorem prover communities.   
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   664
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   665
\subsection{Motivation}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   666
  
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   667
Derivatives give a simple solution
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   668
to the problem of matching a string $s$ with a regular
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   669
expression $r$: if the derivative of $r$ w.r.t.\ (in
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   670
succession) all the characters of the string matches the empty string,
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   671
then $r$ matches $s$ (and {\em vice versa}).  
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   672
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   673
The beauty of
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   674
Brzozowski's derivatives \parencite{Brzozowski1964} is that they are neatly
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   675
expressible in any functional language, and easily definable and
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   676
reasoned about in theorem provers---the definitions just consist of
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   677
inductive datatypes and simple recursive functions. 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   678
And an algorithms based on it by 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   679
Suzmann and Lu  \parencite{Sulzmann2014} allows easy extension
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   680
to include  extended regular expressions and 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   681
 simplification of internal data structures 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   682
 eliminating the exponential behaviours.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   683
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   684
However, two difficulties with derivative-based matchers exist:
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   685
\subsubsection{Problems with Current Brzozowski Matchers}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   686
First, Brzozowski's original matcher only generates a yes/no answer
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   687
for whether a regular expression matches a string or not.  This is too
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   688
little information in the context of lexing where separate tokens must
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   689
be identified and also classified (for example as keywords
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   690
or identifiers).  Sulzmann and Lu~\cite{Sulzmann2014} overcome this
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   691
difficulty by cleverly extending Brzozowski's matching
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   692
algorithm. Their extended version generates additional information on
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   693
\emph{how} a regular expression matches a string following the POSIX
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   694
rules for regular expression matching. They achieve this by adding a
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   695
second ``phase'' to Brzozowski's algorithm involving an injection
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   696
function.  In our own earlier work, we provided the formal
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   697
specification of what POSIX matching means and proved in Isabelle/HOL
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   698
the correctness
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   699
of Sulzmann and Lu's extended algorithm accordingly
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   700
\cite{AusafDyckhoffUrban2016}.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   701
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   702
The second difficulty is that Brzozowski's derivatives can 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   703
grow to arbitrarily big sizes. For example if we start with the
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   704
regular expression $(a+aa)^*$ and take
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   705
successive derivatives according to the character $a$, we end up with
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   706
a sequence of ever-growing derivatives like 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   707
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   708
\def\ll{\stackrel{\_\backslash{} a}{\longrightarrow}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   709
\begin{center}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   710
\begin{tabular}{rll}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   711
$(a + aa)^*$ & $\ll$ & $(\ONE + \ONE{}a) \cdot (a + aa)^*$\\
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   712
& $\ll$ & $(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* \;+\; (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   713
& $\ll$ & $(\ZERO + \ZERO{}a + \ZERO) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^* \;+\; $\\
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   714
& & $\qquad(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   715
& $\ll$ & \ldots \hspace{15mm}(regular expressions of sizes 98, 169, 283, 468, 767, \ldots)
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   716
\end{tabular}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   717
\end{center}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   718
 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   719
\noindent where after around 35 steps we run out of memory on a
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   720
typical computer (we shall define shortly the precise details of our
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   721
regular expressions and the derivative operation).  Clearly, the
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   722
notation involving $\ZERO$s and $\ONE$s already suggests
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   723
simplification rules that can be applied to regular regular
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   724
expressions, for example $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   725
\Rightarrow r$, $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   726
r$. While such simple-minded simplifications have been proved in our
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   727
earlier work to preserve the correctness of Sulzmann and Lu's
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   728
algorithm \cite{AusafDyckhoffUrban2016}, they unfortunately do
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   729
\emph{not} help with limiting the growth of the derivatives shown
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   730
above: the growth is slowed, but the derivatives can still grow rather
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   731
quickly beyond any finite bound.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   732
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   733
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   734
Sulzmann and Lu overcome this ``growth problem'' in a second algorithm
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   735
\cite{Sulzmann2014} where they introduce bit-coded
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   736
regular expressions. In this version, POSIX values are
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   737
represented as bit sequences and such sequences are incrementally generated
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   738
when derivatives are calculated. The compact representation
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   739
of bit sequences and regular expressions allows them to define a more
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   740
``aggressive'' simplification method that keeps the size of the
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   741
derivatives finite no matter what the length of the string is.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   742
They make some informal claims about the correctness and linear behaviour
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   743
of this version, but do not provide any supporting proof arguments, not
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   744
even ``pencil-and-paper'' arguments. They write about their bit-coded
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   745
\emph{incremental parsing method} (that is the algorithm to be formalised
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   746
in this dissertation)
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   747
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   748
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   749
  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   750
  \begin{quote}\it
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   751
  ``Correctness Claim: We further claim that the incremental parsing
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   752
  method [..] in combination with the simplification steps [..]
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   753
  yields POSIX parse trees. We have tested this claim
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   754
  extensively [..] but yet
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   755
  have to work out all proof details.'' \cite[Page 14]{Sulzmann2014}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   756
\end{quote}  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   757
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   758
Ausaf and Urban were able to back this correctness claim with
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   759
a formal proof.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   760
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   761
But as they stated,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   762
  \begin{quote}\it
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   763
The next step would be to implement a more aggressive simplification procedure on annotated regular expressions and then prove the corresponding algorithm generates the same values as blexer. Alas due to time constraints we are unable to do so here.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   764
\end{quote}  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   765
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   766
This thesis implements the aggressive simplifications envisioned
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   767
by Ausaf and Urban,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   768
and gives a formal proof of the correctness with those simplifications.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   769
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   770
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   771
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   772
\section{Contribution}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   773
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   774
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   775
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   776
This work addresses the vulnerability of super-linear and
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   777
buggy regex implementations by the combination
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   778
of Brzozowski's derivatives and interactive theorem proving. 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   779
We give an 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   780
improved version of  Sulzmann and Lu's bit-coded algorithm using 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   781
derivatives, which come with a formal guarantee in terms of correctness and 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   782
running time as an Isabelle/HOL proof.
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   783
Further improvements to the algorithm with an even stronger version of 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   784
simplification is made.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   785
We have not yet come up with one, but believe that it leads to a 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   786
formalised proof with a time bound linear to input and
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   787
cubic to regular expression size using a technique by
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   788
Antimirov\cite{Antimirov}.
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   789
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   790
 
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   791
The main contribution of this thesis is 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   792
\begin{itemize}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   793
\item
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   794
a proven correct lexing algorithm
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   795
\item
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   796
with formalized finite bounds on internal data structures' sizes.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   797
\end{itemize}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   798
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   799
To our best knowledge, no lexing libraries using Brzozowski derivatives
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   800
have a provable time guarantee, 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   801
and claims about running time are usually speculative and backed by thin empirical
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   802
evidence.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   803
%TODO: give references
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   804
For example, Sulzmann and Lu had proposed an algorithm  in which they
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   805
claim a linear running time.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   806
But that was falsified by our experiments and the running time 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   807
is actually $\Omega(2^n)$ in the worst case.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   808
A similar claim about a theoretical runtime of $O(n^2)$ is made for the Verbatim
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   809
%TODO: give references
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   810
lexer, which calculates POSIX matches and is based on derivatives.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   811
They formalized the correctness of the lexer, but not the complexity.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   812
In the performance evaluation section, they simply analyzed the run time
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   813
of matching $a$ with the string $\underbrace{a \ldots a}_{\text{n a's}}$
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   814
and concluded that the algorithm is quadratic in terms of input length.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   815
When we tried out their extracted OCaml code with our example $(a+aa)^*$,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   816
the time it took to lex only 40 $a$'s was 5 minutes.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   817
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   818
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   819
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   820
\subsection{Related Work}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   821
We are aware
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   822
of a mechanised correctness proof of Brzozowski's derivative-based matcher in HOL4 by
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   823
Owens and Slind~\parencite{Owens2008}. Another one in Isabelle/HOL is part
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   824
of the work by Krauss and Nipkow \parencite{Krauss2011}.  And another one
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   825
in Coq is given by Coquand and Siles \parencite{Coquand2012}.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   826
Also Ribeiro and Du Bois give one in Agda \parencite{RibeiroAgda2017}.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   827
 
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   828
 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   829
 When a regular expression does not behave as intended,
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   830
people usually try to rewrite the regex to some equivalent form
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   831
or they try to avoid the possibly problematic patterns completely,
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   832
for which many false positives exist\parencite{Davis18}.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   833
Animated tools to "debug" regular expressions such as
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   834
 \parencite{regexploit2021} \parencite{regex101} are also popular.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   835
We are also aware of static analysis work on regular expressions that
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   836
aims to detect potentially expoential regex patterns. Rathnayake and Thielecke 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   837
\parencite{Rathnayake2014StaticAF} proposed an algorithm
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   838
that detects regular expressions triggering exponential
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   839
behavious on backtracking matchers.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   840
Weideman \parencite{Weideman2017Static} came up with 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   841
non-linear polynomial worst-time estimates
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   842
for regexes, attack string that exploit the worst-time 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   843
scenario, and "attack automata" that generates
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   844
attack strings.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   845
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   846
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   847
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   848
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   849
\section{Structure of the thesis}
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   850
In chapter 2 \ref{Inj} we will introduce the concepts
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   851
and notations we 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   852
use for describing the lexing algorithm by Sulzmann and Lu,
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   853
and then give the lexing algorithm.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   854
We will give its variant in \ref{Bitcoded1}.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   855
Then we illustrate in \ref{Bitcoded2}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   856
how the algorithm without bitcodes falls short for such aggressive 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   857
simplifications and therefore introduce our version of the
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   858
 bit-coded algorithm and 
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   859
its correctness proof .  
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   860
In \ref{Finite} we give the second guarantee
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   861
of our bitcoded algorithm, that is a finite bound on the size of any 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   862
regex's derivatives.
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   863
In \ref{Cubic} we discuss stronger simplifications to improve the finite bound
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   864
in \ref{Finite} to a polynomial one, and demonstrate how one can extend the
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   865
algorithm to include constructs such as bounded repetitions and negations.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   866
 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   867
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   868
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   869
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   870
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   871
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   872
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   873
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   874
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   875
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   876
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   877
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   878
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   879
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   880