ChengsongTanPhdThesis/Chapters/Introduction.tex
author Chengsong
Tue, 11 Oct 2022 13:09:47 +0100
changeset 612 8c234a1bc7e0
parent 609 61139fdddae0
child 618 233cf2b97d1a
permissions -rwxr-xr-x
chap6
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     1
% Chapter 1
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     2
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     3
\chapter{Introduction} % Main chapter title
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     4
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     5
\label{Introduction} % For referencing the chapter elsewhere, use \ref{Chapter1} 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     6
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     7
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     8
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     9
% Define some commands to keep the formatting separated from the content 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    10
\newcommand{\keyword}[1]{\textbf{#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    11
\newcommand{\tabhead}[1]{\textbf{#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    12
\newcommand{\code}[1]{\texttt{#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    13
\newcommand{\file}[1]{\texttt{\bfseries#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    14
\newcommand{\option}[1]{\texttt{\itshape#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    15
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    16
%boxes
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    17
\newcommand*{\mybox}[1]{\framebox{\strut #1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    18
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    19
%\newcommand{\sflataux}[1]{\textit{sflat}\_\textit{aux} \, #1}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    20
\newcommand\sflat[1]{\llparenthesis #1 \rrparenthesis }
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    21
\newcommand{\ASEQ}[3]{\textit{ASEQ}_{#1} \, #2 \, #3}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
    22
\newcommand{\bderssimp}[2]{#1 \backslash_{bsimps} #2}
596
b306628a0eab more chap 56
Chengsong
parents: 594
diff changeset
    23
\newcommand{\rderssimp}[2]{#1 \backslash_{rsimps} #2}
564
Chengsong
parents: 558
diff changeset
    24
\def\derssimp{\textit{ders}\_\textit{simp}}
557
812e5d112f49 more changes
Chengsong
parents: 556
diff changeset
    25
\def\rders{\textit{rders}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    26
\newcommand{\bders}[2]{#1 \backslash #2}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    27
\newcommand{\bsimp}[1]{\textit{bsimp}(#1)}
591
b2d0de6aee18 more polishing integrated comments chap2
Chengsong
parents: 590
diff changeset
    28
\def\bsimps{\textit{bsimp}}
554
Chengsong
parents: 543
diff changeset
    29
\newcommand{\rsimp}[1]{\textit{rsimp}\; #1}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    30
\newcommand{\sflataux}[1]{\llparenthesis #1 \rrparenthesis'}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    31
\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    32
\newcommand{\denote}{\stackrel{\mbox{\scriptsize denote}}{=}}%
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    33
\newcommand{\ZERO}{\mbox{\bf 0}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    34
\newcommand{\ONE}{\mbox{\bf 1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    35
\newcommand{\AALTS}[2]{\oplus {\scriptstyle #1}\, #2}
555
Chengsong
parents: 554
diff changeset
    36
\newcommand{\rdistinct}[2]{\textit{rdistinct} \;\; #1 \;\; #2}
594
Chengsong
parents: 591
diff changeset
    37
\def\rdistincts{\textit{rdistinct}}
556
Chengsong
parents: 555
diff changeset
    38
\def\rDistinct{\textit{rdistinct}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    39
\newcommand\hflat[1]{\llparenthesis  #1 \rrparenthesis_*}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    40
\newcommand\hflataux[1]{\llparenthesis #1 \rrparenthesis_*'}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    41
\newcommand\createdByStar[1]{\textit{createdByStar}(#1)}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    42
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    43
\newcommand\myequiv{\mathrel{\stackrel{\makebox[0pt]{\mbox{\normalfont\tiny equiv}}}{=}}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    44
600
fd068f39ac23 chap4 comments done
Chengsong
parents: 596
diff changeset
    45
\def\SEQ{\textit{SEQ}}
fd068f39ac23 chap4 comments done
Chengsong
parents: 596
diff changeset
    46
\def\SEQs{\textit{SEQs}}
564
Chengsong
parents: 558
diff changeset
    47
\def\case{\textit{case}}
554
Chengsong
parents: 543
diff changeset
    48
\def\sequal{\stackrel{\mbox{\scriptsize rsimp}}{=}}
Chengsong
parents: 543
diff changeset
    49
\def\rsimpalts{\textit{rsimp}_{ALTS}}
Chengsong
parents: 543
diff changeset
    50
\def\good{\textit{good}}
Chengsong
parents: 543
diff changeset
    51
\def\btrue{\textit{true}}
Chengsong
parents: 543
diff changeset
    52
\def\bfalse{\textit{false}}
542
a7344c9afbaf chapter3 finished
Chengsong
parents: 538
diff changeset
    53
\def\bnullable{\textit{bnullable}}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
    54
\def\bnullables{\textit{bnullables}}
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
    55
\def\Some{\textit{Some}}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
    56
\def\None{\textit{None}}
537
Chengsong
parents: 532
diff changeset
    57
\def\code{\textit{code}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    58
\def\decode{\textit{decode}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    59
\def\internalise{\textit{internalise}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    60
\def\lexer{\mathit{lexer}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    61
\def\mkeps{\textit{mkeps}}
557
812e5d112f49 more changes
Chengsong
parents: 556
diff changeset
    62
\newcommand{\rder}[2]{#2 \backslash_r #1}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    63
585
4969ef817d92 chap4 more
Chengsong
parents: 579
diff changeset
    64
\def\rerases{\textit{rerase}}
4969ef817d92 chap4 more
Chengsong
parents: 579
diff changeset
    65
554
Chengsong
parents: 543
diff changeset
    66
\def\nonnested{\textit{nonnested}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    67
\def\AZERO{\textit{AZERO}}
558
Chengsong
parents: 557
diff changeset
    68
\def\sizeNregex{\textit{sizeNregex}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    69
\def\AONE{\textit{AONE}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    70
\def\ACHAR{\textit{ACHAR}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    71
585
4969ef817d92 chap4 more
Chengsong
parents: 579
diff changeset
    72
\def\simpsulz{\textit{simp}_{Sulz}}
4969ef817d92 chap4 more
Chengsong
parents: 579
diff changeset
    73
557
812e5d112f49 more changes
Chengsong
parents: 556
diff changeset
    74
\def\scfrewrites{\stackrel{*}{\rightsquigarrow_{scf}}}
555
Chengsong
parents: 554
diff changeset
    75
\def\frewrite{\rightsquigarrow_f}
Chengsong
parents: 554
diff changeset
    76
\def\hrewrite{\rightsquigarrow_h}
Chengsong
parents: 554
diff changeset
    77
\def\grewrite{\rightsquigarrow_g}
Chengsong
parents: 554
diff changeset
    78
\def\frewrites{\stackrel{*}{\rightsquigarrow_f}}
Chengsong
parents: 554
diff changeset
    79
\def\hrewrites{\stackrel{*}{\rightsquigarrow_h}}
Chengsong
parents: 554
diff changeset
    80
\def\grewrites{\stackrel{*}{\rightsquigarrow_g}}
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
    81
\def\fuse{\textit{fuse}}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
    82
\def\bder{\textit{bder}}
542
a7344c9afbaf chapter3 finished
Chengsong
parents: 538
diff changeset
    83
\def\der{\textit{der}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    84
\def\POSIX{\textit{POSIX}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    85
\def\ALTS{\textit{ALTS}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    86
\def\ASTAR{\textit{ASTAR}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    87
\def\DFA{\textit{DFA}}
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
    88
\def\NFA{\textit{NFA}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    89
\def\bmkeps{\textit{bmkeps}}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
    90
\def\bmkepss{\textit{bmkepss}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    91
\def\retrieve{\textit{retrieve}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    92
\def\blexer{\textit{blexer}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    93
\def\flex{\textit{flex}}
573
454ced557605 chapter2 finished polishing
Chengsong
parents: 567
diff changeset
    94
\def\inj{\textit{inj}}
564
Chengsong
parents: 558
diff changeset
    95
\def\Empty{\textit{Empty}}
567
28cb8089ec36 more updaates
Chengsong
parents: 564
diff changeset
    96
\def\Left{\textit{Left}}
28cb8089ec36 more updaates
Chengsong
parents: 564
diff changeset
    97
\def\Right{\textit{Right}}
573
454ced557605 chapter2 finished polishing
Chengsong
parents: 567
diff changeset
    98
\def\Stars{\textit{Stars}}
454ced557605 chapter2 finished polishing
Chengsong
parents: 567
diff changeset
    99
\def\Char{\textit{Char}}
454ced557605 chapter2 finished polishing
Chengsong
parents: 567
diff changeset
   100
\def\Seq{\textit{Seq}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   101
\def\Der{\textit{Der}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   102
\def\Ders{\textit{Ders}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   103
\def\nullable{\mathit{nullable}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   104
\def\Z{\mathit{Z}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   105
\def\S{\mathit{S}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   106
\def\rup{r^\uparrow}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   107
%\def\bderssimp{\mathit{bders}\_\mathit{simp}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   108
\def\distinctWith{\textit{distinctWith}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   109
\def\lf{\textit{lf}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   110
\def\PD{\textit{PD}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   111
\def\suffix{\textit{Suffix}}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   112
\def\distinctBy{\textit{distinctBy}}
558
Chengsong
parents: 557
diff changeset
   113
\def\starupdate{\textit{starUpdate}}
Chengsong
parents: 557
diff changeset
   114
\def\starupdates{\textit{starUpdates}}
Chengsong
parents: 557
diff changeset
   115
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   116
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   117
\def\size{\mathit{size}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   118
\def\rexp{\mathbf{rexp}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   119
\def\simp{\mathit{simp}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   120
\def\simpALTs{\mathit{simp}\_\mathit{ALTs}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   121
\def\map{\mathit{map}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   122
\def\distinct{\mathit{distinct}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   123
\def\blexersimp{\mathit{blexer}\_\mathit{simp}}
590
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   124
\def\blexerStrong{\textit{blexerStrong}}
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   125
\def\bsimpStrong{\textit{bsimpStrong}}
591
b2d0de6aee18 more polishing integrated comments chap2
Chengsong
parents: 590
diff changeset
   126
\def\bdersStrongs{\textit{bdersStrong}}
590
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   127
\newcommand{\bdersStrong}[2]{#1 \backslash_{bsimpStrongs} #2}
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   128
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   129
\def\map{\textit{map}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   130
\def\rrexp{\textit{rrexp}}
554
Chengsong
parents: 543
diff changeset
   131
\newcommand\rnullable[1]{\textit{rnullable} \; #1 }
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   132
\newcommand\rsize[1]{\llbracket #1 \rrbracket_r}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   133
\newcommand\asize[1]{\llbracket #1 \rrbracket}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   134
\newcommand\rerase[1]{ (#1)_{\downarrow_r}}
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   135
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   136
\newcommand\ChristianComment[1]{\textcolor{blue}{#1}\\}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   137
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   138
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   139
\def\rflts{\textit{rflts}}
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   140
\def\rrewrite{\textit{rrewrite}}
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   141
\def\bsimpalts{\textit{bsimp}_{ALTS}}
596
b306628a0eab more chap 56
Chengsong
parents: 594
diff changeset
   142
\def\bsimpaseq{\textit{bsimp}_{ASEQ}}
b306628a0eab more chap 56
Chengsong
parents: 594
diff changeset
   143
\def\rsimlalts{\textit{rsimp}_{ALTs}}
b306628a0eab more chap 56
Chengsong
parents: 594
diff changeset
   144
\def\rsimpseq{\textit{rsimp}_{SEQ}}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   145
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   146
\def\erase{\textit{erase}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   147
\def\STAR{\textit{STAR}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   148
\def\flts{\textit{flts}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   149
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   150
579
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   151
\def\zeroable{\textit{zeroable}}
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   152
\def\nub{\textit{nub}}
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   153
\def\filter{\textit{filter}}
601
Chengsong
parents: 600
diff changeset
   154
%\def\not{\textit{not}}
579
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   155
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   156
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   157
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   158
\def\RZERO{\mathbf{0}_r }
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   159
\def\RONE{\mathbf{1}_r}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   160
\newcommand\RCHAR[1]{\mathbf{#1}_r}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   161
\newcommand\RSEQ[2]{#1 \cdot #2}
558
Chengsong
parents: 557
diff changeset
   162
\newcommand\RALTS[1]{\sum #1}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   163
\newcommand\RSTAR[1]{#1^*}
558
Chengsong
parents: 557
diff changeset
   164
\newcommand\vsuf[2]{\textit{Suffix} \;#1\;#2}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   165
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   166
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   167
590
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   168
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   169
\lstdefinestyle{myScalastyle}{
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   170
  frame=tb,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   171
  language=scala,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   172
  aboveskip=3mm,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   173
  belowskip=3mm,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   174
  showstringspaces=false,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   175
  columns=flexible,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   176
  basicstyle={\small\ttfamily},
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   177
  numbers=none,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   178
  numberstyle=\tiny\color{gray},
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   179
  keywordstyle=\color{blue},
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   180
  commentstyle=\color{dkgreen},
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   181
  stringstyle=\color{mauve},
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   182
  frame=single,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   183
  breaklines=true,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   184
  breakatwhitespace=true,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   185
  tabsize=3,
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   186
}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   187
590
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   188
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   189
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   190
%This part is about regular expressions, Brzozowski derivatives,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   191
%and a bit-coded lexing algorithm with proven correctness and time bounds.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   192
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   193
%TODO: look up snort rules to use here--give readers idea of what regexes look like
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   194
601
Chengsong
parents: 600
diff changeset
   195
Chengsong
parents: 600
diff changeset
   196
Chengsong
parents: 600
diff changeset
   197
Chengsong
parents: 600
diff changeset
   198
Chengsong
parents: 600
diff changeset
   199
Chengsong
parents: 600
diff changeset
   200
Regular expressions are widely used in computer science: 
Chengsong
parents: 600
diff changeset
   201
be it in text-editors \parencite{atomEditor} with syntax highlighting and auto-completion;
Chengsong
parents: 600
diff changeset
   202
command-line tools like $\mathit{grep}$ that facilitate easy 
Chengsong
parents: 600
diff changeset
   203
text-processing; network intrusion
612
Chengsong
parents: 609
diff changeset
   204
detection systems that inspect suspicious traffic; or compiler
Chengsong
parents: 609
diff changeset
   205
front ends.
Chengsong
parents: 609
diff changeset
   206
Given their usefulness and ubiquity, one would assume that
601
Chengsong
parents: 600
diff changeset
   207
modern regular expression matching implementations
Chengsong
parents: 600
diff changeset
   208
are mature and fully studied.
602
Chengsong
parents: 601
diff changeset
   209
Indeed, in a popular programming language's regex engine, 
Chengsong
parents: 601
diff changeset
   210
supplying it with regular expressions and strings,
Chengsong
parents: 601
diff changeset
   211
in most cases one can
Chengsong
parents: 601
diff changeset
   212
get the matching information in a very short time.
Chengsong
parents: 601
diff changeset
   213
Those matchers can be blindingly fast--some 
Chengsong
parents: 601
diff changeset
   214
network intrusion detection systems
601
Chengsong
parents: 600
diff changeset
   215
use regex engines that are able to process 
Chengsong
parents: 600
diff changeset
   216
megabytes or even gigabytes of data per second \parencite{Turo_ov__2020}.
602
Chengsong
parents: 601
diff changeset
   217
However, those matchers can exhibit a surprising security vulnerability
Chengsong
parents: 601
diff changeset
   218
under a certain class of inputs.
Chengsong
parents: 601
diff changeset
   219
%However, , this is not the case for $\mathbf{all}$ inputs.
601
Chengsong
parents: 600
diff changeset
   220
%TODO: get source for SNORT/BRO's regex matching engine/speed
Chengsong
parents: 600
diff changeset
   221
603
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   222
612
Chengsong
parents: 609
diff changeset
   223
Consider $(a^*)^*\,b$ and ask whether
Chengsong
parents: 609
diff changeset
   224
strings of the form $aa..a$ can be matched by this regular
603
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   225
expression. Obviously this is not the case---the expected $b$ in the last
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   226
position is missing. One would expect that modern regular expression
612
Chengsong
parents: 609
diff changeset
   227
matching engines can find this out very quickly. Surprisingly, if one tries
Chengsong
parents: 609
diff changeset
   228
this example in JavaScript, Python or Java 8, even with small strings, 
Chengsong
parents: 609
diff changeset
   229
say of lenght of around 30 $a$'s,
Chengsong
parents: 609
diff changeset
   230
the decision takes an absurd time to finish (see graphs in figure \ref{fig:aStarStarb}).
603
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   231
This is clearly exponential behaviour, and 
612
Chengsong
parents: 609
diff changeset
   232
is triggered by some relatively simple regular expressions.
603
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   233
Java 9 and newer
612
Chengsong
parents: 609
diff changeset
   234
versions improve this behaviour somewhat, but is still slow compared 
Chengsong
parents: 609
diff changeset
   235
with the approach we are going to use in this thesis.
603
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   236
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   237
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   238
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   239
This superlinear blowup in regular expression engines
612
Chengsong
parents: 609
diff changeset
   240
had repeatedly caused grief in ``real life'' where it is 
Chengsong
parents: 609
diff changeset
   241
given the name ``catastrophic backtracking'' or ``evil'' regular expressions.
603
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   242
For example, on 20 July 2016 one evil
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   243
regular expression brought the webpage
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   244
\href{http://stackexchange.com}{Stack Exchange} to its
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   245
knees.\footnote{\url{https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}(Last accessed in 2019)}
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   246
In this instance, a regular expression intended to just trim white
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   247
spaces from the beginning and the end of a line actually consumed
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   248
massive amounts of CPU resources---causing web servers to grind to a
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   249
halt. In this example, the time needed to process
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   250
the string was $O(n^2)$ with respect to the string length. This
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   251
quadratic overhead was enough for the homepage of Stack Exchange to
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   252
respond so slowly that the load balancer assumed a $\mathit{DoS}$ 
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   253
attack and therefore stopped the servers from responding to any
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   254
requests. This made the whole site become unavailable. 
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   255
601
Chengsong
parents: 600
diff changeset
   256
\begin{figure}[p]
612
Chengsong
parents: 609
diff changeset
   257
\begin{tabular}{@{}c@{\hspace{0mm}}c@{}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   258
\begin{tikzpicture}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   259
\begin{axis}[
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   260
    xlabel={$n$},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   261
    x label style={at={(1.05,-0.05)}},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   262
    ylabel={time in secs},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   263
    enlargelimits=false,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   264
    xtick={0,5,...,30},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   265
    xmax=33,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   266
    ymax=35,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   267
    ytick={0,5,...,30},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   268
    scaled ticks=false,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   269
    axis lines=left,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   270
    width=5cm,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   271
    height=4cm, 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   272
    legend entries={JavaScript},  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   273
    legend pos=north west,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   274
    legend cell align=left]
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   275
\addplot[red,mark=*, mark options={fill=white}] table {re-js.data};
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   276
\end{axis}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   277
\end{tikzpicture}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   278
  &
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   279
\begin{tikzpicture}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   280
\begin{axis}[
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   281
    xlabel={$n$},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   282
    x label style={at={(1.05,-0.05)}},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   283
    %ylabel={time in secs},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   284
    enlargelimits=false,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   285
    xtick={0,5,...,30},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   286
    xmax=33,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   287
    ymax=35,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   288
    ytick={0,5,...,30},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   289
    scaled ticks=false,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   290
    axis lines=left,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   291
    width=5cm,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   292
    height=4cm, 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   293
    legend entries={Python},  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   294
    legend pos=north west,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   295
    legend cell align=left]
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   296
\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   297
\end{axis}
612
Chengsong
parents: 609
diff changeset
   298
\end{tikzpicture}\\ 
601
Chengsong
parents: 600
diff changeset
   299
\begin{tikzpicture}
Chengsong
parents: 600
diff changeset
   300
\begin{axis}[
Chengsong
parents: 600
diff changeset
   301
    xlabel={$n$},
Chengsong
parents: 600
diff changeset
   302
    x label style={at={(1.05,-0.05)}},
Chengsong
parents: 600
diff changeset
   303
    ylabel={time in secs},
Chengsong
parents: 600
diff changeset
   304
    enlargelimits=false,
Chengsong
parents: 600
diff changeset
   305
    xtick={0,5,...,30},
Chengsong
parents: 600
diff changeset
   306
    xmax=33,
Chengsong
parents: 600
diff changeset
   307
    ymax=35,
Chengsong
parents: 600
diff changeset
   308
    ytick={0,5,...,30},
Chengsong
parents: 600
diff changeset
   309
    scaled ticks=false,
Chengsong
parents: 600
diff changeset
   310
    axis lines=left,
Chengsong
parents: 600
diff changeset
   311
    width=5cm,
Chengsong
parents: 600
diff changeset
   312
    height=4cm, 
612
Chengsong
parents: 609
diff changeset
   313
    legend entries={Java 8},  
601
Chengsong
parents: 600
diff changeset
   314
    legend pos=north west,
Chengsong
parents: 600
diff changeset
   315
    legend cell align=left]
612
Chengsong
parents: 609
diff changeset
   316
\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
601
Chengsong
parents: 600
diff changeset
   317
\end{axis}
Chengsong
parents: 600
diff changeset
   318
\end{tikzpicture}
Chengsong
parents: 600
diff changeset
   319
  &
Chengsong
parents: 600
diff changeset
   320
\begin{tikzpicture}
Chengsong
parents: 600
diff changeset
   321
\begin{axis}[
Chengsong
parents: 600
diff changeset
   322
    xlabel={$n$},
Chengsong
parents: 600
diff changeset
   323
    x label style={at={(1.05,-0.05)}},
Chengsong
parents: 600
diff changeset
   324
    %ylabel={time in secs},
Chengsong
parents: 600
diff changeset
   325
    enlargelimits=false,
Chengsong
parents: 600
diff changeset
   326
    xtick={0,5,...,30},
Chengsong
parents: 600
diff changeset
   327
    xmax=33,
Chengsong
parents: 600
diff changeset
   328
    ymax=35,
Chengsong
parents: 600
diff changeset
   329
    ytick={0,5,...,30},
Chengsong
parents: 600
diff changeset
   330
    scaled ticks=false,
Chengsong
parents: 600
diff changeset
   331
    axis lines=left,
Chengsong
parents: 600
diff changeset
   332
    width=5cm,
Chengsong
parents: 600
diff changeset
   333
    height=4cm, 
612
Chengsong
parents: 609
diff changeset
   334
    legend entries={Dart},  
Chengsong
parents: 609
diff changeset
   335
    legend pos=north west,
Chengsong
parents: 609
diff changeset
   336
    legend cell align=left]
Chengsong
parents: 609
diff changeset
   337
\addplot[green,mark=*, mark options={fill=white}] table {re-dart.data};
Chengsong
parents: 609
diff changeset
   338
\end{axis}
Chengsong
parents: 609
diff changeset
   339
\end{tikzpicture}\\ 
Chengsong
parents: 609
diff changeset
   340
\begin{tikzpicture}
Chengsong
parents: 609
diff changeset
   341
\begin{axis}[
Chengsong
parents: 609
diff changeset
   342
    xlabel={$n$},
Chengsong
parents: 609
diff changeset
   343
    x label style={at={(1.05,-0.05)}},
Chengsong
parents: 609
diff changeset
   344
    ylabel={time in secs},
Chengsong
parents: 609
diff changeset
   345
    enlargelimits=false,
Chengsong
parents: 609
diff changeset
   346
    xtick={0,5,...,30},
Chengsong
parents: 609
diff changeset
   347
    xmax=33,
Chengsong
parents: 609
diff changeset
   348
    ymax=35,
Chengsong
parents: 609
diff changeset
   349
    ytick={0,5,...,30},
Chengsong
parents: 609
diff changeset
   350
    scaled ticks=false,
Chengsong
parents: 609
diff changeset
   351
    axis lines=left,
Chengsong
parents: 609
diff changeset
   352
    width=5cm,
Chengsong
parents: 609
diff changeset
   353
    height=4cm, 
601
Chengsong
parents: 600
diff changeset
   354
    legend entries={Swift},  
Chengsong
parents: 600
diff changeset
   355
    legend pos=north west,
Chengsong
parents: 600
diff changeset
   356
    legend cell align=left]
Chengsong
parents: 600
diff changeset
   357
\addplot[purple,mark=*, mark options={fill=white}] table {re-swift.data};
Chengsong
parents: 600
diff changeset
   358
\end{axis}
Chengsong
parents: 600
diff changeset
   359
\end{tikzpicture}
612
Chengsong
parents: 609
diff changeset
   360
  & 
Chengsong
parents: 609
diff changeset
   361
\begin{tikzpicture}
Chengsong
parents: 609
diff changeset
   362
\begin{axis}[
Chengsong
parents: 609
diff changeset
   363
    xlabel={$n$},
Chengsong
parents: 609
diff changeset
   364
    x label style={at={(1.05,-0.05)}},
Chengsong
parents: 609
diff changeset
   365
    %ylabel={time in secs},
Chengsong
parents: 609
diff changeset
   366
    enlargelimits=true,
Chengsong
parents: 609
diff changeset
   367
    %xtick={0,5000,...,40000},
Chengsong
parents: 609
diff changeset
   368
    %xmax=40000,
Chengsong
parents: 609
diff changeset
   369
    %ymax=35,
Chengsong
parents: 609
diff changeset
   370
    restrict x to domain*=0:40000,
Chengsong
parents: 609
diff changeset
   371
    restrict y to domain*=0:35,
Chengsong
parents: 609
diff changeset
   372
    %ytick={0,5,...,30},
Chengsong
parents: 609
diff changeset
   373
    %scaled ticks=false,
Chengsong
parents: 609
diff changeset
   374
    axis lines=left,
Chengsong
parents: 609
diff changeset
   375
    width=5cm,
Chengsong
parents: 609
diff changeset
   376
    height=4cm, 
Chengsong
parents: 609
diff changeset
   377
    legend entries={Java9+},  
Chengsong
parents: 609
diff changeset
   378
    legend pos=north west,
Chengsong
parents: 609
diff changeset
   379
    legend cell align=left]
Chengsong
parents: 609
diff changeset
   380
\addplot[orange,mark=*, mark options={fill=white}] table {re-java9.data};
Chengsong
parents: 609
diff changeset
   381
\end{axis}
Chengsong
parents: 609
diff changeset
   382
\end{tikzpicture}\\ 
Chengsong
parents: 609
diff changeset
   383
\multicolumn{2}{c}{Graphs}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   384
\end{tabular}    
601
Chengsong
parents: 600
diff changeset
   385
\caption{Graphs showing runtime for matching $(a^*)^*\,b$ with strings 
Chengsong
parents: 600
diff changeset
   386
           of the form $\protect\underbrace{aa..a}_{n}$ in various existing regular expression libraries.
612
Chengsong
parents: 609
diff changeset
   387
   The reason for their superlinear behaviour is that they do a depth-first-search
Chengsong
parents: 609
diff changeset
   388
   using NFAs.
Chengsong
parents: 609
diff changeset
   389
   If the string does not match, the regular expression matching
Chengsong
parents: 609
diff changeset
   390
   engine starts to explore all possibilities. 
601
Chengsong
parents: 600
diff changeset
   391
}\label{fig:aStarStarb}
Chengsong
parents: 600
diff changeset
   392
\end{figure}\afterpage{\clearpage}
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   393
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   394
A more recent example is a global outage of all Cloudflare servers on 2 July
612
Chengsong
parents: 609
diff changeset
   395
2019. A poorly written regular expression exhibited catastrophic backtracking
Chengsong
parents: 609
diff changeset
   396
and exhausted CPUs that serve HTTP traffic. Although the outage
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   397
had several causes, at the heart was a regular expression that
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   398
was used to monitor network
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   399
traffic.\footnote{\url{https://blog.cloudflare.com/details-of-the-cloudflare-outage-on-july-2-2019/}(Last accessed in 2022)}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   400
These problems with regular expressions 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   401
are not isolated events that happen
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   402
very occasionally, but actually widespread.
612
Chengsong
parents: 609
diff changeset
   403
They occur so often that they have a 
Chengsong
parents: 609
diff changeset
   404
name: Regular-Expression-Denial-Of-Service (ReDoS)
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   405
attack.
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   406
\citeauthor{Davis18} detected more
612
Chengsong
parents: 609
diff changeset
   407
than 1000 evil regular expressions
Chengsong
parents: 609
diff changeset
   408
in Node.js, Python core libraries, npm and in pypi. 
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   409
They therefore concluded that evil regular expressions
612
Chengsong
parents: 609
diff changeset
   410
are real problems rather than "a parlour trick".
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   411
603
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   412
This work aims to address this issue
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   413
with the help of formal proofs.
612
Chengsong
parents: 609
diff changeset
   414
We describe a lexing algorithm based
Chengsong
parents: 609
diff changeset
   415
on Brzozowski derivatives with verified correctness (in 
603
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   416
Isabelle/HOL)
612
Chengsong
parents: 609
diff changeset
   417
and a finiteness property.
Chengsong
parents: 609
diff changeset
   418
Such properties %guarantee the absence of 
Chengsong
parents: 609
diff changeset
   419
are an important step in preventing
Chengsong
parents: 609
diff changeset
   420
catastrophic backtracking once and for all.
604
Chengsong
parents: 603
diff changeset
   421
We will give more details in the next sections
Chengsong
parents: 603
diff changeset
   422
on (i) why the slow cases in graph \ref{fig:aStarStarb}
612
Chengsong
parents: 609
diff changeset
   423
can occur in traditional regular expression engines
604
Chengsong
parents: 603
diff changeset
   424
and (ii) why we choose our 
612
Chengsong
parents: 609
diff changeset
   425
approach based on Brzozowski derivatives and formal proofs.
602
Chengsong
parents: 601
diff changeset
   426
603
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   427
612
Chengsong
parents: 609
diff changeset
   428
\section{Preliminaries}%Regex, and the Problems with Regex Matchers}
601
Chengsong
parents: 600
diff changeset
   429
Regular expressions and regular expression matchers 
Chengsong
parents: 600
diff changeset
   430
have of course been studied for many, many years.
612
Chengsong
parents: 609
diff changeset
   431
Theoretical results in automata theory state 
604
Chengsong
parents: 603
diff changeset
   432
that basic regular expression matching should be linear
605
Chengsong
parents: 604
diff changeset
   433
w.r.t the input.
Chengsong
parents: 604
diff changeset
   434
This assumes that the regular expression
Chengsong
parents: 604
diff changeset
   435
$r$ was pre-processed and turned into a
612
Chengsong
parents: 609
diff changeset
   436
deterministic finite automaton (DFA) before matching\cite{Sakarovitch2009}.
604
Chengsong
parents: 603
diff changeset
   437
By basic we mean textbook definitions such as the one
612
Chengsong
parents: 609
diff changeset
   438
below, involving only regular expressions for characters, alternatives,
604
Chengsong
parents: 603
diff changeset
   439
sequences, and Kleene stars:
Chengsong
parents: 603
diff changeset
   440
\[
612
Chengsong
parents: 609
diff changeset
   441
	r ::= c | r_1 + r_2 | r_1 \cdot r_2 | r^*
604
Chengsong
parents: 603
diff changeset
   442
\]
Chengsong
parents: 603
diff changeset
   443
Modern regular expression matchers used by programmers,
Chengsong
parents: 603
diff changeset
   444
however,
612
Chengsong
parents: 609
diff changeset
   445
support much richer constructs, such as bounded repetitions
604
Chengsong
parents: 603
diff changeset
   446
and back-references.
612
Chengsong
parents: 609
diff changeset
   447
To differentiate, we use the word \emph{regex} to refer
605
Chengsong
parents: 604
diff changeset
   448
to those expressions with richer constructs while reserving the
Chengsong
parents: 604
diff changeset
   449
term \emph{regular expression}
Chengsong
parents: 604
diff changeset
   450
for the more traditional meaning in formal languages theory.
Chengsong
parents: 604
diff changeset
   451
We follow this convention 
Chengsong
parents: 604
diff changeset
   452
in this thesis.
Chengsong
parents: 604
diff changeset
   453
In the future, we aim to support all the popular features of regexes, 
612
Chengsong
parents: 609
diff changeset
   454
but for this work we mainly look at basic regular expressions
Chengsong
parents: 609
diff changeset
   455
and bounded repetitions.
604
Chengsong
parents: 603
diff changeset
   456
605
Chengsong
parents: 604
diff changeset
   457
Chengsong
parents: 604
diff changeset
   458
Chengsong
parents: 604
diff changeset
   459
%Most modern regex libraries
Chengsong
parents: 604
diff changeset
   460
%the so-called PCRE standard (Peral Compatible Regular Expressions)
Chengsong
parents: 604
diff changeset
   461
%has the back-references
612
Chengsong
parents: 609
diff changeset
   462
Regexes come with a number of constructs
605
Chengsong
parents: 604
diff changeset
   463
that make it more convenient for 
604
Chengsong
parents: 603
diff changeset
   464
programmers to write regular expressions.
612
Chengsong
parents: 609
diff changeset
   465
Depending on the types of constructs
605
Chengsong
parents: 604
diff changeset
   466
the task of matching and lexing with them
612
Chengsong
parents: 609
diff changeset
   467
will have different levels of complexity.
Chengsong
parents: 609
diff changeset
   468
Some of those constructs are just syntactic sugars that are
604
Chengsong
parents: 603
diff changeset
   469
simply short hand notations
605
Chengsong
parents: 604
diff changeset
   470
that save the programmers a few keystrokes.
612
Chengsong
parents: 609
diff changeset
   471
These will not cause problems for regex libraries.
605
Chengsong
parents: 604
diff changeset
   472
For example the
612
Chengsong
parents: 609
diff changeset
   473
non-binary alternative involving three or more choices just means:
604
Chengsong
parents: 603
diff changeset
   474
\[
605
Chengsong
parents: 604
diff changeset
   475
	(a | b | c) \stackrel{means}{=} ((a + b)+ c)
604
Chengsong
parents: 603
diff changeset
   476
\]
612
Chengsong
parents: 609
diff changeset
   477
Similarly, the range operator used to express the alternative
Chengsong
parents: 609
diff changeset
   478
of all characters between its operands is just a concise way:
604
Chengsong
parents: 603
diff changeset
   479
\[
605
Chengsong
parents: 604
diff changeset
   480
	[0~-9]\stackrel{means}{=} (0 | 1 | \ldots | 9 ) \; \text{(all number digits)}
604
Chengsong
parents: 603
diff changeset
   481
\]
612
Chengsong
parents: 609
diff changeset
   482
for an alternative. The
Chengsong
parents: 609
diff changeset
   483
wildcard character $.$ is used to refer to any single character,
605
Chengsong
parents: 604
diff changeset
   484
\[
Chengsong
parents: 604
diff changeset
   485
	. \stackrel{means}{=} [0-9a-zA-Z+-()*\&\ldots]
Chengsong
parents: 604
diff changeset
   486
\]
612
Chengsong
parents: 609
diff changeset
   487
except the newline.
604
Chengsong
parents: 603
diff changeset
   488
605
Chengsong
parents: 604
diff changeset
   489
\subsection{Bounded Repetitions}
612
Chengsong
parents: 609
diff changeset
   490
More interesting are bounded repetitions, which can 
Chengsong
parents: 609
diff changeset
   491
make the regular expressions much
605
Chengsong
parents: 604
diff changeset
   492
more compact.
612
Chengsong
parents: 609
diff changeset
   493
There are 
Chengsong
parents: 609
diff changeset
   494
$r^{\{n\}}$, $r^{\{\ldots m\}}$, $r^{\{n\ldots \}}$ and $r^{\{n\ldots m\}}$
Chengsong
parents: 609
diff changeset
   495
(where $n$ and $m$ are constant natural numbers).
Chengsong
parents: 609
diff changeset
   496
Like the star regular expressions, the set of strings or language
Chengsong
parents: 609
diff changeset
   497
a bounded regular expression can match
Chengsong
parents: 609
diff changeset
   498
is defined using the power operation on sets:
605
Chengsong
parents: 604
diff changeset
   499
\begin{center}
Chengsong
parents: 604
diff changeset
   500
	\begin{tabular}{lcl}
Chengsong
parents: 604
diff changeset
   501
		$L \; r^{\{n\}}$ & $\dn$ & $(L \; r)^n$\\
Chengsong
parents: 604
diff changeset
   502
		$L \; r^{\{\ldots m\}}$ & $\dn$ & $\bigcup_{0 \leq i \leq m}. (L \; r)^i$\\
Chengsong
parents: 604
diff changeset
   503
		$L \; r^{\{n\ldots \}}$ & $\dn$ & $\bigcup_{n \leq i}. (L \; r)^i$\\
Chengsong
parents: 604
diff changeset
   504
		$L \; r^{\{n \ldots m\}}$ & $\dn$ & $\bigcup_{n \leq i \leq m}. (L \; r)^i$
Chengsong
parents: 604
diff changeset
   505
	\end{tabular}
Chengsong
parents: 604
diff changeset
   506
\end{center}
612
Chengsong
parents: 609
diff changeset
   507
The attraction of bounded repetitions is that they can be
Chengsong
parents: 609
diff changeset
   508
used to avoid a blow up: for example $r^{\{n\}}$
Chengsong
parents: 609
diff changeset
   509
is a shorthand for
605
Chengsong
parents: 604
diff changeset
   510
\[
Chengsong
parents: 604
diff changeset
   511
	\underbrace{r\ldots r}_\text{n copies of r}.
Chengsong
parents: 604
diff changeset
   512
\]
Chengsong
parents: 604
diff changeset
   513
%Therefore, a naive algorithm that simply unfolds
Chengsong
parents: 604
diff changeset
   514
%them into their desugared forms
Chengsong
parents: 604
diff changeset
   515
%will suffer from at least an exponential runtime increase.
603
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   516
612
Chengsong
parents: 609
diff changeset
   517
Chengsong
parents: 609
diff changeset
   518
The problem with matching 
Chengsong
parents: 609
diff changeset
   519
is that tools based on the classic notion of
Chengsong
parents: 609
diff changeset
   520
automata need to expand $r^{\{n\}}$ into $n$ connected 
603
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   521
copies of the automaton for $r$. This leads to very inefficient matching
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   522
algorithms  or algorithms that consume large amounts of memory.
605
Chengsong
parents: 604
diff changeset
   523
Implementations using $\DFA$s will
Chengsong
parents: 604
diff changeset
   524
either become excruciatingly slow 
Chengsong
parents: 604
diff changeset
   525
(for example Verbatim++\cite{Verbatimpp}) or get
Chengsong
parents: 604
diff changeset
   526
out of memory errors (for example $\mathit{LEX}$ and 
Chengsong
parents: 604
diff changeset
   527
$\mathit{JFLEX}$\footnote{which are lexer generators
Chengsong
parents: 604
diff changeset
   528
in C and JAVA that generate $\mathit{DFA}$-based
Chengsong
parents: 604
diff changeset
   529
lexers. The user provides a set of regular expressions
Chengsong
parents: 604
diff changeset
   530
and configurations to them, and then 
Chengsong
parents: 604
diff changeset
   531
gets an output program encoding a minimized $\mathit{DFA}$
Chengsong
parents: 604
diff changeset
   532
that can be compiled and run. 
Chengsong
parents: 604
diff changeset
   533
When given the above countdown regular expression,
Chengsong
parents: 604
diff changeset
   534
a small $n$ (a few dozen) would result in a 
Chengsong
parents: 604
diff changeset
   535
determinised automata
612
Chengsong
parents: 609
diff changeset
   536
with millions of states.}) for large counters.
Chengsong
parents: 609
diff changeset
   537
A classic example for this phenomenon is the regular expression $(a+b)^*  a (a+b)^{n}$
605
Chengsong
parents: 604
diff changeset
   538
where the minimal DFA requires at least $2^{n+1}$ states.
Chengsong
parents: 604
diff changeset
   539
For example, when $n$ is equal to 2,
612
Chengsong
parents: 609
diff changeset
   540
The corresponding $\mathit{NFA}$ looks like:
604
Chengsong
parents: 603
diff changeset
   541
\begin{center}
Chengsong
parents: 603
diff changeset
   542
\begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto] 
Chengsong
parents: 603
diff changeset
   543
   \node[state,initial] (q_0)   {$q_0$}; 
Chengsong
parents: 603
diff changeset
   544
   \node[state, red] (q_1) [right=of q_0] {$q_1$}; 
Chengsong
parents: 603
diff changeset
   545
   \node[state, red] (q_2) [right=of q_1] {$q_2$}; 
Chengsong
parents: 603
diff changeset
   546
   \node[state, accepting, red](q_3) [right=of q_2] {$q_3$};
Chengsong
parents: 603
diff changeset
   547
    \path[->] 
Chengsong
parents: 603
diff changeset
   548
    (q_0) edge  node {a} (q_1)
Chengsong
parents: 603
diff changeset
   549
    	  edge [loop below] node {a,b} ()
Chengsong
parents: 603
diff changeset
   550
    (q_1) edge  node  {a,b} (q_2)
Chengsong
parents: 603
diff changeset
   551
    (q_2) edge  node  {a,b} (q_3);
Chengsong
parents: 603
diff changeset
   552
\end{tikzpicture}
Chengsong
parents: 603
diff changeset
   553
\end{center}
612
Chengsong
parents: 609
diff changeset
   554
when turned into a DFA by the subset construction
Chengsong
parents: 609
diff changeset
   555
requires at least $2^3$ states.\footnote{The 
605
Chengsong
parents: 604
diff changeset
   556
red states are "countdown states" which counts down 
604
Chengsong
parents: 603
diff changeset
   557
the number of characters needed in addition to the current
Chengsong
parents: 603
diff changeset
   558
string to make a successful match.
Chengsong
parents: 603
diff changeset
   559
For example, state $q_1$ indicates a match that has
Chengsong
parents: 603
diff changeset
   560
gone past the $(a|b)^*$ part of $(a|b)^*a(a|b)^{\{2\}}$,
Chengsong
parents: 603
diff changeset
   561
and just consumed the "delimiter" $a$ in the middle, and 
Chengsong
parents: 603
diff changeset
   562
need to match 2 more iterations of $(a|b)$ to complete.
Chengsong
parents: 603
diff changeset
   563
State $q_2$ on the other hand, can be viewed as a state
Chengsong
parents: 603
diff changeset
   564
after $q_1$ has consumed 1 character, and just waits
Chengsong
parents: 603
diff changeset
   565
for 1 more character to complete.
Chengsong
parents: 603
diff changeset
   566
$q_3$ is the last state, requiring 0 more character and is accepting.
Chengsong
parents: 603
diff changeset
   567
Depending on the suffix of the
Chengsong
parents: 603
diff changeset
   568
input string up to the current read location,
Chengsong
parents: 603
diff changeset
   569
the states $q_1$ and $q_2$, $q_3$
Chengsong
parents: 603
diff changeset
   570
may or may
Chengsong
parents: 603
diff changeset
   571
not be active, independent from each other.
Chengsong
parents: 603
diff changeset
   572
A $\mathit{DFA}$ for such an $\mathit{NFA}$ would
Chengsong
parents: 603
diff changeset
   573
contain at least $2^3$ non-equivalent states that cannot be merged, 
Chengsong
parents: 603
diff changeset
   574
because the subset construction during determinisation will generate
Chengsong
parents: 603
diff changeset
   575
all the elements in the power set $\mathit{Pow}\{q_1, q_2, q_3\}$.
Chengsong
parents: 603
diff changeset
   576
Generalizing this to regular expressions with larger
Chengsong
parents: 603
diff changeset
   577
bounded repetitions number, we have that
Chengsong
parents: 603
diff changeset
   578
regexes shaped like $r^*ar^{\{n\}}$ when converted to $\mathit{DFA}$s
605
Chengsong
parents: 604
diff changeset
   579
would require at least $2^{n+1}$ states, if $r$ itself contains
604
Chengsong
parents: 603
diff changeset
   580
more than 1 string.
Chengsong
parents: 603
diff changeset
   581
This is to represent all different 
605
Chengsong
parents: 604
diff changeset
   582
scenarios which "countdown" states are active.}
603
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   583
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   584
605
Chengsong
parents: 604
diff changeset
   585
Bounded repetitions are very important because they
612
Chengsong
parents: 609
diff changeset
   586
tend to occur a lot in practical use,
Chengsong
parents: 609
diff changeset
   587
for example in the regex library RegExLib,
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   588
the rules library of Snort \cite{Snort1999}\footnote{
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   589
Snort is a network intrusion detection (NID) tool
608
37b6fd310a16 added related work chap
Chengsong
parents: 607
diff changeset
   590
for monitoring network traffic.
37b6fd310a16 added related work chap
Chengsong
parents: 607
diff changeset
   591
The network security community curates a list
37b6fd310a16 added related work chap
Chengsong
parents: 607
diff changeset
   592
of malicious patterns written as regexes,
37b6fd310a16 added related work chap
Chengsong
parents: 607
diff changeset
   593
which is used by Snort's detection engine
37b6fd310a16 added related work chap
Chengsong
parents: 607
diff changeset
   594
to match against network traffic for any hostile
37b6fd310a16 added related work chap
Chengsong
parents: 607
diff changeset
   595
activities such as buffer overflow attacks.}, 
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   596
as well as in XML Schema definitions (XSDs).
605
Chengsong
parents: 604
diff changeset
   597
According to Bj\"{o}rklund et al \cite{xml2015},
Chengsong
parents: 604
diff changeset
   598
more than half of the 
612
Chengsong
parents: 609
diff changeset
   599
XSDs they found on the Maven.org central repository
Chengsong
parents: 609
diff changeset
   600
have bounded regular expressions in them.
Chengsong
parents: 609
diff changeset
   601
Often the counters are quite large, with the largest being
Chengsong
parents: 609
diff changeset
   602
approximately up to ten million. 
605
Chengsong
parents: 604
diff changeset
   603
An example XSD they gave
612
Chengsong
parents: 609
diff changeset
   604
is:
Chengsong
parents: 609
diff changeset
   605
\begin{verbatim}
Chengsong
parents: 609
diff changeset
   606
<sequence minOccurs="0" maxOccurs="65535">
Chengsong
parents: 609
diff changeset
   607
 <element name="TimeIncr" type="mpeg7:MediaIncrDurationType"/>
Chengsong
parents: 609
diff changeset
   608
 <element name="MotionParams" type="float" minOccurs="2" maxOccurs="12"/>
Chengsong
parents: 609
diff changeset
   609
</sequence>
Chengsong
parents: 609
diff changeset
   610
\end{verbatim}
605
Chengsong
parents: 604
diff changeset
   611
This can be seen as the expression 
Chengsong
parents: 604
diff changeset
   612
$(ab^{2\ldots 12})^{0 \ldots 65535}$, where $a$ and $b$ are themselves
Chengsong
parents: 604
diff changeset
   613
regular expressions 
Chengsong
parents: 604
diff changeset
   614
satisfying certain constraints (such as 
Chengsong
parents: 604
diff changeset
   615
satisfying the floating point number format).
Chengsong
parents: 604
diff changeset
   616
It is therefore quite unsatisfying that 
Chengsong
parents: 604
diff changeset
   617
some regular expressions matching libraries
Chengsong
parents: 604
diff changeset
   618
impose adhoc limits
Chengsong
parents: 604
diff changeset
   619
for bounded regular expressions:
Chengsong
parents: 604
diff changeset
   620
For example, in the regular expression matching library in the Go
Chengsong
parents: 604
diff changeset
   621
language the regular expression $a^{1001}$ is not permitted, because no counter
Chengsong
parents: 604
diff changeset
   622
can be above 1000, and in the built-in Rust regular expression library
Chengsong
parents: 604
diff changeset
   623
expressions such as $a^{\{1000\}\{100\}\{5\}}$ give an error message
606
Chengsong
parents: 605
diff changeset
   624
for being too big. 
Chengsong
parents: 605
diff changeset
   625
As Becchi and Crawley\cite{Becchi08}  have pointed out,
Chengsong
parents: 605
diff changeset
   626
the reason for these restrictions
612
Chengsong
parents: 609
diff changeset
   627
is that they simulate a non-deterministic finite
606
Chengsong
parents: 605
diff changeset
   628
automata (NFA) with a breadth-first search.
Chengsong
parents: 605
diff changeset
   629
This way the number of active states could
Chengsong
parents: 605
diff changeset
   630
be equal to the counter number.
Chengsong
parents: 605
diff changeset
   631
When the counters are large, 
Chengsong
parents: 605
diff changeset
   632
the memory requirement could become
612
Chengsong
parents: 609
diff changeset
   633
infeasible, and a regex engine
Chengsong
parents: 609
diff changeset
   634
like Go will reject this pattern straight away.
606
Chengsong
parents: 605
diff changeset
   635
\begin{figure}[H]
Chengsong
parents: 605
diff changeset
   636
\begin{center}
Chengsong
parents: 605
diff changeset
   637
\begin{tikzpicture} [node distance = 2cm, on grid, auto]
Chengsong
parents: 605
diff changeset
   638
 
Chengsong
parents: 605
diff changeset
   639
    	\node (q0) [state, initial] {$0$};
Chengsong
parents: 605
diff changeset
   640
	\node (q1) [state, right = of q0] {$1$};
612
Chengsong
parents: 609
diff changeset
   641
	%\node (q2) [state, right = of q1] {$2$};
Chengsong
parents: 609
diff changeset
   642
	\node (qdots) [right = of q1] {$\ldots$};
606
Chengsong
parents: 605
diff changeset
   643
	\node (qn) [state, right = of qdots] {$n$};
Chengsong
parents: 605
diff changeset
   644
	\node (qn1) [state, right = of qn] {$n+1$};
Chengsong
parents: 605
diff changeset
   645
	\node (qn2) [state, right = of qn1] {$n+2$};
Chengsong
parents: 605
diff changeset
   646
	\node (qn3) [state, accepting, right = of qn2] {$n+3$}; 
Chengsong
parents: 605
diff changeset
   647
 
Chengsong
parents: 605
diff changeset
   648
\path [-stealth, thick]
Chengsong
parents: 605
diff changeset
   649
	(q0) edge [loop above] node {a} ()
Chengsong
parents: 605
diff changeset
   650
    (q0) edge node {a}   (q1) 
612
Chengsong
parents: 609
diff changeset
   651
    %(q1) edge node {.}   (q2)
Chengsong
parents: 609
diff changeset
   652
    (q1) edge node {.}   (qdots)
606
Chengsong
parents: 605
diff changeset
   653
    (qdots) edge node {.} (qn)
Chengsong
parents: 605
diff changeset
   654
    (qn) edge node {.} (qn1)
Chengsong
parents: 605
diff changeset
   655
    (qn1) edge node {b} (qn2)
Chengsong
parents: 605
diff changeset
   656
    (qn2) edge node {$c$} (qn3);
Chengsong
parents: 605
diff changeset
   657
\end{tikzpicture}
Chengsong
parents: 605
diff changeset
   658
%\begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto] 
Chengsong
parents: 605
diff changeset
   659
%   \node[state,initial] (q_0)   {$0$}; 
Chengsong
parents: 605
diff changeset
   660
%   \node[state, ] (q_1) [right=of q_0] {$1$}; 
Chengsong
parents: 605
diff changeset
   661
%   \node[state, ] (q_2) [right=of q_1] {$2$}; 
Chengsong
parents: 605
diff changeset
   662
%   \node[state,
Chengsong
parents: 605
diff changeset
   663
%   \node[state, accepting, ](q_3) [right=of q_2] {$3$};
Chengsong
parents: 605
diff changeset
   664
%    \path[->] 
Chengsong
parents: 605
diff changeset
   665
%    (q_0) edge  node {a} (q_1)
Chengsong
parents: 605
diff changeset
   666
%    	  edge [loop below] node {a,b} ()
Chengsong
parents: 605
diff changeset
   667
%    (q_1) edge  node  {a,b} (q_2)
Chengsong
parents: 605
diff changeset
   668
%    (q_2) edge  node  {a,b} (q_3);
Chengsong
parents: 605
diff changeset
   669
%\end{tikzpicture}
Chengsong
parents: 605
diff changeset
   670
\end{center}
Chengsong
parents: 605
diff changeset
   671
\caption{The example given by Becchi and Crawley
Chengsong
parents: 605
diff changeset
   672
	that NFA simulation can consume large
Chengsong
parents: 605
diff changeset
   673
	amounts of memory: $.^*a.^{\{n\}}bc$ matching
Chengsong
parents: 605
diff changeset
   674
	strings of the form $aaa\ldots aaaabc$.
Chengsong
parents: 605
diff changeset
   675
	When traversing in a breadth-first manner,
Chengsong
parents: 605
diff changeset
   676
all states from 0 till $n+1$ will become active.}
Chengsong
parents: 605
diff changeset
   677
\end{figure}
Chengsong
parents: 605
diff changeset
   678
%Languages like $\mathit{Go}$ and $\mathit{Rust}$ use this
Chengsong
parents: 605
diff changeset
   679
%type of $\mathit{NFA}$ simulation and guarantees a linear runtime
Chengsong
parents: 605
diff changeset
   680
%in terms of input string length.
Chengsong
parents: 605
diff changeset
   681
%TODO:try out these lexers
Chengsong
parents: 605
diff changeset
   682
These problems can of course be solved in matching algorithms where 
605
Chengsong
parents: 604
diff changeset
   683
automata go beyond the classic notion and for instance include explicit
Chengsong
parents: 604
diff changeset
   684
counters \cite{Turo_ov__2020}.
612
Chengsong
parents: 609
diff changeset
   685
These solutions can be quite efficient,
606
Chengsong
parents: 605
diff changeset
   686
with the ability to process
612
Chengsong
parents: 609
diff changeset
   687
gigabytes of strings input per second
606
Chengsong
parents: 605
diff changeset
   688
even with large counters \cite{Becchi08}.
612
Chengsong
parents: 609
diff changeset
   689
But formal reasoning about these automata especially in Isabelle 
Chengsong
parents: 609
diff changeset
   690
can be challenging
Chengsong
parents: 609
diff changeset
   691
and un-intuitive. 
Chengsong
parents: 609
diff changeset
   692
Therefore, we take correctness and runtime claims made about these solutions
Chengsong
parents: 609
diff changeset
   693
with a grain of salt.
605
Chengsong
parents: 604
diff changeset
   694
Chengsong
parents: 604
diff changeset
   695
In the work reported in \cite{CSL2022} and here, 
Chengsong
parents: 604
diff changeset
   696
we add better support using derivatives
Chengsong
parents: 604
diff changeset
   697
for bounded regular expressions $r^{\{n\}}$.
Chengsong
parents: 604
diff changeset
   698
The results
Chengsong
parents: 604
diff changeset
   699
extend straightforwardly to
Chengsong
parents: 604
diff changeset
   700
repetitions with an interval such as 
Chengsong
parents: 604
diff changeset
   701
$r^{\{n\ldots m\}}$.
608
37b6fd310a16 added related work chap
Chengsong
parents: 607
diff changeset
   702
The merit of Brzozowski derivatives (more on this later)
605
Chengsong
parents: 604
diff changeset
   703
on this problem is that
Chengsong
parents: 604
diff changeset
   704
it can be naturally extended to support bounded repetitions.
Chengsong
parents: 604
diff changeset
   705
Moreover these extensions are still made up of only
Chengsong
parents: 604
diff changeset
   706
inductive datatypes and recursive functions,
Chengsong
parents: 604
diff changeset
   707
making it handy to deal with using theorem provers.
Chengsong
parents: 604
diff changeset
   708
%The point here is that Brzozowski derivatives and the algorithms by Sulzmann and Lu can be
Chengsong
parents: 604
diff changeset
   709
%straightforwardly extended to deal with bounded regular expressions
Chengsong
parents: 604
diff changeset
   710
%and moreover the resulting code still consists of only simple
Chengsong
parents: 604
diff changeset
   711
%recursive functions and inductive datatypes.
Chengsong
parents: 604
diff changeset
   712
Finally, bounded regular expressions do not destroy our finite
Chengsong
parents: 604
diff changeset
   713
boundedness property, which we shall prove later on.
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   714
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   715
606
Chengsong
parents: 605
diff changeset
   716
Chengsong
parents: 605
diff changeset
   717
Chengsong
parents: 605
diff changeset
   718
605
Chengsong
parents: 604
diff changeset
   719
\subsection{Back-References}
606
Chengsong
parents: 605
diff changeset
   720
The other way to simulate an $\mathit{NFA}$ for matching is choosing  
Chengsong
parents: 605
diff changeset
   721
a single transition each time, keeping all the other options in 
Chengsong
parents: 605
diff changeset
   722
a queue or stack, and backtracking if that choice eventually 
Chengsong
parents: 605
diff changeset
   723
fails. This method, often called a  "depth-first-search", 
Chengsong
parents: 605
diff changeset
   724
is efficient in a lot of cases, but could end up
Chengsong
parents: 605
diff changeset
   725
with exponential run time.
Chengsong
parents: 605
diff changeset
   726
The backtracking method is employed in regex libraries
Chengsong
parents: 605
diff changeset
   727
that support \emph{back-references}, for example
Chengsong
parents: 605
diff changeset
   728
in Java and Python.
605
Chengsong
parents: 604
diff changeset
   729
%\section{Back-references and The Terminology Regex}
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   730
605
Chengsong
parents: 604
diff changeset
   731
%When one constructs an $\NFA$ out of a regular expression
Chengsong
parents: 604
diff changeset
   732
%there is often very little to be done in the first phase, one simply 
Chengsong
parents: 604
diff changeset
   733
%construct the $\NFA$ states based on the structure of the input regular expression.
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   734
605
Chengsong
parents: 604
diff changeset
   735
%In the lexing phase, one can simulate the $\mathit{NFA}$ running in two ways:
Chengsong
parents: 604
diff changeset
   736
%one by keeping track of all active states after consuming 
Chengsong
parents: 604
diff changeset
   737
%a character, and update that set of states iteratively.
Chengsong
parents: 604
diff changeset
   738
%This can be viewed as a breadth-first-search of the $\mathit{NFA}$
Chengsong
parents: 604
diff changeset
   739
%for a path terminating
Chengsong
parents: 604
diff changeset
   740
%at an accepting state.
606
Chengsong
parents: 605
diff changeset
   741
Chengsong
parents: 605
diff changeset
   742
Chengsong
parents: 605
diff changeset
   743
Chengsong
parents: 605
diff changeset
   744
Given a regular expression like this (the sequence
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   745
operator is omitted for brevity):
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   746
\begin{center}
606
Chengsong
parents: 605
diff changeset
   747
	$r_1r_2r_3r_4$
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   748
\end{center}
606
Chengsong
parents: 605
diff changeset
   749
one could label sub-expressions of interest 
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   750
by parenthesizing them and giving 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   751
them a number by the order in which their opening parentheses appear.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   752
One possible way of parenthesizing and labelling is given below:
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   753
\begin{center}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   754
	$\underset{1}{(}r_1\underset{2}{(}r_2\underset{3}{(}r_3)\underset{4}{(}r_4)))$
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   755
\end{center}
606
Chengsong
parents: 605
diff changeset
   756
The sub-expressions
Chengsong
parents: 605
diff changeset
   757
$r_1r_2r_3r_4$, $r_1r_2r_3$, $r_3$ and $r_4$ are labelled
Chengsong
parents: 605
diff changeset
   758
by 1 to 4, and can be ``referred back'' by their respective numbers. 
Chengsong
parents: 605
diff changeset
   759
%These sub-expressions are called "capturing groups".
Chengsong
parents: 605
diff changeset
   760
To do so, we use the syntax $\backslash i$ 
Chengsong
parents: 605
diff changeset
   761
to denote that we want the sub-string 
Chengsong
parents: 605
diff changeset
   762
of the input just matched by the i-th
Chengsong
parents: 605
diff changeset
   763
sub-expression to appear again, 
Chengsong
parents: 605
diff changeset
   764
exactly the same as it first appeared: 
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   765
\begin{center}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   766
$\ldots\underset{\text{i-th lparen}}{(}{r_i})\ldots 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   767
\underset{s_i \text{ which just matched} \;r_i}{\backslash i}$
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   768
\end{center}
606
Chengsong
parents: 605
diff changeset
   769
%The backslash and number $i$ are the
Chengsong
parents: 605
diff changeset
   770
%so-called "back-references".
Chengsong
parents: 605
diff changeset
   771
%Let $e$ be an expression made of regular expressions 
Chengsong
parents: 605
diff changeset
   772
%and back-references. $e$ contains the expression $e_i$
Chengsong
parents: 605
diff changeset
   773
%as its $i$-th capturing group.
Chengsong
parents: 605
diff changeset
   774
%The semantics of back-reference can be recursively
Chengsong
parents: 605
diff changeset
   775
%written as:
Chengsong
parents: 605
diff changeset
   776
%\begin{center}
Chengsong
parents: 605
diff changeset
   777
%	\begin{tabular}{c}
Chengsong
parents: 605
diff changeset
   778
%		$L ( e \cdot \backslash i) = \{s @ s_i \mid s \in L (e)\quad s_i \in L(r_i)$\\
Chengsong
parents: 605
diff changeset
   779
%		$s_i\; \text{match of ($e$, $s$)'s $i$-th capturing group string}\}$
Chengsong
parents: 605
diff changeset
   780
%	\end{tabular}
Chengsong
parents: 605
diff changeset
   781
%\end{center}
Chengsong
parents: 605
diff changeset
   782
A concrete example
Chengsong
parents: 605
diff changeset
   783
for back-references would be
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   784
\begin{center}
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   785
$(.^*)\backslash 1$,
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   786
\end{center}
606
Chengsong
parents: 605
diff changeset
   787
which would match 
Chengsong
parents: 605
diff changeset
   788
strings that can be split into two identical halves,
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   789
for example $\mathit{foofoo}$, $\mathit{ww}$ and etc.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   790
Note that this is different from 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   791
repeating the  sub-expression verbatim like
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   792
\begin{center}
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   793
	$(.^*)(.^*)$,
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   794
\end{center}
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   795
which does not impose any restrictions on what strings the second 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   796
sub-expression $.^*$
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   797
might match.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   798
Another example of back-references would be
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   799
\begin{center}
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   800
$(.)(.)\backslash 2\backslash 1$
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   801
\end{center}
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   802
which expresses four-character palindromes
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   803
like $abba$, $x??x$ etc.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   804
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   805
Back-references is a regex construct 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   806
that programmers found quite useful.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   807
According to Becchi and Crawley\cite{Becchi08},
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   808
6\% of Snort rules (up until 2008) include the use of them.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   809
The most common use of back-references
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   810
would be expressing well-formed html files,
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   811
where back-references would be handy in expressing
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   812
a pair of opening and closing tags like 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   813
\begin{center}
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   814
	$\langle html \rangle \ldots \langle / html \rangle$
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   815
\end{center}
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   816
A regex describing such a format
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   817
could be
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   818
\begin{center}
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   819
	$\langle (.^+) \rangle \ldots \langle / \backslash 1 \rangle$
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   820
\end{center}
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   821
Despite being useful, the syntax and expressive power of regexes 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   822
go beyond the regular language hierarchy
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   823
with back-references.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   824
In fact, they allow the regex construct to express 
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   825
languages that cannot be contained in context-free
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   826
languages either.
608
37b6fd310a16 added related work chap
Chengsong
parents: 607
diff changeset
   827
For example, the back-reference $(a^*)b\backslash1 b \backslash 1$
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   828
expresses the language $\{a^n b a^n b a^n\mid n \in \mathbb{N}\}$,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   829
which cannot be expressed by context-free grammars\parencite{campeanu2003formal}.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   830
Such a language is contained in the context-sensitive hierarchy
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   831
of formal languages. 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   832
Solving the back-reference expressions matching problem
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   833
is known to be NP-complete \parencite{alfred2014algorithms}.
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   834
A non-bactracking,
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   835
efficient solution is not known to exist.
608
37b6fd310a16 added related work chap
Chengsong
parents: 607
diff changeset
   836
Regex libraries supporting back-references such as 
37b6fd310a16 added related work chap
Chengsong
parents: 607
diff changeset
   837
PCRE \cite{pcre} therefore have to
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   838
revert to a depth-first search algorithm which backtracks.
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   839
What is unexpected is that even in the cases 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   840
not involving back-references, there is still
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   841
a (non-negligible) chance they might backtrack super-linearly,
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   842
as shown in the graphs in \ref{fig:aStarStarb}.
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   843
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   844
\subsection{Summary of the Catastrophic Backtracking Problem}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   845
Summing these up, we can categorise existing 
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   846
practical regex libraries into two kinds:
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   847
(i)The ones  with  linear
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   848
time guarantees like Go and Rust. The cost with them is that
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   849
they impose restrictions
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   850
on the user input (not allowing back-references, 
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   851
bounded repetitions cannot exceed a counter limit etc.).
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   852
(ii) Those 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   853
that allow large bounded regular expressions and back-references
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   854
at the expense of using a backtracking algorithm.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   855
They could grind to a halt
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   856
on some very simple cases, posing a vulnerability of
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   857
a ReDoS attack.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   858
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   859
 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   860
We would like to have regex engines that can 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   861
deal with the regular part (e.g.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   862
bounded repetitions) of regexes more
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   863
efficiently.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   864
Also we want to make sure that they do it correctly.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   865
It turns out that such aim is not so easy to achieve.
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   866
 %TODO: give examples such as RE2 GOLANG 1000 restriction, rust no repetitions 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   867
% For example, the Rust regex engine claims to be linear, 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   868
% but does not support lookarounds and back-references.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   869
% The GoLang regex library does not support over 1000 repetitions.  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   870
% Java and Python both support back-references, but shows
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   871
%catastrophic backtracking behaviours on inputs without back-references(
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   872
%when the language is still regular).
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   873
 %TODO: test performance of Rust on (((((a*a*)b*)b){20})*)c  baabaabababaabaaaaaaaaababaaaababababaaaabaaabaaaaaabaabaabababaababaaaaaaaaababaaaababababaaaaaaaaaaaaac
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   874
 %TODO: verify the fact Rust does not allow 1000+ reps
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   875
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   876
605
Chengsong
parents: 604
diff changeset
   877
Chengsong
parents: 604
diff changeset
   878
Chengsong
parents: 604
diff changeset
   879
%The time cost of regex matching algorithms in general
Chengsong
parents: 604
diff changeset
   880
%involve two different phases, and different things can go differently wrong on 
Chengsong
parents: 604
diff changeset
   881
%these phases.
Chengsong
parents: 604
diff changeset
   882
%$\DFA$s usually have problems in the first (construction) phase
Chengsong
parents: 604
diff changeset
   883
%, whereas $\NFA$s usually run into trouble
Chengsong
parents: 604
diff changeset
   884
%on the second phase.
Chengsong
parents: 604
diff changeset
   885
Chengsong
parents: 604
diff changeset
   886
Chengsong
parents: 604
diff changeset
   887
\section{Error-prone POSIX Implementations}
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   888
When there are multiple ways of matching a string
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   889
with a regular expression, a matcher needs to
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   890
disambiguate.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   891
The standard for which particular match to pick
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   892
is called the disambiguation strategy.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   893
The more intuitive strategy is called POSIX,
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   894
which always chooses the longest initial match.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   895
An alternative strategy would be greedy matches,
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   896
which always ends a sub-match as early as possible.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   897
The POSIX standard is widely adopted in many operating systems.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   898
However, many implementations (including the C libraries
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   899
used by Linux and OS X distributions) contain bugs
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   900
or do not meet the specification they claim to adhere to.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   901
In some cases, they either fail to generate a lexing 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   902
result when there exists a match,
605
Chengsong
parents: 604
diff changeset
   903
or give results that are inconsistent with the $\POSIX$ standard.
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   904
A concrete example would be the regex given by \cite{fowler2003}
605
Chengsong
parents: 604
diff changeset
   905
\begin{center}
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   906
	$(aba + ab + a)^* \text{and the string} ababa$
605
Chengsong
parents: 604
diff changeset
   907
\end{center}
Chengsong
parents: 604
diff changeset
   908
The correct $\POSIX$ match for the above would be 
Chengsong
parents: 604
diff changeset
   909
with the entire string $ababa$, 
Chengsong
parents: 604
diff changeset
   910
split into two Kleene star iterations, $[ab] [aba]$ at positions
Chengsong
parents: 604
diff changeset
   911
$[0, 2), [2, 5)$
Chengsong
parents: 604
diff changeset
   912
respectively.
Chengsong
parents: 604
diff changeset
   913
But trying this out in regex101\parencite{regex101}
Chengsong
parents: 604
diff changeset
   914
with different language engines would yield 
Chengsong
parents: 604
diff changeset
   915
the same two fragmented matches: $[aba]$ at $[0, 3)$
Chengsong
parents: 604
diff changeset
   916
and $a$ at $[4, 5)$.
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   917
Fowler \cite{fowler2003} and Kuklewicz \cite{KuklewiczHaskell} 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   918
commented that most regex libraries are not
605
Chengsong
parents: 604
diff changeset
   919
correctly implementing the POSIX (maximum-munch)
Chengsong
parents: 604
diff changeset
   920
rule of regular expression matching.
Chengsong
parents: 604
diff changeset
   921
As Grathwohl\parencite{grathwohl2014crash} wrote,
Chengsong
parents: 604
diff changeset
   922
\begin{quote}
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   923
	``The POSIX strategy is more complicated than the 
605
Chengsong
parents: 604
diff changeset
   924
	greedy because of the dependence on information about 
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   925
	the length of matched strings in the various subexpressions.''
605
Chengsong
parents: 604
diff changeset
   926
\end{quote}
Chengsong
parents: 604
diff changeset
   927
%\noindent
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   928
The implementation complexity of POSIX rules also come from
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   929
the specification being not very clear.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   930
There are many informal summaries of this disambiguation
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   931
strategy, which are often quite long and delicate.
608
37b6fd310a16 added related work chap
Chengsong
parents: 607
diff changeset
   932
For example Kuklewicz \cite{KuklewiczHaskell} 
37b6fd310a16 added related work chap
Chengsong
parents: 607
diff changeset
   933
described the POSIX rule as
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   934
\begin{quote}
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   935
	``
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   936
	\begin{itemize}
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   937
		\item
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   938
regular expressions (REs) take the leftmost starting match, and the longest match starting there
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   939
earlier subpatterns have leftmost-longest priority over later subpatterns\\
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   940
\item
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   941
higher-level subpatterns have leftmost-longest priority over their component subpatterns\\
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   942
\item
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   943
REs have right associative concatenation which can be changed with parenthesis\\
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   944
\item
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   945
parenthesized subexpressions return the match from their last usage\\
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   946
\item
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   947
text of component subexpressions must be contained in the text of the 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   948
higher-level subexpressions\\
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   949
\item
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   950
if "p" and "q" can never match the same text then "p|q" and "q|p" are equivalent, up to trivial renumbering of captured subexpressions\\
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   951
\item
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   952
if "p" in "p*" is used to capture non-empty text then additional repetitions of "p" will not capture an empty string\\''
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   953
\end{itemize}
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   954
\end{quote}
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   955
The text above 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   956
is trying to capture something very precise,
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   957
and is crying out for formalising.
608
37b6fd310a16 added related work chap
Chengsong
parents: 607
diff changeset
   958
Ausaf et al. \cite{AusafDyckhoffUrban2016}
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   959
are the first to fill the gap
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   960
by not just describing such a formalised POSIX
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   961
specification in Isabelle/HOL, but also proving
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   962
that their specification coincides with the 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   963
POSIX specification given by Okui and Suzuki \cite{Okui10} 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   964
which is a completely
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   965
different characterisation.
608
37b6fd310a16 added related work chap
Chengsong
parents: 607
diff changeset
   966
They then formally proved the correctness of
37b6fd310a16 added related work chap
Chengsong
parents: 607
diff changeset
   967
a lexing algorithm by Sulzmann and Lu \cite{Sulzmann2014}
37b6fd310a16 added related work chap
Chengsong
parents: 607
diff changeset
   968
based on that specification.
605
Chengsong
parents: 604
diff changeset
   969
608
37b6fd310a16 added related work chap
Chengsong
parents: 607
diff changeset
   970
In the next section we will very briefly
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   971
introduce Brzozowski derivatives and Sulzmann
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   972
and Lu's algorithm, which this thesis builds on.
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   973
We give a taste of what they 
608
37b6fd310a16 added related work chap
Chengsong
parents: 607
diff changeset
   974
are like and why they are suitable for regular expression
37b6fd310a16 added related work chap
Chengsong
parents: 607
diff changeset
   975
matching and lexing.
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   976
 
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   977
\section{Our Solution--Formal Specification of POSIX Matching 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   978
and Brzozowski Derivatives}
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   979
Now we start with the central topic of the thesis: Brzozowski derivatives.
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   980
Brzozowski \cite{Brzozowski1964} first introduced the 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   981
concept of the \emph{derivative} in the 1960s.
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   982
The derivative of a regular expression $r$
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   983
with respect to a character $c$, is written as $r \backslash c$.\footnote{
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   984
	Despite having the same name, regular expression
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   985
	derivatives bear little similarity with the mathematical definition
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   986
	of derivatives on functions.
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   987
}
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   988
It tells us what $r$ would transform into
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   989
if we chop off the first character $c$ 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   990
from all strings in the language of $r$ ($L \; r$).
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   991
To give a flavour of Brzozowski derivatives, we present
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   992
two straightforward clauses from it:
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   993
\begin{center}
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   994
	\begin{tabular}{lcl}
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   995
		$d \backslash c$     & $\dn$ & 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   996
		$\mathit{if} \;c = d\;\mathit{then}\;\ONE\;\mathit{else}\;\ZERO$\\
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   997
$(r_1 + r_2)\backslash c$     & $\dn$ & $r_1 \backslash c \,+\, r_2 \backslash c$\\
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   998
	\end{tabular}
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   999
\end{center}
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1000
\noindent
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1001
The first clause says that for the regular expression
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1002
denoting a singleton set consisting of a sinlge-character string $\{ d \}$,
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1003
we check the derivative character $c$ against $d$,
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1004
returning a set containing only the empty string $\{ [] \}$
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1005
if $c$ and $d$ are equal, and the empty set $\varnothing$ otherwise.
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1006
The second clause states that to obtain the regular expression
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1007
representing all strings' head character $c$ being chopped off
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1008
from $r_1 + r_2$, one simply needs to recursively take derivative
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1009
of $r_1$ and $r_2$ and then put them together.
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1010
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1011
Thanks to the definition, derivatives have the nice property
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1012
that $s \in L \; (r\backslash c)$ if and only if 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1013
$c::s \in L \; r$.
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1014
%This property can be used on regular expressions
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1015
%matching and lexing--to test whether a string $s$ is in $L \; r$,
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1016
%one simply takes derivatives of $r$ successively with
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1017
%respect to the characters (in the correct order) in $s$,
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1018
%and then test whether the empty string is in the last regular expression.
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1019
Derivatives give a simple solution
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1020
to the problem of matching and lexing a string $s$ with a regular
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1021
expression $r$: if the derivative of $r$ w.r.t.\ (in
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1022
succession) all the characters of the string matches the empty string,
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1023
then $r$ matches $s$ (and {\em vice versa}).  
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1024
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1025
This makes formally reasoning about these properties such
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1026
as correctness and complexity smooth and intuitive.
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1027
In fact, there has already been several mechanised proofs about them,
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1028
for example the one by Owens and Slind \cite{Owens2008} in HOL4,
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1029
another one by Krauss and Nipkow \cite{Nipkow98} in Isabelle/HOL, and
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1030
yet another in Coq by Coquand and Siles \cite{Coquand2012}.
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1031
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1032
In addition, one can extend the clauses to bounded repetitions
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1033
``for free'':
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1034
\begin{center}
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1035
	\begin{tabular}{lcl}
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1036
		$r^{\{n\}} \backslash c$     & $\dn$ & $r \backslash c \cdot
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1037
		r^{\{n-1\}}$\\
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1038
	\end{tabular}
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1039
\end{center}
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1040
\noindent
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1041
And experimental results suggest that  unlike DFA-based solutions,
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1042
this derivatives can support 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1043
bounded regular expressions with large counters
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1044
quite well.
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1045
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1046
There has also been 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1047
extensions to other constructs.
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1048
For example, Owens et al include the derivatives
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1049
for \emph{NOT} regular expressions, which is
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1050
able to concisely express C-style comments of the form
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1051
$/* \ldots */$.
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1052
Another extension for derivatives would be
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1053
regular expressions with look-aheads, done by
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1054
by Miyazaki and Minamide
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1055
\cite{Takayuki2019}.
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1056
%We therefore use Brzozowski derivatives on regular expressions 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1057
%lexing 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1058
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1059
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1060
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1061
Given the above definitions and properties of
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1062
Brzozowski derivatives, one quickly realises their potential
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1063
in generating a formally verified algorithm for lexing--the clauses and property
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1064
can be easily expressed in a functional programming language 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1065
or converted to theorem prover
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1066
code, with great extensibility.
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1067
Perhaps this is the reason why it has sparked quite a bit of interest
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1068
in the functional programming and theorem prover communities in the last
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1069
fifteen or so years (
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1070
\cite{Almeidaetal10}, \cite{Berglund14}, \cite{Berglund18},
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1071
\cite{Chen12} and \cite{Coquand2012}
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1072
to name a few), despite being buried in the ``sands of time'' \cite{Owens2008}
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1073
after they were first published.
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1074
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1075
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1076
However, there are two difficulties with derivative-based matchers:
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1077
First, Brzozowski's original matcher only generates a yes/no answer
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1078
for whether a regular expression matches a string or not.  This is too
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1079
little information in the context of lexing where separate tokens must
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1080
be identified and also classified (for example as keywords
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1081
or identifiers). 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1082
Second, derivative-based matchers need to be more efficient.
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1083
Elegant and beautiful
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1084
as many implementations are,
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1085
they can be excruciatingly slow. 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1086
For example, Sulzmann and Lu
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1087
claim a linear running time of their proposed algorithm,
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1088
but that was falsified by our experiments. The running time 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1089
is actually $\Omega(2^n)$ in the worst case.
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1090
A similar claim about a theoretical runtime of $O(n^2)$ 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1091
is made for the Verbatim \cite{Verbatim}
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1092
%TODO: give references
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1093
lexer, which calculates POSIX matches and is based on derivatives.
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1094
They formalized the correctness of the lexer, but not the complexity.
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1095
In the performance evaluation section, they simply analyzed the run time
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1096
of matching $a$ with the string 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1097
\begin{center}
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1098
	$\underbrace{a \ldots a}_{\text{n a's}}$
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1099
\end{center}
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1100
and concluded that the algorithm is quadratic in terms of input length.
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1101
When we tried out their extracted OCaml code with our example $(a+aa)^*$,
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1102
the time it took to lex only 40 $a$'s was 5 minutes.
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1103
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1104
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1105
\subsection{Sulzmann and Lu's Algorithm}
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1106
Sulzmann and Lu~\cite{Sulzmann2014} overcame the first 
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1107
difficulty by cleverly extending Brzozowski's matching
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1108
algorithm. Their extended version generates additional information on
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1109
\emph{how} a regular expression matches a string following the POSIX
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1110
rules for regular expression matching. They achieve this by adding a
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1111
second ``phase'' to Brzozowski's algorithm involving an injection
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1112
function simplification of internal data structures 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1113
eliminating the exponential behaviours.
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1114
In an earlier work, Ausaf et al provided the formal
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1115
specification of what POSIX matching means and proved in Isabelle/HOL
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1116
the correctness
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1117
of Sulzmann and Lu's extended algorithm accordingly
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1118
\cite{AusafDyckhoffUrban2016}.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1119
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1120
The version of the algorithm proven correct 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1121
suffers from the
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1122
second difficulty though, where the internal derivatives can
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1123
grow to arbitrarily big sizes. 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1124
For example if we start with the
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1125
regular expression $(a+aa)^*$ and take
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1126
successive derivatives according to the character $a$, we end up with
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1127
a sequence of ever-growing derivatives like 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1128
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1129
\def\ll{\stackrel{\_\backslash{} a}{\longrightarrow}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1130
\begin{center}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1131
\begin{tabular}{rll}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1132
$(a + aa)^*$ & $\ll$ & $(\ONE + \ONE{}a) \cdot (a + aa)^*$\\
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1133
& $\ll$ & $(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* \;+\; (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1134
& $\ll$ & $(\ZERO + \ZERO{}a + \ZERO) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^* \;+\; $\\
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1135
& & $\qquad(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1136
& $\ll$ & \ldots \hspace{15mm}(regular expressions of sizes 98, 169, 283, 468, 767, \ldots)
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1137
\end{tabular}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1138
\end{center}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1139
 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1140
\noindent where after around 35 steps we run out of memory on a
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1141
typical computer (we shall define in the next chapter 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1142
the precise details of our
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1143
regular expressions and the derivative operation).  Clearly, the
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1144
notation involving $\ZERO$s and $\ONE$s already suggests
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1145
simplification rules that can be applied to regular regular
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1146
expressions, for example $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1147
\Rightarrow r$, $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1148
r$. While such simple-minded simplifications have been proved in our
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1149
earlier work to preserve the correctness of Sulzmann and Lu's
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1150
algorithm \cite{AusafDyckhoffUrban2016}, they unfortunately do
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1151
\emph{not} help with limiting the growth of the derivatives shown
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1152
above: the growth is slowed, but the derivatives can still grow rather
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1153
quickly beyond any finite bound.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1154
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1155
Sulzmann and Lu overcome this ``growth problem'' in a second algorithm
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1156
\cite{Sulzmann2014} where they introduce bit-coded
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1157
regular expressions. In this version, POSIX values are
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1158
represented as bit sequences and such sequences are incrementally generated
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1159
when derivatives are calculated. The compact representation
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1160
of bit sequences and regular expressions allows them to define a more
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1161
``aggressive'' simplification method that keeps the size of the
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1162
derivatives finite no matter what the length of the string is.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1163
They make some informal claims about the correctness and linear behaviour
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1164
of this version, but do not provide any supporting proof arguments, not
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1165
even ``pencil-and-paper'' arguments. They write about their bit-coded
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1166
\emph{incremental parsing method} (that is the algorithm to be formalised
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1167
in this dissertation)
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1168
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1169
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1170
  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1171
  \begin{quote}\it
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1172
  ``Correctness Claim: We further claim that the incremental parsing
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1173
  method [..] in combination with the simplification steps [..]
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1174
  yields POSIX parse trees. We have tested this claim
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1175
  extensively [..] but yet
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1176
  have to work out all proof details.'' \cite[Page 14]{Sulzmann2014}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1177
\end{quote}  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1178
Ausaf and Urban were able to back this correctness claim with
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1179
a formal proof.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1180
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1181
However a faster formally verified 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1182
lexing program with the optimisations
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1183
mentioned by Sulzmann and Lu's second algorithm
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1184
is still missing.
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1185
As they stated,
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1186
  \begin{quote}\it
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1187
``The next step would be to implement a more aggressive simplification procedure on annotated regular expressions and then prove the corresponding algorithm generates the same values as blexer. Alas due to time constraints we are unable to do so here.''
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1188
\end{quote}  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1189
This thesis implements the aggressive simplifications envisioned
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1190
by Ausaf and Urban,
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1191
together with a formal proof of the correctness with those simplifications.
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1192
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1193
612
Chengsong
parents: 609
diff changeset
  1194
One of the most recent work in the context of lexing
Chengsong
parents: 609
diff changeset
  1195
%with this issue
Chengsong
parents: 609
diff changeset
  1196
is the Verbatim lexer by Egolf, Lasser and Fisher\cite{Verbatim}.
Chengsong
parents: 609
diff changeset
  1197
This is relevant work for us and we will compare it later with 
Chengsong
parents: 609
diff changeset
  1198
our derivative-based matcher we are going to present.
Chengsong
parents: 609
diff changeset
  1199
There is also some newer work called
Chengsong
parents: 609
diff changeset
  1200
Verbatim++\cite{Verbatimpp}, which does not use derivatives, 
Chengsong
parents: 609
diff changeset
  1201
but deterministic finite automaton instead.
Chengsong
parents: 609
diff changeset
  1202
%An example that gives problem to automaton approaches would be
Chengsong
parents: 609
diff changeset
  1203
%the regular expression $(a|b)^*a(a|b)^{\{n\}}$.
Chengsong
parents: 609
diff changeset
  1204
%It requires at least $2^{n+1}$ states to represent
Chengsong
parents: 609
diff changeset
  1205
%as a DFA.
Chengsong
parents: 609
diff changeset
  1206
Chengsong
parents: 609
diff changeset
  1207
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1208
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1209
\section{Contribution}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1210
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1211
In this thesis,
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1212
we propose a solution to catastrophic
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1213
backtracking and error-prone matchers: a formally verified
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1214
regular expression lexing algorithm
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1215
that is both fast
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1216
and correct by extending Ausaf et al.'s work.
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1217
The end result is %a regular expression lexing algorithm that comes with 
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1218
\begin{itemize}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1219
\item
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1220
an improved version of  Sulzmann and Lu's bit-coded algorithm using 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1221
derivatives with simplifications, 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1222
accompanied by
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1223
a proven correctness theorem according to POSIX specification 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1224
given by Ausaf et al. \cite{AusafDyckhoffUrban2016}, 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1225
\item 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1226
a complexity-related property for that algorithm saying that the 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1227
internal data structure will
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1228
remain finite,
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1229
\item
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1230
and extension to
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1231
the bounded repetitions construct with the correctness and finiteness property
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1232
maintained.
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1233
 \end{itemize}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1234
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1235
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1236
With a formal finiteness bound in place,
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1237
we can greatly reduce the attack surface of servers in terms of ReDoS attacks.
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1238
Further improvements to the algorithm with an even stronger version of 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1239
simplification is made.
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1240
Thanks to our theorem-prover-friendly approach,
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1241
we believe that 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1242
this finiteness bound can be improved to a bound
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1243
linear to input and
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1244
cubic to the regular expression size using a technique by
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1245
Antimirov\cite{Antimirov95}.
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1246
Once formalised, this would be a guarantee for the absence of all super-linear behavious.
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1247
We are working out the
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1248
details.
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1249
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1250
 
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1251
To our best knowledge, no lexing libraries using Brzozowski derivatives
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1252
have similar complexity-related bounds, 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1253
and claims about running time are usually speculative and backed by empirical
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1254
evidence on a few test cases.
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1255
If a matching or lexing algorithm
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1256
does not come with certain basic complexity related 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1257
guarantees (for examaple the internal data structure size
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1258
does not grow indefinitely), 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1259
then they cannot claim with confidence having solved the problem
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1260
of catastrophic backtracking.
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1261
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1262
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1263
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1264
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1265
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1266
\section{Structure of the thesis}
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1267
In chapter 2 \ref{Inj} we will introduce the concepts
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1268
and notations we 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1269
use for describing the lexing algorithm by Sulzmann and Lu,
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1270
and then give the lexing algorithm.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1271
We will give its variant in \ref{Bitcoded1}.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1272
Then we illustrate in \ref{Bitcoded2}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1273
how the algorithm without bitcodes falls short for such aggressive 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1274
simplifications and therefore introduce our version of the
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1275
 bit-coded algorithm and 
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1276
its correctness proof .  
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1277
In \ref{Finite} we give the second guarantee
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1278
of our bitcoded algorithm, that is a finite bound on the size of any 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1279
regex's derivatives.
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1280
In \ref{Cubic} we discuss stronger simplifications to improve the finite bound
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1281
in \ref{Finite} to a polynomial one, and demonstrate how one can extend the
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1282
algorithm to include constructs such as bounded repetitions and negations.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1283
 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1284
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1285
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1286
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1287
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1288
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1289
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1290
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1291
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1292
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1293
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1294
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1295
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1296
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1297