ChengsongTanPhdThesis/Chapters/Introduction.tex
author Chengsong
Fri, 30 Sep 2022 01:47:33 +0100
changeset 607 e6fc9b72c0e3
parent 606 99b530103464
child 608 37b6fd310a16
permissions -rwxr-xr-x
chap1 almost done
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     1
% Chapter 1
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     2
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     3
\chapter{Introduction} % Main chapter title
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     4
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     5
\label{Introduction} % For referencing the chapter elsewhere, use \ref{Chapter1} 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     6
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     7
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     8
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     9
% Define some commands to keep the formatting separated from the content 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    10
\newcommand{\keyword}[1]{\textbf{#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    11
\newcommand{\tabhead}[1]{\textbf{#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    12
\newcommand{\code}[1]{\texttt{#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    13
\newcommand{\file}[1]{\texttt{\bfseries#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    14
\newcommand{\option}[1]{\texttt{\itshape#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    15
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    16
%boxes
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    17
\newcommand*{\mybox}[1]{\framebox{\strut #1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    18
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    19
%\newcommand{\sflataux}[1]{\textit{sflat}\_\textit{aux} \, #1}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    20
\newcommand\sflat[1]{\llparenthesis #1 \rrparenthesis }
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    21
\newcommand{\ASEQ}[3]{\textit{ASEQ}_{#1} \, #2 \, #3}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
    22
\newcommand{\bderssimp}[2]{#1 \backslash_{bsimps} #2}
596
b306628a0eab more chap 56
Chengsong
parents: 594
diff changeset
    23
\newcommand{\rderssimp}[2]{#1 \backslash_{rsimps} #2}
564
Chengsong
parents: 558
diff changeset
    24
\def\derssimp{\textit{ders}\_\textit{simp}}
557
812e5d112f49 more changes
Chengsong
parents: 556
diff changeset
    25
\def\rders{\textit{rders}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    26
\newcommand{\bders}[2]{#1 \backslash #2}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    27
\newcommand{\bsimp}[1]{\textit{bsimp}(#1)}
591
b2d0de6aee18 more polishing integrated comments chap2
Chengsong
parents: 590
diff changeset
    28
\def\bsimps{\textit{bsimp}}
554
Chengsong
parents: 543
diff changeset
    29
\newcommand{\rsimp}[1]{\textit{rsimp}\; #1}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    30
\newcommand{\sflataux}[1]{\llparenthesis #1 \rrparenthesis'}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    31
\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    32
\newcommand{\denote}{\stackrel{\mbox{\scriptsize denote}}{=}}%
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    33
\newcommand{\ZERO}{\mbox{\bf 0}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    34
\newcommand{\ONE}{\mbox{\bf 1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    35
\newcommand{\AALTS}[2]{\oplus {\scriptstyle #1}\, #2}
555
Chengsong
parents: 554
diff changeset
    36
\newcommand{\rdistinct}[2]{\textit{rdistinct} \;\; #1 \;\; #2}
594
Chengsong
parents: 591
diff changeset
    37
\def\rdistincts{\textit{rdistinct}}
556
Chengsong
parents: 555
diff changeset
    38
\def\rDistinct{\textit{rdistinct}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    39
\newcommand\hflat[1]{\llparenthesis  #1 \rrparenthesis_*}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    40
\newcommand\hflataux[1]{\llparenthesis #1 \rrparenthesis_*'}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    41
\newcommand\createdByStar[1]{\textit{createdByStar}(#1)}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    42
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    43
\newcommand\myequiv{\mathrel{\stackrel{\makebox[0pt]{\mbox{\normalfont\tiny equiv}}}{=}}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    44
600
fd068f39ac23 chap4 comments done
Chengsong
parents: 596
diff changeset
    45
\def\SEQ{\textit{SEQ}}
fd068f39ac23 chap4 comments done
Chengsong
parents: 596
diff changeset
    46
\def\SEQs{\textit{SEQs}}
564
Chengsong
parents: 558
diff changeset
    47
\def\case{\textit{case}}
554
Chengsong
parents: 543
diff changeset
    48
\def\sequal{\stackrel{\mbox{\scriptsize rsimp}}{=}}
Chengsong
parents: 543
diff changeset
    49
\def\rsimpalts{\textit{rsimp}_{ALTS}}
Chengsong
parents: 543
diff changeset
    50
\def\good{\textit{good}}
Chengsong
parents: 543
diff changeset
    51
\def\btrue{\textit{true}}
Chengsong
parents: 543
diff changeset
    52
\def\bfalse{\textit{false}}
542
a7344c9afbaf chapter3 finished
Chengsong
parents: 538
diff changeset
    53
\def\bnullable{\textit{bnullable}}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
    54
\def\bnullables{\textit{bnullables}}
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
    55
\def\Some{\textit{Some}}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
    56
\def\None{\textit{None}}
537
Chengsong
parents: 532
diff changeset
    57
\def\code{\textit{code}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    58
\def\decode{\textit{decode}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    59
\def\internalise{\textit{internalise}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    60
\def\lexer{\mathit{lexer}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    61
\def\mkeps{\textit{mkeps}}
557
812e5d112f49 more changes
Chengsong
parents: 556
diff changeset
    62
\newcommand{\rder}[2]{#2 \backslash_r #1}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    63
585
4969ef817d92 chap4 more
Chengsong
parents: 579
diff changeset
    64
\def\rerases{\textit{rerase}}
4969ef817d92 chap4 more
Chengsong
parents: 579
diff changeset
    65
554
Chengsong
parents: 543
diff changeset
    66
\def\nonnested{\textit{nonnested}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    67
\def\AZERO{\textit{AZERO}}
558
Chengsong
parents: 557
diff changeset
    68
\def\sizeNregex{\textit{sizeNregex}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    69
\def\AONE{\textit{AONE}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    70
\def\ACHAR{\textit{ACHAR}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    71
585
4969ef817d92 chap4 more
Chengsong
parents: 579
diff changeset
    72
\def\simpsulz{\textit{simp}_{Sulz}}
4969ef817d92 chap4 more
Chengsong
parents: 579
diff changeset
    73
557
812e5d112f49 more changes
Chengsong
parents: 556
diff changeset
    74
\def\scfrewrites{\stackrel{*}{\rightsquigarrow_{scf}}}
555
Chengsong
parents: 554
diff changeset
    75
\def\frewrite{\rightsquigarrow_f}
Chengsong
parents: 554
diff changeset
    76
\def\hrewrite{\rightsquigarrow_h}
Chengsong
parents: 554
diff changeset
    77
\def\grewrite{\rightsquigarrow_g}
Chengsong
parents: 554
diff changeset
    78
\def\frewrites{\stackrel{*}{\rightsquigarrow_f}}
Chengsong
parents: 554
diff changeset
    79
\def\hrewrites{\stackrel{*}{\rightsquigarrow_h}}
Chengsong
parents: 554
diff changeset
    80
\def\grewrites{\stackrel{*}{\rightsquigarrow_g}}
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
    81
\def\fuse{\textit{fuse}}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
    82
\def\bder{\textit{bder}}
542
a7344c9afbaf chapter3 finished
Chengsong
parents: 538
diff changeset
    83
\def\der{\textit{der}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    84
\def\POSIX{\textit{POSIX}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    85
\def\ALTS{\textit{ALTS}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    86
\def\ASTAR{\textit{ASTAR}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    87
\def\DFA{\textit{DFA}}
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
    88
\def\NFA{\textit{NFA}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    89
\def\bmkeps{\textit{bmkeps}}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
    90
\def\bmkepss{\textit{bmkepss}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    91
\def\retrieve{\textit{retrieve}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    92
\def\blexer{\textit{blexer}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    93
\def\flex{\textit{flex}}
573
454ced557605 chapter2 finished polishing
Chengsong
parents: 567
diff changeset
    94
\def\inj{\textit{inj}}
564
Chengsong
parents: 558
diff changeset
    95
\def\Empty{\textit{Empty}}
567
28cb8089ec36 more updaates
Chengsong
parents: 564
diff changeset
    96
\def\Left{\textit{Left}}
28cb8089ec36 more updaates
Chengsong
parents: 564
diff changeset
    97
\def\Right{\textit{Right}}
573
454ced557605 chapter2 finished polishing
Chengsong
parents: 567
diff changeset
    98
\def\Stars{\textit{Stars}}
454ced557605 chapter2 finished polishing
Chengsong
parents: 567
diff changeset
    99
\def\Char{\textit{Char}}
454ced557605 chapter2 finished polishing
Chengsong
parents: 567
diff changeset
   100
\def\Seq{\textit{Seq}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   101
\def\Der{\textit{Der}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   102
\def\Ders{\textit{Ders}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   103
\def\nullable{\mathit{nullable}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   104
\def\Z{\mathit{Z}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   105
\def\S{\mathit{S}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   106
\def\rup{r^\uparrow}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   107
%\def\bderssimp{\mathit{bders}\_\mathit{simp}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   108
\def\distinctWith{\textit{distinctWith}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   109
\def\lf{\textit{lf}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   110
\def\PD{\textit{PD}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   111
\def\suffix{\textit{Suffix}}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   112
\def\distinctBy{\textit{distinctBy}}
558
Chengsong
parents: 557
diff changeset
   113
\def\starupdate{\textit{starUpdate}}
Chengsong
parents: 557
diff changeset
   114
\def\starupdates{\textit{starUpdates}}
Chengsong
parents: 557
diff changeset
   115
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   116
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   117
\def\size{\mathit{size}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   118
\def\rexp{\mathbf{rexp}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   119
\def\simp{\mathit{simp}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   120
\def\simpALTs{\mathit{simp}\_\mathit{ALTs}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   121
\def\map{\mathit{map}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   122
\def\distinct{\mathit{distinct}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   123
\def\blexersimp{\mathit{blexer}\_\mathit{simp}}
590
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   124
\def\blexerStrong{\textit{blexerStrong}}
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   125
\def\bsimpStrong{\textit{bsimpStrong}}
591
b2d0de6aee18 more polishing integrated comments chap2
Chengsong
parents: 590
diff changeset
   126
\def\bdersStrongs{\textit{bdersStrong}}
590
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   127
\newcommand{\bdersStrong}[2]{#1 \backslash_{bsimpStrongs} #2}
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   128
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   129
\def\map{\textit{map}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   130
\def\rrexp{\textit{rrexp}}
554
Chengsong
parents: 543
diff changeset
   131
\newcommand\rnullable[1]{\textit{rnullable} \; #1 }
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   132
\newcommand\rsize[1]{\llbracket #1 \rrbracket_r}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   133
\newcommand\asize[1]{\llbracket #1 \rrbracket}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   134
\newcommand\rerase[1]{ (#1)_{\downarrow_r}}
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   135
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   136
\newcommand\ChristianComment[1]{\textcolor{blue}{#1}\\}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   137
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   138
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   139
\def\rflts{\textit{rflts}}
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   140
\def\rrewrite{\textit{rrewrite}}
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   141
\def\bsimpalts{\textit{bsimp}_{ALTS}}
596
b306628a0eab more chap 56
Chengsong
parents: 594
diff changeset
   142
\def\bsimpaseq{\textit{bsimp}_{ASEQ}}
b306628a0eab more chap 56
Chengsong
parents: 594
diff changeset
   143
\def\rsimlalts{\textit{rsimp}_{ALTs}}
b306628a0eab more chap 56
Chengsong
parents: 594
diff changeset
   144
\def\rsimpseq{\textit{rsimp}_{SEQ}}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   145
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   146
\def\erase{\textit{erase}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   147
\def\STAR{\textit{STAR}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   148
\def\flts{\textit{flts}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   149
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   150
579
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   151
\def\zeroable{\textit{zeroable}}
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   152
\def\nub{\textit{nub}}
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   153
\def\filter{\textit{filter}}
601
Chengsong
parents: 600
diff changeset
   154
%\def\not{\textit{not}}
579
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   155
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   156
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   157
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   158
\def\RZERO{\mathbf{0}_r }
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   159
\def\RONE{\mathbf{1}_r}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   160
\newcommand\RCHAR[1]{\mathbf{#1}_r}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   161
\newcommand\RSEQ[2]{#1 \cdot #2}
558
Chengsong
parents: 557
diff changeset
   162
\newcommand\RALTS[1]{\sum #1}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   163
\newcommand\RSTAR[1]{#1^*}
558
Chengsong
parents: 557
diff changeset
   164
\newcommand\vsuf[2]{\textit{Suffix} \;#1\;#2}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   165
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   166
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   167
590
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   168
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   169
\lstdefinestyle{myScalastyle}{
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   170
  frame=tb,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   171
  language=scala,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   172
  aboveskip=3mm,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   173
  belowskip=3mm,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   174
  showstringspaces=false,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   175
  columns=flexible,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   176
  basicstyle={\small\ttfamily},
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   177
  numbers=none,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   178
  numberstyle=\tiny\color{gray},
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   179
  keywordstyle=\color{blue},
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   180
  commentstyle=\color{dkgreen},
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   181
  stringstyle=\color{mauve},
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   182
  frame=single,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   183
  breaklines=true,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   184
  breakatwhitespace=true,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   185
  tabsize=3,
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   186
}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   187
590
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   188
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   189
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   190
%This part is about regular expressions, Brzozowski derivatives,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   191
%and a bit-coded lexing algorithm with proven correctness and time bounds.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   192
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   193
%TODO: look up snort rules to use here--give readers idea of what regexes look like
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   194
601
Chengsong
parents: 600
diff changeset
   195
Chengsong
parents: 600
diff changeset
   196
Chengsong
parents: 600
diff changeset
   197
Chengsong
parents: 600
diff changeset
   198
Chengsong
parents: 600
diff changeset
   199
Chengsong
parents: 600
diff changeset
   200
Regular expressions are widely used in computer science: 
Chengsong
parents: 600
diff changeset
   201
be it in text-editors \parencite{atomEditor} with syntax highlighting and auto-completion;
Chengsong
parents: 600
diff changeset
   202
command-line tools like $\mathit{grep}$ that facilitate easy 
Chengsong
parents: 600
diff changeset
   203
text-processing; network intrusion
Chengsong
parents: 600
diff changeset
   204
detection systems that reject suspicious traffic; or compiler
Chengsong
parents: 600
diff changeset
   205
front ends--the majority of the solutions to these tasks 
Chengsong
parents: 600
diff changeset
   206
involve lexing with regular 
Chengsong
parents: 600
diff changeset
   207
expressions.
Chengsong
parents: 600
diff changeset
   208
Given its usefulness and ubiquity, one would imagine that
Chengsong
parents: 600
diff changeset
   209
modern regular expression matching implementations
Chengsong
parents: 600
diff changeset
   210
are mature and fully studied.
602
Chengsong
parents: 601
diff changeset
   211
Indeed, in a popular programming language's regex engine, 
Chengsong
parents: 601
diff changeset
   212
supplying it with regular expressions and strings,
Chengsong
parents: 601
diff changeset
   213
in most cases one can
Chengsong
parents: 601
diff changeset
   214
get the matching information in a very short time.
Chengsong
parents: 601
diff changeset
   215
Those matchers can be blindingly fast--some 
Chengsong
parents: 601
diff changeset
   216
network intrusion detection systems
601
Chengsong
parents: 600
diff changeset
   217
use regex engines that are able to process 
Chengsong
parents: 600
diff changeset
   218
megabytes or even gigabytes of data per second \parencite{Turo_ov__2020}.
602
Chengsong
parents: 601
diff changeset
   219
However, those matchers can exhibit a surprising security vulnerability
Chengsong
parents: 601
diff changeset
   220
under a certain class of inputs.
Chengsong
parents: 601
diff changeset
   221
%However, , this is not the case for $\mathbf{all}$ inputs.
601
Chengsong
parents: 600
diff changeset
   222
%TODO: get source for SNORT/BRO's regex matching engine/speed
Chengsong
parents: 600
diff changeset
   223
603
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   224
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   225
Take $(a^*)^*\,b$ and ask whether
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   226
strings of the form $aa..a$ match this regular
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   227
expression. Obviously this is not the case---the expected $b$ in the last
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   228
position is missing. One would expect that modern regular expression
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   229
matching engines can find this out very quickly. Alas, if one tries
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   230
this example in JavaScript, Python or Java 8, even with strings of a small
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   231
length, say around 30 $a$'s,
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   232
the decision takes crazy time to finish (graph \ref{fig:aStarStarb}).
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   233
This is clearly exponential behaviour, and 
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   234
is triggered by some relatively simple regex patterns.
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   235
Java 9 and newer
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   236
versions improves this behaviour, but is still slow compared 
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   237
with the approach we are going to use.
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   238
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   239
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   240
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   241
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   242
This superlinear blowup in regular expression engines
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   243
had repeatedly caused grief in real life that they
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   244
get a name for them--``catastrophic backtracking''.
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   245
For example, on 20 July 2016 one evil
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   246
regular expression brought the webpage
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   247
\href{http://stackexchange.com}{Stack Exchange} to its
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   248
knees.\footnote{\url{https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}(Last accessed in 2019)}
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   249
In this instance, a regular expression intended to just trim white
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   250
spaces from the beginning and the end of a line actually consumed
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   251
massive amounts of CPU resources---causing web servers to grind to a
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   252
halt. In this example, the time needed to process
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   253
the string was $O(n^2)$ with respect to the string length. This
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   254
quadratic overhead was enough for the homepage of Stack Exchange to
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   255
respond so slowly that the load balancer assumed a $\mathit{DoS}$ 
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   256
attack and therefore stopped the servers from responding to any
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   257
requests. This made the whole site become unavailable. 
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   258
601
Chengsong
parents: 600
diff changeset
   259
\begin{figure}[p]
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   260
\begin{tabular}{@{}c@{\hspace{0mm}}c@{\hspace{0mm}}c@{}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   261
\begin{tikzpicture}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   262
\begin{axis}[
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   263
    xlabel={$n$},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   264
    x label style={at={(1.05,-0.05)}},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   265
    ylabel={time in secs},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   266
    enlargelimits=false,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   267
    xtick={0,5,...,30},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   268
    xmax=33,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   269
    ymax=35,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   270
    ytick={0,5,...,30},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   271
    scaled ticks=false,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   272
    axis lines=left,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   273
    width=5cm,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   274
    height=4cm, 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   275
    legend entries={JavaScript},  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   276
    legend pos=north west,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   277
    legend cell align=left]
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   278
\addplot[red,mark=*, mark options={fill=white}] table {re-js.data};
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   279
\end{axis}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   280
\end{tikzpicture}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   281
  &
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   282
\begin{tikzpicture}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   283
\begin{axis}[
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   284
    xlabel={$n$},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   285
    x label style={at={(1.05,-0.05)}},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   286
    %ylabel={time in secs},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   287
    enlargelimits=false,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   288
    xtick={0,5,...,30},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   289
    xmax=33,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   290
    ymax=35,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   291
    ytick={0,5,...,30},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   292
    scaled ticks=false,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   293
    axis lines=left,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   294
    width=5cm,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   295
    height=4cm, 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   296
    legend entries={Python},  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   297
    legend pos=north west,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   298
    legend cell align=left]
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   299
\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   300
\end{axis}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   301
\end{tikzpicture}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   302
  &
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   303
\begin{tikzpicture}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   304
\begin{axis}[
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   305
    xlabel={$n$},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   306
    x label style={at={(1.05,-0.05)}},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   307
    %ylabel={time in secs},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   308
    enlargelimits=false,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   309
    xtick={0,5,...,30},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   310
    xmax=33,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   311
    ymax=35,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   312
    ytick={0,5,...,30},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   313
    scaled ticks=false,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   314
    axis lines=left,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   315
    width=5cm,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   316
    height=4cm, 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   317
    legend entries={Java 8},  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   318
    legend pos=north west,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   319
    legend cell align=left]
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   320
\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   321
\end{axis}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   322
\end{tikzpicture}\\
601
Chengsong
parents: 600
diff changeset
   323
\begin{tikzpicture}
Chengsong
parents: 600
diff changeset
   324
\begin{axis}[
Chengsong
parents: 600
diff changeset
   325
    xlabel={$n$},
Chengsong
parents: 600
diff changeset
   326
    x label style={at={(1.05,-0.05)}},
Chengsong
parents: 600
diff changeset
   327
    ylabel={time in secs},
Chengsong
parents: 600
diff changeset
   328
    enlargelimits=false,
Chengsong
parents: 600
diff changeset
   329
    xtick={0,5,...,30},
Chengsong
parents: 600
diff changeset
   330
    xmax=33,
Chengsong
parents: 600
diff changeset
   331
    ymax=35,
Chengsong
parents: 600
diff changeset
   332
    ytick={0,5,...,30},
Chengsong
parents: 600
diff changeset
   333
    scaled ticks=false,
Chengsong
parents: 600
diff changeset
   334
    axis lines=left,
Chengsong
parents: 600
diff changeset
   335
    width=5cm,
Chengsong
parents: 600
diff changeset
   336
    height=4cm, 
Chengsong
parents: 600
diff changeset
   337
    legend entries={Dart},  
Chengsong
parents: 600
diff changeset
   338
    legend pos=north west,
Chengsong
parents: 600
diff changeset
   339
    legend cell align=left]
Chengsong
parents: 600
diff changeset
   340
\addplot[green,mark=*, mark options={fill=white}] table {re-dart.data};
Chengsong
parents: 600
diff changeset
   341
\end{axis}
Chengsong
parents: 600
diff changeset
   342
\end{tikzpicture}
Chengsong
parents: 600
diff changeset
   343
  &
Chengsong
parents: 600
diff changeset
   344
\begin{tikzpicture}
Chengsong
parents: 600
diff changeset
   345
\begin{axis}[
Chengsong
parents: 600
diff changeset
   346
    xlabel={$n$},
Chengsong
parents: 600
diff changeset
   347
    x label style={at={(1.05,-0.05)}},
Chengsong
parents: 600
diff changeset
   348
    %ylabel={time in secs},
Chengsong
parents: 600
diff changeset
   349
    enlargelimits=false,
Chengsong
parents: 600
diff changeset
   350
    xtick={0,5,...,30},
Chengsong
parents: 600
diff changeset
   351
    xmax=33,
Chengsong
parents: 600
diff changeset
   352
    ymax=35,
Chengsong
parents: 600
diff changeset
   353
    ytick={0,5,...,30},
Chengsong
parents: 600
diff changeset
   354
    scaled ticks=false,
Chengsong
parents: 600
diff changeset
   355
    axis lines=left,
Chengsong
parents: 600
diff changeset
   356
    width=5cm,
Chengsong
parents: 600
diff changeset
   357
    height=4cm, 
Chengsong
parents: 600
diff changeset
   358
    legend entries={Swift},  
Chengsong
parents: 600
diff changeset
   359
    legend pos=north west,
Chengsong
parents: 600
diff changeset
   360
    legend cell align=left]
Chengsong
parents: 600
diff changeset
   361
\addplot[purple,mark=*, mark options={fill=white}] table {re-swift.data};
Chengsong
parents: 600
diff changeset
   362
\end{axis}
Chengsong
parents: 600
diff changeset
   363
\end{tikzpicture}
Chengsong
parents: 600
diff changeset
   364
  & \\
Chengsong
parents: 600
diff changeset
   365
\multicolumn{3}{c}{Graphs}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   366
\end{tabular}    
601
Chengsong
parents: 600
diff changeset
   367
\caption{Graphs showing runtime for matching $(a^*)^*\,b$ with strings 
Chengsong
parents: 600
diff changeset
   368
           of the form $\protect\underbrace{aa..a}_{n}$ in various existing regular expression libraries.
Chengsong
parents: 600
diff changeset
   369
   The reason for their superlinear behaviour is that they do a depth-first-search.
Chengsong
parents: 600
diff changeset
   370
   If the string does not match, the engine starts to explore all possibilities. 
Chengsong
parents: 600
diff changeset
   371
}\label{fig:aStarStarb}
Chengsong
parents: 600
diff changeset
   372
\end{figure}\afterpage{\clearpage}
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   373
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   374
A more recent example is a global outage of all Cloudflare servers on 2 July
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   375
2019. A poorly written regular expression exhibited exponential
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   376
behaviour and exhausted CPUs that serve HTTP traffic. Although the outage
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   377
had several causes, at the heart was a regular expression that
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   378
was used to monitor network
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   379
traffic.\footnote{\url{https://blog.cloudflare.com/details-of-the-cloudflare-outage-on-july-2-2019/}(Last accessed in 2022)}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   380
These problems with regular expressions 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   381
are not isolated events that happen
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   382
very occasionally, but actually widespread.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   383
They occur so often that they get a 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   384
name--Regular-Expression-Denial-Of-Service (ReDoS)
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   385
attack.
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   386
\citeauthor{Davis18} detected more
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   387
than 1000 super-linear (SL) regular expressions
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   388
in Node.js, Python core libraries, and npm and pypi. 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   389
They therefore concluded that evil regular expressions
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   390
are problems "more than a parlour trick", but one that
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   391
requires
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   392
more research attention.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   393
603
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   394
This work aims to address this issue
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   395
with the help of formal proofs.
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   396
We offer a lexing algorithm based
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   397
on Brzozowski derivatives with certified correctness (in 
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   398
Isabelle/HOL)
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   399
and finiteness property.
604
Chengsong
parents: 603
diff changeset
   400
Such properties guarantee the absence of 
603
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   401
catastrophic backtracking in most cases.
604
Chengsong
parents: 603
diff changeset
   402
We will give more details in the next sections
Chengsong
parents: 603
diff changeset
   403
on (i) why the slow cases in graph \ref{fig:aStarStarb}
Chengsong
parents: 603
diff changeset
   404
can occur
Chengsong
parents: 603
diff changeset
   405
and (ii) why we choose our 
Chengsong
parents: 603
diff changeset
   406
approach (Brzozowski derivatives and formal proofs).
602
Chengsong
parents: 601
diff changeset
   407
603
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   408
605
Chengsong
parents: 604
diff changeset
   409
\section{Regex, and the Problems with Regex Matchers}
601
Chengsong
parents: 600
diff changeset
   410
Regular expressions and regular expression matchers 
Chengsong
parents: 600
diff changeset
   411
have of course been studied for many, many years.
605
Chengsong
parents: 604
diff changeset
   412
Theoretical results in automata theory say
604
Chengsong
parents: 603
diff changeset
   413
that basic regular expression matching should be linear
605
Chengsong
parents: 604
diff changeset
   414
w.r.t the input.
Chengsong
parents: 604
diff changeset
   415
This assumes that the regular expression
Chengsong
parents: 604
diff changeset
   416
$r$ was pre-processed and turned into a
Chengsong
parents: 604
diff changeset
   417
deterministic finite automata (DFA) before matching,
Chengsong
parents: 604
diff changeset
   418
which could be exponential\cite{Sakarovitch2009}.
604
Chengsong
parents: 603
diff changeset
   419
By basic we mean textbook definitions such as the one
Chengsong
parents: 603
diff changeset
   420
below, involving only characters, alternatives,
Chengsong
parents: 603
diff changeset
   421
sequences, and Kleene stars:
Chengsong
parents: 603
diff changeset
   422
\[
Chengsong
parents: 603
diff changeset
   423
	r ::= \ZERO | \ONE | c | r_1 + r_2 | r_1 \cdot r_2 | r^*
Chengsong
parents: 603
diff changeset
   424
\]
Chengsong
parents: 603
diff changeset
   425
Modern regular expression matchers used by programmers,
Chengsong
parents: 603
diff changeset
   426
however,
Chengsong
parents: 603
diff changeset
   427
support richer constructs such as bounded repetitions
Chengsong
parents: 603
diff changeset
   428
and back-references.
605
Chengsong
parents: 604
diff changeset
   429
To differentiate, people use the word \emph{regex} to refer
Chengsong
parents: 604
diff changeset
   430
to those expressions with richer constructs while reserving the
Chengsong
parents: 604
diff changeset
   431
term \emph{regular expression}
Chengsong
parents: 604
diff changeset
   432
for the more traditional meaning in formal languages theory.
Chengsong
parents: 604
diff changeset
   433
We follow this convention 
Chengsong
parents: 604
diff changeset
   434
in this thesis.
Chengsong
parents: 604
diff changeset
   435
In the future, we aim to support all the popular features of regexes, 
604
Chengsong
parents: 603
diff changeset
   436
but for this work we mainly look at regular expressions.
Chengsong
parents: 603
diff changeset
   437
605
Chengsong
parents: 604
diff changeset
   438
Chengsong
parents: 604
diff changeset
   439
Chengsong
parents: 604
diff changeset
   440
%Most modern regex libraries
Chengsong
parents: 604
diff changeset
   441
%the so-called PCRE standard (Peral Compatible Regular Expressions)
Chengsong
parents: 604
diff changeset
   442
%has the back-references
604
Chengsong
parents: 603
diff changeset
   443
Regexes come with a lot of constructs
605
Chengsong
parents: 604
diff changeset
   444
beyond the basic ones
Chengsong
parents: 604
diff changeset
   445
that make it more convenient for 
604
Chengsong
parents: 603
diff changeset
   446
programmers to write regular expressions.
605
Chengsong
parents: 604
diff changeset
   447
Depending on the types of these constructs
Chengsong
parents: 604
diff changeset
   448
the task of matching and lexing with them
Chengsong
parents: 604
diff changeset
   449
will have different levels of complexity increase.
604
Chengsong
parents: 603
diff changeset
   450
Some of those constructs are syntactic sugars that are
Chengsong
parents: 603
diff changeset
   451
simply short hand notations
605
Chengsong
parents: 604
diff changeset
   452
that save the programmers a few keystrokes.
Chengsong
parents: 604
diff changeset
   453
These will not cause trouble for regex libraries.
Chengsong
parents: 604
diff changeset
   454
Chengsong
parents: 604
diff changeset
   455
\noindent
Chengsong
parents: 604
diff changeset
   456
For example the
604
Chengsong
parents: 603
diff changeset
   457
non-binary alternative involving three or more choices:
Chengsong
parents: 603
diff changeset
   458
\[
605
Chengsong
parents: 604
diff changeset
   459
	(a | b | c) \stackrel{means}{=} ((a + b)+ c)
604
Chengsong
parents: 603
diff changeset
   460
\]
605
Chengsong
parents: 604
diff changeset
   461
the range operator $-$ used to express the alternative
Chengsong
parents: 604
diff changeset
   462
of all characters between its operands in a concise way:
604
Chengsong
parents: 603
diff changeset
   463
\[
605
Chengsong
parents: 604
diff changeset
   464
	[0~-9]\stackrel{means}{=} (0 | 1 | \ldots | 9 ) \; \text{(all number digits)}
604
Chengsong
parents: 603
diff changeset
   465
\]
605
Chengsong
parents: 604
diff changeset
   466
and the
Chengsong
parents: 604
diff changeset
   467
wildcard character $.$ used to refer to any single character:
Chengsong
parents: 604
diff changeset
   468
\[
Chengsong
parents: 604
diff changeset
   469
	. \stackrel{means}{=} [0-9a-zA-Z+-()*\&\ldots]
Chengsong
parents: 604
diff changeset
   470
\]
604
Chengsong
parents: 603
diff changeset
   471
605
Chengsong
parents: 604
diff changeset
   472
\noindent
Chengsong
parents: 604
diff changeset
   473
\subsection{Bounded Repetitions}
Chengsong
parents: 604
diff changeset
   474
Some of those constructs do make the expressions much
Chengsong
parents: 604
diff changeset
   475
more compact.
Chengsong
parents: 604
diff changeset
   476
For example, the bounded regular expressions
Chengsong
parents: 604
diff changeset
   477
(where $n$ and $m$ are constant natural numbers)
Chengsong
parents: 604
diff changeset
   478
$r^{\{n\}}$, $r^{\{\ldots m\}}$, $r^{\{n\ldots \}}$ and $r^{\{n\ldots m\}}$,
Chengsong
parents: 604
diff changeset
   479
defined as 
Chengsong
parents: 604
diff changeset
   480
\begin{center}
Chengsong
parents: 604
diff changeset
   481
	\begin{tabular}{lcl}
Chengsong
parents: 604
diff changeset
   482
		$L \; r^{\{n\}}$ & $\dn$ & $(L \; r)^n$\\
Chengsong
parents: 604
diff changeset
   483
		$L \; r^{\{\ldots m\}}$ & $\dn$ & $\bigcup_{0 \leq i \leq m}. (L \; r)^i$\\
Chengsong
parents: 604
diff changeset
   484
		$L \; r^{\{n\ldots \}}$ & $\dn$ & $\bigcup_{n \leq i}. (L \; r)^i$\\
Chengsong
parents: 604
diff changeset
   485
		$L \; r^{\{n \ldots m\}}$ & $\dn$ & $\bigcup_{n \leq i \leq m}. (L \; r)^i$
Chengsong
parents: 604
diff changeset
   486
	\end{tabular}
Chengsong
parents: 604
diff changeset
   487
\end{center}
Chengsong
parents: 604
diff changeset
   488
are exponentially smaller compared with
Chengsong
parents: 604
diff changeset
   489
their unfolded form: for example $r^{\{n\}}$
Chengsong
parents: 604
diff changeset
   490
as opposed to
Chengsong
parents: 604
diff changeset
   491
\[
Chengsong
parents: 604
diff changeset
   492
	\underbrace{r\ldots r}_\text{n copies of r}.
Chengsong
parents: 604
diff changeset
   493
\]
Chengsong
parents: 604
diff changeset
   494
%Therefore, a naive algorithm that simply unfolds
Chengsong
parents: 604
diff changeset
   495
%them into their desugared forms
Chengsong
parents: 604
diff changeset
   496
%will suffer from at least an exponential runtime increase.
603
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   497
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   498
The problem here is that tools based on the classic notion of
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   499
automata need to expand $r^{n}$ into $n$ connected 
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   500
copies of the automaton for $r$. This leads to very inefficient matching
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   501
algorithms  or algorithms that consume large amounts of memory.
605
Chengsong
parents: 604
diff changeset
   502
Implementations using $\DFA$s will
Chengsong
parents: 604
diff changeset
   503
either become excruciatingly slow 
Chengsong
parents: 604
diff changeset
   504
(for example Verbatim++\cite{Verbatimpp}) or get
Chengsong
parents: 604
diff changeset
   505
out of memory errors (for example $\mathit{LEX}$ and 
Chengsong
parents: 604
diff changeset
   506
$\mathit{JFLEX}$\footnote{which are lexer generators
Chengsong
parents: 604
diff changeset
   507
in C and JAVA that generate $\mathit{DFA}$-based
Chengsong
parents: 604
diff changeset
   508
lexers. The user provides a set of regular expressions
Chengsong
parents: 604
diff changeset
   509
and configurations to them, and then 
Chengsong
parents: 604
diff changeset
   510
gets an output program encoding a minimized $\mathit{DFA}$
Chengsong
parents: 604
diff changeset
   511
that can be compiled and run. 
Chengsong
parents: 604
diff changeset
   512
When given the above countdown regular expression,
Chengsong
parents: 604
diff changeset
   513
a small $n$ (a few dozen) would result in a 
Chengsong
parents: 604
diff changeset
   514
determinised automata
Chengsong
parents: 604
diff changeset
   515
with millions of states.}) under large counters.
603
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   516
A classic example is the regular expression $(a+b)^*  a (a+b)^{n}$
605
Chengsong
parents: 604
diff changeset
   517
where the minimal DFA requires at least $2^{n+1}$ states.
Chengsong
parents: 604
diff changeset
   518
For example, when $n$ is equal to 2,
604
Chengsong
parents: 603
diff changeset
   519
an $\mathit{NFA}$ describing it would look like:
Chengsong
parents: 603
diff changeset
   520
\begin{center}
Chengsong
parents: 603
diff changeset
   521
\begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto] 
Chengsong
parents: 603
diff changeset
   522
   \node[state,initial] (q_0)   {$q_0$}; 
Chengsong
parents: 603
diff changeset
   523
   \node[state, red] (q_1) [right=of q_0] {$q_1$}; 
Chengsong
parents: 603
diff changeset
   524
   \node[state, red] (q_2) [right=of q_1] {$q_2$}; 
Chengsong
parents: 603
diff changeset
   525
   \node[state, accepting, red](q_3) [right=of q_2] {$q_3$};
Chengsong
parents: 603
diff changeset
   526
    \path[->] 
Chengsong
parents: 603
diff changeset
   527
    (q_0) edge  node {a} (q_1)
Chengsong
parents: 603
diff changeset
   528
    	  edge [loop below] node {a,b} ()
Chengsong
parents: 603
diff changeset
   529
    (q_1) edge  node  {a,b} (q_2)
Chengsong
parents: 603
diff changeset
   530
    (q_2) edge  node  {a,b} (q_3);
Chengsong
parents: 603
diff changeset
   531
\end{tikzpicture}
Chengsong
parents: 603
diff changeset
   532
\end{center}
605
Chengsong
parents: 604
diff changeset
   533
which requires at least $2^3$ states
Chengsong
parents: 604
diff changeset
   534
for its subset construction.\footnote{The 
Chengsong
parents: 604
diff changeset
   535
red states are "countdown states" which counts down 
604
Chengsong
parents: 603
diff changeset
   536
the number of characters needed in addition to the current
Chengsong
parents: 603
diff changeset
   537
string to make a successful match.
Chengsong
parents: 603
diff changeset
   538
For example, state $q_1$ indicates a match that has
Chengsong
parents: 603
diff changeset
   539
gone past the $(a|b)^*$ part of $(a|b)^*a(a|b)^{\{2\}}$,
Chengsong
parents: 603
diff changeset
   540
and just consumed the "delimiter" $a$ in the middle, and 
Chengsong
parents: 603
diff changeset
   541
need to match 2 more iterations of $(a|b)$ to complete.
Chengsong
parents: 603
diff changeset
   542
State $q_2$ on the other hand, can be viewed as a state
Chengsong
parents: 603
diff changeset
   543
after $q_1$ has consumed 1 character, and just waits
Chengsong
parents: 603
diff changeset
   544
for 1 more character to complete.
Chengsong
parents: 603
diff changeset
   545
$q_3$ is the last state, requiring 0 more character and is accepting.
Chengsong
parents: 603
diff changeset
   546
Depending on the suffix of the
Chengsong
parents: 603
diff changeset
   547
input string up to the current read location,
Chengsong
parents: 603
diff changeset
   548
the states $q_1$ and $q_2$, $q_3$
Chengsong
parents: 603
diff changeset
   549
may or may
Chengsong
parents: 603
diff changeset
   550
not be active, independent from each other.
Chengsong
parents: 603
diff changeset
   551
A $\mathit{DFA}$ for such an $\mathit{NFA}$ would
Chengsong
parents: 603
diff changeset
   552
contain at least $2^3$ non-equivalent states that cannot be merged, 
Chengsong
parents: 603
diff changeset
   553
because the subset construction during determinisation will generate
Chengsong
parents: 603
diff changeset
   554
all the elements in the power set $\mathit{Pow}\{q_1, q_2, q_3\}$.
Chengsong
parents: 603
diff changeset
   555
Generalizing this to regular expressions with larger
Chengsong
parents: 603
diff changeset
   556
bounded repetitions number, we have that
Chengsong
parents: 603
diff changeset
   557
regexes shaped like $r^*ar^{\{n\}}$ when converted to $\mathit{DFA}$s
605
Chengsong
parents: 604
diff changeset
   558
would require at least $2^{n+1}$ states, if $r$ itself contains
604
Chengsong
parents: 603
diff changeset
   559
more than 1 string.
Chengsong
parents: 603
diff changeset
   560
This is to represent all different 
605
Chengsong
parents: 604
diff changeset
   561
scenarios which "countdown" states are active.}
603
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   562
605
Chengsong
parents: 604
diff changeset
   563
One of the most recent work in the context of lexing
Chengsong
parents: 604
diff changeset
   564
%with this issue
Chengsong
parents: 604
diff changeset
   565
is the Verbatim lexer by Egolf, Lasser and Fisher\cite{Verbatim}.
Chengsong
parents: 604
diff changeset
   566
This is relevant work and we will compare later on
Chengsong
parents: 604
diff changeset
   567
our derivative-based matcher we are going to present.
Chengsong
parents: 604
diff changeset
   568
There is also some newer work called
Chengsong
parents: 604
diff changeset
   569
Verbatim++\cite{Verbatimpp}, which does not use derivatives, 
Chengsong
parents: 604
diff changeset
   570
but deterministic finite automaton instead.
Chengsong
parents: 604
diff changeset
   571
%An example that gives problem to automaton approaches would be
Chengsong
parents: 604
diff changeset
   572
%the regular expression $(a|b)^*a(a|b)^{\{n\}}$.
Chengsong
parents: 604
diff changeset
   573
%It requires at least $2^{n+1}$ states to represent
Chengsong
parents: 604
diff changeset
   574
%as a DFA.
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   575
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   576
605
Chengsong
parents: 604
diff changeset
   577
Bounded repetitions are very important because they
Chengsong
parents: 604
diff changeset
   578
tend to occur a lot in practical use.
Chengsong
parents: 604
diff changeset
   579
For example in the regex library RegExLib,
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   580
the rules library of Snort \cite{Snort1999}\footnote{
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   581
Snort is a network intrusion detection (NID) tool
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   582
for monitoring network traffic.}, 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   583
as well as in XML Schema definitions (XSDs).
605
Chengsong
parents: 604
diff changeset
   584
According to Bj\"{o}rklund et al \cite{xml2015},
Chengsong
parents: 604
diff changeset
   585
more than half of the 
Chengsong
parents: 604
diff changeset
   586
XSDs they found have bounded regular expressions in them.
Chengsong
parents: 604
diff changeset
   587
Often the counters are quite large, the largest up to ten million. 
Chengsong
parents: 604
diff changeset
   588
An example XSD they gave
Chengsong
parents: 604
diff changeset
   589
was:
606
Chengsong
parents: 605
diff changeset
   590
%\begin{verbatim}
Chengsong
parents: 605
diff changeset
   591
%<sequence minOccurs="0" maxOccurs="65535">
Chengsong
parents: 605
diff changeset
   592
%  <element name="TimeIncr" type="mpeg7:MediaIncrDurationType"/>
Chengsong
parents: 605
diff changeset
   593
%  <element name="MotionParams" type="float" minOccurs="2" maxOccurs="12"/>
Chengsong
parents: 605
diff changeset
   594
%</sequence>
Chengsong
parents: 605
diff changeset
   595
%\end{verbatim}
605
Chengsong
parents: 604
diff changeset
   596
This can be seen as the expression 
Chengsong
parents: 604
diff changeset
   597
$(ab^{2\ldots 12})^{0 \ldots 65535}$, where $a$ and $b$ are themselves
Chengsong
parents: 604
diff changeset
   598
regular expressions 
Chengsong
parents: 604
diff changeset
   599
satisfying certain constraints (such as 
Chengsong
parents: 604
diff changeset
   600
satisfying the floating point number format).
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   601
605
Chengsong
parents: 604
diff changeset
   602
It is therefore quite unsatisfying that 
Chengsong
parents: 604
diff changeset
   603
some regular expressions matching libraries
Chengsong
parents: 604
diff changeset
   604
impose adhoc limits
Chengsong
parents: 604
diff changeset
   605
for bounded regular expressions:
Chengsong
parents: 604
diff changeset
   606
For example, in the regular expression matching library in the Go
Chengsong
parents: 604
diff changeset
   607
language the regular expression $a^{1001}$ is not permitted, because no counter
Chengsong
parents: 604
diff changeset
   608
can be above 1000, and in the built-in Rust regular expression library
Chengsong
parents: 604
diff changeset
   609
expressions such as $a^{\{1000\}\{100\}\{5\}}$ give an error message
606
Chengsong
parents: 605
diff changeset
   610
for being too big. 
Chengsong
parents: 605
diff changeset
   611
As Becchi and Crawley\cite{Becchi08}  have pointed out,
Chengsong
parents: 605
diff changeset
   612
the reason for these restrictions
Chengsong
parents: 605
diff changeset
   613
are that they simulate a non-deterministic finite
Chengsong
parents: 605
diff changeset
   614
automata (NFA) with a breadth-first search.
Chengsong
parents: 605
diff changeset
   615
This way the number of active states could
Chengsong
parents: 605
diff changeset
   616
be equal to the counter number.
Chengsong
parents: 605
diff changeset
   617
When the counters are large, 
Chengsong
parents: 605
diff changeset
   618
the memory requirement could become
Chengsong
parents: 605
diff changeset
   619
infeasible, and the pattern needs to be rejected straight away.
Chengsong
parents: 605
diff changeset
   620
\begin{figure}[H]
Chengsong
parents: 605
diff changeset
   621
\begin{center}
Chengsong
parents: 605
diff changeset
   622
\begin{tikzpicture} [node distance = 2cm, on grid, auto]
Chengsong
parents: 605
diff changeset
   623
 
Chengsong
parents: 605
diff changeset
   624
    	\node (q0) [state, initial] {$0$};
Chengsong
parents: 605
diff changeset
   625
	\node (q1) [state, right = of q0] {$1$};
Chengsong
parents: 605
diff changeset
   626
	\node (q2) [state, right = of q1] {$2$};
Chengsong
parents: 605
diff changeset
   627
	\node (qdots) [right = of q2] {$\ldots$};
Chengsong
parents: 605
diff changeset
   628
	\node (qn) [state, right = of qdots] {$n$};
Chengsong
parents: 605
diff changeset
   629
	\node (qn1) [state, right = of qn] {$n+1$};
Chengsong
parents: 605
diff changeset
   630
	\node (qn2) [state, right = of qn1] {$n+2$};
Chengsong
parents: 605
diff changeset
   631
	\node (qn3) [state, accepting, right = of qn2] {$n+3$}; 
Chengsong
parents: 605
diff changeset
   632
 
Chengsong
parents: 605
diff changeset
   633
\path [-stealth, thick]
Chengsong
parents: 605
diff changeset
   634
	(q0) edge [loop above] node {a} ()
Chengsong
parents: 605
diff changeset
   635
    (q0) edge node {a}   (q1) 
Chengsong
parents: 605
diff changeset
   636
    (q1) edge node {.}   (q2)
Chengsong
parents: 605
diff changeset
   637
    (q2) edge node {.}   (qdots)
Chengsong
parents: 605
diff changeset
   638
    (qdots) edge node {.} (qn)
Chengsong
parents: 605
diff changeset
   639
    (qn) edge node {.} (qn1)
Chengsong
parents: 605
diff changeset
   640
    (qn1) edge node {b} (qn2)
Chengsong
parents: 605
diff changeset
   641
    (qn2) edge node {$c$} (qn3);
Chengsong
parents: 605
diff changeset
   642
\end{tikzpicture}
Chengsong
parents: 605
diff changeset
   643
%\begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto] 
Chengsong
parents: 605
diff changeset
   644
%   \node[state,initial] (q_0)   {$0$}; 
Chengsong
parents: 605
diff changeset
   645
%   \node[state, ] (q_1) [right=of q_0] {$1$}; 
Chengsong
parents: 605
diff changeset
   646
%   \node[state, ] (q_2) [right=of q_1] {$2$}; 
Chengsong
parents: 605
diff changeset
   647
%   \node[state,
Chengsong
parents: 605
diff changeset
   648
%   \node[state, accepting, ](q_3) [right=of q_2] {$3$};
Chengsong
parents: 605
diff changeset
   649
%    \path[->] 
Chengsong
parents: 605
diff changeset
   650
%    (q_0) edge  node {a} (q_1)
Chengsong
parents: 605
diff changeset
   651
%    	  edge [loop below] node {a,b} ()
Chengsong
parents: 605
diff changeset
   652
%    (q_1) edge  node  {a,b} (q_2)
Chengsong
parents: 605
diff changeset
   653
%    (q_2) edge  node  {a,b} (q_3);
Chengsong
parents: 605
diff changeset
   654
%\end{tikzpicture}
Chengsong
parents: 605
diff changeset
   655
\end{center}
Chengsong
parents: 605
diff changeset
   656
\caption{The example given by Becchi and Crawley
Chengsong
parents: 605
diff changeset
   657
	that NFA simulation can consume large
Chengsong
parents: 605
diff changeset
   658
	amounts of memory: $.^*a.^{\{n\}}bc$ matching
Chengsong
parents: 605
diff changeset
   659
	strings of the form $aaa\ldots aaaabc$.
Chengsong
parents: 605
diff changeset
   660
	When traversing in a breadth-first manner,
Chengsong
parents: 605
diff changeset
   661
all states from 0 till $n+1$ will become active.}
Chengsong
parents: 605
diff changeset
   662
\end{figure}
Chengsong
parents: 605
diff changeset
   663
%Languages like $\mathit{Go}$ and $\mathit{Rust}$ use this
Chengsong
parents: 605
diff changeset
   664
%type of $\mathit{NFA}$ simulation and guarantees a linear runtime
Chengsong
parents: 605
diff changeset
   665
%in terms of input string length.
Chengsong
parents: 605
diff changeset
   666
%TODO:try out these lexers
Chengsong
parents: 605
diff changeset
   667
These problems can of course be solved in matching algorithms where 
605
Chengsong
parents: 604
diff changeset
   668
automata go beyond the classic notion and for instance include explicit
Chengsong
parents: 604
diff changeset
   669
counters \cite{Turo_ov__2020}.
606
Chengsong
parents: 605
diff changeset
   670
These solutions can be quite effective,
Chengsong
parents: 605
diff changeset
   671
with the ability to process
Chengsong
parents: 605
diff changeset
   672
gigabytes of string input per second
Chengsong
parents: 605
diff changeset
   673
even with large counters \cite{Becchi08}.
Chengsong
parents: 605
diff changeset
   674
But formally reasoning about these automata can be challenging
Chengsong
parents: 605
diff changeset
   675
and un-intuitive.
Chengsong
parents: 605
diff changeset
   676
Therefore, correctness and runtime claims made about these solutions need to be
Chengsong
parents: 605
diff changeset
   677
taken with a grain of salt.
605
Chengsong
parents: 604
diff changeset
   678
Chengsong
parents: 604
diff changeset
   679
In the work reported in \cite{CSL2022} and here, 
Chengsong
parents: 604
diff changeset
   680
we add better support using derivatives
Chengsong
parents: 604
diff changeset
   681
for bounded regular expressions $r^{\{n\}}$.
Chengsong
parents: 604
diff changeset
   682
The results
Chengsong
parents: 604
diff changeset
   683
extend straightforwardly to
Chengsong
parents: 604
diff changeset
   684
repetitions with an interval such as 
Chengsong
parents: 604
diff changeset
   685
$r^{\{n\ldots m\}}$.
Chengsong
parents: 604
diff changeset
   686
The merit of Brzozowski derivatives
Chengsong
parents: 604
diff changeset
   687
on this problem is that
Chengsong
parents: 604
diff changeset
   688
it can be naturally extended to support bounded repetitions.
Chengsong
parents: 604
diff changeset
   689
Moreover these extensions are still made up of only
Chengsong
parents: 604
diff changeset
   690
inductive datatypes and recursive functions,
Chengsong
parents: 604
diff changeset
   691
making it handy to deal with using theorem provers.
Chengsong
parents: 604
diff changeset
   692
%The point here is that Brzozowski derivatives and the algorithms by Sulzmann and Lu can be
Chengsong
parents: 604
diff changeset
   693
%straightforwardly extended to deal with bounded regular expressions
Chengsong
parents: 604
diff changeset
   694
%and moreover the resulting code still consists of only simple
Chengsong
parents: 604
diff changeset
   695
%recursive functions and inductive datatypes.
Chengsong
parents: 604
diff changeset
   696
Finally, bounded regular expressions do not destroy our finite
Chengsong
parents: 604
diff changeset
   697
boundedness property, which we shall prove later on.
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   698
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   699
606
Chengsong
parents: 605
diff changeset
   700
Chengsong
parents: 605
diff changeset
   701
Chengsong
parents: 605
diff changeset
   702
605
Chengsong
parents: 604
diff changeset
   703
\subsection{Back-References}
606
Chengsong
parents: 605
diff changeset
   704
The other way to simulate an $\mathit{NFA}$ for matching is choosing  
Chengsong
parents: 605
diff changeset
   705
a single transition each time, keeping all the other options in 
Chengsong
parents: 605
diff changeset
   706
a queue or stack, and backtracking if that choice eventually 
Chengsong
parents: 605
diff changeset
   707
fails. This method, often called a  "depth-first-search", 
Chengsong
parents: 605
diff changeset
   708
is efficient in a lot of cases, but could end up
Chengsong
parents: 605
diff changeset
   709
with exponential run time.
Chengsong
parents: 605
diff changeset
   710
The backtracking method is employed in regex libraries
Chengsong
parents: 605
diff changeset
   711
that support \emph{back-references}, for example
Chengsong
parents: 605
diff changeset
   712
in Java and Python.
605
Chengsong
parents: 604
diff changeset
   713
%\section{Back-references and The Terminology Regex}
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   714
605
Chengsong
parents: 604
diff changeset
   715
%When one constructs an $\NFA$ out of a regular expression
Chengsong
parents: 604
diff changeset
   716
%there is often very little to be done in the first phase, one simply 
Chengsong
parents: 604
diff changeset
   717
%construct the $\NFA$ states based on the structure of the input regular expression.
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   718
605
Chengsong
parents: 604
diff changeset
   719
%In the lexing phase, one can simulate the $\mathit{NFA}$ running in two ways:
Chengsong
parents: 604
diff changeset
   720
%one by keeping track of all active states after consuming 
Chengsong
parents: 604
diff changeset
   721
%a character, and update that set of states iteratively.
Chengsong
parents: 604
diff changeset
   722
%This can be viewed as a breadth-first-search of the $\mathit{NFA}$
Chengsong
parents: 604
diff changeset
   723
%for a path terminating
Chengsong
parents: 604
diff changeset
   724
%at an accepting state.
606
Chengsong
parents: 605
diff changeset
   725
Chengsong
parents: 605
diff changeset
   726
Chengsong
parents: 605
diff changeset
   727
Chengsong
parents: 605
diff changeset
   728
Given a regular expression like this (the sequence
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   729
operator is omitted for brevity):
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   730
\begin{center}
606
Chengsong
parents: 605
diff changeset
   731
	$r_1r_2r_3r_4$
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   732
\end{center}
606
Chengsong
parents: 605
diff changeset
   733
one could label sub-expressions of interest 
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   734
by parenthesizing them and giving 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   735
them a number by the order in which their opening parentheses appear.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   736
One possible way of parenthesizing and labelling is given below:
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   737
\begin{center}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   738
	$\underset{1}{(}r_1\underset{2}{(}r_2\underset{3}{(}r_3)\underset{4}{(}r_4)))$
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   739
\end{center}
606
Chengsong
parents: 605
diff changeset
   740
The sub-expressions
Chengsong
parents: 605
diff changeset
   741
$r_1r_2r_3r_4$, $r_1r_2r_3$, $r_3$ and $r_4$ are labelled
Chengsong
parents: 605
diff changeset
   742
by 1 to 4, and can be ``referred back'' by their respective numbers. 
Chengsong
parents: 605
diff changeset
   743
%These sub-expressions are called "capturing groups".
Chengsong
parents: 605
diff changeset
   744
To do so, we use the syntax $\backslash i$ 
Chengsong
parents: 605
diff changeset
   745
to denote that we want the sub-string 
Chengsong
parents: 605
diff changeset
   746
of the input just matched by the i-th
Chengsong
parents: 605
diff changeset
   747
sub-expression to appear again, 
Chengsong
parents: 605
diff changeset
   748
exactly the same as it first appeared: 
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   749
\begin{center}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   750
$\ldots\underset{\text{i-th lparen}}{(}{r_i})\ldots 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   751
\underset{s_i \text{ which just matched} \;r_i}{\backslash i}$
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   752
\end{center}
606
Chengsong
parents: 605
diff changeset
   753
%The backslash and number $i$ are the
Chengsong
parents: 605
diff changeset
   754
%so-called "back-references".
Chengsong
parents: 605
diff changeset
   755
%Let $e$ be an expression made of regular expressions 
Chengsong
parents: 605
diff changeset
   756
%and back-references. $e$ contains the expression $e_i$
Chengsong
parents: 605
diff changeset
   757
%as its $i$-th capturing group.
Chengsong
parents: 605
diff changeset
   758
%The semantics of back-reference can be recursively
Chengsong
parents: 605
diff changeset
   759
%written as:
Chengsong
parents: 605
diff changeset
   760
%\begin{center}
Chengsong
parents: 605
diff changeset
   761
%	\begin{tabular}{c}
Chengsong
parents: 605
diff changeset
   762
%		$L ( e \cdot \backslash i) = \{s @ s_i \mid s \in L (e)\quad s_i \in L(r_i)$\\
Chengsong
parents: 605
diff changeset
   763
%		$s_i\; \text{match of ($e$, $s$)'s $i$-th capturing group string}\}$
Chengsong
parents: 605
diff changeset
   764
%	\end{tabular}
Chengsong
parents: 605
diff changeset
   765
%\end{center}
Chengsong
parents: 605
diff changeset
   766
A concrete example
Chengsong
parents: 605
diff changeset
   767
for back-references would be
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   768
\begin{center}
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   769
$(.^*)\backslash 1$,
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   770
\end{center}
606
Chengsong
parents: 605
diff changeset
   771
which would match 
Chengsong
parents: 605
diff changeset
   772
strings that can be split into two identical halves,
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   773
for example $\mathit{foofoo}$, $\mathit{ww}$ and etc.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   774
Note that this is different from 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   775
repeating the  sub-expression verbatim like
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   776
\begin{center}
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   777
	$(.^*)(.^*)$,
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   778
\end{center}
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   779
which does not impose any restrictions on what strings the second 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   780
sub-expression $.^*$
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   781
might match.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   782
Another example of back-references would be
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   783
\begin{center}
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   784
$(.)(.)\backslash 2\backslash 1$
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   785
\end{center}
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   786
which expresses four-character palindromes
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   787
like $abba$, $x??x$ etc.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   788
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   789
Back-references is a regex construct 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   790
that programmers found quite useful.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   791
According to Becchi and Crawley\cite{Becchi08},
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   792
6\% of Snort rules (up until 2008) include the use of them.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   793
The most common use of back-references
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   794
would be expressing well-formed html files,
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   795
where back-references would be handy in expressing
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   796
a pair of opening and closing tags like 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   797
\begin{center}
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   798
	$\langle html \rangle \ldots \langle / html \rangle$
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   799
\end{center}
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   800
A regex describing such a format
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   801
could be
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   802
\begin{center}
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   803
	$\langle (.^+) \rangle \ldots \langle / \backslash 1 \rangle$
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   804
\end{center}
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   805
Despite being useful, the syntax and expressive power of regexes 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   806
go beyond the regular language hierarchy
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   807
with back-references.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   808
In fact, they allow the regex construct to express 
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   809
languages that cannot be contained in context-free
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   810
languages either.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   811
For example, the back-reference $((a^*)b\backslash1 b \backslash 1$
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   812
expresses the language $\{a^n b a^n b a^n\mid n \in \mathbb{N}\}$,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   813
which cannot be expressed by context-free grammars\parencite{campeanu2003formal}.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   814
Such a language is contained in the context-sensitive hierarchy
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   815
of formal languages. 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   816
Solving the back-reference expressions matching problem
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   817
is NP-complete\parencite{alfred2014algorithms}.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   818
A non-bactracking,
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   819
efficient solution is not known to exist.
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   820
Regex libraries supporting back-references such as PCRE therefore have to
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   821
revert to a depth-first search algorithm that backtracks.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   822
The unreasonable part with them, is that even in the case of 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   823
regexes not involving back-references, there is still
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   824
a (non-negligible) chance they might backtrack super-linearly.
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   825
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   826
\subsection{Summary of the Catastrophic Backtracking Problem}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   827
Summing these up, we can categorise existing 
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   828
practical regex libraries into two kinds:
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   829
(i)The ones  with  linear
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   830
time guarantees like Go and Rust. The cost with them is that
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   831
they impose restrictions
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   832
on the user input (not allowing back-references, 
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   833
bounded repetitions cannot exceed a counter limit etc.).
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   834
(ii) Those 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   835
that allow large bounded regular expressions and back-references
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   836
at the expense of using a backtracking algorithm.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   837
They could grind to a halt
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   838
on some very simple cases, posing a vulnerability of
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   839
a ReDoS attack.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   840
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   841
 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   842
We would like to have regex engines that can 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   843
deal with the regular part (e.g.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   844
bounded repetitions) of regexes more
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   845
efficiently.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   846
Also we want to make sure that they do it correctly.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   847
It turns out that such aim is not so easy to achieve.
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   848
 %TODO: give examples such as RE2 GOLANG 1000 restriction, rust no repetitions 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   849
% For example, the Rust regex engine claims to be linear, 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   850
% but does not support lookarounds and back-references.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   851
% The GoLang regex library does not support over 1000 repetitions.  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   852
% Java and Python both support back-references, but shows
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   853
%catastrophic backtracking behaviours on inputs without back-references(
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   854
%when the language is still regular).
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   855
 %TODO: test performance of Rust on (((((a*a*)b*)b){20})*)c  baabaabababaabaaaaaaaaababaaaababababaaaabaaabaaaaaabaabaabababaababaaaaaaaaababaaaababababaaaaaaaaaaaaac
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   856
 %TODO: verify the fact Rust does not allow 1000+ reps
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   857
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   858
605
Chengsong
parents: 604
diff changeset
   859
Chengsong
parents: 604
diff changeset
   860
Chengsong
parents: 604
diff changeset
   861
%The time cost of regex matching algorithms in general
Chengsong
parents: 604
diff changeset
   862
%involve two different phases, and different things can go differently wrong on 
Chengsong
parents: 604
diff changeset
   863
%these phases.
Chengsong
parents: 604
diff changeset
   864
%$\DFA$s usually have problems in the first (construction) phase
Chengsong
parents: 604
diff changeset
   865
%, whereas $\NFA$s usually run into trouble
Chengsong
parents: 604
diff changeset
   866
%on the second phase.
Chengsong
parents: 604
diff changeset
   867
Chengsong
parents: 604
diff changeset
   868
Chengsong
parents: 604
diff changeset
   869
\section{Error-prone POSIX Implementations}
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   870
When there are multiple ways of matching a string
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   871
with a regular expression, a matcher needs to
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   872
disambiguate.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   873
The standard for which particular match to pick
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   874
is called the disambiguation strategy.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   875
The more intuitive strategy is called POSIX,
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   876
which always chooses the longest initial match.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   877
An alternative strategy would be greedy matches,
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   878
which always ends a sub-match as early as possible.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   879
The POSIX standard is widely adopted in many operating systems.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   880
However, many implementations (including the C libraries
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   881
used by Linux and OS X distributions) contain bugs
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   882
or do not meet the specification they claim to adhere to.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   883
In some cases, they either fail to generate a lexing 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   884
result when there exists a match,
605
Chengsong
parents: 604
diff changeset
   885
or give results that are inconsistent with the $\POSIX$ standard.
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   886
A concrete example would be the regex given by \cite{fowler2003}
605
Chengsong
parents: 604
diff changeset
   887
\begin{center}
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   888
	$(aba + ab + a)^* \text{and the string} ababa$
605
Chengsong
parents: 604
diff changeset
   889
\end{center}
Chengsong
parents: 604
diff changeset
   890
The correct $\POSIX$ match for the above would be 
Chengsong
parents: 604
diff changeset
   891
with the entire string $ababa$, 
Chengsong
parents: 604
diff changeset
   892
split into two Kleene star iterations, $[ab] [aba]$ at positions
Chengsong
parents: 604
diff changeset
   893
$[0, 2), [2, 5)$
Chengsong
parents: 604
diff changeset
   894
respectively.
Chengsong
parents: 604
diff changeset
   895
But trying this out in regex101\parencite{regex101}
Chengsong
parents: 604
diff changeset
   896
with different language engines would yield 
Chengsong
parents: 604
diff changeset
   897
the same two fragmented matches: $[aba]$ at $[0, 3)$
Chengsong
parents: 604
diff changeset
   898
and $a$ at $[4, 5)$.
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   899
Fowler \cite{fowler2003} and Kuklewicz \cite{KuklewiczHaskell} 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   900
commented that most regex libraries are not
605
Chengsong
parents: 604
diff changeset
   901
correctly implementing the POSIX (maximum-munch)
Chengsong
parents: 604
diff changeset
   902
rule of regular expression matching.
Chengsong
parents: 604
diff changeset
   903
As Grathwohl\parencite{grathwohl2014crash} wrote,
Chengsong
parents: 604
diff changeset
   904
\begin{quote}
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   905
	``The POSIX strategy is more complicated than the 
605
Chengsong
parents: 604
diff changeset
   906
	greedy because of the dependence on information about 
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   907
	the length of matched strings in the various subexpressions.''
605
Chengsong
parents: 604
diff changeset
   908
\end{quote}
Chengsong
parents: 604
diff changeset
   909
%\noindent
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   910
The implementation complexity of POSIX rules also come from
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   911
the specification being not very clear.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   912
There are many informal summaries of this disambiguation
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   913
strategy, which are often quite long and delicate.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   914
For example Kuklewicz described the POSIX rule as
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   915
\begin{quote}
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   916
	``
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   917
	\begin{itemize}
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   918
		\item
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   919
regular expressions (REs) take the leftmost starting match, and the longest match starting there
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   920
earlier subpatterns have leftmost-longest priority over later subpatterns\\
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   921
\item
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   922
higher-level subpatterns have leftmost-longest priority over their component subpatterns\\
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   923
\item
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   924
REs have right associative concatenation which can be changed with parenthesis\\
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   925
\item
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   926
parenthesized subexpressions return the match from their last usage\\
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   927
\item
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   928
text of component subexpressions must be contained in the text of the 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   929
higher-level subexpressions\\
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   930
\item
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   931
if "p" and "q" can never match the same text then "p|q" and "q|p" are equivalent, up to trivial renumbering of captured subexpressions\\
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   932
\item
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   933
if "p" in "p*" is used to capture non-empty text then additional repetitions of "p" will not capture an empty string\\''
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   934
\end{itemize}
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   935
\end{quote}
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   936
The text above 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   937
is trying to capture something very precise,
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   938
and is crying out for formalising.
605
Chengsong
parents: 604
diff changeset
   939
Chengsong
parents: 604
diff changeset
   940
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   941
%\subsection{Different Phases of a Matching/Lexing Algorithm}
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   942
%
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   943
%
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   944
%Most lexing algorithms can be roughly divided into 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   945
%two phases during its run.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   946
%The first phase is the "construction" phase,
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   947
%in which the algorithm builds some  
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   948
%suitable data structure from the input regex $r$, so that
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   949
%it can be easily operated on later.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   950
%We denote
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   951
%the time cost for such a phase by $P_1(r)$.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   952
%The second phase is the lexing phase, when the input string 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   953
%$s$ is read and the data structure
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   954
%representing that regex $r$ is being operated on. 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   955
%We represent the time
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   956
%it takes by $P_2(r, s)$.\\
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   957
%For $\mathit{DFA}$,
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   958
%we have $P_2(r, s) = O( |s| )$,
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   959
%because we take at most $|s|$ steps, 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   960
%and each step takes
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   961
%at most one transition--
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   962
%a deterministic-finite-automata
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   963
%by definition has at most one state active and at most one
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   964
%transition upon receiving an input symbol.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   965
%But unfortunately in the  worst case
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   966
%$P_1(r) = O(exp^{|r|})$. An example will be given later. 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   967
%For $\mathit{NFA}$s, we have $P_1(r) = O(|r|)$ if we do not unfold 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   968
%expressions like $r^n$ into 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   969
%\[
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   970
%	\underbrace{r \cdots r}_{\text{n copies of r}}.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   971
%\]
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   972
%The $P_2(r, s)$ is bounded by $|r|\cdot|s|$, if we do not backtrack.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   973
%On the other hand, if backtracking is used, the worst-case time bound bloats
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   974
%to $|r| * 2^{|s|}$.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   975
%%on the input
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   976
%%And when calculating the time complexity of the matching algorithm,
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   977
%%we are assuming that each input reading step requires constant time.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   978
%%which translates to that the number of 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   979
%%states active and transitions taken each time is bounded by a
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   980
%%constant $C$.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   981
%%But modern  regex libraries in popular language engines
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   982
%% often want to support much richer constructs than just
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   983
%% sequences and Kleene stars,
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   984
%%such as negation, intersection, 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   985
%%bounded repetitions and back-references.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   986
%%And de-sugaring these "extended" regular expressions 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   987
%%into basic ones might bloat the size exponentially.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   988
%%TODO: more reference for exponential size blowup on desugaring. 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   989
%
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   990
%\subsection{Why $\mathit{DFA}s$ can be slow in the first phase}
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   991
%
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   992
%
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   993
%The good things about $\mathit{DFA}$s is that once
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   994
%generated, they are fast and stable, unlike
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   995
%backtracking algorithms. 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   996
%However, they do not scale well with bounded repetitions.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   997
%
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   998
%\subsubsection{Problems with Bounded Repetitions}
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   999
%
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1000
%
605
Chengsong
parents: 604
diff changeset
  1001
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1002
To summarise, we need regex libraries that are both fast
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1003
and correct.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1004
And that correctness needs to be built on a precise
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1005
model of what POSIX disambiguation is.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1006
We propose a solution that addresses both problems
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1007
based on Brzozowski, Sulzmann and Lu and Ausaf and Urban's work.
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1008
The end result is a regular expression lexing algorithm that comes with 
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1009
\begin{itemize}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1010
\item
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1011
a proven correctness theorem according to POSIX specification 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1012
given by Ausaf and Urban \cite{AusafDyckhoffUrban2016},
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1013
\item 
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1014
a proven property saying that the algorithm's internal data structure will
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1015
remain finite,
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1016
\item
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1017
and extension to
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1018
the bounded repetitions construct with the correctness and finiteness property
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1019
maintained.
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1020
 \end{itemize}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1021
 
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1022
\section{Our Solution--Formal Specification of POSIX and Brzozowski Derivatives}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1023
We propose Brzozowski derivatives on regular expressions as
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1024
  a solution to this.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1025
In the last fifteen or so years, Brzozowski's derivatives of regular
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1026
expressions have sparked quite a bit of interest in the functional
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1027
programming and theorem prover communities.   
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1028
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1029
\subsection{Motivation}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1030
  
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1031
Derivatives give a simple solution
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1032
to the problem of matching a string $s$ with a regular
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1033
expression $r$: if the derivative of $r$ w.r.t.\ (in
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1034
succession) all the characters of the string matches the empty string,
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1035
then $r$ matches $s$ (and {\em vice versa}).  
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1036
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1037
The beauty of
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1038
Brzozowski's derivatives \parencite{Brzozowski1964} is that they are neatly
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1039
expressible in any functional language, and easily definable and
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1040
reasoned about in theorem provers---the definitions just consist of
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1041
inductive datatypes and simple recursive functions. 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1042
And an algorithms based on it by 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1043
Suzmann and Lu  \parencite{Sulzmann2014} allows easy extension
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1044
to include  extended regular expressions and 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1045
 simplification of internal data structures 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1046
 eliminating the exponential behaviours.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1047
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1048
However, two difficulties with derivative-based matchers exist:
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1049
\subsubsection{Problems with Current Brzozowski Matchers}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1050
First, Brzozowski's original matcher only generates a yes/no answer
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1051
for whether a regular expression matches a string or not.  This is too
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1052
little information in the context of lexing where separate tokens must
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1053
be identified and also classified (for example as keywords
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1054
or identifiers).  Sulzmann and Lu~\cite{Sulzmann2014} overcome this
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1055
difficulty by cleverly extending Brzozowski's matching
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1056
algorithm. Their extended version generates additional information on
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1057
\emph{how} a regular expression matches a string following the POSIX
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1058
rules for regular expression matching. They achieve this by adding a
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1059
second ``phase'' to Brzozowski's algorithm involving an injection
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1060
function.  In our own earlier work, we provided the formal
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1061
specification of what POSIX matching means and proved in Isabelle/HOL
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1062
the correctness
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1063
of Sulzmann and Lu's extended algorithm accordingly
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1064
\cite{AusafDyckhoffUrban2016}.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1065
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1066
The second difficulty is that Brzozowski's derivatives can 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1067
grow to arbitrarily big sizes. For example if we start with the
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1068
regular expression $(a+aa)^*$ and take
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1069
successive derivatives according to the character $a$, we end up with
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1070
a sequence of ever-growing derivatives like 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1071
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1072
\def\ll{\stackrel{\_\backslash{} a}{\longrightarrow}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1073
\begin{center}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1074
\begin{tabular}{rll}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1075
$(a + aa)^*$ & $\ll$ & $(\ONE + \ONE{}a) \cdot (a + aa)^*$\\
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1076
& $\ll$ & $(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* \;+\; (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1077
& $\ll$ & $(\ZERO + \ZERO{}a + \ZERO) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^* \;+\; $\\
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1078
& & $\qquad(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1079
& $\ll$ & \ldots \hspace{15mm}(regular expressions of sizes 98, 169, 283, 468, 767, \ldots)
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1080
\end{tabular}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1081
\end{center}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1082
 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1083
\noindent where after around 35 steps we run out of memory on a
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1084
typical computer (we shall define shortly the precise details of our
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1085
regular expressions and the derivative operation).  Clearly, the
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1086
notation involving $\ZERO$s and $\ONE$s already suggests
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1087
simplification rules that can be applied to regular regular
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1088
expressions, for example $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1089
\Rightarrow r$, $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1090
r$. While such simple-minded simplifications have been proved in our
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1091
earlier work to preserve the correctness of Sulzmann and Lu's
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1092
algorithm \cite{AusafDyckhoffUrban2016}, they unfortunately do
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1093
\emph{not} help with limiting the growth of the derivatives shown
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1094
above: the growth is slowed, but the derivatives can still grow rather
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1095
quickly beyond any finite bound.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1096
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1097
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1098
Sulzmann and Lu overcome this ``growth problem'' in a second algorithm
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1099
\cite{Sulzmann2014} where they introduce bit-coded
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1100
regular expressions. In this version, POSIX values are
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1101
represented as bit sequences and such sequences are incrementally generated
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1102
when derivatives are calculated. The compact representation
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1103
of bit sequences and regular expressions allows them to define a more
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1104
``aggressive'' simplification method that keeps the size of the
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1105
derivatives finite no matter what the length of the string is.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1106
They make some informal claims about the correctness and linear behaviour
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1107
of this version, but do not provide any supporting proof arguments, not
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1108
even ``pencil-and-paper'' arguments. They write about their bit-coded
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1109
\emph{incremental parsing method} (that is the algorithm to be formalised
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1110
in this dissertation)
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1111
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1112
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1113
  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1114
  \begin{quote}\it
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1115
  ``Correctness Claim: We further claim that the incremental parsing
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1116
  method [..] in combination with the simplification steps [..]
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1117
  yields POSIX parse trees. We have tested this claim
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1118
  extensively [..] but yet
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1119
  have to work out all proof details.'' \cite[Page 14]{Sulzmann2014}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1120
\end{quote}  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1121
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1122
Ausaf and Urban were able to back this correctness claim with
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1123
a formal proof.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1124
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1125
But as they stated,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1126
  \begin{quote}\it
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1127
The next step would be to implement a more aggressive simplification procedure on annotated regular expressions and then prove the corresponding algorithm generates the same values as blexer. Alas due to time constraints we are unable to do so here.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1128
\end{quote}  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1129
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1130
This thesis implements the aggressive simplifications envisioned
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1131
by Ausaf and Urban,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1132
and gives a formal proof of the correctness with those simplifications.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1133
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1134
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1135
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1136
\section{Contribution}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1137
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1138
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1139
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1140
This work addresses the vulnerability of super-linear and
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1141
buggy regex implementations by the combination
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1142
of Brzozowski's derivatives and interactive theorem proving. 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1143
We give an 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1144
improved version of  Sulzmann and Lu's bit-coded algorithm using 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1145
derivatives, which come with a formal guarantee in terms of correctness and 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1146
running time as an Isabelle/HOL proof.
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1147
Further improvements to the algorithm with an even stronger version of 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1148
simplification is made.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1149
We have not yet come up with one, but believe that it leads to a 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1150
formalised proof with a time bound linear to input and
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1151
cubic to regular expression size using a technique by
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1152
Antimirov\cite{Antimirov}.
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1153
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1154
 
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1155
The main contribution of this thesis is 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1156
\begin{itemize}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1157
\item
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1158
a proven correct lexing algorithm
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1159
\item
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1160
with formalized finite bounds on internal data structures' sizes.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1161
\end{itemize}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1162
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1163
To our best knowledge, no lexing libraries using Brzozowski derivatives
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1164
have a provable time guarantee, 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1165
and claims about running time are usually speculative and backed by thin empirical
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1166
evidence.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1167
%TODO: give references
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1168
For example, Sulzmann and Lu had proposed an algorithm  in which they
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1169
claim a linear running time.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1170
But that was falsified by our experiments and the running time 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1171
is actually $\Omega(2^n)$ in the worst case.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1172
A similar claim about a theoretical runtime of $O(n^2)$ is made for the Verbatim
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1173
%TODO: give references
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1174
lexer, which calculates POSIX matches and is based on derivatives.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1175
They formalized the correctness of the lexer, but not the complexity.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1176
In the performance evaluation section, they simply analyzed the run time
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1177
of matching $a$ with the string $\underbrace{a \ldots a}_{\text{n a's}}$
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1178
and concluded that the algorithm is quadratic in terms of input length.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1179
When we tried out their extracted OCaml code with our example $(a+aa)^*$,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1180
the time it took to lex only 40 $a$'s was 5 minutes.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1181
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1182
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1183
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1184
\subsection{Related Work}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1185
We are aware
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1186
of a mechanised correctness proof of Brzozowski's derivative-based matcher in HOL4 by
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1187
Owens and Slind~\parencite{Owens2008}. Another one in Isabelle/HOL is part
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1188
of the work by Krauss and Nipkow \parencite{Krauss2011}.  And another one
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1189
in Coq is given by Coquand and Siles \parencite{Coquand2012}.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1190
Also Ribeiro and Du Bois give one in Agda \parencite{RibeiroAgda2017}.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1191
 
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1192
 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1193
 When a regular expression does not behave as intended,
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1194
people usually try to rewrite the regex to some equivalent form
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1195
or they try to avoid the possibly problematic patterns completely,
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1196
for which many false positives exist\parencite{Davis18}.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1197
Animated tools to "debug" regular expressions such as
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1198
 \parencite{regexploit2021} \parencite{regex101} are also popular.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1199
We are also aware of static analysis work on regular expressions that
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1200
aims to detect potentially expoential regex patterns. Rathnayake and Thielecke 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1201
\parencite{Rathnayake2014StaticAF} proposed an algorithm
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1202
that detects regular expressions triggering exponential
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1203
behavious on backtracking matchers.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1204
Weideman \parencite{Weideman2017Static} came up with 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1205
non-linear polynomial worst-time estimates
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1206
for regexes, attack string that exploit the worst-time 
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1207
scenario, and "attack automata" that generates
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1208
attack strings.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1209
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1210
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1211
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1212
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1213
\section{Structure of the thesis}
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1214
In chapter 2 \ref{Inj} we will introduce the concepts
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1215
and notations we 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1216
use for describing the lexing algorithm by Sulzmann and Lu,
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1217
and then give the lexing algorithm.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1218
We will give its variant in \ref{Bitcoded1}.
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1219
Then we illustrate in \ref{Bitcoded2}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1220
how the algorithm without bitcodes falls short for such aggressive 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1221
simplifications and therefore introduce our version of the
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1222
 bit-coded algorithm and 
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1223
its correctness proof .  
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1224
In \ref{Finite} we give the second guarantee
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1225
of our bitcoded algorithm, that is a finite bound on the size of any 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1226
regex's derivatives.
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1227
In \ref{Cubic} we discuss stronger simplifications to improve the finite bound
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1228
in \ref{Finite} to a polynomial one, and demonstrate how one can extend the
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1229
algorithm to include constructs such as bounded repetitions and negations.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1230
 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1231
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1232
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1233
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1234
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1235
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1236
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1237
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1238
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1239
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1240
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1241
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1242
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1243
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1244