ChengsongTanPhdThesis/Chapters/Introduction.tex
author Chengsong
Fri, 26 May 2023 23:42:15 +0100
changeset 648 d15a0b7d6d90
parent 646 56057198e4f5
child 649 ef2b8abcbc55
permissions -rwxr-xr-x
new amend
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     1
% Chapter 1
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     2
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     3
\chapter{Introduction} % Main chapter title
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     4
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     5
\label{Introduction} % For referencing the chapter elsewhere, use \ref{Chapter1} 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     6
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     7
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     8
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     9
% Define some commands to keep the formatting separated from the content 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    10
\newcommand{\keyword}[1]{\textbf{#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    11
\newcommand{\tabhead}[1]{\textbf{#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    12
\newcommand{\code}[1]{\texttt{#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    13
\newcommand{\file}[1]{\texttt{\bfseries#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    14
\newcommand{\option}[1]{\texttt{\itshape#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    15
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    16
%boxes
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    17
\newcommand*{\mybox}[1]{\framebox{\strut #1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    18
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    19
%\newcommand{\sflataux}[1]{\textit{sflat}\_\textit{aux} \, #1}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    20
\newcommand\sflat[1]{\llparenthesis #1 \rrparenthesis }
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    21
\newcommand{\ASEQ}[3]{\textit{ASEQ}_{#1} \, #2 \, #3}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
    22
\newcommand{\bderssimp}[2]{#1 \backslash_{bsimps} #2}
596
b306628a0eab more chap 56
Chengsong
parents: 594
diff changeset
    23
\newcommand{\rderssimp}[2]{#1 \backslash_{rsimps} #2}
564
Chengsong
parents: 558
diff changeset
    24
\def\derssimp{\textit{ders}\_\textit{simp}}
557
812e5d112f49 more changes
Chengsong
parents: 556
diff changeset
    25
\def\rders{\textit{rders}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    26
\newcommand{\bders}[2]{#1 \backslash #2}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    27
\newcommand{\bsimp}[1]{\textit{bsimp}(#1)}
591
b2d0de6aee18 more polishing integrated comments chap2
Chengsong
parents: 590
diff changeset
    28
\def\bsimps{\textit{bsimp}}
554
Chengsong
parents: 543
diff changeset
    29
\newcommand{\rsimp}[1]{\textit{rsimp}\; #1}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    30
\newcommand{\sflataux}[1]{\llparenthesis #1 \rrparenthesis'}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    31
\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    32
\newcommand{\denote}{\stackrel{\mbox{\scriptsize denote}}{=}}%
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    33
\newcommand{\ZERO}{\mbox{\bf 0}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    34
\newcommand{\ONE}{\mbox{\bf 1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    35
\newcommand{\AALTS}[2]{\oplus {\scriptstyle #1}\, #2}
555
Chengsong
parents: 554
diff changeset
    36
\newcommand{\rdistinct}[2]{\textit{rdistinct} \;\; #1 \;\; #2}
594
Chengsong
parents: 591
diff changeset
    37
\def\rdistincts{\textit{rdistinct}}
556
Chengsong
parents: 555
diff changeset
    38
\def\rDistinct{\textit{rdistinct}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    39
\newcommand\hflat[1]{\llparenthesis  #1 \rrparenthesis_*}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    40
\newcommand\hflataux[1]{\llparenthesis #1 \rrparenthesis_*'}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    41
\newcommand\createdByStar[1]{\textit{createdByStar}(#1)}
620
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
    42
\def\cbn{\textit{createdByNtimes}}
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
    43
\def\hpa{\textit{highestPowerAux}}
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
    44
\def\hpower{\textit{highestPower}}
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
    45
\def\ntset{\textit{ntset}}
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
    46
\def\optermsimp{\textit{optermsimp}}
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
    47
\def\optermOsimp{\textit{optermOsimp}}
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
    48
\def\optermosimp{\textit{optermosimp}}
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
    49
\def\opterm{\textit{opterm}}
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
    50
\def\nString{\textit{nonemptyString}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    51
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    52
\newcommand\myequiv{\mathrel{\stackrel{\makebox[0pt]{\mbox{\normalfont\tiny equiv}}}{=}}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    53
600
fd068f39ac23 chap4 comments done
Chengsong
parents: 596
diff changeset
    54
\def\SEQ{\textit{SEQ}}
fd068f39ac23 chap4 comments done
Chengsong
parents: 596
diff changeset
    55
\def\SEQs{\textit{SEQs}}
564
Chengsong
parents: 558
diff changeset
    56
\def\case{\textit{case}}
554
Chengsong
parents: 543
diff changeset
    57
\def\sequal{\stackrel{\mbox{\scriptsize rsimp}}{=}}
Chengsong
parents: 543
diff changeset
    58
\def\rsimpalts{\textit{rsimp}_{ALTS}}
Chengsong
parents: 543
diff changeset
    59
\def\good{\textit{good}}
Chengsong
parents: 543
diff changeset
    60
\def\btrue{\textit{true}}
Chengsong
parents: 543
diff changeset
    61
\def\bfalse{\textit{false}}
542
a7344c9afbaf chapter3 finished
Chengsong
parents: 538
diff changeset
    62
\def\bnullable{\textit{bnullable}}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
    63
\def\bnullables{\textit{bnullables}}
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
    64
\def\Some{\textit{Some}}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
    65
\def\None{\textit{None}}
537
Chengsong
parents: 532
diff changeset
    66
\def\code{\textit{code}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    67
\def\decode{\textit{decode}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    68
\def\internalise{\textit{internalise}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    69
\def\lexer{\mathit{lexer}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    70
\def\mkeps{\textit{mkeps}}
557
812e5d112f49 more changes
Chengsong
parents: 556
diff changeset
    71
\newcommand{\rder}[2]{#2 \backslash_r #1}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    72
585
4969ef817d92 chap4 more
Chengsong
parents: 579
diff changeset
    73
\def\rerases{\textit{rerase}}
4969ef817d92 chap4 more
Chengsong
parents: 579
diff changeset
    74
554
Chengsong
parents: 543
diff changeset
    75
\def\nonnested{\textit{nonnested}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    76
\def\AZERO{\textit{AZERO}}
558
Chengsong
parents: 557
diff changeset
    77
\def\sizeNregex{\textit{sizeNregex}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    78
\def\AONE{\textit{AONE}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    79
\def\ACHAR{\textit{ACHAR}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    80
585
4969ef817d92 chap4 more
Chengsong
parents: 579
diff changeset
    81
\def\simpsulz{\textit{simp}_{Sulz}}
4969ef817d92 chap4 more
Chengsong
parents: 579
diff changeset
    82
557
812e5d112f49 more changes
Chengsong
parents: 556
diff changeset
    83
\def\scfrewrites{\stackrel{*}{\rightsquigarrow_{scf}}}
555
Chengsong
parents: 554
diff changeset
    84
\def\frewrite{\rightsquigarrow_f}
Chengsong
parents: 554
diff changeset
    85
\def\hrewrite{\rightsquigarrow_h}
Chengsong
parents: 554
diff changeset
    86
\def\grewrite{\rightsquigarrow_g}
Chengsong
parents: 554
diff changeset
    87
\def\frewrites{\stackrel{*}{\rightsquigarrow_f}}
Chengsong
parents: 554
diff changeset
    88
\def\hrewrites{\stackrel{*}{\rightsquigarrow_h}}
Chengsong
parents: 554
diff changeset
    89
\def\grewrites{\stackrel{*}{\rightsquigarrow_g}}
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
    90
\def\fuse{\textit{fuse}}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
    91
\def\bder{\textit{bder}}
542
a7344c9afbaf chapter3 finished
Chengsong
parents: 538
diff changeset
    92
\def\der{\textit{der}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    93
\def\POSIX{\textit{POSIX}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    94
\def\ALTS{\textit{ALTS}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    95
\def\ASTAR{\textit{ASTAR}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    96
\def\DFA{\textit{DFA}}
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
    97
\def\NFA{\textit{NFA}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    98
\def\bmkeps{\textit{bmkeps}}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
    99
\def\bmkepss{\textit{bmkepss}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   100
\def\retrieve{\textit{retrieve}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   101
\def\blexer{\textit{blexer}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   102
\def\flex{\textit{flex}}
573
454ced557605 chapter2 finished polishing
Chengsong
parents: 567
diff changeset
   103
\def\inj{\textit{inj}}
564
Chengsong
parents: 558
diff changeset
   104
\def\Empty{\textit{Empty}}
567
28cb8089ec36 more updaates
Chengsong
parents: 564
diff changeset
   105
\def\Left{\textit{Left}}
28cb8089ec36 more updaates
Chengsong
parents: 564
diff changeset
   106
\def\Right{\textit{Right}}
573
454ced557605 chapter2 finished polishing
Chengsong
parents: 567
diff changeset
   107
\def\Stars{\textit{Stars}}
454ced557605 chapter2 finished polishing
Chengsong
parents: 567
diff changeset
   108
\def\Char{\textit{Char}}
454ced557605 chapter2 finished polishing
Chengsong
parents: 567
diff changeset
   109
\def\Seq{\textit{Seq}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   110
\def\Der{\textit{Der}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   111
\def\Ders{\textit{Ders}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   112
\def\nullable{\mathit{nullable}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   113
\def\Z{\mathit{Z}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   114
\def\S{\mathit{S}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   115
\def\rup{r^\uparrow}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   116
%\def\bderssimp{\mathit{bders}\_\mathit{simp}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   117
\def\distinctWith{\textit{distinctWith}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   118
\def\lf{\textit{lf}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   119
\def\PD{\textit{PD}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   120
\def\suffix{\textit{Suffix}}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   121
\def\distinctBy{\textit{distinctBy}}
558
Chengsong
parents: 557
diff changeset
   122
\def\starupdate{\textit{starUpdate}}
Chengsong
parents: 557
diff changeset
   123
\def\starupdates{\textit{starUpdates}}
620
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
   124
\def\nupdate{\textit{nupdate}}
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
   125
\def\nupdates{\textit{nupdates}}
558
Chengsong
parents: 557
diff changeset
   126
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   127
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   128
\def\size{\mathit{size}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   129
\def\rexp{\mathbf{rexp}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   130
\def\simp{\mathit{simp}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   131
\def\simpALTs{\mathit{simp}\_\mathit{ALTs}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   132
\def\map{\mathit{map}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   133
\def\distinct{\mathit{distinct}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   134
\def\blexersimp{\mathit{blexer}\_\mathit{simp}}
590
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   135
\def\blexerStrong{\textit{blexerStrong}}
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   136
\def\bsimpStrong{\textit{bsimpStrong}}
591
b2d0de6aee18 more polishing integrated comments chap2
Chengsong
parents: 590
diff changeset
   137
\def\bdersStrongs{\textit{bdersStrong}}
590
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   138
\newcommand{\bdersStrong}[2]{#1 \backslash_{bsimpStrongs} #2}
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   139
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   140
\def\map{\textit{map}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   141
\def\rrexp{\textit{rrexp}}
554
Chengsong
parents: 543
diff changeset
   142
\newcommand\rnullable[1]{\textit{rnullable} \; #1 }
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   143
\newcommand\rsize[1]{\llbracket #1 \rrbracket_r}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   144
\newcommand\asize[1]{\llbracket #1 \rrbracket}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   145
\newcommand\rerase[1]{ (#1)_{\downarrow_r}}
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   146
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   147
\newcommand\ChristianComment[1]{\textcolor{blue}{#1}\\}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   148
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   149
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   150
\def\rflts{\textit{rflts}}
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   151
\def\rrewrite{\textit{rrewrite}}
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   152
\def\bsimpalts{\textit{bsimp}_{ALTS}}
596
b306628a0eab more chap 56
Chengsong
parents: 594
diff changeset
   153
\def\bsimpaseq{\textit{bsimp}_{ASEQ}}
b306628a0eab more chap 56
Chengsong
parents: 594
diff changeset
   154
\def\rsimlalts{\textit{rsimp}_{ALTs}}
b306628a0eab more chap 56
Chengsong
parents: 594
diff changeset
   155
\def\rsimpseq{\textit{rsimp}_{SEQ}}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   156
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   157
\def\erase{\textit{erase}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   158
\def\STAR{\textit{STAR}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   159
\def\flts{\textit{flts}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   160
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   161
579
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   162
\def\zeroable{\textit{zeroable}}
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   163
\def\nub{\textit{nub}}
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   164
\def\filter{\textit{filter}}
601
Chengsong
parents: 600
diff changeset
   165
%\def\not{\textit{not}}
579
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   166
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   167
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   168
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   169
\def\RZERO{\mathbf{0}_r }
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   170
\def\RONE{\mathbf{1}_r}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   171
\newcommand\RCHAR[1]{\mathbf{#1}_r}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   172
\newcommand\RSEQ[2]{#1 \cdot #2}
558
Chengsong
parents: 557
diff changeset
   173
\newcommand\RALTS[1]{\sum #1}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   174
\newcommand\RSTAR[1]{#1^*}
558
Chengsong
parents: 557
diff changeset
   175
\newcommand\vsuf[2]{\textit{Suffix} \;#1\;#2}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   176
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   177
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   178
590
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   179
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   180
\lstdefinestyle{myScalastyle}{
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   181
  frame=tb,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   182
  language=scala,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   183
  aboveskip=3mm,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   184
  belowskip=3mm,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   185
  showstringspaces=false,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   186
  columns=flexible,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   187
  basicstyle={\small\ttfamily},
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   188
  numbers=none,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   189
  numberstyle=\tiny\color{gray},
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   190
  keywordstyle=\color{blue},
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   191
  commentstyle=\color{dkgreen},
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   192
  stringstyle=\color{mauve},
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   193
  frame=single,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   194
  breaklines=true,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   195
  breakatwhitespace=true,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   196
  tabsize=3,
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   197
}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   198
590
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   199
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   200
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   201
%This part is about regular expressions, Brzozowski derivatives,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   202
%and a bit-coded lexing algorithm with proven correctness and time bounds.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   203
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   204
%TODO: look up snort rules to use here--give readers idea of what regexes look like
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   205
601
Chengsong
parents: 600
diff changeset
   206
648
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   207
Regular expressions, since their inception in the 1940s, 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   208
have been subject to extensive study and implementation. 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   209
Their primary application lies in lexing, 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   210
a process whereby an unstructured input string is disassembled into a tree of tokens
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   211
with their guidance.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   212
This is often used to match strings that comprises of numerous fields, 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   213
where certain fields may recur or be omitted. 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   214
For example, the regular expression for recognising email addresses is
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   215
\begin{center}
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   216
\verb|^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$|.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   217
\end{center}
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   218
Using this, regular expression matchers and lexers are able to extract 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   219
for example the domain names by the use of \verb|[a-zA-Z0-9.-]+|. 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   220
Consequently, they are an indispensible component in text processing tools 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   221
of software systems such as compilers, IDEs, and firewalls.
601
Chengsong
parents: 600
diff changeset
   222
648
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   223
The study of regular expressions is ongoing due to an
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   224
issue known as catastrophic backtracking. 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   225
This phenomenon refers to scenarios in which the regular expression 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   226
matching or lexing engine exhibits a disproportionately long 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   227
runtime despite the simplicity of the input and expression.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   228
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   229
The origin of catastrophic backtracking is rooted in the 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   230
ambiguities of lexing. 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   231
In the process of matching a multi-character string with 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   232
a regular expression that encompasses several sub-expressions, 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   233
different positions can be designated to mark 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   234
the boundaries of corresponding substrings of the sub-expressions. 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   235
For instance, in matching the string $aaa$ with the 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   236
regular expression $(a+aa)^*$, the divide between 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   237
the initial match and the subsequent iteration could either be 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   238
set between the first and second characters ($a | aa$) or between the second and third characters ($aa | a$). As both the length of the input string and the structural complexity of the regular expression increase, the number of potential delimiter combinations can grow exponentially, leading to a corresponding increase in complexity for algorithms that do not optimally navigate these possibilities.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   239
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   240
Catastrophic backtracking facilitates a category of computationally inexpensive attacks known as Regular Expression Denial of Service (ReDoS) attacks. Here, an attacker merely dispatches a small attack string to a server, provoking high-complexity behaviours in the server's regular expression engine. Such attacks, whether intentional or accidental, have led to the failure of real-world systems (additional details can be found in the practical overview section in chapter \ref{Introduction}).
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   241
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   242
Various disambiguation strategies are employed to select sub-matches, notably including Greedy and POSIX strategies. POSIX, the strategy most widely adopted in practice, invariably opts for the longest possible sub-match. Kuklewicz \cite{KuklewiczHaskell}, for instance, offers a descriptive definition of the POSIX rule in section 1, last paragraph:
601
Chengsong
parents: 600
diff changeset
   243
Chengsong
parents: 600
diff changeset
   244
648
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   245
%Regular expressions 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   246
%have been extensively studied and
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   247
%implemented since their invention in 1940s.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   248
%It is primarily used in lexing, where an unstructured input string is broken
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   249
%down into a tree of tokens.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   250
%That tree's construction is guided by the shape of the regular expression.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   251
%This is particularly useful in expressing the shape of a string
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   252
%with many fields, where certain fields might repeat or be omitted.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   253
%Regular expression matchers and Lexers allow us to 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   254
%identify and delimit different subsections of a string and potentially 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   255
%extract information from inputs, making them
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   256
%an indispensible component in modern software systems' text processing tasks
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   257
%such as compilers, IDEs, and firewalls.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   258
%Research on them is far from over due to the well-known issue called catastrophic-backtracking,
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   259
%which means the regular expression matching or lexing engine takes an unproportional time to run 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   260
%despite the input and the expression being relatively simple.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   261
%
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   262
%Catastrophic backtracking stems from the ambiguities of lexing: 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   263
%when matching a multiple-character string with a regular 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   264
%exression that includes serveral sub-expressions, there might be different positions to set 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   265
%the border between sub-expressions' corresponding sub-strings.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   266
%For example, matching the string $aaa$ against the regular expression
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   267
%$(a+aa)^*$, the border between the initial match and the second iteration
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   268
%could be between the first and second character ($a | aa$) 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   269
%or between the second and third character ($aa | a$).
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   270
%As the size of the input string and the structural complexity 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   271
%of the regular expression increase,
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   272
%the number of different combinations of delimiters can grow exponentially, and
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   273
%algorithms that explore these possibilities unwisely will also see an exponential complexity.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   274
%
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   275
%Catastrophic backtracking allow a class of computationally inexpensive attacks called
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   276
%Regular expression Denial of Service attacks (ReDoS), in which the hacker 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   277
%simply sends out a small attack string to a server, 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   278
%triggering high-complexity behaviours in its regular expression engine. 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   279
%These attacks, be it deliberate or not, have caused real-world systems to go down (see more 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   280
%details of this in the practical overview section in chapter \ref{Introduction}).
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   281
%There are different disambiguation strategies to select sub-matches, most notably Greedy and POSIX.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   282
%The widely adopted strategy in practice is POSIX, which always go for the longest sub-match possible.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   283
%There have been prose definitions like the following
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   284
%by Kuklewicz \cite{KuklewiczHaskell}: 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   285
%described the POSIX rule as (section 1, last paragraph):
646
Chengsong
parents: 638
diff changeset
   286
\begin{quote}
Chengsong
parents: 638
diff changeset
   287
	\begin{itemize}
Chengsong
parents: 638
diff changeset
   288
		\item
Chengsong
parents: 638
diff changeset
   289
regular expressions (REs) take the leftmost starting match, and the longest match starting there
Chengsong
parents: 638
diff changeset
   290
earlier subpatterns have leftmost-longest priority over later subpatterns\\
Chengsong
parents: 638
diff changeset
   291
\item
Chengsong
parents: 638
diff changeset
   292
higher-level subpatterns have leftmost-longest priority over their component subpatterns\\
Chengsong
parents: 638
diff changeset
   293
\item
Chengsong
parents: 638
diff changeset
   294
REs have right associative concatenation which can be changed with parenthesis\\
Chengsong
parents: 638
diff changeset
   295
\item
Chengsong
parents: 638
diff changeset
   296
parenthesized subexpressions return the match from their last usage\\
Chengsong
parents: 638
diff changeset
   297
\item
Chengsong
parents: 638
diff changeset
   298
text of component subexpressions must be contained in the text of the 
Chengsong
parents: 638
diff changeset
   299
higher-level subexpressions\\
Chengsong
parents: 638
diff changeset
   300
\item
Chengsong
parents: 638
diff changeset
   301
if "p" and "q" can never match the same text then "p|q" and "q|p" are equivalent, up to trivial renumbering of captured subexpressions\\
Chengsong
parents: 638
diff changeset
   302
\item
Chengsong
parents: 638
diff changeset
   303
if "p" in "p*" is used to capture non-empty text then additional repetitions of "p" will not capture an empty string\\
Chengsong
parents: 638
diff changeset
   304
\end{itemize}
Chengsong
parents: 638
diff changeset
   305
\end{quote}
648
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   306
However, the author noted that lexers are rarely correct according to this standard.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   307
Being able to formally define and capture the idea of POSIX rules and matching/lexing algorithms and prove 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   308
the correctness of such algorithms against the POSIX semantics definitions
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   309
would be valuable.
646
Chengsong
parents: 638
diff changeset
   310
648
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   311
Formal proofs are machine checked programs
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   312
such as the ones written in Isabelle/HOL, is a powerful means 
646
Chengsong
parents: 638
diff changeset
   313
for computer scientists to be certain about correctness of their algorithms and correctness properties.
648
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   314
This is done by automatically checking that the end goal is derived solely from a list of axioms, definitions
646
Chengsong
parents: 638
diff changeset
   315
and proof rules that are known to be correct.
Chengsong
parents: 638
diff changeset
   316
The problem of regular expressions lexing and catastrophic backtracking are a suitable problem for such
Chengsong
parents: 638
diff changeset
   317
methods as their implementation and complexity analysis tend to be error-prone.
Chengsong
parents: 638
diff changeset
   318
648
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   319
There have been many interesting steps taken in the direction of formal proofs and regular expressions
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   320
lexing and matching.
646
Chengsong
parents: 638
diff changeset
   321
Chengsong
parents: 638
diff changeset
   322
Chengsong
parents: 638
diff changeset
   323
648
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   324
(ends)
646
Chengsong
parents: 638
diff changeset
   325
Chengsong
parents: 638
diff changeset
   326
%Regular expressions are widely used in computer science: 
Chengsong
parents: 638
diff changeset
   327
%be it in text-editors \parencite{atomEditor} with syntax highlighting and auto-completion;
Chengsong
parents: 638
diff changeset
   328
%command-line tools like $\mathit{grep}$ that facilitate easy 
Chengsong
parents: 638
diff changeset
   329
%text-processing \cite{grep}; network intrusion
Chengsong
parents: 638
diff changeset
   330
%detection systems that inspect suspicious traffic; or compiler
Chengsong
parents: 638
diff changeset
   331
%front ends.
Chengsong
parents: 638
diff changeset
   332
%Given their usefulness and ubiquity, one would assume that
Chengsong
parents: 638
diff changeset
   333
%modern regular expression matching implementations
Chengsong
parents: 638
diff changeset
   334
%are mature and fully studied.
Chengsong
parents: 638
diff changeset
   335
%Indeed, in many popular programming languages' regular expression engines, 
Chengsong
parents: 638
diff changeset
   336
%one can supply a regular expression and a string, and in most cases get
Chengsong
parents: 638
diff changeset
   337
%get the matching information in a very short time.
Chengsong
parents: 638
diff changeset
   338
%Those engines can sometimes be blindingly fast--some 
Chengsong
parents: 638
diff changeset
   339
%network intrusion detection systems
Chengsong
parents: 638
diff changeset
   340
%use regular expression engines that are able to process 
Chengsong
parents: 638
diff changeset
   341
%hundreds of megabytes or even gigabytes of data per second \parencite{Turo_ov__2020}.
Chengsong
parents: 638
diff changeset
   342
%However, those engines can sometimes also exhibit a surprising security vulnerability
Chengsong
parents: 638
diff changeset
   343
%under a certain class of inputs.
602
Chengsong
parents: 601
diff changeset
   344
%However, , this is not the case for $\mathbf{all}$ inputs.
601
Chengsong
parents: 600
diff changeset
   345
%TODO: get source for SNORT/BRO's regex matching engine/speed
Chengsong
parents: 600
diff changeset
   346
603
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   347
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   348
Consider for example the regular expression $(a^*)^*\,b$ and 
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   349
strings of the form $aa..a$. These strings cannot be matched by this regular
633
f1edd5ee1a12 more, including link
Chengsong
parents: 631
diff changeset
   350
expression: Obviously the expected $b$ in the last
f1edd5ee1a12 more, including link
Chengsong
parents: 631
diff changeset
   351
position is missing. One would assume that modern regular expression
612
Chengsong
parents: 609
diff changeset
   352
matching engines can find this out very quickly. Surprisingly, if one tries
Chengsong
parents: 609
diff changeset
   353
this example in JavaScript, Python or Java 8, even with small strings, 
Chengsong
parents: 609
diff changeset
   354
say of lenght of around 30 $a$'s,
634
Chengsong
parents: 633
diff changeset
   355
the decision takes an absurd amount of time to finish (see graphs in figure \ref{fig:aStarStarb}).
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   356
The algorithms clearly show exponential behaviour, and as can be seen
612
Chengsong
parents: 609
diff changeset
   357
is triggered by some relatively simple regular expressions.
603
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   358
Java 9 and newer
634
Chengsong
parents: 633
diff changeset
   359
versions improve this behaviour somewhat, but are still slow compared 
612
Chengsong
parents: 609
diff changeset
   360
with the approach we are going to use in this thesis.
603
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   361
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   362
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   363
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   364
This superlinear blowup in regular expression engines
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   365
has caused grief in ``real life'' where it is 
612
Chengsong
parents: 609
diff changeset
   366
given the name ``catastrophic backtracking'' or ``evil'' regular expressions.
603
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   367
For example, on 20 July 2016 one evil
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   368
regular expression brought the webpage
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   369
\href{http://stackexchange.com}{Stack Exchange} to its
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   370
knees.\footnote{\url{https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}(Last accessed in 2019)}
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   371
In this instance, a regular expression intended to just trim white
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   372
spaces from the beginning and the end of a line actually consumed
634
Chengsong
parents: 633
diff changeset
   373
massive amounts of CPU resources---causing the web servers to grind to a
603
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   374
halt. In this example, the time needed to process
634
Chengsong
parents: 633
diff changeset
   375
the string was 
Chengsong
parents: 633
diff changeset
   376
$O(n^2)$ with respect to the string length $n$. This
603
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   377
quadratic overhead was enough for the homepage of Stack Exchange to
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   378
respond so slowly that the load balancer assumed a $\mathit{DoS}$ 
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   379
attack and therefore stopped the servers from responding to any
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   380
requests. This made the whole site become unavailable. 
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   381
601
Chengsong
parents: 600
diff changeset
   382
\begin{figure}[p]
630
d50a309a0645 with Christian
Chengsong
parents: 628
diff changeset
   383
\begin{center}
612
Chengsong
parents: 609
diff changeset
   384
\begin{tabular}{@{}c@{\hspace{0mm}}c@{}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   385
\begin{tikzpicture}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   386
\begin{axis}[
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   387
    xlabel={$n$},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   388
    x label style={at={(1.05,-0.05)}},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   389
    ylabel={time in secs},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   390
    enlargelimits=false,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   391
    xtick={0,5,...,30},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   392
    xmax=33,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   393
    ymax=35,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   394
    ytick={0,5,...,30},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   395
    scaled ticks=false,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   396
    axis lines=left,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   397
    width=5cm,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   398
    height=4cm, 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   399
    legend entries={JavaScript},  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   400
    legend pos=north west,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   401
    legend cell align=left]
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   402
\addplot[red,mark=*, mark options={fill=white}] table {re-js.data};
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   403
\end{axis}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   404
\end{tikzpicture}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   405
  &
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   406
\begin{tikzpicture}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   407
\begin{axis}[
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   408
    xlabel={$n$},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   409
    x label style={at={(1.05,-0.05)}},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   410
    %ylabel={time in secs},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   411
    enlargelimits=false,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   412
    xtick={0,5,...,30},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   413
    xmax=33,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   414
    ymax=35,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   415
    ytick={0,5,...,30},
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   416
    scaled ticks=false,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   417
    axis lines=left,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   418
    width=5cm,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   419
    height=4cm, 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   420
    legend entries={Python},  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   421
    legend pos=north west,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   422
    legend cell align=left]
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   423
\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   424
\end{axis}
612
Chengsong
parents: 609
diff changeset
   425
\end{tikzpicture}\\ 
601
Chengsong
parents: 600
diff changeset
   426
\begin{tikzpicture}
Chengsong
parents: 600
diff changeset
   427
\begin{axis}[
Chengsong
parents: 600
diff changeset
   428
    xlabel={$n$},
Chengsong
parents: 600
diff changeset
   429
    x label style={at={(1.05,-0.05)}},
Chengsong
parents: 600
diff changeset
   430
    ylabel={time in secs},
Chengsong
parents: 600
diff changeset
   431
    enlargelimits=false,
Chengsong
parents: 600
diff changeset
   432
    xtick={0,5,...,30},
Chengsong
parents: 600
diff changeset
   433
    xmax=33,
Chengsong
parents: 600
diff changeset
   434
    ymax=35,
Chengsong
parents: 600
diff changeset
   435
    ytick={0,5,...,30},
Chengsong
parents: 600
diff changeset
   436
    scaled ticks=false,
Chengsong
parents: 600
diff changeset
   437
    axis lines=left,
Chengsong
parents: 600
diff changeset
   438
    width=5cm,
Chengsong
parents: 600
diff changeset
   439
    height=4cm, 
612
Chengsong
parents: 609
diff changeset
   440
    legend entries={Java 8},  
601
Chengsong
parents: 600
diff changeset
   441
    legend pos=north west,
Chengsong
parents: 600
diff changeset
   442
    legend cell align=left]
612
Chengsong
parents: 609
diff changeset
   443
\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
601
Chengsong
parents: 600
diff changeset
   444
\end{axis}
Chengsong
parents: 600
diff changeset
   445
\end{tikzpicture}
Chengsong
parents: 600
diff changeset
   446
  &
Chengsong
parents: 600
diff changeset
   447
\begin{tikzpicture}
Chengsong
parents: 600
diff changeset
   448
\begin{axis}[
Chengsong
parents: 600
diff changeset
   449
    xlabel={$n$},
Chengsong
parents: 600
diff changeset
   450
    x label style={at={(1.05,-0.05)}},
Chengsong
parents: 600
diff changeset
   451
    %ylabel={time in secs},
Chengsong
parents: 600
diff changeset
   452
    enlargelimits=false,
Chengsong
parents: 600
diff changeset
   453
    xtick={0,5,...,30},
Chengsong
parents: 600
diff changeset
   454
    xmax=33,
Chengsong
parents: 600
diff changeset
   455
    ymax=35,
Chengsong
parents: 600
diff changeset
   456
    ytick={0,5,...,30},
Chengsong
parents: 600
diff changeset
   457
    scaled ticks=false,
Chengsong
parents: 600
diff changeset
   458
    axis lines=left,
Chengsong
parents: 600
diff changeset
   459
    width=5cm,
Chengsong
parents: 600
diff changeset
   460
    height=4cm, 
612
Chengsong
parents: 609
diff changeset
   461
    legend entries={Dart},  
Chengsong
parents: 609
diff changeset
   462
    legend pos=north west,
Chengsong
parents: 609
diff changeset
   463
    legend cell align=left]
Chengsong
parents: 609
diff changeset
   464
\addplot[green,mark=*, mark options={fill=white}] table {re-dart.data};
Chengsong
parents: 609
diff changeset
   465
\end{axis}
Chengsong
parents: 609
diff changeset
   466
\end{tikzpicture}\\ 
Chengsong
parents: 609
diff changeset
   467
\begin{tikzpicture}
Chengsong
parents: 609
diff changeset
   468
\begin{axis}[
Chengsong
parents: 609
diff changeset
   469
    xlabel={$n$},
Chengsong
parents: 609
diff changeset
   470
    x label style={at={(1.05,-0.05)}},
Chengsong
parents: 609
diff changeset
   471
    ylabel={time in secs},
Chengsong
parents: 609
diff changeset
   472
    enlargelimits=false,
Chengsong
parents: 609
diff changeset
   473
    xtick={0,5,...,30},
Chengsong
parents: 609
diff changeset
   474
    xmax=33,
Chengsong
parents: 609
diff changeset
   475
    ymax=35,
Chengsong
parents: 609
diff changeset
   476
    ytick={0,5,...,30},
Chengsong
parents: 609
diff changeset
   477
    scaled ticks=false,
Chengsong
parents: 609
diff changeset
   478
    axis lines=left,
Chengsong
parents: 609
diff changeset
   479
    width=5cm,
Chengsong
parents: 609
diff changeset
   480
    height=4cm, 
601
Chengsong
parents: 600
diff changeset
   481
    legend entries={Swift},  
Chengsong
parents: 600
diff changeset
   482
    legend pos=north west,
Chengsong
parents: 600
diff changeset
   483
    legend cell align=left]
Chengsong
parents: 600
diff changeset
   484
\addplot[purple,mark=*, mark options={fill=white}] table {re-swift.data};
Chengsong
parents: 600
diff changeset
   485
\end{axis}
Chengsong
parents: 600
diff changeset
   486
\end{tikzpicture}
612
Chengsong
parents: 609
diff changeset
   487
  & 
Chengsong
parents: 609
diff changeset
   488
\begin{tikzpicture}
Chengsong
parents: 609
diff changeset
   489
\begin{axis}[
Chengsong
parents: 609
diff changeset
   490
    xlabel={$n$},
Chengsong
parents: 609
diff changeset
   491
    x label style={at={(1.05,-0.05)}},
Chengsong
parents: 609
diff changeset
   492
    %ylabel={time in secs},
Chengsong
parents: 609
diff changeset
   493
    enlargelimits=true,
Chengsong
parents: 609
diff changeset
   494
    %xtick={0,5000,...,40000},
Chengsong
parents: 609
diff changeset
   495
    %xmax=40000,
Chengsong
parents: 609
diff changeset
   496
    %ymax=35,
Chengsong
parents: 609
diff changeset
   497
    restrict x to domain*=0:40000,
Chengsong
parents: 609
diff changeset
   498
    restrict y to domain*=0:35,
Chengsong
parents: 609
diff changeset
   499
    %ytick={0,5,...,30},
Chengsong
parents: 609
diff changeset
   500
    %scaled ticks=false,
Chengsong
parents: 609
diff changeset
   501
    axis lines=left,
Chengsong
parents: 609
diff changeset
   502
    width=5cm,
Chengsong
parents: 609
diff changeset
   503
    height=4cm, 
Chengsong
parents: 609
diff changeset
   504
    legend entries={Java9+},  
Chengsong
parents: 609
diff changeset
   505
    legend pos=north west,
Chengsong
parents: 609
diff changeset
   506
    legend cell align=left]
Chengsong
parents: 609
diff changeset
   507
\addplot[orange,mark=*, mark options={fill=white}] table {re-java9.data};
Chengsong
parents: 609
diff changeset
   508
\end{axis}
Chengsong
parents: 609
diff changeset
   509
\end{tikzpicture}\\ 
Chengsong
parents: 609
diff changeset
   510
\multicolumn{2}{c}{Graphs}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   511
\end{tabular}    
630
d50a309a0645 with Christian
Chengsong
parents: 628
diff changeset
   512
\end{center}
601
Chengsong
parents: 600
diff changeset
   513
\caption{Graphs showing runtime for matching $(a^*)^*\,b$ with strings 
Chengsong
parents: 600
diff changeset
   514
           of the form $\protect\underbrace{aa..a}_{n}$ in various existing regular expression libraries.
612
Chengsong
parents: 609
diff changeset
   515
   The reason for their superlinear behaviour is that they do a depth-first-search
Chengsong
parents: 609
diff changeset
   516
   using NFAs.
Chengsong
parents: 609
diff changeset
   517
   If the string does not match, the regular expression matching
Chengsong
parents: 609
diff changeset
   518
   engine starts to explore all possibilities. 
601
Chengsong
parents: 600
diff changeset
   519
}\label{fig:aStarStarb}
Chengsong
parents: 600
diff changeset
   520
\end{figure}\afterpage{\clearpage}
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   521
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   522
A more recent example is a global outage of all Cloudflare servers on 2 July
612
Chengsong
parents: 609
diff changeset
   523
2019. A poorly written regular expression exhibited catastrophic backtracking
Chengsong
parents: 609
diff changeset
   524
and exhausted CPUs that serve HTTP traffic. Although the outage
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   525
had several causes, at the heart was a regular expression that
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   526
was used to monitor network
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   527
traffic.\footnote{\url{https://blog.cloudflare.com/details-of-the-cloudflare-outage-on-july-2-2019/}(Last accessed in 2022)}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   528
These problems with regular expressions 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   529
are not isolated events that happen
634
Chengsong
parents: 633
diff changeset
   530
very rarely, 
Chengsong
parents: 633
diff changeset
   531
%but actually widespread.
Chengsong
parents: 633
diff changeset
   532
%They occur so often that they have a 
Chengsong
parents: 633
diff changeset
   533
but they occur actually often enough that they have a
612
Chengsong
parents: 609
diff changeset
   534
name: Regular-Expression-Denial-Of-Service (ReDoS)
634
Chengsong
parents: 633
diff changeset
   535
attacks.
Chengsong
parents: 633
diff changeset
   536
Davis et al. \cite{Davis18} detected more
612
Chengsong
parents: 609
diff changeset
   537
than 1000 evil regular expressions
635
Chengsong
parents: 634
diff changeset
   538
in Node.js, Python core libraries, npm and pypi. 
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   539
They therefore concluded that evil regular expressions
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   540
are a real problem rather than just "a parlour trick".
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   541
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   542
The work in this thesis aims to address this issue
603
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   543
with the help of formal proofs.
612
Chengsong
parents: 609
diff changeset
   544
We describe a lexing algorithm based
635
Chengsong
parents: 634
diff changeset
   545
on Brzozowski derivatives with verified correctness 
Chengsong
parents: 634
diff changeset
   546
and a finiteness property for the size of derivatives
Chengsong
parents: 634
diff changeset
   547
(which are all done in Isabelle/HOL).
612
Chengsong
parents: 609
diff changeset
   548
Such properties %guarantee the absence of 
Chengsong
parents: 609
diff changeset
   549
are an important step in preventing
Chengsong
parents: 609
diff changeset
   550
catastrophic backtracking once and for all.
604
Chengsong
parents: 603
diff changeset
   551
We will give more details in the next sections
Chengsong
parents: 603
diff changeset
   552
on (i) why the slow cases in graph \ref{fig:aStarStarb}
612
Chengsong
parents: 609
diff changeset
   553
can occur in traditional regular expression engines
604
Chengsong
parents: 603
diff changeset
   554
and (ii) why we choose our 
612
Chengsong
parents: 609
diff changeset
   555
approach based on Brzozowski derivatives and formal proofs.
602
Chengsong
parents: 601
diff changeset
   556
603
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   557
612
Chengsong
parents: 609
diff changeset
   558
\section{Preliminaries}%Regex, and the Problems with Regex Matchers}
601
Chengsong
parents: 600
diff changeset
   559
Regular expressions and regular expression matchers 
635
Chengsong
parents: 634
diff changeset
   560
have clearly been studied for many, many years.
612
Chengsong
parents: 609
diff changeset
   561
Theoretical results in automata theory state 
604
Chengsong
parents: 603
diff changeset
   562
that basic regular expression matching should be linear
605
Chengsong
parents: 604
diff changeset
   563
w.r.t the input.
Chengsong
parents: 604
diff changeset
   564
This assumes that the regular expression
Chengsong
parents: 604
diff changeset
   565
$r$ was pre-processed and turned into a
635
Chengsong
parents: 634
diff changeset
   566
deterministic finite automaton (DFA) before matching \cite{Sakarovitch2009}.
604
Chengsong
parents: 603
diff changeset
   567
By basic we mean textbook definitions such as the one
612
Chengsong
parents: 609
diff changeset
   568
below, involving only regular expressions for characters, alternatives,
604
Chengsong
parents: 603
diff changeset
   569
sequences, and Kleene stars:
Chengsong
parents: 603
diff changeset
   570
\[
612
Chengsong
parents: 609
diff changeset
   571
	r ::= c | r_1 + r_2 | r_1 \cdot r_2 | r^*
604
Chengsong
parents: 603
diff changeset
   572
\]
Chengsong
parents: 603
diff changeset
   573
Modern regular expression matchers used by programmers,
Chengsong
parents: 603
diff changeset
   574
however,
635
Chengsong
parents: 634
diff changeset
   575
support much richer constructs, such as bounded repetitions,
Chengsong
parents: 634
diff changeset
   576
negations,
604
Chengsong
parents: 603
diff changeset
   577
and back-references.
612
Chengsong
parents: 609
diff changeset
   578
To differentiate, we use the word \emph{regex} to refer
605
Chengsong
parents: 604
diff changeset
   579
to those expressions with richer constructs while reserving the
Chengsong
parents: 604
diff changeset
   580
term \emph{regular expression}
Chengsong
parents: 604
diff changeset
   581
for the more traditional meaning in formal languages theory.
Chengsong
parents: 604
diff changeset
   582
We follow this convention 
Chengsong
parents: 604
diff changeset
   583
in this thesis.
Chengsong
parents: 604
diff changeset
   584
In the future, we aim to support all the popular features of regexes, 
612
Chengsong
parents: 609
diff changeset
   585
but for this work we mainly look at basic regular expressions
Chengsong
parents: 609
diff changeset
   586
and bounded repetitions.
604
Chengsong
parents: 603
diff changeset
   587
605
Chengsong
parents: 604
diff changeset
   588
Chengsong
parents: 604
diff changeset
   589
Chengsong
parents: 604
diff changeset
   590
%Most modern regex libraries
Chengsong
parents: 604
diff changeset
   591
%the so-called PCRE standard (Peral Compatible Regular Expressions)
Chengsong
parents: 604
diff changeset
   592
%has the back-references
612
Chengsong
parents: 609
diff changeset
   593
Regexes come with a number of constructs
605
Chengsong
parents: 604
diff changeset
   594
that make it more convenient for 
604
Chengsong
parents: 603
diff changeset
   595
programmers to write regular expressions.
612
Chengsong
parents: 609
diff changeset
   596
Depending on the types of constructs
605
Chengsong
parents: 604
diff changeset
   597
the task of matching and lexing with them
612
Chengsong
parents: 609
diff changeset
   598
will have different levels of complexity.
635
Chengsong
parents: 634
diff changeset
   599
Some of those constructs are syntactic sugars that are
604
Chengsong
parents: 603
diff changeset
   600
simply short hand notations
605
Chengsong
parents: 604
diff changeset
   601
that save the programmers a few keystrokes.
612
Chengsong
parents: 609
diff changeset
   602
These will not cause problems for regex libraries.
605
Chengsong
parents: 604
diff changeset
   603
For example the
612
Chengsong
parents: 609
diff changeset
   604
non-binary alternative involving three or more choices just means:
604
Chengsong
parents: 603
diff changeset
   605
\[
605
Chengsong
parents: 604
diff changeset
   606
	(a | b | c) \stackrel{means}{=} ((a + b)+ c)
604
Chengsong
parents: 603
diff changeset
   607
\]
635
Chengsong
parents: 634
diff changeset
   608
Similarly, the range operator
Chengsong
parents: 634
diff changeset
   609
%used to express the alternative
Chengsong
parents: 634
diff changeset
   610
%of all characters between its operands, 
Chengsong
parents: 634
diff changeset
   611
is just a concise way
Chengsong
parents: 634
diff changeset
   612
of expressing an alternative of consecutive characters:
604
Chengsong
parents: 603
diff changeset
   613
\[
635
Chengsong
parents: 634
diff changeset
   614
	[0~-9]\stackrel{means}{=} (0 | 1 | \ldots | 9 )  
604
Chengsong
parents: 603
diff changeset
   615
\]
612
Chengsong
parents: 609
diff changeset
   616
for an alternative. The
635
Chengsong
parents: 634
diff changeset
   617
wildcard character '$.$' is used to refer to any single character,
605
Chengsong
parents: 604
diff changeset
   618
\[
Chengsong
parents: 604
diff changeset
   619
	. \stackrel{means}{=} [0-9a-zA-Z+-()*\&\ldots]
Chengsong
parents: 604
diff changeset
   620
\]
612
Chengsong
parents: 609
diff changeset
   621
except the newline.
604
Chengsong
parents: 603
diff changeset
   622
605
Chengsong
parents: 604
diff changeset
   623
\subsection{Bounded Repetitions}
612
Chengsong
parents: 609
diff changeset
   624
More interesting are bounded repetitions, which can 
Chengsong
parents: 609
diff changeset
   625
make the regular expressions much
605
Chengsong
parents: 604
diff changeset
   626
more compact.
635
Chengsong
parents: 634
diff changeset
   627
Normally there are four kinds of bounded repetitions:
612
Chengsong
parents: 609
diff changeset
   628
$r^{\{n\}}$, $r^{\{\ldots m\}}$, $r^{\{n\ldots \}}$ and $r^{\{n\ldots m\}}$
Chengsong
parents: 609
diff changeset
   629
(where $n$ and $m$ are constant natural numbers).
Chengsong
parents: 609
diff changeset
   630
Like the star regular expressions, the set of strings or language
Chengsong
parents: 609
diff changeset
   631
a bounded regular expression can match
Chengsong
parents: 609
diff changeset
   632
is defined using the power operation on sets:
605
Chengsong
parents: 604
diff changeset
   633
\begin{center}
Chengsong
parents: 604
diff changeset
   634
	\begin{tabular}{lcl}
Chengsong
parents: 604
diff changeset
   635
		$L \; r^{\{n\}}$ & $\dn$ & $(L \; r)^n$\\
Chengsong
parents: 604
diff changeset
   636
		$L \; r^{\{\ldots m\}}$ & $\dn$ & $\bigcup_{0 \leq i \leq m}. (L \; r)^i$\\
Chengsong
parents: 604
diff changeset
   637
		$L \; r^{\{n\ldots \}}$ & $\dn$ & $\bigcup_{n \leq i}. (L \; r)^i$\\
Chengsong
parents: 604
diff changeset
   638
		$L \; r^{\{n \ldots m\}}$ & $\dn$ & $\bigcup_{n \leq i \leq m}. (L \; r)^i$
Chengsong
parents: 604
diff changeset
   639
	\end{tabular}
Chengsong
parents: 604
diff changeset
   640
\end{center}
612
Chengsong
parents: 609
diff changeset
   641
The attraction of bounded repetitions is that they can be
635
Chengsong
parents: 634
diff changeset
   642
used to avoid a size blow up: for example $r^{\{n\}}$
612
Chengsong
parents: 609
diff changeset
   643
is a shorthand for
635
Chengsong
parents: 634
diff changeset
   644
the much longer regular expression:
605
Chengsong
parents: 604
diff changeset
   645
\[
Chengsong
parents: 604
diff changeset
   646
	\underbrace{r\ldots r}_\text{n copies of r}.
Chengsong
parents: 604
diff changeset
   647
\]
Chengsong
parents: 604
diff changeset
   648
%Therefore, a naive algorithm that simply unfolds
Chengsong
parents: 604
diff changeset
   649
%them into their desugared forms
Chengsong
parents: 604
diff changeset
   650
%will suffer from at least an exponential runtime increase.
603
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   651
612
Chengsong
parents: 609
diff changeset
   652
Chengsong
parents: 609
diff changeset
   653
The problem with matching 
635
Chengsong
parents: 634
diff changeset
   654
such bounded repetitions
612
Chengsong
parents: 609
diff changeset
   655
is that tools based on the classic notion of
Chengsong
parents: 609
diff changeset
   656
automata need to expand $r^{\{n\}}$ into $n$ connected 
603
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   657
copies of the automaton for $r$. This leads to very inefficient matching
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   658
algorithms  or algorithms that consume large amounts of memory.
605
Chengsong
parents: 604
diff changeset
   659
Implementations using $\DFA$s will
635
Chengsong
parents: 634
diff changeset
   660
in such situations
605
Chengsong
parents: 604
diff changeset
   661
either become excruciatingly slow 
635
Chengsong
parents: 634
diff changeset
   662
(for example Verbatim++ \cite{Verbatimpp}) or run
Chengsong
parents: 634
diff changeset
   663
out of memory (for example $\mathit{LEX}$ and 
Chengsong
parents: 634
diff changeset
   664
$\mathit{JFLEX}$\footnote{LEX and JFLEX are lexer generators
605
Chengsong
parents: 604
diff changeset
   665
in C and JAVA that generate $\mathit{DFA}$-based
Chengsong
parents: 604
diff changeset
   666
lexers. The user provides a set of regular expressions
635
Chengsong
parents: 634
diff changeset
   667
and configurations, and then 
605
Chengsong
parents: 604
diff changeset
   668
gets an output program encoding a minimized $\mathit{DFA}$
Chengsong
parents: 604
diff changeset
   669
that can be compiled and run. 
Chengsong
parents: 604
diff changeset
   670
When given the above countdown regular expression,
635
Chengsong
parents: 634
diff changeset
   671
a small $n$ (say 20) would result in a program representing a
Chengsong
parents: 634
diff changeset
   672
DFA
612
Chengsong
parents: 609
diff changeset
   673
with millions of states.}) for large counters.
Chengsong
parents: 609
diff changeset
   674
A classic example for this phenomenon is the regular expression $(a+b)^*  a (a+b)^{n}$
605
Chengsong
parents: 604
diff changeset
   675
where the minimal DFA requires at least $2^{n+1}$ states.
Chengsong
parents: 604
diff changeset
   676
For example, when $n$ is equal to 2,
635
Chengsong
parents: 634
diff changeset
   677
the corresponding $\mathit{NFA}$ looks like:
Chengsong
parents: 634
diff changeset
   678
\vspace{6mm}
604
Chengsong
parents: 603
diff changeset
   679
\begin{center}
Chengsong
parents: 603
diff changeset
   680
\begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto] 
Chengsong
parents: 603
diff changeset
   681
   \node[state,initial] (q_0)   {$q_0$}; 
Chengsong
parents: 603
diff changeset
   682
   \node[state, red] (q_1) [right=of q_0] {$q_1$}; 
Chengsong
parents: 603
diff changeset
   683
   \node[state, red] (q_2) [right=of q_1] {$q_2$}; 
Chengsong
parents: 603
diff changeset
   684
   \node[state, accepting, red](q_3) [right=of q_2] {$q_3$};
Chengsong
parents: 603
diff changeset
   685
    \path[->] 
Chengsong
parents: 603
diff changeset
   686
    (q_0) edge  node {a} (q_1)
Chengsong
parents: 603
diff changeset
   687
    	  edge [loop below] node {a,b} ()
Chengsong
parents: 603
diff changeset
   688
    (q_1) edge  node  {a,b} (q_2)
Chengsong
parents: 603
diff changeset
   689
    (q_2) edge  node  {a,b} (q_3);
Chengsong
parents: 603
diff changeset
   690
\end{tikzpicture}
Chengsong
parents: 603
diff changeset
   691
\end{center}
635
Chengsong
parents: 634
diff changeset
   692
and when turned into a DFA by the subset construction
612
Chengsong
parents: 609
diff changeset
   693
requires at least $2^3$ states.\footnote{The 
635
Chengsong
parents: 634
diff changeset
   694
red states are "countdown states" which count down 
604
Chengsong
parents: 603
diff changeset
   695
the number of characters needed in addition to the current
Chengsong
parents: 603
diff changeset
   696
string to make a successful match.
Chengsong
parents: 603
diff changeset
   697
For example, state $q_1$ indicates a match that has
Chengsong
parents: 603
diff changeset
   698
gone past the $(a|b)^*$ part of $(a|b)^*a(a|b)^{\{2\}}$,
Chengsong
parents: 603
diff changeset
   699
and just consumed the "delimiter" $a$ in the middle, and 
635
Chengsong
parents: 634
diff changeset
   700
needs to match 2 more iterations of $(a|b)$ to complete.
604
Chengsong
parents: 603
diff changeset
   701
State $q_2$ on the other hand, can be viewed as a state
Chengsong
parents: 603
diff changeset
   702
after $q_1$ has consumed 1 character, and just waits
Chengsong
parents: 603
diff changeset
   703
for 1 more character to complete.
635
Chengsong
parents: 634
diff changeset
   704
The state $q_3$ is the last (accepting) state, requiring 0 
Chengsong
parents: 634
diff changeset
   705
more characters.
604
Chengsong
parents: 603
diff changeset
   706
Depending on the suffix of the
Chengsong
parents: 603
diff changeset
   707
input string up to the current read location,
Chengsong
parents: 603
diff changeset
   708
the states $q_1$ and $q_2$, $q_3$
Chengsong
parents: 603
diff changeset
   709
may or may
635
Chengsong
parents: 634
diff changeset
   710
not be active.
604
Chengsong
parents: 603
diff changeset
   711
A $\mathit{DFA}$ for such an $\mathit{NFA}$ would
Chengsong
parents: 603
diff changeset
   712
contain at least $2^3$ non-equivalent states that cannot be merged, 
Chengsong
parents: 603
diff changeset
   713
because the subset construction during determinisation will generate
Chengsong
parents: 603
diff changeset
   714
all the elements in the power set $\mathit{Pow}\{q_1, q_2, q_3\}$.
Chengsong
parents: 603
diff changeset
   715
Generalizing this to regular expressions with larger
Chengsong
parents: 603
diff changeset
   716
bounded repetitions number, we have that
Chengsong
parents: 603
diff changeset
   717
regexes shaped like $r^*ar^{\{n\}}$ when converted to $\mathit{DFA}$s
605
Chengsong
parents: 604
diff changeset
   718
would require at least $2^{n+1}$ states, if $r$ itself contains
604
Chengsong
parents: 603
diff changeset
   719
more than 1 string.
Chengsong
parents: 603
diff changeset
   720
This is to represent all different 
635
Chengsong
parents: 634
diff changeset
   721
scenarios in which "countdown" states are active.}
603
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   722
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   723
635
Chengsong
parents: 634
diff changeset
   724
Bounded repetitions are important because they
Chengsong
parents: 634
diff changeset
   725
tend to occur frequently in practical use,
Chengsong
parents: 634
diff changeset
   726
for example in the regex library RegExLib, in
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   727
the rules library of Snort \cite{Snort1999}\footnote{
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   728
Snort is a network intrusion detection (NID) tool
608
37b6fd310a16 added related work chap
Chengsong
parents: 607
diff changeset
   729
for monitoring network traffic.
37b6fd310a16 added related work chap
Chengsong
parents: 607
diff changeset
   730
The network security community curates a list
37b6fd310a16 added related work chap
Chengsong
parents: 607
diff changeset
   731
of malicious patterns written as regexes,
37b6fd310a16 added related work chap
Chengsong
parents: 607
diff changeset
   732
which is used by Snort's detection engine
37b6fd310a16 added related work chap
Chengsong
parents: 607
diff changeset
   733
to match against network traffic for any hostile
37b6fd310a16 added related work chap
Chengsong
parents: 607
diff changeset
   734
activities such as buffer overflow attacks.}, 
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   735
as well as in XML Schema definitions (XSDs).
605
Chengsong
parents: 604
diff changeset
   736
According to Bj\"{o}rklund et al \cite{xml2015},
Chengsong
parents: 604
diff changeset
   737
more than half of the 
612
Chengsong
parents: 609
diff changeset
   738
XSDs they found on the Maven.org central repository
Chengsong
parents: 609
diff changeset
   739
have bounded regular expressions in them.
Chengsong
parents: 609
diff changeset
   740
Often the counters are quite large, with the largest being
635
Chengsong
parents: 634
diff changeset
   741
close to ten million. 
Chengsong
parents: 634
diff changeset
   742
A smaller sample XSD they gave
612
Chengsong
parents: 609
diff changeset
   743
is:
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   744
\lstset{
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   745
	basicstyle=\fontsize{8.5}{9}\ttfamily,
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   746
  language=XML,
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   747
  morekeywords={encoding,
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   748
    xs:schema,xs:element,xs:complexType,xs:sequence,xs:attribute}
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   749
}
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   750
\begin{lstlisting}
612
Chengsong
parents: 609
diff changeset
   751
<sequence minOccurs="0" maxOccurs="65535">
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   752
	<element name="TimeIncr" type="mpeg7:MediaIncrDurationType"/>
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   753
 	<element name="MotionParams" type="float" minOccurs="2" maxOccurs="12"/>
612
Chengsong
parents: 609
diff changeset
   754
</sequence>
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   755
\end{lstlisting}
635
Chengsong
parents: 634
diff changeset
   756
This can be seen as the regex
605
Chengsong
parents: 604
diff changeset
   757
$(ab^{2\ldots 12})^{0 \ldots 65535}$, where $a$ and $b$ are themselves
Chengsong
parents: 604
diff changeset
   758
regular expressions 
Chengsong
parents: 604
diff changeset
   759
satisfying certain constraints (such as 
Chengsong
parents: 604
diff changeset
   760
satisfying the floating point number format).
Chengsong
parents: 604
diff changeset
   761
It is therefore quite unsatisfying that 
Chengsong
parents: 604
diff changeset
   762
some regular expressions matching libraries
Chengsong
parents: 604
diff changeset
   763
impose adhoc limits
Chengsong
parents: 604
diff changeset
   764
for bounded regular expressions:
Chengsong
parents: 604
diff changeset
   765
For example, in the regular expression matching library in the Go
Chengsong
parents: 604
diff changeset
   766
language the regular expression $a^{1001}$ is not permitted, because no counter
Chengsong
parents: 604
diff changeset
   767
can be above 1000, and in the built-in Rust regular expression library
Chengsong
parents: 604
diff changeset
   768
expressions such as $a^{\{1000\}\{100\}\{5\}}$ give an error message
606
Chengsong
parents: 605
diff changeset
   769
for being too big. 
635
Chengsong
parents: 634
diff changeset
   770
As Becchi and Crawley \cite{Becchi08}  have pointed out,
606
Chengsong
parents: 605
diff changeset
   771
the reason for these restrictions
612
Chengsong
parents: 609
diff changeset
   772
is that they simulate a non-deterministic finite
606
Chengsong
parents: 605
diff changeset
   773
automata (NFA) with a breadth-first search.
Chengsong
parents: 605
diff changeset
   774
This way the number of active states could
Chengsong
parents: 605
diff changeset
   775
be equal to the counter number.
Chengsong
parents: 605
diff changeset
   776
When the counters are large, 
Chengsong
parents: 605
diff changeset
   777
the memory requirement could become
612
Chengsong
parents: 609
diff changeset
   778
infeasible, and a regex engine
635
Chengsong
parents: 634
diff changeset
   779
like in Go will reject this pattern straight away.
606
Chengsong
parents: 605
diff changeset
   780
\begin{figure}[H]
Chengsong
parents: 605
diff changeset
   781
\begin{center}
Chengsong
parents: 605
diff changeset
   782
\begin{tikzpicture} [node distance = 2cm, on grid, auto]
Chengsong
parents: 605
diff changeset
   783
 
Chengsong
parents: 605
diff changeset
   784
    	\node (q0) [state, initial] {$0$};
Chengsong
parents: 605
diff changeset
   785
	\node (q1) [state, right = of q0] {$1$};
612
Chengsong
parents: 609
diff changeset
   786
	%\node (q2) [state, right = of q1] {$2$};
Chengsong
parents: 609
diff changeset
   787
	\node (qdots) [right = of q1] {$\ldots$};
606
Chengsong
parents: 605
diff changeset
   788
	\node (qn) [state, right = of qdots] {$n$};
Chengsong
parents: 605
diff changeset
   789
	\node (qn1) [state, right = of qn] {$n+1$};
Chengsong
parents: 605
diff changeset
   790
	\node (qn2) [state, right = of qn1] {$n+2$};
Chengsong
parents: 605
diff changeset
   791
	\node (qn3) [state, accepting, right = of qn2] {$n+3$}; 
Chengsong
parents: 605
diff changeset
   792
 
Chengsong
parents: 605
diff changeset
   793
\path [-stealth, thick]
Chengsong
parents: 605
diff changeset
   794
	(q0) edge [loop above] node {a} ()
Chengsong
parents: 605
diff changeset
   795
    (q0) edge node {a}   (q1) 
612
Chengsong
parents: 609
diff changeset
   796
    %(q1) edge node {.}   (q2)
Chengsong
parents: 609
diff changeset
   797
    (q1) edge node {.}   (qdots)
606
Chengsong
parents: 605
diff changeset
   798
    (qdots) edge node {.} (qn)
Chengsong
parents: 605
diff changeset
   799
    (qn) edge node {.} (qn1)
Chengsong
parents: 605
diff changeset
   800
    (qn1) edge node {b} (qn2)
Chengsong
parents: 605
diff changeset
   801
    (qn2) edge node {$c$} (qn3);
Chengsong
parents: 605
diff changeset
   802
\end{tikzpicture}
Chengsong
parents: 605
diff changeset
   803
%\begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto] 
Chengsong
parents: 605
diff changeset
   804
%   \node[state,initial] (q_0)   {$0$}; 
Chengsong
parents: 605
diff changeset
   805
%   \node[state, ] (q_1) [right=of q_0] {$1$}; 
Chengsong
parents: 605
diff changeset
   806
%   \node[state, ] (q_2) [right=of q_1] {$2$}; 
Chengsong
parents: 605
diff changeset
   807
%   \node[state,
Chengsong
parents: 605
diff changeset
   808
%   \node[state, accepting, ](q_3) [right=of q_2] {$3$};
Chengsong
parents: 605
diff changeset
   809
%    \path[->] 
Chengsong
parents: 605
diff changeset
   810
%    (q_0) edge  node {a} (q_1)
Chengsong
parents: 605
diff changeset
   811
%    	  edge [loop below] node {a,b} ()
Chengsong
parents: 605
diff changeset
   812
%    (q_1) edge  node  {a,b} (q_2)
Chengsong
parents: 605
diff changeset
   813
%    (q_2) edge  node  {a,b} (q_3);
Chengsong
parents: 605
diff changeset
   814
%\end{tikzpicture}
Chengsong
parents: 605
diff changeset
   815
\end{center}
Chengsong
parents: 605
diff changeset
   816
\caption{The example given by Becchi and Crawley
Chengsong
parents: 605
diff changeset
   817
	that NFA simulation can consume large
Chengsong
parents: 605
diff changeset
   818
	amounts of memory: $.^*a.^{\{n\}}bc$ matching
Chengsong
parents: 605
diff changeset
   819
	strings of the form $aaa\ldots aaaabc$.
Chengsong
parents: 605
diff changeset
   820
	When traversing in a breadth-first manner,
Chengsong
parents: 605
diff changeset
   821
all states from 0 till $n+1$ will become active.}
Chengsong
parents: 605
diff changeset
   822
\end{figure}
Chengsong
parents: 605
diff changeset
   823
%Languages like $\mathit{Go}$ and $\mathit{Rust}$ use this
Chengsong
parents: 605
diff changeset
   824
%type of $\mathit{NFA}$ simulation and guarantees a linear runtime
Chengsong
parents: 605
diff changeset
   825
%in terms of input string length.
Chengsong
parents: 605
diff changeset
   826
%TODO:try out these lexers
Chengsong
parents: 605
diff changeset
   827
These problems can of course be solved in matching algorithms where 
605
Chengsong
parents: 604
diff changeset
   828
automata go beyond the classic notion and for instance include explicit
Chengsong
parents: 604
diff changeset
   829
counters \cite{Turo_ov__2020}.
612
Chengsong
parents: 609
diff changeset
   830
These solutions can be quite efficient,
606
Chengsong
parents: 605
diff changeset
   831
with the ability to process
636
0bcb4a7cb40c chap 1 all incorporated
Chengsong
parents: 635
diff changeset
   832
gigabits of strings input per second
606
Chengsong
parents: 605
diff changeset
   833
even with large counters \cite{Becchi08}.
635
Chengsong
parents: 634
diff changeset
   834
These practical solutions do not come with
Chengsong
parents: 634
diff changeset
   835
formal guarantees, and as pointed out by
Chengsong
parents: 634
diff changeset
   836
Kuklewicz \cite{KuklewiczHaskell}, can be error-prone.
Chengsong
parents: 634
diff changeset
   837
%But formal reasoning about these automata especially in Isabelle 
Chengsong
parents: 634
diff changeset
   838
%can be challenging
Chengsong
parents: 634
diff changeset
   839
%and un-intuitive. 
Chengsong
parents: 634
diff changeset
   840
%Therefore, we take correctness and runtime claims made about these solutions
Chengsong
parents: 634
diff changeset
   841
%with a grain of salt.
605
Chengsong
parents: 604
diff changeset
   842
628
7af4e2420a8c ready to submit~~
Chengsong
parents: 622
diff changeset
   843
In the work reported in \cite{FoSSaCS2023} and here, 
605
Chengsong
parents: 604
diff changeset
   844
we add better support using derivatives
635
Chengsong
parents: 634
diff changeset
   845
for bounded regular expression $r^{\{n\}}$.
Chengsong
parents: 634
diff changeset
   846
Our results
605
Chengsong
parents: 604
diff changeset
   847
extend straightforwardly to
635
Chengsong
parents: 634
diff changeset
   848
repetitions with intervals such as 
605
Chengsong
parents: 604
diff changeset
   849
$r^{\{n\ldots m\}}$.
608
37b6fd310a16 added related work chap
Chengsong
parents: 607
diff changeset
   850
The merit of Brzozowski derivatives (more on this later)
605
Chengsong
parents: 604
diff changeset
   851
on this problem is that
Chengsong
parents: 604
diff changeset
   852
it can be naturally extended to support bounded repetitions.
635
Chengsong
parents: 634
diff changeset
   853
Moreover these extensions are still made up of only small
605
Chengsong
parents: 604
diff changeset
   854
inductive datatypes and recursive functions,
633
f1edd5ee1a12 more, including link
Chengsong
parents: 631
diff changeset
   855
making it handy to deal with them in theorem provers.
605
Chengsong
parents: 604
diff changeset
   856
%The point here is that Brzozowski derivatives and the algorithms by Sulzmann and Lu can be
Chengsong
parents: 604
diff changeset
   857
%straightforwardly extended to deal with bounded regular expressions
Chengsong
parents: 604
diff changeset
   858
%and moreover the resulting code still consists of only simple
Chengsong
parents: 604
diff changeset
   859
%recursive functions and inductive datatypes.
Chengsong
parents: 604
diff changeset
   860
Finally, bounded regular expressions do not destroy our finite
Chengsong
parents: 604
diff changeset
   861
boundedness property, which we shall prove later on.
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   862
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   863
606
Chengsong
parents: 605
diff changeset
   864
Chengsong
parents: 605
diff changeset
   865
Chengsong
parents: 605
diff changeset
   866
605
Chengsong
parents: 604
diff changeset
   867
\subsection{Back-References}
606
Chengsong
parents: 605
diff changeset
   868
The other way to simulate an $\mathit{NFA}$ for matching is choosing  
Chengsong
parents: 605
diff changeset
   869
a single transition each time, keeping all the other options in 
Chengsong
parents: 605
diff changeset
   870
a queue or stack, and backtracking if that choice eventually 
636
0bcb4a7cb40c chap 1 all incorporated
Chengsong
parents: 635
diff changeset
   871
fails. 
0bcb4a7cb40c chap 1 all incorporated
Chengsong
parents: 635
diff changeset
   872
This method, often called a  "depth-first-search", 
0bcb4a7cb40c chap 1 all incorporated
Chengsong
parents: 635
diff changeset
   873
is efficient in many cases, but could end up
606
Chengsong
parents: 605
diff changeset
   874
with exponential run time.
Chengsong
parents: 605
diff changeset
   875
The backtracking method is employed in regex libraries
Chengsong
parents: 605
diff changeset
   876
that support \emph{back-references}, for example
Chengsong
parents: 605
diff changeset
   877
in Java and Python.
605
Chengsong
parents: 604
diff changeset
   878
%\section{Back-references and The Terminology Regex}
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   879
605
Chengsong
parents: 604
diff changeset
   880
%When one constructs an $\NFA$ out of a regular expression
Chengsong
parents: 604
diff changeset
   881
%there is often very little to be done in the first phase, one simply 
Chengsong
parents: 604
diff changeset
   882
%construct the $\NFA$ states based on the structure of the input regular expression.
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   883
605
Chengsong
parents: 604
diff changeset
   884
%In the lexing phase, one can simulate the $\mathit{NFA}$ running in two ways:
Chengsong
parents: 604
diff changeset
   885
%one by keeping track of all active states after consuming 
Chengsong
parents: 604
diff changeset
   886
%a character, and update that set of states iteratively.
Chengsong
parents: 604
diff changeset
   887
%This can be viewed as a breadth-first-search of the $\mathit{NFA}$
Chengsong
parents: 604
diff changeset
   888
%for a path terminating
Chengsong
parents: 604
diff changeset
   889
%at an accepting state.
606
Chengsong
parents: 605
diff changeset
   890
Chengsong
parents: 605
diff changeset
   891
Chengsong
parents: 605
diff changeset
   892
636
0bcb4a7cb40c chap 1 all incorporated
Chengsong
parents: 635
diff changeset
   893
Consider the following regular expression where the sequence
0bcb4a7cb40c chap 1 all incorporated
Chengsong
parents: 635
diff changeset
   894
operator is omitted for brevity:
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   895
\begin{center}
606
Chengsong
parents: 605
diff changeset
   896
	$r_1r_2r_3r_4$
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   897
\end{center}
636
0bcb4a7cb40c chap 1 all incorporated
Chengsong
parents: 635
diff changeset
   898
In this example,
606
Chengsong
parents: 605
diff changeset
   899
one could label sub-expressions of interest 
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   900
by parenthesizing them and giving 
636
0bcb4a7cb40c chap 1 all incorporated
Chengsong
parents: 635
diff changeset
   901
them a number in the order in which their opening parentheses appear.
0bcb4a7cb40c chap 1 all incorporated
Chengsong
parents: 635
diff changeset
   902
One possible way of parenthesizing and labelling is: 
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   903
\begin{center}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   904
	$\underset{1}{(}r_1\underset{2}{(}r_2\underset{3}{(}r_3)\underset{4}{(}r_4)))$
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   905
\end{center}
606
Chengsong
parents: 605
diff changeset
   906
The sub-expressions
Chengsong
parents: 605
diff changeset
   907
$r_1r_2r_3r_4$, $r_1r_2r_3$, $r_3$ and $r_4$ are labelled
Chengsong
parents: 605
diff changeset
   908
by 1 to 4, and can be ``referred back'' by their respective numbers. 
Chengsong
parents: 605
diff changeset
   909
%These sub-expressions are called "capturing groups".
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   910
To do so, one uses the syntax $\backslash i$ 
606
Chengsong
parents: 605
diff changeset
   911
to denote that we want the sub-string 
636
0bcb4a7cb40c chap 1 all incorporated
Chengsong
parents: 635
diff changeset
   912
of the input matched by the i-th
606
Chengsong
parents: 605
diff changeset
   913
sub-expression to appear again, 
Chengsong
parents: 605
diff changeset
   914
exactly the same as it first appeared: 
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   915
\begin{center}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   916
$\ldots\underset{\text{i-th lparen}}{(}{r_i})\ldots 
636
0bcb4a7cb40c chap 1 all incorporated
Chengsong
parents: 635
diff changeset
   917
\underset{s_i \text{ which just matched} \;r_i}{\backslash i} \ldots$
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   918
\end{center}
636
0bcb4a7cb40c chap 1 all incorporated
Chengsong
parents: 635
diff changeset
   919
Once the sub-string $s_i$ for the sub-expression $r_i$
0bcb4a7cb40c chap 1 all incorporated
Chengsong
parents: 635
diff changeset
   920
has been fixed, there is no variability on what the back-reference
0bcb4a7cb40c chap 1 all incorporated
Chengsong
parents: 635
diff changeset
   921
label $\backslash i$ can be---it is tied to $s_i$.
0bcb4a7cb40c chap 1 all incorporated
Chengsong
parents: 635
diff changeset
   922
The overall string will look like $\ldots s_i \ldots s_i \ldots $
606
Chengsong
parents: 605
diff changeset
   923
%The backslash and number $i$ are the
Chengsong
parents: 605
diff changeset
   924
%so-called "back-references".
Chengsong
parents: 605
diff changeset
   925
%Let $e$ be an expression made of regular expressions 
Chengsong
parents: 605
diff changeset
   926
%and back-references. $e$ contains the expression $e_i$
Chengsong
parents: 605
diff changeset
   927
%as its $i$-th capturing group.
Chengsong
parents: 605
diff changeset
   928
%The semantics of back-reference can be recursively
Chengsong
parents: 605
diff changeset
   929
%written as:
Chengsong
parents: 605
diff changeset
   930
%\begin{center}
Chengsong
parents: 605
diff changeset
   931
%	\begin{tabular}{c}
Chengsong
parents: 605
diff changeset
   932
%		$L ( e \cdot \backslash i) = \{s @ s_i \mid s \in L (e)\quad s_i \in L(r_i)$\\
Chengsong
parents: 605
diff changeset
   933
%		$s_i\; \text{match of ($e$, $s$)'s $i$-th capturing group string}\}$
Chengsong
parents: 605
diff changeset
   934
%	\end{tabular}
Chengsong
parents: 605
diff changeset
   935
%\end{center}
Chengsong
parents: 605
diff changeset
   936
A concrete example
618
233cf2b97d1a chapter 5 finished!!
Chengsong
parents: 612
diff changeset
   937
for back-references is
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   938
\begin{center}
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   939
$(.^*)\backslash 1$,
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   940
\end{center}
618
233cf2b97d1a chapter 5 finished!!
Chengsong
parents: 612
diff changeset
   941
which matches
606
Chengsong
parents: 605
diff changeset
   942
strings that can be split into two identical halves,
618
233cf2b97d1a chapter 5 finished!!
Chengsong
parents: 612
diff changeset
   943
for example $\mathit{foofoo}$, $\mathit{ww}$ and so on.
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   944
Note that this is different from 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   945
repeating the  sub-expression verbatim like
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   946
\begin{center}
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   947
	$(.^*)(.^*)$,
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   948
\end{center}
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   949
which does not impose any restrictions on what strings the second 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   950
sub-expression $.^*$
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   951
might match.
636
0bcb4a7cb40c chap 1 all incorporated
Chengsong
parents: 635
diff changeset
   952
Another example for back-references is
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   953
\begin{center}
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   954
$(.)(.)\backslash 2\backslash 1$
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   955
\end{center}
618
233cf2b97d1a chapter 5 finished!!
Chengsong
parents: 612
diff changeset
   956
which matches four-character palindromes
233cf2b97d1a chapter 5 finished!!
Chengsong
parents: 612
diff changeset
   957
like $abba$, $x??x$ and so on.
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   958
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   959
Back-references are a regex construct 
618
233cf2b97d1a chapter 5 finished!!
Chengsong
parents: 612
diff changeset
   960
that programmers find quite useful.
636
0bcb4a7cb40c chap 1 all incorporated
Chengsong
parents: 635
diff changeset
   961
According to Becchi and Crawley \cite{Becchi08},
618
233cf2b97d1a chapter 5 finished!!
Chengsong
parents: 612
diff changeset
   962
6\% of Snort rules (up until 2008) use them.
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   963
The most common use of back-references
618
233cf2b97d1a chapter 5 finished!!
Chengsong
parents: 612
diff changeset
   964
is to express well-formed html files,
233cf2b97d1a chapter 5 finished!!
Chengsong
parents: 612
diff changeset
   965
where back-references are convenient for matching
233cf2b97d1a chapter 5 finished!!
Chengsong
parents: 612
diff changeset
   966
opening and closing tags like 
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   967
\begin{center}
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   968
	$\langle html \rangle \ldots \langle / html \rangle$
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   969
\end{center}
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   970
A regex describing such a format
618
233cf2b97d1a chapter 5 finished!!
Chengsong
parents: 612
diff changeset
   971
is
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   972
\begin{center}
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   973
	$\langle (.^+) \rangle \ldots \langle / \backslash 1 \rangle$
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   974
\end{center}
618
233cf2b97d1a chapter 5 finished!!
Chengsong
parents: 612
diff changeset
   975
Despite being useful, the expressive power of regexes 
633
f1edd5ee1a12 more, including link
Chengsong
parents: 631
diff changeset
   976
go beyond regular languages 
618
233cf2b97d1a chapter 5 finished!!
Chengsong
parents: 612
diff changeset
   977
once back-references are included.
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   978
In fact, they allow the regex construct to express 
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   979
languages that cannot be contained in context-free
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   980
languages either.
608
37b6fd310a16 added related work chap
Chengsong
parents: 607
diff changeset
   981
For example, the back-reference $(a^*)b\backslash1 b \backslash 1$
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   982
expresses the language $\{a^n b a^n b a^n\mid n \in \mathbb{N}\}$,
636
0bcb4a7cb40c chap 1 all incorporated
Chengsong
parents: 635
diff changeset
   983
which cannot be expressed by 
0bcb4a7cb40c chap 1 all incorporated
Chengsong
parents: 635
diff changeset
   984
context-free grammars \cite{campeanu2003formal}.
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   985
Such a language is contained in the context-sensitive hierarchy
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   986
of formal languages. 
618
233cf2b97d1a chapter 5 finished!!
Chengsong
parents: 612
diff changeset
   987
Also solving the matching problem involving back-references
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   988
is known to be NP-complete \parencite{alfred2014algorithms}.
608
37b6fd310a16 added related work chap
Chengsong
parents: 607
diff changeset
   989
Regex libraries supporting back-references such as 
37b6fd310a16 added related work chap
Chengsong
parents: 607
diff changeset
   990
PCRE \cite{pcre} therefore have to
636
0bcb4a7cb40c chap 1 all incorporated
Chengsong
parents: 635
diff changeset
   991
revert to a depth-first search algorithm including backtracking.
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   992
What is unexpected is that even in the cases 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   993
not involving back-references, there is still
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   994
a (non-negligible) chance they might backtrack super-linearly,
636
0bcb4a7cb40c chap 1 all incorporated
Chengsong
parents: 635
diff changeset
   995
as shown in the graphs in figure \ref{fig:aStarStarb}.
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   996
636
0bcb4a7cb40c chap 1 all incorporated
Chengsong
parents: 635
diff changeset
   997
Summing up, we can categorise existing 
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
   998
practical regex libraries into two kinds:
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
   999
(i) The ones  with  linear
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1000
time guarantees like Go and Rust. The downside with them is that
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1001
they impose restrictions
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1002
on the regular expressions (not allowing back-references, 
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1003
bounded repetitions cannot exceed an ad hoc limit etc.).
633
f1edd5ee1a12 more, including link
Chengsong
parents: 631
diff changeset
  1004
And (ii) those 
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1005
that allow large bounded regular expressions and back-references
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1006
at the expense of using backtracking algorithms.
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1007
They can potentially ``grind to a halt''
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1008
on some very simple cases, resulting 
633
f1edd5ee1a12 more, including link
Chengsong
parents: 631
diff changeset
  1009
ReDoS attacks if exposed to the internet.
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1010
633
f1edd5ee1a12 more, including link
Chengsong
parents: 631
diff changeset
  1011
The problems with both approaches are the motivation for us 
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1012
to look again at the regular expression matching problem. 
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1013
Another motivation is that regular expression matching algorithms
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1014
that follow the POSIX standard often contain errors and bugs 
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1015
as we shall explain next.
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1016
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1017
%We would like to have regex engines that can 
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1018
%deal with the regular part (e.g.
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1019
%bounded repetitions) of regexes more
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1020
%efficiently.
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1021
%Also we want to make sure that they do it correctly.
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1022
%It turns out that such aim is not so easy to achieve.
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1023
 %TODO: give examples such as RE2 GOLANG 1000 restriction, rust no repetitions 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1024
% For example, the Rust regex engine claims to be linear, 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1025
% but does not support lookarounds and back-references.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1026
% The GoLang regex library does not support over 1000 repetitions.  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1027
% Java and Python both support back-references, but shows
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1028
%catastrophic backtracking behaviours on inputs without back-references(
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1029
%when the language is still regular).
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1030
 %TODO: test performance of Rust on (((((a*a*)b*)b){20})*)c  baabaabababaabaaaaaaaaababaaaababababaaaabaaabaaaaaabaabaabababaababaaaaaaaaababaaaababababaaaaaaaaaaaaac
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1031
 %TODO: verify the fact Rust does not allow 1000+ reps
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1032
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1033
605
Chengsong
parents: 604
diff changeset
  1034
Chengsong
parents: 604
diff changeset
  1035
Chengsong
parents: 604
diff changeset
  1036
%The time cost of regex matching algorithms in general
Chengsong
parents: 604
diff changeset
  1037
%involve two different phases, and different things can go differently wrong on 
Chengsong
parents: 604
diff changeset
  1038
%these phases.
Chengsong
parents: 604
diff changeset
  1039
%$\DFA$s usually have problems in the first (construction) phase
Chengsong
parents: 604
diff changeset
  1040
%, whereas $\NFA$s usually run into trouble
Chengsong
parents: 604
diff changeset
  1041
%on the second phase.
Chengsong
parents: 604
diff changeset
  1042
Chengsong
parents: 604
diff changeset
  1043
Chengsong
parents: 604
diff changeset
  1044
\section{Error-prone POSIX Implementations}
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1045
Very often there are multiple ways of matching a string
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1046
with a regular expression.
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1047
In such cases the regular expressions matcher needs to
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1048
disambiguate.
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1049
The more widely used strategy is called POSIX,
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1050
which roughly speaking always chooses the longest initial match.
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1051
The POSIX strategy is widely adopted in many regular expression matchers
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1052
because it is a natural strategy for lexers.
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1053
However, many implementations (including the C libraries
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1054
used by Linux and OS X distributions) contain bugs
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1055
or do not meet the specification they claim to adhere to.
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1056
Kuklewicz maintains a unit test repository which lists some
628
7af4e2420a8c ready to submit~~
Chengsong
parents: 622
diff changeset
  1057
problems with existing regular expression engines \cite{KuklewiczHaskell}.
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1058
In some cases, they either fail to generate a
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1059
result when there exists a match,
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1060
or give results that are inconsistent with the POSIX standard.
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1061
A concrete example is the regex:
605
Chengsong
parents: 604
diff changeset
  1062
\begin{center}
636
0bcb4a7cb40c chap 1 all incorporated
Chengsong
parents: 635
diff changeset
  1063
	$(aba + ab + a)^* \text{and the string} \; ababa$
605
Chengsong
parents: 604
diff changeset
  1064
\end{center}
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1065
The correct POSIX match for the above
636
0bcb4a7cb40c chap 1 all incorporated
Chengsong
parents: 635
diff changeset
  1066
involves the entire string $ababa$, 
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1067
split into two Kleene star iterations, namely $[ab], [aba]$ at positions
605
Chengsong
parents: 604
diff changeset
  1068
$[0, 2), [2, 5)$
Chengsong
parents: 604
diff changeset
  1069
respectively.
636
0bcb4a7cb40c chap 1 all incorporated
Chengsong
parents: 635
diff changeset
  1070
But feeding this example to the different engines
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1071
listed at regex101 \footnote{
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1072
	regex101 is an online regular expression matcher which
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1073
	provides API for trying out regular expression
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1074
	engines of multiple popular programming languages like
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1075
Java, Python, Go, etc.} \parencite{regex101}. 
636
0bcb4a7cb40c chap 1 all incorporated
Chengsong
parents: 635
diff changeset
  1076
yields
0bcb4a7cb40c chap 1 all incorporated
Chengsong
parents: 635
diff changeset
  1077
only two incomplete matches: $[aba]$ at $[0, 3)$
605
Chengsong
parents: 604
diff changeset
  1078
and $a$ at $[4, 5)$.
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1079
Fowler \cite{fowler2003} and Kuklewicz \cite{KuklewiczHaskell} 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1080
commented that most regex libraries are not
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1081
correctly implementing the central POSIX
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1082
rule, called the maximum munch rule.
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1083
Grathwohl \parencite{grathwohl2014crash} wrote:
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1084
\begin{quote}\it
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1085
	``The POSIX strategy is more complicated than the 
605
Chengsong
parents: 604
diff changeset
  1086
	greedy because of the dependence on information about 
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1087
	the length of matched strings in the various subexpressions.''
605
Chengsong
parents: 604
diff changeset
  1088
\end{quote}
Chengsong
parents: 604
diff changeset
  1089
%\noindent
646
Chengsong
parents: 638
diff changeset
  1090
People have recognised that the implementation complexity of POSIX rules also come from
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1091
the specification being not very precise.
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1092
The developers of the regexp package of Go 
636
0bcb4a7cb40c chap 1 all incorporated
Chengsong
parents: 635
diff changeset
  1093
\footnote{\url{https://pkg.go.dev/regexp\#pkg-overview}}
0bcb4a7cb40c chap 1 all incorporated
Chengsong
parents: 635
diff changeset
  1094
commented that
0bcb4a7cb40c chap 1 all incorporated
Chengsong
parents: 635
diff changeset
  1095
\begin{quote}\it
0bcb4a7cb40c chap 1 all incorporated
Chengsong
parents: 635
diff changeset
  1096
``
0bcb4a7cb40c chap 1 all incorporated
Chengsong
parents: 635
diff changeset
  1097
The POSIX rule is computationally prohibitive
0bcb4a7cb40c chap 1 all incorporated
Chengsong
parents: 635
diff changeset
  1098
and not even well-defined.
0bcb4a7cb40c chap 1 all incorporated
Chengsong
parents: 635
diff changeset
  1099
``
0bcb4a7cb40c chap 1 all incorporated
Chengsong
parents: 635
diff changeset
  1100
\end{quote}
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1101
There are many informal summaries of this disambiguation
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1102
strategy, which are often quite long and delicate.
608
37b6fd310a16 added related work chap
Chengsong
parents: 607
diff changeset
  1103
For example Kuklewicz \cite{KuklewiczHaskell} 
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1104
described the POSIX rule as (section 1, last paragraph):
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1105
\begin{quote}
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1106
	\begin{itemize}
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1107
		\item
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1108
regular expressions (REs) take the leftmost starting match, and the longest match starting there
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1109
earlier subpatterns have leftmost-longest priority over later subpatterns\\
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1110
\item
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1111
higher-level subpatterns have leftmost-longest priority over their component subpatterns\\
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1112
\item
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1113
REs have right associative concatenation which can be changed with parenthesis\\
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1114
\item
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1115
parenthesized subexpressions return the match from their last usage\\
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1116
\item
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1117
text of component subexpressions must be contained in the text of the 
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1118
higher-level subexpressions\\
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1119
\item
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1120
if "p" and "q" can never match the same text then "p|q" and "q|p" are equivalent, up to trivial renumbering of captured subexpressions\\
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1121
\item
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1122
if "p" in "p*" is used to capture non-empty text then additional repetitions of "p" will not capture an empty string\\
607
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1123
\end{itemize}
e6fc9b72c0e3 chap1 almost done
Chengsong
parents: 606
diff changeset
  1124
\end{quote}
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1125
%The text above 
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1126
%is trying to capture something very precise,
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1127
%and is crying out for formalising.
646
Chengsong
parents: 638
diff changeset
  1128
Ribeiro and Du Bois \cite{RibeiroAgda2017} have 
Chengsong
parents: 638
diff changeset
  1129
formalised the notion of bit-coded regular expressions
Chengsong
parents: 638
diff changeset
  1130
and proved their relations with simple regular expressions in
Chengsong
parents: 638
diff changeset
  1131
the dependently-typed proof assistant Agda.
Chengsong
parents: 638
diff changeset
  1132
They also proved the soundness and completeness of a matching algorithm
Chengsong
parents: 638
diff changeset
  1133
based on the bit-coded regular expressions.
608
37b6fd310a16 added related work chap
Chengsong
parents: 607
diff changeset
  1134
Ausaf et al. \cite{AusafDyckhoffUrban2016}
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1135
are the first to
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1136
give a quite simple formalised POSIX
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1137
specification in Isabelle/HOL, and also prove 
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1138
that their specification coincides with an earlier (unformalised) 
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1139
POSIX specification given by Okui and Suzuki \cite{Okui10}.
608
37b6fd310a16 added related work chap
Chengsong
parents: 607
diff changeset
  1140
They then formally proved the correctness of
37b6fd310a16 added related work chap
Chengsong
parents: 607
diff changeset
  1141
a lexing algorithm by Sulzmann and Lu \cite{Sulzmann2014}
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1142
with regards to that specification.
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1143
They also found that the informal POSIX
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1144
specification by Sulzmann and Lu needs to be substantially 
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1145
revised in order for the correctness proof to go through.
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1146
Their original specification and proof were unfixable
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1147
according to Ausaf et al.
605
Chengsong
parents: 604
diff changeset
  1148
646
Chengsong
parents: 638
diff changeset
  1149
636
0bcb4a7cb40c chap 1 all incorporated
Chengsong
parents: 635
diff changeset
  1150
In the next section, we will briefly
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1151
introduce Brzozowski derivatives and Sulzmann
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1152
and Lu's algorithm, which the main point of this thesis builds on.
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1153
%We give a taste of what they 
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1154
%are like and why they are suitable for regular expression
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1155
%matching and lexing.
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1156
\section{Formal Specification of POSIX Matching 
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1157
and Brzozowski Derivatives}
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1158
%Now we start with the central topic of the thesis: Brzozowski derivatives.
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1159
Brzozowski \cite{Brzozowski1964} first introduced the 
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1160
concept of a \emph{derivative} of regular expression in 1964.
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1161
The derivative of a regular expression $r$
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1162
with respect to a character $c$, is written as $r \backslash c$.
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1163
This operation tells us what $r$ transforms into
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1164
if we ``chop'' off a particular character $c$ 
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1165
from all strings in the language of $r$ (defined
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1166
later as $L \; r$).
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1167
%To give a flavour of Brzozowski derivatives, we present
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1168
%two straightforward clauses from it:
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1169
%\begin{center}
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1170
%	\begin{tabular}{lcl}
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1171
%		$d \backslash c$     & $\dn$ & 
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1172
%		$\mathit{if} \;c = d\;\mathit{then}\;\ONE\;\mathit{else}\;\ZERO$\\
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1173
%$(r_1 + r_2)\backslash c$     & $\dn$ & $r_1 \backslash c \,+\, r_2 \backslash c$\\
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1174
%	\end{tabular}
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1175
%\end{center}
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1176
%\noindent
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1177
%The first clause says that for the regular expression
636
0bcb4a7cb40c chap 1 all incorporated
Chengsong
parents: 635
diff changeset
  1178
%denoting a singleton set consisting of a single-character string $\{ d \}$,
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1179
%we check the derivative character $c$ against $d$,
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1180
%returning a set containing only the empty string $\{ [] \}$
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1181
%if $c$ and $d$ are equal, and the empty set $\varnothing$ otherwise.
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1182
%The second clause states that to obtain the regular expression
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1183
%representing all strings' head character $c$ being chopped off
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1184
%from $r_1 + r_2$, one simply needs to recursively take derivative
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1185
%of $r_1$ and $r_2$ and then put them together.
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1186
Derivatives have the property
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1187
that $s \in L \; (r\backslash c)$ if and only if 
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1188
$c::s \in L \; r$ where $::$ stands for list prepending.
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1189
%This property can be used on regular expressions
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1190
%matching and lexing--to test whether a string $s$ is in $L \; r$,
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1191
%one simply takes derivatives of $r$ successively with
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1192
%respect to the characters (in the correct order) in $s$,
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1193
%and then test whether the empty string is in the last regular expression.
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1194
With this property, derivatives can give a simple solution
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1195
to the problem of matching a string $s$ with a regular
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1196
expression $r$: if the derivative of $r$ w.r.t.\ (in
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1197
succession) all the characters of the string matches the empty string,
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1198
then $r$ matches $s$ (and {\em vice versa}).  
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1199
%This makes formally reasoning about these properties such
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1200
%as correctness and complexity smooth and intuitive.
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1201
There are several mechanised proofs of this property in various theorem
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1202
provers,
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1203
for example one by Owens and Slind \cite{Owens2008} in HOL4,
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1204
another one by Krauss and Nipkow \cite{Nipkow98} in Isabelle/HOL, and
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1205
yet another in Coq by Coquand and Siles \cite{Coquand2012}.
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1206
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1207
In addition, one can extend derivatives to bounded repetitions
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1208
relatively straightforwardly. For example, the derivative for 
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1209
this can be defined as:
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1210
\begin{center}
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1211
	\begin{tabular}{lcl}
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1212
		$r^{\{n\}} \backslash c$     & $\dn$ & $r \backslash c \cdot
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1213
		r^{\{n-1\}} (\text{when} n > 0)$\\
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1214
	\end{tabular}
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1215
\end{center}
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1216
\noindent
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1217
Experimental results suggest that  unlike DFA-based solutions
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1218
for bounded regular expressions,
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1219
derivatives can cope
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1220
large counters
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1221
quite well.
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1222
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1223
There have also been 
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1224
extensions of derivatives to other regex constructs.
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1225
For example, Owens et al include the derivatives
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1226
for the \emph{NOT} regular expression, which is
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1227
able to concisely express C-style comments of the form
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1228
$/* \ldots */$ (see \cite{Owens2008}).
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1229
Another extension for derivatives is
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1230
regular expressions with look-aheads, done
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1231
by Miyazaki and Minamide
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1232
\cite{Takayuki2019}.
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1233
%We therefore use Brzozowski derivatives on regular expressions 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1234
%lexing 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1235
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1236
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1237
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1238
Given the above definitions and properties of
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1239
Brzozowski derivatives, one quickly realises their potential
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1240
in generating a formally verified algorithm for lexing: the clauses and property
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1241
can be easily expressed in a functional programming language 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1242
or converted to theorem prover
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1243
code, with great ease.
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1244
Perhaps this is the reason why derivatives have sparked quite a bit of interest
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1245
in the functional programming and theorem prover communities in the last
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1246
fifteen or so years (
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1247
\cite{Almeidaetal10}, \cite{Berglund14}, \cite{Berglund18},
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1248
\cite{Chen12} and \cite{Coquand2012}
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1249
to name a few), despite being buried in the ``sands of time'' \cite{Owens2008}
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1250
after they were first published by Brzozowski.
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1251
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1252
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1253
However, there are two difficulties with derivative-based matchers:
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1254
First, Brzozowski's original matcher only generates a yes/no answer
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1255
for whether a regular expression matches a string or not.  This is too
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1256
little information in the context of lexing where separate tokens must
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1257
be identified and also classified (for example as keywords
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1258
or identifiers). 
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1259
Second, derivative-based matchers need to be more efficient in terms
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1260
of the sizes of derivatives.
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1261
Elegant and beautiful
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1262
as many implementations are,
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1263
they can be excruciatingly slow. 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1264
For example, Sulzmann and Lu
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1265
claim a linear running time of their proposed algorithm,
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1266
but that was falsified by our experiments. The running time 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1267
is actually $\Omega(2^n)$ in the worst case.
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1268
A similar claim about a theoretical runtime of $O(n^2)$ 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1269
is made for the Verbatim \cite{Verbatim}
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1270
%TODO: give references
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1271
lexer, which calculates POSIX matches and is based on derivatives.
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1272
They formalized the correctness of the lexer, but not their complexity result.
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1273
In the performance evaluation section, they analyzed the run time
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1274
of matching $a$ with the string 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1275
\begin{center}
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1276
	$\underbrace{a \ldots a}_{\text{n a's}}$.
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1277
\end{center}
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1278
\noindent
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1279
They concluded that the algorithm is quadratic in terms of 
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1280
the length of the input string.
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1281
When we tried out their extracted OCaml code with the example $(a+aa)^*$,
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1282
the time it took to match a string of 40 $a$'s was approximately 5 minutes.
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1283
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1284
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1285
\subsection{Sulzmann and Lu's Algorithm}
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1286
Sulzmann and Lu~\cite{Sulzmann2014} overcame the first 
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1287
problem with the yes/no answer 
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1288
by cleverly extending Brzozowski's matching
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1289
algorithm. Their extended version generates additional information on
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1290
\emph{how} a regular expression matches a string following the POSIX
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1291
rules for regular expression matching. They achieve this by adding a
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1292
second ``phase'' to Brzozowski's algorithm involving an injection
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1293
function. This injection function in a sense undoes the ``damage''
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1294
of the derivatives chopping off characters.
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1295
In earlier work, Ausaf et al provided the formal
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1296
specification of what POSIX matching means and proved in Isabelle/HOL
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1297
the correctness
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1298
of this extended algorithm accordingly
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1299
\cite{AusafDyckhoffUrban2016}.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1300
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1301
The version of the algorithm proven correct
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1302
suffers however heavily from a 
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1303
second difficulty, where derivatives can
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1304
grow to arbitrarily big sizes. 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1305
For example if we start with the
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1306
regular expression $(a+aa)^*$ and take
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1307
successive derivatives according to the character $a$, we end up with
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1308
a sequence of ever-growing derivatives like 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1309
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1310
\def\ll{\stackrel{\_\backslash{} a}{\longrightarrow}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1311
\begin{center}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1312
\begin{tabular}{rll}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1313
$(a + aa)^*$ & $\ll$ & $(\ONE + \ONE{}a) \cdot (a + aa)^*$\\
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1314
& $\ll$ & $(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* \;+\; (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1315
& $\ll$ & $(\ZERO + \ZERO{}a + \ZERO) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^* \;+\; $\\
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1316
& & $\qquad(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1317
& $\ll$ & \ldots \hspace{15mm}(regular expressions of sizes 98, 169, 283, 468, 767, \ldots)
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1318
\end{tabular}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1319
\end{center}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1320
 
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1321
\noindent where after around 35 steps we usually run out of memory on a
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1322
typical computer.  Clearly, the
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1323
notation involving $\ZERO$s and $\ONE$s already suggests
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1324
simplification rules that can be applied to regular regular
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1325
expressions, for example $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1326
\Rightarrow r$, $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1327
r$. While such simple-minded simplifications have been proved in 
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1328
the work by Ausaf et al. to preserve the correctness of Sulzmann and Lu's
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1329
algorithm \cite{AusafDyckhoffUrban2016}, they unfortunately do
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1330
\emph{not} help with limiting the growth of the derivatives shown
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1331
above: the growth is slowed, but the derivatives can still grow rather
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1332
quickly beyond any finite bound.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1333
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1334
Therefore we want to look in this thesis at a second
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1335
algorithm by Sulzmann and Lu where they
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1336
overcame this ``growth problem'' \cite{Sulzmann2014}.
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1337
In this version, POSIX values are 
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1338
represented as bit sequences and such sequences are incrementally generated
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1339
when derivatives are calculated. The compact representation
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1340
of bit sequences and regular expressions allows them to define a more
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1341
``aggressive'' simplification method that keeps the size of the
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1342
derivatives finite no matter what the length of the string is.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1343
They make some informal claims about the correctness and linear behaviour
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1344
of this version, but do not provide any supporting proof arguments, not
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1345
even ``pencil-and-paper'' arguments. They write about their bit-coded
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1346
\emph{incremental parsing method} (that is the algorithm to be formalised
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1347
in this dissertation)
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1348
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1349
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1350
  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1351
  \begin{quote}\it
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1352
  ``Correctness Claim: We further claim that the incremental parsing
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1353
  method [..] in combination with the simplification steps [..]
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1354
  yields POSIX parse trees. We have tested this claim
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1355
  extensively [..] but yet
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1356
  have to work out all proof details.'' \cite[Page 14]{Sulzmann2014}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1357
\end{quote}  
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1358
Ausaf and Urban made some initial progress towards the 
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1359
full correctness proof but still had to leave out the optimisation
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1360
Sulzmann and Lu proposed.
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1361
Ausaf  wrote \cite{Ausaf},
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1362
  \begin{quote}\it
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1363
``The next step would be to implement a more aggressive simplification procedure on annotated regular expressions and then prove the corresponding algorithm generates the same values as blexer. Alas due to time constraints we are unable to do so here.''
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1364
\end{quote}  
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1365
This thesis implements the aggressive simplifications envisioned
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1366
by Ausaf and Urban,
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1367
together with a formal proof of the correctness of those simplifications.
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1368
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1369
612
Chengsong
parents: 609
diff changeset
  1370
One of the most recent work in the context of lexing
Chengsong
parents: 609
diff changeset
  1371
%with this issue
630
d50a309a0645 with Christian
Chengsong
parents: 628
diff changeset
  1372
is the Verbatim lexer by Egolf, Lasser and Fisher~\cite{Verbatim}.
612
Chengsong
parents: 609
diff changeset
  1373
This is relevant work for us and we will compare it later with 
Chengsong
parents: 609
diff changeset
  1374
our derivative-based matcher we are going to present.
Chengsong
parents: 609
diff changeset
  1375
There is also some newer work called
630
d50a309a0645 with Christian
Chengsong
parents: 628
diff changeset
  1376
Verbatim++~\cite{Verbatimpp}, which does not use derivatives, 
612
Chengsong
parents: 609
diff changeset
  1377
but deterministic finite automaton instead.
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1378
We will also study this work in a later section.
612
Chengsong
parents: 609
diff changeset
  1379
%An example that gives problem to automaton approaches would be
Chengsong
parents: 609
diff changeset
  1380
%the regular expression $(a|b)^*a(a|b)^{\{n\}}$.
Chengsong
parents: 609
diff changeset
  1381
%It requires at least $2^{n+1}$ states to represent
Chengsong
parents: 609
diff changeset
  1382
%as a DFA.
Chengsong
parents: 609
diff changeset
  1383
Chengsong
parents: 609
diff changeset
  1384
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1385
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1386
\section{Contribution}
646
Chengsong
parents: 638
diff changeset
  1387
{\color{red} \rule{\linewidth}{0.5mm}}
Chengsong
parents: 638
diff changeset
  1388
\textbf{issue with this part: way too short, not enough details of what I have done.}
Chengsong
parents: 638
diff changeset
  1389
{\color{red} \rule{\linewidth}{0.5mm}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1390
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1391
In this thesis,
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1392
we propose a solution to catastrophic
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1393
backtracking and error-prone matchers: a formally verified
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1394
regular expression lexing algorithm
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1395
that is both fast
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1396
and correct. 
646
Chengsong
parents: 638
diff changeset
  1397
{\color{red} \rule{\linewidth}{0.5mm}}
Chengsong
parents: 638
diff changeset
  1398
\HandRight Added content:
Chengsong
parents: 638
diff changeset
  1399
%Package \verb`pifont`: \ding{43}
Chengsong
parents: 638
diff changeset
  1400
%Package \verb`utfsym`: \usym{261E} 
Chengsong
parents: 638
diff changeset
  1401
%Package \verb`dingbat`: \leftpointright 
Chengsong
parents: 638
diff changeset
  1402
%Package \verb`dingbat`: \rightpointright 
Chengsong
parents: 638
diff changeset
  1403
Chengsong
parents: 638
diff changeset
  1404
In particular, the main problem we solved on top of previous work was
Chengsong
parents: 638
diff changeset
  1405
coming up with a formally verified algorithm called $\blexersimp$ based
Chengsong
parents: 638
diff changeset
  1406
on Brzozowski derivatives. It calculates a POSIX
Chengsong
parents: 638
diff changeset
  1407
lexical value from a string and a regular expression. This algorithm was originally
Chengsong
parents: 638
diff changeset
  1408
by Sulzmann and Lu \cite{Sulzmann2014}, but we made the key observation that its $\textit{nub}$
Chengsong
parents: 638
diff changeset
  1409
function does not really simplify intermediate results where it needs to and improved the
Chengsong
parents: 638
diff changeset
  1410
algorithm accordingly.
Chengsong
parents: 638
diff changeset
  1411
We have proven our $\blexersimp$'s internal data structure does not grow beyond a constant $N_r$
Chengsong
parents: 638
diff changeset
  1412
depending on the input regular expression $r$, thanks to the aggressive simplifications of $\blexersimp$:
Chengsong
parents: 638
diff changeset
  1413
\begin{theorem}
Chengsong
parents: 638
diff changeset
  1414
$|\blexersimp \; r \; s | \leq N_r$
Chengsong
parents: 638
diff changeset
  1415
\end{theorem}
Chengsong
parents: 638
diff changeset
  1416
The simplifications applied in each step of $\blexersimp$ 
Chengsong
parents: 638
diff changeset
  1417
Chengsong
parents: 638
diff changeset
  1418
\begin{center}
Chengsong
parents: 638
diff changeset
  1419
	$\blexersimp
Chengsong
parents: 638
diff changeset
  1420
	$
Chengsong
parents: 638
diff changeset
  1421
\end{center}
Chengsong
parents: 638
diff changeset
  1422
keeps the derivatives small, but presents a 
Chengsong
parents: 638
diff changeset
  1423
challenge
Chengsong
parents: 638
diff changeset
  1424
Chengsong
parents: 638
diff changeset
  1425
Chengsong
parents: 638
diff changeset
  1426
establishing a correctness theorem of the below form:
Chengsong
parents: 638
diff changeset
  1427
%TODO: change this to "theorem to prove"
Chengsong
parents: 638
diff changeset
  1428
\begin{theorem}
Chengsong
parents: 638
diff changeset
  1429
	If the POSIX lexical value of matching regular expression $r$ with string $s$ is $v$, 
Chengsong
parents: 638
diff changeset
  1430
	then $\blexersimp\; r \; s = \Some \; v$. Otherwise 
Chengsong
parents: 638
diff changeset
  1431
	$\blexersimp \; r \; s = \None$.
Chengsong
parents: 638
diff changeset
  1432
\end{theorem}
Chengsong
parents: 638
diff changeset
  1433
Chengsong
parents: 638
diff changeset
  1434
Chengsong
parents: 638
diff changeset
  1435
Chengsong
parents: 638
diff changeset
  1436
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1437
The result is %a regular expression lexing algorithm that comes with 
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1438
\begin{itemize}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1439
\item
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1440
an improved version of  Sulzmann and Lu's bit-coded algorithm using 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1441
derivatives with simplifications, 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1442
accompanied by
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1443
a proven correctness theorem according to POSIX specification 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1444
given by Ausaf et al. \cite{AusafDyckhoffUrban2016}, 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1445
\item 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1446
a complexity-related property for that algorithm saying that the 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1447
internal data structure will
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1448
remain below a finite bound,
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
  1449
\item
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1450
and an extension to
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1451
the bounded repetition constructs with the correctness and finiteness property
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1452
maintained.
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1453
\end{itemize}
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1454
\noindent
646
Chengsong
parents: 638
diff changeset
  1455
{\color{red} \rule{\linewidth}{0.5mm}}
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1456
With a formal finiteness bound in place,
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1457
we can greatly reduce the attack surface of servers in terms of ReDoS attacks.
631
Chengsong
parents: 630
diff changeset
  1458
The Isabelle/HOL code for our formalisation can be 
Chengsong
parents: 630
diff changeset
  1459
found at
Chengsong
parents: 630
diff changeset
  1460
\begin{center}
Chengsong
parents: 630
diff changeset
  1461
\url{https://github.com/hellotommmy/posix}
Chengsong
parents: 630
diff changeset
  1462
\end{center}
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
  1463
Further improvements to the algorithm with an even stronger version of 
631
Chengsong
parents: 630
diff changeset
  1464
simplification can be made. We conjecture that the resulting size of derivatives
Chengsong
parents: 630
diff changeset
  1465
can be bounded by a cubic bound w.r.t. the size of the regular expression.
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1466
We will give relevant code in Scala, 
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1467
but do not give a formal proof for that in Isabelle/HOL.
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1468
This is still future work.
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1469
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1470
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1471
\section{Structure of the thesis}
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1472
In chapter \ref{Inj} we will introduce the concepts
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1473
and notations we 
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1474
use for describing regular expressions and derivatives,
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1475
and the first version of Sulzmann and Lu's lexing algorithm without bitcodes (including 
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1476
its correctness proof).
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1477
We will give their second lexing algorithm with bitcodes in \ref{Bitcoded1}
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1478
together with the correctness proof by Ausaf and Urban.
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1479
Then we illustrate in chapter \ref{Bitcoded2}
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1480
how Sulzmann and Lu's
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1481
simplifications fail to simplify correctly. We therefore introduce our own version of the
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1482
algorithm with correct simplifications and 
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1483
their correctness proof.  
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1484
In chapter \ref{Finite} we give the second guarantee
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1485
of our bitcoded algorithm, that is a finite bound on the size of any 
631
Chengsong
parents: 630
diff changeset
  1486
regular expression's derivatives. 
Chengsong
parents: 630
diff changeset
  1487
We also show how one can extend the
Chengsong
parents: 630
diff changeset
  1488
algorithm to include bounded repetitions. 
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
  1489
In chapter \ref{Cubic} we discuss stronger simplification rules which 
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1490
improve the finite bound to a cubic bound. %and the NOT regular expression.
637
Chengsong
parents: 636
diff changeset
  1491
Chapter \ref{RelatedWork} introduces relevant work for this thesis.
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
  1492
Chapter \ref{Future} concludes and mentions avenues of future research.
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1493
 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1494
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1495
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1496
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1497
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1498
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1499
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1500
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1501
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1502
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1503
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1504
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1505
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1506
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1507