ChengsongTanPhdThesis/Chapters/Introduction.tex
author Chengsong
Sun, 18 Jun 2023 17:54:52 +0100
changeset 649 ef2b8abcbc55
parent 648 d15a0b7d6d90
child 652 a4d692a9a289
permissions -rwxr-xr-x
more
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     1
% Chapter 1
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     2
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     3
\chapter{Introduction} % Main chapter title
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     4
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     5
\label{Introduction} % For referencing the chapter elsewhere, use \ref{Chapter1} 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     6
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     7
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     8
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     9
% Define some commands to keep the formatting separated from the content 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    10
\newcommand{\keyword}[1]{\textbf{#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    11
\newcommand{\tabhead}[1]{\textbf{#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    12
\newcommand{\code}[1]{\texttt{#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    13
\newcommand{\file}[1]{\texttt{\bfseries#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    14
\newcommand{\option}[1]{\texttt{\itshape#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    15
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    16
%boxes
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    17
\newcommand*{\mybox}[1]{\framebox{\strut #1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    18
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    19
%\newcommand{\sflataux}[1]{\textit{sflat}\_\textit{aux} \, #1}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    20
\newcommand\sflat[1]{\llparenthesis #1 \rrparenthesis }
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    21
\newcommand{\ASEQ}[3]{\textit{ASEQ}_{#1} \, #2 \, #3}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
    22
\newcommand{\bderssimp}[2]{#1 \backslash_{bsimps} #2}
596
b306628a0eab more chap 56
Chengsong
parents: 594
diff changeset
    23
\newcommand{\rderssimp}[2]{#1 \backslash_{rsimps} #2}
564
Chengsong
parents: 558
diff changeset
    24
\def\derssimp{\textit{ders}\_\textit{simp}}
557
812e5d112f49 more changes
Chengsong
parents: 556
diff changeset
    25
\def\rders{\textit{rders}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    26
\newcommand{\bders}[2]{#1 \backslash #2}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    27
\newcommand{\bsimp}[1]{\textit{bsimp}(#1)}
591
b2d0de6aee18 more polishing integrated comments chap2
Chengsong
parents: 590
diff changeset
    28
\def\bsimps{\textit{bsimp}}
554
Chengsong
parents: 543
diff changeset
    29
\newcommand{\rsimp}[1]{\textit{rsimp}\; #1}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    30
\newcommand{\sflataux}[1]{\llparenthesis #1 \rrparenthesis'}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    31
\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    32
\newcommand{\denote}{\stackrel{\mbox{\scriptsize denote}}{=}}%
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    33
\newcommand{\ZERO}{\mbox{\bf 0}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    34
\newcommand{\ONE}{\mbox{\bf 1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    35
\newcommand{\AALTS}[2]{\oplus {\scriptstyle #1}\, #2}
555
Chengsong
parents: 554
diff changeset
    36
\newcommand{\rdistinct}[2]{\textit{rdistinct} \;\; #1 \;\; #2}
594
Chengsong
parents: 591
diff changeset
    37
\def\rdistincts{\textit{rdistinct}}
556
Chengsong
parents: 555
diff changeset
    38
\def\rDistinct{\textit{rdistinct}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    39
\newcommand\hflat[1]{\llparenthesis  #1 \rrparenthesis_*}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    40
\newcommand\hflataux[1]{\llparenthesis #1 \rrparenthesis_*'}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    41
\newcommand\createdByStar[1]{\textit{createdByStar}(#1)}
620
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
    42
\def\cbn{\textit{createdByNtimes}}
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
    43
\def\hpa{\textit{highestPowerAux}}
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
    44
\def\hpower{\textit{highestPower}}
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
    45
\def\ntset{\textit{ntset}}
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
    46
\def\optermsimp{\textit{optermsimp}}
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
    47
\def\optermOsimp{\textit{optermOsimp}}
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
    48
\def\optermosimp{\textit{optermosimp}}
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
    49
\def\opterm{\textit{opterm}}
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
    50
\def\nString{\textit{nonemptyString}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    51
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    52
\newcommand\myequiv{\mathrel{\stackrel{\makebox[0pt]{\mbox{\normalfont\tiny equiv}}}{=}}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    53
600
fd068f39ac23 chap4 comments done
Chengsong
parents: 596
diff changeset
    54
\def\SEQ{\textit{SEQ}}
fd068f39ac23 chap4 comments done
Chengsong
parents: 596
diff changeset
    55
\def\SEQs{\textit{SEQs}}
564
Chengsong
parents: 558
diff changeset
    56
\def\case{\textit{case}}
554
Chengsong
parents: 543
diff changeset
    57
\def\sequal{\stackrel{\mbox{\scriptsize rsimp}}{=}}
Chengsong
parents: 543
diff changeset
    58
\def\rsimpalts{\textit{rsimp}_{ALTS}}
Chengsong
parents: 543
diff changeset
    59
\def\good{\textit{good}}
Chengsong
parents: 543
diff changeset
    60
\def\btrue{\textit{true}}
Chengsong
parents: 543
diff changeset
    61
\def\bfalse{\textit{false}}
542
a7344c9afbaf chapter3 finished
Chengsong
parents: 538
diff changeset
    62
\def\bnullable{\textit{bnullable}}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
    63
\def\bnullables{\textit{bnullables}}
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
    64
\def\Some{\textit{Some}}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
    65
\def\None{\textit{None}}
537
Chengsong
parents: 532
diff changeset
    66
\def\code{\textit{code}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    67
\def\decode{\textit{decode}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    68
\def\internalise{\textit{internalise}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    69
\def\lexer{\mathit{lexer}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    70
\def\mkeps{\textit{mkeps}}
557
812e5d112f49 more changes
Chengsong
parents: 556
diff changeset
    71
\newcommand{\rder}[2]{#2 \backslash_r #1}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    72
585
4969ef817d92 chap4 more
Chengsong
parents: 579
diff changeset
    73
\def\rerases{\textit{rerase}}
4969ef817d92 chap4 more
Chengsong
parents: 579
diff changeset
    74
554
Chengsong
parents: 543
diff changeset
    75
\def\nonnested{\textit{nonnested}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    76
\def\AZERO{\textit{AZERO}}
558
Chengsong
parents: 557
diff changeset
    77
\def\sizeNregex{\textit{sizeNregex}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    78
\def\AONE{\textit{AONE}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    79
\def\ACHAR{\textit{ACHAR}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    80
585
4969ef817d92 chap4 more
Chengsong
parents: 579
diff changeset
    81
\def\simpsulz{\textit{simp}_{Sulz}}
4969ef817d92 chap4 more
Chengsong
parents: 579
diff changeset
    82
557
812e5d112f49 more changes
Chengsong
parents: 556
diff changeset
    83
\def\scfrewrites{\stackrel{*}{\rightsquigarrow_{scf}}}
555
Chengsong
parents: 554
diff changeset
    84
\def\frewrite{\rightsquigarrow_f}
Chengsong
parents: 554
diff changeset
    85
\def\hrewrite{\rightsquigarrow_h}
Chengsong
parents: 554
diff changeset
    86
\def\grewrite{\rightsquigarrow_g}
Chengsong
parents: 554
diff changeset
    87
\def\frewrites{\stackrel{*}{\rightsquigarrow_f}}
Chengsong
parents: 554
diff changeset
    88
\def\hrewrites{\stackrel{*}{\rightsquigarrow_h}}
Chengsong
parents: 554
diff changeset
    89
\def\grewrites{\stackrel{*}{\rightsquigarrow_g}}
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
    90
\def\fuse{\textit{fuse}}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
    91
\def\bder{\textit{bder}}
542
a7344c9afbaf chapter3 finished
Chengsong
parents: 538
diff changeset
    92
\def\der{\textit{der}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    93
\def\POSIX{\textit{POSIX}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    94
\def\ALTS{\textit{ALTS}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    95
\def\ASTAR{\textit{ASTAR}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    96
\def\DFA{\textit{DFA}}
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
    97
\def\NFA{\textit{NFA}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    98
\def\bmkeps{\textit{bmkeps}}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
    99
\def\bmkepss{\textit{bmkepss}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   100
\def\retrieve{\textit{retrieve}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   101
\def\blexer{\textit{blexer}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   102
\def\flex{\textit{flex}}
573
454ced557605 chapter2 finished polishing
Chengsong
parents: 567
diff changeset
   103
\def\inj{\textit{inj}}
564
Chengsong
parents: 558
diff changeset
   104
\def\Empty{\textit{Empty}}
567
28cb8089ec36 more updaates
Chengsong
parents: 564
diff changeset
   105
\def\Left{\textit{Left}}
28cb8089ec36 more updaates
Chengsong
parents: 564
diff changeset
   106
\def\Right{\textit{Right}}
573
454ced557605 chapter2 finished polishing
Chengsong
parents: 567
diff changeset
   107
\def\Stars{\textit{Stars}}
454ced557605 chapter2 finished polishing
Chengsong
parents: 567
diff changeset
   108
\def\Char{\textit{Char}}
454ced557605 chapter2 finished polishing
Chengsong
parents: 567
diff changeset
   109
\def\Seq{\textit{Seq}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   110
\def\Der{\textit{Der}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   111
\def\Ders{\textit{Ders}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   112
\def\nullable{\mathit{nullable}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   113
\def\Z{\mathit{Z}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   114
\def\S{\mathit{S}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   115
\def\rup{r^\uparrow}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   116
%\def\bderssimp{\mathit{bders}\_\mathit{simp}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   117
\def\distinctWith{\textit{distinctWith}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   118
\def\lf{\textit{lf}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   119
\def\PD{\textit{PD}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   120
\def\suffix{\textit{Suffix}}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   121
\def\distinctBy{\textit{distinctBy}}
558
Chengsong
parents: 557
diff changeset
   122
\def\starupdate{\textit{starUpdate}}
Chengsong
parents: 557
diff changeset
   123
\def\starupdates{\textit{starUpdates}}
620
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
   124
\def\nupdate{\textit{nupdate}}
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
   125
\def\nupdates{\textit{nupdates}}
558
Chengsong
parents: 557
diff changeset
   126
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   127
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   128
\def\size{\mathit{size}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   129
\def\rexp{\mathbf{rexp}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   130
\def\simp{\mathit{simp}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   131
\def\simpALTs{\mathit{simp}\_\mathit{ALTs}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   132
\def\map{\mathit{map}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   133
\def\distinct{\mathit{distinct}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   134
\def\blexersimp{\mathit{blexer}\_\mathit{simp}}
590
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   135
\def\blexerStrong{\textit{blexerStrong}}
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   136
\def\bsimpStrong{\textit{bsimpStrong}}
591
b2d0de6aee18 more polishing integrated comments chap2
Chengsong
parents: 590
diff changeset
   137
\def\bdersStrongs{\textit{bdersStrong}}
590
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   138
\newcommand{\bdersStrong}[2]{#1 \backslash_{bsimpStrongs} #2}
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   139
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   140
\def\map{\textit{map}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   141
\def\rrexp{\textit{rrexp}}
554
Chengsong
parents: 543
diff changeset
   142
\newcommand\rnullable[1]{\textit{rnullable} \; #1 }
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   143
\newcommand\rsize[1]{\llbracket #1 \rrbracket_r}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   144
\newcommand\asize[1]{\llbracket #1 \rrbracket}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   145
\newcommand\rerase[1]{ (#1)_{\downarrow_r}}
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   146
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   147
\newcommand\ChristianComment[1]{\textcolor{blue}{#1}\\}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   148
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   149
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   150
\def\rflts{\textit{rflts}}
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   151
\def\rrewrite{\textit{rrewrite}}
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   152
\def\bsimpalts{\textit{bsimp}_{ALTS}}
596
b306628a0eab more chap 56
Chengsong
parents: 594
diff changeset
   153
\def\bsimpaseq{\textit{bsimp}_{ASEQ}}
b306628a0eab more chap 56
Chengsong
parents: 594
diff changeset
   154
\def\rsimlalts{\textit{rsimp}_{ALTs}}
b306628a0eab more chap 56
Chengsong
parents: 594
diff changeset
   155
\def\rsimpseq{\textit{rsimp}_{SEQ}}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   156
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   157
\def\erase{\textit{erase}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   158
\def\STAR{\textit{STAR}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   159
\def\flts{\textit{flts}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   160
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   161
579
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   162
\def\zeroable{\textit{zeroable}}
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   163
\def\nub{\textit{nub}}
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   164
\def\filter{\textit{filter}}
601
Chengsong
parents: 600
diff changeset
   165
%\def\not{\textit{not}}
579
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   166
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   167
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   168
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   169
\def\RZERO{\mathbf{0}_r }
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   170
\def\RONE{\mathbf{1}_r}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   171
\newcommand\RCHAR[1]{\mathbf{#1}_r}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   172
\newcommand\RSEQ[2]{#1 \cdot #2}
558
Chengsong
parents: 557
diff changeset
   173
\newcommand\RALTS[1]{\sum #1}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   174
\newcommand\RSTAR[1]{#1^*}
558
Chengsong
parents: 557
diff changeset
   175
\newcommand\vsuf[2]{\textit{Suffix} \;#1\;#2}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   176
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   177
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   178
590
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   179
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   180
\lstdefinestyle{myScalastyle}{
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   181
  frame=tb,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   182
  language=scala,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   183
  aboveskip=3mm,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   184
  belowskip=3mm,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   185
  showstringspaces=false,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   186
  columns=flexible,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   187
  basicstyle={\small\ttfamily},
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   188
  numbers=none,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   189
  numberstyle=\tiny\color{gray},
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   190
  keywordstyle=\color{blue},
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   191
  commentstyle=\color{dkgreen},
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   192
  stringstyle=\color{mauve},
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   193
  frame=single,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   194
  breaklines=true,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   195
  breakatwhitespace=true,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   196
  tabsize=3,
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   197
}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   198
590
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   199
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   200
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   201
%This part is about regular expressions, Brzozowski derivatives,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   202
%and a bit-coded lexing algorithm with proven correctness and time bounds.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   203
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   204
%TODO: look up snort rules to use here--give readers idea of what regexes look like
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   205
601
Chengsong
parents: 600
diff changeset
   206
648
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   207
Regular expressions, since their inception in the 1940s, 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   208
have been subject to extensive study and implementation. 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   209
Their primary application lies in lexing, 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   210
a process whereby an unstructured input string is disassembled into a tree of tokens
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   211
with their guidance.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   212
This is often used to match strings that comprises of numerous fields, 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   213
where certain fields may recur or be omitted. 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   214
For example, the regular expression for recognising email addresses is
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   215
\begin{center}
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   216
\verb|^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$|.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   217
\end{center}
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   218
Using this, regular expression matchers and lexers are able to extract 
649
Chengsong
parents: 648
diff changeset
   219
the domain names by the use of \verb|[a-zA-Z0-9.-]+|. 
Chengsong
parents: 648
diff changeset
   220
Consequently, they are an indispensible components in text processing tools 
648
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   221
of software systems such as compilers, IDEs, and firewalls.
601
Chengsong
parents: 600
diff changeset
   222
648
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   223
The study of regular expressions is ongoing due to an
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   224
issue known as catastrophic backtracking. 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   225
This phenomenon refers to scenarios in which the regular expression 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   226
matching or lexing engine exhibits a disproportionately long 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   227
runtime despite the simplicity of the input and expression.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   228
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   229
The origin of catastrophic backtracking is rooted in the 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   230
ambiguities of lexing. 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   231
In the process of matching a multi-character string with 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   232
a regular expression that encompasses several sub-expressions, 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   233
different positions can be designated to mark 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   234
the boundaries of corresponding substrings of the sub-expressions. 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   235
For instance, in matching the string $aaa$ with the 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   236
regular expression $(a+aa)^*$, the divide between 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   237
the initial match and the subsequent iteration could either be 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   238
set between the first and second characters ($a | aa$) or between the second and third characters ($aa | a$). As both the length of the input string and the structural complexity of the regular expression increase, the number of potential delimiter combinations can grow exponentially, leading to a corresponding increase in complexity for algorithms that do not optimally navigate these possibilities.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   239
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   240
Catastrophic backtracking facilitates a category of computationally inexpensive attacks known as Regular Expression Denial of Service (ReDoS) attacks. Here, an attacker merely dispatches a small attack string to a server, provoking high-complexity behaviours in the server's regular expression engine. Such attacks, whether intentional or accidental, have led to the failure of real-world systems (additional details can be found in the practical overview section in chapter \ref{Introduction}).
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   241
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   242
Various disambiguation strategies are employed to select sub-matches, notably including Greedy and POSIX strategies. POSIX, the strategy most widely adopted in practice, invariably opts for the longest possible sub-match. Kuklewicz \cite{KuklewiczHaskell}, for instance, offers a descriptive definition of the POSIX rule in section 1, last paragraph:
601
Chengsong
parents: 600
diff changeset
   243
Chengsong
parents: 600
diff changeset
   244
648
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   245
%Regular expressions 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   246
%have been extensively studied and
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   247
%implemented since their invention in 1940s.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   248
%It is primarily used in lexing, where an unstructured input string is broken
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   249
%down into a tree of tokens.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   250
%That tree's construction is guided by the shape of the regular expression.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   251
%This is particularly useful in expressing the shape of a string
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   252
%with many fields, where certain fields might repeat or be omitted.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   253
%Regular expression matchers and Lexers allow us to 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   254
%identify and delimit different subsections of a string and potentially 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   255
%extract information from inputs, making them
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   256
%an indispensible component in modern software systems' text processing tasks
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   257
%such as compilers, IDEs, and firewalls.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   258
%Research on them is far from over due to the well-known issue called catastrophic-backtracking,
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   259
%which means the regular expression matching or lexing engine takes an unproportional time to run 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   260
%despite the input and the expression being relatively simple.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   261
%
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   262
%Catastrophic backtracking stems from the ambiguities of lexing: 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   263
%when matching a multiple-character string with a regular 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   264
%exression that includes serveral sub-expressions, there might be different positions to set 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   265
%the border between sub-expressions' corresponding sub-strings.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   266
%For example, matching the string $aaa$ against the regular expression
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   267
%$(a+aa)^*$, the border between the initial match and the second iteration
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   268
%could be between the first and second character ($a | aa$) 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   269
%or between the second and third character ($aa | a$).
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   270
%As the size of the input string and the structural complexity 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   271
%of the regular expression increase,
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   272
%the number of different combinations of delimiters can grow exponentially, and
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   273
%algorithms that explore these possibilities unwisely will also see an exponential complexity.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   274
%
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   275
%Catastrophic backtracking allow a class of computationally inexpensive attacks called
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   276
%Regular expression Denial of Service attacks (ReDoS), in which the hacker 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   277
%simply sends out a small attack string to a server, 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   278
%triggering high-complexity behaviours in its regular expression engine. 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   279
%These attacks, be it deliberate or not, have caused real-world systems to go down (see more 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   280
%details of this in the practical overview section in chapter \ref{Introduction}).
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   281
%There are different disambiguation strategies to select sub-matches, most notably Greedy and POSIX.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   282
%The widely adopted strategy in practice is POSIX, which always go for the longest sub-match possible.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   283
%There have been prose definitions like the following
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   284
%by Kuklewicz \cite{KuklewiczHaskell}: 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   285
%described the POSIX rule as (section 1, last paragraph):
646
Chengsong
parents: 638
diff changeset
   286
\begin{quote}
Chengsong
parents: 638
diff changeset
   287
	\begin{itemize}
Chengsong
parents: 638
diff changeset
   288
		\item
Chengsong
parents: 638
diff changeset
   289
regular expressions (REs) take the leftmost starting match, and the longest match starting there
Chengsong
parents: 638
diff changeset
   290
earlier subpatterns have leftmost-longest priority over later subpatterns\\
Chengsong
parents: 638
diff changeset
   291
\item
Chengsong
parents: 638
diff changeset
   292
higher-level subpatterns have leftmost-longest priority over their component subpatterns\\
Chengsong
parents: 638
diff changeset
   293
\item
Chengsong
parents: 638
diff changeset
   294
REs have right associative concatenation which can be changed with parenthesis\\
Chengsong
parents: 638
diff changeset
   295
\item
Chengsong
parents: 638
diff changeset
   296
parenthesized subexpressions return the match from their last usage\\
Chengsong
parents: 638
diff changeset
   297
\item
Chengsong
parents: 638
diff changeset
   298
text of component subexpressions must be contained in the text of the 
Chengsong
parents: 638
diff changeset
   299
higher-level subexpressions\\
Chengsong
parents: 638
diff changeset
   300
\item
Chengsong
parents: 638
diff changeset
   301
if "p" and "q" can never match the same text then "p|q" and "q|p" are equivalent, up to trivial renumbering of captured subexpressions\\
Chengsong
parents: 638
diff changeset
   302
\item
Chengsong
parents: 638
diff changeset
   303
if "p" in "p*" is used to capture non-empty text then additional repetitions of "p" will not capture an empty string\\
Chengsong
parents: 638
diff changeset
   304
\end{itemize}
Chengsong
parents: 638
diff changeset
   305
\end{quote}
649
Chengsong
parents: 648
diff changeset
   306
However, the author noted that various lexers that claim to be POSIX 
Chengsong
parents: 648
diff changeset
   307
are rarely correct according to this standard.
Chengsong
parents: 648
diff changeset
   308
There are numerous occasions where programmers realised the subtlety and
Chengsong
parents: 648
diff changeset
   309
difficulty to implement correctly, one such quote from Go's regexp library author 
Chengsong
parents: 648
diff changeset
   310
\footnote{\url{https://pkg.go.dev/regexp\#pkg-overview}}
Chengsong
parents: 648
diff changeset
   311
\begin{quote}\it
Chengsong
parents: 648
diff changeset
   312
``
Chengsong
parents: 648
diff changeset
   313
The POSIX rule is computationally prohibitive
Chengsong
parents: 648
diff changeset
   314
and not even well-defined.
Chengsong
parents: 648
diff changeset
   315
``
Chengsong
parents: 648
diff changeset
   316
\end{quote}
Chengsong
parents: 648
diff changeset
   317
Being able to formally define and capture the idea of 
Chengsong
parents: 648
diff changeset
   318
POSIX rules and prove 
Chengsong
parents: 648
diff changeset
   319
the correctness of regular expression matching/lexing 
Chengsong
parents: 648
diff changeset
   320
algorithms against the POSIX semantics definitions
Chengsong
parents: 648
diff changeset
   321
is valuable.
Chengsong
parents: 648
diff changeset
   322
646
Chengsong
parents: 638
diff changeset
   323
649
Chengsong
parents: 648
diff changeset
   324
Formal proofs are
Chengsong
parents: 648
diff changeset
   325
machine checked programs
Chengsong
parents: 648
diff changeset
   326
 %such as the ones written in Isabelle/HOL, is a powerful means 
Chengsong
parents: 648
diff changeset
   327
for computer scientists to be certain 
Chengsong
parents: 648
diff changeset
   328
about the correctness of their algorithms.
Chengsong
parents: 648
diff changeset
   329
This is done by 
Chengsong
parents: 648
diff changeset
   330
recursively checking that every fact in a proof script 
Chengsong
parents: 648
diff changeset
   331
is either an axiom or a fact that is derived from
Chengsong
parents: 648
diff changeset
   332
known axioms or verified facts.
Chengsong
parents: 648
diff changeset
   333
%The problem of regular expressions lexing and catastrophic backtracking are a suitable problem for such
Chengsong
parents: 648
diff changeset
   334
%methods as their implementation and complexity analysis tend to be error-prone.
Chengsong
parents: 648
diff changeset
   335
Formal proofs provides an unprecendented level of asssurance
Chengsong
parents: 648
diff changeset
   336
that an algorithm will perform as expected under all inputs.
Chengsong
parents: 648
diff changeset
   337
The software systems that help people interactively build and check
Chengsong
parents: 648
diff changeset
   338
such proofs are called theorem-provers or proof assitants.
Chengsong
parents: 648
diff changeset
   339
Many  theorem-provers have been developed, such as Mizar,
Chengsong
parents: 648
diff changeset
   340
Isabelle/HOL, HOL-Light, HOL4,
Chengsong
parents: 648
diff changeset
   341
Coq, Agda, Idris, Lean and so on.
Chengsong
parents: 648
diff changeset
   342
Isabelle/HOL is a theorem prover with a simple type theory
Chengsong
parents: 648
diff changeset
   343
and powerful automated proof generators like sledgehammer.
Chengsong
parents: 648
diff changeset
   344
We chose to use Isabelle/HOL for its powerful automation
Chengsong
parents: 648
diff changeset
   345
and ease and simplicity in expressing regular expressions and 
Chengsong
parents: 648
diff changeset
   346
regular languages.
Chengsong
parents: 648
diff changeset
   347
%Some of those employ
Chengsong
parents: 648
diff changeset
   348
%dependent types like Mizar, Coq, Agda, Lean and Idris.
Chengsong
parents: 648
diff changeset
   349
%Some support a constructivism approach, such as Coq.
646
Chengsong
parents: 638
diff changeset
   350
Chengsong
parents: 638
diff changeset
   351
649
Chengsong
parents: 648
diff changeset
   352
Formal proofs on regular expression matching and lexing
Chengsong
parents: 648
diff changeset
   353
complements the efforts in
Chengsong
parents: 648
diff changeset
   354
industry which tend to focus on overall speed
Chengsong
parents: 648
diff changeset
   355
with techniques like parallelization (FPGA paper), tackling 
Chengsong
parents: 648
diff changeset
   356
the problem of catastrophic backtracking 
Chengsong
parents: 648
diff changeset
   357
in an ad-hoc manner (cloudflare and stackexchange article).
Chengsong
parents: 648
diff changeset
   358
Chengsong
parents: 648
diff changeset
   359
There have been many interesting steps in the theorem-proving community 
Chengsong
parents: 648
diff changeset
   360
about formalising regular expressions and lexing.
Chengsong
parents: 648
diff changeset
   361
One flavour is to build from the regular expression an automaton, and determine
Chengsong
parents: 648
diff changeset
   362
acceptance in terms of the resulting 
Chengsong
parents: 648
diff changeset
   363
state after executing the input string on that automaton.
Chengsong
parents: 648
diff changeset
   364
Automata formalisations are in general harder and more cumbersome to deal
Chengsong
parents: 648
diff changeset
   365
with for theorem provers than working directly on regular expressions.
Chengsong
parents: 648
diff changeset
   366
One such example is by Nipkow \cite{Nipkow1998}.
Chengsong
parents: 648
diff changeset
   367
%They 
Chengsong
parents: 648
diff changeset
   368
%made everything recursive (for example the next state function),
Chengsong
parents: 648
diff changeset
   369
As a core idea, they
Chengsong
parents: 648
diff changeset
   370
used a list of booleans to name each state so that 
Chengsong
parents: 648
diff changeset
   371
after composing sub-automata together, renaming the states to maintain
Chengsong
parents: 648
diff changeset
   372
the distinctness of each state is recursive and simple.
Chengsong
parents: 648
diff changeset
   373
The result was the obvious lemmas incorporating  
Chengsong
parents: 648
diff changeset
   374
``a painful amount of detail'' in their formalisation.
Chengsong
parents: 648
diff changeset
   375
Sometimes the automata are represented as graphs. 
Chengsong
parents: 648
diff changeset
   376
But graphs are not inductive datatypes.
Chengsong
parents: 648
diff changeset
   377
Having to set the induction principle on the number of nodes
Chengsong
parents: 648
diff changeset
   378
in a graph makes formal reasoning non-intuitive and convoluted,
Chengsong
parents: 648
diff changeset
   379
resulting in large formalisations \cite{Lammich2012}. 
Chengsong
parents: 648
diff changeset
   380
When combining two graphs, one also needs to make sure that the nodes in
Chengsong
parents: 648
diff changeset
   381
both graphs are distinct, which almost always involve
Chengsong
parents: 648
diff changeset
   382
renaming of the nodes.
Chengsong
parents: 648
diff changeset
   383
A theorem-prover which provides dependent types such as Coq 
Chengsong
parents: 648
diff changeset
   384
can alleviate the issue of representing graph nodes
Chengsong
parents: 648
diff changeset
   385
\cite{Doczkal2013}. There the representation of nodes is made
Chengsong
parents: 648
diff changeset
   386
easier by the use of $\textit{FinType}$.
Chengsong
parents: 648
diff changeset
   387
Another representation for automata are matrices. 
Chengsong
parents: 648
diff changeset
   388
But the induction for them is not as straightforward either.
Chengsong
parents: 648
diff changeset
   389
There are some more clever representations, for example one by Paulson 
Chengsong
parents: 648
diff changeset
   390
using hereditarily finite sets \cite{Paulson2015}. 
Chengsong
parents: 648
diff changeset
   391
There the problem with combining graphs can be solved better. 
Chengsong
parents: 648
diff changeset
   392
%but we believe that such clever tricks are not very obvious for 
Chengsong
parents: 648
diff changeset
   393
%the John-average-Isabelle-user.
Chengsong
parents: 648
diff changeset
   394
Chengsong
parents: 648
diff changeset
   395
The approach that operates directly on regular expressions circumvents the problem of
Chengsong
parents: 648
diff changeset
   396
conversion between a regular expression and an automaton, thereby avoiding representation
Chengsong
parents: 648
diff changeset
   397
problems altogether, despite that regular expressions may be seen as a variant of a
Chengsong
parents: 648
diff changeset
   398
non-deterministic finite automaton (ref Laurikari tagged NFA paper).
Chengsong
parents: 648
diff changeset
   399
To matching a string, a sequence of algebraic transformations called 
Chengsong
parents: 648
diff changeset
   400
(Brzozowski) $\textit{derivatives}$ (ref Brzozowski) is carried out on that regular expression.
Chengsong
parents: 648
diff changeset
   401
Each derivative takes a character and a regular expression, 
Chengsong
parents: 648
diff changeset
   402
and returns a new regular expression whose language is closely related to 
Chengsong
parents: 648
diff changeset
   403
the original regular expression's language:
Chengsong
parents: 648
diff changeset
   404
strings prefixed with that input character will have their head removed
Chengsong
parents: 648
diff changeset
   405
and strings not prefixed
Chengsong
parents: 648
diff changeset
   406
with that character will be eliminated. 
Chengsong
parents: 648
diff changeset
   407
After taking derivatives with respect to all the characters the string is
Chengsong
parents: 648
diff changeset
   408
exhausted. Then an algorithm checks whether the empty string is in that final 
Chengsong
parents: 648
diff changeset
   409
regular expression's language.
Chengsong
parents: 648
diff changeset
   410
If so, a match exists and the string is in the language of the input regular expression.
Chengsong
parents: 648
diff changeset
   411
Chengsong
parents: 648
diff changeset
   412
Again this process can be seen as the simulation of an NFA running on a string,
Chengsong
parents: 648
diff changeset
   413
but the recursive nature of the datatypes and functions involved makes
Chengsong
parents: 648
diff changeset
   414
derivatives a perfect object of study for theorem provers.
Chengsong
parents: 648
diff changeset
   415
That is why there has been numerous formalisations of regular expressions
Chengsong
parents: 648
diff changeset
   416
and Brzozowski derivatives in the functional programming and 
Chengsong
parents: 648
diff changeset
   417
theorem proving community (a large list of refs to derivatives formalisation publications).
Chengsong
parents: 648
diff changeset
   418
Ribeiro and Du Bois \cite{RibeiroAgda2017} have 
Chengsong
parents: 648
diff changeset
   419
formalised the notion of bit-coded regular expressions
Chengsong
parents: 648
diff changeset
   420
and proved their relations with simple regular expressions in
Chengsong
parents: 648
diff changeset
   421
the dependently-typed proof assistant Agda.
Chengsong
parents: 648
diff changeset
   422
They also proved the soundness and completeness of a matching algorithm
Chengsong
parents: 648
diff changeset
   423
based on the bit-coded regular expressions. Their algorithm is a decision procedure
Chengsong
parents: 648
diff changeset
   424
that gives a Yes/No answer, which does not produce
Chengsong
parents: 648
diff changeset
   425
lexical values.
Chengsong
parents: 648
diff changeset
   426
%X also formalised derivatives and regular expressions, producing "parse trees".
Chengsong
parents: 648
diff changeset
   427
%(Some person who's a big name in formal methods)
Chengsong
parents: 648
diff changeset
   428
Chengsong
parents: 648
diff changeset
   429
Chengsong
parents: 648
diff changeset
   430
The variant of the problem we are looking at centers around
Chengsong
parents: 648
diff changeset
   431
an algorithm (which we call $\blexer$) developed by Sulzmann and Lu \ref{Sulzmann2014}.
Chengsong
parents: 648
diff changeset
   432
The reason we chose to look at $\blexer$ and its simplifications 
Chengsong
parents: 648
diff changeset
   433
is because it allows a lexical tree to be generated
Chengsong
parents: 648
diff changeset
   434
by some elegant and subtle procedure based on Brzozowski derivatives.
Chengsong
parents: 648
diff changeset
   435
The procedures are made of recursive functions and inductive datatypes just like derivatives, 
Chengsong
parents: 648
diff changeset
   436
allowing intuitive and concise formal reasoning with theorem provers.
Chengsong
parents: 648
diff changeset
   437
Most importantly, $\blexer$ opens up a path to an optimized version
Chengsong
parents: 648
diff changeset
   438
of $\blexersimp$ possibilities to improve
Chengsong
parents: 648
diff changeset
   439
performance with simplifications that aggressively change the structure of regular expressions.
Chengsong
parents: 648
diff changeset
   440
While most derivative-based methods
Chengsong
parents: 648
diff changeset
   441
rely on structures to be maintained to allow induction to
Chengsong
parents: 648
diff changeset
   442
go through.
Chengsong
parents: 648
diff changeset
   443
For example, Egolf et al. \ref{Verbatim} have developed a verified lexer
Chengsong
parents: 648
diff changeset
   444
with derivatives, but as soon as they started introducing
Chengsong
parents: 648
diff changeset
   445
optimizations such as memoization, they reverted to constructing
Chengsong
parents: 648
diff changeset
   446
DFAs first.
Chengsong
parents: 648
diff changeset
   447
Edelmann \ref{Edelmann2020} explored similar optimizations in his
Chengsong
parents: 648
diff changeset
   448
work on verified LL(1) parsing, with additional enhancements with data structures like
Chengsong
parents: 648
diff changeset
   449
zippers.
Chengsong
parents: 648
diff changeset
   450
Chengsong
parents: 648
diff changeset
   451
%Sulzmann and Lu have worked out an algorithm
Chengsong
parents: 648
diff changeset
   452
%that is especially suited for verification
Chengsong
parents: 648
diff changeset
   453
%which utilized the fact
Chengsong
parents: 648
diff changeset
   454
%that whenever ambiguity occurs one may duplicate a sub-expression and use
Chengsong
parents: 648
diff changeset
   455
%different copies to describe different matching choices.
Chengsong
parents: 648
diff changeset
   456
The idea behind the correctness of $\blexer$ is simple: during a derivative,
Chengsong
parents: 648
diff changeset
   457
multiple matches might be possible, where an alternative with multiple children
Chengsong
parents: 648
diff changeset
   458
each corresponding to a 
Chengsong
parents: 648
diff changeset
   459
different match is created. In the end of
Chengsong
parents: 648
diff changeset
   460
a lexing process one always picks up the leftmost alternative, which is guarnateed 
Chengsong
parents: 648
diff changeset
   461
to be a POSIX value.
Chengsong
parents: 648
diff changeset
   462
This is done by consistently keeping sub-regular expressions in an alternative
Chengsong
parents: 648
diff changeset
   463
with longer submatches
Chengsong
parents: 648
diff changeset
   464
to the left of other copies (
Chengsong
parents: 648
diff changeset
   465
Remember that POSIX values are roughly the values with the longest inital
Chengsong
parents: 648
diff changeset
   466
submatch).
Chengsong
parents: 648
diff changeset
   467
The idea behind the optimized version of $\blexer$, which we call $\blexersimp$,
Chengsong
parents: 648
diff changeset
   468
is that since we only take the leftmost copy, then all re-occurring copies can be
Chengsong
parents: 648
diff changeset
   469
eliminated without losing the POSIX property, and this can be done with
Chengsong
parents: 648
diff changeset
   470
children of alternatives at different levels by merging them together.
Chengsong
parents: 648
diff changeset
   471
Proving $\blexersimp$ requires a different
Chengsong
parents: 648
diff changeset
   472
proof strategy compared to that by Ausaf \cite{FahadThesis}.
Chengsong
parents: 648
diff changeset
   473
We invent a rewriting relation as an
Chengsong
parents: 648
diff changeset
   474
inductive predicate which captures 
Chengsong
parents: 648
diff changeset
   475
a strong enough invariance that ensures correctness,
Chengsong
parents: 648
diff changeset
   476
which commutes with the derivative operation. 
Chengsong
parents: 648
diff changeset
   477
This predicate allows a simple
Chengsong
parents: 648
diff changeset
   478
induction on the input string to go through.
Chengsong
parents: 648
diff changeset
   479
Chengsong
parents: 648
diff changeset
   480
%This idea has been repeatedly used in different variants of lexing
Chengsong
parents: 648
diff changeset
   481
%algorithms in their paper, one of which involves bit-codes. The bit-coded
Chengsong
parents: 648
diff changeset
   482
%derivative-based algorithm even allows relatively aggressive 
Chengsong
parents: 648
diff changeset
   483
%%simplification rules which cause
Chengsong
parents: 648
diff changeset
   484
%structural changes that render the induction used in the correctness
Chengsong
parents: 648
diff changeset
   485
%proofs unusable.
Chengsong
parents: 648
diff changeset
   486
%More details will be given in \ref{Bitcoded2} including the
Chengsong
parents: 648
diff changeset
   487
%new correctness proof which involves a new inductive predicate which allows 
Chengsong
parents: 648
diff changeset
   488
%rule induction to go through.
Chengsong
parents: 648
diff changeset
   489
Chengsong
parents: 648
diff changeset
   490
Chengsong
parents: 648
diff changeset
   491
Chengsong
parents: 648
diff changeset
   492
Chengsong
parents: 648
diff changeset
   493
Chengsong
parents: 648
diff changeset
   494
Chengsong
parents: 648
diff changeset
   495
Chengsong
parents: 648
diff changeset
   496
%first character is removed 
Chengsong
parents: 648
diff changeset
   497
%state of the automaton after matching that character 
Chengsong
parents: 648
diff changeset
   498
%where nodes are represented as 
Chengsong
parents: 648
diff changeset
   499
%a sub-expression (for example tagged NFA
Chengsong
parents: 648
diff changeset
   500
%Working on regular expressions 
Chengsong
parents: 648
diff changeset
   501
%Because of these problems with automata, we prefer regular expressions
Chengsong
parents: 648
diff changeset
   502
%and derivatives rather than an automata (or graph-based) approach which explicitly converts between
Chengsong
parents: 648
diff changeset
   503
%the regular expression and a different data structure.
Chengsong
parents: 648
diff changeset
   504
%
Chengsong
parents: 648
diff changeset
   505
%
Chengsong
parents: 648
diff changeset
   506
%The key idea 
646
Chengsong
parents: 638
diff changeset
   507
648
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   508
(ends)
646
Chengsong
parents: 638
diff changeset
   509
Chengsong
parents: 638
diff changeset
   510
%Regular expressions are widely used in computer science: 
Chengsong
parents: 638
diff changeset
   511
%be it in text-editors \parencite{atomEditor} with syntax highlighting and auto-completion;
Chengsong
parents: 638
diff changeset
   512
%command-line tools like $\mathit{grep}$ that facilitate easy 
Chengsong
parents: 638
diff changeset
   513
%text-processing \cite{grep}; network intrusion
Chengsong
parents: 638
diff changeset
   514
%detection systems that inspect suspicious traffic; or compiler
Chengsong
parents: 638
diff changeset
   515
%front ends.
Chengsong
parents: 638
diff changeset
   516
%Given their usefulness and ubiquity, one would assume that
Chengsong
parents: 638
diff changeset
   517
%modern regular expression matching implementations
Chengsong
parents: 638
diff changeset
   518
%are mature and fully studied.
Chengsong
parents: 638
diff changeset
   519
%Indeed, in many popular programming languages' regular expression engines, 
Chengsong
parents: 638
diff changeset
   520
%one can supply a regular expression and a string, and in most cases get
Chengsong
parents: 638
diff changeset
   521
%get the matching information in a very short time.
Chengsong
parents: 638
diff changeset
   522
%Those engines can sometimes be blindingly fast--some 
Chengsong
parents: 638
diff changeset
   523
%network intrusion detection systems
Chengsong
parents: 638
diff changeset
   524
%use regular expression engines that are able to process 
Chengsong
parents: 638
diff changeset
   525
%hundreds of megabytes or even gigabytes of data per second \parencite{Turo_ov__2020}.
Chengsong
parents: 638
diff changeset
   526
%However, those engines can sometimes also exhibit a surprising security vulnerability
Chengsong
parents: 638
diff changeset
   527
%under a certain class of inputs.
602
Chengsong
parents: 601
diff changeset
   528
%However, , this is not the case for $\mathbf{all}$ inputs.
601
Chengsong
parents: 600
diff changeset
   529
%TODO: get source for SNORT/BRO's regex matching engine/speed
Chengsong
parents: 600
diff changeset
   530
603
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   531
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   532
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   533
\section{Contribution}
646
Chengsong
parents: 638
diff changeset
   534
{\color{red} \rule{\linewidth}{0.5mm}}
Chengsong
parents: 638
diff changeset
   535
\textbf{issue with this part: way too short, not enough details of what I have done.}
Chengsong
parents: 638
diff changeset
   536
{\color{red} \rule{\linewidth}{0.5mm}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   537
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   538
In this thesis,
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   539
we propose a solution to catastrophic
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   540
backtracking and error-prone matchers: a formally verified
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   541
regular expression lexing algorithm
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   542
that is both fast
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   543
and correct. 
646
Chengsong
parents: 638
diff changeset
   544
{\color{red} \rule{\linewidth}{0.5mm}}
Chengsong
parents: 638
diff changeset
   545
\HandRight Added content:
Chengsong
parents: 638
diff changeset
   546
%Package \verb`pifont`: \ding{43}
Chengsong
parents: 638
diff changeset
   547
%Package \verb`utfsym`: \usym{261E} 
Chengsong
parents: 638
diff changeset
   548
%Package \verb`dingbat`: \leftpointright 
Chengsong
parents: 638
diff changeset
   549
%Package \verb`dingbat`: \rightpointright 
Chengsong
parents: 638
diff changeset
   550
Chengsong
parents: 638
diff changeset
   551
In particular, the main problem we solved on top of previous work was
Chengsong
parents: 638
diff changeset
   552
coming up with a formally verified algorithm called $\blexersimp$ based
Chengsong
parents: 638
diff changeset
   553
on Brzozowski derivatives. It calculates a POSIX
Chengsong
parents: 638
diff changeset
   554
lexical value from a string and a regular expression. This algorithm was originally
Chengsong
parents: 638
diff changeset
   555
by Sulzmann and Lu \cite{Sulzmann2014}, but we made the key observation that its $\textit{nub}$
Chengsong
parents: 638
diff changeset
   556
function does not really simplify intermediate results where it needs to and improved the
Chengsong
parents: 638
diff changeset
   557
algorithm accordingly.
Chengsong
parents: 638
diff changeset
   558
We have proven our $\blexersimp$'s internal data structure does not grow beyond a constant $N_r$
Chengsong
parents: 638
diff changeset
   559
depending on the input regular expression $r$, thanks to the aggressive simplifications of $\blexersimp$:
Chengsong
parents: 638
diff changeset
   560
\begin{theorem}
Chengsong
parents: 638
diff changeset
   561
$|\blexersimp \; r \; s | \leq N_r$
Chengsong
parents: 638
diff changeset
   562
\end{theorem}
Chengsong
parents: 638
diff changeset
   563
The simplifications applied in each step of $\blexersimp$ 
Chengsong
parents: 638
diff changeset
   564
Chengsong
parents: 638
diff changeset
   565
\begin{center}
Chengsong
parents: 638
diff changeset
   566
	$\blexersimp
Chengsong
parents: 638
diff changeset
   567
	$
Chengsong
parents: 638
diff changeset
   568
\end{center}
Chengsong
parents: 638
diff changeset
   569
keeps the derivatives small, but presents a 
Chengsong
parents: 638
diff changeset
   570
challenge
Chengsong
parents: 638
diff changeset
   571
Chengsong
parents: 638
diff changeset
   572
Chengsong
parents: 638
diff changeset
   573
establishing a correctness theorem of the below form:
Chengsong
parents: 638
diff changeset
   574
%TODO: change this to "theorem to prove"
Chengsong
parents: 638
diff changeset
   575
\begin{theorem}
Chengsong
parents: 638
diff changeset
   576
	If the POSIX lexical value of matching regular expression $r$ with string $s$ is $v$, 
Chengsong
parents: 638
diff changeset
   577
	then $\blexersimp\; r \; s = \Some \; v$. Otherwise 
Chengsong
parents: 638
diff changeset
   578
	$\blexersimp \; r \; s = \None$.
Chengsong
parents: 638
diff changeset
   579
\end{theorem}
Chengsong
parents: 638
diff changeset
   580
Chengsong
parents: 638
diff changeset
   581
Chengsong
parents: 638
diff changeset
   582
Chengsong
parents: 638
diff changeset
   583
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   584
The result is %a regular expression lexing algorithm that comes with 
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   585
\begin{itemize}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   586
\item
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   587
an improved version of  Sulzmann and Lu's bit-coded algorithm using 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   588
derivatives with simplifications, 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   589
accompanied by
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   590
a proven correctness theorem according to POSIX specification 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   591
given by Ausaf et al. \cite{AusafDyckhoffUrban2016}, 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   592
\item 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   593
a complexity-related property for that algorithm saying that the 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   594
internal data structure will
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   595
remain below a finite bound,
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   596
\item
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   597
and an extension to
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   598
the bounded repetition constructs with the correctness and finiteness property
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   599
maintained.
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
   600
\end{itemize}
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
   601
\noindent
646
Chengsong
parents: 638
diff changeset
   602
{\color{red} \rule{\linewidth}{0.5mm}}
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   603
With a formal finiteness bound in place,
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   604
we can greatly reduce the attack surface of servers in terms of ReDoS attacks.
631
Chengsong
parents: 630
diff changeset
   605
The Isabelle/HOL code for our formalisation can be 
Chengsong
parents: 630
diff changeset
   606
found at
Chengsong
parents: 630
diff changeset
   607
\begin{center}
Chengsong
parents: 630
diff changeset
   608
\url{https://github.com/hellotommmy/posix}
Chengsong
parents: 630
diff changeset
   609
\end{center}
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   610
Further improvements to the algorithm with an even stronger version of 
631
Chengsong
parents: 630
diff changeset
   611
simplification can be made. We conjecture that the resulting size of derivatives
Chengsong
parents: 630
diff changeset
   612
can be bounded by a cubic bound w.r.t. the size of the regular expression.
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   613
We will give relevant code in Scala, 
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   614
but do not give a formal proof for that in Isabelle/HOL.
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   615
This is still future work.
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   616
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   617
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   618
\section{Structure of the thesis}
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
   619
In chapter \ref{Inj} we will introduce the concepts
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   620
and notations we 
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
   621
use for describing regular expressions and derivatives,
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   622
and the first version of Sulzmann and Lu's lexing algorithm without bitcodes (including 
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
   623
its correctness proof).
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
   624
We will give their second lexing algorithm with bitcodes in \ref{Bitcoded1}
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
   625
together with the correctness proof by Ausaf and Urban.
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
   626
Then we illustrate in chapter \ref{Bitcoded2}
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
   627
how Sulzmann and Lu's
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   628
simplifications fail to simplify correctly. We therefore introduce our own version of the
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   629
algorithm with correct simplifications and 
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   630
their correctness proof.  
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
   631
In chapter \ref{Finite} we give the second guarantee
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   632
of our bitcoded algorithm, that is a finite bound on the size of any 
631
Chengsong
parents: 630
diff changeset
   633
regular expression's derivatives. 
Chengsong
parents: 630
diff changeset
   634
We also show how one can extend the
Chengsong
parents: 630
diff changeset
   635
algorithm to include bounded repetitions. 
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
   636
In chapter \ref{Cubic} we discuss stronger simplification rules which 
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   637
improve the finite bound to a cubic bound. %and the NOT regular expression.
637
Chengsong
parents: 636
diff changeset
   638
Chapter \ref{RelatedWork} introduces relevant work for this thesis.
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   639
Chapter \ref{Future} concludes and mentions avenues of future research.
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   640
 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   641
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   642
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   643
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   644
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   645
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   646
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   647
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   648
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   649
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   650
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   651
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   652
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   653
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   654