ChengsongTanPhdThesis/Chapters/Introduction.tex
author Chengsong
Mon, 10 Jul 2023 19:29:22 +0100
changeset 664 ba44144875b1
parent 654 2ad20ba5b178
child 665 3bedbdce3a3b
permissions -rwxr-xr-x
introduction Contribution section update
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     1
% Chapter 1
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     2
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     3
\chapter{Introduction} % Main chapter title
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     4
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     5
\label{Introduction} % For referencing the chapter elsewhere, use \ref{Chapter1} 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     6
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     7
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     8
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     9
% Define some commands to keep the formatting separated from the content 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    10
\newcommand{\keyword}[1]{\textbf{#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    11
\newcommand{\tabhead}[1]{\textbf{#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    12
\newcommand{\code}[1]{\texttt{#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    13
\newcommand{\file}[1]{\texttt{\bfseries#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    14
\newcommand{\option}[1]{\texttt{\itshape#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    15
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    16
%boxes
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    17
\newcommand*{\mybox}[1]{\framebox{\strut #1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    18
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    19
%\newcommand{\sflataux}[1]{\textit{sflat}\_\textit{aux} \, #1}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    20
\newcommand\sflat[1]{\llparenthesis #1 \rrparenthesis }
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    21
\newcommand{\ASEQ}[3]{\textit{ASEQ}_{#1} \, #2 \, #3}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
    22
\newcommand{\bderssimp}[2]{#1 \backslash_{bsimps} #2}
596
b306628a0eab more chap 56
Chengsong
parents: 594
diff changeset
    23
\newcommand{\rderssimp}[2]{#1 \backslash_{rsimps} #2}
564
Chengsong
parents: 558
diff changeset
    24
\def\derssimp{\textit{ders}\_\textit{simp}}
557
812e5d112f49 more changes
Chengsong
parents: 556
diff changeset
    25
\def\rders{\textit{rders}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    26
\newcommand{\bders}[2]{#1 \backslash #2}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    27
\newcommand{\bsimp}[1]{\textit{bsimp}(#1)}
591
b2d0de6aee18 more polishing integrated comments chap2
Chengsong
parents: 590
diff changeset
    28
\def\bsimps{\textit{bsimp}}
554
Chengsong
parents: 543
diff changeset
    29
\newcommand{\rsimp}[1]{\textit{rsimp}\; #1}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    30
\newcommand{\sflataux}[1]{\llparenthesis #1 \rrparenthesis'}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    31
\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    32
\newcommand{\denote}{\stackrel{\mbox{\scriptsize denote}}{=}}%
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    33
\newcommand{\ZERO}{\mbox{\bf 0}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    34
\newcommand{\ONE}{\mbox{\bf 1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    35
\newcommand{\AALTS}[2]{\oplus {\scriptstyle #1}\, #2}
555
Chengsong
parents: 554
diff changeset
    36
\newcommand{\rdistinct}[2]{\textit{rdistinct} \;\; #1 \;\; #2}
594
Chengsong
parents: 591
diff changeset
    37
\def\rdistincts{\textit{rdistinct}}
556
Chengsong
parents: 555
diff changeset
    38
\def\rDistinct{\textit{rdistinct}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    39
\newcommand\hflat[1]{\llparenthesis  #1 \rrparenthesis_*}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    40
\newcommand\hflataux[1]{\llparenthesis #1 \rrparenthesis_*'}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    41
\newcommand\createdByStar[1]{\textit{createdByStar}(#1)}
620
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
    42
\def\cbn{\textit{createdByNtimes}}
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
    43
\def\hpa{\textit{highestPowerAux}}
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
    44
\def\hpower{\textit{highestPower}}
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
    45
\def\ntset{\textit{ntset}}
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
    46
\def\optermsimp{\textit{optermsimp}}
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
    47
\def\optermOsimp{\textit{optermOsimp}}
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
    48
\def\optermosimp{\textit{optermosimp}}
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
    49
\def\opterm{\textit{opterm}}
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
    50
\def\nString{\textit{nonemptyString}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    51
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    52
\newcommand\myequiv{\mathrel{\stackrel{\makebox[0pt]{\mbox{\normalfont\tiny equiv}}}{=}}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    53
600
fd068f39ac23 chap4 comments done
Chengsong
parents: 596
diff changeset
    54
\def\SEQ{\textit{SEQ}}
fd068f39ac23 chap4 comments done
Chengsong
parents: 596
diff changeset
    55
\def\SEQs{\textit{SEQs}}
564
Chengsong
parents: 558
diff changeset
    56
\def\case{\textit{case}}
554
Chengsong
parents: 543
diff changeset
    57
\def\sequal{\stackrel{\mbox{\scriptsize rsimp}}{=}}
Chengsong
parents: 543
diff changeset
    58
\def\rsimpalts{\textit{rsimp}_{ALTS}}
Chengsong
parents: 543
diff changeset
    59
\def\good{\textit{good}}
Chengsong
parents: 543
diff changeset
    60
\def\btrue{\textit{true}}
Chengsong
parents: 543
diff changeset
    61
\def\bfalse{\textit{false}}
542
a7344c9afbaf chapter3 finished
Chengsong
parents: 538
diff changeset
    62
\def\bnullable{\textit{bnullable}}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
    63
\def\bnullables{\textit{bnullables}}
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
    64
\def\Some{\textit{Some}}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
    65
\def\None{\textit{None}}
537
Chengsong
parents: 532
diff changeset
    66
\def\code{\textit{code}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    67
\def\decode{\textit{decode}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    68
\def\internalise{\textit{internalise}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    69
\def\lexer{\mathit{lexer}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    70
\def\mkeps{\textit{mkeps}}
557
812e5d112f49 more changes
Chengsong
parents: 556
diff changeset
    71
\newcommand{\rder}[2]{#2 \backslash_r #1}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    72
585
4969ef817d92 chap4 more
Chengsong
parents: 579
diff changeset
    73
\def\rerases{\textit{rerase}}
4969ef817d92 chap4 more
Chengsong
parents: 579
diff changeset
    74
554
Chengsong
parents: 543
diff changeset
    75
\def\nonnested{\textit{nonnested}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    76
\def\AZERO{\textit{AZERO}}
558
Chengsong
parents: 557
diff changeset
    77
\def\sizeNregex{\textit{sizeNregex}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    78
\def\AONE{\textit{AONE}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    79
\def\ACHAR{\textit{ACHAR}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    80
585
4969ef817d92 chap4 more
Chengsong
parents: 579
diff changeset
    81
\def\simpsulz{\textit{simp}_{Sulz}}
4969ef817d92 chap4 more
Chengsong
parents: 579
diff changeset
    82
557
812e5d112f49 more changes
Chengsong
parents: 556
diff changeset
    83
\def\scfrewrites{\stackrel{*}{\rightsquigarrow_{scf}}}
555
Chengsong
parents: 554
diff changeset
    84
\def\frewrite{\rightsquigarrow_f}
Chengsong
parents: 554
diff changeset
    85
\def\hrewrite{\rightsquigarrow_h}
Chengsong
parents: 554
diff changeset
    86
\def\grewrite{\rightsquigarrow_g}
Chengsong
parents: 554
diff changeset
    87
\def\frewrites{\stackrel{*}{\rightsquigarrow_f}}
Chengsong
parents: 554
diff changeset
    88
\def\hrewrites{\stackrel{*}{\rightsquigarrow_h}}
Chengsong
parents: 554
diff changeset
    89
\def\grewrites{\stackrel{*}{\rightsquigarrow_g}}
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
    90
\def\fuse{\textit{fuse}}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
    91
\def\bder{\textit{bder}}
542
a7344c9afbaf chapter3 finished
Chengsong
parents: 538
diff changeset
    92
\def\der{\textit{der}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    93
\def\POSIX{\textit{POSIX}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    94
\def\ALTS{\textit{ALTS}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    95
\def\ASTAR{\textit{ASTAR}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    96
\def\DFA{\textit{DFA}}
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
    97
\def\NFA{\textit{NFA}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    98
\def\bmkeps{\textit{bmkeps}}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
    99
\def\bmkepss{\textit{bmkepss}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   100
\def\retrieve{\textit{retrieve}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   101
\def\blexer{\textit{blexer}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   102
\def\flex{\textit{flex}}
573
454ced557605 chapter2 finished polishing
Chengsong
parents: 567
diff changeset
   103
\def\inj{\textit{inj}}
564
Chengsong
parents: 558
diff changeset
   104
\def\Empty{\textit{Empty}}
567
28cb8089ec36 more updaates
Chengsong
parents: 564
diff changeset
   105
\def\Left{\textit{Left}}
28cb8089ec36 more updaates
Chengsong
parents: 564
diff changeset
   106
\def\Right{\textit{Right}}
573
454ced557605 chapter2 finished polishing
Chengsong
parents: 567
diff changeset
   107
\def\Stars{\textit{Stars}}
454ced557605 chapter2 finished polishing
Chengsong
parents: 567
diff changeset
   108
\def\Char{\textit{Char}}
454ced557605 chapter2 finished polishing
Chengsong
parents: 567
diff changeset
   109
\def\Seq{\textit{Seq}}
653
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   110
\def\Alt{\textit{Alt}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   111
\def\Der{\textit{Der}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   112
\def\Ders{\textit{Ders}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   113
\def\nullable{\mathit{nullable}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   114
\def\Z{\mathit{Z}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   115
\def\S{\mathit{S}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   116
\def\rup{r^\uparrow}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   117
%\def\bderssimp{\mathit{bders}\_\mathit{simp}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   118
\def\distinctWith{\textit{distinctWith}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   119
\def\lf{\textit{lf}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   120
\def\PD{\textit{PD}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   121
\def\suffix{\textit{Suffix}}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   122
\def\distinctBy{\textit{distinctBy}}
558
Chengsong
parents: 557
diff changeset
   123
\def\starupdate{\textit{starUpdate}}
Chengsong
parents: 557
diff changeset
   124
\def\starupdates{\textit{starUpdates}}
620
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
   125
\def\nupdate{\textit{nupdate}}
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
   126
\def\nupdates{\textit{nupdates}}
558
Chengsong
parents: 557
diff changeset
   127
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   128
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   129
\def\size{\mathit{size}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   130
\def\rexp{\mathbf{rexp}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   131
\def\simp{\mathit{simp}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   132
\def\simpALTs{\mathit{simp}\_\mathit{ALTs}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   133
\def\map{\mathit{map}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   134
\def\distinct{\mathit{distinct}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   135
\def\blexersimp{\mathit{blexer}\_\mathit{simp}}
590
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   136
\def\blexerStrong{\textit{blexerStrong}}
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   137
\def\bsimpStrong{\textit{bsimpStrong}}
591
b2d0de6aee18 more polishing integrated comments chap2
Chengsong
parents: 590
diff changeset
   138
\def\bdersStrongs{\textit{bdersStrong}}
590
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   139
\newcommand{\bdersStrong}[2]{#1 \backslash_{bsimpStrongs} #2}
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   140
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   141
\def\map{\textit{map}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   142
\def\rrexp{\textit{rrexp}}
554
Chengsong
parents: 543
diff changeset
   143
\newcommand\rnullable[1]{\textit{rnullable} \; #1 }
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   144
\newcommand\rsize[1]{\llbracket #1 \rrbracket_r}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   145
\newcommand\asize[1]{\llbracket #1 \rrbracket}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   146
\newcommand\rerase[1]{ (#1)_{\downarrow_r}}
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   147
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   148
\newcommand\ChristianComment[1]{\textcolor{blue}{#1}\\}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   149
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   150
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   151
\def\rflts{\textit{rflts}}
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   152
\def\rrewrite{\textit{rrewrite}}
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   153
\def\bsimpalts{\textit{bsimp}_{ALTS}}
596
b306628a0eab more chap 56
Chengsong
parents: 594
diff changeset
   154
\def\bsimpaseq{\textit{bsimp}_{ASEQ}}
b306628a0eab more chap 56
Chengsong
parents: 594
diff changeset
   155
\def\rsimlalts{\textit{rsimp}_{ALTs}}
b306628a0eab more chap 56
Chengsong
parents: 594
diff changeset
   156
\def\rsimpseq{\textit{rsimp}_{SEQ}}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   157
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   158
\def\erase{\textit{erase}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   159
\def\STAR{\textit{STAR}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   160
\def\flts{\textit{flts}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   161
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   162
579
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   163
\def\zeroable{\textit{zeroable}}
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   164
\def\nub{\textit{nub}}
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   165
\def\filter{\textit{filter}}
601
Chengsong
parents: 600
diff changeset
   166
%\def\not{\textit{not}}
579
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   167
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   168
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   169
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   170
\def\RZERO{\mathbf{0}_r }
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   171
\def\RONE{\mathbf{1}_r}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   172
\newcommand\RCHAR[1]{\mathbf{#1}_r}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   173
\newcommand\RSEQ[2]{#1 \cdot #2}
558
Chengsong
parents: 557
diff changeset
   174
\newcommand\RALTS[1]{\sum #1}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   175
\newcommand\RSTAR[1]{#1^*}
558
Chengsong
parents: 557
diff changeset
   176
\newcommand\vsuf[2]{\textit{Suffix} \;#1\;#2}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   177
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   178
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   179
590
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   180
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   181
\lstdefinestyle{myScalastyle}{
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   182
  frame=tb,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   183
  language=scala,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   184
  aboveskip=3mm,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   185
  belowskip=3mm,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   186
  showstringspaces=false,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   187
  columns=flexible,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   188
  basicstyle={\small\ttfamily},
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   189
  numbers=none,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   190
  numberstyle=\tiny\color{gray},
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   191
  keywordstyle=\color{blue},
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   192
  commentstyle=\color{dkgreen},
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   193
  stringstyle=\color{mauve},
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   194
  frame=single,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   195
  breaklines=true,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   196
  breakatwhitespace=true,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   197
  tabsize=3,
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   198
}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   199
590
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   200
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   201
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   202
%This part is about regular expressions, Brzozowski derivatives,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   203
%and a bit-coded lexing algorithm with proven correctness and time bounds.
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   204
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   205
%TODO: look up snort rules to use here--give readers idea of what regexes look like
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   206
654
Chengsong
parents: 653
diff changeset
   207
\marginpar{rephrasing using "imprecise words"}
648
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   208
Regular expressions, since their inception in the 1940s, 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   209
have been subject to extensive study and implementation. 
653
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   210
Their primary application lies in text processing--finding
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   211
matches and identifying patterns in a string.
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   212
%It is often used to match strings that comprises of numerous fields, 
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   213
%where certain fields may recur or be omitted. 
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   214
For example, a simple regular expression that tries 
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   215
to recognise email addresses is
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   216
\marginpar{rephrased from "the regex for recognising" to "a simple regex that tries to match email"}
648
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   217
\begin{center}
654
Chengsong
parents: 653
diff changeset
   218
\verb|[a-z0-9._]^+@[a-z0-9.-]^+\.\{a-z\}\{2,6\}|
653
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   219
%$[a-z0-9._]^+@[a-z0-9.-]^+\.[a-z]{2,6}$.
648
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   220
\end{center}
653
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   221
\marginpar{Simplified example, but the distinction between . and escaped . is correct
654
Chengsong
parents: 653
diff changeset
   222
and therefore left unchanged. Also verbatim package does not straightforwardly support superscripts so + kept as they are.}
653
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   223
%Using this, regular expression matchers and lexers are able to extract 
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   224
%the domain names by the use of \verb|[a-zA-Z0-9.-]+|. 
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   225
\marginpar{Rewrote explanation for the expression.}
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   226
The bracketed sub-expressions are used to extract specific parts of an email address.
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   227
The local part is recognised by the expression enclosed in 
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   228
the first pair of brackets: $[a-z0-9._]$, and after the ``@'' sign
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   229
is the part that recognises the domain, where $[a-z]{2, 6}$ specifically
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   230
matches the top-level domain.
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   231
%Consequently, they are an indispensible components in text processing tools 
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   232
%of software systems such as compilers, IDEs, and firewalls.
601
Chengsong
parents: 600
diff changeset
   233
648
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   234
The study of regular expressions is ongoing due to an
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   235
issue known as catastrophic backtracking. 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   236
This phenomenon refers to scenarios in which the regular expression 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   237
matching or lexing engine exhibits a disproportionately long 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   238
runtime despite the simplicity of the input and expression.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   239
653
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   240
One cause of catastrophic backtracking lies within the 
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   241
ambiguities of lexing.\marginpar{rephrased "the origin of catastrophic ...} 
648
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   242
In the process of matching a multi-character string with 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   243
a regular expression that encompasses several sub-expressions, 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   244
different positions can be designated to mark 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   245
the boundaries of corresponding substrings of the sub-expressions. 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   246
For instance, in matching the string $aaa$ with the 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   247
regular expression $(a+aa)^*$, the divide between 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   248
the initial match and the subsequent iteration could either be 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   249
set between the first and second characters ($a | aa$) or between the second and third characters ($aa | a$). As both the length of the input string and the structural complexity of the regular expression increase, the number of potential delimiter combinations can grow exponentially, leading to a corresponding increase in complexity for algorithms that do not optimally navigate these possibilities.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   250
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   251
Catastrophic backtracking facilitates a category of computationally inexpensive attacks known as Regular Expression Denial of Service (ReDoS) attacks. Here, an attacker merely dispatches a small attack string to a server, provoking high-complexity behaviours in the server's regular expression engine. Such attacks, whether intentional or accidental, have led to the failure of real-world systems (additional details can be found in the practical overview section in chapter \ref{Introduction}).
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   252
653
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   253
Various disambiguation strategies are 
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   254
employed to select sub-matches, notably including Greedy and POSIX strategies. POSIX, the strategy most widely adopted in practice, invariably opts for the longest possible sub-match. Kuklewicz \cite{KuklewiczHaskell}, for instance, offers a descriptive definition of the POSIX rule in section 1, last paragraph:
601
Chengsong
parents: 600
diff changeset
   255
Chengsong
parents: 600
diff changeset
   256
648
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   257
%Regular expressions 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   258
%have been extensively studied and
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   259
%implemented since their invention in 1940s.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   260
%It is primarily used in lexing, where an unstructured input string is broken
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   261
%down into a tree of tokens.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   262
%That tree's construction is guided by the shape of the regular expression.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   263
%This is particularly useful in expressing the shape of a string
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   264
%with many fields, where certain fields might repeat or be omitted.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   265
%Regular expression matchers and Lexers allow us to 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   266
%identify and delimit different subsections of a string and potentially 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   267
%extract information from inputs, making them
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   268
%an indispensible component in modern software systems' text processing tasks
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   269
%such as compilers, IDEs, and firewalls.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   270
%Research on them is far from over due to the well-known issue called catastrophic-backtracking,
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   271
%which means the regular expression matching or lexing engine takes an unproportional time to run 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   272
%despite the input and the expression being relatively simple.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   273
%
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   274
%Catastrophic backtracking stems from the ambiguities of lexing: 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   275
%when matching a multiple-character string with a regular 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   276
%exression that includes serveral sub-expressions, there might be different positions to set 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   277
%the border between sub-expressions' corresponding sub-strings.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   278
%For example, matching the string $aaa$ against the regular expression
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   279
%$(a+aa)^*$, the border between the initial match and the second iteration
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   280
%could be between the first and second character ($a | aa$) 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   281
%or between the second and third character ($aa | a$).
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   282
%As the size of the input string and the structural complexity 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   283
%of the regular expression increase,
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   284
%the number of different combinations of delimiters can grow exponentially, and
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   285
%algorithms that explore these possibilities unwisely will also see an exponential complexity.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   286
%
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   287
%Catastrophic backtracking allow a class of computationally inexpensive attacks called
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   288
%Regular expression Denial of Service attacks (ReDoS), in which the hacker 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   289
%simply sends out a small attack string to a server, 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   290
%triggering high-complexity behaviours in its regular expression engine. 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   291
%These attacks, be it deliberate or not, have caused real-world systems to go down (see more 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   292
%details of this in the practical overview section in chapter \ref{Introduction}).
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   293
%There are different disambiguation strategies to select sub-matches, most notably Greedy and POSIX.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   294
%The widely adopted strategy in practice is POSIX, which always go for the longest sub-match possible.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   295
%There have been prose definitions like the following
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   296
%by Kuklewicz \cite{KuklewiczHaskell}: 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   297
%described the POSIX rule as (section 1, last paragraph):
653
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   298
\marginpar{\em Deleted some peripheral specifications.}
646
Chengsong
parents: 638
diff changeset
   299
\begin{quote}
Chengsong
parents: 638
diff changeset
   300
	\begin{itemize}
Chengsong
parents: 638
diff changeset
   301
		\item
Chengsong
parents: 638
diff changeset
   302
regular expressions (REs) take the leftmost starting match, and the longest match starting there
Chengsong
parents: 638
diff changeset
   303
earlier subpatterns have leftmost-longest priority over later subpatterns\\
Chengsong
parents: 638
diff changeset
   304
\item
Chengsong
parents: 638
diff changeset
   305
higher-level subpatterns have leftmost-longest priority over their component subpatterns\\
Chengsong
parents: 638
diff changeset
   306
\item
653
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   307
	$\ldots$
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   308
%REs have right associative concatenation which can be changed with parenthesis\\
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   309
%\item
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   310
%parenthesized subexpressions return the match from their last usage\\
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   311
%\item
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   312
%text of component subexpressions must be contained in the text of the 
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   313
%higher-level subexpressions\\
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   314
%\item
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   315
%if "p" and "q" can never match the same text then "p|q" and "q|p" are equivalent, up to trivial renumbering of captured subexpressions\\
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   316
%\item
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   317
%if "p" in "p*" is used to capture non-empty text then additional repetitions of "p" will not capture an empty string\\
646
Chengsong
parents: 638
diff changeset
   318
\end{itemize}
Chengsong
parents: 638
diff changeset
   319
\end{quote}
649
Chengsong
parents: 648
diff changeset
   320
However, the author noted that various lexers that claim to be POSIX 
Chengsong
parents: 648
diff changeset
   321
are rarely correct according to this standard.
Chengsong
parents: 648
diff changeset
   322
There are numerous occasions where programmers realised the subtlety and
Chengsong
parents: 648
diff changeset
   323
difficulty to implement correctly, one such quote from Go's regexp library author 
Chengsong
parents: 648
diff changeset
   324
\footnote{\url{https://pkg.go.dev/regexp\#pkg-overview}}
Chengsong
parents: 648
diff changeset
   325
\begin{quote}\it
Chengsong
parents: 648
diff changeset
   326
``
Chengsong
parents: 648
diff changeset
   327
The POSIX rule is computationally prohibitive
Chengsong
parents: 648
diff changeset
   328
and not even well-defined.
Chengsong
parents: 648
diff changeset
   329
``
Chengsong
parents: 648
diff changeset
   330
\end{quote}
Chengsong
parents: 648
diff changeset
   331
Being able to formally define and capture the idea of 
Chengsong
parents: 648
diff changeset
   332
POSIX rules and prove 
Chengsong
parents: 648
diff changeset
   333
the correctness of regular expression matching/lexing 
Chengsong
parents: 648
diff changeset
   334
algorithms against the POSIX semantics definitions
Chengsong
parents: 648
diff changeset
   335
is valuable.
Chengsong
parents: 648
diff changeset
   336
646
Chengsong
parents: 638
diff changeset
   337
649
Chengsong
parents: 648
diff changeset
   338
Formal proofs are
Chengsong
parents: 648
diff changeset
   339
machine checked programs
Chengsong
parents: 648
diff changeset
   340
 %such as the ones written in Isabelle/HOL, is a powerful means 
Chengsong
parents: 648
diff changeset
   341
for computer scientists to be certain 
Chengsong
parents: 648
diff changeset
   342
about the correctness of their algorithms.
Chengsong
parents: 648
diff changeset
   343
This is done by 
Chengsong
parents: 648
diff changeset
   344
recursively checking that every fact in a proof script 
Chengsong
parents: 648
diff changeset
   345
is either an axiom or a fact that is derived from
Chengsong
parents: 648
diff changeset
   346
known axioms or verified facts.
Chengsong
parents: 648
diff changeset
   347
%The problem of regular expressions lexing and catastrophic backtracking are a suitable problem for such
Chengsong
parents: 648
diff changeset
   348
%methods as their implementation and complexity analysis tend to be error-prone.
Chengsong
parents: 648
diff changeset
   349
Formal proofs provides an unprecendented level of asssurance
Chengsong
parents: 648
diff changeset
   350
that an algorithm will perform as expected under all inputs.
Chengsong
parents: 648
diff changeset
   351
The software systems that help people interactively build and check
Chengsong
parents: 648
diff changeset
   352
such proofs are called theorem-provers or proof assitants.
Chengsong
parents: 648
diff changeset
   353
Many  theorem-provers have been developed, such as Mizar,
Chengsong
parents: 648
diff changeset
   354
Isabelle/HOL, HOL-Light, HOL4,
Chengsong
parents: 648
diff changeset
   355
Coq, Agda, Idris, Lean and so on.
Chengsong
parents: 648
diff changeset
   356
Isabelle/HOL is a theorem prover with a simple type theory
Chengsong
parents: 648
diff changeset
   357
and powerful automated proof generators like sledgehammer.
Chengsong
parents: 648
diff changeset
   358
We chose to use Isabelle/HOL for its powerful automation
Chengsong
parents: 648
diff changeset
   359
and ease and simplicity in expressing regular expressions and 
Chengsong
parents: 648
diff changeset
   360
regular languages.
Chengsong
parents: 648
diff changeset
   361
%Some of those employ
Chengsong
parents: 648
diff changeset
   362
%dependent types like Mizar, Coq, Agda, Lean and Idris.
Chengsong
parents: 648
diff changeset
   363
%Some support a constructivism approach, such as Coq.
646
Chengsong
parents: 638
diff changeset
   364
Chengsong
parents: 638
diff changeset
   365
649
Chengsong
parents: 648
diff changeset
   366
Formal proofs on regular expression matching and lexing
Chengsong
parents: 648
diff changeset
   367
complements the efforts in
Chengsong
parents: 648
diff changeset
   368
industry which tend to focus on overall speed
Chengsong
parents: 648
diff changeset
   369
with techniques like parallelization (FPGA paper), tackling 
Chengsong
parents: 648
diff changeset
   370
the problem of catastrophic backtracking 
Chengsong
parents: 648
diff changeset
   371
in an ad-hoc manner (cloudflare and stackexchange article).
Chengsong
parents: 648
diff changeset
   372
Chengsong
parents: 648
diff changeset
   373
There have been many interesting steps in the theorem-proving community 
Chengsong
parents: 648
diff changeset
   374
about formalising regular expressions and lexing.
Chengsong
parents: 648
diff changeset
   375
One flavour is to build from the regular expression an automaton, and determine
Chengsong
parents: 648
diff changeset
   376
acceptance in terms of the resulting 
Chengsong
parents: 648
diff changeset
   377
state after executing the input string on that automaton.
Chengsong
parents: 648
diff changeset
   378
Automata formalisations are in general harder and more cumbersome to deal
Chengsong
parents: 648
diff changeset
   379
with for theorem provers than working directly on regular expressions.
Chengsong
parents: 648
diff changeset
   380
One such example is by Nipkow \cite{Nipkow1998}.
Chengsong
parents: 648
diff changeset
   381
%They 
Chengsong
parents: 648
diff changeset
   382
%made everything recursive (for example the next state function),
Chengsong
parents: 648
diff changeset
   383
As a core idea, they
Chengsong
parents: 648
diff changeset
   384
used a list of booleans to name each state so that 
Chengsong
parents: 648
diff changeset
   385
after composing sub-automata together, renaming the states to maintain
Chengsong
parents: 648
diff changeset
   386
the distinctness of each state is recursive and simple.
Chengsong
parents: 648
diff changeset
   387
The result was the obvious lemmas incorporating  
Chengsong
parents: 648
diff changeset
   388
``a painful amount of detail'' in their formalisation.
Chengsong
parents: 648
diff changeset
   389
Sometimes the automata are represented as graphs. 
Chengsong
parents: 648
diff changeset
   390
But graphs are not inductive datatypes.
Chengsong
parents: 648
diff changeset
   391
Having to set the induction principle on the number of nodes
Chengsong
parents: 648
diff changeset
   392
in a graph makes formal reasoning non-intuitive and convoluted,
Chengsong
parents: 648
diff changeset
   393
resulting in large formalisations \cite{Lammich2012}. 
Chengsong
parents: 648
diff changeset
   394
When combining two graphs, one also needs to make sure that the nodes in
Chengsong
parents: 648
diff changeset
   395
both graphs are distinct, which almost always involve
Chengsong
parents: 648
diff changeset
   396
renaming of the nodes.
Chengsong
parents: 648
diff changeset
   397
A theorem-prover which provides dependent types such as Coq 
Chengsong
parents: 648
diff changeset
   398
can alleviate the issue of representing graph nodes
Chengsong
parents: 648
diff changeset
   399
\cite{Doczkal2013}. There the representation of nodes is made
Chengsong
parents: 648
diff changeset
   400
easier by the use of $\textit{FinType}$.
Chengsong
parents: 648
diff changeset
   401
Another representation for automata are matrices. 
Chengsong
parents: 648
diff changeset
   402
But the induction for them is not as straightforward either.
Chengsong
parents: 648
diff changeset
   403
There are some more clever representations, for example one by Paulson 
Chengsong
parents: 648
diff changeset
   404
using hereditarily finite sets \cite{Paulson2015}. 
Chengsong
parents: 648
diff changeset
   405
There the problem with combining graphs can be solved better. 
Chengsong
parents: 648
diff changeset
   406
%but we believe that such clever tricks are not very obvious for 
Chengsong
parents: 648
diff changeset
   407
%the John-average-Isabelle-user.
Chengsong
parents: 648
diff changeset
   408
Chengsong
parents: 648
diff changeset
   409
The approach that operates directly on regular expressions circumvents the problem of
Chengsong
parents: 648
diff changeset
   410
conversion between a regular expression and an automaton, thereby avoiding representation
Chengsong
parents: 648
diff changeset
   411
problems altogether, despite that regular expressions may be seen as a variant of a
Chengsong
parents: 648
diff changeset
   412
non-deterministic finite automaton (ref Laurikari tagged NFA paper).
Chengsong
parents: 648
diff changeset
   413
To matching a string, a sequence of algebraic transformations called 
Chengsong
parents: 648
diff changeset
   414
(Brzozowski) $\textit{derivatives}$ (ref Brzozowski) is carried out on that regular expression.
Chengsong
parents: 648
diff changeset
   415
Each derivative takes a character and a regular expression, 
Chengsong
parents: 648
diff changeset
   416
and returns a new regular expression whose language is closely related to 
Chengsong
parents: 648
diff changeset
   417
the original regular expression's language:
Chengsong
parents: 648
diff changeset
   418
strings prefixed with that input character will have their head removed
Chengsong
parents: 648
diff changeset
   419
and strings not prefixed
Chengsong
parents: 648
diff changeset
   420
with that character will be eliminated. 
Chengsong
parents: 648
diff changeset
   421
After taking derivatives with respect to all the characters the string is
Chengsong
parents: 648
diff changeset
   422
exhausted. Then an algorithm checks whether the empty string is in that final 
Chengsong
parents: 648
diff changeset
   423
regular expression's language.
Chengsong
parents: 648
diff changeset
   424
If so, a match exists and the string is in the language of the input regular expression.
Chengsong
parents: 648
diff changeset
   425
Chengsong
parents: 648
diff changeset
   426
Again this process can be seen as the simulation of an NFA running on a string,
Chengsong
parents: 648
diff changeset
   427
but the recursive nature of the datatypes and functions involved makes
Chengsong
parents: 648
diff changeset
   428
derivatives a perfect object of study for theorem provers.
Chengsong
parents: 648
diff changeset
   429
That is why there has been numerous formalisations of regular expressions
Chengsong
parents: 648
diff changeset
   430
and Brzozowski derivatives in the functional programming and 
Chengsong
parents: 648
diff changeset
   431
theorem proving community (a large list of refs to derivatives formalisation publications).
Chengsong
parents: 648
diff changeset
   432
Ribeiro and Du Bois \cite{RibeiroAgda2017} have 
Chengsong
parents: 648
diff changeset
   433
formalised the notion of bit-coded regular expressions
Chengsong
parents: 648
diff changeset
   434
and proved their relations with simple regular expressions in
Chengsong
parents: 648
diff changeset
   435
the dependently-typed proof assistant Agda.
Chengsong
parents: 648
diff changeset
   436
They also proved the soundness and completeness of a matching algorithm
Chengsong
parents: 648
diff changeset
   437
based on the bit-coded regular expressions. Their algorithm is a decision procedure
Chengsong
parents: 648
diff changeset
   438
that gives a Yes/No answer, which does not produce
Chengsong
parents: 648
diff changeset
   439
lexical values.
Chengsong
parents: 648
diff changeset
   440
%X also formalised derivatives and regular expressions, producing "parse trees".
Chengsong
parents: 648
diff changeset
   441
%(Some person who's a big name in formal methods)
Chengsong
parents: 648
diff changeset
   442
Chengsong
parents: 648
diff changeset
   443
Chengsong
parents: 648
diff changeset
   444
The variant of the problem we are looking at centers around
Chengsong
parents: 648
diff changeset
   445
an algorithm (which we call $\blexer$) developed by Sulzmann and Lu \ref{Sulzmann2014}.
Chengsong
parents: 648
diff changeset
   446
The reason we chose to look at $\blexer$ and its simplifications 
Chengsong
parents: 648
diff changeset
   447
is because it allows a lexical tree to be generated
Chengsong
parents: 648
diff changeset
   448
by some elegant and subtle procedure based on Brzozowski derivatives.
Chengsong
parents: 648
diff changeset
   449
The procedures are made of recursive functions and inductive datatypes just like derivatives, 
Chengsong
parents: 648
diff changeset
   450
allowing intuitive and concise formal reasoning with theorem provers.
Chengsong
parents: 648
diff changeset
   451
Most importantly, $\blexer$ opens up a path to an optimized version
Chengsong
parents: 648
diff changeset
   452
of $\blexersimp$ possibilities to improve
Chengsong
parents: 648
diff changeset
   453
performance with simplifications that aggressively change the structure of regular expressions.
Chengsong
parents: 648
diff changeset
   454
While most derivative-based methods
Chengsong
parents: 648
diff changeset
   455
rely on structures to be maintained to allow induction to
Chengsong
parents: 648
diff changeset
   456
go through.
Chengsong
parents: 648
diff changeset
   457
For example, Egolf et al. \ref{Verbatim} have developed a verified lexer
Chengsong
parents: 648
diff changeset
   458
with derivatives, but as soon as they started introducing
Chengsong
parents: 648
diff changeset
   459
optimizations such as memoization, they reverted to constructing
Chengsong
parents: 648
diff changeset
   460
DFAs first.
Chengsong
parents: 648
diff changeset
   461
Edelmann \ref{Edelmann2020} explored similar optimizations in his
Chengsong
parents: 648
diff changeset
   462
work on verified LL(1) parsing, with additional enhancements with data structures like
Chengsong
parents: 648
diff changeset
   463
zippers.
Chengsong
parents: 648
diff changeset
   464
Chengsong
parents: 648
diff changeset
   465
%Sulzmann and Lu have worked out an algorithm
Chengsong
parents: 648
diff changeset
   466
%that is especially suited for verification
Chengsong
parents: 648
diff changeset
   467
%which utilized the fact
Chengsong
parents: 648
diff changeset
   468
%that whenever ambiguity occurs one may duplicate a sub-expression and use
Chengsong
parents: 648
diff changeset
   469
%different copies to describe different matching choices.
Chengsong
parents: 648
diff changeset
   470
The idea behind the correctness of $\blexer$ is simple: during a derivative,
Chengsong
parents: 648
diff changeset
   471
multiple matches might be possible, where an alternative with multiple children
Chengsong
parents: 648
diff changeset
   472
each corresponding to a 
Chengsong
parents: 648
diff changeset
   473
different match is created. In the end of
Chengsong
parents: 648
diff changeset
   474
a lexing process one always picks up the leftmost alternative, which is guarnateed 
Chengsong
parents: 648
diff changeset
   475
to be a POSIX value.
Chengsong
parents: 648
diff changeset
   476
This is done by consistently keeping sub-regular expressions in an alternative
Chengsong
parents: 648
diff changeset
   477
with longer submatches
Chengsong
parents: 648
diff changeset
   478
to the left of other copies (
Chengsong
parents: 648
diff changeset
   479
Remember that POSIX values are roughly the values with the longest inital
Chengsong
parents: 648
diff changeset
   480
submatch).
Chengsong
parents: 648
diff changeset
   481
The idea behind the optimized version of $\blexer$, which we call $\blexersimp$,
Chengsong
parents: 648
diff changeset
   482
is that since we only take the leftmost copy, then all re-occurring copies can be
Chengsong
parents: 648
diff changeset
   483
eliminated without losing the POSIX property, and this can be done with
Chengsong
parents: 648
diff changeset
   484
children of alternatives at different levels by merging them together.
Chengsong
parents: 648
diff changeset
   485
Proving $\blexersimp$ requires a different
Chengsong
parents: 648
diff changeset
   486
proof strategy compared to that by Ausaf \cite{FahadThesis}.
Chengsong
parents: 648
diff changeset
   487
We invent a rewriting relation as an
Chengsong
parents: 648
diff changeset
   488
inductive predicate which captures 
Chengsong
parents: 648
diff changeset
   489
a strong enough invariance that ensures correctness,
Chengsong
parents: 648
diff changeset
   490
which commutes with the derivative operation. 
Chengsong
parents: 648
diff changeset
   491
This predicate allows a simple
Chengsong
parents: 648
diff changeset
   492
induction on the input string to go through.
Chengsong
parents: 648
diff changeset
   493
Chengsong
parents: 648
diff changeset
   494
%This idea has been repeatedly used in different variants of lexing
Chengsong
parents: 648
diff changeset
   495
%algorithms in their paper, one of which involves bit-codes. The bit-coded
Chengsong
parents: 648
diff changeset
   496
%derivative-based algorithm even allows relatively aggressive 
Chengsong
parents: 648
diff changeset
   497
%%simplification rules which cause
Chengsong
parents: 648
diff changeset
   498
%structural changes that render the induction used in the correctness
Chengsong
parents: 648
diff changeset
   499
%proofs unusable.
Chengsong
parents: 648
diff changeset
   500
%More details will be given in \ref{Bitcoded2} including the
Chengsong
parents: 648
diff changeset
   501
%new correctness proof which involves a new inductive predicate which allows 
Chengsong
parents: 648
diff changeset
   502
%rule induction to go through.
Chengsong
parents: 648
diff changeset
   503
Chengsong
parents: 648
diff changeset
   504
Chengsong
parents: 648
diff changeset
   505
Chengsong
parents: 648
diff changeset
   506
Chengsong
parents: 648
diff changeset
   507
Chengsong
parents: 648
diff changeset
   508
Chengsong
parents: 648
diff changeset
   509
Chengsong
parents: 648
diff changeset
   510
%first character is removed 
Chengsong
parents: 648
diff changeset
   511
%state of the automaton after matching that character 
Chengsong
parents: 648
diff changeset
   512
%where nodes are represented as 
Chengsong
parents: 648
diff changeset
   513
%a sub-expression (for example tagged NFA
Chengsong
parents: 648
diff changeset
   514
%Working on regular expressions 
Chengsong
parents: 648
diff changeset
   515
%Because of these problems with automata, we prefer regular expressions
Chengsong
parents: 648
diff changeset
   516
%and derivatives rather than an automata (or graph-based) approach which explicitly converts between
Chengsong
parents: 648
diff changeset
   517
%the regular expression and a different data structure.
Chengsong
parents: 648
diff changeset
   518
%
Chengsong
parents: 648
diff changeset
   519
%
Chengsong
parents: 648
diff changeset
   520
%The key idea 
646
Chengsong
parents: 638
diff changeset
   521
648
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   522
(ends)
646
Chengsong
parents: 638
diff changeset
   523
Chengsong
parents: 638
diff changeset
   524
%Regular expressions are widely used in computer science: 
Chengsong
parents: 638
diff changeset
   525
%be it in text-editors \parencite{atomEditor} with syntax highlighting and auto-completion;
Chengsong
parents: 638
diff changeset
   526
%command-line tools like $\mathit{grep}$ that facilitate easy 
Chengsong
parents: 638
diff changeset
   527
%text-processing \cite{grep}; network intrusion
Chengsong
parents: 638
diff changeset
   528
%detection systems that inspect suspicious traffic; or compiler
Chengsong
parents: 638
diff changeset
   529
%front ends.
Chengsong
parents: 638
diff changeset
   530
%Given their usefulness and ubiquity, one would assume that
Chengsong
parents: 638
diff changeset
   531
%modern regular expression matching implementations
Chengsong
parents: 638
diff changeset
   532
%are mature and fully studied.
Chengsong
parents: 638
diff changeset
   533
%Indeed, in many popular programming languages' regular expression engines, 
Chengsong
parents: 638
diff changeset
   534
%one can supply a regular expression and a string, and in most cases get
Chengsong
parents: 638
diff changeset
   535
%get the matching information in a very short time.
Chengsong
parents: 638
diff changeset
   536
%Those engines can sometimes be blindingly fast--some 
Chengsong
parents: 638
diff changeset
   537
%network intrusion detection systems
Chengsong
parents: 638
diff changeset
   538
%use regular expression engines that are able to process 
Chengsong
parents: 638
diff changeset
   539
%hundreds of megabytes or even gigabytes of data per second \parencite{Turo_ov__2020}.
Chengsong
parents: 638
diff changeset
   540
%However, those engines can sometimes also exhibit a surprising security vulnerability
Chengsong
parents: 638
diff changeset
   541
%under a certain class of inputs.
602
Chengsong
parents: 601
diff changeset
   542
%However, , this is not the case for $\mathbf{all}$ inputs.
601
Chengsong
parents: 600
diff changeset
   543
%TODO: get source for SNORT/BRO's regex matching engine/speed
Chengsong
parents: 600
diff changeset
   544
603
370fe1dde7c7 more restructuring chap1
Chengsong
parents: 602
diff changeset
   545
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   546
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   547
\section{Contribution}
664
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   548
%{\color{red} \rule{\linewidth}{0.5mm}}
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   549
%\textbf{issue with this part: way too short, not enough details of what I have done.}
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   550
 %{\color{red} \rule{\linewidth}{0.5mm}}
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   551
\marginpar{\em Gerog: No sufficient context on related work making contribution hard to digest.}
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   552
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   553
664
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   554
%In this thesis,
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   555
%we propose a solution to catastrophic
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   556
%backtracking and error-prone matchers: a formally verified
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   557
%regular expression lexing algorithm
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   558
%that is both fast
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   559
%and correct. 
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   560
%%{\color{red} \rule{\linewidth}{0.5mm}}
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   561
%\HandRight Added content:
646
Chengsong
parents: 638
diff changeset
   562
%Package \verb`pifont`: \ding{43}
Chengsong
parents: 638
diff changeset
   563
%Package \verb`utfsym`: \usym{261E} 
Chengsong
parents: 638
diff changeset
   564
%Package \verb`dingbat`: \leftpointright 
Chengsong
parents: 638
diff changeset
   565
%Package \verb`dingbat`: \rightpointright 
664
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   566
We have made mainly two contributions in this thesis: %the
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   567
%lexer we developed based on Brzozowski derivatives and 
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   568
%Sulzmanna and Lu's developments called 
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   569
proving the lexer $\blexersimp$ is both correctness and fast.
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   570
It is correct w.r.t a formalisation of POSIX lexing by Ausaf et al.\ref{AusafDyckhoffUrban2016}.
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   571
It is fast compared with un-optimised implementations like Sulzmann and Lu's orignal
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   572
development by our metric of internal data structures not growing unbounded.
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   573
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   574
Our formalisation of complexity is unique among similar works in the sense that
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   575
%is about the size of internal data structures.
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   576
to our knowledge %we don't know of a 
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   577
there is not a lexing/parsing algorithm with similar formalised correctness theorems.
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   578
Common practices involve making some empirical analysis of the complexity of the algorithm
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   579
in question (\ref{Verbatim}, \ref{Verbatimpp}), or relying 
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   580
on prior (unformalised) complexity analysis of well-known algorithms (\ref{ValiantParsing}),
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   581
making them prone to complexity bugs.
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   582
%TODO: add citation
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   583
%, for example in the Verbatim work \ref{Verbatim}
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   584
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   585
%While formalised proofs have not included 
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   586
%Issues known as "performance bugs" can
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   587
Whilst formalised complexity theorems 
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   588
have not yet appeared in other certified lexers/parsers,
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   589
%while this work is done,
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   590
they do find themselves in the broader theorem-proving literature:
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   591
\emph{time credits} have been formalised for separation logic in Coq 
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   592
\ref{atkey2010amortised}%not used in 
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   593
to characterise the runtime complexity of an algorithm,
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   594
where integer values are recorded %from the beginning of an execution
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   595
as part of the program state
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   596
and decremented in each step. 
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   597
The idea is that the total number of decrements
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   598
from the time credits during execution represents the complexity of an algorithm.
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   599
%each time a basic step is executed. 
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   600
%The way to use a time credit Time credit is an integer
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   601
%is assigned to a program prior to execution, and each step in the program consumes
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   602
%one credit.
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   603
This has been further developed to include the big-O notation
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   604
so one can prove both the functional correctness and asymptotic complexity
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   605
of higher-order imperative algorithms \ref{bigOImperative}.
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   606
%for example the complexity of
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   607
%the Quicksort algorithm 
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   608
%is $\Theta n\ln n$ 
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   609
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   610
\marginpar{more work on formalising complexity}.
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   611
\marginpar{new changes up to this point.}
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   612
646
Chengsong
parents: 638
diff changeset
   613
Chengsong
parents: 638
diff changeset
   614
In particular, the main problem we solved on top of previous work was
Chengsong
parents: 638
diff changeset
   615
coming up with a formally verified algorithm called $\blexersimp$ based
Chengsong
parents: 638
diff changeset
   616
on Brzozowski derivatives. It calculates a POSIX
Chengsong
parents: 638
diff changeset
   617
lexical value from a string and a regular expression. This algorithm was originally
Chengsong
parents: 638
diff changeset
   618
by Sulzmann and Lu \cite{Sulzmann2014}, but we made the key observation that its $\textit{nub}$
Chengsong
parents: 638
diff changeset
   619
function does not really simplify intermediate results where it needs to and improved the
Chengsong
parents: 638
diff changeset
   620
algorithm accordingly.
Chengsong
parents: 638
diff changeset
   621
We have proven our $\blexersimp$'s internal data structure does not grow beyond a constant $N_r$
Chengsong
parents: 638
diff changeset
   622
depending on the input regular expression $r$, thanks to the aggressive simplifications of $\blexersimp$:
Chengsong
parents: 638
diff changeset
   623
\begin{theorem}
Chengsong
parents: 638
diff changeset
   624
$|\blexersimp \; r \; s | \leq N_r$
Chengsong
parents: 638
diff changeset
   625
\end{theorem}
Chengsong
parents: 638
diff changeset
   626
The simplifications applied in each step of $\blexersimp$ 
Chengsong
parents: 638
diff changeset
   627
Chengsong
parents: 638
diff changeset
   628
\begin{center}
Chengsong
parents: 638
diff changeset
   629
	$\blexersimp
Chengsong
parents: 638
diff changeset
   630
	$
Chengsong
parents: 638
diff changeset
   631
\end{center}
Chengsong
parents: 638
diff changeset
   632
keeps the derivatives small, but presents a 
Chengsong
parents: 638
diff changeset
   633
challenge
Chengsong
parents: 638
diff changeset
   634
Chengsong
parents: 638
diff changeset
   635
Chengsong
parents: 638
diff changeset
   636
establishing a correctness theorem of the below form:
Chengsong
parents: 638
diff changeset
   637
%TODO: change this to "theorem to prove"
Chengsong
parents: 638
diff changeset
   638
\begin{theorem}
Chengsong
parents: 638
diff changeset
   639
	If the POSIX lexical value of matching regular expression $r$ with string $s$ is $v$, 
Chengsong
parents: 638
diff changeset
   640
	then $\blexersimp\; r \; s = \Some \; v$. Otherwise 
Chengsong
parents: 638
diff changeset
   641
	$\blexersimp \; r \; s = \None$.
Chengsong
parents: 638
diff changeset
   642
\end{theorem}
Chengsong
parents: 638
diff changeset
   643
Chengsong
parents: 638
diff changeset
   644
Chengsong
parents: 638
diff changeset
   645
Chengsong
parents: 638
diff changeset
   646
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   647
The result is %a regular expression lexing algorithm that comes with 
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   648
\begin{itemize}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   649
\item
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   650
an improved version of  Sulzmann and Lu's bit-coded algorithm using 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   651
derivatives with simplifications, 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   652
accompanied by
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   653
a proven correctness theorem according to POSIX specification 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   654
given by Ausaf et al. \cite{AusafDyckhoffUrban2016}, 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   655
\item 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   656
a complexity-related property for that algorithm saying that the 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   657
internal data structure will
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   658
remain below a finite bound,
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   659
\item
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   660
and an extension to
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   661
the bounded repetition constructs with the correctness and finiteness property
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   662
maintained.
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
   663
\end{itemize}
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
   664
\noindent
646
Chengsong
parents: 638
diff changeset
   665
{\color{red} \rule{\linewidth}{0.5mm}}
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   666
With a formal finiteness bound in place,
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   667
we can greatly reduce the attack surface of servers in terms of ReDoS attacks.
631
Chengsong
parents: 630
diff changeset
   668
The Isabelle/HOL code for our formalisation can be 
Chengsong
parents: 630
diff changeset
   669
found at
Chengsong
parents: 630
diff changeset
   670
\begin{center}
Chengsong
parents: 630
diff changeset
   671
\url{https://github.com/hellotommmy/posix}
Chengsong
parents: 630
diff changeset
   672
\end{center}
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   673
Further improvements to the algorithm with an even stronger version of 
631
Chengsong
parents: 630
diff changeset
   674
simplification can be made. We conjecture that the resulting size of derivatives
Chengsong
parents: 630
diff changeset
   675
can be bounded by a cubic bound w.r.t. the size of the regular expression.
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   676
We will give relevant code in Scala, 
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   677
but do not give a formal proof for that in Isabelle/HOL.
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   678
This is still future work.
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   679
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   680
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   681
\section{Structure of the thesis}
652
Chengsong
parents: 649
diff changeset
   682
\marginpar{\em This is a marginal note.} 
Chengsong
parents: 649
diff changeset
   683
Before talking about the formal proof of $\blexersimp$'s 
Chengsong
parents: 649
diff changeset
   684
correctness, which is the main contribution of this thesis,
Chengsong
parents: 649
diff changeset
   685
we need to introduce two formal proofs which belong
Chengsong
parents: 649
diff changeset
   686
to Ausafe et al. 
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
   687
In chapter \ref{Inj} we will introduce the concepts
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   688
and notations we 
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
   689
use for describing regular expressions and derivatives,
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   690
and the first version of Sulzmann and Lu's lexing algorithm without bitcodes (including 
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
   691
its correctness proof).
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
   692
We will give their second lexing algorithm with bitcodes in \ref{Bitcoded1}
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
   693
together with the correctness proof by Ausaf and Urban.
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
   694
Then we illustrate in chapter \ref{Bitcoded2}
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
   695
how Sulzmann and Lu's
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   696
simplifications fail to simplify correctly. We therefore introduce our own version of the
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   697
algorithm with correct simplifications and 
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   698
their correctness proof.  
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
   699
In chapter \ref{Finite} we give the second guarantee
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   700
of our bitcoded algorithm, that is a finite bound on the size of any 
631
Chengsong
parents: 630
diff changeset
   701
regular expression's derivatives. 
Chengsong
parents: 630
diff changeset
   702
We also show how one can extend the
Chengsong
parents: 630
diff changeset
   703
algorithm to include bounded repetitions. 
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
   704
In chapter \ref{Cubic} we discuss stronger simplification rules which 
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   705
improve the finite bound to a cubic bound. %and the NOT regular expression.
637
Chengsong
parents: 636
diff changeset
   706
Chapter \ref{RelatedWork} introduces relevant work for this thesis.
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   707
Chapter \ref{Future} concludes and mentions avenues of future research.
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   708
 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   709
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   710
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   711
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   712
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   713
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   714
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   715
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   716
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   717
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   718
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   719
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   720
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   721
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   722