ChengsongTanPhdThesis/Chapters/Introduction.tex
author Chengsong
Wed, 23 Aug 2023 03:02:31 +0100
changeset 668 3831621d7b14
parent 666 6da4516ea87d
permissions -rwxr-xr-x
added technical Overview section, almost done introduction
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     1
% Chapter 1
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     2
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     3
\chapter{Introduction} % Main chapter title
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     4
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     5
\label{Introduction} % For referencing the chapter elsewhere, use \ref{Chapter1} 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     6
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     7
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     8
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
     9
% Define some commands to keep the formatting separated from the content 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    10
\newcommand{\keyword}[1]{\textbf{#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    11
\newcommand{\tabhead}[1]{\textbf{#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    12
\newcommand{\code}[1]{\texttt{#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    13
\newcommand{\file}[1]{\texttt{\bfseries#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    14
\newcommand{\option}[1]{\texttt{\itshape#1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    15
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    16
%boxes
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    17
\newcommand*{\mybox}[1]{\framebox{\strut #1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    18
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    19
%\newcommand{\sflataux}[1]{\textit{sflat}\_\textit{aux} \, #1}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    20
\newcommand\sflat[1]{\llparenthesis #1 \rrparenthesis }
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    21
\newcommand{\ASEQ}[3]{\textit{ASEQ}_{#1} \, #2 \, #3}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
    22
\newcommand{\bderssimp}[2]{#1 \backslash_{bsimps} #2}
596
b306628a0eab more chap 56
Chengsong
parents: 594
diff changeset
    23
\newcommand{\rderssimp}[2]{#1 \backslash_{rsimps} #2}
564
Chengsong
parents: 558
diff changeset
    24
\def\derssimp{\textit{ders}\_\textit{simp}}
557
812e5d112f49 more changes
Chengsong
parents: 556
diff changeset
    25
\def\rders{\textit{rders}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    26
\newcommand{\bders}[2]{#1 \backslash #2}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    27
\newcommand{\bsimp}[1]{\textit{bsimp}(#1)}
591
b2d0de6aee18 more polishing integrated comments chap2
Chengsong
parents: 590
diff changeset
    28
\def\bsimps{\textit{bsimp}}
554
Chengsong
parents: 543
diff changeset
    29
\newcommand{\rsimp}[1]{\textit{rsimp}\; #1}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    30
\newcommand{\sflataux}[1]{\llparenthesis #1 \rrparenthesis'}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    31
\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    32
\newcommand{\denote}{\stackrel{\mbox{\scriptsize denote}}{=}}%
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    33
\newcommand{\ZERO}{\mbox{\bf 0}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    34
\newcommand{\ONE}{\mbox{\bf 1}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    35
\newcommand{\AALTS}[2]{\oplus {\scriptstyle #1}\, #2}
555
Chengsong
parents: 554
diff changeset
    36
\newcommand{\rdistinct}[2]{\textit{rdistinct} \;\; #1 \;\; #2}
594
Chengsong
parents: 591
diff changeset
    37
\def\rdistincts{\textit{rdistinct}}
556
Chengsong
parents: 555
diff changeset
    38
\def\rDistinct{\textit{rdistinct}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    39
\newcommand\hflat[1]{\llparenthesis  #1 \rrparenthesis_*}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    40
\newcommand\hflataux[1]{\llparenthesis #1 \rrparenthesis_*'}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    41
\newcommand\createdByStar[1]{\textit{createdByStar}(#1)}
620
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
    42
\def\cbn{\textit{createdByNtimes}}
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
    43
\def\hpa{\textit{highestPowerAux}}
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
    44
\def\hpower{\textit{highestPower}}
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
    45
\def\ntset{\textit{ntset}}
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
    46
\def\optermsimp{\textit{optermsimp}}
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
    47
\def\optermOsimp{\textit{optermOsimp}}
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
    48
\def\optermosimp{\textit{optermosimp}}
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
    49
\def\opterm{\textit{opterm}}
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
    50
\def\nString{\textit{nonemptyString}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    51
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    52
\newcommand\myequiv{\mathrel{\stackrel{\makebox[0pt]{\mbox{\normalfont\tiny equiv}}}{=}}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    53
600
fd068f39ac23 chap4 comments done
Chengsong
parents: 596
diff changeset
    54
\def\SEQ{\textit{SEQ}}
fd068f39ac23 chap4 comments done
Chengsong
parents: 596
diff changeset
    55
\def\SEQs{\textit{SEQs}}
564
Chengsong
parents: 558
diff changeset
    56
\def\case{\textit{case}}
554
Chengsong
parents: 543
diff changeset
    57
\def\sequal{\stackrel{\mbox{\scriptsize rsimp}}{=}}
Chengsong
parents: 543
diff changeset
    58
\def\rsimpalts{\textit{rsimp}_{ALTS}}
Chengsong
parents: 543
diff changeset
    59
\def\good{\textit{good}}
Chengsong
parents: 543
diff changeset
    60
\def\btrue{\textit{true}}
Chengsong
parents: 543
diff changeset
    61
\def\bfalse{\textit{false}}
542
a7344c9afbaf chapter3 finished
Chengsong
parents: 538
diff changeset
    62
\def\bnullable{\textit{bnullable}}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
    63
\def\bnullables{\textit{bnullables}}
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
    64
\def\Some{\textit{Some}}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
    65
\def\None{\textit{None}}
537
Chengsong
parents: 532
diff changeset
    66
\def\code{\textit{code}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    67
\def\decode{\textit{decode}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    68
\def\internalise{\textit{internalise}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    69
\def\lexer{\mathit{lexer}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    70
\def\mkeps{\textit{mkeps}}
557
812e5d112f49 more changes
Chengsong
parents: 556
diff changeset
    71
\newcommand{\rder}[2]{#2 \backslash_r #1}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    72
585
4969ef817d92 chap4 more
Chengsong
parents: 579
diff changeset
    73
\def\rerases{\textit{rerase}}
4969ef817d92 chap4 more
Chengsong
parents: 579
diff changeset
    74
554
Chengsong
parents: 543
diff changeset
    75
\def\nonnested{\textit{nonnested}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    76
\def\AZERO{\textit{AZERO}}
558
Chengsong
parents: 557
diff changeset
    77
\def\sizeNregex{\textit{sizeNregex}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    78
\def\AONE{\textit{AONE}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    79
\def\ACHAR{\textit{ACHAR}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    80
585
4969ef817d92 chap4 more
Chengsong
parents: 579
diff changeset
    81
\def\simpsulz{\textit{simp}_{Sulz}}
4969ef817d92 chap4 more
Chengsong
parents: 579
diff changeset
    82
557
812e5d112f49 more changes
Chengsong
parents: 556
diff changeset
    83
\def\scfrewrites{\stackrel{*}{\rightsquigarrow_{scf}}}
555
Chengsong
parents: 554
diff changeset
    84
\def\frewrite{\rightsquigarrow_f}
Chengsong
parents: 554
diff changeset
    85
\def\hrewrite{\rightsquigarrow_h}
Chengsong
parents: 554
diff changeset
    86
\def\grewrite{\rightsquigarrow_g}
Chengsong
parents: 554
diff changeset
    87
\def\frewrites{\stackrel{*}{\rightsquigarrow_f}}
Chengsong
parents: 554
diff changeset
    88
\def\hrewrites{\stackrel{*}{\rightsquigarrow_h}}
Chengsong
parents: 554
diff changeset
    89
\def\grewrites{\stackrel{*}{\rightsquigarrow_g}}
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
    90
\def\fuse{\textit{fuse}}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
    91
\def\bder{\textit{bder}}
542
a7344c9afbaf chapter3 finished
Chengsong
parents: 538
diff changeset
    92
\def\der{\textit{der}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    93
\def\POSIX{\textit{POSIX}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    94
\def\ALTS{\textit{ALTS}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    95
\def\ASTAR{\textit{ASTAR}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    96
\def\DFA{\textit{DFA}}
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
    97
\def\NFA{\textit{NFA}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
    98
\def\bmkeps{\textit{bmkeps}}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
    99
\def\bmkepss{\textit{bmkepss}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   100
\def\retrieve{\textit{retrieve}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   101
\def\blexer{\textit{blexer}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   102
\def\flex{\textit{flex}}
573
454ced557605 chapter2 finished polishing
Chengsong
parents: 567
diff changeset
   103
\def\inj{\textit{inj}}
564
Chengsong
parents: 558
diff changeset
   104
\def\Empty{\textit{Empty}}
567
28cb8089ec36 more updaates
Chengsong
parents: 564
diff changeset
   105
\def\Left{\textit{Left}}
28cb8089ec36 more updaates
Chengsong
parents: 564
diff changeset
   106
\def\Right{\textit{Right}}
573
454ced557605 chapter2 finished polishing
Chengsong
parents: 567
diff changeset
   107
\def\Stars{\textit{Stars}}
454ced557605 chapter2 finished polishing
Chengsong
parents: 567
diff changeset
   108
\def\Char{\textit{Char}}
454ced557605 chapter2 finished polishing
Chengsong
parents: 567
diff changeset
   109
\def\Seq{\textit{Seq}}
653
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   110
\def\Alt{\textit{Alt}}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   111
\def\Der{\textit{Der}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   112
\def\Ders{\textit{Ders}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   113
\def\nullable{\mathit{nullable}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   114
\def\Z{\mathit{Z}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   115
\def\S{\mathit{S}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   116
\def\rup{r^\uparrow}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   117
%\def\bderssimp{\mathit{bders}\_\mathit{simp}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   118
\def\distinctWith{\textit{distinctWith}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   119
\def\lf{\textit{lf}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   120
\def\PD{\textit{PD}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   121
\def\suffix{\textit{Suffix}}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   122
\def\distinctBy{\textit{distinctBy}}
558
Chengsong
parents: 557
diff changeset
   123
\def\starupdate{\textit{starUpdate}}
Chengsong
parents: 557
diff changeset
   124
\def\starupdates{\textit{starUpdates}}
620
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
   125
\def\nupdate{\textit{nupdate}}
ae6010c14e49 chap6 almost done
Chengsong
parents: 618
diff changeset
   126
\def\nupdates{\textit{nupdates}}
558
Chengsong
parents: 557
diff changeset
   127
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   128
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   129
\def\size{\mathit{size}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   130
\def\rexp{\mathbf{rexp}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   131
\def\simp{\mathit{simp}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   132
\def\simpALTs{\mathit{simp}\_\mathit{ALTs}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   133
\def\map{\mathit{map}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   134
\def\distinct{\mathit{distinct}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   135
\def\blexersimp{\mathit{blexer}\_\mathit{simp}}
590
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   136
\def\blexerStrong{\textit{blexerStrong}}
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   137
\def\bsimpStrong{\textit{bsimpStrong}}
591
b2d0de6aee18 more polishing integrated comments chap2
Chengsong
parents: 590
diff changeset
   138
\def\bdersStrongs{\textit{bdersStrong}}
590
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   139
\newcommand{\bdersStrong}[2]{#1 \backslash_{bsimpStrongs} #2}
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   140
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   141
\def\map{\textit{map}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   142
\def\rrexp{\textit{rrexp}}
554
Chengsong
parents: 543
diff changeset
   143
\newcommand\rnullable[1]{\textit{rnullable} \; #1 }
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   144
\newcommand\rsize[1]{\llbracket #1 \rrbracket_r}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   145
\newcommand\asize[1]{\llbracket #1 \rrbracket}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   146
\newcommand\rerase[1]{ (#1)_{\downarrow_r}}
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   147
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   148
\newcommand\ChristianComment[1]{\textcolor{blue}{#1}\\}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   149
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   150
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   151
\def\rflts{\textit{rflts}}
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   152
\def\rrewrite{\textit{rrewrite}}
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   153
\def\bsimpalts{\textit{bsimp}_{ALTS}}
596
b306628a0eab more chap 56
Chengsong
parents: 594
diff changeset
   154
\def\bsimpaseq{\textit{bsimp}_{ASEQ}}
b306628a0eab more chap 56
Chengsong
parents: 594
diff changeset
   155
\def\rsimlalts{\textit{rsimp}_{ALTs}}
b306628a0eab more chap 56
Chengsong
parents: 594
diff changeset
   156
\def\rsimpseq{\textit{rsimp}_{SEQ}}
543
b2bea5968b89 thesis_thys
Chengsong
parents: 542
diff changeset
   157
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   158
\def\erase{\textit{erase}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   159
\def\STAR{\textit{STAR}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   160
\def\flts{\textit{flts}}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   161
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   162
579
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   163
\def\zeroable{\textit{zeroable}}
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   164
\def\nub{\textit{nub}}
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   165
\def\filter{\textit{filter}}
601
Chengsong
parents: 600
diff changeset
   166
%\def\not{\textit{not}}
579
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   167
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   168
35df9cdd36ca more chap3
Chengsong
parents: 573
diff changeset
   169
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   170
\def\RZERO{\mathbf{0}_r }
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   171
\def\RONE{\mathbf{1}_r}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   172
\newcommand\RCHAR[1]{\mathbf{#1}_r}
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   173
\newcommand\RSEQ[2]{#1 \cdot #2}
558
Chengsong
parents: 557
diff changeset
   174
\newcommand\RALTS[1]{\sum #1}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   175
\newcommand\RSTAR[1]{#1^*}
558
Chengsong
parents: 557
diff changeset
   176
\newcommand\vsuf[2]{\textit{Suffix} \;#1\;#2}
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   177
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   178
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   179
590
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   180
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   181
\lstdefinestyle{myScalastyle}{
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   182
  frame=tb,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   183
  language=scala,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   184
  aboveskip=3mm,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   185
  belowskip=3mm,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   186
  showstringspaces=false,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   187
  columns=flexible,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   188
  basicstyle={\small\ttfamily},
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   189
  numbers=none,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   190
  numberstyle=\tiny\color{gray},
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   191
  keywordstyle=\color{blue},
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   192
  commentstyle=\color{dkgreen},
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   193
  stringstyle=\color{mauve},
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   194
  frame=single,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   195
  breaklines=true,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   196
  breakatwhitespace=true,
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   197
  tabsize=3,
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   198
}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   199
590
988e92a70704 more chap5 and chap6 bsimp_idem
Chengsong
parents: 585
diff changeset
   200
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   201
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   202
%This part is about regular expressions, Brzozowski derivatives,
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   203
%and a bit-coded lexing algorithm with proven correctness and time bounds.
668
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   204
% \marginpar{Totally rewritten introduction}
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   205
% Formal proofs, 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   206
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   207
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   208
%TODO: look up snort rules to use here--give readers idea of what regexes look like
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   209
668
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   210
\marginpar{rephrasing following Christian comment}
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   211
Regular expressions, since their inception in the 1950s
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   212
\cite{RegularExpressions}, 
648
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   213
have been subject to extensive study and implementation. 
653
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   214
Their primary application lies in text processing--finding
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   215
matches and identifying patterns in a string.
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   216
%It is often used to match strings that comprises of numerous fields, 
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   217
%where certain fields may recur or be omitted. 
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   218
For example, a simple regular expression that tries 
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   219
to recognise email addresses is
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   220
\marginpar{rephrased from "the regex for recognising" to "a simple regex that tries to match email"}
648
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   221
\begin{center}
668
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   222
\verb|[a-z0-9._]+@[a-z0-9.-]+\.[a-z]{2,6}|
653
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   223
%$[a-z0-9._]^+@[a-z0-9.-]^+\.[a-z]{2,6}$.
648
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   224
\end{center}
653
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   225
\marginpar{Simplified example, but the distinction between . and escaped . is correct
654
Chengsong
parents: 653
diff changeset
   226
and therefore left unchanged. Also verbatim package does not straightforwardly support superscripts so + kept as they are.}
653
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   227
%Using this, regular expression matchers and lexers are able to extract 
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   228
%the domain names by the use of \verb|[a-zA-Z0-9.-]+|. 
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   229
\marginpar{Rewrote explanation for the expression.}
668
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   230
This expression assumes all letters in the email have been converted into lower-case.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   231
The local-part is recognised by the first 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   232
bracketed expression $[a-z0-9.\_]^+$ 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   233
where the
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   234
operator ``+'' (should be viewed as a superscript) 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   235
means that it must be some non-empty string
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   236
made of alpha-numeric characters.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   237
After the ``@'' sign
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   238
is the sub-expression that recognises the domain of that email, 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   239
where $[a-z]^{\{2, 6\}}$ specifically
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   240
matches the top-level domain, such as ``org'', ``com'', ``uk'' and etc.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   241
The counters in the superscript 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   242
$2, 6$ specifies that all top-level domains
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   243
should have between two to six characters.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   244
This regular expression does not represent all possible email addresses 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   245
(e.g. those involving ``-'' cannot be recognised), but patterns
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   246
of similar shape and using roughly the same set of syntax are used
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   247
everywhere in our daily life,
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   248
% Indeed regular expressions are used everywhere 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   249
% in a computer user's daily life--for example searching for a string
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   250
% in a text document using ctrl-F.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   251
% When such regex searching 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   252
% a useful and simple way to handle email-processing when the task does
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   253
% not require absolute precision. 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   254
%The same can be said about
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   255
%applications like .
653
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   256
%Consequently, they are an indispensible components in text processing tools 
bc5571c38d1f more updates in section 4.2 and incorporating Christian comments
Chengsong
parents: 652
diff changeset
   257
%of software systems such as compilers, IDEs, and firewalls.
668
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   258
% The picture gets more blurry when we start to ask questions like 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   259
% why 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   260
%TODO: Insert corresponding bibliography
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   261
%There are many more scenarios in which regular exp% %use cases for regular expressions in computer science,
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   262
for example in compilers \cite{LEX}, networking \cite{Snort1999}, 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   263
software engineering (IDEs) \cite{Atom}
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   264
and operating systems \cite{GREP}, where the correctness
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   265
of matching can be crucially important.
601
Chengsong
parents: 600
diff changeset
   266
668
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   267
Implementations of regular expression matching out in the wild, however, 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   268
are surprisingly unreliable.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   269
%are not always reliable as %people think
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   270
%they
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   271
%should be. %Given the importance of the correct functioning
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   272
% Indeed they have been heavily relied on and
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   273
% extensively implemented and studied in modern computer systems--whenever you do
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   274
% a 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   275
%TODO: double check this is true
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   276
An example is the regular expresion $(a^*)^*b$ and the string
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   277
$aa\ldots a$. The expectation is that any regex engine should
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   278
be able to solve this (return a no match) in no time.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   279
However, if this 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   280
%regular expression and string pair 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   281
is tried out in an
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   282
regex engine like that of Java 8, the runtime would quickly grow beyond 100 seconds
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   283
with just dozens of characters 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   284
%in the string. 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   285
Such behaviour can result in Denial-of-Service (ReDoS) attacks
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   286
with minimal resources--just the pair of ``evil'' 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   287
regular expression and string.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   288
The outage of the web service provider Cloudflare \cite{Cloudflare} 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   289
in 2019 \cite{CloudflareOutage} is a recent example
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   290
where such issues caused serious negative impact.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   291
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   292
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   293
The reason why these regex matching engines get so slow
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   294
is because they use backtracking or a depth-first-search (DFS) on the 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   295
search space of possible matches. They employ backtracking algorithms to 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   296
support back-references, a mechanism allowing
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   297
expressing languages which repeating an arbitrary long string 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   298
such as 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   299
$\{ww| w\in \Sigma*\}$. Such a constructs makes matching
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   300
NP-complete, for which no known non-backtracking algorithms exist.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   301
More modern programming languages' regex engines such as
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   302
Rust and GO's do promise linear-time behaviours 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   303
with respect to input string,
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   304
but they do not support back-references, and often 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   305
impose ad-hoc restrictions on the input patterns.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   306
More importantly, these promises are purely empirical,
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   307
making them prone to future ReDoS attacks and other types of errors.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   308
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   309
The other unreliability of industry regex engines
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   310
is that they do not produce the desired output. 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   311
Kuklewicz have found out during his testing
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   312
of practical regex engines that almost all of them are incorrect with respect
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   313
to the POSIX standard, a disambiguation strategy adopted most
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   314
widely in computer science. Roughly speaking POSIX lexing means to always
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   315
go for the longest initial submatch possible, 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   316
for instance a single iteration
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   317
$aa$ is preferred over two iterations $a,a$ 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   318
when matching $(a|aa)^*$ with the string $aa$.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   319
The POSIX strategy as summarised by Kuklewicz goes
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   320
as follows:
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   321
\marginpar{\em Deleted some peripheral specifications.}
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   322
\begin{quote}
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   323
	\begin{itemize}
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   324
		\item
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   325
regular expressions (REs) take the leftmost starting match, and the longest match starting there
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   326
earlier subpatterns have leftmost-longest priority over later subpatterns\\
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   327
\item
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   328
higher-level subpatterns have leftmost-longest priority over their component subpatterns\\
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   329
\item
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   330
	$\ldots$
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   331
%REs have right associative concatenation which can be changed with parenthesis\\
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   332
%\item
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   333
%parenthesized subexpressions return the match from their last usage\\
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   334
%\item
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   335
%text of component subexpressions must be contained in the text of the 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   336
%higher-level subexpressions\\
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   337
%\item
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   338
%if "p" and "q" can never match the same text then "p|q" and "q|p" are equivalent, up to trivial renumbering of captured subexpressions\\
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   339
%\item
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   340
%if "p" in "p*" is used to capture non-empty text then additional repetitions of "p" will not capture an empty string\\
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   341
\end{itemize}
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   342
\end{quote}
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   343
The author noted that various lexers that come with POSIX operating systems
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   344
are rarely correct according to this standard.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   345
A test case that exposes the bugs 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   346
is the regular expression $(aba|ab|a)^*$ and string $ababa$.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   347
A correct match would split the string into $ab, aba$, involving
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   348
two repetitions of the Kleene star $(aba|ab|a)^*$.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   349
Most regex engines would instead return two (partial) matches
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   350
$aba$ and  $a$ \footnote{Try it out here: \url{https://regex101.com/r/c5hga5/1}, last accessed 22-Aug-2023}.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   351
There are numerous occasions where programmers realised the subtlety and
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   352
difficulty to implement POSIX correctly, one such quote from Go's regexp library author:
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   353
\footnote{\url{https://pkg.go.dev/regexp\#pkg-overview}, last accessed 22-Aug-2023}
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   354
\begin{quote}\it
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   355
``
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   356
The POSIX rule is computationally prohibitive
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   357
and not even well-defined.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   358
''
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   359
\end{quote}
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   360
% Being able to formally define and capture the idea of 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   361
% POSIX rules and prove 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   362
% the correctness of regular expression matching/lexing 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   363
% algorithms against the POSIX semantics definitions
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   364
% is valuable.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   365
These all point towards a formal treatment of 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   366
POSIX lexing to clear up inaccuracies and errors
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   367
in understanding and implementation of regex. The work presented
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   368
in this thesis uses formal proofs to ensure the correctness 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   369
and runtime properties
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   370
of POSIX regular expression lexing.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   371
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   372
Formal proofs or mechanised proofs are
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   373
computer checked programs
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   374
 %such as the ones written in Isabelle/HOL, is a powerful means 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   375
that certify the correctness of facts
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   376
with respect to a set of axioms and definitions.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   377
% This is done by 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   378
% recursively checking that every fact in a proof script 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   379
% is either an axiom or a fact that is derived from
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   380
% known axioms or verified facts.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   381
%The problem of regular expressions lexing and catastrophic backtracking are a suitable problem for such
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   382
%methods as their implementation and complexity analysis tend to be error-prone.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   383
They provide an unprecendented level of asssurance
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   384
that an algorithm will perform as expected under all inputs.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   385
We believe such proofs can help eliminate catastrophic backtracking
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   386
once and for all.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   387
The software systems that help people interactively build and check
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   388
formal proofs are called proof assitants.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   389
% Many  theorem-provers have been developed, such as Mizar,
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   390
% Isabelle/HOL, HOL-Light, HOL4,
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   391
% Coq, Agda, Idris, Lean and so on.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   392
Isabelle/HOL \cite{Isabelle} is a widely-used proof assistant with a simple type theory
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   393
and powerful automated proof generators like sledgehammer.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   394
We chose to use Isabelle/HOL for its powerful automation
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   395
and ease and simplicity in expressing regular expressions and 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   396
regular languages.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   397
%Some of those employ
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   398
%dependent types like Mizar, Coq, Agda, Lean and Idris.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   399
%Some support a constructivism approach, such as Coq.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   400
648
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   401
668
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   402
% Formal proofs on regular expression matching and lexing
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   403
% complements the efforts in
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   404
% industry which tend to focus on overall speed
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   405
% with techniques like parallelization (FPGA paper), tackling 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   406
% the problem of catastrophic backtracking 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   407
% in an ad-hoc manner (cloudflare and stackexchange article).
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   408
The algorithm we work on is based on Brzozowski derivatives.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   409
Brzozowski invented the notion of ``derivatives'' on 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   410
regular expressions \cite{Brzozowski1964}, and the idea is
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   411
that we can reason about what regular expressions can match
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   412
by taking derivatives of them. 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   413
A derivative operation takes a regular expression $r$ and a character
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   414
$c$,
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   415
and returns a new regular expression $r\backslash c$.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   416
The derivative is taken with respect to $c$:
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   417
it transforms $r$ in such a way that the resulting derivative
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   418
$r\backslash c$ contains all strings in the language of $r$ that
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   419
starts with $c$, but now with the head character $c$ thrown away.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   420
For example, for $r$ equal to $(aba | ab | a)^*$
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   421
as discussed earlier, its derivative with respect to character $a$ is
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   422
\[
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   423
  r\backslash a = (ba | b| \ONE)(aba | ab | a)^*.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   424
\]
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   425
% By applying derivatives repeatedly one can c
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   426
Taking derivatives repeatedly with respect to the sequence
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   427
of characters in the string $s$, one obtain a regular expression
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   428
containing the information of how $r$ matched $s$.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   429
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   430
Brzozowski derivatives are purely algebraic operations
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   431
that can be implemented as a recursive function
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   432
that does a pattern match on the structure of the regular expression.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   433
For example, the derivatives of character regular expressions,
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   434
Kleene star and bounded repetitions can be described by the following
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   435
code equations:
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   436
% For example some clauses
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   437
\begin{center}
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   438
	\begin{tabular}{lcl}
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   439
    $d \backslash c$     & $\dn$ & 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   440
		$\mathit{if} \;c = d\;\mathit{then}\;\ONE\;\mathit{else}\;\ZERO$\\
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   441
% $(r_1 + r_2)\backslash c$     & $\dn$ & $r_1 \backslash c \,+\, r_2 \backslash c$\\
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   442
% $(r_1 \cdot r_2)\backslash c$ & $\dn$ & $\mathit{if} \, [] \in L(r_1)$\\
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   443
% 	&   & $\mathit{then}\;(r_1\backslash c) \cdot r_2 \,+\, r_2\backslash c$\\
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   444
% 	&   & $\mathit{else}\;(r_1\backslash c) \cdot r_2$\\
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   445
	$(r^*)\backslash c$           & $\dn$ & $(r\backslash c) \cdot r^*$\\
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   446
		$r^{\{n\}} \backslash c$     & $\dn$ & $r \backslash c \cdot
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   447
		r^{\{n-1\}} (\text{when} \; n > 0)$\\
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   448
	\end{tabular}
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   449
\end{center}
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   450
(end of ready work, rest construction site)
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   451
In particular, we are working on an improvement of the work
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   452
by Ausaf et al. \cite{Ausaf} \cite{AusafDyckhoffUrban2016} 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   453
and Sulzmann and Lu \cite{Sulzmann2014}.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   454
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   455
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   456
The variant of the problem we are looking at centers around
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   457
an algorithm (which we call $\blexer$) developed by Sulzmann and Lu \ref{Sulzmann2014}.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   458
The reason we chose to look at $\blexer$ and its simplifications 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   459
is because it allows a lexical tree to be generated
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   460
by some elegant and subtle procedure based on Brzozowski derivatives.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   461
The procedures are made of recursive functions and inductive datatypes just like derivatives, 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   462
allowing intuitive and concise formal reasoning with theorem provers.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   463
Most importantly, $\blexer$ opens up a path to an optimized version
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   464
of $\blexersimp$ possibilities to improve
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   465
performance with simplifications that aggressively change the structure of regular expressions.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   466
While most derivative-based methods
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   467
rely on structures to be maintained to allow induction to
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   468
go through.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   469
For example, Egolf et al. \ref{Verbatim} have developed a verified lexer
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   470
with derivatives, but as soon as they started introducing
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   471
optimizations such as memoization, they reverted to constructing
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   472
DFAs first.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   473
Edelmann \ref{Edelmann2020} explored similar optimizations in his
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   474
work on verified LL(1) parsing, with additional enhancements with data structures like
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   475
zippers.
648
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   476
668
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   477
%Sulzmann and Lu have worked out an algorithm
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   478
%that is especially suited for verification
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   479
%which utilized the fact
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   480
%that whenever ambiguity occurs one may duplicate a sub-expression and use
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   481
%different copies to describe different matching choices.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   482
The idea behind the correctness of $\blexer$ is simple: during a derivative,
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   483
multiple matches might be possible, where an alternative with multiple children
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   484
each corresponding to a 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   485
different match is created. In the end of
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   486
a lexing process one always picks up the leftmost alternative, which is guarnateed 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   487
to be a POSIX value.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   488
This is done by consistently keeping sub-regular expressions in an alternative
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   489
with longer submatches
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   490
to the left of other copies (
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   491
Remember that POSIX values are roughly the values with the longest inital
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   492
submatch).
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   493
The idea behind the optimized version of $\blexer$, which we call $\blexersimp$,
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   494
is that since we only take the leftmost copy, then all re-occurring copies can be
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   495
eliminated without losing the POSIX property, and this can be done with
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   496
children of alternatives at different levels by merging them together.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   497
Proving $\blexersimp$ requires a different
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   498
proof strategy compared to that by Ausaf \cite{FahadThesis}.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   499
We invent a rewriting relation as an
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   500
inductive predicate which captures 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   501
a strong enough invariance that ensures correctness,
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   502
which commutes with the derivative operation. 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   503
This predicate allows a simple
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   504
induction on the input string to go through.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   505
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   506
%This idea has been repeatedly used in different variants of lexing
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   507
%algorithms in their paper, one of which involves bit-codes. The bit-coded
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   508
%derivative-based algorithm even allows relatively aggressive 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   509
%%simplification rules which cause
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   510
%structural changes that render the induction used in the correctness
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   511
%proofs unusable.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   512
%More details will be given in \ref{Bitcoded2} including the
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   513
%new correctness proof which involves a new inductive predicate which allows 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   514
%rule induction to go through.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   515
648
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   516
668
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   517
\marginpar{20230821 Addition: high-level idea in context}
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   518
To summarise, we expect modern regex matching and lexing engines
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   519
to be reliabe, fast and correct, and support rich syntax constructs
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   520
like bounded repetitions and back references.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   521
Backtracking regex engines have exhibited exponential 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   522
worst-case behaviours (catastrophic backtracking)
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   523
for employing a depth-first-search on the search tree of possible matches,
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   524
and complexity analysis about them also takes worst-case exponential 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   525
time (ref Minamide and Birmingham work).
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   526
%Expand notes: JFLEX generates DFAs, cannot support backrefs
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   527
%Expand notes: Java 8, python supports pcre, but is exponential
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   528
%Expand notes: more modern prog langs like GO, Rust claims linear, and forbids large counters and backrefs
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   529
%The work described in this thesis is part of an ongoing
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   530
%project which aims to build a formally verified lexer satisfying all these
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   531
To ensure the correctness and predictable behaviour of lexical analysis, 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   532
we propose to
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   533
build a formally verified lexer that satisfy correctness and non-backtracking
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   534
requirements in a bottom-up manner using Brzozowski derivatives.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   535
We build on the line of work by Ausaf et al.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   536
A derivative-based matching algorithm avoids backtracking, by explicitly
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   537
representing possible matches on the same level of a search tree 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   538
as regular expression terms
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   539
in a breadth-first manner.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   540
Efficiency of such algorithms rely on limiting the size
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   541
of the derivatives during the computation, for example by
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   542
eliminating redundant 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   543
terms that lead to identical matches. 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   544
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   545
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   546
The end goal would be a lexer that comes with additional formal guarantees
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   547
on computational complexity and actual speed competent with other
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   548
unverified regex engines.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   549
We will report in the next section the outcome of this project
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   550
so far and the contribution with respect to other works in
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   551
lexical analysis and theorem proving.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   552
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   553
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   554
Derivatives on regular expressions are popular in the theorem proving community
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   555
because their algebraic features are amenable to formal reasoning.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   556
As a testimony for this, there exists a sizable number of formalisations of Brzozowski derivatives and
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   557
regular expressions in different theorem proving languages.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   558
%Expand notes: verified lexers: proof-of-concept: Ruberio, Egolf, fast but no formal proof in a proof assistant: Magnus, 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   559
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   560
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   561
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   562
% The study of regular expressions is ongoing due to an
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   563
% issue known as catastrophic backtracking. 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   564
% This phenomenon refers to scenarios in which the regular expression 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   565
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   566
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   567
% One cause of catastrophic backtracking lies within the 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   568
% ambiguities of lexing.\marginpar{rephrased "the origin of catastrophic ...} 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   569
% In the process of matching a multi-character string with 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   570
% a regular expression that encompasses several sub-expressions, 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   571
% different positions can be designated to mark 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   572
% the boundaries of corresponding substrings of the sub-expressions. 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   573
% For instance, in matching the string $aaa$ with the 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   574
% regular expression $(a+aa)^*$, the divide between 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   575
% the initial match and the subsequent iteration could either be 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   576
% set between the first and second characters ($a | aa$) or 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   577
% between the second and third characters ($aa | a$). 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   578
% As both the length of the input string and the structural complexity 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   579
% of the regular expression increase, the number of potential delimiter 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   580
% combinations can grow exponentially, leading to a corresponding increase 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   581
% in complexity for algorithms that do not optimally navigate these possibilities.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   582
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   583
% Catastrophic backtracking facilitates a category of computationally inexpensive attacks known as Regular Expression Denial of Service (ReDoS) attacks. Here, an attacker merely dispatches a small attack string to a server, provoking high-complexity behaviours in the server's regular expression engine. Such attacks, whether intentional or accidental, have led to the failure of real-world systems (additional details can be found in the practical overview section in chapter \ref{Introduction}).
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   584
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   585
% Various disambiguation strategies are 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   586
% employed to select sub-matches, notably including Greedy and POSIX strategies. POSIX, the strategy most widely adopted in practice, invariably opts for the longest possible sub-match. Kuklewicz \cite{KuklewiczHaskell}, for instance, offers a descriptive definition of the POSIX rule in section 1, last paragraph:
601
Chengsong
parents: 600
diff changeset
   587
Chengsong
parents: 600
diff changeset
   588
648
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   589
%Regular expressions 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   590
%have been extensively studied and
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   591
%implemented since their invention in 1940s.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   592
%It is primarily used in lexing, where an unstructured input string is broken
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   593
%down into a tree of tokens.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   594
%That tree's construction is guided by the shape of the regular expression.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   595
%This is particularly useful in expressing the shape of a string
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   596
%with many fields, where certain fields might repeat or be omitted.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   597
%Regular expression matchers and Lexers allow us to 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   598
%identify and delimit different subsections of a string and potentially 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   599
%extract information from inputs, making them
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   600
%an indispensible component in modern software systems' text processing tasks
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   601
%such as compilers, IDEs, and firewalls.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   602
%Research on them is far from over due to the well-known issue called catastrophic-backtracking,
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   603
%which means the regular expression matching or lexing engine takes an unproportional time to run 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   604
%despite the input and the expression being relatively simple.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   605
%
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   606
%Catastrophic backtracking stems from the ambiguities of lexing: 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   607
%when matching a multiple-character string with a regular 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   608
%exression that includes serveral sub-expressions, there might be different positions to set 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   609
%the border between sub-expressions' corresponding sub-strings.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   610
%For example, matching the string $aaa$ against the regular expression
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   611
%$(a+aa)^*$, the border between the initial match and the second iteration
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   612
%could be between the first and second character ($a | aa$) 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   613
%or between the second and third character ($aa | a$).
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   614
%As the size of the input string and the structural complexity 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   615
%of the regular expression increase,
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   616
%the number of different combinations of delimiters can grow exponentially, and
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   617
%algorithms that explore these possibilities unwisely will also see an exponential complexity.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   618
%
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   619
%Catastrophic backtracking allow a class of computationally inexpensive attacks called
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   620
%Regular expression Denial of Service attacks (ReDoS), in which the hacker 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   621
%simply sends out a small attack string to a server, 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   622
%triggering high-complexity behaviours in its regular expression engine. 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   623
%These attacks, be it deliberate or not, have caused real-world systems to go down (see more 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   624
%details of this in the practical overview section in chapter \ref{Introduction}).
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   625
%There are different disambiguation strategies to select sub-matches, most notably Greedy and POSIX.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   626
%The widely adopted strategy in practice is POSIX, which always go for the longest sub-match possible.
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   627
%There have been prose definitions like the following
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   628
%by Kuklewicz \cite{KuklewiczHaskell}: 
d15a0b7d6d90 new amend
Chengsong
parents: 646
diff changeset
   629
%described the POSIX rule as (section 1, last paragraph):
668
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   630
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   631
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   632
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   633
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   634
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   635
%first character is removed 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   636
%state of the automaton after matching that character 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   637
%where nodes are represented as 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   638
%a sub-expression (for example tagged NFA
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   639
%Working on regular expressions 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   640
%Because of these problems with automata, we prefer regular expressions
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   641
%and derivatives rather than an automata (or graph-based) approach which explicitly converts between
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   642
%the regular expression and a different data structure.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   643
%
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   644
%
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   645
%The key idea 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   646
649
Chengsong
parents: 648
diff changeset
   647
646
Chengsong
parents: 638
diff changeset
   648
668
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   649
%Regular expressions are widely used in computer science: 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   650
%be it in text-editors \parencite{atomEditor} with syntax highlighting and auto-completion;
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   651
%command-line tools like $\mathit{grep}$ that facilitate easy 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   652
%text-processing \cite{grep}; network intrusion
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   653
%detection systems that inspect suspicious traffic; or compiler
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   654
%front ends.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   655
%Given their usefulness and ubiquity, one would assume that
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   656
%modern regular expression matching implementations
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   657
%are mature and fully studied.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   658
%Indeed, in many popular programming languages' regular expression engines, 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   659
%one can supply a regular expression and a string, and in most cases get
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   660
%get the matching information in a very short time.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   661
%Those engines can sometimes be blindingly fast--some 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   662
%network intrusion detection systems
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   663
%use regular expression engines that are able to process 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   664
%hundreds of megabytes or even gigabytes of data per second \parencite{Turo_ov__2020}.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   665
%However, those engines can sometimes also exhibit a surprising security vulnerability
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   666
%under a certain class of inputs.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   667
%However, , this is not the case for $\mathbf{all}$ inputs.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   668
%TODO: get source for SNORT/BRO's regex matching engine/speed
646
Chengsong
parents: 638
diff changeset
   669
Chengsong
parents: 638
diff changeset
   670
668
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   671
%----------------------------------------------------------------------------------------
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   672
\section{Contribution}
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   673
%{\color{red} \rule{\linewidth}{0.5mm}}
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   674
%\textbf{issue with this part: way too short, not enough details of what I have done.}
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   675
 %{\color{red} \rule{\linewidth}{0.5mm}}
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   676
\marginpar{\em Gerog: No sufficient context on related work making contribution hard to digest.}
649
Chengsong
parents: 648
diff changeset
   677
668
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   678
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   679
\marginpar{introducing formalisations on regex matching}
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   680
There have been many formalisations in the theorem-proving community 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   681
about regular expressions and lexing.
649
Chengsong
parents: 648
diff changeset
   682
One flavour is to build from the regular expression an automaton, and determine
Chengsong
parents: 648
diff changeset
   683
acceptance in terms of the resulting 
Chengsong
parents: 648
diff changeset
   684
state after executing the input string on that automaton.
Chengsong
parents: 648
diff changeset
   685
Automata formalisations are in general harder and more cumbersome to deal
Chengsong
parents: 648
diff changeset
   686
with for theorem provers than working directly on regular expressions.
Chengsong
parents: 648
diff changeset
   687
One such example is by Nipkow \cite{Nipkow1998}.
Chengsong
parents: 648
diff changeset
   688
%They 
Chengsong
parents: 648
diff changeset
   689
%made everything recursive (for example the next state function),
Chengsong
parents: 648
diff changeset
   690
As a core idea, they
Chengsong
parents: 648
diff changeset
   691
used a list of booleans to name each state so that 
Chengsong
parents: 648
diff changeset
   692
after composing sub-automata together, renaming the states to maintain
Chengsong
parents: 648
diff changeset
   693
the distinctness of each state is recursive and simple.
Chengsong
parents: 648
diff changeset
   694
The result was the obvious lemmas incorporating  
Chengsong
parents: 648
diff changeset
   695
``a painful amount of detail'' in their formalisation.
Chengsong
parents: 648
diff changeset
   696
Sometimes the automata are represented as graphs. 
Chengsong
parents: 648
diff changeset
   697
But graphs are not inductive datatypes.
Chengsong
parents: 648
diff changeset
   698
Having to set the induction principle on the number of nodes
Chengsong
parents: 648
diff changeset
   699
in a graph makes formal reasoning non-intuitive and convoluted,
Chengsong
parents: 648
diff changeset
   700
resulting in large formalisations \cite{Lammich2012}. 
Chengsong
parents: 648
diff changeset
   701
When combining two graphs, one also needs to make sure that the nodes in
Chengsong
parents: 648
diff changeset
   702
both graphs are distinct, which almost always involve
Chengsong
parents: 648
diff changeset
   703
renaming of the nodes.
Chengsong
parents: 648
diff changeset
   704
A theorem-prover which provides dependent types such as Coq 
Chengsong
parents: 648
diff changeset
   705
can alleviate the issue of representing graph nodes
Chengsong
parents: 648
diff changeset
   706
\cite{Doczkal2013}. There the representation of nodes is made
Chengsong
parents: 648
diff changeset
   707
easier by the use of $\textit{FinType}$.
Chengsong
parents: 648
diff changeset
   708
Another representation for automata are matrices. 
Chengsong
parents: 648
diff changeset
   709
But the induction for them is not as straightforward either.
Chengsong
parents: 648
diff changeset
   710
There are some more clever representations, for example one by Paulson 
Chengsong
parents: 648
diff changeset
   711
using hereditarily finite sets \cite{Paulson2015}. 
Chengsong
parents: 648
diff changeset
   712
There the problem with combining graphs can be solved better. 
Chengsong
parents: 648
diff changeset
   713
%but we believe that such clever tricks are not very obvious for 
Chengsong
parents: 648
diff changeset
   714
%the John-average-Isabelle-user.
Chengsong
parents: 648
diff changeset
   715
Chengsong
parents: 648
diff changeset
   716
The approach that operates directly on regular expressions circumvents the problem of
Chengsong
parents: 648
diff changeset
   717
conversion between a regular expression and an automaton, thereby avoiding representation
Chengsong
parents: 648
diff changeset
   718
problems altogether, despite that regular expressions may be seen as a variant of a
Chengsong
parents: 648
diff changeset
   719
non-deterministic finite automaton (ref Laurikari tagged NFA paper).
Chengsong
parents: 648
diff changeset
   720
To matching a string, a sequence of algebraic transformations called 
Chengsong
parents: 648
diff changeset
   721
(Brzozowski) $\textit{derivatives}$ (ref Brzozowski) is carried out on that regular expression.
Chengsong
parents: 648
diff changeset
   722
Each derivative takes a character and a regular expression, 
Chengsong
parents: 648
diff changeset
   723
and returns a new regular expression whose language is closely related to 
Chengsong
parents: 648
diff changeset
   724
the original regular expression's language:
Chengsong
parents: 648
diff changeset
   725
strings prefixed with that input character will have their head removed
Chengsong
parents: 648
diff changeset
   726
and strings not prefixed
Chengsong
parents: 648
diff changeset
   727
with that character will be eliminated. 
Chengsong
parents: 648
diff changeset
   728
After taking derivatives with respect to all the characters the string is
Chengsong
parents: 648
diff changeset
   729
exhausted. Then an algorithm checks whether the empty string is in that final 
Chengsong
parents: 648
diff changeset
   730
regular expression's language.
Chengsong
parents: 648
diff changeset
   731
If so, a match exists and the string is in the language of the input regular expression.
Chengsong
parents: 648
diff changeset
   732
Chengsong
parents: 648
diff changeset
   733
Again this process can be seen as the simulation of an NFA running on a string,
Chengsong
parents: 648
diff changeset
   734
but the recursive nature of the datatypes and functions involved makes
Chengsong
parents: 648
diff changeset
   735
derivatives a perfect object of study for theorem provers.
Chengsong
parents: 648
diff changeset
   736
That is why there has been numerous formalisations of regular expressions
Chengsong
parents: 648
diff changeset
   737
and Brzozowski derivatives in the functional programming and 
Chengsong
parents: 648
diff changeset
   738
theorem proving community (a large list of refs to derivatives formalisation publications).
Chengsong
parents: 648
diff changeset
   739
Ribeiro and Du Bois \cite{RibeiroAgda2017} have 
Chengsong
parents: 648
diff changeset
   740
formalised the notion of bit-coded regular expressions
Chengsong
parents: 648
diff changeset
   741
and proved their relations with simple regular expressions in
Chengsong
parents: 648
diff changeset
   742
the dependently-typed proof assistant Agda.
Chengsong
parents: 648
diff changeset
   743
They also proved the soundness and completeness of a matching algorithm
Chengsong
parents: 648
diff changeset
   744
based on the bit-coded regular expressions. Their algorithm is a decision procedure
Chengsong
parents: 648
diff changeset
   745
that gives a Yes/No answer, which does not produce
Chengsong
parents: 648
diff changeset
   746
lexical values.
Chengsong
parents: 648
diff changeset
   747
%X also formalised derivatives and regular expressions, producing "parse trees".
Chengsong
parents: 648
diff changeset
   748
%(Some person who's a big name in formal methods)
Chengsong
parents: 648
diff changeset
   749
Chengsong
parents: 648
diff changeset
   750
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   751
664
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   752
%In this thesis,
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   753
%we propose a solution to catastrophic
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   754
%backtracking and error-prone matchers: a formally verified
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   755
%regular expression lexing algorithm
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   756
%that is both fast
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   757
%and correct. 
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   758
%%{\color{red} \rule{\linewidth}{0.5mm}}
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   759
%\HandRight Added content:
646
Chengsong
parents: 638
diff changeset
   760
%Package \verb`pifont`: \ding{43}
Chengsong
parents: 638
diff changeset
   761
%Package \verb`utfsym`: \usym{261E} 
Chengsong
parents: 638
diff changeset
   762
%Package \verb`dingbat`: \leftpointright 
Chengsong
parents: 638
diff changeset
   763
%Package \verb`dingbat`: \rightpointright 
664
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   764
We have made mainly two contributions in this thesis: %the
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   765
%lexer we developed based on Brzozowski derivatives and 
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   766
%Sulzmanna and Lu's developments called 
665
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   767
proving the lexer $\blexersimp$ is both i) correctness and ii)fast.
668
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   768
It is correct w.r.t a formalisation of POSIX lexing by Ausaf et al.\cite{AusafDyckhoffUrban2016}.
664
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   769
It is fast compared with un-optimised implementations like Sulzmann and Lu's orignal
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   770
development by our metric of internal data structures not growing unbounded.
668
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   771
\subsection{Theorem Proving with Derivatives}
664
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   772
668
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   773
Ribeiro and Du Bois \cite{RibeiroAgda2017} have 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   774
formalised the notion of bit-coded regular expressions
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   775
and proved their relations with simple regular expressions in
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   776
the dependently-typed proof assistant Agda.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   777
They also proved the soundness and completeness of a matching algorithm
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   778
based on the bit-coded regular expressions.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   779
Ausaf et al. \cite{AusafDyckhoffUrban2016}
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   780
are the first to
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   781
give a quite simple formalised POSIX
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   782
specification in Isabelle/HOL, and also prove 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   783
that their specification coincides with an earlier (unformalised) 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   784
POSIX specification given by Okui and Suzuki \cite{Okui10}.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   785
They then formally proved the correctness of
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   786
a lexing algorithm by Sulzmann and Lu \cite{Sulzmann2014}
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   787
with regards to that specification.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   788
They also found that the informal POSIX
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   789
specification by Sulzmann and Lu needs to be substantially 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   790
revised in order for the correctness proof to go through.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   791
Their original specification and proof were unfixable
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   792
according to Ausaf et al.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   793
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   794
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   795
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   796
\subsection{Complexity Results}
664
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   797
Our formalisation of complexity is unique among similar works in the sense that
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   798
%is about the size of internal data structures.
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   799
to our knowledge %we don't know of a 
665
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   800
there are not other certified 
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   801
lexing/parsing algorithms with similar data structure size bound theorems.
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   802
Common practices involve making empirical analysis of the complexity of the algorithm
668
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   803
in question (\cite{Verbatim}, \cite{Verbatimpp}), or relying 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   804
on prior (unformalised) complexity analysis of well-known algorithms (\cite{ValiantParsing}),
664
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   805
making them prone to complexity bugs.
668
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   806
664
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   807
%TODO: add citation
668
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   808
%, for example in the Verbatim work \cite{Verbatim}
664
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   809
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   810
%While formalised proofs have not included 
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   811
%Issues known as "performance bugs" can
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   812
Whilst formalised complexity theorems 
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   813
have not yet appeared in other certified lexers/parsers,
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   814
%while this work is done,
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   815
they do find themselves in the broader theorem-proving literature:
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   816
\emph{time credits} have been formalised for separation logic in Coq 
668
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   817
\cite{atkey2010amortised}
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   818
%not used in 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   819
to characterise the runtime complexity of union-find.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   820
%where integer values are recorded %from the beginning of an execution
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   821
%as part of the program state
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   822
%and decremented in each step. 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   823
The idea is that the total number of instructions executed 
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   824
is equal to the time credits consumed
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   825
during the execution of an algorithm.
664
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   826
%each time a basic step is executed. 
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   827
%The way to use a time credit Time credit is an integer
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   828
%is assigned to a program prior to execution, and each step in the program consumes
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   829
%one credit.
668
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   830
Arma{\"e}l et al. \cite{bigOImperative} have extended the framework to allow expressing time
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   831
credits using big-O notations,
664
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   832
so one can prove both the functional correctness and asymptotic complexity
668
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   833
of higher-order imperative algorithms.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   834
A number of formal proofs also exist for some other
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   835
algorithms in Coq \cite{subtypingFormalComplexity}.
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   836
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   837
The big-O notation have also been formalised in Isabelle
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   838
\cite{bigOIsabelle}
664
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   839
%for example the complexity of
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   840
%the Quicksort algorithm 
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   841
%is $\Theta n\ln n$ 
665
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   842
\marginpar{more work on formalising complexity}.
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   843
%Our next step is to leverage these frameworks
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   844
%It is a precursor to our 
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   845
Our work focuses on the space complexity of the algorithm under our notion of the size of 
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   846
a regular expression.
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   847
Despite not being a direct asymptotic time complexity proof,
668
3831621d7b14 added technical Overview section, almost done introduction
Chengsong
parents: 666
diff changeset
   848
our result is an important stepping stone towards one.
666
6da4516ea87d more changes to figures & benchmarking
Chengsong
parents: 665
diff changeset
   849
664
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   850
665
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   851
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   852
Brzozowski showed that there are finitely many similar deriviatives, 
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   853
where similarity is defined in terms of ACI equations. 
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   854
This allows him to use derivatives as a basis for DFAs where each state is 
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   855
labelled with a derivative. 
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   856
However, Brzozowski did not show anything about 
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   857
the size of the derivatives.
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   858
Antimirov showed that there can only be finitely 
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   859
many partial derivatives for a regular expression and any set of 
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   860
strings. He showed that the number is actually the 
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   861
``alphabetical width'' plus 1. 
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   862
From this result one can relatively easily establish 
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   863
that the size of the partial derivatives is 
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   864
no bigger than $(\textit{size} \; r)^3$ for every string.
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   865
Unfortunately this result does not seem to carry over to our 
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   866
setting because partial derivatives have the simplification 
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   867
\begin{equation}\label{eq:headSplitRule}
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   868
	(r_1 + r_2) \cdot r_3 \rightarrow  (r_1 \cdot r_3) + (r_2 \cdot r_3)
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   869
\end{equation}
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   870
built in. We cannot have this because otherwise we would 
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   871
lose the POSIX property. For instance, the lexing result of
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   872
regular expression
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   873
\[
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   874
	(a+ab)\cdot(bc+c)
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   875
\]
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   876
with respect to string $abc$ using our lexer with the simplification rule \ref{eq:headSplitRule}
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   877
would be 
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   878
\[
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   879
	\Left (\Seq \; (\Char \; a), \Seq (\Char \; b) \; (\Char \; c)  )
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   880
\]
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   881
instead of the correct POSIX value
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   882
\[
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   883
	\Seq \; (\Right \; (\Seq \; (\Char \; a) \; (\Char \; b)) ) \; (\Char \;)
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   884
\]
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   885
Our result about the finite bound also does not say anything about the number of derivatives. 
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   886
In fact there are infinitely many derivatives in general 
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   887
because in the annotated regular expression for STAR we record the 
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   888
number of iterations. What our result shows that the size of 
3bedbdce3a3b a bit more intro, incorporating Christian chat messages
Chengsong
parents: 664
diff changeset
   889
the derivatives is bounded, not the number.
664
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   890
\marginpar{new changes up to this point.}
ba44144875b1 introduction Contribution section update
Chengsong
parents: 654
diff changeset
   891
646
Chengsong
parents: 638
diff changeset
   892
Chengsong
parents: 638
diff changeset
   893
In particular, the main problem we solved on top of previous work was
Chengsong
parents: 638
diff changeset
   894
coming up with a formally verified algorithm called $\blexersimp$ based
Chengsong
parents: 638
diff changeset
   895
on Brzozowski derivatives. It calculates a POSIX
Chengsong
parents: 638
diff changeset
   896
lexical value from a string and a regular expression. This algorithm was originally
Chengsong
parents: 638
diff changeset
   897
by Sulzmann and Lu \cite{Sulzmann2014}, but we made the key observation that its $\textit{nub}$
Chengsong
parents: 638
diff changeset
   898
function does not really simplify intermediate results where it needs to and improved the
Chengsong
parents: 638
diff changeset
   899
algorithm accordingly.
Chengsong
parents: 638
diff changeset
   900
We have proven our $\blexersimp$'s internal data structure does not grow beyond a constant $N_r$
Chengsong
parents: 638
diff changeset
   901
depending on the input regular expression $r$, thanks to the aggressive simplifications of $\blexersimp$:
Chengsong
parents: 638
diff changeset
   902
\begin{theorem}
Chengsong
parents: 638
diff changeset
   903
$|\blexersimp \; r \; s | \leq N_r$
Chengsong
parents: 638
diff changeset
   904
\end{theorem}
Chengsong
parents: 638
diff changeset
   905
The simplifications applied in each step of $\blexersimp$ 
Chengsong
parents: 638
diff changeset
   906
Chengsong
parents: 638
diff changeset
   907
\begin{center}
Chengsong
parents: 638
diff changeset
   908
	$\blexersimp
Chengsong
parents: 638
diff changeset
   909
	$
Chengsong
parents: 638
diff changeset
   910
\end{center}
Chengsong
parents: 638
diff changeset
   911
keeps the derivatives small, but presents a 
Chengsong
parents: 638
diff changeset
   912
challenge
Chengsong
parents: 638
diff changeset
   913
Chengsong
parents: 638
diff changeset
   914
Chengsong
parents: 638
diff changeset
   915
establishing a correctness theorem of the below form:
Chengsong
parents: 638
diff changeset
   916
%TODO: change this to "theorem to prove"
Chengsong
parents: 638
diff changeset
   917
\begin{theorem}
Chengsong
parents: 638
diff changeset
   918
	If the POSIX lexical value of matching regular expression $r$ with string $s$ is $v$, 
Chengsong
parents: 638
diff changeset
   919
	then $\blexersimp\; r \; s = \Some \; v$. Otherwise 
Chengsong
parents: 638
diff changeset
   920
	$\blexersimp \; r \; s = \None$.
Chengsong
parents: 638
diff changeset
   921
\end{theorem}
Chengsong
parents: 638
diff changeset
   922
Chengsong
parents: 638
diff changeset
   923
Chengsong
parents: 638
diff changeset
   924
Chengsong
parents: 638
diff changeset
   925
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   926
The result is %a regular expression lexing algorithm that comes with 
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   927
\begin{itemize}
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   928
\item
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   929
an improved version of  Sulzmann and Lu's bit-coded algorithm using 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   930
derivatives with simplifications, 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   931
accompanied by
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   932
a proven correctness theorem according to POSIX specification 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   933
given by Ausaf et al. \cite{AusafDyckhoffUrban2016}, 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   934
\item 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   935
a complexity-related property for that algorithm saying that the 
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   936
internal data structure will
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   937
remain below a finite bound,
538
8016a2480704 intro and chap2
Chengsong
parents: 537
diff changeset
   938
\item
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   939
and an extension to
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   940
the bounded repetition constructs with the correctness and finiteness property
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   941
maintained.
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
   942
\end{itemize}
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
   943
\noindent
646
Chengsong
parents: 638
diff changeset
   944
{\color{red} \rule{\linewidth}{0.5mm}}
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   945
With a formal finiteness bound in place,
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   946
we can greatly reduce the attack surface of servers in terms of ReDoS attacks.
631
Chengsong
parents: 630
diff changeset
   947
The Isabelle/HOL code for our formalisation can be 
Chengsong
parents: 630
diff changeset
   948
found at
Chengsong
parents: 630
diff changeset
   949
\begin{center}
Chengsong
parents: 630
diff changeset
   950
\url{https://github.com/hellotommmy/posix}
Chengsong
parents: 630
diff changeset
   951
\end{center}
609
61139fdddae0 chap1 totally done
Chengsong
parents: 608
diff changeset
   952
Further improvements to the algorithm with an even stronger version of 
631
Chengsong
parents: 630
diff changeset
   953
simplification can be made. We conjecture that the resulting size of derivatives
Chengsong
parents: 630
diff changeset
   954
can be bounded by a cubic bound w.r.t. the size of the regular expression.
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   955
We will give relevant code in Scala, 
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   956
but do not give a formal proof for that in Isabelle/HOL.
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   957
This is still future work.
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   958
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   959
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   960
\section{Structure of the thesis}
652
Chengsong
parents: 649
diff changeset
   961
\marginpar{\em This is a marginal note.} 
Chengsong
parents: 649
diff changeset
   962
Before talking about the formal proof of $\blexersimp$'s 
Chengsong
parents: 649
diff changeset
   963
correctness, which is the main contribution of this thesis,
Chengsong
parents: 649
diff changeset
   964
we need to introduce two formal proofs which belong
Chengsong
parents: 649
diff changeset
   965
to Ausafe et al. 
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
   966
In chapter \ref{Inj} we will introduce the concepts
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   967
and notations we 
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
   968
use for describing regular expressions and derivatives,
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   969
and the first version of Sulzmann and Lu's lexing algorithm without bitcodes (including 
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
   970
its correctness proof).
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
   971
We will give their second lexing algorithm with bitcodes in \ref{Bitcoded1}
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
   972
together with the correctness proof by Ausaf and Urban.
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
   973
Then we illustrate in chapter \ref{Bitcoded2}
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
   974
how Sulzmann and Lu's
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   975
simplifications fail to simplify correctly. We therefore introduce our own version of the
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   976
algorithm with correct simplifications and 
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   977
their correctness proof.  
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
   978
In chapter \ref{Finite} we give the second guarantee
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   979
of our bitcoded algorithm, that is a finite bound on the size of any 
631
Chengsong
parents: 630
diff changeset
   980
regular expression's derivatives. 
Chengsong
parents: 630
diff changeset
   981
We also show how one can extend the
Chengsong
parents: 630
diff changeset
   982
algorithm to include bounded repetitions. 
622
4b1149fb5aec incorporated more comments, bib
Chengsong
parents: 620
diff changeset
   983
In chapter \ref{Cubic} we discuss stronger simplification rules which 
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   984
improve the finite bound to a cubic bound. %and the NOT regular expression.
637
Chengsong
parents: 636
diff changeset
   985
Chapter \ref{RelatedWork} introduces relevant work for this thesis.
638
dd9dde2d902b comments till chap4
Chengsong
parents: 637
diff changeset
   986
Chapter \ref{Future} concludes and mentions avenues of future research.
532
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   987
 
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   988
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   989
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   990
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   991
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   992
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   993
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   994
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   995
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   996
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   997
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   998
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
   999
%----------------------------------------------------------------------------------------
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1000
cc54ce075db5 restructured
Chengsong
parents:
diff changeset
  1001