author | Chengsong |
Fri, 14 Jul 2023 00:32:41 +0100 | |
changeset 665 | 3bedbdce3a3b |
parent 664 | ba44144875b1 |
child 666 | 6da4516ea87d |
permissions | -rwxr-xr-x |
532 | 1 |
% Chapter 1 |
2 |
||
3 |
\chapter{Introduction} % Main chapter title |
|
4 |
||
5 |
\label{Introduction} % For referencing the chapter elsewhere, use \ref{Chapter1} |
|
6 |
||
7 |
%---------------------------------------------------------------------------------------- |
|
8 |
||
9 |
% Define some commands to keep the formatting separated from the content |
|
10 |
\newcommand{\keyword}[1]{\textbf{#1}} |
|
11 |
\newcommand{\tabhead}[1]{\textbf{#1}} |
|
12 |
\newcommand{\code}[1]{\texttt{#1}} |
|
13 |
\newcommand{\file}[1]{\texttt{\bfseries#1}} |
|
14 |
\newcommand{\option}[1]{\texttt{\itshape#1}} |
|
15 |
||
16 |
%boxes |
|
17 |
\newcommand*{\mybox}[1]{\framebox{\strut #1}} |
|
18 |
||
19 |
%\newcommand{\sflataux}[1]{\textit{sflat}\_\textit{aux} \, #1} |
|
20 |
\newcommand\sflat[1]{\llparenthesis #1 \rrparenthesis } |
|
21 |
\newcommand{\ASEQ}[3]{\textit{ASEQ}_{#1} \, #2 \, #3} |
|
543 | 22 |
\newcommand{\bderssimp}[2]{#1 \backslash_{bsimps} #2} |
596 | 23 |
\newcommand{\rderssimp}[2]{#1 \backslash_{rsimps} #2} |
564 | 24 |
\def\derssimp{\textit{ders}\_\textit{simp}} |
557 | 25 |
\def\rders{\textit{rders}} |
532 | 26 |
\newcommand{\bders}[2]{#1 \backslash #2} |
27 |
\newcommand{\bsimp}[1]{\textit{bsimp}(#1)} |
|
591 | 28 |
\def\bsimps{\textit{bsimp}} |
554 | 29 |
\newcommand{\rsimp}[1]{\textit{rsimp}\; #1} |
532 | 30 |
\newcommand{\sflataux}[1]{\llparenthesis #1 \rrparenthesis'} |
31 |
\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}% |
|
32 |
\newcommand{\denote}{\stackrel{\mbox{\scriptsize denote}}{=}}% |
|
33 |
\newcommand{\ZERO}{\mbox{\bf 0}} |
|
34 |
\newcommand{\ONE}{\mbox{\bf 1}} |
|
35 |
\newcommand{\AALTS}[2]{\oplus {\scriptstyle #1}\, #2} |
|
555 | 36 |
\newcommand{\rdistinct}[2]{\textit{rdistinct} \;\; #1 \;\; #2} |
594 | 37 |
\def\rdistincts{\textit{rdistinct}} |
556 | 38 |
\def\rDistinct{\textit{rdistinct}} |
532 | 39 |
\newcommand\hflat[1]{\llparenthesis #1 \rrparenthesis_*} |
40 |
\newcommand\hflataux[1]{\llparenthesis #1 \rrparenthesis_*'} |
|
41 |
\newcommand\createdByStar[1]{\textit{createdByStar}(#1)} |
|
620 | 42 |
\def\cbn{\textit{createdByNtimes}} |
43 |
\def\hpa{\textit{highestPowerAux}} |
|
44 |
\def\hpower{\textit{highestPower}} |
|
45 |
\def\ntset{\textit{ntset}} |
|
46 |
\def\optermsimp{\textit{optermsimp}} |
|
47 |
\def\optermOsimp{\textit{optermOsimp}} |
|
48 |
\def\optermosimp{\textit{optermosimp}} |
|
49 |
\def\opterm{\textit{opterm}} |
|
50 |
\def\nString{\textit{nonemptyString}} |
|
532 | 51 |
|
52 |
\newcommand\myequiv{\mathrel{\stackrel{\makebox[0pt]{\mbox{\normalfont\tiny equiv}}}{=}}} |
|
53 |
||
600 | 54 |
\def\SEQ{\textit{SEQ}} |
55 |
\def\SEQs{\textit{SEQs}} |
|
564 | 56 |
\def\case{\textit{case}} |
554 | 57 |
\def\sequal{\stackrel{\mbox{\scriptsize rsimp}}{=}} |
58 |
\def\rsimpalts{\textit{rsimp}_{ALTS}} |
|
59 |
\def\good{\textit{good}} |
|
60 |
\def\btrue{\textit{true}} |
|
61 |
\def\bfalse{\textit{false}} |
|
542 | 62 |
\def\bnullable{\textit{bnullable}} |
543 | 63 |
\def\bnullables{\textit{bnullables}} |
538 | 64 |
\def\Some{\textit{Some}} |
65 |
\def\None{\textit{None}} |
|
537 | 66 |
\def\code{\textit{code}} |
532 | 67 |
\def\decode{\textit{decode}} |
68 |
\def\internalise{\textit{internalise}} |
|
69 |
\def\lexer{\mathit{lexer}} |
|
70 |
\def\mkeps{\textit{mkeps}} |
|
557 | 71 |
\newcommand{\rder}[2]{#2 \backslash_r #1} |
532 | 72 |
|
585 | 73 |
\def\rerases{\textit{rerase}} |
74 |
||
554 | 75 |
\def\nonnested{\textit{nonnested}} |
532 | 76 |
\def\AZERO{\textit{AZERO}} |
558 | 77 |
\def\sizeNregex{\textit{sizeNregex}} |
532 | 78 |
\def\AONE{\textit{AONE}} |
79 |
\def\ACHAR{\textit{ACHAR}} |
|
80 |
||
585 | 81 |
\def\simpsulz{\textit{simp}_{Sulz}} |
82 |
||
557 | 83 |
\def\scfrewrites{\stackrel{*}{\rightsquigarrow_{scf}}} |
555 | 84 |
\def\frewrite{\rightsquigarrow_f} |
85 |
\def\hrewrite{\rightsquigarrow_h} |
|
86 |
\def\grewrite{\rightsquigarrow_g} |
|
87 |
\def\frewrites{\stackrel{*}{\rightsquigarrow_f}} |
|
88 |
\def\hrewrites{\stackrel{*}{\rightsquigarrow_h}} |
|
89 |
\def\grewrites{\stackrel{*}{\rightsquigarrow_g}} |
|
538 | 90 |
\def\fuse{\textit{fuse}} |
91 |
\def\bder{\textit{bder}} |
|
542 | 92 |
\def\der{\textit{der}} |
532 | 93 |
\def\POSIX{\textit{POSIX}} |
94 |
\def\ALTS{\textit{ALTS}} |
|
95 |
\def\ASTAR{\textit{ASTAR}} |
|
96 |
\def\DFA{\textit{DFA}} |
|
538 | 97 |
\def\NFA{\textit{NFA}} |
532 | 98 |
\def\bmkeps{\textit{bmkeps}} |
543 | 99 |
\def\bmkepss{\textit{bmkepss}} |
532 | 100 |
\def\retrieve{\textit{retrieve}} |
101 |
\def\blexer{\textit{blexer}} |
|
102 |
\def\flex{\textit{flex}} |
|
573 | 103 |
\def\inj{\textit{inj}} |
564 | 104 |
\def\Empty{\textit{Empty}} |
567 | 105 |
\def\Left{\textit{Left}} |
106 |
\def\Right{\textit{Right}} |
|
573 | 107 |
\def\Stars{\textit{Stars}} |
108 |
\def\Char{\textit{Char}} |
|
109 |
\def\Seq{\textit{Seq}} |
|
653
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
110 |
\def\Alt{\textit{Alt}} |
532 | 111 |
\def\Der{\textit{Der}} |
112 |
\def\Ders{\textit{Ders}} |
|
113 |
\def\nullable{\mathit{nullable}} |
|
114 |
\def\Z{\mathit{Z}} |
|
115 |
\def\S{\mathit{S}} |
|
116 |
\def\rup{r^\uparrow} |
|
117 |
%\def\bderssimp{\mathit{bders}\_\mathit{simp}} |
|
118 |
\def\distinctWith{\textit{distinctWith}} |
|
119 |
\def\lf{\textit{lf}} |
|
120 |
\def\PD{\textit{PD}} |
|
121 |
\def\suffix{\textit{Suffix}} |
|
543 | 122 |
\def\distinctBy{\textit{distinctBy}} |
558 | 123 |
\def\starupdate{\textit{starUpdate}} |
124 |
\def\starupdates{\textit{starUpdates}} |
|
620 | 125 |
\def\nupdate{\textit{nupdate}} |
126 |
\def\nupdates{\textit{nupdates}} |
|
558 | 127 |
|
532 | 128 |
|
129 |
\def\size{\mathit{size}} |
|
130 |
\def\rexp{\mathbf{rexp}} |
|
131 |
\def\simp{\mathit{simp}} |
|
132 |
\def\simpALTs{\mathit{simp}\_\mathit{ALTs}} |
|
133 |
\def\map{\mathit{map}} |
|
134 |
\def\distinct{\mathit{distinct}} |
|
135 |
\def\blexersimp{\mathit{blexer}\_\mathit{simp}} |
|
590 | 136 |
\def\blexerStrong{\textit{blexerStrong}} |
137 |
\def\bsimpStrong{\textit{bsimpStrong}} |
|
591 | 138 |
\def\bdersStrongs{\textit{bdersStrong}} |
590 | 139 |
\newcommand{\bdersStrong}[2]{#1 \backslash_{bsimpStrongs} #2} |
140 |
||
532 | 141 |
\def\map{\textit{map}} |
142 |
\def\rrexp{\textit{rrexp}} |
|
554 | 143 |
\newcommand\rnullable[1]{\textit{rnullable} \; #1 } |
532 | 144 |
\newcommand\rsize[1]{\llbracket #1 \rrbracket_r} |
145 |
\newcommand\asize[1]{\llbracket #1 \rrbracket} |
|
543 | 146 |
\newcommand\rerase[1]{ (#1)_{\downarrow_r}} |
147 |
||
538 | 148 |
\newcommand\ChristianComment[1]{\textcolor{blue}{#1}\\} |
532 | 149 |
|
543 | 150 |
|
151 |
\def\rflts{\textit{rflts}} |
|
152 |
\def\rrewrite{\textit{rrewrite}} |
|
153 |
\def\bsimpalts{\textit{bsimp}_{ALTS}} |
|
596 | 154 |
\def\bsimpaseq{\textit{bsimp}_{ASEQ}} |
155 |
\def\rsimlalts{\textit{rsimp}_{ALTs}} |
|
156 |
\def\rsimpseq{\textit{rsimp}_{SEQ}} |
|
543 | 157 |
|
532 | 158 |
\def\erase{\textit{erase}} |
159 |
\def\STAR{\textit{STAR}} |
|
160 |
\def\flts{\textit{flts}} |
|
161 |
||
162 |
||
579 | 163 |
\def\zeroable{\textit{zeroable}} |
164 |
\def\nub{\textit{nub}} |
|
165 |
\def\filter{\textit{filter}} |
|
601 | 166 |
%\def\not{\textit{not}} |
579 | 167 |
|
168 |
||
169 |
||
532 | 170 |
\def\RZERO{\mathbf{0}_r } |
171 |
\def\RONE{\mathbf{1}_r} |
|
172 |
\newcommand\RCHAR[1]{\mathbf{#1}_r} |
|
173 |
\newcommand\RSEQ[2]{#1 \cdot #2} |
|
558 | 174 |
\newcommand\RALTS[1]{\sum #1} |
532 | 175 |
\newcommand\RSTAR[1]{#1^*} |
558 | 176 |
\newcommand\vsuf[2]{\textit{Suffix} \;#1\;#2} |
532 | 177 |
|
538 | 178 |
|
179 |
||
590 | 180 |
|
181 |
\lstdefinestyle{myScalastyle}{ |
|
182 |
frame=tb, |
|
183 |
language=scala, |
|
184 |
aboveskip=3mm, |
|
185 |
belowskip=3mm, |
|
186 |
showstringspaces=false, |
|
187 |
columns=flexible, |
|
188 |
basicstyle={\small\ttfamily}, |
|
189 |
numbers=none, |
|
190 |
numberstyle=\tiny\color{gray}, |
|
191 |
keywordstyle=\color{blue}, |
|
192 |
commentstyle=\color{dkgreen}, |
|
193 |
stringstyle=\color{mauve}, |
|
194 |
frame=single, |
|
195 |
breaklines=true, |
|
196 |
breakatwhitespace=true, |
|
197 |
tabsize=3, |
|
538 | 198 |
} |
199 |
||
590 | 200 |
|
532 | 201 |
%---------------------------------------------------------------------------------------- |
202 |
%This part is about regular expressions, Brzozowski derivatives, |
|
203 |
%and a bit-coded lexing algorithm with proven correctness and time bounds. |
|
204 |
||
205 |
%TODO: look up snort rules to use here--give readers idea of what regexes look like |
|
206 |
||
654 | 207 |
\marginpar{rephrasing using ``imprecise words''} |
648 | 208 |
Regular expressions, since their inception in the 1940s, |
209 |
have been subject to extensive study and implementation. |
|
653
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
210 |
Their primary application lies in text processing---finding |
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
211 |
matches and identifying patterns in a string. |
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
212 |
%It is often used to match strings that comprises of numerous fields, |
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
213 |
%where certain fields may recur or be omitted. |
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
214 |
For example, a simple regular expression that tries |
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
215 |
to recognise email addresses is |
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
216 |
\marginpar{rephrased from ``the regex for recognising'' to ``a simple regex that tries to match email''} |
648 | 217 |
\begin{center} |
654 | 218 |
\verb|[a-z0-9._]^+@[a-z0-9.-]^+\.\{a-z\}\{2,6\}| |
653
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
219 |
%$[a-z0-9._]^+@[a-z0-9.-]^+\.[a-z]{2,6}$. |
648 | 220 |
\end{center} |
653
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
221 |
\marginpar{Simplified example, but the distinction between . and escaped . is correct |
654 | 222 |
and therefore left unchanged. Also verbatim package does not straightforwardly support superscripts so + kept as they are.} |
653
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
223 |
%Using this, regular expression matchers and lexers are able to extract |
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
224 |
%the domain names by the use of \verb|[a-zA-Z0-9.-]+|. |
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
225 |
\marginpar{Rewrote explanation for the expression.} |
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
226 |
The bracketed sub-expressions are used to extract specific parts of an email address. |
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
227 |
The local part is recognised by the expression enclosed in |
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
228 |
the first pair of brackets: $[a-z0-9._]$, and after the ``@'' sign |
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
229 |
is the part that recognises the domain, where $[a-z]\{2,6\}$ specifically |
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
230 |
matches the top-level domain. |
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
231 |
%Consequently, they are an indispensible components in text processing tools |
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
232 |
%of software systems such as compilers, IDEs, and firewalls. |
601 | 233 |
|
648 | 234 |
The study of regular expressions is ongoing due to an |
235 |
issue known as catastrophic backtracking. |
|
236 |
This phenomenon refers to scenarios in which the regular expression |
|
237 |
matching or lexing engine exhibits a disproportionately long |
|
238 |
runtime despite the simplicity of the input and expression. |
|
239 |
||
653
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
240 |
One cause of catastrophic backtracking lies within the |
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
241 |
ambiguities of lexing.\marginpar{rephrased ``the origin of catastrophic \ldots''} |
648 | 242 |
In the process of matching a multi-character string with |
243 |
a regular expression that encompasses several sub-expressions, |
|
244 |
different positions can be designated to mark |
|
245 |
the boundaries of corresponding substrings of the sub-expressions. |
|
246 |
For instance, in matching the string $aaa$ with the |
|
247 |
regular expression $(a+aa)^*$, the divide between |
|
248 |
the initial match and the subsequent iteration could either be |
|
249 |
set between the first and second characters ($a | aa$) or between the second and third characters ($aa | a$). As both the length of the input string and the structural complexity of the regular expression increase, the number of potential delimiter combinations can grow exponentially, leading to a corresponding increase in complexity for algorithms that do not optimally navigate these possibilities. |
|
250 |
||
251 |
Catastrophic backtracking facilitates a category of computationally inexpensive attacks known as Regular Expression Denial of Service (ReDoS) attacks. Here, an attacker merely dispatches a small attack string to a server, provoking high-complexity behaviours in the server's regular expression engine. Such attacks, whether intentional or accidental, have led to the failure of real-world systems (additional details can be found in the practical overview section in chapter \ref{Introduction}). |
|
252 |
||
653
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
253 |
Various disambiguation strategies are |
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
254 |
employed to select sub-matches, notably including Greedy and POSIX strategies. POSIX, the strategy most widely adopted in practice, invariably opts for the longest possible sub-match. Kuklewicz \cite{KuklewiczHaskell}, for instance, offers a descriptive definition of the POSIX rule in section 1, last paragraph: |
601 | 255 |
|
256 |
||
648 | 257 |
%Regular expressions |
258 |
%have been extensively studied and |
|
259 |
%implemented since their invention in 1940s. |
|
260 |
%It is primarily used in lexing, where an unstructured input string is broken |
|
261 |
%down into a tree of tokens. |
|
262 |
%That tree's construction is guided by the shape of the regular expression. |
|
263 |
%This is particularly useful in expressing the shape of a string |
|
264 |
%with many fields, where certain fields might repeat or be omitted. |
|
265 |
%Regular expression matchers and Lexers allow us to |
|
266 |
%identify and delimit different subsections of a string and potentially |
|
267 |
%extract information from inputs, making them |
|
268 |
%an indispensible component in modern software systems' text processing tasks |
|
269 |
%such as compilers, IDEs, and firewalls. |
|
270 |
%Research on them is far from over due to the well-known issue called catastrophic-backtracking, |
|
271 |
%which means the regular expression matching or lexing engine takes an unproportional time to run |
|
272 |
%despite the input and the expression being relatively simple. |
|
273 |
% |
|
274 |
%Catastrophic backtracking stems from the ambiguities of lexing: |
|
275 |
%when matching a multiple-character string with a regular |
|
276 |
%exression that includes serveral sub-expressions, there might be different positions to set |
|
277 |
%the border between sub-expressions' corresponding sub-strings. |
|
278 |
%For example, matching the string $aaa$ against the regular expression |
|
279 |
%$(a+aa)^*$, the border between the initial match and the second iteration |
|
280 |
%could be between the first and second character ($a | aa$) |
|
281 |
%or between the second and third character ($aa | a$). |
|
282 |
%As the size of the input string and the structural complexity |
|
283 |
%of the regular expression increase, |
|
284 |
%the number of different combinations of delimiters can grow exponentially, and |
|
285 |
%algorithms that explore these possibilities unwisely will also see an exponential complexity. |
|
286 |
% |
|
287 |
%Catastrophic backtracking allow a class of computationally inexpensive attacks called |
|
288 |
%Regular expression Denial of Service attacks (ReDoS), in which the hacker |
|
289 |
%simply sends out a small attack string to a server, |
|
290 |
%triggering high-complexity behaviours in its regular expression engine. |
|
291 |
%These attacks, be it deliberate or not, have caused real-world systems to go down (see more |
|
292 |
%details of this in the practical overview section in chapter \ref{Introduction}). |
|
293 |
%There are different disambiguation strategies to select sub-matches, most notably Greedy and POSIX. |
|
294 |
%The widely adopted strategy in practice is POSIX, which always go for the longest sub-match possible. |
|
295 |
%There have been prose definitions like the following |
|
296 |
%by Kuklewicz \cite{KuklewiczHaskell}: |
|
297 |
%described the POSIX rule as (section 1, last paragraph): |
|
653
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
298 |
\marginpar{\emph{Deleted some peripheral specifications.}} |
646 | 299 |
\begin{quote} |
300 |
\begin{itemize} |
|
301 |
\item |
|
302 |
regular expressions (REs) take the leftmost starting match, and the longest match starting there\\ |
|
303 |
earlier subpatterns have leftmost-longest priority over later subpatterns\\ |
|
304 |
\item |
|
305 |
higher-level subpatterns have leftmost-longest priority over their component subpatterns\\ |
|
306 |
\item |
|
653
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
307 |
$\ldots$ |
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
308 |
%REs have right associative concatenation which can be changed with parenthesis\\ |
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
309 |
%\item |
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
310 |
%parenthesized subexpressions return the match from their last usage\\ |
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
311 |
%\item |
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
312 |
%text of component subexpressions must be contained in the text of the |
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
313 |
%higher-level subexpressions\\ |
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
314 |
%\item |
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
315 |
%if "p" and "q" can never match the same text then "p|q" and "q|p" are equivalent, up to trivial renumbering of captured subexpressions\\ |
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
316 |
%\item |
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
317 |
%if "p" in "p*" is used to capture non-empty text then additional repetitions of "p" will not capture an empty string\\ |
646 | 318 |
\end{itemize} |
319 |
\end{quote} |
|
649 | 320 |
However, the author noted that various lexers that claim to be POSIX |
321 |
are rarely correct according to this standard. |
|
322 |
There are numerous occasions where programmers realised the subtlety and |
|
323 |
difficulty of implementing it correctly; one such quote is from Go's regexp library author |
|
324 |
\footnote{\url{https://pkg.go.dev/regexp\#pkg-overview}} |
|
325 |
\begin{quote}\itshape |
|
326 |
`` |
|
327 |
The POSIX rule is computationally prohibitive |
|
328 |
and not even well-defined. |
|
329 |
'' |
|
330 |
\end{quote} |
|
331 |
Being able to formally define and capture the idea of |
|
332 |
POSIX rules and prove |
|
333 |
the correctness of regular expression matching/lexing |
|
334 |
algorithms against the POSIX semantics definitions |
|
335 |
is valuable. |
|
336 |
||
646 | 337 |
|
649 | 338 |
Formal proofs are |
339 |
machine-checked programs |
|
340 |
%such as the ones written in Isabelle/HOL, is a powerful means |
|
341 |
for computer scientists to be certain |
|
342 |
about the correctness of their algorithms. |
|
343 |
This is done by |
|
344 |
recursively checking that every fact in a proof script |
|
345 |
is either an axiom or a fact that is derived from |
|
346 |
known axioms or verified facts. |
|
347 |
%The problem of regular expressions lexing and catastrophic backtracking are a suitable problem for such |
|
348 |
%methods as their implementation and complexity analysis tend to be error-prone. |
|
349 |
Formal proofs provide an unprecedented level of assurance |
|
350 |
that an algorithm will perform as expected under all inputs. |
|
351 |
The software systems that help people interactively build and check |
|
352 |
such proofs are called theorem-provers or proof assistants. |
|
353 |
Many theorem-provers have been developed, such as Mizar, |
|
354 |
Isabelle/HOL, HOL-Light, HOL4, |
|
355 |
Coq, Agda, Idris, Lean and so on. |
|
356 |
Isabelle/HOL is a theorem prover with a simple type theory |
|
357 |
and powerful automated proof generators like sledgehammer. |
|
358 |
We chose to use Isabelle/HOL for its powerful automation |
|
359 |
and ease and simplicity in expressing regular expressions and |
|
360 |
regular languages. |
|
361 |
%Some of those employ |
|
362 |
%dependent types like Mizar, Coq, Agda, Lean and Idris. |
|
363 |
%Some support a constructivism approach, such as Coq. |
|
646 | 364 |
|
365 |
||
649 | 366 |
Formal proofs on regular expression matching and lexing |
367 |
complement the efforts in |
|
368 |
industry which tend to focus on overall speed |
|
369 |
with techniques like parallelization (FPGA paper), tackling |
|
370 |
the problem of catastrophic backtracking |
|
371 |
in an ad-hoc manner (cloudflare and stackexchange article). |
|
372 |
||
373 |
There have been many interesting steps in the theorem-proving community |
|
374 |
about formalising regular expressions and lexing. |
|
375 |
One flavour is to build from the regular expression an automaton, and determine |
|
376 |
acceptance in terms of the resulting |
|
377 |
state after executing the input string on that automaton. |
|
378 |
Automata formalisations are in general harder and more cumbersome to deal |
|
379 |
with for theorem provers than working directly on regular expressions. |
|
380 |
One such example is by Nipkow \cite{Nipkow1998}. |
|
381 |
%They |
|
382 |
%made everything recursive (for example the next state function), |
|
383 |
As a core idea, they |
|
384 |
used a list of booleans to name each state so that |
|
385 |
after composing sub-automata together, renaming the states to maintain |
|
386 |
the distinctness of each state is recursive and simple. |
|
387 |
The result was the obvious lemmas incorporating |
|
388 |
``a painful amount of detail'' in their formalisation. |
|
389 |
Sometimes the automata are represented as graphs. |
|
390 |
But graphs are not inductive datatypes. |
|
391 |
Having to set the induction principle on the number of nodes |
|
392 |
in a graph makes formal reasoning non-intuitive and convoluted, |
|
393 |
resulting in large formalisations \cite{Lammich2012}. |
|
394 |
When combining two graphs, one also needs to make sure that the nodes in |
|
395 |
both graphs are distinct, which almost always involves |
|
396 |
renaming of the nodes. |
|
397 |
A theorem-prover which provides dependent types such as Coq |
|
398 |
can alleviate the issue of representing graph nodes |
|
399 |
\cite{Doczkal2013}. There the representation of nodes is made |
|
400 |
easier by the use of $\textit{FinType}$. |
|
401 |
Another representation for automata are matrices. |
|
402 |
But the induction for them is not as straightforward either. |
|
403 |
There are some more clever representations, for example one by Paulson |
|
404 |
using hereditarily finite sets \cite{Paulson2015}. |
|
405 |
There the problem with combining graphs can be solved better. |
|
406 |
%but we believe that such clever tricks are not very obvious for |
|
407 |
%the John-average-Isabelle-user. |
|
408 |
||
409 |
The approach that operates directly on regular expressions circumvents the problem of |
|
410 |
conversion between a regular expression and an automaton, thereby avoiding representation |
|
411 |
problems altogether, even though regular expressions may be seen as a variant of a |
|
412 |
non-deterministic finite automaton (ref Laurikari tagged NFA paper). |
|
413 |
To match a string, a sequence of algebraic transformations called |
|
414 |
(Brzozowski) $\textit{derivatives}$ (ref Brzozowski) is carried out on that regular expression. |
|
415 |
Each derivative takes a character and a regular expression, |
|
416 |
and returns a new regular expression whose language is closely related to |
|
417 |
the original regular expression's language: |
|
418 |
strings prefixed with that input character will have their head removed |
|
419 |
and strings not prefixed |
|
420 |
with that character will be eliminated. |
|
421 |
After taking derivatives with respect to all the characters the string is |
|
422 |
exhausted. Then an algorithm checks whether the empty string is in that final |
|
423 |
regular expression's language. |
|
424 |
If so, a match exists and the string is in the language of the input regular expression. |
|
425 |
||
426 |
Again this process can be seen as the simulation of an NFA running on a string, |
|
427 |
but the recursive nature of the datatypes and functions involved makes |
|
428 |
derivatives a perfect object of study for theorem provers. |
|
429 |
That is why there have been numerous formalisations of regular expressions |
|
430 |
and Brzozowski derivatives in the functional programming and |
|
431 |
theorem proving community (a large list of refs to derivatives formalisation publications). |
|
432 |
Ribeiro and Du Bois \cite{RibeiroAgda2017} have |
|
433 |
formalised the notion of bit-coded regular expressions |
|
434 |
and proved their relations with simple regular expressions in |
|
435 |
the dependently-typed proof assistant Agda. |
|
436 |
They also proved the soundness and completeness of a matching algorithm |
|
437 |
based on the bit-coded regular expressions. Their algorithm is a decision procedure |
|
438 |
that gives a Yes/No answer, which does not produce |
|
439 |
lexical values. |
|
440 |
%X also formalised derivatives and regular expressions, producing "parse trees". |
|
441 |
%(Some person who's a big name in formal methods) |
|
442 |
||
443 |
||
444 |
The variant of the problem we are looking at centers around |
|
445 |
an algorithm (which we call $\blexer$) developed by Sulzmann and Lu~\cite{Sulzmann2014}. |
|
446 |
The reason we chose to look at $\blexer$ and its simplifications |
|
447 |
is that it allows a lexical tree to be generated |
|
448 |
by some elegant and subtle procedure based on Brzozowski derivatives. |
|
449 |
The procedures are made of recursive functions and inductive datatypes just like derivatives, |
|
450 |
allowing intuitive and concise formal reasoning with theorem provers. |
|
451 |
Most importantly, $\blexer$ opens up a path to an optimized version |
|
452 |
which we call $\blexersimp$, with possibilities to improve |
|
453 |
performance with simplifications that aggressively change the structure of regular expressions. |
|
454 |
In contrast, most other derivative-based methods |
|
455 |
rely on structures to be maintained to allow induction to |
|
456 |
go through. |
|
457 |
For example, Egolf et al.~\cite{Verbatim} have developed a verified lexer |
|
458 |
with derivatives, but as soon as they started introducing |
|
459 |
optimizations such as memoization, they reverted to constructing |
|
460 |
DFAs first. |
|
461 |
Edelmann~\cite{Edelmann2020} explored similar optimizations in his |
|
462 |
work on verified LL(1) parsing, with additional enhancements with data structures like |
|
463 |
zippers. |
|
464 |
||
465 |
%Sulzmann and Lu have worked out an algorithm |
|
466 |
%that is especially suited for verification |
|
467 |
%which utilized the fact |
|
468 |
%that whenever ambiguity occurs one may duplicate a sub-expression and use |
|
469 |
%different copies to describe different matching choices. |
|
470 |
The idea behind the correctness of $\blexer$ is simple: during a derivative, |
|
471 |
multiple matches might be possible, where an alternative with multiple children |
|
472 |
each corresponding to a |
|
473 |
different match is created. At the end of |
|
474 |
a lexing process one always picks up the leftmost alternative, which is guaranteed |
|
475 |
to be a POSIX value. |
|
476 |
This is done by consistently keeping sub-regular expressions in an alternative |
|
477 |
with longer submatches |
|
478 |
to the left of other copies ( |
|
479 |
Remember that POSIX values are roughly the values with the longest initial |
|
480 |
submatch). |
|
481 |
The idea behind the optimized version of $\blexer$, which we call $\blexersimp$, |
|
482 |
is that since we only take the leftmost copy, then all re-occurring copies can be |
|
483 |
eliminated without losing the POSIX property, and this can be done with |
|
484 |
children of alternatives at different levels by merging them together. |
|
485 |
Proving $\blexersimp$ requires a different |
|
486 |
proof strategy compared to that by Ausaf \cite{FahadThesis}. |
|
487 |
We invent a rewriting relation as an |
|
488 |
inductive predicate which captures |
|
489 |
a strong enough invariance that ensures correctness, |
|
490 |
which commutes with the derivative operation. |
|
491 |
This predicate allows a simple |
|
492 |
induction on the input string to go through. |
|
493 |
||
494 |
%This idea has been repeatedly used in different variants of lexing |
|
495 |
%algorithms in their paper, one of which involves bit-codes. The bit-coded |
|
496 |
%derivative-based algorithm even allows relatively aggressive |
|
497 |
%%simplification rules which cause |
|
498 |
%structural changes that render the induction used in the correctness |
|
499 |
%proofs unusable. |
|
500 |
%More details will be given in \ref{Bitcoded2} including the |
|
501 |
%new correctness proof which involves a new inductive predicate which allows |
|
502 |
%rule induction to go through. |
|
503 |
||
504 |
||
505 |
||
506 |
||
507 |
||
508 |
||
509 |
||
510 |
%first character is removed |
|
511 |
%state of the automaton after matching that character |
|
512 |
%where nodes are represented as |
|
513 |
%a sub-expression (for example tagged NFA |
|
514 |
%Working on regular expressions |
|
515 |
%Because of these problems with automata, we prefer regular expressions |
|
516 |
%and derivatives rather than an automata (or graph-based) approach which explicitly converts between |
|
517 |
%the regular expression and a different data structure. |
|
518 |
% |
|
519 |
% |
|
520 |
%The key idea |
|
646 | 521 |
|
648 | 522 |
(ends) |
646 | 523 |
|
524 |
%Regular expressions are widely used in computer science: |
|
525 |
%be it in text-editors \parencite{atomEditor} with syntax highlighting and auto-completion; |
|
526 |
%command-line tools like $\mathit{grep}$ that facilitate easy |
|
527 |
%text-processing \cite{grep}; network intrusion |
|
528 |
%detection systems that inspect suspicious traffic; or compiler |
|
529 |
%front ends. |
|
530 |
%Given their usefulness and ubiquity, one would assume that |
|
531 |
%modern regular expression matching implementations |
|
532 |
%are mature and fully studied. |
|
533 |
%Indeed, in many popular programming languages' regular expression engines, |
|
534 |
%one can supply a regular expression and a string, and in most cases get |
|
535 |
%get the matching information in a very short time. |
|
536 |
%Those engines can sometimes be blindingly fast--some |
|
537 |
%network intrusion detection systems |
|
538 |
%use regular expression engines that are able to process |
|
539 |
%hundreds of megabytes or even gigabytes of data per second \parencite{Turo_ov__2020}. |
|
540 |
%However, those engines can sometimes also exhibit a surprising security vulnerability |
|
541 |
%under a certain class of inputs. |
|
602 | 542 |
%However, , this is not the case for $\mathbf{all}$ inputs. |
601 | 543 |
%TODO: get source for SNORT/BRO's regex matching engine/speed |
544 |
||
603 | 545 |
|
532 | 546 |
%---------------------------------------------------------------------------------------- |
547 |
\section{Contribution} |
|
664 | 548 |
%{\color{red} \rule{\linewidth}{0.5mm}} |
549 |
%\textbf{issue with this part: way too short, not enough details of what I have done.} |
|
550 |
%{\color{red} \rule{\linewidth}{0.5mm}} |
|
551 |
\marginpar{\em Gerog: No sufficient context on related work making contribution hard to digest.} |
|
552 |
||
532 | 553 |
|
664 | 554 |
%In this thesis, |
555 |
%we propose a solution to catastrophic |
|
556 |
%backtracking and error-prone matchers: a formally verified |
|
557 |
%regular expression lexing algorithm |
|
558 |
%that is both fast |
|
559 |
%and correct. |
|
560 |
%%{\color{red} \rule{\linewidth}{0.5mm}} |
|
561 |
%\HandRight Added content: |
|
646 | 562 |
%Package \verb`pifont`: \ding{43} |
563 |
%Package \verb`utfsym`: \usym{261E} |
|
564 |
%Package \verb`dingbat`: \leftpointright |
|
565 |
%Package \verb`dingbat`: \rightpointright |
|
664 | 566 |
We have made mainly two contributions in this thesis: %the |
567 |
%lexer we developed based on Brzozowski derivatives and |
|
568 |
%Sulzmanna and Lu's developments called |
|
665
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
569 |
proving that the lexer $\blexersimp$ is both i) correct and ii) fast. |
664 | 570 |
It is correct w.r.t.\ a formalisation of POSIX lexing by Ausaf et al.~\cite{AusafDyckhoffUrban2016}. |
571 |
It is fast compared with un-optimised implementations like Sulzmann and Lu's original |
|
572 |
development by our metric of internal data structures not growing unbounded. |
|
573 |
||
574 |
Our formalisation of complexity is unique among similar works in the sense that |
|
575 |
%is about the size of internal data structures. |
|
576 |
to our knowledge %we don't know of a |
|
665
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
577 |
there are no other certified |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
578 |
lexing/parsing algorithms with similar data structure size bound theorems. |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
579 |
Common practices involve making empirical analysis of the complexity of the algorithm |
664 | 580 |
in question \cite{Verbatim,Verbatimpp}, or relying |
581 |
on prior (unformalised) complexity analysis of well-known algorithms \cite{ValiantParsing}, |
|
582 |
making them prone to complexity bugs. |
|
583 |
%TODO: add citation |
|
584 |
%, for example in the Verbatim work \ref{Verbatim} |
|
585 |
||
586 |
%While formalised proofs have not included |
|
587 |
%Issues known as "performance bugs" can |
|
588 |
Whilst formalised complexity theorems |
|
589 |
have not yet appeared in other certified lexers/parsers, |
|
590 |
%while this work is done, |
|
591 |
they do find themselves in the broader theorem-proving literature: |
|
592 |
\emph{time credits} have been formalised for separation logic in Coq |
|
593 |
\cite{atkey2010amortised}%not used in |
|
594 |
to characterise the runtime complexity of an algorithm, |
|
595 |
where integer values are recorded %from the beginning of an execution |
|
596 |
as part of the program state |
|
597 |
and decremented in each step. |
|
598 |
The idea is that the total number of decrements |
|
599 |
from the time credits during execution represents the complexity of an algorithm. |
|
600 |
%each time a basic step is executed. |
|
601 |
%The way to use a time credit Time credit is an integer |
|
602 |
%is assigned to a program prior to execution, and each step in the program consumes |
|
603 |
%one credit. |
|
665
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
604 |
Arma{\"e}l et al. have extended the framework to allow expressing time |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
605 |
credits using big-O notations |
664 | 606 |
so one can prove both the functional correctness and asymptotic complexity |
607 |
of higher-order imperative algorithms~\cite{bigOImperative}. |
|
608 |
%for example the complexity of |
|
609 |
%the Quicksort algorithm |
|
610 |
%is $\Theta n\ln n$ |
|
665
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
611 |
\marginpar{more work on formalising complexity}. |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
612 |
%Our next step is to leverage these frameworks |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
613 |
%It is a precursor to our |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
614 |
Our work focuses on the space complexity of the algorithm under our notion of the size of |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
615 |
a regular expression. |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
616 |
Despite not being a direct asymptotic time complexity proof, |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
617 |
our result is an important stepping stone leading towards one. |
664 | 618 |
|
665
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
619 |
|
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
620 |
Brzozowski showed that there are finitely many similar derivatives, |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
621 |
where similarity is defined in terms of ACI equations. |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
622 |
This allows him to use derivatives as a basis for DFAs where each state is |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
623 |
labelled with a derivative. |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
624 |
However, Brzozowski did not show anything about |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
625 |
the size of the derivatives. |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
626 |
Antimirov showed that there can only be finitely |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
627 |
many partial derivatives for a regular expression and any set of |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
628 |
strings. He showed that the number is actually the |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
629 |
``alphabetical width'' plus 1. |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
630 |
From this result one can relatively easily establish |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
631 |
that the size of the partial derivatives is |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
632 |
no bigger than $(\textit{size} \; r)^3$ for every string. |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
633 |
Unfortunately this result does not seem to carry over to our |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
634 |
setting because partial derivatives have the simplification |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
635 |
\begin{equation}\label{eq:headSplitRule} |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
636 |
(r_1 + r_2) \cdot r_3 \rightarrow (r_1 \cdot r_3) + (r_2 \cdot r_3) |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
637 |
\end{equation} |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
638 |
built in. We cannot have this because otherwise we would |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
639 |
lose the POSIX property. For instance, the lexing result of |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
640 |
regular expression |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
641 |
\[ |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
642 |
(a+ab)\cdot(bc+c) |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
643 |
\] |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
644 |
with respect to string $abc$ using our lexer with the simplification rule \eqref{eq:headSplitRule} |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
645 |
would be |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
646 |
\[ |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
647 |
\Left (\Seq \; (\Char \; a), \Seq (\Char \; b) \; (\Char \; c) ) |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
648 |
\] |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
649 |
instead of the correct POSIX value |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
650 |
\[ |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
651 |
\Seq \; (\Right \; (\Seq \; (\Char \; a) \; (\Char \; b)) ) \; (\Char \; c) |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
652 |
\] |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
653 |
Our result about the finite bound also does not say anything about the number of derivatives. |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
654 |
In fact there are infinitely many derivatives in general |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
655 |
because in the annotated regular expression for STAR we record the |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
656 |
number of iterations. What our result shows is that the size of |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
657 |
the derivatives is bounded, not the number. |
664 | 658 |
\marginpar{new changes up to this point.} |
659 |
||
646 | 660 |
|
661 |
In particular, the main problem we solved on top of previous work was |
|
662 |
coming up with a formally verified algorithm called $\blexersimp$ based |
|
663 |
on Brzozowski derivatives. It calculates a POSIX |
|
664 |
lexical value from a string and a regular expression. This algorithm was originally developed |
|
665 |
by Sulzmann and Lu \cite{Sulzmann2014}, but we made the key observation that its $\textit{nub}$ |
|
666 |
function does not really simplify intermediate results where it needs to and improved the |
|
667 |
algorithm accordingly. |
|
668 |
We have proven our $\blexersimp$'s internal data structure does not grow beyond a constant $N_r$ |
|
669 |
depending on the input regular expression $r$, thanks to the aggressive simplifications of $\blexersimp$: |
|
670 |
\begin{theorem} |
|
671 |
$|\blexersimp \; r \; s | \leq N_r$ |
|
672 |
\end{theorem} |
|
673 |
The simplifications applied in each step of $\blexersimp$ |
|
674 |
||
675 |
\begin{center} |
|
676 |
$\blexersimp |
|
677 |
$ |
|
678 |
\end{center} |
|
679 |
keep the derivatives small, but present a |
|
680 |
challenge |
|
681 |
||
682 |
||
683 |
establishing a correctness theorem of the below form: |
|
684 |
%TODO: change this to "theorem to prove" |
|
685 |
\begin{theorem} |
|
686 |
If the POSIX lexical value of matching regular expression $r$ with string $s$ is $v$, |
|
687 |
then $\blexersimp\; r \; s = \Some \; v$. Otherwise |
|
688 |
$\blexersimp \; r \; s = \None$. |
|
689 |
\end{theorem} |
|
690 |
||
691 |
||
692 |
||
693 |
||
638 | 694 |
The result is %a regular expression lexing algorithm that comes with |
538 | 695 |
\begin{itemize} |
696 |
\item |
|
609 | 697 |
an improved version of Sulzmann and Lu's bit-coded algorithm using |
698 |
derivatives with simplifications, |
|
699 |
accompanied by |
|
700 |
a proven correctness theorem according to POSIX specification |
|
701 |
given by Ausaf et al. \cite{AusafDyckhoffUrban2016}, |
|
702 |
\item |
|
703 |
a complexity-related property for that algorithm saying that the |
|
704 |
internal data structure will |
|
638 | 705 |
remain below a finite bound, |
538 | 706 |
\item |
638 | 707 |
and an extension to |
708 |
the bounded repetition constructs with the correctness and finiteness property |
|
609 | 709 |
maintained. |
622 | 710 |
\end{itemize} |
711 |
\noindent |
|
646 | 712 |
{\color{red} \rule{\linewidth}{0.5mm}} |
609 | 713 |
With a formal finiteness bound in place, |
714 |
we can greatly reduce the attack surface of servers in terms of ReDoS attacks. |
|
631 | 715 |
The Isabelle/HOL code for our formalisation can be |
716 |
found at |
|
717 |
\begin{center} |
|
718 |
\url{https://github.com/hellotommmy/posix} |
|
719 |
\end{center} |
|
609 | 720 |
Further improvements to the algorithm with an even stronger version of |
631 | 721 |
simplification can be made. We conjecture that the resulting size of derivatives |
722 |
can be bounded by a cubic bound w.r.t. the size of the regular expression. |
|
638 | 723 |
We will give relevant code in Scala, |
724 |
but do not give a formal proof for that in Isabelle/HOL. |
|
725 |
This is still future work. |
|
532 | 726 |
|
727 |
||
728 |
\section{Structure of the thesis} |
|
652 | 729 |
\marginpar{\em This is a marginal note.} |
730 |
Before talking about the formal proof of $\blexersimp$'s |
|
731 |
correctness, which is the main contribution of this thesis, |
|
732 |
we need to introduce two formal proofs which belong |
|
733 |
to Ausaf et al. |
|
622 | 734 |
In chapter \ref{Inj} we will introduce the concepts |
532 | 735 |
and notations we |
622 | 736 |
use for describing regular expressions and derivatives, |
638 | 737 |
and the first version of Sulzmann and Lu's lexing algorithm without bitcodes (including |
622 | 738 |
its correctness proof). |
739 |
We will give their second lexing algorithm with bitcodes in \ref{Bitcoded1} |
|
740 |
together with the correctness proof by Ausaf and Urban. |
|
741 |
Then we illustrate in chapter \ref{Bitcoded2} |
|
742 |
how Sulzmann and Lu's |
|
638 | 743 |
simplifications fail to simplify correctly. We therefore introduce our own version of the |
744 |
algorithm with correct simplifications and |
|
745 |
their correctness proof. |
|
622 | 746 |
In chapter \ref{Finite} we give the second guarantee |
532 | 747 |
of our bitcoded algorithm, that is a finite bound on the size of any |
631 | 748 |
regular expression's derivatives. |
749 |
We also show how one can extend the |
|
750 |
algorithm to include bounded repetitions. |
|
622 | 751 |
In chapter \ref{Cubic} we discuss stronger simplification rules which |
638 | 752 |
improve the finite bound to a cubic bound. %and the NOT regular expression. |
637 | 753 |
Chapter \ref{RelatedWork} introduces relevant work for this thesis. |
638 | 754 |
Chapter \ref{Future} concludes and mentions avenues of future research. |
532 | 755 |
|
756 |
||
757 |
||
758 |
||
759 |
||
760 |
%---------------------------------------------------------------------------------------- |
|
761 |
||
762 |
||
763 |
%---------------------------------------------------------------------------------------- |
|
764 |
||
765 |
%---------------------------------------------------------------------------------------- |
|
766 |
||
767 |
%---------------------------------------------------------------------------------------- |
|
768 |
||
769 |