% Chapter 1

\chapter{Introduction} % Main chapter title

\label{Introduction} % For referencing the chapter elsewhere, use \ref{Chapter1}

%----------------------------------------------------------------------------------------

% Define some commands to keep the formatting separated from the content
\newcommand{\keyword}[1]{\textbf{#1}}
\newcommand{\tabhead}[1]{\textbf{#1}}
\newcommand{\code}[1]{\texttt{#1}}
\newcommand{\file}[1]{\texttt{\bfseries#1}}
\newcommand{\option}[1]{\texttt{\itshape#1}}

%boxes
\newcommand*{\mybox}[1]{\framebox{\strut #1}}

%\newcommand{\sflataux}[1]{\textit{sflat}\_\textit{aux} \, #1}
\newcommand\sflat[1]{\llparenthesis #1 \rrparenthesis }
\newcommand{\ASEQ}[3]{\textit{ASEQ}_{#1} \, #2 \, #3}
\newcommand{\bderssimp}[2]{#1 \backslash_{bsimps} #2}
\newcommand{\rderssimp}[2]{#1 \backslash_{rsimps} #2}
\def\derssimp{\textit{ders}\_\textit{simp}}
\def\rders{\textit{rders}}
\newcommand{\bders}[2]{#1 \backslash #2}
\newcommand{\bsimp}[1]{\textit{bsimp}(#1)}
\def\bsimps{\textit{bsimp}}
\newcommand{\rsimp}[1]{\textit{rsimp}\; #1}
\newcommand{\sflataux}[1]{\llparenthesis #1 \rrparenthesis'}
\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%
\newcommand{\denote}{\stackrel{\mbox{\scriptsize denote}}{=}}%
\newcommand{\ZERO}{\mbox{\bf 0}}
\newcommand{\ONE}{\mbox{\bf 1}}
\newcommand{\AALTS}[2]{\oplus {\scriptstyle #1}\, #2}
\newcommand{\rdistinct}[2]{\textit{rdistinct} \;\; #1 \;\; #2}
\def\rdistincts{\textit{rdistinct}}
\def\rDistinct{\textit{rdistinct}}
\newcommand\hflat[1]{\llparenthesis #1 \rrparenthesis_*}
\newcommand\hflataux[1]{\llparenthesis #1 \rrparenthesis_*'}
\newcommand\createdByStar[1]{\textit{createdByStar}(#1)}
\def\cbn{\textit{createdByNtimes}}
\def\hpa{\textit{highestPowerAux}}
\def\hpower{\textit{highestPower}}
\def\ntset{\textit{ntset}}
\def\optermsimp{\textit{optermsimp}}
\def\optermOsimp{\textit{optermOsimp}}
\def\optermosimp{\textit{optermosimp}}
\def\opterm{\textit{opterm}}
\def\nString{\textit{nonemptyString}}

\newcommand\myequiv{\mathrel{\stackrel{\makebox[0pt]{\mbox{\normalfont\tiny equiv}}}{=}}}

\def\SEQ{\textit{SEQ}}
\def\SEQs{\textit{SEQs}}
\def\case{\textit{case}}
\def\sequal{\stackrel{\mbox{\scriptsize rsimp}}{=}}
\def\rsimpalts{\textit{rsimp}_{ALTS}}
\def\good{\textit{good}}
\def\btrue{\textit{true}}
\def\bfalse{\textit{false}}
\def\bnullable{\textit{bnullable}}
\def\bnullables{\textit{bnullables}}
\def\Some{\textit{Some}}
\def\None{\textit{None}}
\def\code{\textit{code}}
\def\decode{\textit{decode}}
\def\internalise{\textit{internalise}}
\def\lexer{\mathit{lexer}}
\def\mkeps{\textit{mkeps}}
\newcommand{\rder}[2]{#2 \backslash_r #1}

\def\rerases{\textit{rerase}}

\def\nonnested{\textit{nonnested}}
\def\AZERO{\textit{AZERO}}
\def\sizeNregex{\textit{sizeNregex}}
\def\AONE{\textit{AONE}}
\def\ACHAR{\textit{ACHAR}}

\def\simpsulz{\textit{simp}_{Sulz}}

\def\scfrewrites{\stackrel{*}{\rightsquigarrow_{scf}}}
\def\frewrite{\rightsquigarrow_f}
\def\hrewrite{\rightsquigarrow_h}
\def\grewrite{\rightsquigarrow_g}
\def\frewrites{\stackrel{*}{\rightsquigarrow_f}}
\def\hrewrites{\stackrel{*}{\rightsquigarrow_h}}
\def\grewrites{\stackrel{*}{\rightsquigarrow_g}}
\def\fuse{\textit{fuse}}
\def\bder{\textit{bder}}
\def\der{\textit{der}}
\def\POSIX{\textit{POSIX}}
\def\ALTS{\textit{ALTS}}
\def\ASTAR{\textit{ASTAR}}
\def\DFA{\textit{DFA}}
\def\NFA{\textit{NFA}}
\def\bmkeps{\textit{bmkeps}}
\def\bmkepss{\textit{bmkepss}}
\def\retrieve{\textit{retrieve}}
\def\blexer{\textit{blexer}}
\def\flex{\textit{flex}}
\def\inj{\textit{inj}}
\def\Empty{\textit{Empty}}
\def\Left{\textit{Left}}
\def\Right{\textit{Right}}
\def\Stars{\textit{Stars}}
\def\Char{\textit{Char}}
\def\Seq{\textit{Seq}}
\def\Der{\textit{Der}}
\def\Ders{\textit{Ders}}
\def\nullable{\mathit{nullable}}
\def\Z{\mathit{Z}}
\def\S{\mathit{S}}
\def\rup{r^\uparrow}
%\def\bderssimp{\mathit{bders}\_\mathit{simp}}
\def\distinctWith{\textit{distinctWith}}
\def\lf{\textit{lf}}
\def\PD{\textit{PD}}
\def\suffix{\textit{Suffix}}
\def\distinctBy{\textit{distinctBy}}
\def\starupdate{\textit{starUpdate}}
\def\starupdates{\textit{starUpdates}}
\def\nupdate{\textit{nupdate}}
\def\nupdates{\textit{nupdates}}


\def\size{\mathit{size}}
\def\rexp{\mathbf{rexp}}
\def\simp{\mathit{simp}}
\def\simpALTs{\mathit{simp}\_\mathit{ALTs}}
\def\map{\mathit{map}}
\def\distinct{\mathit{distinct}}
\def\blexersimp{\mathit{blexer}\_\mathit{simp}}
\def\blexerStrong{\textit{blexerStrong}}
\def\bsimpStrong{\textit{bsimpStrong}}
\def\bdersStrongs{\textit{bdersStrong}}
\newcommand{\bdersStrong}[2]{#1 \backslash_{bsimpStrongs} #2}

\def\map{\textit{map}}
\def\rrexp{\textit{rrexp}}
\newcommand\rnullable[1]{\textit{rnullable} \; #1 }
\newcommand\rsize[1]{\llbracket #1 \rrbracket_r}
\newcommand\asize[1]{\llbracket #1 \rrbracket}
\newcommand\rerase[1]{ (#1)_{\downarrow_r}}

\newcommand\ChristianComment[1]{\textcolor{blue}{#1}\\}


\def\rflts{\textit{rflts}}
\def\rrewrite{\textit{rrewrite}}
\def\bsimpalts{\textit{bsimp}_{ALTS}}
\def\bsimpaseq{\textit{bsimp}_{ASEQ}}
\def\rsimlalts{\textit{rsimp}_{ALTs}}
\def\rsimpseq{\textit{rsimp}_{SEQ}}

\def\erase{\textit{erase}}
\def\STAR{\textit{STAR}}
\def\flts{\textit{flts}}


\def\zeroable{\textit{zeroable}}
\def\nub{\textit{nub}}
\def\filter{\textit{filter}}
%\def\not{\textit{not}}


\def\RZERO{\mathbf{0}_r }
\def\RONE{\mathbf{1}_r}
\newcommand\RCHAR[1]{\mathbf{#1}_r}
\newcommand\RSEQ[2]{#1 \cdot #2}
\newcommand\RALTS[1]{\sum #1}
\newcommand\RSTAR[1]{#1^*}
\newcommand\vsuf[2]{\textit{Suffix} \;#1\;#2}

\lstdefinestyle{myScalastyle}{
  language=scala,
  aboveskip=3mm,
  belowskip=3mm,
  showstringspaces=false,
  columns=flexible,
  basicstyle={\small\ttfamily},
  numbers=none,
  numberstyle=\tiny\color{gray},
  keywordstyle=\color{blue},
  commentstyle=\color{dkgreen},
  stringstyle=\color{mauve},
  frame=single,
  breaklines=true,
  breakatwhitespace=true,
  tabsize=3,
}

%----------------------------------------------------------------------------------------
%This part is about regular expressions, Brzozowski derivatives,
%and a bit-coded lexing algorithm with proven correctness and time bounds.

%TODO: look up snort rules to use here--give readers idea of what regexes look like

Regular expressions, since their inception in the 1940s,
have been subject to extensive study and implementation.
Their primary application lies in lexing,
the process whereby an unstructured input string is broken down
into a tree of tokens, guided by regular expressions.
This is often used to match strings that comprise numerous fields,
where certain fields may recur or be omitted.
For example, the regular expression for recognising email addresses is
\begin{center}
\verb|^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$|.
\end{center}
Using this, regular expression matchers and lexers are able to extract
the domain name by the use of \verb|[a-zA-Z0-9.-]+|.
Consequently, they are indispensable components
of software systems such as compilers, IDEs, and firewalls.
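
As a small illustration of such field extraction, the following Scala
snippet uses capture groups to pull out the local and domain parts of
an email address (a sketch using Scala's standard regex library and our
own naming; it is not part of the formalisation):
\begin{lstlisting}[style=myScalastyle]
// Illustrative only -- not part of the verified development.
// Extract the local and domain parts of an email address with
// Scala's built-in regex support (capture groups).
val Email = """([a-zA-Z0-9._%+-]+)@([a-zA-Z0-9.-]+\.[a-zA-Z]{2,})""".r

"alice@example.com" match {
  case Email(local, domain) => println(domain)  // prints "example.com"
  case _                    => println("no match")
}
\end{lstlisting}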

The study of regular expressions is ongoing due to an
issue known as catastrophic backtracking.
This phenomenon refers to scenarios in which the regular expression
matching or lexing engine exhibits a disproportionately long
runtime despite the simplicity of the input and expression.

The origin of catastrophic backtracking is rooted in the
ambiguities of lexing.
In the process of matching a multi-character string with
a regular expression that encompasses several sub-expressions,
different positions can be designated to mark
the boundaries of the substrings corresponding to the sub-expressions.
For instance, in matching the string $aaa$ with the
regular expression $(a+aa)^*$, the divide between
the initial match and the subsequent iteration could either be
set between the first and second characters ($a | aa$) or between the second and third characters ($aa | a$).
As both the length of the input string and the structural complexity of the regular expression increase,
the number of potential delimiter combinations can grow exponentially,
leading to a corresponding increase in runtime for algorithms that do not navigate these possibilities carefully.
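
This blow-up is easy to provoke in a backtracking engine. The following
Scala sketch (illustrative only, not part of the formalisation) times
the JVM's backtracking \texttt{java.util.regex} engine on the regular
expression \verb+(a|aa)*b+ against inputs consisting solely of $a$'s;
each additional character roughly multiplies the running time:
\begin{lstlisting}[style=myScalastyle]
// Illustrative ReDoS micro-benchmark -- not part of the formalisation.
// java.util.regex backtracks, so matching (a|aa)*b against "aa...a"
// (with no final 'b') explores exponentially many ways of splitting
// the a's before giving up.
for (n <- 20 to 30) {
  val input = "a" * n                       // the "attack string"
  val start = System.nanoTime()
  input.matches("(a|aa)*b")                 // always false here, but slow
  val ms = (System.nanoTime() - start) / 1e6
  println(f"n = $n%2d: $ms%10.2f ms")
}
\end{lstlisting}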

Catastrophic backtracking facilitates a category of computationally inexpensive attacks known as Regular Expression Denial of Service (ReDoS) attacks. Here, an attacker merely dispatches a small attack string to a server, provoking high-complexity behaviours in the server's regular expression engine. Such attacks, whether intentional or accidental, have led to the failure of real-world systems (additional details can be found in the practical overview section in chapter \ref{Introduction}).

Various disambiguation strategies are employed to select sub-matches, notably including the Greedy and POSIX strategies. POSIX, the strategy most widely adopted in practice, invariably opts for the longest possible sub-match. Kuklewicz \cite{KuklewiczHaskell}, for instance, offers a descriptive definition of the POSIX rule (section 1, last paragraph):
\begin{quote}
\begin{itemize}
\item
regular expressions (REs) take the leftmost starting match, and the longest match starting there
\item
earlier subpatterns have leftmost-longest priority over later subpatterns
\item
higher-level subpatterns have leftmost-longest priority over their component subpatterns
\item
REs have right associative concatenation which can be changed with parenthesis
\item
parenthesized subexpressions return the match from their last usage
\item
text of component subexpressions must be contained in the text of the
higher-level subexpressions
\item
if ``p'' and ``q'' can never match the same text then ``p|q'' and ``q|p'' are equivalent, up to trivial renumbering of captured subexpressions
\item
if ``p'' in ``p*'' is used to capture non-empty text then additional repetitions of ``p'' will not capture an empty string
\end{itemize}
\end{quote}
However, the author noted that various lexers claiming to be POSIX
are rarely correct according to this standard.
There are numerous occasions where programmers have realised the subtlety and
difficulty of implementing POSIX matching correctly; one such quote comes from the author of
Go's regexp library:\footnote{\url{https://pkg.go.dev/regexp\#pkg-overview}}
\begin{quote}\it
``The POSIX rule is computationally prohibitive
and not even well-defined.''
\end{quote}
Being able to formally define and capture the idea of
the POSIX rules and to prove
the correctness of regular expression matching/lexing
algorithms against the POSIX semantics
is therefore valuable.


Formal proofs are
machine-checked programs
%such as the ones written in Isabelle/HOL, is a powerful means
that allow computer scientists to be certain
about the correctness of their algorithms.
This certainty is achieved by
recursively checking that every fact in a proof script
is either an axiom or a fact derived from
known axioms or previously verified facts.
%The problem of regular expressions lexing and catastrophic backtracking are a suitable problem for such
%methods as their implementation and complexity analysis tend to be error-prone.
Formal proofs provide an unprecedented level of assurance
that an algorithm will perform as expected under all inputs.
The software systems that help people interactively build and check
such proofs are called theorem provers or proof assistants.
Many theorem provers have been developed, such as Mizar,
Isabelle/HOL, HOL-Light, HOL4,
Coq, Agda, Idris, Lean and so on.
Isabelle/HOL is a theorem prover based on a simple type theory,
with powerful proof automation such as Sledgehammer.
We chose Isabelle/HOL for its powerful automation
and the ease and simplicity with which regular expressions and
regular languages can be expressed in it.
%Some of those employ
%dependent types like Mizar, Coq, Agda, Lean and Idris.
%Some support a constructivism approach, such as Coq.


Formal proofs about regular expression matching and lexing
complement the efforts in
industry, which tend to focus on overall speed
with techniques like parallelization (FPGA paper), or tackle
the problem of catastrophic backtracking
in an ad-hoc manner (cloudflare and stackexchange article).

There have been many interesting developments in the theorem-proving community
on formalising regular expressions and lexing.
One flavour is to build an automaton from the regular expression, and determine
acceptance in terms of the resulting
state after executing the input string on that automaton.
Automata formalisations are in general harder and more cumbersome to deal
with for theorem provers than working directly on regular expressions.
One such example is by Nipkow \cite{Nipkow1998}.
%They
%made everything recursive (for example the next state function),
As a core idea, they
used a list of booleans to name each state so that
after composing sub-automata together, renaming the states to maintain
the distinctness of each state is recursive and simple.
Even so, the obvious lemmas required
``a painful amount of detail'' in their formalisation.
Sometimes automata are represented as graphs.
But graphs are not inductive datatypes.
Having to set up the induction principle on the number of nodes
in a graph makes formal reasoning non-intuitive and convoluted,
resulting in large formalisations \cite{Lammich2012}.
When combining two graphs, one also needs to make sure that the nodes in
both graphs are distinct, which almost always involves
renaming of the nodes.
A theorem prover which provides dependent types, such as Coq,
can alleviate the issue of representing graph nodes
\cite{Doczkal2013}. There the representation of nodes is made
easier by the use of $\textit{FinType}$.
Another representation for automata is matrices.
But the induction for them is not as straightforward either.
There are some more clever representations, for example one by Paulson
using hereditarily finite sets \cite{Paulson2015}.
There the problem with combining graphs can be handled better.
%but we believe that such clever tricks are not very obvious for
%the John-average-Isabelle-user.

The approach that operates directly on regular expressions circumvents the problem of
conversion between a regular expression and an automaton, thereby avoiding representation
problems altogether, even though regular expressions may be seen as a variant of a
non-deterministic finite automaton (ref Laurikari tagged NFA paper).
To match a string, a sequence of algebraic transformations called
(Brzozowski) $\textit{derivatives}$ (ref Brzozowski) is carried out on that regular expression.
Each derivative takes a character and a regular expression,
and returns a new regular expression whose language is closely related to
the original regular expression's language:
strings prefixed with that input character will have their head removed,
and strings not prefixed
with that character will be eliminated.
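Writing $c\!::\!s$ for the string with head $c$ and tail $s$, this is
the standard semantic derivative of a language, stated here with the
$\Der$ notation defined above:
\begin{center}
$\Der \; c \; L \;\dn\; \{ s \;|\; c\!::\!s \in L \}$
\end{center}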
After taking derivatives with respect to each character, the string is
exhausted. Then an algorithm checks whether the empty string is in the final
regular expression's language.
If so, a match exists and the string is in the language of the input regular expression.
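
As a concrete sketch of this matching process, the textbook
derivative-based matcher fits in a few lines of Scala (this is the
standard construction, not our verified lexer; the datatype names are
ours):
\begin{lstlisting}[style=myScalastyle]
// Textbook Brzozowski matcher -- an illustrative sketch only.
abstract class Rexp
case object ZERO extends Rexp                    // empty language
case object ONE extends Rexp                     // empty string
case class CHAR(c: Char) extends Rexp
case class ALT(r1: Rexp, r2: Rexp) extends Rexp
case class SEQ(r1: Rexp, r2: Rexp) extends Rexp
case class STAR(r: Rexp) extends Rexp

// can r match the empty string?
def nullable(r: Rexp): Boolean = r match {
  case ZERO => false
  case ONE => true
  case CHAR(_) => false
  case ALT(r1, r2) => nullable(r1) || nullable(r2)
  case SEQ(r1, r2) => nullable(r1) && nullable(r2)
  case STAR(_) => true
}

// derivative of r with respect to the character c
def der(c: Char, r: Rexp): Rexp = r match {
  case ZERO => ZERO
  case ONE => ZERO
  case CHAR(d) => if (c == d) ONE else ZERO
  case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))
  case SEQ(r1, r2) =>
    if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
    else SEQ(der(c, r1), r2)
  case STAR(r1) => SEQ(der(c, r1), STAR(r1))
}

// exhaust the string, then test for the empty string
def matches(r: Rexp, s: String): Boolean =
  nullable(s.foldLeft(r)((r1, c) => der(c, r1)))
\end{lstlisting}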

Again this process can be seen as the simulation of an NFA running on a string,
but the recursive nature of the datatypes and functions involved makes
derivatives a perfect object of study for theorem provers.
That is why there have been numerous formalisations of regular expressions
and Brzozowski derivatives in the functional programming and
theorem proving communities (a large list of refs to derivatives formalisation publications).
Ribeiro and Du Bois \cite{RibeiroAgda2017} have
formalised the notion of bit-coded regular expressions
and proved their relation to simple regular expressions in
the dependently-typed proof assistant Agda.
They also proved the soundness and completeness of a matching algorithm
based on the bit-coded regular expressions. Their algorithm is a decision procedure
that gives a yes/no answer, and does not produce
lexical values.
%X also formalised derivatives and regular expressions, producing "parse trees".
%(Some person who's a big name in formal methods)


The variant of the problem we are looking at centers around
an algorithm (which we call $\blexer$) developed by Sulzmann and Lu \cite{Sulzmann2014}.
The reason we chose to look at $\blexer$ and its simplifications
is that it allows a lexical tree to be generated
by an elegant and subtle procedure based on Brzozowski derivatives.
The procedure is made of recursive functions and inductive datatypes, just like derivatives,
allowing intuitive and concise formal reasoning with theorem provers.
Most importantly, $\blexer$ opens up a path to an optimized version,
$\blexersimp$, which improves
performance with simplifications that aggressively change the structure of regular expressions,
whereas most other derivative-based methods
rely on that structure being maintained to allow the induction to
go through.
For example, Egolf et al. \cite{Verbatim} have developed a verified lexer
with derivatives, but as soon as they started introducing
optimizations such as memoization, they reverted to constructing
DFAs first.
Edelmann \cite{Edelmann2020} explored similar optimizations in his
work on verified LL(1) parsing, with additional enhancements based on data structures like
zippers.


%Sulzmann and Lu have worked out an algorithm
%that is especially suited for verification
%which utilized the fact
%that whenever ambiguity occurs one may duplicate a sub-expression and use
%different copies to describe different matching choices.
The idea behind the correctness of $\blexer$ is simple: when taking a derivative,
multiple matches might be possible, in which case an alternative is created with
multiple children, each corresponding to a
different match. At the end of
the lexing process one always picks the leftmost alternative, which is guaranteed
to be a POSIX value.
This is done by consistently keeping sub-regular expressions in an alternative
with longer submatches
to the left of other copies
(remember that POSIX values are roughly the values with the longest initial
submatch).
The idea behind the optimized version of $\blexer$, which we call $\blexersimp$,
is that since we only take the leftmost copy, all re-occurring copies can be
eliminated without losing the POSIX property, and this can be done for
children of alternatives at different levels by merging them together.
Proving $\blexersimp$ correct requires a different
proof strategy from the one by Ausaf \cite{FahadThesis}.
We invent a rewriting relation as an
inductive predicate which captures
an invariant strong enough to ensure correctness
and which commutes with the derivative operation.
This predicate allows a simple
induction on the input string to go through.
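
To give an impression of what this merging looks like, here is an
unverified Scala sketch (our own naming, reusing the \texttt{Rexp}
datatype from the matcher sketch above; the verified simplification
additionally tracks bitcodes) that flattens nested alternatives and
keeps only the leftmost copy of each child:
\begin{lstlisting}[style=myScalastyle]
// Unverified sketch of the simplification idea, with an n-ary
// alternative added to the Rexp datatype from the earlier sketch.
case class ALTS(rs: List[Rexp]) extends Rexp

// pull children of nested alternatives up one level, drop ZEROs
def flatten(rs: List[Rexp]): List[Rexp] = rs.flatMap {
  case ALTS(rs1) => flatten(rs1)
  case ZERO => Nil
  case r => List(r)
}

def simp(r: Rexp): Rexp = r match {
  case ALTS(rs) =>
    // .distinct keeps the leftmost of any duplicate copies,
    // preserving the POSIX choice
    flatten(rs.map(simp)).distinct match {
      case Nil => ZERO
      case List(r1) => r1
      case rs1 => ALTS(rs1)
    }
  case SEQ(r1, r2) => (simp(r1), simp(r2)) match {
    case (ZERO, _) => ZERO               // 0 . r = 0
    case (_, ZERO) => ZERO               // r . 0 = 0
    case (ONE, s2) => s2                 // 1 . r = r
    case (s1, ONE) => s1                 // r . 1 = r
    case (s1, s2) => SEQ(s1, s2)
  }
  case r => r
}
\end{lstlisting}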

%This idea has been repeatedly used in different variants of lexing
%algorithms in their paper, one of which involves bit-codes. The bit-coded
%derivative-based algorithm even allows relatively aggressive
%%simplification rules which cause
%structural changes that render the induction used in the correctness
%proofs unusable.
%More details will be given in \ref{Bitcoded2} including the
%new correctness proof which involves a new inductive predicate which allows
%rule induction to go through.




%first character is removed
%state of the automaton after matching that character
%where nodes are represented as
%a sub-expression (for example tagged NFA
%Working on regular expressions
%Because of these problems with automata, we prefer regular expressions
%and derivatives rather than an automata (or graph-based) approach which explicitly converts between
%the regular expression and a different data structure.
%
%
%The key idea

%Regular expressions are widely used in computer science:
%be it in text-editors \parencite{atomEditor} with syntax highlighting and auto-completion;
%command-line tools like $\mathit{grep}$ that facilitate easy
%text-processing \cite{grep}; network intrusion
%detection systems that inspect suspicious traffic; or compiler
%front ends.
%Given their usefulness and ubiquity, one would assume that
%modern regular expression matching implementations
%are mature and fully studied.
%Indeed, in many popular programming languages' regular expression engines,
%one can supply a regular expression and a string, and in most cases get
%get the matching information in a very short time.
%Those engines can sometimes be blindingly fast--some
%network intrusion detection systems
%use regular expression engines that are able to process
%hundreds of megabytes or even gigabytes of data per second \parencite{Turo_ov__2020}.
%However, those engines can sometimes also exhibit a surprising security vulnerability
%under a certain class of inputs.
%However, , this is not the case for $\mathbf{all}$ inputs.
%TODO: get source for SNORT/BRO's regex matching engine/speed


%----------------------------------------------------------------------------------------
\section{Contribution}
In this thesis,
we propose a solution to catastrophic
backtracking and error-prone matchers: a formally verified
regular expression lexing algorithm
that is both fast
and correct.

In particular, the main problem we solved on top of previous work was
coming up with a formally verified algorithm called $\blexersimp$ based
on Brzozowski derivatives. It calculates a POSIX
lexical value from a string and a regular expression. This algorithm was originally
developed by Sulzmann and Lu \cite{Sulzmann2014}, but we made the key observation that its $\textit{nub}$
function does not really simplify intermediate results where it needs to, and we improved the
algorithm accordingly.
We have proven that $\blexersimp$'s internal data structure does not grow beyond a constant $N_r$
depending on the input regular expression $r$, thanks to its aggressive simplifications:
\begin{theorem}
$\asize{\bderssimp{r}{s}} \leq N_r$
\end{theorem}
The simplifications applied in each step of $\blexersimp$
keep the derivatives small, but present a
challenge in establishing a correctness theorem of the following form:
\begin{theorem}
If the POSIX lexical value of matching regular expression $r$ with string $s$ is $v$,
then $\blexersimp\; r \; s = \Some \; v$. Otherwise
$\blexersimp \; r \; s = \None$.
\end{theorem}

The result is a regular expression lexing algorithm that comes with
\begin{itemize}
\item
an improved version of Sulzmann and Lu's bit-coded algorithm using
derivatives with simplifications,
accompanied by
a proven correctness theorem according to the POSIX specification
given by Ausaf et al. \cite{AusafDyckhoffUrban2016},
\item
a complexity-related property for that algorithm stating that the
internal data structure will
remain below a finite bound,
\item
and an extension to
bounded repetition constructs with the correctness and finiteness properties
maintained.
\end{itemize}
\noindent
With a formal finiteness bound in place,
we can greatly reduce the attack surface of servers in terms of ReDoS attacks.
The Isabelle/HOL code for our formalisation can be
found at
\begin{center}
\url{https://github.com/hellotommmy/posix}
\end{center}
Further improvements to the algorithm with an even stronger version of
simplification can be made. We conjecture that the resulting size of derivatives
can then be bounded by a cubic function of the size of the regular expression.
We will give the relevant code in Scala,
but do not give a formal proof for that in Isabelle/HOL.
This is still future work.


\section{Structure of the thesis}
Before talking about the formal proof of $\blexersimp$'s
correctness, which is the main contribution of this thesis,
we need to introduce two formal proofs which are due
to Ausaf et al.
In chapter \ref{Inj} we will introduce the concepts
and notations we
use for describing regular expressions and derivatives,
and the first version of Sulzmann and Lu's lexing algorithm without bitcodes (including
its correctness proof).
We will give their second lexing algorithm with bitcodes in chapter \ref{Bitcoded1},
together with the correctness proof by Ausaf and Urban.
Then we illustrate in chapter \ref{Bitcoded2}
how Sulzmann and Lu's
simplifications fail to simplify correctly. We therefore introduce our own version of the
algorithm with correct simplifications and
their correctness proof.
In chapter \ref{Finite} we give the second guarantee
of our bitcoded algorithm, namely a finite bound on the size of any
regular expression's derivatives.
We also show how one can extend the
algorithm to include bounded repetitions.
In chapter \ref{Cubic} we discuss stronger simplification rules which
improve the finite bound to a cubic bound. %and the NOT regular expression.
Chapter \ref{RelatedWork} discusses work related to this thesis.
Chapter \ref{Future} concludes and mentions avenues of future research.




%----------------------------------------------------------------------------------------


%----------------------------------------------------------------------------------------

%----------------------------------------------------------------------------------------

%----------------------------------------------------------------------------------------
