% Chapter 1

\chapter{Introduction} % Main chapter title

\label{Introduction} % For referencing the chapter elsewhere, use \ref{Introduction}

%----------------------------------------------------------------------------------------

% Define some commands to keep the formatting separated from the content
\newcommand{\keyword}[1]{\textbf{#1}}
\newcommand{\tabhead}[1]{\textbf{#1}}
\newcommand{\code}[1]{\texttt{#1}}
\newcommand{\file}[1]{\texttt{\bfseries#1}}
\newcommand{\option}[1]{\texttt{\itshape#1}}

% boxes
\newcommand*{\mybox}[1]{\framebox{\strut #1}}

%\newcommand{\sflataux}[1]{\textit{sflat}\_\textit{aux} \, #1}
\newcommand\sflat[1]{\llparenthesis #1 \rrparenthesis }
\newcommand{\ASEQ}[3]{\textit{ASEQ}_{#1} \, #2 \, #3}
\newcommand{\bderssimp}[2]{#1 \backslash_{bsimps} #2}
\newcommand{\rderssimp}[2]{#1 \backslash_{rsimps} #2}
\def\derssimp{\textit{ders}\_\textit{simp}}
\def\rders{\textit{rders}}
\newcommand{\bders}[2]{#1 \backslash #2}
\newcommand{\bsimp}[1]{\textit{bsimp}(#1)}
\def\bsimps{\textit{bsimp}}
\newcommand{\rsimp}[1]{\textit{rsimp}\; #1}
\newcommand{\sflataux}[1]{\llparenthesis #1 \rrparenthesis'}
\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%
\newcommand{\denote}{\stackrel{\mbox{\scriptsize denote}}{=}}%
\newcommand{\ZERO}{\mbox{\bf 0}}
\newcommand{\ONE}{\mbox{\bf 1}}
\newcommand{\AALTS}[2]{\oplus {\scriptstyle #1}\, #2}
\newcommand{\rdistinct}[2]{\textit{rdistinct} \;\; #1 \;\; #2}
\def\rdistincts{\textit{rdistinct}}
\def\rDistinct{\textit{rdistinct}}
\newcommand\hflat[1]{\llparenthesis #1 \rrparenthesis_*}
\newcommand\hflataux[1]{\llparenthesis #1 \rrparenthesis_*'}
\newcommand\createdByStar[1]{\textit{createdByStar}(#1)}
\def\cbn{\textit{createdByNtimes}}
\def\hpa{\textit{highestPowerAux}}
\def\hpower{\textit{highestPower}}
\def\ntset{\textit{ntset}}
\def\optermsimp{\textit{optermsimp}}
\def\optermOsimp{\textit{optermOsimp}}
\def\optermosimp{\textit{optermosimp}}
\def\opterm{\textit{opterm}}
\def\nString{\textit{nonemptyString}}

\newcommand\myequiv{\mathrel{\stackrel{\makebox[0pt]{\mbox{\normalfont\tiny equiv}}}{=}}}

\def\SEQ{\textit{SEQ}}
\def\SEQs{\textit{SEQs}}
\def\case{\textit{case}}
\def\sequal{\stackrel{\mbox{\scriptsize rsimp}}{=}}
\def\rsimpalts{\textit{rsimp}_{ALTS}}
\def\good{\textit{good}}
\def\btrue{\textit{true}}
\def\bfalse{\textit{false}}
\def\bnullable{\textit{bnullable}}
\def\bnullables{\textit{bnullables}}
\def\Some{\textit{Some}}
\def\None{\textit{None}}
\def\code{\textit{code}}
\def\decode{\textit{decode}}
\def\internalise{\textit{internalise}}
\def\lexer{\mathit{lexer}}
\def\mkeps{\textit{mkeps}}
\newcommand{\rder}[2]{#2 \backslash_r #1}

\def\rerases{\textit{rerase}}

\def\nonnested{\textit{nonnested}}
\def\AZERO{\textit{AZERO}}
\def\sizeNregex{\textit{sizeNregex}}
\def\AONE{\textit{AONE}}
\def\ACHAR{\textit{ACHAR}}

\def\simpsulz{\textit{simp}_{Sulz}}

\def\scfrewrites{\stackrel{*}{\rightsquigarrow_{scf}}}
\def\frewrite{\rightsquigarrow_f}
\def\hrewrite{\rightsquigarrow_h}
\def\grewrite{\rightsquigarrow_g}
\def\frewrites{\stackrel{*}{\rightsquigarrow_f}}
\def\hrewrites{\stackrel{*}{\rightsquigarrow_h}}
\def\grewrites{\stackrel{*}{\rightsquigarrow_g}}
\def\fuse{\textit{fuse}}
\def\bder{\textit{bder}}
\def\der{\textit{der}}
\def\POSIX{\textit{POSIX}}
\def\ALTS{\textit{ALTS}}
\def\ASTAR{\textit{ASTAR}}
\def\DFA{\textit{DFA}}
\def\NFA{\textit{NFA}}
\def\bmkeps{\textit{bmkeps}}
\def\bmkepss{\textit{bmkepss}}
\def\retrieve{\textit{retrieve}}
\def\blexer{\textit{blexer}}
\def\flex{\textit{flex}}
\def\inj{\textit{inj}}
\def\Empty{\textit{Empty}}
\def\Left{\textit{Left}}
\def\Right{\textit{Right}}
\def\Stars{\textit{Stars}}
\def\Char{\textit{Char}}
\def\Seq{\textit{Seq}}
\def\Der{\textit{Der}}
\def\Ders{\textit{Ders}}
\def\nullable{\mathit{nullable}}
\def\Z{\mathit{Z}}
\def\S{\mathit{S}}
\def\rup{r^\uparrow}
%\def\bderssimp{\mathit{bders}\_\mathit{simp}}
\def\distinctWith{\textit{distinctWith}}
\def\lf{\textit{lf}}
\def\PD{\textit{PD}}
\def\suffix{\textit{Suffix}}
\def\distinctBy{\textit{distinctBy}}
\def\starupdate{\textit{starUpdate}}
\def\starupdates{\textit{starUpdates}}
\def\nupdate{\textit{nupdate}}
\def\nupdates{\textit{nupdates}}

\def\size{\mathit{size}}
\def\rexp{\mathbf{rexp}}
\def\simp{\mathit{simp}}
\def\simpALTs{\mathit{simp}\_\mathit{ALTs}}
\def\map{\mathit{map}}
\def\distinct{\mathit{distinct}}
\def\blexersimp{\mathit{blexer}\_\mathit{simp}}
\def\blexerStrong{\textit{blexerStrong}}
\def\bsimpStrong{\textit{bsimpStrong}}
\def\bdersStrongs{\textit{bdersStrong}}
\newcommand{\bdersStrong}[2]{#1 \backslash_{bsimpStrongs} #2}

\def\map{\textit{map}}
\def\rrexp{\textit{rrexp}}
\newcommand\rnullable[1]{\textit{rnullable} \; #1 }
\newcommand\rsize[1]{\llbracket #1 \rrbracket_r}
\newcommand\asize[1]{\llbracket #1 \rrbracket}
\newcommand\rerase[1]{ (#1)_{\downarrow_r}}

\newcommand\ChristianComment[1]{\textcolor{blue}{#1}\\}

\def\rflts{\textit{rflts}}
\def\rrewrite{\textit{rrewrite}}
\def\bsimpalts{\textit{bsimp}_{ALTS}}
\def\bsimpaseq{\textit{bsimp}_{ASEQ}}
\def\rsimlalts{\textit{rsimp}_{ALTs}}
\def\rsimpseq{\textit{rsimp}_{SEQ}}

\def\erase{\textit{erase}}
\def\STAR{\textit{STAR}}
\def\flts{\textit{flts}}

\def\zeroable{\textit{zeroable}}
\def\nub{\textit{nub}}
\def\filter{\textit{filter}}
%\def\not{\textit{not}}

\def\RZERO{\mathbf{0}_r }
\def\RONE{\mathbf{1}_r}
\newcommand\RCHAR[1]{\mathbf{#1}_r}
\newcommand\RSEQ[2]{#1 \cdot #2}
\newcommand\RALTS[1]{\sum #1}
\newcommand\RSTAR[1]{#1^*}
\newcommand\vsuf[2]{\textit{Suffix} \;#1\;#2}

\lstdefinestyle{myScalastyle}{
  language=scala,
  aboveskip=3mm,
  belowskip=3mm,
  showstringspaces=false,
  columns=flexible,
  basicstyle={\small\ttfamily},
  numbers=none,
  numberstyle=\tiny\color{gray},
  keywordstyle=\color{blue},
  commentstyle=\color{dkgreen},
  stringstyle=\color{mauve},
  frame=single,
  breaklines=true,
  breakatwhitespace=true,
  tabsize=3,
}
%----------------------------------------------------------------------------------------
%This part is about regular expressions, Brzozowski derivatives,
%and a bit-coded lexing algorithm with proven correctness and time bounds.

%TODO: look up snort rules to use here--give readers idea of what regexes look like
Regular expressions, since their inception in the 1940s,
have been subject to extensive study and implementation.
Their primary application lies in lexing,
a process whereby an unstructured input string is disassembled,
guided by regular expressions, into a tree of tokens.
This is often used to match strings that comprise numerous fields,
where certain fields may recur or be omitted.
For example, the regular expression for recognising email addresses is
\begin{center}
\verb|^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$|.
\end{center}
Using this, regular expression matchers and lexers are able to extract,
for example, the domain names by the use of \verb|[a-zA-Z0-9.-]+|.
Consequently, they are an indispensable component in text processing tools
of software systems such as compilers, IDEs, and firewalls.
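To make the extraction part concrete, the following small illustration uses
ordinary Scala/JVM regular expression support (it is not code from the
lexers developed in this thesis) to pull the domain out of an email address
with a capturing group:
\begin{lstlisting}[style=myScalastyle]
// Illustration only: the email regex from above, with the domain part
// in a capturing group, used via Scala's Regex pattern matching.
val Email = """^[a-zA-Z0-9._%+-]+@([a-zA-Z0-9.-]+\.[a-zA-Z]{2,})$""".r

"someone@example.com" match {
  case Email(domain) => println(domain)   // prints: example.com
  case _             => println("not an email address")
}
\end{lstlisting}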
The study of regular expressions is ongoing due to an
issue known as catastrophic backtracking.
This phenomenon refers to scenarios in which the regular expression
matching or lexing engine exhibits a disproportionately long
runtime despite the simplicity of the input and expression.

The origin of catastrophic backtracking is rooted in the
ambiguities of lexing.
In the process of matching a multi-character string with
a regular expression that encompasses several sub-expressions,
different positions can be designated to mark
the boundaries of the substrings corresponding to the sub-expressions.
For instance, in matching the string $aaa$ with the
regular expression $(a+aa)^*$, the divide between
the initial match and the subsequent iteration could either be
set between the first and second characters ($a | aa$)
or between the second and third characters ($aa | a$).
As both the length of the input string and the structural complexity
of the regular expression increase, the number of potential delimiter
combinations can grow exponentially, leading to a corresponding increase
in complexity for algorithms that do not optimally navigate these
possibilities.

Catastrophic backtracking facilitates a category of computationally
inexpensive attacks known as Regular Expression Denial of Service (ReDoS)
attacks. Here, an attacker merely dispatches a small attack string to a
server, provoking high-complexity behaviours in the server's regular
expression engine. Such attacks, whether intentional or accidental, have
led to the failure of real-world systems (additional details can be found
in the practical overview section in chapter \ref{Introduction}).

Various disambiguation strategies are employed to select sub-matches,
notably including the Greedy and POSIX strategies. POSIX, the strategy most
widely adopted in practice, invariably opts for the longest possible
sub-match. Kuklewicz \cite{KuklewiczHaskell}, for instance, offers a
descriptive definition of the POSIX rule; we reproduce it in full in the
section on error-prone POSIX implementations below.
However, Kuklewicz also noted that lexers are rarely correct according to
this standard.
It would therefore be valuable to be able to formally define and capture
the idea of POSIX rules and matching/lexing algorithms, and to prove
the correctness of such algorithms against the POSIX semantics.

Formal proofs are machine-checked programs,
such as the ones written in Isabelle/HOL. They are a powerful means
for computer scientists to be certain about the correctness of their
algorithms and correctness properties.
This certainty is achieved by automatically checking that the end goal is
derived solely from a list of axioms, definitions
and proof rules that are known to be correct.
The problems of regular expression lexing and catastrophic backtracking
are suitable targets for such
methods, as their implementations and complexity analyses tend to be
error-prone.

There have been many interesting steps taken in the direction of formal
proofs about regular expression lexing and matching.
%Regular expressions are widely used in computer science:
%be it in text-editors \parencite{atomEditor} with syntax highlighting and auto-completion;
%command-line tools like $\mathit{grep}$ that facilitate easy
%text-processing \cite{grep}; network intrusion
%detection systems that inspect suspicious traffic; or compiler
%front ends.
%Given their usefulness and ubiquity, one would assume that
%modern regular expression matching implementations
%are mature and fully studied.
%Indeed, in many popular programming languages' regular expression engines,
%one can supply a regular expression and a string, and in most cases get
%get the matching information in a very short time.
%Those engines can sometimes be blindingly fast--some
%network intrusion detection systems
%use regular expression engines that are able to process
%hundreds of megabytes or even gigabytes of data per second \parencite{Turo_ov__2020}.
%However, those engines can sometimes also exhibit a surprising security vulnerability
%under a certain class of inputs.
%However, , this is not the case for $\mathbf{all}$ inputs.
%TODO: get source for SNORT/BRO's regex matching engine/speed
Consider for example the regular expression $(a^*)^*\,b$ and
strings of the form $aa..a$. These strings cannot be matched by this regular
expression: obviously the expected $b$ in the last
position is missing. One would assume that modern regular expression
matching engines can find this out very quickly. Surprisingly, if one tries
this example in JavaScript, Python or Java 8, even with small strings,
say of length of around 30 $a$'s,
the decision takes an absurd amount of time to finish (see graphs in figure \ref{fig:aStarStarb}).
The algorithms clearly show exponential behaviour, and as can be seen
it is triggered by some relatively simple regular expressions.
Java 9 and newer
versions improve this behaviour somewhat, but are still slow compared
with the approach we are going to use in this thesis.
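This behaviour is easy to reproduce. The following micro-benchmark sketch
(our own illustration, not one of the scripts used to produce figure
\ref{fig:aStarStarb}; exact timings will vary by machine) runs
\texttt{java.util.regex} on the JVM against ever longer strings of $a$'s:
\begin{lstlisting}[style=myScalastyle]
// Sketch: time java.util.regex matching (a*)*b against a^n.
// The backtracking engine retries exponentially many splits of the a's.
import java.util.regex.Pattern

object EvilRegex {
  def main(args: Array[String]): Unit = {
    val p = Pattern.compile("(a*)*b")
    for (n <- 5 to 30 by 5) {
      val s = "a" * n
      val t0 = System.nanoTime()
      p.matcher(s).matches()          // always false: the final b is missing
      val secs = (System.nanoTime() - t0) / 1e9
      println(f"n = $n%2d: $secs%.3f secs")
    }
  }
}
\end{lstlisting}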
This superlinear blowup in regular expression engines
has caused grief in ``real life'' where it is
given the name ``catastrophic backtracking'' or ``evil'' regular expressions.
For example, on 20 July 2016 one evil
regular expression brought the webpage
\href{http://stackexchange.com}{Stack Exchange} to its
knees.\footnote{\url{https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}(Last accessed in 2019)}
In this instance, a regular expression intended to just trim white
spaces from the beginning and the end of a line actually consumed
massive amounts of CPU resources---causing the web servers to grind to a
halt. In this example, the time needed to process
the string was
$O(n^2)$ with respect to the string length $n$. This
quadratic overhead was enough for the homepage of Stack Exchange to
respond so slowly that the load balancer assumed a $\mathit{DoS}$
attack and therefore stopped the servers from responding to any
requests. This made the whole site unavailable.
\begin{figure}[p]
\begin{center}
\begin{tabular}{@{}c@{\hspace{0mm}}c@{}}
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    ylabel={time in secs},
    enlargelimits=false,
    xtick={0,5,...,30},
    xmax=33,
    ymax=35,
    ytick={0,5,...,30},
    scaled ticks=false,
    axis lines=left,
    width=5cm,
    height=4cm,
    legend entries={JavaScript},
    legend pos=north west,
    legend cell align=left]
\addplot[red,mark=*, mark options={fill=white}] table {re-js.data};
\end{axis}
\end{tikzpicture}
&
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    %ylabel={time in secs},
    enlargelimits=false,
    xtick={0,5,...,30},
    xmax=33,
    ymax=35,
    ytick={0,5,...,30},
    scaled ticks=false,
    axis lines=left,
    width=5cm,
    height=4cm,
    legend entries={Python},
    legend pos=north west,
    legend cell align=left]
\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
\end{axis}
\end{tikzpicture}\\
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    ylabel={time in secs},
    enlargelimits=false,
    xtick={0,5,...,30},
    xmax=33,
    ymax=35,
    ytick={0,5,...,30},
    scaled ticks=false,
    axis lines=left,
    width=5cm,
    height=4cm,
    legend entries={Java 8},
    legend pos=north west,
    legend cell align=left]
\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
\end{axis}
\end{tikzpicture}
&
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    %ylabel={time in secs},
    enlargelimits=false,
    xtick={0,5,...,30},
    xmax=33,
    ymax=35,
    ytick={0,5,...,30},
    scaled ticks=false,
    axis lines=left,
    width=5cm,
    height=4cm,
    legend entries={Dart},
    legend pos=north west,
    legend cell align=left]
\addplot[green,mark=*, mark options={fill=white}] table {re-dart.data};
\end{axis}
\end{tikzpicture}\\
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    ylabel={time in secs},
    enlargelimits=false,
    xtick={0,5,...,30},
    xmax=33,
    ymax=35,
    ytick={0,5,...,30},
    scaled ticks=false,
    axis lines=left,
    width=5cm,
    height=4cm,
    legend entries={Swift},
    legend pos=north west,
    legend cell align=left]
\addplot[purple,mark=*, mark options={fill=white}] table {re-swift.data};
\end{axis}
\end{tikzpicture}
&
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    %ylabel={time in secs},
    enlargelimits=true,
    %xtick={0,5000,...,40000},
    %xmax=40000,
    %ymax=35,
    restrict x to domain*=0:40000,
    restrict y to domain*=0:35,
    %ytick={0,5,...,30},
    %scaled ticks=false,
    axis lines=left,
    width=5cm,
    height=4cm,
    legend entries={Java9+},
    legend pos=north west,
    legend cell align=left]
\addplot[orange,mark=*, mark options={fill=white}] table {re-java9.data};
\end{axis}
\end{tikzpicture}\\
\multicolumn{2}{c}{Graphs}
\end{tabular}
\end{center}
\caption{Graphs showing runtime for matching $(a^*)^*\,b$ with strings
of the form $\protect\underbrace{aa..a}_{n}$ in various existing regular expression libraries.
The reason for their superlinear behaviour is that they do a depth-first-search
using NFAs.
If the string does not match, the regular expression matching
engine starts to explore all possibilities.
}\label{fig:aStarStarb}
\end{figure}\afterpage{\clearpage}
A more recent example is a global outage of all Cloudflare servers on 2 July
2019. A poorly written regular expression exhibited catastrophic backtracking
and exhausted the CPUs that serve HTTP traffic. Although the outage
had several causes, at the heart was a regular expression that
was used to monitor network
traffic.\footnote{\url{https://blog.cloudflare.com/details-of-the-cloudflare-outage-on-july-2-2019/}(Last accessed in 2022)}
These problems with regular expressions
are not isolated events that happen
very rarely,
but occur often enough that they have a
name: Regular-Expression-Denial-Of-Service (ReDoS)
attacks.
Davis et al.\ \cite{Davis18} detected more
than 1000 evil regular expressions
in Node.js, Python core libraries, npm and pypi.
They therefore concluded that evil regular expressions
are a real problem rather than just ``a parlour trick''.
The work in this thesis aims to address this issue
with the help of formal proofs.
We describe a lexing algorithm based
on Brzozowski derivatives with verified correctness
and a finiteness property for the size of derivatives
(all proved in Isabelle/HOL).
Such properties are an important step in preventing
catastrophic backtracking once and for all.
We will give more details in the next sections
on (i) why the slow cases in figure \ref{fig:aStarStarb}
can occur in traditional regular expression engines
and (ii) why we choose our
approach based on Brzozowski derivatives and formal proofs.

\section{Preliminaries}%Regex, and the Problems with Regex Matchers}
Regular expressions and regular expression matchers
have clearly been studied for many, many years.
Theoretical results in automata theory state
that basic regular expression matching should be linear
w.r.t.\ the length of the input.
This assumes that the regular expression
$r$ was pre-processed and turned into a
deterministic finite automaton (DFA) before matching \cite{Sakarovitch2009}.
By basic we mean textbook definitions such as the one
below, involving only regular expressions for characters, alternatives,
sequences, and Kleene stars:
\[
	r ::= c \mid r_1 + r_2 \mid r_1 \cdot r_2 \mid r^*
\]
Modern regular expression matchers used by programmers,
however,
support much richer constructs, such as bounded repetitions,
negations,
and back-references.
To differentiate, we use the word \emph{regex} to refer
to those expressions with richer constructs while reserving the
term \emph{regular expression}
for the more traditional meaning in formal languages theory.
We follow this convention
in this thesis.
In the future, we aim to support all the popular features of regexes,
but for this work we mainly look at basic regular expressions
and bounded repetitions.

%Most modern regex libraries
%the so-called PCRE standard (Perl Compatible Regular Expressions)
%has the back-references
Regexes come with a number of constructs
that make it more convenient for
programmers to write regular expressions.
Depending on the types of constructs,
the task of matching and lexing with them
will have different levels of complexity.
Some of those constructs are syntactic sugar, that is,
simply shorthand notations
that save the programmers a few keystrokes.
These will not cause problems for regex libraries.
For example the
non-binary alternative involving three or more choices just means:
\[
	(a | b | c) \stackrel{means}{=} ((a + b)+ c)
\]
Similarly, the range operator
%used to express the alternative
%of all characters between its operands,
is just a concise way
of expressing an alternative of consecutive characters:
\[
	[0-9] \stackrel{means}{=} (0 | 1 | \ldots | 9 )
\]
The wildcard character '$.$' refers to any single character
except the newline:
\[
	. \stackrel{means}{=} [0-9a-zA-Z+-()*\&\ldots]
\]

\subsection{Bounded Repetitions}
More interesting are bounded repetitions, which can
make the regular expressions much
more compact.
Normally there are four kinds of bounded repetitions:
$r^{\{n\}}$, $r^{\{\ldots m\}}$, $r^{\{n\ldots \}}$ and $r^{\{n\ldots m\}}$
(where $n$ and $m$ are constant natural numbers).
As with the star regular expression, the set of strings or language
a bounded regular expression can match
is defined using the power operation on sets:
\begin{center}
	\begin{tabular}{lcl}
		$L \; r^{\{n\}}$ & $\dn$ & $(L \; r)^n$\\
		$L \; r^{\{\ldots m\}}$ & $\dn$ & $\bigcup_{0 \leq i \leq m}. (L \; r)^i$\\
		$L \; r^{\{n\ldots \}}$ & $\dn$ & $\bigcup_{n \leq i}. (L \; r)^i$\\
		$L \; r^{\{n \ldots m\}}$ & $\dn$ & $\bigcup_{n \leq i \leq m}. (L \; r)^i$
	\end{tabular}
\end{center}
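For instance, under these definitions the bounded repetition
$a^{\{2\ldots 3\}}$ matches exactly the strings $aa$ and $aaa$:
\[
	L \; a^{\{2\ldots 3\}} \;=\; (L \; a)^2 \cup (L \; a)^3 \;=\; \{aa,\, aaa\}.
\]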
The attraction of bounded repetitions is that they can be
used to avoid a size blow-up: for example $r^{\{n\}}$
is a shorthand for
the much longer regular expression
\[
	\underbrace{r\ldots r}_\text{n copies of r}.
\]
%Therefore, a naive algorithm that simply unfolds
%them into their desugared forms
%will suffer from at least an exponential runtime increase.

The problem with matching
such bounded repetitions
is that tools based on the classic notion of
automata need to expand $r^{\{n\}}$ into $n$ connected
copies of the automaton for $r$. This leads to very inefficient matching
algorithms or algorithms that consume large amounts of memory.
Implementations using $\DFA$s will
in such situations
either become excruciatingly slow
(for example Verbatim++ \cite{Verbatimpp}) or run
out of memory (for example $\mathit{LEX}$ and
$\mathit{JFLEX}$\footnote{LEX and JFLEX are lexer generators
for C and Java that generate $\mathit{DFA}$-based
lexers. The user provides a set of regular expressions
and configurations, and then
gets an output program encoding a minimized $\mathit{DFA}$
that can be compiled and run.
When given a countdown regular expression such as the one discussed
below, a small $n$ (say 20) would result in a program representing a
DFA
with millions of states.}) for large counters.
A classic example of this phenomenon is the regular expression $(a+b)^* a (a+b)^{\{n\}}$
where the minimal DFA requires at least $2^{n+1}$ states.
For example, when $n$ is equal to 2,
the corresponding $\mathit{NFA}$ looks like:
\vspace{6mm}
\begin{center}
\begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto]
   \node[state,initial] (q_0)   {$q_0$};
   \node[state, red] (q_1) [right=of q_0] {$q_1$};
   \node[state, red] (q_2) [right=of q_1] {$q_2$};
   \node[state, accepting, red](q_3) [right=of q_2] {$q_3$};
    \path[->]
    (q_0) edge  node {a} (q_1)
    	  edge [loop below] node {a,b} ()
    (q_1) edge  node  {a,b} (q_2)
    (q_2) edge  node  {a,b} (q_3);
\end{tikzpicture}
\end{center}
and when turned into a DFA by the subset construction
requires at least $2^3$ states.\footnote{The
red states are ``countdown states'' which count down
the number of characters needed in addition to the current
string to make a successful match.
For example, state $q_1$ indicates a match that has
gone past the $(a|b)^*$ part of $(a|b)^*a(a|b)^{\{2\}}$,
and just consumed the ``delimiter'' $a$ in the middle, and
needs to match 2 more iterations of $(a|b)$ to complete.
State $q_2$, on the other hand, can be viewed as a state
after $q_1$ has consumed 1 character, and just waits
for 1 more character to complete.
The state $q_3$ is the last (accepting) state, requiring 0
more characters.
Depending on the suffix of the
input string up to the current read location,
the states $q_1$, $q_2$ and $q_3$
may or may
not be active.
A $\mathit{DFA}$ for such an $\mathit{NFA}$ would
contain at least $2^3$ non-equivalent states that cannot be merged,
because the subset construction during determinisation will generate
all the elements in the power set $\mathit{Pow}\{q_1, q_2, q_3\}$.
Generalizing this to regular expressions with larger
bounded repetition counts, we have that
regexes shaped like $r^*ar^{\{n\}}$ when converted to $\mathit{DFA}$s
would require at least $2^{n+1}$ states, if $r$ itself contains
more than 1 string.
This is to represent all different
scenarios in which ``countdown'' states are active.}
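The $2^{n+1}$ lower bound can also be seen via a standard
Myhill--Nerode-style distinguishability argument (a sketch we add here for
intuition): the language of $(a+b)^* a (a+b)^{\{n\}}$ consists of the
strings whose $(n{+}1)$-th character from the end is an $a$. Any two
distinct strings $w_1, w_2 \in \{a,b\}^{n+1}$ that differ at the $j$-th
position from the right (counting from $0$) are distinguished by the
suffix $b^{\,n-j}$, because appending it places the differing character
exactly $n{+}1$ positions from the end, so exactly one of
\[
	w_1 \, b^{\,n-j} \quad\text{and}\quad w_2 \, b^{\,n-j}
\]
is in the language. Hence all $2^{n+1}$ strings of length $n+1$ must end
up in different DFA states.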
Bounded repetitions are important because they
tend to occur frequently in practical use,
for example in the regex library RegExLib, in
the rules library of Snort \cite{Snort1999}\footnote{
Snort is a network intrusion detection (NID) tool
for monitoring network traffic.
The network security community curates a list
of malicious patterns written as regexes,
which is used by Snort's detection engine
to match against network traffic for any hostile
activities such as buffer overflow attacks.},
as well as in XML Schema definitions (XSDs).
According to Bj\"{o}rklund et al.\ \cite{xml2015},
more than half of the
XSDs they found on the Maven.org central repository
have bounded regular expressions in them.
Often the counters are quite large, with the largest being
close to ten million.
A smaller sample XSD they gave
is:
\lstset{
	basicstyle=\fontsize{8.5}{9}\ttfamily,
	language=XML,
	morekeywords={encoding,
		xs:schema,xs:element,xs:complexType,xs:sequence,xs:attribute}
}
\begin{lstlisting}
<sequence minOccurs="0" maxOccurs="65535">
	<element name="TimeIncr" type="mpeg7:MediaIncrDurationType"/>
	<element name="MotionParams" type="float" minOccurs="2" maxOccurs="12"/>
</sequence>
\end{lstlisting}
This can be seen as the regex
$(ab^{2\ldots 12})^{0 \ldots 65535}$, where $a$ and $b$ are themselves
regular expressions
satisfying certain constraints (such as
satisfying the floating point number format).
It is therefore quite unsatisfying that
some regular expression matching libraries
impose ad hoc limits
for bounded regular expressions:
for example, in the regular expression matching library in the Go
language the regular expression $a^{\{1001\}}$ is not permitted, because no counter
can be above 1000, and in the built-in Rust regular expression library
expressions such as $a^{\{1000\}\{100\}\{5\}}$ give an error message
for being too big.
As Becchi and Crawley \cite{Becchi08} have pointed out,
the reason for these restrictions
is that they simulate a non-deterministic finite
automaton (NFA) with a breadth-first search.
This way the number of active states could
be equal to the counter number.
When the counters are large,
the memory requirement could become
infeasible, and a regex engine
like Go's will reject such patterns straight away.
\begin{figure}[H]
\begin{center}
\begin{tikzpicture} [node distance = 2cm, on grid, auto]

    \node (q0) [state, initial] {$0$};
    \node (q1) [state, right = of q0] {$1$};
    %\node (q2) [state, right = of q1] {$2$};
    \node (qdots) [right = of q1] {$\ldots$};
    \node (qn) [state, right = of qdots] {$n$};
    \node (qn1) [state, right = of qn] {$n+1$};
    \node (qn2) [state, right = of qn1] {$n+2$};
    \node (qn3) [state, accepting, right = of qn2] {$n+3$};

\path [-stealth, thick]
	(q0) edge [loop above] node {a} ()
    (q0) edge node {a} (q1)
    %(q1) edge node {.} (q2)
    (q1) edge node {.} (qdots)
    (qdots) edge node {.} (qn)
    (qn) edge node {.} (qn1)
    (qn1) edge node {b} (qn2)
    (qn2) edge node {$c$} (qn3);
\end{tikzpicture}
\end{center}
\caption{The example given by Becchi and Crawley
	that NFA simulation can consume large
	amounts of memory: $.^*a.^{\{n\}}bc$ matching
	strings of the form $aaa\ldots aaaabc$.
	When traversing in a breadth-first manner,
	all states from 0 to $n+1$ will become active.}
\end{figure}
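The following sketch in Scala (our own illustration of the breadth-first
simulation just described, with the states numbered $0$ to $n+3$ as in the
figure above) shows why the memory needed is proportional to the counter
$n$:
\begin{lstlisting}[style=myScalastyle]
// Breadth-first NFA simulation for .*a.{n}bc: after each character we
// keep the set of all active states, never backtracking.
def step(n: Int, states: Set[Int], ch: Char): Set[Int] =
  states.flatMap {
    case 0               => if (ch == 'a') Set(0, 1) else Set(0)
    case q if q <= n     => Set(q + 1)                  // the .{n} countdown
    case q if q == n + 1 => if (ch == 'b') Set(n + 2) else Set[Int]()
    case q if q == n + 2 => if (ch == 'c') Set(n + 3) else Set[Int]()
    case _               => Set[Int]()                  // past the accepting state
  }

def simulate(n: Int, s: String): Boolean =
  s.foldLeft(Set(0))((sts, c) => step(n, sts, c)).contains(n + 3)

// On input aaa...a the active set grows to {0, 1, ..., n+1}: linear-time
// matching, but memory proportional to the counter n.
\end{lstlisting}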
%Languages like $\mathit{Go}$ and $\mathit{Rust}$ use this
%type of $\mathit{NFA}$ simulation and guarantees a linear runtime
%in terms of input string length.
%TODO:try out these lexers
These problems can of course be solved in matching algorithms where
automata go beyond the classic notion and for instance include explicit
counters \cite{Turo_ov__2020}.
These solutions can be quite efficient,
with the ability to process
gigabits of input per second
even with large counters \cite{Becchi08}.
But these practical solutions do not come with
formal guarantees, and as pointed out by
Kuklewicz \cite{KuklewiczHaskell}, can be error-prone.
%But formal reasoning about these automata especially in Isabelle
%can be challenging
%and un-intuitive.
%Therefore, we take correctness and runtime claims made about these solutions
%with a grain of salt.

In the work reported in \cite{FoSSaCS2023} and here,
we add better support using derivatives
for bounded regular expressions $r^{\{n\}}$.
Our results
extend straightforwardly to
repetitions with intervals such as
$r^{\{n\ldots m\}}$.
The merit of Brzozowski derivatives (more on this later)
on this problem is that
they can be naturally extended to support bounded repetitions.
Moreover these extensions are still made up of only small
inductive datatypes and recursive functions,
making it handy to deal with them in theorem provers.
%The point here is that Brzozowski derivatives and the algorithms by Sulzmann and Lu can be
%straightforwardly extended to deal with bounded regular expressions
%and moreover the resulting code still consists of only simple
%recursive functions and inductive datatypes.
Finally, bounded regular expressions do not destroy our finite
boundedness property, which we shall prove later on.

\subsection{Back-References}
The other way to simulate an $\mathit{NFA}$ for matching is choosing
a single transition each time, keeping all the other options in
a queue or stack, and backtracking if that choice eventually
fails.
This method, often called a ``depth-first search'',
is efficient in many cases, but could end up
with exponential run time.
The backtracking method is employed in regex libraries
that support \emph{back-references}, for example
in Java and Python.
%\section{Back-references and The Terminology Regex}

%When one constructs an $\NFA$ out of a regular expression
%there is often very little to be done in the first phase, one simply
%construct the $\NFA$ states based on the structure of the input regular expression.

%In the lexing phase, one can simulate the $\mathit{NFA}$ running in two ways:
%one by keeping track of all active states after consuming
%a character, and update that set of states iteratively.
%This can be viewed as a breadth-first-search of the $\mathit{NFA}$
%for a path terminating
%at an accepting state.

Consider the following regular expression where the sequence
operator is omitted for brevity:
\begin{center}
	$r_1r_2r_3r_4$
\end{center}
In this example,
one could label the sub-expressions of interest
by parenthesizing them and giving
them a number in the order in which their opening parentheses appear.
One possible way of parenthesizing and labelling is:
\begin{center}
	$\underset{1}{(}r_1\underset{2}{(}r_2\underset{3}{(}r_3)\underset{4}{(}r_4)))$
\end{center}
The sub-expressions
$r_1r_2r_3r_4$, $r_1r_2r_3$, $r_3$ and $r_4$ are labelled
by 1 to 4, and can be ``referred back'' to by their respective numbers.
%These sub-expressions are called "capturing groups".
To do so, one uses the syntax $\backslash i$
to denote that we want the sub-string
of the input matched by the $i$-th
sub-expression to appear again,
exactly the same as it first appeared:
\begin{center}
$\ldots\underset{\text{i-th lparen}}{(}{r_i})\ldots
\underset{s_i \text{ which just matched} \;r_i}{\backslash i} \ldots$
\end{center}
Once the sub-string $s_i$ for the sub-expression $r_i$
has been fixed, there is no variability in what the back-reference
label $\backslash i$ can match---it is tied to $s_i$.
The overall string will look like $\ldots s_i \ldots s_i \ldots$
%The backslash and number $i$ are the
%so-called "back-references".
%Let $e$ be an expression made of regular expressions
%and back-references. $e$ contains the expression $e_i$
%as its $i$-th capturing group.
%The semantics of back-reference can be recursively
%written as:
%\begin{center}
%	\begin{tabular}{c}
%		$L ( e \cdot \backslash i) = \{s @ s_i \mid s \in L (e)\quad s_i \in L(r_i)$\\
%		$s_i\; \text{match of ($e$, $s$)'s $i$-th capturing group string}\}$
%	\end{tabular}
%\end{center}
A concrete example
of back-references is
\begin{center}
	$(.^*)\backslash 1$,
\end{center}
which matches
strings that can be split into two identical halves,
for example $\mathit{foofoo}$, $\mathit{ww}$ and so on.
Note that this is different from
repeating the sub-expression verbatim like
\begin{center}
	$(.^*)(.^*)$,
\end{center}
which does not impose any restrictions on what strings the second
sub-expression $.^*$
might match.
Another example of back-references is
\begin{center}
	$(.)(.)\backslash 2\backslash 1$
\end{center}
which matches four-character palindromes
like $abba$, $x??x$ and so on.
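Back-references of this kind are directly supported by JVM regexes, so the
first example can be tried out as follows (an ordinary
\texttt{java.util.regex} illustration, not code from this thesis):
\begin{lstlisting}[style=myScalastyle]
// (.*)\1 matches exactly the strings made of two identical halves.
import java.util.regex.Pattern

val p = Pattern.compile("(.*)\\1")
println(p.matcher("foofoo").matches())   // true
println(p.matcher("foobar").matches())   // false
\end{lstlisting}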
Back-references are a regex construct
that programmers find quite useful.
According to Becchi and Crawley \cite{Becchi08},
6\% of Snort rules (up until 2008) use them.
The most common use of back-references
is to express well-formed HTML files,
where back-references are convenient for matching
opening and closing tags like
\begin{center}
	$\langle html \rangle \ldots \langle / html \rangle$
\end{center}
A regex describing such a format
is
\begin{center}
	$\langle (.^+) \rangle \ldots \langle / \backslash 1 \rangle$
\end{center}
Despite being useful, the expressive power of regexes
goes beyond regular languages
once back-references are included.
In fact, back-references allow regexes to express
languages that cannot be contained in context-free
languages either.
For example, the back-reference $(a^*)b\backslash1 b \backslash 1$
expresses the language $\{a^n b a^n b a^n\mid n \in \mathbb{N}\}$,
which cannot be expressed by
context-free grammars \cite{campeanu2003formal}.
Such a language is contained in the context-sensitive hierarchy
of formal languages.
Moreover, solving the matching problem involving back-references
is known to be NP-complete \parencite{alfred2014algorithms}.
Regex libraries supporting back-references such as
PCRE \cite{pcre} therefore have to
resort to a depth-first search algorithm including backtracking.
What is unexpected is that even in the cases
not involving back-references, there is still
a (non-negligible) chance they might backtrack super-linearly,
as shown in the graphs in figure \ref{fig:aStarStarb}.

Summing up, we can categorise existing
practical regex libraries into two kinds:
(i) those with linear
time guarantees like Go and Rust. The downside is that
they impose restrictions
on the regular expressions (back-references are not allowed,
bounded repetitions cannot exceed an ad hoc limit, etc.);
and (ii) those
that allow large bounded regular expressions and back-references
at the expense of using backtracking algorithms.
They can potentially ``grind to a halt''
on some very simple cases, resulting
in ReDoS attacks if exposed to the internet.

The problems with both approaches are the motivation for us
to look again at the regular expression matching problem.
Another motivation is that regular expression matching algorithms
that follow the POSIX standard often contain errors and bugs,
as we shall explain next.

%We would like to have regex engines that can
%deal with the regular part (e.g.
%bounded repetitions) of regexes more
%efficiently.
%Also we want to make sure that they do it correctly.
%It turns out that such aim is not so easy to achieve.
%TODO: give examples such as RE2 GOLANG 1000 restriction, rust no repetitions
% For example, the Rust regex engine claims to be linear,
% but does not support lookarounds and back-references.
% The GoLang regex library does not support over 1000 repetitions.
% Java and Python both support back-references, but shows
%catastrophic backtracking behaviours on inputs without back-references(
%when the language is still regular).
%TODO: test performance of Rust on (((((a*a*)b*)b){20})*)c  baabaabababaabaaaaaaaaababaaaababababaaaabaaabaaaaaabaabaabababaababaaaaaaaaababaaaababababaaaaaaaaaaaaac
%TODO: verify the fact Rust does not allow 1000+ reps

%The time cost of regex matching algorithms in general
%involve two different phases, and different things can go differently wrong on
%these phases.
%$\DFA$s usually have problems in the first (construction) phase
%, whereas $\NFA$s usually run into trouble
%on the second phase.

\section{Error-prone POSIX Implementations}
Very often there are multiple ways of matching a string
with a regular expression.
In such cases the regular expression matcher needs to
disambiguate.
The more widely used strategy is called POSIX,
which roughly speaking always chooses the longest initial match.
The POSIX strategy is widely adopted in many regular expression matchers
because it is a natural strategy for lexers.
However, many implementations (including the C libraries
used by Linux and OS X distributions) contain bugs
or do not meet the specification they claim to adhere to.
Kuklewicz maintains a unit test repository which lists some
problems with existing regular expression engines \cite{KuklewiczHaskell}.
In some cases, they either fail to generate a
result when there exists a match,
or give results that are inconsistent with the POSIX standard.
A concrete example is the regex
\begin{center}
	$(aba + ab + a)^*$ and the string $ababa$.
\end{center}
The correct POSIX match for the above
involves the entire string $ababa$,
split into two Kleene star iterations, namely $[ab], [aba]$ at positions
$[0, 2), [2, 5)$
respectively.
But feeding this example to the different engines
listed at regex101 \parencite{regex101}\footnote{
	regex101 is an online regular expression matcher which
	provides an API for trying out the regular expression
	engines of multiple popular programming languages like
	Java, Python, Go, etc.}
yields
only two incomplete matches: $[aba]$ at $[0, 3)$
and $a$ at $[4, 5)$.
Fowler \cite{fowler2003} and Kuklewicz \cite{KuklewiczHaskell}
commented that most regex libraries are not
correctly implementing the central POSIX
rule, called the maximum munch rule.
Grathwohl \parencite{grathwohl2014crash} wrote:
\begin{quote}\it
	``The POSIX strategy is more complicated than the
	greedy because of the dependence on information about
	the length of matched strings in the various subexpressions.''
\end{quote}
%\noindent
People have recognised that the implementation complexity of the POSIX rules also comes from
the specification not being very precise.
The developers of the regexp package of Go
\footnote{\url{https://pkg.go.dev/regexp\#pkg-overview}}
commented that
\begin{quote}\it
``The POSIX rule is computationally prohibitive
and not even well-defined.''
\end{quote}
There are many informal summaries of this disambiguation
strategy, which are often quite long and delicate.
For example Kuklewicz \cite{KuklewiczHaskell}
described the POSIX rule as (section 1, last paragraph):
\begin{quote}
	\begin{itemize}
		\item
		regular expressions (REs) take the leftmost starting match, and the longest match starting there
		earlier subpatterns have leftmost-longest priority over later subpatterns\\
		\item
		higher-level subpatterns have leftmost-longest priority over their component subpatterns\\
		\item
		REs have right associative concatenation which can be changed with parenthesis\\
		\item
		parenthesized subexpressions return the match from their last usage\\
		\item
		text of component subexpressions must be contained in the text of the
		higher-level subexpressions\\
		\item
		if "p" and "q" can never match the same text then "p|q" and "q|p" are equivalent, up to trivial renumbering of captured subexpressions\\
		\item
		if "p" in "p*" is used to capture non-empty text then additional repetitions of "p" will not capture an empty string\\
	\end{itemize}
\end{quote}
%The text above
%is trying to capture something very precise,
%and is crying out for formalising.
Ribeiro and Du Bois \cite{RibeiroAgda2017} have
formalised the notion of bit-coded regular expressions
and proved their relations with simple regular expressions in
the dependently-typed proof assistant Agda.
They also proved the soundness and completeness of a matching algorithm
based on the bit-coded regular expressions.
Ausaf et al.\ \cite{AusafDyckhoffUrban2016}
were the first to
give a quite simple formalised POSIX
specification in Isabelle/HOL, and also prove
that their specification coincides with an earlier (unformalised)
POSIX specification given by Okui and Suzuki \cite{Okui10}.
They then formally proved the correctness of
a lexing algorithm by Sulzmann and Lu \cite{Sulzmann2014}
with regard to that specification.
They also found that the informal POSIX
specification by Sulzmann and Lu needs to be substantially
revised in order for the correctness proof to go through.
Their original specification and proof were unfixable
according to Ausaf et al.

In the next section, we will briefly
609
+ − 1151
introduce Brzozowski derivatives and Sulzmann
622
+ − 1152
and Lu's algorithm, which the main point of this thesis builds on.
+ − 1153
%We give a taste of what they
+ − 1154
%are like and why they are suitable for regular expression
+ − 1155
%matching and lexing.
+ − 1156
\section{Formal Specification of POSIX Matching
609
+ − 1157
and Brzozowski Derivatives}
622
+ − 1158
%Now we start with the central topic of the thesis: Brzozowski derivatives.
609
+ − 1159
Brzozowski \cite{Brzozowski1964} first introduced the
622
+ − 1160
concept of a \emph{derivative} of regular expression in 1964.
609
+ − 1161
The derivative of a regular expression $r$
622
+ − 1162
with respect to a character $c$, is written as $r \backslash c$.
+ − 1163
This operation tells us what $r$ transforms into
638
+ − 1164
if we ``chop'' off a particular character $c$
622
+ − 1165
from all strings in the language of $r$ (defined
+ − 1166
later as $L \; r$).
%To give a flavour of Brzozowski derivatives, we present
%two straightforward clauses from it:
%\begin{center}
% \begin{tabular}{lcl}
% $d \backslash c$ & $\dn$ &
% $\mathit{if} \;c = d\;\mathit{then}\;\ONE\;\mathit{else}\;\ZERO$\\
%$(r_1 + r_2)\backslash c$ & $\dn$ & $r_1 \backslash c \,+\, r_2 \backslash c$\\
% \end{tabular}
%\end{center}
%\noindent
%The first clause says that for the regular expression
%denoting a singleton set consisting of a single-character string $\{ d \}$,
%we check the derivative character $c$ against $d$,
%returning a set containing only the empty string $\{ [] \}$
%if $c$ and $d$ are equal, and the empty set $\varnothing$ otherwise.
%The second clause states that to obtain the regular expression
%representing all strings' head character $c$ being chopped off
%from $r_1 + r_2$, one simply needs to recursively take derivative
%of $r_1$ and $r_2$ and then put them together.
Derivatives have the property
that $s \in L \; (r\backslash c)$ if and only if
$c::s \in L \; r$, where $::$ stands for list prepending.
%This property can be used on regular expressions
%matching and lexing--to test whether a string $s$ is in $L \; r$,
%one simply takes derivatives of $r$ successively with
%respect to the characters (in the correct order) in $s$,
%and then test whether the empty string is in the last regular expression.
With this property, derivatives give a simple solution
to the problem of matching a string $s$ with a regular
expression $r$: if the derivative of $r$ w.r.t.\ (in
succession) all the characters of the string matches the empty string,
then $r$ matches $s$ (and {\em vice versa}).
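To give a concrete impression, below is a minimal sketch of such a
matcher in Scala (our own illustrative code, not the formal
definitions, which follow in chapter \ref{Inj}):
\begin{verbatim}
// regular expressions
abstract class Rexp
case object ZERO extends Rexp                  // matches nothing
case object ONE extends Rexp                   // matches the empty string
case class CHAR(c: Char) extends Rexp
case class ALT(r1: Rexp, r2: Rexp) extends Rexp
case class SEQ(r1: Rexp, r2: Rexp) extends Rexp
case class STAR(r: Rexp) extends Rexp

// does r match the empty string?
def nullable(r: Rexp): Boolean = r match {
  case ZERO => false
  case ONE => true
  case CHAR(_) => false
  case ALT(r1, r2) => nullable(r1) || nullable(r2)
  case SEQ(r1, r2) => nullable(r1) && nullable(r2)
  case STAR(_) => true
}

// the derivative of r w.r.t. the character c
def der(c: Char, r: Rexp): Rexp = r match {
  case ZERO => ZERO
  case ONE => ZERO
  case CHAR(d) => if (c == d) ONE else ZERO
  case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))
  case SEQ(r1, r2) =>
    if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
    else SEQ(der(c, r1), r2)
  case STAR(r1) => SEQ(der(c, r1), STAR(r1))
}

// match s by taking successive derivatives, then testing nullability
def matcher(r: Rexp, s: List[Char]): Boolean =
  nullable(s.foldLeft(r)((r1, c) => der(c, r1)))
\end{verbatim}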
%This makes formally reasoning about these properties such
%as correctness and complexity smooth and intuitive.
There are several mechanised proofs of this property in various theorem
provers,
for example one by Owens and Slind \cite{Owens2008} in HOL4,
another one by Krauss and Nipkow \cite{Nipkow98} in Isabelle/HOL, and
yet another in Coq by Coquand and Siles \cite{Coquand2012}.

In addition, one can extend derivatives to bounded repetitions
relatively straightforwardly. For example, the derivative of
the bounded repetition $r^{\{n\}}$ can be defined as:
\begin{center}
	\begin{tabular}{lcl}
		$r^{\{n\}} \backslash c$ & $\dn$ & $(r \backslash c) \cdot
		r^{\{n-1\}} \quad (\text{when } n > 0)$\\
	\end{tabular}
\end{center}
\noindent
Experimental results suggest that unlike DFA-based solutions
for bounded regular expressions,
derivatives can cope with
large counters
quite well.
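As a sketch of how this clause fits into the matcher, the earlier
Scala code can be extended as follows (\texttt{NTIMES} is our own
illustrative name for the bounded-repetition constructor):
\begin{verbatim}
// bounded repetition r{n} added to the earlier sketch
case class NTIMES(r: Rexp, n: Int) extends Rexp

// r{n} is nullable if n = 0 or r itself is nullable
def nullableN(r: Rexp): Boolean = r match {
  case NTIMES(r1, n) => n == 0 || nullableN(r1)
  case ALT(r1, r2)   => nullableN(r1) || nullableN(r2)
  case SEQ(r1, r2)   => nullableN(r1) && nullableN(r2)
  case ONE | STAR(_) => true
  case _             => false              // ZERO and CHAR
}

// the NTIMES clause peels off one copy of r and decrements the
// counter; all other clauses are as before
def derN(c: Char, r: Rexp): Rexp = r match {
  case NTIMES(r1, n) =>
    if (n == 0) ZERO else SEQ(derN(c, r1), NTIMES(r1, n - 1))
  case CHAR(d)     => if (c == d) ONE else ZERO
  case ALT(r1, r2) => ALT(derN(c, r1), derN(c, r2))
  case SEQ(r1, r2) =>
    if (nullableN(r1)) ALT(SEQ(derN(c, r1), r2), derN(c, r2))
    else SEQ(derN(c, r1), r2)
  case STAR(r1)    => SEQ(derN(c, r1), STAR(r1))
  case _           => ZERO                 // ZERO and ONE
}
\end{verbatim}
The point is that the counter $n$ stays a number and is merely
decremented, rather than being expanded into $n$ explicit copies of
$r$ as in a DFA construction.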

There have also been
extensions of derivatives to other regex constructs.
For example, Owens et al.\ include derivatives
for the \emph{NOT} regular expression, which is
able to concisely express C-style comments of the form
$/* \ldots */$ (see \cite{Owens2008}).
Another extension of derivatives covers
regular expressions with look-aheads, given
by Miyazaki and Minamide
\cite{Takayuki2019}.
%We therefore use Brzozowski derivatives on regular expressions
%lexing



Given the above definitions and properties of
Brzozowski derivatives, one quickly realises their potential
for generating a formally verified lexing algorithm: the clauses and the property
can be expressed in a functional programming language
or converted to theorem prover
code with great ease.
Perhaps this is the reason why derivatives have sparked quite a bit of interest
in the functional programming and theorem prover communities in the last
fifteen or so years
(\cite{Almeidaetal10}, \cite{Berglund14}, \cite{Berglund18},
\cite{Chen12} and \cite{Coquand2012},
to name a few), despite being buried in the ``sands of time'' \cite{Owens2008}
after they were first published by Brzozowski.


However, there are two difficulties with derivative-based matchers.
First, Brzozowski's original matcher only generates a yes/no answer
for whether a regular expression matches a string or not. This is too
little information in the context of lexing where separate tokens must
be identified and also classified (for example as keywords
or identifiers).
Second, derivative-based matchers tend to be inefficient because
the sizes of derivatives can grow substantially during matching.
Elegant and beautiful
as many implementations are,
they can be excruciatingly slow.
For example, Sulzmann and Lu
claim a linear running time for their proposed algorithm,
but that claim was falsified by our experiments. The running time
is actually $\Omega(2^n)$ in the worst case.
A similar claim about a theoretical runtime of $O(n^2)$
is made for the Verbatim \cite{Verbatim}
lexer, which calculates POSIX matches and is based on derivatives.
They formalised the correctness of the lexer, but not its complexity result.
In the performance evaluation section, they analysed the run time
of matching $a$ with the string
\begin{center}
	$\underbrace{a \ldots a}_{\text{$n$ $a$'s}}$.
\end{center}
\noindent
They concluded that the algorithm is quadratic in terms of
the length of the input string.
When we tried out their extracted OCaml code with the example $(a+aa)^*$,
the time it took to match a string of 40 $a$'s was approximately 5 minutes.


\subsection{Sulzmann and Lu's Algorithm}
Sulzmann and Lu~\cite{Sulzmann2014} overcame the first
problem with the yes/no answer
by cleverly extending Brzozowski's matching
algorithm. Their extended version generates additional information on
\emph{how} a regular expression matches a string following the POSIX
rules for regular expression matching. They achieve this by adding a
second ``phase'' to Brzozowski's algorithm involving an injection
function. This injection function in a sense undoes the ``damage''
of the derivatives chopping off characters.
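The overall two-phase structure can be sketched in Scala as follows
(again only an illustrative sketch with our own value datatype; the
precise definitions of \texttt{mkeps} and \texttt{inj}, together with
their intuition, are given in chapter \ref{Inj}):
\begin{verbatim}
// values record how a regular expression matched a string
abstract class Val
case object Empty extends Val
case class Chr(c: Char) extends Val
case class Sequ(v1: Val, v2: Val) extends Val
case class Left(v: Val) extends Val
case class Right(v: Val) extends Val
case class Stars(vs: List[Val]) extends Val

// mkeps: the POSIX value for how a nullable r matches []
def mkeps(r: Rexp): Val = r match {
  case ONE => Empty
  case ALT(r1, r2) =>
    if (nullable(r1)) Left(mkeps(r1)) else Right(mkeps(r2))
  case SEQ(r1, r2) => Sequ(mkeps(r1), mkeps(r2))
  case STAR(_) => Stars(Nil)
}

// inj: undo one derivative step, re-inserting the character c
// (a partial function: only the cases that can arise are listed)
def inj(r: Rexp, c: Char, v: Val): Val = (r, v) match {
  case (CHAR(_), Empty) => Chr(c)
  case (ALT(r1, _), Left(v1)) => Left(inj(r1, c, v1))
  case (ALT(_, r2), Right(v2)) => Right(inj(r2, c, v2))
  case (SEQ(r1, _), Sequ(v1, v2)) => Sequ(inj(r1, c, v1), v2)
  case (SEQ(r1, _), Left(Sequ(v1, v2))) => Sequ(inj(r1, c, v1), v2)
  case (SEQ(r1, r2), Right(v2)) => Sequ(mkeps(r1), inj(r2, c, v2))
  case (STAR(r1), Sequ(v1, Stars(vs))) => Stars(inj(r1, c, v1) :: vs)
}

// phase 1 builds derivatives; phase 2 walks back with mkeps and inj
def lex(r: Rexp, s: List[Char]): Val = s match {
  case Nil => if (nullable(r)) mkeps(r) else sys.error("not matched")
  case c :: cs => inj(r, c, lex(der(c, r), cs))
}
\end{verbatim}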
In earlier work, Ausaf et al.\ provided the formal
specification of what POSIX matching means and proved in Isabelle/HOL
the correctness
of this extended algorithm
\cite{AusafDyckhoffUrban2016}.

The version of the algorithm proven correct
suffers, however, heavily from the
second difficulty: derivatives can
grow to arbitrarily large sizes.
For example if we start with the
regular expression $(a+aa)^*$ and take
successive derivatives according to the character $a$, we end up with
a sequence of ever-growing derivatives like

\def\ll{\stackrel{\_\backslash{} a}{\longrightarrow}}
\begin{center}
\begin{tabular}{rll}
$(a + aa)^*$ & $\ll$ & $(\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & $(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* \;+\; (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & $(\ZERO + \ZERO{}a + \ZERO) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^* \;+\; $\\
& & $\qquad(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & \ldots \hspace{15mm}(regular expressions of sizes 98, 169, 283, 468, 767, \ldots)
\end{tabular}
\end{center}

\noindent where after around 35 steps we usually run out of memory on a
typical computer. Clearly, the
notation involving $\ZERO$s and $\ONE$s already suggests
simplification rules that can be applied to regular
expressions, for example $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r
\Rightarrow r$, $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow
r$. While such simple-minded simplifications have been proved in
the work by Ausaf et al.\ to preserve the correctness of Sulzmann and Lu's
algorithm \cite{AusafDyckhoffUrban2016}, they unfortunately do
\emph{not} help with limiting the growth of the derivatives shown
above: the growth is slowed, but the derivatives can still grow rather
quickly beyond any finite bound.
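To illustrate, these rewrite rules (plus their symmetric variants) can
be implemented along the following lines in Scala (a sketch;
\texttt{simp} and \texttt{matcherSimp} are our own names):
\begin{verbatim}
// bottom-up application of the simplification rules above
def simp(r: Rexp): Rexp = r match {
  case SEQ(r1, r2) => (simp(r1), simp(r2)) match {
    case (ZERO, _)  => ZERO               // 0 . r  =>  0
    case (_, ZERO)  => ZERO               // r . 0  =>  0
    case (ONE, s2)  => s2                 // 1 . r  =>  r
    case (s1, ONE)  => s1                 // r . 1  =>  r
    case (s1, s2)   => SEQ(s1, s2)
  }
  case ALT(r1, r2) => (simp(r1), simp(r2)) match {
    case (ZERO, s2) => s2                 // 0 + r  =>  r
    case (s1, ZERO) => s1                 // r + 0  =>  r
    case (s1, s2)   => if (s1 == s2) s1 else ALT(s1, s2)  // r + r => r
  }
  case _ => r
}

// interleaving simp with the derivative steps slows the growth,
// but, as explained above, does not bound it
def matcherSimp(r: Rexp, s: List[Char]): Boolean =
  nullable(s.foldLeft(r)((r1, c) => simp(der(c, r1))))
\end{verbatim}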

In this thesis we therefore look at a second
algorithm by Sulzmann and Lu, with which they
overcame this ``growth problem'' \cite{Sulzmann2014}.
In this version, POSIX values are
represented as bit sequences and such sequences are incrementally generated
when derivatives are calculated. The compact representation
of bit sequences and regular expressions allows them to define a more
``aggressive'' simplification method that keeps the size of the
derivatives finite no matter what the length of the string is.
They make some informal claims about the correctness and linear behaviour
of this version, but do not provide any supporting proof arguments, not
even ``pencil-and-paper'' arguments. They write about their bit-coded
\emph{incremental parsing method} (that is the algorithm to be formalised
in this dissertation):

\begin{quote}\it
``Correctness Claim: We further claim that the incremental parsing
method [..] in combination with the simplification steps [..]
yields POSIX parse trees. We have tested this claim
extensively [..] but yet
have to work out all proof details.'' \cite[Page 14]{Sulzmann2014}
\end{quote}
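To give a first impression of this version, the annotated regular
expressions it operates on can be sketched in Scala as follows (a
sketch only; the formal definitions appear in chapter \ref{Bitcoded1}):
\begin{verbatim}
// bits recording which alternatives were taken and how often a
// star was iterated; a POSIX value can be decoded from them
abstract class Bit
case object Z extends Bit
case object S extends Bit

// annotated regular expressions: each node carries a bit sequence
// that is extended incrementally as derivatives are calculated
abstract class ARexp
case object AZERO extends ARexp
case class AONE(bs: List[Bit]) extends ARexp
case class ACHAR(bs: List[Bit], c: Char) extends ARexp
case class AALTS(bs: List[Bit], rs: List[ARexp]) extends ARexp
case class ASEQ(bs: List[Bit], r1: ARexp, r2: ARexp) extends ARexp
case class ASTAR(bs: List[Bit], r: ARexp) extends ARexp
\end{verbatim}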
Ausaf and Urban made some initial progress towards the
full correctness proof but still had to leave out the optimisation
Sulzmann and Lu proposed.
Ausaf wrote \cite{Ausaf}:
\begin{quote}\it
``The next step would be to implement a more aggressive simplification procedure on annotated regular expressions and then prove the corresponding algorithm generates the same values as blexer. Alas due to time constraints we are unable to do so here.''
\end{quote}
This thesis implements the aggressive simplifications envisioned
by Ausaf and Urban,
together with a formal proof of the correctness of those simplifications.


One of the most recent works in the context of lexing
%with this issue
is the Verbatim lexer by Egolf, Lasser and Fisher~\cite{Verbatim}.
This is relevant work for us and we will compare it later with
the derivative-based matcher we are going to present.
There is also some newer work called
Verbatim++~\cite{Verbatimpp}, which does not use derivatives,
but deterministic finite automata instead.
We will also study this work in a later section.
%An example that gives problem to automaton approaches would be
%the regular expression $(a|b)^*a(a|b)^{\{n\}}$.
%It requires at least $2^{n+1}$ states to represent
%as a DFA.


%----------------------------------------------------------------------------------------
\section{Contribution}

In this thesis,
we propose a solution to catastrophic
backtracking and error-prone matchers: a formally verified
regular expression lexing algorithm
that is both fast
and correct.
In particular, the main problem we solved on top of previous work was
coming up with a formally verified algorithm called $\blexersimp$ based
on Brzozowski derivatives. It calculates a POSIX
lexical value from a string and a regular expression. This algorithm was originally
proposed by Sulzmann and Lu \cite{Sulzmann2014}, but we made the key observation that its $\textit{nub}$
function does not really simplify intermediate results where it needs to, and we improved the
algorithm accordingly.
We have proven that the internal data structure of $\blexersimp$ does not grow beyond a bound $N_r$
that depends only on the input regular expression $r$, thanks to the aggressive simplifications of $\blexersimp$:
\begin{theorem}
$|\blexersimp \; r \; s | \leq N_r$
\end{theorem}
The simplifications applied in each step of $\blexersimp$
keep the derivatives small, but present a
challenge in
establishing a correctness theorem of the following form:
\begin{theorem}
If the POSIX lexical value of matching regular expression $r$ with string $s$ is $v$,
then $\blexersimp\; r \; s = \Some \; v$. Otherwise
$\blexersimp \; r \; s = \None$.
\end{theorem}

The result is %a regular expression lexing algorithm that comes with
\begin{itemize}
\item
an improved version of Sulzmann and Lu's bit-coded algorithm using
derivatives with simplifications,
accompanied by
a proven correctness theorem according to the POSIX specification
given by Ausaf et al.\ \cite{AusafDyckhoffUrban2016},
\item
a complexity-related property for that algorithm saying that the
internal data structure will
remain below a finite bound,
\item
and an extension to
bounded repetition constructs with the correctness and finiteness properties
maintained.
\end{itemize}
\noindent
With a formal finiteness bound in place,
we can greatly reduce the attack surface of servers against ReDoS attacks.
The Isabelle/HOL code for our formalisation can be
found at
\begin{center}
\url{https://github.com/hellotommmy/posix}
\end{center}
Further improvements to the algorithm with an even stronger version of
simplification can be made. We conjecture that the resulting size of derivatives
can be bounded by a cubic function of the size of the regular expression.
We will give relevant code in Scala,
but do not give a formal proof of this in Isabelle/HOL.
This is still future work.



\section{Structure of the thesis}
In chapter \ref{Inj} we will introduce the concepts
and notations we
use for describing regular expressions and derivatives,
and the first version of Sulzmann and Lu's lexing algorithm without bitcodes (including
its correctness proof).
We will give their second lexing algorithm with bitcodes in chapter \ref{Bitcoded1},
together with the correctness proof by Ausaf and Urban.
Then we illustrate in chapter \ref{Bitcoded2}
how Sulzmann and Lu's
simplifications fail to simplify correctly. We therefore introduce our own version of the
algorithm with correct simplifications and
their correctness proof.
In chapter \ref{Finite} we give the second guarantee
of our bitcoded algorithm, that is a finite bound on the size of any
regular expression's derivatives.
We also show how one can extend the
algorithm to include bounded repetitions.
In chapter \ref{Cubic} we discuss stronger simplification rules which
improve the finite bound to a cubic bound. %and the NOT regular expression.
Chapter \ref{RelatedWork} introduces work related to this thesis.
Chapter \ref{Future} concludes and mentions avenues of future research.




%----------------------------------------------------------------------------------------


%----------------------------------------------------------------------------------------

%----------------------------------------------------------------------------------------

%----------------------------------------------------------------------------------------