lexing: ChengsongTanPhdThesis/Chapters/Introduction.tex@37b6fd310a16 (annotated)

532 cc54ce075db5 restructured Chengsong parents: diff changeset	1	% Chapter 1
cc54ce075db5 restructured Chengsong parents: diff changeset	2
cc54ce075db5 restructured Chengsong parents: diff changeset	3	\chapter{Introduction} % Main chapter title
cc54ce075db5 restructured Chengsong parents: diff changeset	4
cc54ce075db5 restructured Chengsong parents: diff changeset	5	\label{Introduction} % For referencing the chapter elsewhere, use \ref{Chapter1}
cc54ce075db5 restructured Chengsong parents: diff changeset	6
cc54ce075db5 restructured Chengsong parents: diff changeset	7	%----------------------------------------------------------------------------------------
cc54ce075db5 restructured Chengsong parents: diff changeset	8
cc54ce075db5 restructured Chengsong parents: diff changeset	9	% Define some commands to keep the formatting separated from the content
cc54ce075db5 restructured Chengsong parents: diff changeset	10	\newcommand{\keyword}[1]{\textbf{#1}}
cc54ce075db5 restructured Chengsong parents: diff changeset	11	\newcommand{\tabhead}[1]{\textbf{#1}}
cc54ce075db5 restructured Chengsong parents: diff changeset	12	\newcommand{\code}[1]{\texttt{#1}}
cc54ce075db5 restructured Chengsong parents: diff changeset	13	\newcommand{\file}[1]{\texttt{\bfseries#1}}
cc54ce075db5 restructured Chengsong parents: diff changeset	14	\newcommand{\option}[1]{\texttt{\itshape#1}}
cc54ce075db5 restructured Chengsong parents: diff changeset	15
cc54ce075db5 restructured Chengsong parents: diff changeset	16	%boxes
cc54ce075db5 restructured Chengsong parents: diff changeset	17	\newcommand*{\mybox}[1]{\framebox{\strut #1}}
cc54ce075db5 restructured Chengsong parents: diff changeset	18
cc54ce075db5 restructured Chengsong parents: diff changeset	19	%\newcommand{\sflataux}[1]{\textit{sflat}\_\textit{aux} \, #1}
cc54ce075db5 restructured Chengsong parents: diff changeset	20	\newcommand\sflat[1]{\llparenthesis #1 \rrparenthesis }
cc54ce075db5 restructured Chengsong parents: diff changeset	21	\newcommand{\ASEQ}[3]{\textit{ASEQ}_{#1} \, #2 \, #3}
543 b2bea5968b89 thesis_thys Chengsong parents: 542 diff changeset	22	\newcommand{\bderssimp}[2]{#1 \backslash_{bsimps} #2}
596 b306628a0eab more chap 56 Chengsong parents: 594 diff changeset	23	\newcommand{\rderssimp}[2]{#1 \backslash_{rsimps} #2}
564 3cbcd7cda0a9 more Chengsong parents: 558 diff changeset	24	\def\derssimp{\textit{ders}\_\textit{simp}}
557 812e5d112f49 more changes Chengsong parents: 556 diff changeset	25	\def\rders{\textit{rders}}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	26	\newcommand{\bders}[2]{#1 \backslash #2}
cc54ce075db5 restructured Chengsong parents: diff changeset	27	\newcommand{\bsimp}[1]{\textit{bsimp}(#1)}
591 b2d0de6aee18 more polishing integrated comments chap2 Chengsong parents: 590 diff changeset	28	\def\bsimps{\textit{bsimp}}
554 15d182ffbc76 more Chengsong parents: 543 diff changeset	29	\newcommand{\rsimp}[1]{\textit{rsimp}\; #1}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	30	\newcommand{\sflataux}[1]{\llparenthesis #1 \rrparenthesis'}
cc54ce075db5 restructured Chengsong parents: diff changeset	31	\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%
cc54ce075db5 restructured Chengsong parents: diff changeset	32	\newcommand{\denote}{\stackrel{\mbox{\scriptsize denote}}{=}}%
cc54ce075db5 restructured Chengsong parents: diff changeset	33	\newcommand{\ZERO}{\mbox{\bf 0}}
cc54ce075db5 restructured Chengsong parents: diff changeset	34	\newcommand{\ONE}{\mbox{\bf 1}}
cc54ce075db5 restructured Chengsong parents: diff changeset	35	\newcommand{\AALTS}[2]{\oplus {\scriptstyle #1}\, #2}
555 aecf1ddf3541 more Chengsong parents: 554 diff changeset	36	\newcommand{\rdistinct}[2]{\textit{rdistinct} \;\; #1 \;\; #2}
594 62f8fa03863e more Chengsong parents: 591 diff changeset	37	\def\rdistincts{\textit{rdistinct}}
556 c27f04bb2262 hello Chengsong parents: 555 diff changeset	38	\def\rDistinct{\textit{rdistinct}}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	39	\newcommand\hflat[1]{\llparenthesis #1 \rrparenthesis_*}
cc54ce075db5 restructured Chengsong parents: diff changeset	40	\newcommand\hflataux[1]{\llparenthesis #1 \rrparenthesis_*'}
cc54ce075db5 restructured Chengsong parents: diff changeset	41	\newcommand\createdByStar[1]{\textit{createdByStar}(#1)}
cc54ce075db5 restructured Chengsong parents: diff changeset	42
cc54ce075db5 restructured Chengsong parents: diff changeset	43	\newcommand\myequiv{\mathrel{\stackrel{\makebox[0pt]{\mbox{\normalfont\tiny equiv}}}{=}}}
cc54ce075db5 restructured Chengsong parents: diff changeset	44
600 fd068f39ac23 chap4 comments done Chengsong parents: 596 diff changeset	45	\def\SEQ{\textit{SEQ}}
fd068f39ac23 chap4 comments done Chengsong parents: 596 diff changeset	46	\def\SEQs{\textit{SEQs}}
564 3cbcd7cda0a9 more Chengsong parents: 558 diff changeset	47	\def\case{\textit{case}}
554 15d182ffbc76 more Chengsong parents: 543 diff changeset	48	\def\sequal{\stackrel{\mbox{\scriptsize rsimp}}{=}}
15d182ffbc76 more Chengsong parents: 543 diff changeset	49	\def\rsimpalts{\textit{rsimp}_{ALTS}}
15d182ffbc76 more Chengsong parents: 543 diff changeset	50	\def\good{\textit{good}}
15d182ffbc76 more Chengsong parents: 543 diff changeset	51	\def\btrue{\textit{true}}
15d182ffbc76 more Chengsong parents: 543 diff changeset	52	\def\bfalse{\textit{false}}
542 a7344c9afbaf chapter3 finished Chengsong parents: 538 diff changeset	53	\def\bnullable{\textit{bnullable}}
543 b2bea5968b89 thesis_thys Chengsong parents: 542 diff changeset	54	\def\bnullables{\textit{bnullables}}
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	55	\def\Some{\textit{Some}}
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	56	\def\None{\textit{None}}
537 50e590823220 more Chengsong parents: 532 diff changeset	57	\def\code{\textit{code}}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	58	\def\decode{\textit{decode}}
cc54ce075db5 restructured Chengsong parents: diff changeset	59	\def\internalise{\textit{internalise}}
cc54ce075db5 restructured Chengsong parents: diff changeset	60	\def\lexer{\mathit{lexer}}
cc54ce075db5 restructured Chengsong parents: diff changeset	61	\def\mkeps{\textit{mkeps}}
557 812e5d112f49 more changes Chengsong parents: 556 diff changeset	62	\newcommand{\rder}[2]{#2 \backslash_r #1}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	63
585 4969ef817d92 chap4 more Chengsong parents: 579 diff changeset	64	\def\rerases{\textit{rerase}}
4969ef817d92 chap4 more Chengsong parents: 579 diff changeset	65
554 15d182ffbc76 more Chengsong parents: 543 diff changeset	66	\def\nonnested{\textit{nonnested}}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	67	\def\AZERO{\textit{AZERO}}
558 671a83abccf3 haha Chengsong parents: 557 diff changeset	68	\def\sizeNregex{\textit{sizeNregex}}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	69	\def\AONE{\textit{AONE}}
cc54ce075db5 restructured Chengsong parents: diff changeset	70	\def\ACHAR{\textit{ACHAR}}
cc54ce075db5 restructured Chengsong parents: diff changeset	71
585 4969ef817d92 chap4 more Chengsong parents: 579 diff changeset	72	\def\simpsulz{\textit{simp}_{Sulz}}
4969ef817d92 chap4 more Chengsong parents: 579 diff changeset	73
557 812e5d112f49 more changes Chengsong parents: 556 diff changeset	74	\def\scfrewrites{\stackrel{*}{\rightsquigarrow_{scf}}}
555 aecf1ddf3541 more Chengsong parents: 554 diff changeset	75	\def\frewrite{\rightsquigarrow_f}
aecf1ddf3541 more Chengsong parents: 554 diff changeset	76	\def\hrewrite{\rightsquigarrow_h}
aecf1ddf3541 more Chengsong parents: 554 diff changeset	77	\def\grewrite{\rightsquigarrow_g}
aecf1ddf3541 more Chengsong parents: 554 diff changeset	78	\def\frewrites{\stackrel{*}{\rightsquigarrow_f}}
aecf1ddf3541 more Chengsong parents: 554 diff changeset	79	\def\hrewrites{\stackrel{*}{\rightsquigarrow_h}}
aecf1ddf3541 more Chengsong parents: 554 diff changeset	80	\def\grewrites{\stackrel{*}{\rightsquigarrow_g}}
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	81	\def\fuse{\textit{fuse}}
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	82	\def\bder{\textit{bder}}
542 a7344c9afbaf chapter3 finished Chengsong parents: 538 diff changeset	83	\def\der{\textit{der}}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	84	\def\POSIX{\textit{POSIX}}
cc54ce075db5 restructured Chengsong parents: diff changeset	85	\def\ALTS{\textit{ALTS}}
cc54ce075db5 restructured Chengsong parents: diff changeset	86	\def\ASTAR{\textit{ASTAR}}
cc54ce075db5 restructured Chengsong parents: diff changeset	87	\def\DFA{\textit{DFA}}
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	88	\def\NFA{\textit{NFA}}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	89	\def\bmkeps{\textit{bmkeps}}
543 b2bea5968b89 thesis_thys Chengsong parents: 542 diff changeset	90	\def\bmkepss{\textit{bmkepss}}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	91	\def\retrieve{\textit{retrieve}}
cc54ce075db5 restructured Chengsong parents: diff changeset	92	\def\blexer{\textit{blexer}}
cc54ce075db5 restructured Chengsong parents: diff changeset	93	\def\flex{\textit{flex}}
573 454ced557605 chapter2 finished polishing Chengsong parents: 567 diff changeset	94	\def\inj{\textit{inj}}
564 3cbcd7cda0a9 more Chengsong parents: 558 diff changeset	95	\def\Empty{\textit{Empty}}
567 28cb8089ec36 more updaates Chengsong parents: 564 diff changeset	96	\def\Left{\textit{Left}}
28cb8089ec36 more updaates Chengsong parents: 564 diff changeset	97	\def\Right{\textit{Right}}
573 454ced557605 chapter2 finished polishing Chengsong parents: 567 diff changeset	98	\def\Stars{\textit{Stars}}
454ced557605 chapter2 finished polishing Chengsong parents: 567 diff changeset	99	\def\Char{\textit{Char}}
454ced557605 chapter2 finished polishing Chengsong parents: 567 diff changeset	100	\def\Seq{\textit{Seq}}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	101	\def\Der{\textit{Der}}
cc54ce075db5 restructured Chengsong parents: diff changeset	102	\def\Ders{\textit{Ders}}
cc54ce075db5 restructured Chengsong parents: diff changeset	103	\def\nullable{\mathit{nullable}}
cc54ce075db5 restructured Chengsong parents: diff changeset	104	\def\Z{\mathit{Z}}
cc54ce075db5 restructured Chengsong parents: diff changeset	105	\def\S{\mathit{S}}
cc54ce075db5 restructured Chengsong parents: diff changeset	106	\def\rup{r^\uparrow}
cc54ce075db5 restructured Chengsong parents: diff changeset	107	%\def\bderssimp{\mathit{bders}\_\mathit{simp}}
cc54ce075db5 restructured Chengsong parents: diff changeset	108	\def\distinctWith{\textit{distinctWith}}
cc54ce075db5 restructured Chengsong parents: diff changeset	109	\def\lf{\textit{lf}}
cc54ce075db5 restructured Chengsong parents: diff changeset	110	\def\PD{\textit{PD}}
cc54ce075db5 restructured Chengsong parents: diff changeset	111	\def\suffix{\textit{Suffix}}
543 b2bea5968b89 thesis_thys Chengsong parents: 542 diff changeset	112	\def\distinctBy{\textit{distinctBy}}
558 671a83abccf3 haha Chengsong parents: 557 diff changeset	113	\def\starupdate{\textit{starUpdate}}
671a83abccf3 haha Chengsong parents: 557 diff changeset	114	\def\starupdates{\textit{starUpdates}}
671a83abccf3 haha Chengsong parents: 557 diff changeset	115
532 cc54ce075db5 restructured Chengsong parents: diff changeset	116
cc54ce075db5 restructured Chengsong parents: diff changeset	117	\def\size{\mathit{size}}
cc54ce075db5 restructured Chengsong parents: diff changeset	118	\def\rexp{\mathbf{rexp}}
cc54ce075db5 restructured Chengsong parents: diff changeset	119	\def\simp{\mathit{simp}}
cc54ce075db5 restructured Chengsong parents: diff changeset	120	\def\simpALTs{\mathit{simp}\_\mathit{ALTs}}
cc54ce075db5 restructured Chengsong parents: diff changeset	121	\def\map{\mathit{map}}
cc54ce075db5 restructured Chengsong parents: diff changeset	122	\def\distinct{\mathit{distinct}}
cc54ce075db5 restructured Chengsong parents: diff changeset	123	\def\blexersimp{\mathit{blexer}\_\mathit{simp}}
590 988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	124	\def\blexerStrong{\textit{blexerStrong}}
988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	125	\def\bsimpStrong{\textit{bsimpStrong}}
591 b2d0de6aee18 more polishing integrated comments chap2 Chengsong parents: 590 diff changeset	126	\def\bdersStrongs{\textit{bdersStrong}}
590 988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	127	\newcommand{\bdersStrong}[2]{#1 \backslash_{bsimpStrongs} #2}
988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	128
532 cc54ce075db5 restructured Chengsong parents: diff changeset	129	\def\map{\textit{map}}
cc54ce075db5 restructured Chengsong parents: diff changeset	130	\def\rrexp{\textit{rrexp}}
554 15d182ffbc76 more Chengsong parents: 543 diff changeset	131	\newcommand\rnullable[1]{\textit{rnullable} \; #1 }
532 cc54ce075db5 restructured Chengsong parents: diff changeset	132	\newcommand\rsize[1]{\llbracket #1 \rrbracket_r}
cc54ce075db5 restructured Chengsong parents: diff changeset	133	\newcommand\asize[1]{\llbracket #1 \rrbracket}
543 b2bea5968b89 thesis_thys Chengsong parents: 542 diff changeset	134	\newcommand\rerase[1]{ (#1)_{\downarrow_r}}
b2bea5968b89 thesis_thys Chengsong parents: 542 diff changeset	135
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	136	\newcommand\ChristianComment[1]{\textcolor{blue}{#1}\\}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	137
543 b2bea5968b89 thesis_thys Chengsong parents: 542 diff changeset	138
b2bea5968b89 thesis_thys Chengsong parents: 542 diff changeset	139	\def\rflts{\textit{rflts}}
b2bea5968b89 thesis_thys Chengsong parents: 542 diff changeset	140	\def\rrewrite{\textit{rrewrite}}
b2bea5968b89 thesis_thys Chengsong parents: 542 diff changeset	141	\def\bsimpalts{\textit{bsimp}_{ALTS}}
596 b306628a0eab more chap 56 Chengsong parents: 594 diff changeset	142	\def\bsimpaseq{\textit{bsimp}_{ASEQ}}
b306628a0eab more chap 56 Chengsong parents: 594 diff changeset	143	\def\rsimlalts{\textit{rsimp}_{ALTs}}
b306628a0eab more chap 56 Chengsong parents: 594 diff changeset	144	\def\rsimpseq{\textit{rsimp}_{SEQ}}
543 b2bea5968b89 thesis_thys Chengsong parents: 542 diff changeset	145
532 cc54ce075db5 restructured Chengsong parents: diff changeset	146	\def\erase{\textit{erase}}
cc54ce075db5 restructured Chengsong parents: diff changeset	147	\def\STAR{\textit{STAR}}
cc54ce075db5 restructured Chengsong parents: diff changeset	148	\def\flts{\textit{flts}}
cc54ce075db5 restructured Chengsong parents: diff changeset	149
cc54ce075db5 restructured Chengsong parents: diff changeset	150
579 35df9cdd36ca more chap3 Chengsong parents: 573 diff changeset	151	\def\zeroable{\textit{zeroable}}
35df9cdd36ca more chap3 Chengsong parents: 573 diff changeset	152	\def\nub{\textit{nub}}
35df9cdd36ca more chap3 Chengsong parents: 573 diff changeset	153	\def\filter{\textit{filter}}
601 ce4e5151a836 more Chengsong parents: 600 diff changeset	154	%\def\not{\textit{not}}
579 35df9cdd36ca more chap3 Chengsong parents: 573 diff changeset	155
35df9cdd36ca more chap3 Chengsong parents: 573 diff changeset	156
35df9cdd36ca more chap3 Chengsong parents: 573 diff changeset	157
532 cc54ce075db5 restructured Chengsong parents: diff changeset	158	\def\RZERO{\mathbf{0}_r }
cc54ce075db5 restructured Chengsong parents: diff changeset	159	\def\RONE{\mathbf{1}_r}
cc54ce075db5 restructured Chengsong parents: diff changeset	160	\newcommand\RCHAR[1]{\mathbf{#1}_r}
cc54ce075db5 restructured Chengsong parents: diff changeset	161	\newcommand\RSEQ[2]{#1 \cdot #2}
558 671a83abccf3 haha Chengsong parents: 557 diff changeset	162	\newcommand\RALTS[1]{\sum #1}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	163	\newcommand\RSTAR[1]{#1^*}
558 671a83abccf3 haha Chengsong parents: 557 diff changeset	164	\newcommand\vsuf[2]{\textit{Suffix} \;#1\;#2}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	165
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	166
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	167
590 988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	168
988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	169	\lstdefinestyle{myScalastyle}{
988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	170	frame=tb,
988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	171	language=scala,
988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	172	aboveskip=3mm,
988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	173	belowskip=3mm,
988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	174	showstringspaces=false,
988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	175	columns=flexible,
988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	176	basicstyle={\small\ttfamily},
988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	177	numbers=none,
988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	178	numberstyle=\tiny\color{gray},
988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	179	keywordstyle=\color{blue},
988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	180	commentstyle=\color{dkgreen},
988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	181	stringstyle=\color{mauve},
988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	182	frame=single,
988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	183	breaklines=true,
988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	184	breakatwhitespace=true,
988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	185	tabsize=3,
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	186	}
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	187
590 988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	188
532 cc54ce075db5 restructured Chengsong parents: diff changeset	189	%----------------------------------------------------------------------------------------
cc54ce075db5 restructured Chengsong parents: diff changeset	190	%This part is about regular expressions, Brzozowski derivatives,
cc54ce075db5 restructured Chengsong parents: diff changeset	191	%and a bit-coded lexing algorithm with proven correctness and time bounds.
cc54ce075db5 restructured Chengsong parents: diff changeset	192
cc54ce075db5 restructured Chengsong parents: diff changeset	193	%TODO: look up snort rules to use here--give readers idea of what regexes look like
cc54ce075db5 restructured Chengsong parents: diff changeset	194
601 ce4e5151a836 more Chengsong parents: 600 diff changeset	195
ce4e5151a836 more Chengsong parents: 600 diff changeset	196
ce4e5151a836 more Chengsong parents: 600 diff changeset	197
ce4e5151a836 more Chengsong parents: 600 diff changeset	198
ce4e5151a836 more Chengsong parents: 600 diff changeset	199
ce4e5151a836 more Chengsong parents: 600 diff changeset	200	Regular expressions are widely used in computer science:
ce4e5151a836 more Chengsong parents: 600 diff changeset	201	be it in text-editors \parencite{atomEditor} with syntax highlighting and auto-completion;
ce4e5151a836 more Chengsong parents: 600 diff changeset	202	command-line tools like $\mathit{grep}$ that facilitate easy
ce4e5151a836 more Chengsong parents: 600 diff changeset	203	text-processing; network intrusion
ce4e5151a836 more Chengsong parents: 600 diff changeset	204	detection systems that reject suspicious traffic; or compiler
ce4e5151a836 more Chengsong parents: 600 diff changeset	205	front ends--the majority of the solutions to these tasks
ce4e5151a836 more Chengsong parents: 600 diff changeset	206	involve lexing with regular
ce4e5151a836 more Chengsong parents: 600 diff changeset	207	expressions.
ce4e5151a836 more Chengsong parents: 600 diff changeset	208	Given its usefulness and ubiquity, one would imagine that
ce4e5151a836 more Chengsong parents: 600 diff changeset	209	modern regular expression matching implementations
ce4e5151a836 more Chengsong parents: 600 diff changeset	210	are mature and fully studied.
602 46db6ae66448 chap1 Chengsong parents: 601 diff changeset	211	Indeed, in a popular programming language's regex engine,
46db6ae66448 chap1 Chengsong parents: 601 diff changeset	212	supplying it with regular expressions and strings,
46db6ae66448 chap1 Chengsong parents: 601 diff changeset	213	in most cases one can
46db6ae66448 chap1 Chengsong parents: 601 diff changeset	214	get the matching information in a very short time.
46db6ae66448 chap1 Chengsong parents: 601 diff changeset	215	Those matchers can be blindingly fast--some
46db6ae66448 chap1 Chengsong parents: 601 diff changeset	216	network intrusion detection systems
601 ce4e5151a836 more Chengsong parents: 600 diff changeset	217	use regex engines that are able to process
ce4e5151a836 more Chengsong parents: 600 diff changeset	218	megabytes or even gigabytes of data per second \parencite{Turo_ov__2020}.
602 46db6ae66448 chap1 Chengsong parents: 601 diff changeset	219	However, those matchers can exhibit a surprising security vulnerability
46db6ae66448 chap1 Chengsong parents: 601 diff changeset	220	under a certain class of inputs.
46db6ae66448 chap1 Chengsong parents: 601 diff changeset	221	%However, , this is not the case for $\mathbf{all}$ inputs.
601 ce4e5151a836 more Chengsong parents: 600 diff changeset	222	%TODO: get source for SNORT/BRO's regex matching engine/speed
ce4e5151a836 more Chengsong parents: 600 diff changeset	223
603 370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	224
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	225	Take $(a^)^\,b$ and ask whether
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	226	strings of the form $aa..a$ match this regular
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	227	expression. Obviously this is not the case---the expected $b$ in the last
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	228	position is missing. One would expect that modern regular expression
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	229	matching engines can find this out very quickly. Alas, if one tries
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	230	this example in JavaScript, Python or Java 8, even with strings of a small
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	231	length, say around 30 $a$'s,
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	232	the decision takes crazy time to finish (graph \ref{fig:aStarStarb}).
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	233	This is clearly exponential behaviour, and
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	234	is triggered by some relatively simple regex patterns.
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	235	Java 9 and newer
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	236	versions improves this behaviour, but is still slow compared
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	237	with the approach we are going to use.
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	238
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	239
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	240
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	241
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	242	This superlinear blowup in regular expression engines
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	243	had repeatedly caused grief in real life that they
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	244	get a name for them--``catastrophic backtracking''.
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	245	For example, on 20 July 2016 one evil
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	246	regular expression brought the webpage
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	247	\href{http://stackexchange.com}{Stack Exchange} to its
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	248	knees.\footnote{\url{https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}(Last accessed in 2019)}
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	249	In this instance, a regular expression intended to just trim white
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	250	spaces from the beginning and the end of a line actually consumed
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	251	massive amounts of CPU resources---causing web servers to grind to a
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	252	halt. In this example, the time needed to process
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	253	the string was $O(n^2)$ with respect to the string length. This
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	254	quadratic overhead was enough for the homepage of Stack Exchange to
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	255	respond so slowly that the load balancer assumed a $\mathit{DoS}$
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	256	attack and therefore stopped the servers from responding to any
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	257	requests. This made the whole site become unavailable.
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	258
601 ce4e5151a836 more Chengsong parents: 600 diff changeset	259	\begin{figure}[p]
532 cc54ce075db5 restructured Chengsong parents: diff changeset	260	\begin{tabular}{@{}c@{\hspace{0mm}}c@{\hspace{0mm}}c@{}}
cc54ce075db5 restructured Chengsong parents: diff changeset	261	\begin{tikzpicture}
cc54ce075db5 restructured Chengsong parents: diff changeset	262	\begin{axis}[
cc54ce075db5 restructured Chengsong parents: diff changeset	263	xlabel={$n$},
cc54ce075db5 restructured Chengsong parents: diff changeset	264	x label style={at={(1.05,-0.05)}},
cc54ce075db5 restructured Chengsong parents: diff changeset	265	ylabel={time in secs},
cc54ce075db5 restructured Chengsong parents: diff changeset	266	enlargelimits=false,
cc54ce075db5 restructured Chengsong parents: diff changeset	267	xtick={0,5,...,30},
cc54ce075db5 restructured Chengsong parents: diff changeset	268	xmax=33,
cc54ce075db5 restructured Chengsong parents: diff changeset	269	ymax=35,
cc54ce075db5 restructured Chengsong parents: diff changeset	270	ytick={0,5,...,30},
cc54ce075db5 restructured Chengsong parents: diff changeset	271	scaled ticks=false,
cc54ce075db5 restructured Chengsong parents: diff changeset	272	axis lines=left,
cc54ce075db5 restructured Chengsong parents: diff changeset	273	width=5cm,
cc54ce075db5 restructured Chengsong parents: diff changeset	274	height=4cm,
cc54ce075db5 restructured Chengsong parents: diff changeset	275	legend entries={JavaScript},
cc54ce075db5 restructured Chengsong parents: diff changeset	276	legend pos=north west,
cc54ce075db5 restructured Chengsong parents: diff changeset	277	legend cell align=left]
cc54ce075db5 restructured Chengsong parents: diff changeset	278	\addplot[red,mark=*, mark options={fill=white}] table {re-js.data};
cc54ce075db5 restructured Chengsong parents: diff changeset	279	\end{axis}
cc54ce075db5 restructured Chengsong parents: diff changeset	280	\end{tikzpicture}
cc54ce075db5 restructured Chengsong parents: diff changeset	281	&
cc54ce075db5 restructured Chengsong parents: diff changeset	282	\begin{tikzpicture}
cc54ce075db5 restructured Chengsong parents: diff changeset	283	\begin{axis}[
cc54ce075db5 restructured Chengsong parents: diff changeset	284	xlabel={$n$},
cc54ce075db5 restructured Chengsong parents: diff changeset	285	x label style={at={(1.05,-0.05)}},
cc54ce075db5 restructured Chengsong parents: diff changeset	286	%ylabel={time in secs},
cc54ce075db5 restructured Chengsong parents: diff changeset	287	enlargelimits=false,
cc54ce075db5 restructured Chengsong parents: diff changeset	288	xtick={0,5,...,30},
cc54ce075db5 restructured Chengsong parents: diff changeset	289	xmax=33,
cc54ce075db5 restructured Chengsong parents: diff changeset	290	ymax=35,
cc54ce075db5 restructured Chengsong parents: diff changeset	291	ytick={0,5,...,30},
cc54ce075db5 restructured Chengsong parents: diff changeset	292	scaled ticks=false,
cc54ce075db5 restructured Chengsong parents: diff changeset	293	axis lines=left,
cc54ce075db5 restructured Chengsong parents: diff changeset	294	width=5cm,
cc54ce075db5 restructured Chengsong parents: diff changeset	295	height=4cm,
cc54ce075db5 restructured Chengsong parents: diff changeset	296	legend entries={Python},
cc54ce075db5 restructured Chengsong parents: diff changeset	297	legend pos=north west,
cc54ce075db5 restructured Chengsong parents: diff changeset	298	legend cell align=left]
cc54ce075db5 restructured Chengsong parents: diff changeset	299	\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
cc54ce075db5 restructured Chengsong parents: diff changeset	300	\end{axis}
cc54ce075db5 restructured Chengsong parents: diff changeset	301	\end{tikzpicture}
cc54ce075db5 restructured Chengsong parents: diff changeset	302	&
cc54ce075db5 restructured Chengsong parents: diff changeset	303	\begin{tikzpicture}
cc54ce075db5 restructured Chengsong parents: diff changeset	304	\begin{axis}[
cc54ce075db5 restructured Chengsong parents: diff changeset	305	xlabel={$n$},
cc54ce075db5 restructured Chengsong parents: diff changeset	306	x label style={at={(1.05,-0.05)}},
cc54ce075db5 restructured Chengsong parents: diff changeset	307	%ylabel={time in secs},
cc54ce075db5 restructured Chengsong parents: diff changeset	308	enlargelimits=false,
cc54ce075db5 restructured Chengsong parents: diff changeset	309	xtick={0,5,...,30},
cc54ce075db5 restructured Chengsong parents: diff changeset	310	xmax=33,
cc54ce075db5 restructured Chengsong parents: diff changeset	311	ymax=35,
cc54ce075db5 restructured Chengsong parents: diff changeset	312	ytick={0,5,...,30},
cc54ce075db5 restructured Chengsong parents: diff changeset	313	scaled ticks=false,
cc54ce075db5 restructured Chengsong parents: diff changeset	314	axis lines=left,
cc54ce075db5 restructured Chengsong parents: diff changeset	315	width=5cm,
cc54ce075db5 restructured Chengsong parents: diff changeset	316	height=4cm,
cc54ce075db5 restructured Chengsong parents: diff changeset	317	legend entries={Java 8},
cc54ce075db5 restructured Chengsong parents: diff changeset	318	legend pos=north west,
cc54ce075db5 restructured Chengsong parents: diff changeset	319	legend cell align=left]
cc54ce075db5 restructured Chengsong parents: diff changeset	320	\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
cc54ce075db5 restructured Chengsong parents: diff changeset	321	\end{axis}
cc54ce075db5 restructured Chengsong parents: diff changeset	322	\end{tikzpicture}\\
601 ce4e5151a836 more Chengsong parents: 600 diff changeset	323	\begin{tikzpicture}
ce4e5151a836 more Chengsong parents: 600 diff changeset	324	\begin{axis}[
ce4e5151a836 more Chengsong parents: 600 diff changeset	325	xlabel={$n$},
ce4e5151a836 more Chengsong parents: 600 diff changeset	326	x label style={at={(1.05,-0.05)}},
ce4e5151a836 more Chengsong parents: 600 diff changeset	327	ylabel={time in secs},
ce4e5151a836 more Chengsong parents: 600 diff changeset	328	enlargelimits=false,
ce4e5151a836 more Chengsong parents: 600 diff changeset	329	xtick={0,5,...,30},
ce4e5151a836 more Chengsong parents: 600 diff changeset	330	xmax=33,
ce4e5151a836 more Chengsong parents: 600 diff changeset	331	ymax=35,
ce4e5151a836 more Chengsong parents: 600 diff changeset	332	ytick={0,5,...,30},
ce4e5151a836 more Chengsong parents: 600 diff changeset	333	scaled ticks=false,
ce4e5151a836 more Chengsong parents: 600 diff changeset	334	axis lines=left,
ce4e5151a836 more Chengsong parents: 600 diff changeset	335	width=5cm,
ce4e5151a836 more Chengsong parents: 600 diff changeset	336	height=4cm,
ce4e5151a836 more Chengsong parents: 600 diff changeset	337	legend entries={Dart},
ce4e5151a836 more Chengsong parents: 600 diff changeset	338	legend pos=north west,
ce4e5151a836 more Chengsong parents: 600 diff changeset	339	legend cell align=left]
ce4e5151a836 more Chengsong parents: 600 diff changeset	340	\addplot[green,mark=*, mark options={fill=white}] table {re-dart.data};
ce4e5151a836 more Chengsong parents: 600 diff changeset	341	\end{axis}
ce4e5151a836 more Chengsong parents: 600 diff changeset	342	\end{tikzpicture}
ce4e5151a836 more Chengsong parents: 600 diff changeset	343	&
ce4e5151a836 more Chengsong parents: 600 diff changeset	344	\begin{tikzpicture}
ce4e5151a836 more Chengsong parents: 600 diff changeset	345	\begin{axis}[
ce4e5151a836 more Chengsong parents: 600 diff changeset	346	xlabel={$n$},
ce4e5151a836 more Chengsong parents: 600 diff changeset	347	x label style={at={(1.05,-0.05)}},
ce4e5151a836 more Chengsong parents: 600 diff changeset	348	%ylabel={time in secs},
ce4e5151a836 more Chengsong parents: 600 diff changeset	349	enlargelimits=false,
ce4e5151a836 more Chengsong parents: 600 diff changeset	350	xtick={0,5,...,30},
ce4e5151a836 more Chengsong parents: 600 diff changeset	351	xmax=33,
ce4e5151a836 more Chengsong parents: 600 diff changeset	352	ymax=35,
ce4e5151a836 more Chengsong parents: 600 diff changeset	353	ytick={0,5,...,30},
ce4e5151a836 more Chengsong parents: 600 diff changeset	354	scaled ticks=false,
ce4e5151a836 more Chengsong parents: 600 diff changeset	355	axis lines=left,
ce4e5151a836 more Chengsong parents: 600 diff changeset	356	width=5cm,
ce4e5151a836 more Chengsong parents: 600 diff changeset	357	height=4cm,
ce4e5151a836 more Chengsong parents: 600 diff changeset	358	legend entries={Swift},
ce4e5151a836 more Chengsong parents: 600 diff changeset	359	legend pos=north west,
ce4e5151a836 more Chengsong parents: 600 diff changeset	360	legend cell align=left]
ce4e5151a836 more Chengsong parents: 600 diff changeset	361	\addplot[purple,mark=*, mark options={fill=white}] table {re-swift.data};
ce4e5151a836 more Chengsong parents: 600 diff changeset	362	\end{axis}
ce4e5151a836 more Chengsong parents: 600 diff changeset	363	\end{tikzpicture}
ce4e5151a836 more Chengsong parents: 600 diff changeset	364	& \\
ce4e5151a836 more Chengsong parents: 600 diff changeset	365	\multicolumn{3}{c}{Graphs}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	366	\end{tabular}
601 ce4e5151a836 more Chengsong parents: 600 diff changeset	367	\caption{Graphs showing runtime for matching $(a^)^\,b$ with strings
ce4e5151a836 more Chengsong parents: 600 diff changeset	368	of the form $\protect\underbrace{aa..a}_{n}$ in various existing regular expression libraries.
ce4e5151a836 more Chengsong parents: 600 diff changeset	369	The reason for their superlinear behaviour is that they do a depth-first-search.
ce4e5151a836 more Chengsong parents: 600 diff changeset	370	If the string does not match, the engine starts to explore all possibilities.
ce4e5151a836 more Chengsong parents: 600 diff changeset	371	}\label{fig:aStarStarb}
ce4e5151a836 more Chengsong parents: 600 diff changeset	372	\end{figure}\afterpage{\clearpage}
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	373
532 cc54ce075db5 restructured Chengsong parents: diff changeset	374	A more recent example is a global outage of all Cloudflare servers on 2 July
cc54ce075db5 restructured Chengsong parents: diff changeset	375	2019. A poorly written regular expression exhibited exponential
cc54ce075db5 restructured Chengsong parents: diff changeset	376	behaviour and exhausted CPUs that serve HTTP traffic. Although the outage
cc54ce075db5 restructured Chengsong parents: diff changeset	377	had several causes, at the heart was a regular expression that
cc54ce075db5 restructured Chengsong parents: diff changeset	378	was used to monitor network
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	379	traffic.\footnote{\url{https://blog.cloudflare.com/details-of-the-cloudflare-outage-on-july-2-2019/}(Last accessed in 2022)}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	380	These problems with regular expressions
cc54ce075db5 restructured Chengsong parents: diff changeset	381	are not isolated events that happen
cc54ce075db5 restructured Chengsong parents: diff changeset	382	very occasionally, but actually widespread.
cc54ce075db5 restructured Chengsong parents: diff changeset	383	They occur so often that they get a
cc54ce075db5 restructured Chengsong parents: diff changeset	384	name--Regular-Expression-Denial-Of-Service (ReDoS)
cc54ce075db5 restructured Chengsong parents: diff changeset	385	attack.
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	386	\citeauthor{Davis18} detected more
532 cc54ce075db5 restructured Chengsong parents: diff changeset	387	than 1000 super-linear (SL) regular expressions
cc54ce075db5 restructured Chengsong parents: diff changeset	388	in Node.js, Python core libraries, and npm and pypi.
cc54ce075db5 restructured Chengsong parents: diff changeset	389	They therefore concluded that evil regular expressions
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	390	are problems "more than a parlour trick", but one that
532 cc54ce075db5 restructured Chengsong parents: diff changeset	391	requires
cc54ce075db5 restructured Chengsong parents: diff changeset	392	more research attention.
cc54ce075db5 restructured Chengsong parents: diff changeset	393
603 370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	394	This work aims to address this issue
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	395	with the help of formal proofs.
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	396	We offer a lexing algorithm based
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	397	on Brzozowski derivatives with certified correctness (in
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	398	Isabelle/HOL)
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	399	and finiteness property.
604 16d67f9c07d4 more Chengsong parents: 603 diff changeset	400	Such properties guarantee the absence of
603 370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	401	catastrophic backtracking in most cases.
604 16d67f9c07d4 more Chengsong parents: 603 diff changeset	402	We will give more details in the next sections
16d67f9c07d4 more Chengsong parents: 603 diff changeset	403	on (i) why the slow cases in graph \ref{fig:aStarStarb}
16d67f9c07d4 more Chengsong parents: 603 diff changeset	404	can occur
16d67f9c07d4 more Chengsong parents: 603 diff changeset	405	and (ii) why we choose our
16d67f9c07d4 more Chengsong parents: 603 diff changeset	406	approach (Brzozowski derivatives and formal proofs).
602 46db6ae66448 chap1 Chengsong parents: 601 diff changeset	407
603 370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	408
605 ed53ce26ecb6 more Chengsong parents: 604 diff changeset	409	\section{Regex, and the Problems with Regex Matchers}
601 ce4e5151a836 more Chengsong parents: 600 diff changeset	410	Regular expressions and regular expression matchers
ce4e5151a836 more Chengsong parents: 600 diff changeset	411	have of course been studied for many, many years.
605 ed53ce26ecb6 more Chengsong parents: 604 diff changeset	412	Theoretical results in automata theory say
604 16d67f9c07d4 more Chengsong parents: 603 diff changeset	413	that basic regular expression matching should be linear
605 ed53ce26ecb6 more Chengsong parents: 604 diff changeset	414	w.r.t the input.
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	415	This assumes that the regular expression
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	416	$r$ was pre-processed and turned into a
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	417	deterministic finite automata (DFA) before matching,
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	418	which could be exponential\cite{Sakarovitch2009}.
604 16d67f9c07d4 more Chengsong parents: 603 diff changeset	419	By basic we mean textbook definitions such as the one
16d67f9c07d4 more Chengsong parents: 603 diff changeset	420	below, involving only characters, alternatives,
16d67f9c07d4 more Chengsong parents: 603 diff changeset	421	sequences, and Kleene stars:
16d67f9c07d4 more Chengsong parents: 603 diff changeset	422	\[
16d67f9c07d4 more Chengsong parents: 603 diff changeset	423	r ::= \ZERO \| \ONE \| c \| r_1 + r_2 \| r_1 \cdot r_2 \| r^*
16d67f9c07d4 more Chengsong parents: 603 diff changeset	424	\]
16d67f9c07d4 more Chengsong parents: 603 diff changeset	425	Modern regular expression matchers used by programmers,
16d67f9c07d4 more Chengsong parents: 603 diff changeset	426	however,
16d67f9c07d4 more Chengsong parents: 603 diff changeset	427	support richer constructs such as bounded repetitions
16d67f9c07d4 more Chengsong parents: 603 diff changeset	428	and back-references.
605 ed53ce26ecb6 more Chengsong parents: 604 diff changeset	429	To differentiate, people use the word \emph{regex} to refer
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	430	to those expressions with richer constructs while reserving the
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	431	term \emph{regular expression}
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	432	for the more traditional meaning in formal languages theory.
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	433	We follow this convention
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	434	in this thesis.
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	435	In the future, we aim to support all the popular features of regexes,
604 16d67f9c07d4 more Chengsong parents: 603 diff changeset	436	but for this work we mainly look at regular expressions.
16d67f9c07d4 more Chengsong parents: 603 diff changeset	437
605 ed53ce26ecb6 more Chengsong parents: 604 diff changeset	438
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	439
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	440	%Most modern regex libraries
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	441	%the so-called PCRE standard (Peral Compatible Regular Expressions)
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	442	%has the back-references
604 16d67f9c07d4 more Chengsong parents: 603 diff changeset	443	Regexes come with a lot of constructs
605 ed53ce26ecb6 more Chengsong parents: 604 diff changeset	444	beyond the basic ones
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	445	that make it more convenient for
604 16d67f9c07d4 more Chengsong parents: 603 diff changeset	446	programmers to write regular expressions.
605 ed53ce26ecb6 more Chengsong parents: 604 diff changeset	447	Depending on the types of these constructs
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	448	the task of matching and lexing with them
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	449	will have different levels of complexity increase.
604 16d67f9c07d4 more Chengsong parents: 603 diff changeset	450	Some of those constructs are syntactic sugars that are
16d67f9c07d4 more Chengsong parents: 603 diff changeset	451	simply short hand notations
605 ed53ce26ecb6 more Chengsong parents: 604 diff changeset	452	that save the programmers a few keystrokes.
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	453	These will not cause trouble for regex libraries.
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	454
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	455	\noindent
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	456	For example the
604 16d67f9c07d4 more Chengsong parents: 603 diff changeset	457	non-binary alternative involving three or more choices:
16d67f9c07d4 more Chengsong parents: 603 diff changeset	458	\[
605 ed53ce26ecb6 more Chengsong parents: 604 diff changeset	459	(a \| b \| c) \stackrel{means}{=} ((a + b)+ c)
604 16d67f9c07d4 more Chengsong parents: 603 diff changeset	460	\]
605 ed53ce26ecb6 more Chengsong parents: 604 diff changeset	461	the range operator $-$ used to express the alternative
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	462	of all characters between its operands in a concise way:
604 16d67f9c07d4 more Chengsong parents: 603 diff changeset	463	\[
605 ed53ce26ecb6 more Chengsong parents: 604 diff changeset	464	[0~-9]\stackrel{means}{=} (0 \| 1 \| \ldots \| 9 ) \; \text{(all number digits)}
604 16d67f9c07d4 more Chengsong parents: 603 diff changeset	465	\]
605 ed53ce26ecb6 more Chengsong parents: 604 diff changeset	466	and the
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	467	wildcard character $.$ used to refer to any single character:
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	468	\[
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	469	. \stackrel{means}{=} [0-9a-zA-Z+-()*\&\ldots]
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	470	\]
604 16d67f9c07d4 more Chengsong parents: 603 diff changeset	471
605 ed53ce26ecb6 more Chengsong parents: 604 diff changeset	472	\noindent
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	473	\subsection{Bounded Repetitions}
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	474	Some of those constructs do make the expressions much
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	475	more compact.
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	476	For example, the bounded regular expressions
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	477	(where $n$ and $m$ are constant natural numbers)
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	478	$r^{\{n\}}$, $r^{\{\ldots m\}}$, $r^{\{n\ldots \}}$ and $r^{\{n\ldots m\}}$,
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	479	defined as
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	480	\begin{center}
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	481	\begin{tabular}{lcl}
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	482	$L \; r^{\{n\}}$ & $\dn$ & $(L \; r)^n$\\
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	483	$L \; r^{\{\ldots m\}}$ & $\dn$ & $\bigcup_{0 \leq i \leq m}. (L \; r)^i$\\
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	484	$L \; r^{\{n\ldots \}}$ & $\dn$ & $\bigcup_{n \leq i}. (L \; r)^i$\\
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	485	$L \; r^{\{n \ldots m\}}$ & $\dn$ & $\bigcup_{n \leq i \leq m}. (L \; r)^i$
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	486	\end{tabular}
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	487	\end{center}
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	488	are exponentially smaller compared with
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	489	their unfolded form: for example $r^{\{n\}}$
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	490	as opposed to
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	491	\[
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	492	\underbrace{r\ldots r}_\text{n copies of r}.
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	493	\]
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	494	%Therefore, a naive algorithm that simply unfolds
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	495	%them into their desugared forms
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	496	%will suffer from at least an exponential runtime increase.
603 370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	497
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	498	The problem here is that tools based on the classic notion of
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	499	automata need to expand $r^{n}$ into $n$ connected
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	500	copies of the automaton for $r$. This leads to very inefficient matching
370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	501	algorithms or algorithms that consume large amounts of memory.
605 ed53ce26ecb6 more Chengsong parents: 604 diff changeset	502	Implementations using $\DFA$s will
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	503	either become excruciatingly slow
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	504	(for example Verbatim++\cite{Verbatimpp}) or get
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	505	out of memory errors (for example $\mathit{LEX}$ and
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	506	$\mathit{JFLEX}$\footnote{which are lexer generators
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	507	in C and JAVA that generate $\mathit{DFA}$-based
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	508	lexers. The user provides a set of regular expressions
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	509	and configurations to them, and then
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	510	gets an output program encoding a minimized $\mathit{DFA}$
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	511	that can be compiled and run.
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	512	When given the above countdown regular expression,
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	513	a small $n$ (a few dozen) would result in a
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	514	determinised automata
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	515	with millions of states.}) under large counters.
603 370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	516	A classic example is the regular expression $(a+b)^* a (a+b)^{n}$
605 ed53ce26ecb6 more Chengsong parents: 604 diff changeset	517	where the minimal DFA requires at least $2^{n+1}$ states.
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	518	For example, when $n$ is equal to 2,
604 16d67f9c07d4 more Chengsong parents: 603 diff changeset	519	an $\mathit{NFA}$ describing it would look like:
16d67f9c07d4 more Chengsong parents: 603 diff changeset	520	\begin{center}
16d67f9c07d4 more Chengsong parents: 603 diff changeset	521	\begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto]
16d67f9c07d4 more Chengsong parents: 603 diff changeset	522	\node[state,initial] (q_0) {$q_0$};
16d67f9c07d4 more Chengsong parents: 603 diff changeset	523	\node[state, red] (q_1) [right=of q_0] {$q_1$};
16d67f9c07d4 more Chengsong parents: 603 diff changeset	524	\node[state, red] (q_2) [right=of q_1] {$q_2$};
16d67f9c07d4 more Chengsong parents: 603 diff changeset	525	\node[state, accepting, red](q_3) [right=of q_2] {$q_3$};
16d67f9c07d4 more Chengsong parents: 603 diff changeset	526	\path[->]
16d67f9c07d4 more Chengsong parents: 603 diff changeset	527	(q_0) edge node {a} (q_1)
16d67f9c07d4 more Chengsong parents: 603 diff changeset	528	edge [loop below] node {a,b} ()
16d67f9c07d4 more Chengsong parents: 603 diff changeset	529	(q_1) edge node {a,b} (q_2)
16d67f9c07d4 more Chengsong parents: 603 diff changeset	530	(q_2) edge node {a,b} (q_3);
16d67f9c07d4 more Chengsong parents: 603 diff changeset	531	\end{tikzpicture}
16d67f9c07d4 more Chengsong parents: 603 diff changeset	532	\end{center}
605 ed53ce26ecb6 more Chengsong parents: 604 diff changeset	533	which requires at least $2^3$ states
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	534	for its subset construction.\footnote{The
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	535	red states are "countdown states" which counts down
604 16d67f9c07d4 more Chengsong parents: 603 diff changeset	536	the number of characters needed in addition to the current
16d67f9c07d4 more Chengsong parents: 603 diff changeset	537	string to make a successful match.
16d67f9c07d4 more Chengsong parents: 603 diff changeset	538	For example, state $q_1$ indicates a match that has
16d67f9c07d4 more Chengsong parents: 603 diff changeset	539	gone past the $(a\|b)^$ part of $(a\|b)^a(a\|b)^{\{2\}}$,
16d67f9c07d4 more Chengsong parents: 603 diff changeset	540	and just consumed the "delimiter" $a$ in the middle, and
16d67f9c07d4 more Chengsong parents: 603 diff changeset	541	need to match 2 more iterations of $(a\|b)$ to complete.
16d67f9c07d4 more Chengsong parents: 603 diff changeset	542	State $q_2$ on the other hand, can be viewed as a state
16d67f9c07d4 more Chengsong parents: 603 diff changeset	543	after $q_1$ has consumed 1 character, and just waits
16d67f9c07d4 more Chengsong parents: 603 diff changeset	544	for 1 more character to complete.
16d67f9c07d4 more Chengsong parents: 603 diff changeset	545	$q_3$ is the last state, requiring 0 more character and is accepting.
16d67f9c07d4 more Chengsong parents: 603 diff changeset	546	Depending on the suffix of the
16d67f9c07d4 more Chengsong parents: 603 diff changeset	547	input string up to the current read location,
16d67f9c07d4 more Chengsong parents: 603 diff changeset	548	the states $q_1$ and $q_2$, $q_3$
16d67f9c07d4 more Chengsong parents: 603 diff changeset	549	may or may
16d67f9c07d4 more Chengsong parents: 603 diff changeset	550	not be active, independent from each other.
16d67f9c07d4 more Chengsong parents: 603 diff changeset	551	A $\mathit{DFA}$ for such an $\mathit{NFA}$ would
16d67f9c07d4 more Chengsong parents: 603 diff changeset	552	contain at least $2^3$ non-equivalent states that cannot be merged,
16d67f9c07d4 more Chengsong parents: 603 diff changeset	553	because the subset construction during determinisation will generate
16d67f9c07d4 more Chengsong parents: 603 diff changeset	554	all the elements in the power set $\mathit{Pow}\{q_1, q_2, q_3\}$.
16d67f9c07d4 more Chengsong parents: 603 diff changeset	555	Generalizing this to regular expressions with larger
16d67f9c07d4 more Chengsong parents: 603 diff changeset	556	bounded repetitions number, we have that
16d67f9c07d4 more Chengsong parents: 603 diff changeset	557	regexes shaped like $r^*ar^{\{n\}}$ when converted to $\mathit{DFA}$s
605 ed53ce26ecb6 more Chengsong parents: 604 diff changeset	558	would require at least $2^{n+1}$ states, if $r$ itself contains
604 16d67f9c07d4 more Chengsong parents: 603 diff changeset	559	more than 1 string.
16d67f9c07d4 more Chengsong parents: 603 diff changeset	560	This is to represent all different
605 ed53ce26ecb6 more Chengsong parents: 604 diff changeset	561	scenarios which "countdown" states are active.}
603 370fe1dde7c7 more restructuring chap1 Chengsong parents: 602 diff changeset	562
605 ed53ce26ecb6 more Chengsong parents: 604 diff changeset	563	One of the most recent work in the context of lexing
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	564	%with this issue
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	565	is the Verbatim lexer by Egolf, Lasser and Fisher\cite{Verbatim}.
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	566	This is relevant work and we will compare later on
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	567	our derivative-based matcher we are going to present.
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	568	There is also some newer work called
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	569	Verbatim++\cite{Verbatimpp}, which does not use derivatives,
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	570	but deterministic finite automaton instead.
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	571	%An example that gives problem to automaton approaches would be
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	572	%the regular expression $(a\|b)^*a(a\|b)^{\{n\}}$.
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	573	%It requires at least $2^{n+1}$ states to represent
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	574	%as a DFA.
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	575
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	576
605 ed53ce26ecb6 more Chengsong parents: 604 diff changeset	577	Bounded repetitions are very important because they
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	578	tend to occur a lot in practical use.
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	579	For example in the regex library RegExLib,
607 e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	580	the rules library of Snort \cite{Snort1999}\footnote{
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	581	Snort is a network intrusion detection (NID) tool
608 37b6fd310a16 added related work chap Chengsong parents: 607 diff changeset	582	for monitoring network traffic.
37b6fd310a16 added related work chap Chengsong parents: 607 diff changeset	583	The network security community curates a list
37b6fd310a16 added related work chap Chengsong parents: 607 diff changeset	584	of malicious patterns written as regexes,
37b6fd310a16 added related work chap Chengsong parents: 607 diff changeset	585	which is used by Snort's detection engine
37b6fd310a16 added related work chap Chengsong parents: 607 diff changeset	586	to match against network traffic for any hostile
37b6fd310a16 added related work chap Chengsong parents: 607 diff changeset	587	activities such as buffer overflow attacks.},
607 e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	588	as well as in XML Schema definitions (XSDs).
605 ed53ce26ecb6 more Chengsong parents: 604 diff changeset	589	According to Bj\"{o}rklund et al \cite{xml2015},
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	590	more than half of the
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	591	XSDs they found have bounded regular expressions in them.
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	592	Often the counters are quite large, the largest up to ten million.
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	593	An example XSD they gave
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	594	was:
606 99b530103464 new Chengsong parents: 605 diff changeset	595	%\begin{verbatim}
99b530103464 new Chengsong parents: 605 diff changeset	596	%<sequence minOccurs="0" maxOccurs="65535">
99b530103464 new Chengsong parents: 605 diff changeset	597	% <element name="TimeIncr" type="mpeg7:MediaIncrDurationType"/>
99b530103464 new Chengsong parents: 605 diff changeset	598	% <element name="MotionParams" type="float" minOccurs="2" maxOccurs="12"/>
99b530103464 new Chengsong parents: 605 diff changeset	599	%</sequence>
99b530103464 new Chengsong parents: 605 diff changeset	600	%\end{verbatim}
605 ed53ce26ecb6 more Chengsong parents: 604 diff changeset	601	This can be seen as the expression
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	602	$(ab^{2\ldots 12})^{0 \ldots 65535}$, where $a$ and $b$ are themselves
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	603	regular expressions
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	604	satisfying certain constraints (such as
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	605	satisfying the floating point number format).
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	606
605 ed53ce26ecb6 more Chengsong parents: 604 diff changeset	607	It is therefore quite unsatisfying that
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	608	some regular expressions matching libraries
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	609	impose adhoc limits
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	610	for bounded regular expressions:
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	611	For example, in the regular expression matching library in the Go
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	612	language the regular expression $a^{1001}$ is not permitted, because no counter
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	613	can be above 1000, and in the built-in Rust regular expression library
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	614	expressions such as $a^{\{1000\}\{100\}\{5\}}$ give an error message
606 99b530103464 new Chengsong parents: 605 diff changeset	615	for being too big.
99b530103464 new Chengsong parents: 605 diff changeset	616	As Becchi and Crawley\cite{Becchi08} have pointed out,
99b530103464 new Chengsong parents: 605 diff changeset	617	the reason for these restrictions
99b530103464 new Chengsong parents: 605 diff changeset	618	are that they simulate a non-deterministic finite
99b530103464 new Chengsong parents: 605 diff changeset	619	automata (NFA) with a breadth-first search.
99b530103464 new Chengsong parents: 605 diff changeset	620	This way the number of active states could
99b530103464 new Chengsong parents: 605 diff changeset	621	be equal to the counter number.
99b530103464 new Chengsong parents: 605 diff changeset	622	When the counters are large,
99b530103464 new Chengsong parents: 605 diff changeset	623	the memory requirement could become
99b530103464 new Chengsong parents: 605 diff changeset	624	infeasible, and the pattern needs to be rejected straight away.
99b530103464 new Chengsong parents: 605 diff changeset	625	\begin{figure}[H]
99b530103464 new Chengsong parents: 605 diff changeset	626	\begin{center}
99b530103464 new Chengsong parents: 605 diff changeset	627	\begin{tikzpicture} [node distance = 2cm, on grid, auto]
99b530103464 new Chengsong parents: 605 diff changeset	628
99b530103464 new Chengsong parents: 605 diff changeset	629	\node (q0) [state, initial] {$0$};
99b530103464 new Chengsong parents: 605 diff changeset	630	\node (q1) [state, right = of q0] {$1$};
99b530103464 new Chengsong parents: 605 diff changeset	631	\node (q2) [state, right = of q1] {$2$};
99b530103464 new Chengsong parents: 605 diff changeset	632	\node (qdots) [right = of q2] {$\ldots$};
99b530103464 new Chengsong parents: 605 diff changeset	633	\node (qn) [state, right = of qdots] {$n$};
99b530103464 new Chengsong parents: 605 diff changeset	634	\node (qn1) [state, right = of qn] {$n+1$};
99b530103464 new Chengsong parents: 605 diff changeset	635	\node (qn2) [state, right = of qn1] {$n+2$};
99b530103464 new Chengsong parents: 605 diff changeset	636	\node (qn3) [state, accepting, right = of qn2] {$n+3$};
99b530103464 new Chengsong parents: 605 diff changeset	637
99b530103464 new Chengsong parents: 605 diff changeset	638	\path [-stealth, thick]
99b530103464 new Chengsong parents: 605 diff changeset	639	(q0) edge [loop above] node {a} ()
99b530103464 new Chengsong parents: 605 diff changeset	640	(q0) edge node {a} (q1)
99b530103464 new Chengsong parents: 605 diff changeset	641	(q1) edge node {.} (q2)
99b530103464 new Chengsong parents: 605 diff changeset	642	(q2) edge node {.} (qdots)
99b530103464 new Chengsong parents: 605 diff changeset	643	(qdots) edge node {.} (qn)
99b530103464 new Chengsong parents: 605 diff changeset	644	(qn) edge node {.} (qn1)
99b530103464 new Chengsong parents: 605 diff changeset	645	(qn1) edge node {b} (qn2)
99b530103464 new Chengsong parents: 605 diff changeset	646	(qn2) edge node {$c$} (qn3);
99b530103464 new Chengsong parents: 605 diff changeset	647	\end{tikzpicture}
99b530103464 new Chengsong parents: 605 diff changeset	648	%\begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto]
99b530103464 new Chengsong parents: 605 diff changeset	649	% \node[state,initial] (q_0) {$0$};
99b530103464 new Chengsong parents: 605 diff changeset	650	% \node[state, ] (q_1) [right=of q_0] {$1$};
99b530103464 new Chengsong parents: 605 diff changeset	651	% \node[state, ] (q_2) [right=of q_1] {$2$};
99b530103464 new Chengsong parents: 605 diff changeset	652	% \node[state,
99b530103464 new Chengsong parents: 605 diff changeset	653	% \node[state, accepting, ](q_3) [right=of q_2] {$3$};
99b530103464 new Chengsong parents: 605 diff changeset	654	% \path[->]
99b530103464 new Chengsong parents: 605 diff changeset	655	% (q_0) edge node {a} (q_1)
99b530103464 new Chengsong parents: 605 diff changeset	656	% edge [loop below] node {a,b} ()
99b530103464 new Chengsong parents: 605 diff changeset	657	% (q_1) edge node {a,b} (q_2)
99b530103464 new Chengsong parents: 605 diff changeset	658	% (q_2) edge node {a,b} (q_3);
99b530103464 new Chengsong parents: 605 diff changeset	659	%\end{tikzpicture}
99b530103464 new Chengsong parents: 605 diff changeset	660	\end{center}
99b530103464 new Chengsong parents: 605 diff changeset	661	\caption{The example given by Becchi and Crawley
99b530103464 new Chengsong parents: 605 diff changeset	662	that NFA simulation can consume large
99b530103464 new Chengsong parents: 605 diff changeset	663	amounts of memory: $.^*a.^{\{n\}}bc$ matching
99b530103464 new Chengsong parents: 605 diff changeset	664	strings of the form $aaa\ldots aaaabc$.
99b530103464 new Chengsong parents: 605 diff changeset	665	When traversing in a breadth-first manner,
99b530103464 new Chengsong parents: 605 diff changeset	666	all states from 0 till $n+1$ will become active.}
99b530103464 new Chengsong parents: 605 diff changeset	667	\end{figure}
99b530103464 new Chengsong parents: 605 diff changeset	668	%Languages like $\mathit{Go}$ and $\mathit{Rust}$ use this
99b530103464 new Chengsong parents: 605 diff changeset	669	%type of $\mathit{NFA}$ simulation and guarantees a linear runtime
99b530103464 new Chengsong parents: 605 diff changeset	670	%in terms of input string length.
99b530103464 new Chengsong parents: 605 diff changeset	671	%TODO:try out these lexers
99b530103464 new Chengsong parents: 605 diff changeset	672	These problems can of course be solved in matching algorithms where
605 ed53ce26ecb6 more Chengsong parents: 604 diff changeset	673	automata go beyond the classic notion and for instance include explicit
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	674	counters \cite{Turo_ov__2020}.
606 99b530103464 new Chengsong parents: 605 diff changeset	675	These solutions can be quite effective,
99b530103464 new Chengsong parents: 605 diff changeset	676	with the ability to process
99b530103464 new Chengsong parents: 605 diff changeset	677	gigabytes of string input per second
99b530103464 new Chengsong parents: 605 diff changeset	678	even with large counters \cite{Becchi08}.
99b530103464 new Chengsong parents: 605 diff changeset	679	But formally reasoning about these automata can be challenging
99b530103464 new Chengsong parents: 605 diff changeset	680	and un-intuitive.
99b530103464 new Chengsong parents: 605 diff changeset	681	Therefore, correctness and runtime claims made about these solutions need to be
99b530103464 new Chengsong parents: 605 diff changeset	682	taken with a grain of salt.
605 ed53ce26ecb6 more Chengsong parents: 604 diff changeset	683
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	684	In the work reported in \cite{CSL2022} and here,
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	685	we add better support using derivatives
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	686	for bounded regular expressions $r^{\{n\}}$.
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	687	The results
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	688	extend straightforwardly to
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	689	repetitions with an interval such as
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	690	$r^{\{n\ldots m\}}$.
608 37b6fd310a16 added related work chap Chengsong parents: 607 diff changeset	691	The merit of Brzozowski derivatives (more on this later)
605 ed53ce26ecb6 more Chengsong parents: 604 diff changeset	692	on this problem is that
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	693	it can be naturally extended to support bounded repetitions.
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	694	Moreover these extensions are still made up of only
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	695	inductive datatypes and recursive functions,
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	696	making it handy to deal with using theorem provers.
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	697	%The point here is that Brzozowski derivatives and the algorithms by Sulzmann and Lu can be
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	698	%straightforwardly extended to deal with bounded regular expressions
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	699	%and moreover the resulting code still consists of only simple
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	700	%recursive functions and inductive datatypes.
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	701	Finally, bounded regular expressions do not destroy our finite
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	702	boundedness property, which we shall prove later on.
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	703
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	704
606 99b530103464 new Chengsong parents: 605 diff changeset	705
99b530103464 new Chengsong parents: 605 diff changeset	706
99b530103464 new Chengsong parents: 605 diff changeset	707
605 ed53ce26ecb6 more Chengsong parents: 604 diff changeset	708	\subsection{Back-References}
606 99b530103464 new Chengsong parents: 605 diff changeset	709	The other way to simulate an $\mathit{NFA}$ for matching is choosing
99b530103464 new Chengsong parents: 605 diff changeset	710	a single transition each time, keeping all the other options in
99b530103464 new Chengsong parents: 605 diff changeset	711	a queue or stack, and backtracking if that choice eventually
99b530103464 new Chengsong parents: 605 diff changeset	712	fails. This method, often called a "depth-first-search",
99b530103464 new Chengsong parents: 605 diff changeset	713	is efficient in a lot of cases, but could end up
99b530103464 new Chengsong parents: 605 diff changeset	714	with exponential run time.
99b530103464 new Chengsong parents: 605 diff changeset	715	The backtracking method is employed in regex libraries
99b530103464 new Chengsong parents: 605 diff changeset	716	that support \emph{back-references}, for example
99b530103464 new Chengsong parents: 605 diff changeset	717	in Java and Python.
605 ed53ce26ecb6 more Chengsong parents: 604 diff changeset	718	%\section{Back-references and The Terminology Regex}
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	719
605 ed53ce26ecb6 more Chengsong parents: 604 diff changeset	720	%When one constructs an $\NFA$ out of a regular expression
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	721	%there is often very little to be done in the first phase, one simply
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	722	%construct the $\NFA$ states based on the structure of the input regular expression.
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	723
605 ed53ce26ecb6 more Chengsong parents: 604 diff changeset	724	%In the lexing phase, one can simulate the $\mathit{NFA}$ running in two ways:
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	725	%one by keeping track of all active states after consuming
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	726	%a character, and update that set of states iteratively.
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	727	%This can be viewed as a breadth-first-search of the $\mathit{NFA}$
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	728	%for a path terminating
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	729	%at an accepting state.
606 99b530103464 new Chengsong parents: 605 diff changeset	730
99b530103464 new Chengsong parents: 605 diff changeset	731
99b530103464 new Chengsong parents: 605 diff changeset	732
99b530103464 new Chengsong parents: 605 diff changeset	733	Given a regular expression like this (the sequence
532 cc54ce075db5 restructured Chengsong parents: diff changeset	734	operator is omitted for brevity):
cc54ce075db5 restructured Chengsong parents: diff changeset	735	\begin{center}
606 99b530103464 new Chengsong parents: 605 diff changeset	736	$r_1r_2r_3r_4$
532 cc54ce075db5 restructured Chengsong parents: diff changeset	737	\end{center}
606 99b530103464 new Chengsong parents: 605 diff changeset	738	one could label sub-expressions of interest
532 cc54ce075db5 restructured Chengsong parents: diff changeset	739	by parenthesizing them and giving
cc54ce075db5 restructured Chengsong parents: diff changeset	740	them a number by the order in which their opening parentheses appear.
cc54ce075db5 restructured Chengsong parents: diff changeset	741	One possible way of parenthesizing and labelling is given below:
cc54ce075db5 restructured Chengsong parents: diff changeset	742	\begin{center}
cc54ce075db5 restructured Chengsong parents: diff changeset	743	$\underset{1}{(}r_1\underset{2}{(}r_2\underset{3}{(}r_3)\underset{4}{(}r_4)))$
cc54ce075db5 restructured Chengsong parents: diff changeset	744	\end{center}
606 99b530103464 new Chengsong parents: 605 diff changeset	745	The sub-expressions
99b530103464 new Chengsong parents: 605 diff changeset	746	$r_1r_2r_3r_4$, $r_1r_2r_3$, $r_3$ and $r_4$ are labelled
99b530103464 new Chengsong parents: 605 diff changeset	747	by 1 to 4, and can be ``referred back'' by their respective numbers.
99b530103464 new Chengsong parents: 605 diff changeset	748	%These sub-expressions are called "capturing groups".
99b530103464 new Chengsong parents: 605 diff changeset	749	To do so, we use the syntax $\backslash i$
99b530103464 new Chengsong parents: 605 diff changeset	750	to denote that we want the sub-string
99b530103464 new Chengsong parents: 605 diff changeset	751	of the input just matched by the i-th
99b530103464 new Chengsong parents: 605 diff changeset	752	sub-expression to appear again,
99b530103464 new Chengsong parents: 605 diff changeset	753	exactly the same as it first appeared:
532 cc54ce075db5 restructured Chengsong parents: diff changeset	754	\begin{center}
cc54ce075db5 restructured Chengsong parents: diff changeset	755	$\ldots\underset{\text{i-th lparen}}{(}{r_i})\ldots
cc54ce075db5 restructured Chengsong parents: diff changeset	756	\underset{s_i \text{ which just matched} \;r_i}{\backslash i}$
cc54ce075db5 restructured Chengsong parents: diff changeset	757	\end{center}
606 99b530103464 new Chengsong parents: 605 diff changeset	758	%The backslash and number $i$ are the
99b530103464 new Chengsong parents: 605 diff changeset	759	%so-called "back-references".
99b530103464 new Chengsong parents: 605 diff changeset	760	%Let $e$ be an expression made of regular expressions
99b530103464 new Chengsong parents: 605 diff changeset	761	%and back-references. $e$ contains the expression $e_i$
99b530103464 new Chengsong parents: 605 diff changeset	762	%as its $i$-th capturing group.
99b530103464 new Chengsong parents: 605 diff changeset	763	%The semantics of back-reference can be recursively
99b530103464 new Chengsong parents: 605 diff changeset	764	%written as:
99b530103464 new Chengsong parents: 605 diff changeset	765	%\begin{center}
99b530103464 new Chengsong parents: 605 diff changeset	766	% \begin{tabular}{c}
99b530103464 new Chengsong parents: 605 diff changeset	767	% $L ( e \cdot \backslash i) = \{s @ s_i \mid s \in L (e)\quad s_i \in L(r_i)$\\
99b530103464 new Chengsong parents: 605 diff changeset	768	% $s_i\; \text{match of ($e$, $s$)'s $i$-th capturing group string}\}$
99b530103464 new Chengsong parents: 605 diff changeset	769	% \end{tabular}
99b530103464 new Chengsong parents: 605 diff changeset	770	%\end{center}
99b530103464 new Chengsong parents: 605 diff changeset	771	A concrete example
99b530103464 new Chengsong parents: 605 diff changeset	772	for back-references would be
532 cc54ce075db5 restructured Chengsong parents: diff changeset	773	\begin{center}
607 e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	774	$(.^*)\backslash 1$,
532 cc54ce075db5 restructured Chengsong parents: diff changeset	775	\end{center}
606 99b530103464 new Chengsong parents: 605 diff changeset	776	which would match
99b530103464 new Chengsong parents: 605 diff changeset	777	strings that can be split into two identical halves,
607 e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	778	for example $\mathit{foofoo}$, $\mathit{ww}$ and etc.
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	779	Note that this is different from
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	780	repeating the sub-expression verbatim like
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	781	\begin{center}
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	782	$(.^)(.^)$,
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	783	\end{center}
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	784	which does not impose any restrictions on what strings the second
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	785	sub-expression $.^*$
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	786	might match.
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	787	Another example of back-references would be
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	788	\begin{center}
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	789	$(.)(.)\backslash 2\backslash 1$
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	790	\end{center}
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	791	which expresses four-character palindromes
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	792	like $abba$, $x??x$ etc.
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	793
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	794	Back-references is a regex construct
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	795	that programmers found quite useful.
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	796	According to Becchi and Crawley\cite{Becchi08},
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	797	6\% of Snort rules (up until 2008) include the use of them.
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	798	The most common use of back-references
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	799	would be expressing well-formed html files,
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	800	where back-references would be handy in expressing
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	801	a pair of opening and closing tags like
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	802	\begin{center}
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	803	$\langle html \rangle \ldots \langle / html \rangle$
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	804	\end{center}
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	805	A regex describing such a format
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	806	could be
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	807	\begin{center}
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	808	$\langle (.^+) \rangle \ldots \langle / \backslash 1 \rangle$
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	809	\end{center}
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	810	Despite being useful, the syntax and expressive power of regexes
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	811	go beyond the regular language hierarchy
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	812	with back-references.
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	813	In fact, they allow the regex construct to express
532 cc54ce075db5 restructured Chengsong parents: diff changeset	814	languages that cannot be contained in context-free
cc54ce075db5 restructured Chengsong parents: diff changeset	815	languages either.
608 37b6fd310a16 added related work chap Chengsong parents: 607 diff changeset	816	For example, the back-reference $(a^*)b\backslash1 b \backslash 1$
532 cc54ce075db5 restructured Chengsong parents: diff changeset	817	expresses the language $\{a^n b a^n b a^n\mid n \in \mathbb{N}\}$,
cc54ce075db5 restructured Chengsong parents: diff changeset	818	which cannot be expressed by context-free grammars\parencite{campeanu2003formal}.
cc54ce075db5 restructured Chengsong parents: diff changeset	819	Such a language is contained in the context-sensitive hierarchy
cc54ce075db5 restructured Chengsong parents: diff changeset	820	of formal languages.
cc54ce075db5 restructured Chengsong parents: diff changeset	821	Solving the back-reference expressions matching problem
607 e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	822	is NP-complete\parencite{alfred2014algorithms}.
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	823	A non-bactracking,
532 cc54ce075db5 restructured Chengsong parents: diff changeset	824	efficient solution is not known to exist.
608 37b6fd310a16 added related work chap Chengsong parents: 607 diff changeset	825	Regex libraries supporting back-references such as
37b6fd310a16 added related work chap Chengsong parents: 607 diff changeset	826	PCRE \cite{pcre} therefore have to
607 e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	827	revert to a depth-first search algorithm that backtracks.
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	828	The unreasonable part with them, is that even in the case of
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	829	regexes not involving back-references, there is still
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	830	a (non-negligible) chance they might backtrack super-linearly.
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	831
607 e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	832	\subsection{Summary of the Catastrophic Backtracking Problem}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	833	Summing these up, we can categorise existing
607 e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	834	practical regex libraries into two kinds:
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	835	(i)The ones with linear
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	836	time guarantees like Go and Rust. The cost with them is that
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	837	they impose restrictions
532 cc54ce075db5 restructured Chengsong parents: diff changeset	838	on the user input (not allowing back-references,
607 e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	839	bounded repetitions cannot exceed a counter limit etc.).
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	840	(ii) Those
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	841	that allow large bounded regular expressions and back-references
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	842	at the expense of using a backtracking algorithm.
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	843	They could grind to a halt
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	844	on some very simple cases, posing a vulnerability of
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	845	a ReDoS attack.
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	846
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	847
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	848	We would like to have regex engines that can
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	849	deal with the regular part (e.g.
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	850	bounded repetitions) of regexes more
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	851	efficiently.
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	852	Also we want to make sure that they do it correctly.
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	853	It turns out that such aim is not so easy to achieve.
532 cc54ce075db5 restructured Chengsong parents: diff changeset	854	%TODO: give examples such as RE2 GOLANG 1000 restriction, rust no repetitions
cc54ce075db5 restructured Chengsong parents: diff changeset	855	% For example, the Rust regex engine claims to be linear,
cc54ce075db5 restructured Chengsong parents: diff changeset	856	% but does not support lookarounds and back-references.
cc54ce075db5 restructured Chengsong parents: diff changeset	857	% The GoLang regex library does not support over 1000 repetitions.
cc54ce075db5 restructured Chengsong parents: diff changeset	858	% Java and Python both support back-references, but shows
cc54ce075db5 restructured Chengsong parents: diff changeset	859	%catastrophic backtracking behaviours on inputs without back-references(
cc54ce075db5 restructured Chengsong parents: diff changeset	860	%when the language is still regular).
cc54ce075db5 restructured Chengsong parents: diff changeset	861	%TODO: test performance of Rust on (((((aa)b)b){20}))c baabaabababaabaaaaaaaaababaaaababababaaaabaaabaaaaaabaabaabababaababaaaaaaaaababaaaababababaaaaaaaaaaaaac
cc54ce075db5 restructured Chengsong parents: diff changeset	862	%TODO: verify the fact Rust does not allow 1000+ reps
cc54ce075db5 restructured Chengsong parents: diff changeset	863
cc54ce075db5 restructured Chengsong parents: diff changeset	864
605 ed53ce26ecb6 more Chengsong parents: 604 diff changeset	865
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	866
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	867	%The time cost of regex matching algorithms in general
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	868	%involve two different phases, and different things can go differently wrong on
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	869	%these phases.
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	870	%$\DFA$s usually have problems in the first (construction) phase
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	871	%, whereas $\NFA$s usually run into trouble
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	872	%on the second phase.
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	873
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	874
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	875	\section{Error-prone POSIX Implementations}
607 e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	876	When there are multiple ways of matching a string
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	877	with a regular expression, a matcher needs to
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	878	disambiguate.
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	879	The standard for which particular match to pick
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	880	is called the disambiguation strategy.
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	881	The more intuitive strategy is called POSIX,
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	882	which always chooses the longest initial match.
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	883	An alternative strategy would be greedy matches,
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	884	which always ends a sub-match as early as possible.
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	885	The POSIX standard is widely adopted in many operating systems.
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	886	However, many implementations (including the C libraries
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	887	used by Linux and OS X distributions) contain bugs
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	888	or do not meet the specification they claim to adhere to.
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	889	In some cases, they either fail to generate a lexing
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	890	result when there exists a match,
605 ed53ce26ecb6 more Chengsong parents: 604 diff changeset	891	or give results that are inconsistent with the $\POSIX$ standard.
607 e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	892	A concrete example would be the regex given by \cite{fowler2003}
605 ed53ce26ecb6 more Chengsong parents: 604 diff changeset	893	\begin{center}
607 e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	894	$(aba + ab + a)^* \text{and the string} ababa$
605 ed53ce26ecb6 more Chengsong parents: 604 diff changeset	895	\end{center}
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	896	The correct $\POSIX$ match for the above would be
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	897	with the entire string $ababa$,
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	898	split into two Kleene star iterations, $[ab] [aba]$ at positions
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	899	$[0, 2), [2, 5)$
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	900	respectively.
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	901	But trying this out in regex101\parencite{regex101}
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	902	with different language engines would yield
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	903	the same two fragmented matches: $[aba]$ at $[0, 3)$
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	904	and $a$ at $[4, 5)$.
607 e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	905	Fowler \cite{fowler2003} and Kuklewicz \cite{KuklewiczHaskell}
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	906	commented that most regex libraries are not
605 ed53ce26ecb6 more Chengsong parents: 604 diff changeset	907	correctly implementing the POSIX (maximum-munch)
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	908	rule of regular expression matching.
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	909	As Grathwohl\parencite{grathwohl2014crash} wrote,
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	910	\begin{quote}
607 e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	911	``The POSIX strategy is more complicated than the
605 ed53ce26ecb6 more Chengsong parents: 604 diff changeset	912	greedy because of the dependence on information about
607 e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	913	the length of matched strings in the various subexpressions.''
605 ed53ce26ecb6 more Chengsong parents: 604 diff changeset	914	\end{quote}
ed53ce26ecb6 more Chengsong parents: 604 diff changeset	915	%\noindent
607 e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	916	The implementation complexity of POSIX rules also come from
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	917	the specification being not very clear.
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	918	There are many informal summaries of this disambiguation
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	919	strategy, which are often quite long and delicate.
608 37b6fd310a16 added related work chap Chengsong parents: 607 diff changeset	920	For example Kuklewicz \cite{KuklewiczHaskell}
37b6fd310a16 added related work chap Chengsong parents: 607 diff changeset	921	described the POSIX rule as
607 e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	922	\begin{quote}
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	923	``
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	924	\begin{itemize}
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	925	\item
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	926	regular expressions (REs) take the leftmost starting match, and the longest match starting there
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	927	earlier subpatterns have leftmost-longest priority over later subpatterns\\
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	928	\item
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	929	higher-level subpatterns have leftmost-longest priority over their component subpatterns\\
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	930	\item
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	931	REs have right associative concatenation which can be changed with parenthesis\\
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	932	\item
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	933	parenthesized subexpressions return the match from their last usage\\
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	934	\item
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	935	text of component subexpressions must be contained in the text of the
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	936	higher-level subexpressions\\
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	937	\item
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	938	if "p" and "q" can never match the same text then "p\|q" and "q\|p" are equivalent, up to trivial renumbering of captured subexpressions\\
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	939	\item
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	940	if "p" in "p*" is used to capture non-empty text then additional repetitions of "p" will not capture an empty string\\''
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	941	\end{itemize}
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	942	\end{quote}
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	943	The text above
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	944	is trying to capture something very precise,
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	945	and is crying out for formalising.
608 37b6fd310a16 added related work chap Chengsong parents: 607 diff changeset	946	Ausaf et al. \cite{AusafDyckhoffUrban2016}
37b6fd310a16 added related work chap Chengsong parents: 607 diff changeset	947	are the first to describe such a formalised POSIX
37b6fd310a16 added related work chap Chengsong parents: 607 diff changeset	948	specification in Isabelle/HOL.
37b6fd310a16 added related work chap Chengsong parents: 607 diff changeset	949	They then formally proved the correctness of
37b6fd310a16 added related work chap Chengsong parents: 607 diff changeset	950	a lexing algorithm by Sulzmann and Lu \cite{Sulzmann2014}
37b6fd310a16 added related work chap Chengsong parents: 607 diff changeset	951	based on that specification.
605 ed53ce26ecb6 more Chengsong parents: 604 diff changeset	952
608 37b6fd310a16 added related work chap Chengsong parents: 607 diff changeset	953	In this thesis,
37b6fd310a16 added related work chap Chengsong parents: 607 diff changeset	954	we propose a solution to catastrophic
37b6fd310a16 added related work chap Chengsong parents: 607 diff changeset	955	backtracking and error-prone matchers--a formally verified
37b6fd310a16 added related work chap Chengsong parents: 607 diff changeset	956	regular expression lexing algorithm
37b6fd310a16 added related work chap Chengsong parents: 607 diff changeset	957	that is both fast
37b6fd310a16 added related work chap Chengsong parents: 607 diff changeset	958	and correct by extending Ausaf et al.'s work.
607 e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	959	The end result is a regular expression lexing algorithm that comes with
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	960	\begin{itemize}
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	961	\item
607 e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	962	a proven correctness theorem according to POSIX specification
608 37b6fd310a16 added related work chap Chengsong parents: 607 diff changeset	963	given by Ausaf et al. \cite{AusafDyckhoffUrban2016},
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	964	\item
608 37b6fd310a16 added related work chap Chengsong parents: 607 diff changeset	965	a proven complexity-related property saying that the
37b6fd310a16 added related work chap Chengsong parents: 607 diff changeset	966	algorithm's internal data structure will
607 e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	967	remain finite,
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	968	\item
607 e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	969	and extension to
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	970	the bounded repetitions construct with the correctness and finiteness property
e6fc9b72c0e3 chap1 almost done Chengsong parents: 606 diff changeset	971	maintained.
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	972	\end{itemize}
608 37b6fd310a16 added related work chap Chengsong parents: 607 diff changeset	973	In the next section we will very briefly
37b6fd310a16 added related work chap Chengsong parents: 607 diff changeset	974	introduce Brzozowski derivatives.
37b6fd310a16 added related work chap Chengsong parents: 607 diff changeset	975	We will give a taste to what they
37b6fd310a16 added related work chap Chengsong parents: 607 diff changeset	976	are like and why they are suitable for regular expression
37b6fd310a16 added related work chap Chengsong parents: 607 diff changeset	977	matching and lexing.
532 cc54ce075db5 restructured Chengsong parents: diff changeset	978
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	979	\section{Our Solution--Formal Specification of POSIX and Brzozowski Derivatives}
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	980	We propose Brzozowski derivatives on regular expressions as
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	981	a solution to this.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	982	In the last fifteen or so years, Brzozowski's derivatives of regular
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	983	expressions have sparked quite a bit of interest in the functional
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	984	programming and theorem prover communities.
532 cc54ce075db5 restructured Chengsong parents: diff changeset	985
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	986	\subsection{Motivation}
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	987
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	988	Derivatives give a simple solution
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	989	to the problem of matching a string $s$ with a regular
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	990	expression $r$: if the derivative of $r$ w.r.t.\ (in
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	991	succession) all the characters of the string matches the empty string,
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	992	then $r$ matches $s$ (and {\em vice versa}).
532 cc54ce075db5 restructured Chengsong parents: diff changeset	993
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	994	The beauty of
532 cc54ce075db5 restructured Chengsong parents: diff changeset	995	Brzozowski's derivatives \parencite{Brzozowski1964} is that they are neatly
cc54ce075db5 restructured Chengsong parents: diff changeset	996	expressible in any functional language, and easily definable and
cc54ce075db5 restructured Chengsong parents: diff changeset	997	reasoned about in theorem provers---the definitions just consist of
cc54ce075db5 restructured Chengsong parents: diff changeset	998	inductive datatypes and simple recursive functions.
cc54ce075db5 restructured Chengsong parents: diff changeset	999	And an algorithms based on it by
cc54ce075db5 restructured Chengsong parents: diff changeset	1000	Suzmann and Lu \parencite{Sulzmann2014} allows easy extension
cc54ce075db5 restructured Chengsong parents: diff changeset	1001	to include extended regular expressions and
cc54ce075db5 restructured Chengsong parents: diff changeset	1002	simplification of internal data structures
cc54ce075db5 restructured Chengsong parents: diff changeset	1003	eliminating the exponential behaviours.
cc54ce075db5 restructured Chengsong parents: diff changeset	1004
cc54ce075db5 restructured Chengsong parents: diff changeset	1005	However, two difficulties with derivative-based matchers exist:
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1006	\subsubsection{Problems with Current Brzozowski Matchers}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	1007	First, Brzozowski's original matcher only generates a yes/no answer
cc54ce075db5 restructured Chengsong parents: diff changeset	1008	for whether a regular expression matches a string or not. This is too
cc54ce075db5 restructured Chengsong parents: diff changeset	1009	little information in the context of lexing where separate tokens must
cc54ce075db5 restructured Chengsong parents: diff changeset	1010	be identified and also classified (for example as keywords
cc54ce075db5 restructured Chengsong parents: diff changeset	1011	or identifiers). Sulzmann and Lu~\cite{Sulzmann2014} overcome this
cc54ce075db5 restructured Chengsong parents: diff changeset	1012	difficulty by cleverly extending Brzozowski's matching
cc54ce075db5 restructured Chengsong parents: diff changeset	1013	algorithm. Their extended version generates additional information on
cc54ce075db5 restructured Chengsong parents: diff changeset	1014	\emph{how} a regular expression matches a string following the POSIX
cc54ce075db5 restructured Chengsong parents: diff changeset	1015	rules for regular expression matching. They achieve this by adding a
cc54ce075db5 restructured Chengsong parents: diff changeset	1016	second ``phase'' to Brzozowski's algorithm involving an injection
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1017	function. In our own earlier work, we provided the formal
532 cc54ce075db5 restructured Chengsong parents: diff changeset	1018	specification of what POSIX matching means and proved in Isabelle/HOL
cc54ce075db5 restructured Chengsong parents: diff changeset	1019	the correctness
cc54ce075db5 restructured Chengsong parents: diff changeset	1020	of Sulzmann and Lu's extended algorithm accordingly
cc54ce075db5 restructured Chengsong parents: diff changeset	1021	\cite{AusafDyckhoffUrban2016}.
cc54ce075db5 restructured Chengsong parents: diff changeset	1022
cc54ce075db5 restructured Chengsong parents: diff changeset	1023	The second difficulty is that Brzozowski's derivatives can
cc54ce075db5 restructured Chengsong parents: diff changeset	1024	grow to arbitrarily big sizes. For example if we start with the
cc54ce075db5 restructured Chengsong parents: diff changeset	1025	regular expression $(a+aa)^*$ and take
cc54ce075db5 restructured Chengsong parents: diff changeset	1026	successive derivatives according to the character $a$, we end up with
cc54ce075db5 restructured Chengsong parents: diff changeset	1027	a sequence of ever-growing derivatives like
cc54ce075db5 restructured Chengsong parents: diff changeset	1028
cc54ce075db5 restructured Chengsong parents: diff changeset	1029	\def\ll{\stackrel{\_\backslash{} a}{\longrightarrow}}
cc54ce075db5 restructured Chengsong parents: diff changeset	1030	\begin{center}
cc54ce075db5 restructured Chengsong parents: diff changeset	1031	\begin{tabular}{rll}
cc54ce075db5 restructured Chengsong parents: diff changeset	1032	$(a + aa)^$ & $\ll$ & $(\ONE + \ONE{}a) \cdot (a + aa)^$\\
cc54ce075db5 restructured Chengsong parents: diff changeset	1033	& $\ll$ & $(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* \;+\; (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
cc54ce075db5 restructured Chengsong parents: diff changeset	1034	& $\ll$ & $(\ZERO + \ZERO{}a + \ZERO) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^* \;+\; $\\
cc54ce075db5 restructured Chengsong parents: diff changeset	1035	& & $\qquad(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
cc54ce075db5 restructured Chengsong parents: diff changeset	1036	& $\ll$ & \ldots \hspace{15mm}(regular expressions of sizes 98, 169, 283, 468, 767, \ldots)
cc54ce075db5 restructured Chengsong parents: diff changeset	1037	\end{tabular}
cc54ce075db5 restructured Chengsong parents: diff changeset	1038	\end{center}
cc54ce075db5 restructured Chengsong parents: diff changeset	1039
cc54ce075db5 restructured Chengsong parents: diff changeset	1040	\noindent where after around 35 steps we run out of memory on a
cc54ce075db5 restructured Chengsong parents: diff changeset	1041	typical computer (we shall define shortly the precise details of our
cc54ce075db5 restructured Chengsong parents: diff changeset	1042	regular expressions and the derivative operation). Clearly, the
cc54ce075db5 restructured Chengsong parents: diff changeset	1043	notation involving $\ZERO$s and $\ONE$s already suggests
cc54ce075db5 restructured Chengsong parents: diff changeset	1044	simplification rules that can be applied to regular regular
cc54ce075db5 restructured Chengsong parents: diff changeset	1045	expressions, for example $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r
cc54ce075db5 restructured Chengsong parents: diff changeset	1046	\Rightarrow r$, $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow
cc54ce075db5 restructured Chengsong parents: diff changeset	1047	r$. While such simple-minded simplifications have been proved in our
cc54ce075db5 restructured Chengsong parents: diff changeset	1048	earlier work to preserve the correctness of Sulzmann and Lu's
cc54ce075db5 restructured Chengsong parents: diff changeset	1049	algorithm \cite{AusafDyckhoffUrban2016}, they unfortunately do
cc54ce075db5 restructured Chengsong parents: diff changeset	1050	\emph{not} help with limiting the growth of the derivatives shown
cc54ce075db5 restructured Chengsong parents: diff changeset	1051	above: the growth is slowed, but the derivatives can still grow rather
cc54ce075db5 restructured Chengsong parents: diff changeset	1052	quickly beyond any finite bound.
cc54ce075db5 restructured Chengsong parents: diff changeset	1053
cc54ce075db5 restructured Chengsong parents: diff changeset	1054
cc54ce075db5 restructured Chengsong parents: diff changeset	1055	Sulzmann and Lu overcome this ``growth problem'' in a second algorithm
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1056	\cite{Sulzmann2014} where they introduce bit-coded
532 cc54ce075db5 restructured Chengsong parents: diff changeset	1057	regular expressions. In this version, POSIX values are
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1058	represented as bit sequences and such sequences are incrementally generated
532 cc54ce075db5 restructured Chengsong parents: diff changeset	1059	when derivatives are calculated. The compact representation
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1060	of bit sequences and regular expressions allows them to define a more
532 cc54ce075db5 restructured Chengsong parents: diff changeset	1061	``aggressive'' simplification method that keeps the size of the
cc54ce075db5 restructured Chengsong parents: diff changeset	1062	derivatives finite no matter what the length of the string is.
cc54ce075db5 restructured Chengsong parents: diff changeset	1063	They make some informal claims about the correctness and linear behaviour
cc54ce075db5 restructured Chengsong parents: diff changeset	1064	of this version, but do not provide any supporting proof arguments, not
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1065	even ``pencil-and-paper'' arguments. They write about their bit-coded
532 cc54ce075db5 restructured Chengsong parents: diff changeset	1066	\emph{incremental parsing method} (that is the algorithm to be formalised
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1067	in this dissertation)
532 cc54ce075db5 restructured Chengsong parents: diff changeset	1068
cc54ce075db5 restructured Chengsong parents: diff changeset	1069
cc54ce075db5 restructured Chengsong parents: diff changeset	1070
cc54ce075db5 restructured Chengsong parents: diff changeset	1071	\begin{quote}\it
cc54ce075db5 restructured Chengsong parents: diff changeset	1072	``Correctness Claim: We further claim that the incremental parsing
cc54ce075db5 restructured Chengsong parents: diff changeset	1073	method [..] in combination with the simplification steps [..]
cc54ce075db5 restructured Chengsong parents: diff changeset	1074	yields POSIX parse trees. We have tested this claim
cc54ce075db5 restructured Chengsong parents: diff changeset	1075	extensively [..] but yet
cc54ce075db5 restructured Chengsong parents: diff changeset	1076	have to work out all proof details.'' \cite[Page 14]{Sulzmann2014}
cc54ce075db5 restructured Chengsong parents: diff changeset	1077	\end{quote}
cc54ce075db5 restructured Chengsong parents: diff changeset	1078
cc54ce075db5 restructured Chengsong parents: diff changeset	1079	Ausaf and Urban were able to back this correctness claim with
cc54ce075db5 restructured Chengsong parents: diff changeset	1080	a formal proof.
cc54ce075db5 restructured Chengsong parents: diff changeset	1081
cc54ce075db5 restructured Chengsong parents: diff changeset	1082	But as they stated,
cc54ce075db5 restructured Chengsong parents: diff changeset	1083	\begin{quote}\it
cc54ce075db5 restructured Chengsong parents: diff changeset	1084	The next step would be to implement a more aggressive simplification procedure on annotated regular expressions and then prove the corresponding algorithm generates the same values as blexer. Alas due to time constraints we are unable to do so here.
cc54ce075db5 restructured Chengsong parents: diff changeset	1085	\end{quote}
cc54ce075db5 restructured Chengsong parents: diff changeset	1086
cc54ce075db5 restructured Chengsong parents: diff changeset	1087	This thesis implements the aggressive simplifications envisioned
cc54ce075db5 restructured Chengsong parents: diff changeset	1088	by Ausaf and Urban,
cc54ce075db5 restructured Chengsong parents: diff changeset	1089	and gives a formal proof of the correctness with those simplifications.
cc54ce075db5 restructured Chengsong parents: diff changeset	1090
cc54ce075db5 restructured Chengsong parents: diff changeset	1091
cc54ce075db5 restructured Chengsong parents: diff changeset	1092	%----------------------------------------------------------------------------------------
cc54ce075db5 restructured Chengsong parents: diff changeset	1093	\section{Contribution}
cc54ce075db5 restructured Chengsong parents: diff changeset	1094
cc54ce075db5 restructured Chengsong parents: diff changeset	1095
cc54ce075db5 restructured Chengsong parents: diff changeset	1096
cc54ce075db5 restructured Chengsong parents: diff changeset	1097	This work addresses the vulnerability of super-linear and
cc54ce075db5 restructured Chengsong parents: diff changeset	1098	buggy regex implementations by the combination
cc54ce075db5 restructured Chengsong parents: diff changeset	1099	of Brzozowski's derivatives and interactive theorem proving.
cc54ce075db5 restructured Chengsong parents: diff changeset	1100	We give an
cc54ce075db5 restructured Chengsong parents: diff changeset	1101	improved version of Sulzmann and Lu's bit-coded algorithm using
cc54ce075db5 restructured Chengsong parents: diff changeset	1102	derivatives, which come with a formal guarantee in terms of correctness and
cc54ce075db5 restructured Chengsong parents: diff changeset	1103	running time as an Isabelle/HOL proof.
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1104	Further improvements to the algorithm with an even stronger version of
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1105	simplification is made.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1106	We have not yet come up with one, but believe that it leads to a
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1107	formalised proof with a time bound linear to input and
532 cc54ce075db5 restructured Chengsong parents: diff changeset	1108	cubic to regular expression size using a technique by
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1109	Antimirov\cite{Antimirov}.
532 cc54ce075db5 restructured Chengsong parents: diff changeset	1110
cc54ce075db5 restructured Chengsong parents: diff changeset	1111
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1112	The main contribution of this thesis is
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1113	\begin{itemize}
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1114	\item
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1115	a proven correct lexing algorithm
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1116	\item
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1117	with formalized finite bounds on internal data structures' sizes.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1118	\end{itemize}
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1119
532 cc54ce075db5 restructured Chengsong parents: diff changeset	1120	To our best knowledge, no lexing libraries using Brzozowski derivatives
cc54ce075db5 restructured Chengsong parents: diff changeset	1121	have a provable time guarantee,
cc54ce075db5 restructured Chengsong parents: diff changeset	1122	and claims about running time are usually speculative and backed by thin empirical
cc54ce075db5 restructured Chengsong parents: diff changeset	1123	evidence.
cc54ce075db5 restructured Chengsong parents: diff changeset	1124	%TODO: give references
cc54ce075db5 restructured Chengsong parents: diff changeset	1125	For example, Sulzmann and Lu had proposed an algorithm in which they
cc54ce075db5 restructured Chengsong parents: diff changeset	1126	claim a linear running time.
cc54ce075db5 restructured Chengsong parents: diff changeset	1127	But that was falsified by our experiments and the running time
cc54ce075db5 restructured Chengsong parents: diff changeset	1128	is actually $\Omega(2^n)$ in the worst case.
cc54ce075db5 restructured Chengsong parents: diff changeset	1129	A similar claim about a theoretical runtime of $O(n^2)$ is made for the Verbatim
cc54ce075db5 restructured Chengsong parents: diff changeset	1130	%TODO: give references
cc54ce075db5 restructured Chengsong parents: diff changeset	1131	lexer, which calculates POSIX matches and is based on derivatives.
cc54ce075db5 restructured Chengsong parents: diff changeset	1132	They formalized the correctness of the lexer, but not the complexity.
cc54ce075db5 restructured Chengsong parents: diff changeset	1133	In the performance evaluation section, they simply analyzed the run time
cc54ce075db5 restructured Chengsong parents: diff changeset	1134	of matching $a$ with the string $\underbrace{a \ldots a}_{\text{n a's}}$
cc54ce075db5 restructured Chengsong parents: diff changeset	1135	and concluded that the algorithm is quadratic in terms of input length.
cc54ce075db5 restructured Chengsong parents: diff changeset	1136	When we tried out their extracted OCaml code with our example $(a+aa)^*$,
cc54ce075db5 restructured Chengsong parents: diff changeset	1137	the time it took to lex only 40 $a$'s was 5 minutes.
cc54ce075db5 restructured Chengsong parents: diff changeset	1138
cc54ce075db5 restructured Chengsong parents: diff changeset	1139
cc54ce075db5 restructured Chengsong parents: diff changeset	1140
cc54ce075db5 restructured Chengsong parents: diff changeset	1141	\subsection{Related Work}
cc54ce075db5 restructured Chengsong parents: diff changeset	1142	We are aware
cc54ce075db5 restructured Chengsong parents: diff changeset	1143	of a mechanised correctness proof of Brzozowski's derivative-based matcher in HOL4 by
cc54ce075db5 restructured Chengsong parents: diff changeset	1144	Owens and Slind~\parencite{Owens2008}. Another one in Isabelle/HOL is part
cc54ce075db5 restructured Chengsong parents: diff changeset	1145	of the work by Krauss and Nipkow \parencite{Krauss2011}. And another one
cc54ce075db5 restructured Chengsong parents: diff changeset	1146	in Coq is given by Coquand and Siles \parencite{Coquand2012}.
cc54ce075db5 restructured Chengsong parents: diff changeset	1147	Also Ribeiro and Du Bois give one in Agda \parencite{RibeiroAgda2017}.
cc54ce075db5 restructured Chengsong parents: diff changeset	1148
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1149
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1150	When a regular expression does not behave as intended,
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1151	people usually try to rewrite the regex to some equivalent form
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1152	or they try to avoid the possibly problematic patterns completely,
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1153	for which many false positives exist\parencite{Davis18}.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1154	Animated tools to "debug" regular expressions such as
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1155	\parencite{regexploit2021} \parencite{regex101} are also popular.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1156	We are also aware of static analysis work on regular expressions that
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1157	aims to detect potentially expoential regex patterns. Rathnayake and Thielecke
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1158	\parencite{Rathnayake2014StaticAF} proposed an algorithm
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1159	that detects regular expressions triggering exponential
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1160	behavious on backtracking matchers.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1161	Weideman \parencite{Weideman2017Static} came up with
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1162	non-linear polynomial worst-time estimates
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1163	for regexes, attack string that exploit the worst-time
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1164	scenario, and "attack automata" that generates
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1165	attack strings.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1166
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1167
532 cc54ce075db5 restructured Chengsong parents: diff changeset	1168
cc54ce075db5 restructured Chengsong parents: diff changeset	1169
cc54ce075db5 restructured Chengsong parents: diff changeset	1170	\section{Structure of the thesis}
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1171	In chapter 2 \ref{Inj} we will introduce the concepts
532 cc54ce075db5 restructured Chengsong parents: diff changeset	1172	and notations we
cc54ce075db5 restructured Chengsong parents: diff changeset	1173	use for describing the lexing algorithm by Sulzmann and Lu,
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1174	and then give the lexing algorithm.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1175	We will give its variant in \ref{Bitcoded1}.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1176	Then we illustrate in \ref{Bitcoded2}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	1177	how the algorithm without bitcodes falls short for such aggressive
cc54ce075db5 restructured Chengsong parents: diff changeset	1178	simplifications and therefore introduce our version of the
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1179	bit-coded algorithm and
532 cc54ce075db5 restructured Chengsong parents: diff changeset	1180	its correctness proof .
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1181	In \ref{Finite} we give the second guarantee
532 cc54ce075db5 restructured Chengsong parents: diff changeset	1182	of our bitcoded algorithm, that is a finite bound on the size of any
cc54ce075db5 restructured Chengsong parents: diff changeset	1183	regex's derivatives.
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1184	In \ref{Cubic} we discuss stronger simplifications to improve the finite bound
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	1185	in \ref{Finite} to a polynomial one, and demonstrate how one can extend the
532 cc54ce075db5 restructured Chengsong parents: diff changeset	1186	algorithm to include constructs such as bounded repetitions and negations.
cc54ce075db5 restructured Chengsong parents: diff changeset	1187
cc54ce075db5 restructured Chengsong parents: diff changeset	1188
cc54ce075db5 restructured Chengsong parents: diff changeset	1189
cc54ce075db5 restructured Chengsong parents: diff changeset	1190
cc54ce075db5 restructured Chengsong parents: diff changeset	1191
cc54ce075db5 restructured Chengsong parents: diff changeset	1192	%----------------------------------------------------------------------------------------
cc54ce075db5 restructured Chengsong parents: diff changeset	1193
cc54ce075db5 restructured Chengsong parents: diff changeset	1194
cc54ce075db5 restructured Chengsong parents: diff changeset	1195	%----------------------------------------------------------------------------------------
cc54ce075db5 restructured Chengsong parents: diff changeset	1196
cc54ce075db5 restructured Chengsong parents: diff changeset	1197	%----------------------------------------------------------------------------------------
cc54ce075db5 restructured Chengsong parents: diff changeset	1198
cc54ce075db5 restructured Chengsong parents: diff changeset	1199	%----------------------------------------------------------------------------------------
cc54ce075db5 restructured Chengsong parents: diff changeset	1200
cc54ce075db5 restructured Chengsong parents: diff changeset	1201

author	Chengsong
	Sat, 01 Oct 2022 12:06:46 +0100
changeset 608	37b6fd310a16
parent 607	e6fc9b72c0e3
child 609	61139fdddae0
permissions	-rwxr-xr-x