% Chapter 1

\chapter{Introduction} % Main chapter title

\label{Introduction} % For referencing the chapter elsewhere, use \ref{Chapter1}

%----------------------------------------------------------------------------------------

% Define some commands to keep the formatting separated from the content
\newcommand{\keyword}[1]{\textbf{#1}}
\newcommand{\tabhead}[1]{\textbf{#1}}
\newcommand{\code}[1]{\texttt{#1}}
\newcommand{\file}[1]{\texttt{\bfseries#1}}
\newcommand{\option}[1]{\texttt{\itshape#1}}

%boxes
\newcommand*{\mybox}[1]{\framebox{\strut #1}}

%\newcommand{\sflataux}[1]{\textit{sflat}\_\textit{aux} \, #1}
\newcommand\sflat[1]{\llparenthesis #1 \rrparenthesis }
\newcommand{\ASEQ}[3]{\textit{ASEQ}_{#1} \, #2 \, #3}
\newcommand{\bderssimp}[2]{#1 \backslash_{bsimps} #2}
\newcommand{\rderssimp}[2]{#1 \backslash_{rsimps} #2}
\def\derssimp{\textit{ders}\_\textit{simp}}
\def\rders{\textit{rders}}
\newcommand{\bders}[2]{#1 \backslash #2}
\newcommand{\bsimp}[1]{\textit{bsimp}(#1)}
\def\bsimps{\textit{bsimp}}
\newcommand{\rsimp}[1]{\textit{rsimp}\; #1}
\newcommand{\sflataux}[1]{\llparenthesis #1 \rrparenthesis'}
\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%
\newcommand{\denote}{\stackrel{\mbox{\scriptsize denote}}{=}}%
\newcommand{\ZERO}{\mbox{\bf 0}}
\newcommand{\ONE}{\mbox{\bf 1}}
\newcommand{\AALTS}[2]{\oplus {\scriptstyle #1}\, #2}
\newcommand{\rdistinct}[2]{\textit{rdistinct} \;\; #1 \;\; #2}
\def\rdistincts{\textit{rdistinct}}
\def\rDistinct{\textit{rdistinct}}
\newcommand\hflat[1]{\llparenthesis #1 \rrparenthesis_*}
\newcommand\hflataux[1]{\llparenthesis #1 \rrparenthesis_*'}
\newcommand\createdByStar[1]{\textit{createdByStar}(#1)}

\newcommand\myequiv{\mathrel{\stackrel{\makebox[0pt]{\mbox{\normalfont\tiny equiv}}}{=}}}

\def\SEQ{\textit{SEQ}}
\def\SEQs{\textit{SEQs}}
\def\case{\textit{case}}
\def\sequal{\stackrel{\mbox{\scriptsize rsimp}}{=}}
\def\rsimpalts{\textit{rsimp}_{ALTS}}
\def\good{\textit{good}}
\def\btrue{\textit{true}}
\def\bfalse{\textit{false}}
\def\bnullable{\textit{bnullable}}
\def\bnullables{\textit{bnullables}}
\def\Some{\textit{Some}}
\def\None{\textit{None}}
\def\code{\textit{code}}
\def\decode{\textit{decode}}
\def\internalise{\textit{internalise}}
\def\lexer{\mathit{lexer}}
\def\mkeps{\textit{mkeps}}
\newcommand{\rder}[2]{#2 \backslash_r #1}

\def\rerases{\textit{rerase}}

\def\nonnested{\textit{nonnested}}
\def\AZERO{\textit{AZERO}}
\def\sizeNregex{\textit{sizeNregex}}
\def\AONE{\textit{AONE}}
\def\ACHAR{\textit{ACHAR}}

\def\simpsulz{\textit{simp}_{Sulz}}

\def\scfrewrites{\stackrel{*}{\rightsquigarrow_{scf}}}
\def\frewrite{\rightsquigarrow_f}
\def\hrewrite{\rightsquigarrow_h}
\def\grewrite{\rightsquigarrow_g}
\def\frewrites{\stackrel{*}{\rightsquigarrow_f}}
\def\hrewrites{\stackrel{*}{\rightsquigarrow_h}}
\def\grewrites{\stackrel{*}{\rightsquigarrow_g}}
\def\fuse{\textit{fuse}}
\def\bder{\textit{bder}}
\def\der{\textit{der}}
\def\POSIX{\textit{POSIX}}
\def\ALTS{\textit{ALTS}}
\def\ASTAR{\textit{ASTAR}}
\def\DFA{\textit{DFA}}
\def\NFA{\textit{NFA}}
\def\bmkeps{\textit{bmkeps}}
\def\bmkepss{\textit{bmkepss}}
\def\retrieve{\textit{retrieve}}
\def\blexer{\textit{blexer}}
\def\flex{\textit{flex}}
\def\inj{\textit{inj}}
\def\Empty{\textit{Empty}}
\def\Left{\textit{Left}}
\def\Right{\textit{Right}}
\def\Stars{\textit{Stars}}
\def\Char{\textit{Char}}
\def\Seq{\textit{Seq}}
\def\Der{\textit{Der}}
\def\Ders{\textit{Ders}}
\def\nullable{\mathit{nullable}}
\def\Z{\mathit{Z}}
\def\S{\mathit{S}}
\def\rup{r^\uparrow}
%\def\bderssimp{\mathit{bders}\_\mathit{simp}}
\def\distinctWith{\textit{distinctWith}}
\def\lf{\textit{lf}}
\def\PD{\textit{PD}}
\def\suffix{\textit{Suffix}}
\def\distinctBy{\textit{distinctBy}}
\def\starupdate{\textit{starUpdate}}
\def\starupdates{\textit{starUpdates}}

\def\size{\mathit{size}}
\def\rexp{\mathbf{rexp}}
\def\simp{\mathit{simp}}
\def\simpALTs{\mathit{simp}\_\mathit{ALTs}}
\def\map{\mathit{map}}
\def\distinct{\mathit{distinct}}
\def\blexersimp{\mathit{blexer}\_\mathit{simp}}
\def\blexerStrong{\textit{blexerStrong}}
\def\bsimpStrong{\textit{bsimpStrong}}
\def\bdersStrongs{\textit{bdersStrong}}
\newcommand{\bdersStrong}[2]{#1 \backslash_{bsimpStrongs} #2}

\def\map{\textit{map}}
\def\rrexp{\textit{rrexp}}
\newcommand\rnullable[1]{\textit{rnullable} \; #1 }
\newcommand\rsize[1]{\llbracket #1 \rrbracket_r}
\newcommand\asize[1]{\llbracket #1 \rrbracket}
\newcommand\rerase[1]{ (#1)_{\downarrow_r}}

\newcommand\ChristianComment[1]{\textcolor{blue}{#1}\\}

\def\rflts{\textit{rflts}}
\def\rrewrite{\textit{rrewrite}}
\def\bsimpalts{\textit{bsimp}_{ALTS}}
\def\bsimpaseq{\textit{bsimp}_{ASEQ}}
\def\rsimlalts{\textit{rsimp}_{ALTs}}
\def\rsimpseq{\textit{rsimp}_{SEQ}}

\def\erase{\textit{erase}}
\def\STAR{\textit{STAR}}
\def\flts{\textit{flts}}

\def\zeroable{\textit{zeroable}}
\def\nub{\textit{nub}}
\def\filter{\textit{filter}}
%\def\not{\textit{not}}

\def\RZERO{\mathbf{0}_r }
\def\RONE{\mathbf{1}_r}
\newcommand\RCHAR[1]{\mathbf{#1}_r}
\newcommand\RSEQ[2]{#1 \cdot #2}
\newcommand\RALTS[1]{\sum #1}
\newcommand\RSTAR[1]{#1^*}
\newcommand\vsuf[2]{\textit{Suffix} \;#1\;#2}
\lstdefinestyle{myScalastyle}{
frame=tb,
language=scala,
aboveskip=3mm,
belowskip=3mm,
showstringspaces=false,
columns=flexible,
basicstyle={\small\ttfamily},
numbers=none,
numberstyle=\tiny\color{gray},
keywordstyle=\color{blue},
commentstyle=\color{dkgreen},
stringstyle=\color{mauve},
frame=single,
breaklines=true,
breakatwhitespace=true,
tabsize=3,
}


%----------------------------------------------------------------------------------------
%This part is about regular expressions, Brzozowski derivatives,
%and a bit-coded lexing algorithm with proven correctness and time bounds.

%TODO: look up snort rules to use here--give readers idea of what regexes look like

Regular expressions are widely used in computer science:
be it in text-editors \parencite{atomEditor} with syntax highlighting and auto-completion;
command-line tools like $\mathit{grep}$ that facilitate easy
text-processing; network intrusion
detection systems that reject suspicious traffic; or compiler
front ends--the majority of the solutions to these tasks
involve lexing with regular
expressions.
Given their usefulness and ubiquity, one would imagine that
modern regular expression matching implementations
are mature and fully studied.
Indeed, given a regular expression and a string, the regex engine
of a popular programming language will in most cases
produce the matching information within a very short time.
Those matchers can be blindingly fast--some
network intrusion detection systems
use regex engines that are able to process
megabytes or even gigabytes of data per second \parencite{Turo_ov__2020}.
However, those matchers can exhibit a surprising security vulnerability
under a certain class of inputs.
%However, , this is not the case for $\mathbf{all}$ inputs.
%TODO: get source for SNORT/BRO's regex matching engine/speed

Take $(a^*)^*\,b$ and ask whether
strings of the form $aa..a$ match this regular
expression. Obviously this is not the case---the expected $b$ in the last
position is missing. One would expect that modern regular expression
matching engines can find this out very quickly. Alas, if one tries
this example in JavaScript, Python or Java 8, even with strings of a small
length, say around 30 $a$'s,
the decision takes an unreasonably long time to finish (graph \ref{fig:aStarStarb}).
This is clearly exponential behaviour, and
is triggered by some relatively simple regex patterns.
Java 9 and newer
versions improve this behaviour, but are still slow compared
with the approach we are going to use.




This superlinear blowup in regular expression engines
has caused grief in real life so often
that it has been given a name: ``catastrophic backtracking''.
For example, on 20 July 2016 one evil
regular expression brought the webpage
\href{http://stackexchange.com}{Stack Exchange} to its
knees.\footnote{\url{https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}(Last accessed in 2019)}
In this instance, a regular expression intended to just trim white
spaces from the beginning and the end of a line actually consumed
massive amounts of CPU resources---causing web servers to grind to a
halt. In this example, the time needed to process
the string was $O(n^2)$ with respect to the string length. This
quadratic overhead was enough for the homepage of Stack Exchange to
respond so slowly that the load balancer assumed a $\mathit{DoS}$
attack and therefore stopped the servers from responding to any
requests. This made the whole site unavailable.

\begin{figure}[p]
\begin{tabular}{@{}c@{\hspace{0mm}}c@{\hspace{0mm}}c@{}}
\begin{tikzpicture}
\begin{axis}[
xlabel={$n$},
x label style={at={(1.05,-0.05)}},
ylabel={time in secs},
enlargelimits=false,
xtick={0,5,...,30},
xmax=33,
ymax=35,
ytick={0,5,...,30},
scaled ticks=false,
axis lines=left,
width=5cm,
height=4cm,
legend entries={JavaScript},
legend pos=north west,
legend cell align=left]
\addplot[red,mark=*, mark options={fill=white}] table {re-js.data};
\end{axis}
\end{tikzpicture}
&
\begin{tikzpicture}
\begin{axis}[
xlabel={$n$},
x label style={at={(1.05,-0.05)}},
%ylabel={time in secs},
enlargelimits=false,
xtick={0,5,...,30},
xmax=33,
ymax=35,
ytick={0,5,...,30},
scaled ticks=false,
axis lines=left,
width=5cm,
height=4cm,
legend entries={Python},
legend pos=north west,
legend cell align=left]
\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
\end{axis}
\end{tikzpicture}
&
\begin{tikzpicture}
\begin{axis}[
xlabel={$n$},
x label style={at={(1.05,-0.05)}},
%ylabel={time in secs},
enlargelimits=false,
xtick={0,5,...,30},
xmax=33,
ymax=35,
ytick={0,5,...,30},
scaled ticks=false,
axis lines=left,
width=5cm,
height=4cm,
legend entries={Java 8},
legend pos=north west,
legend cell align=left]
\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
\end{axis}
\end{tikzpicture}\\
\begin{tikzpicture}
\begin{axis}[
xlabel={$n$},
x label style={at={(1.05,-0.05)}},
ylabel={time in secs},
enlargelimits=false,
xtick={0,5,...,30},
xmax=33,
ymax=35,
ytick={0,5,...,30},
scaled ticks=false,
axis lines=left,
width=5cm,
height=4cm,
legend entries={Dart},
legend pos=north west,
legend cell align=left]
\addplot[green,mark=*, mark options={fill=white}] table {re-dart.data};
\end{axis}
\end{tikzpicture}
&
\begin{tikzpicture}
\begin{axis}[
xlabel={$n$},
x label style={at={(1.05,-0.05)}},
%ylabel={time in secs},
enlargelimits=false,
xtick={0,5,...,30},
xmax=33,
ymax=35,
ytick={0,5,...,30},
scaled ticks=false,
axis lines=left,
width=5cm,
height=4cm,
legend entries={Swift},
legend pos=north west,
legend cell align=left]
\addplot[purple,mark=*, mark options={fill=white}] table {re-swift.data};
\end{axis}
\end{tikzpicture}
& \\
\multicolumn{3}{c}{Graphs}
\end{tabular}
\caption{Graphs showing the runtime for matching $(a^*)^*\,b$ with strings
of the form $\protect\underbrace{aa..a}_{n}$ in various existing regular expression libraries.
The reason for their superlinear behaviour is that they perform a depth-first search:
if the string does not match, the engine goes on to explore all other
possible ways of matching exhaustively.
}\label{fig:aStarStarb}
\end{figure}\afterpage{\clearpage}

A more recent example is a global outage of all Cloudflare servers on 2 July
2019. A poorly written regular expression exhibited exponential
behaviour and exhausted CPUs that serve HTTP traffic. Although the outage
had several causes, at the heart was a regular expression that
was used to monitor network
traffic.\footnote{\url{https://blog.cloudflare.com/details-of-the-cloudflare-outage-on-july-2-2019/}(Last accessed in 2022)}
These problems with regular expressions
are not isolated events that happen
very occasionally, but are actually widespread.
They occur so often that they have been given a
name: Regular-Expression Denial-Of-Service (ReDoS)
attacks.
\citeauthor{Davis18} detected more
than 1000 super-linear (SL) regular expressions
in Node.js, Python core libraries, and npm and pypi.
They therefore concluded that evil regular expressions
are problems ``more than a parlour trick'', but ones that
require
more research attention.

This work aims to address this issue
with the help of formal proofs.
We offer a lexing algorithm based
on Brzozowski derivatives with certified correctness (in
Isabelle/HOL)
and a finiteness property.
Such properties guarantee the absence of
catastrophic backtracking in most cases.
We will give more details in the next sections
on (i) why the slow cases in graph \ref{fig:aStarStarb}
can occur
and (ii) why we choose our
approach (Brzozowski derivatives and formal proofs).

\section{Regex, and the Problems with Regex Matchers}
Regular expressions and regular expression matchers
have of course been studied for many, many years.
Theoretical results in automata theory say
that basic regular expression matching should be linear
w.r.t.\ the input.
This assumes that the regular expression
$r$ was pre-processed and turned into a
deterministic finite automaton (DFA) before matching,
a step which could itself take exponential time \cite{Sakarovitch2009}.
By basic we mean textbook definitions such as the one
below, involving only characters, alternatives,
sequences, and Kleene stars:
\[
	r ::= \ZERO | \ONE | c | r_1 + r_2 | r_1 \cdot r_2 | r^*
\]
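
To make this definition concrete, here is a minimal sketch of how such
basic regular expressions can be represented as an inductive datatype in
Scala (the constructor names are our illustrative choices and are not
fixed by the definition above):
\begin{lstlisting}[style=myScalastyle]
// basic regular expressions as an inductive datatype
abstract class Rexp
case object ZERO extends Rexp                    // matches no string
case object ONE extends Rexp                     // matches the empty string
case class CHAR(c: Char) extends Rexp            // matches the character c
case class ALT(r1: Rexp, r2: Rexp) extends Rexp  // alternative r1 + r2
case class SEQ(r1: Rexp, r2: Rexp) extends Rexp  // sequence r1 . r2
case class STAR(r: Rexp) extends Rexp            // Kleene star r*
\end{lstlisting}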
Modern regular expression matchers used by programmers,
however,
support richer constructs, such as bounded repetitions
and back-references.
To differentiate, people use the word \emph{regex} to refer
to those expressions with richer constructs while reserving the
term \emph{regular expression}
for the more traditional meaning in formal languages theory.
We follow this convention
in this thesis.
In the future, we aim to support all the popular features of regexes,
but for this work we mainly look at basic regular expressions.


%Most modern regex libraries
%the so-called PCRE standard (Perl Compatible Regular Expressions)
%has the back-references
Regexes come with a lot of constructs
beyond the basic ones
that make it more convenient for
programmers to write regular expressions.
Depending on the type of construct,
the task of matching and lexing with them
increases in complexity to different degrees.
Some of those constructs are syntactic sugar, that is,
simply shorthand notations
that save the programmers a few keystrokes.
These will not cause trouble for regex libraries.

\noindent
For example the
non-binary alternative involving three or more choices:
\[
	(a | b | c) \stackrel{means}{=} ((a + b)+ c)
\]
the range operator $-$ used to express the alternative
of all characters between its operands in a concise way:
\[
	[0\mbox{-}9]\stackrel{means}{=} (0 | 1 | \ldots | 9 ) \; \text{(all number digits)}
\]
and the
wildcard character $.$ used to refer to any single character:
\[
	. \stackrel{means}{=} [0-9a-zA-Z+-()*\&\ldots]
\]

\subsection{Bounded Repetitions}
Some of those constructs do make the expressions much
more compact.
For example, the bounded regular expressions
(where $n$ and $m$ are constant natural numbers)
$r^{\{n\}}$, $r^{\{\ldots m\}}$, $r^{\{n\ldots \}}$ and $r^{\{n\ldots m\}}$,
defined as
\begin{center}
\begin{tabular}{lcl}
$L \; r^{\{n\}}$ & $\dn$ & $(L \; r)^n$\\
$L \; r^{\{\ldots m\}}$ & $\dn$ & $\bigcup_{0 \leq i \leq m}. (L \; r)^i$\\
$L \; r^{\{n\ldots \}}$ & $\dn$ & $\bigcup_{n \leq i}. (L \; r)^i$\\
$L \; r^{\{n \ldots m\}}$ & $\dn$ & $\bigcup_{n \leq i \leq m}. (L \; r)^i$
\end{tabular}
\end{center}
are exponentially smaller compared with
their unfolded form: for example $r^{\{n\}}$
as opposed to
\[
	\underbrace{r\ldots r}_\text{n copies of r}.
\]
%Therefore, a naive algorithm that simply unfolds
%them into their desugared forms
%will suffer from at least an exponential runtime increase.


The problem here is that tools based on the classic notion of
automata need to expand $r^{\{n\}}$ into $n$ connected
copies of the automaton for $r$. This leads to very inefficient matching
algorithms or algorithms that consume large amounts of memory.
Implementations using $\DFA$s will
either become excruciatingly slow
(for example Verbatim++ \cite{Verbatimpp}) or run
out of memory (for example $\mathit{LEX}$ and
$\mathit{JFLEX}$\footnote{which are lexer generators
in C and Java that generate $\mathit{DFA}$-based
lexers. The user provides a set of regular expressions
and configurations, and then
gets an output program encoding a minimized $\mathit{DFA}$
that can be compiled and run.
When given a bounded regular expression as above,
a small $n$ (a few dozen) would result in a
determinised automaton
with millions of states.}) under large counters.
A classic example is the regular expression $(a+b)^*\,a\,(a+b)^{\{n\}}$
where the minimal DFA requires at least $2^{n+1}$ states.
For example, when $n$ is equal to 2,
an $\mathit{NFA}$ describing it would look like:
\begin{center}
\begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto]
\node[state,initial] (q_0) {$q_0$};
\node[state, red] (q_1) [right=of q_0] {$q_1$};
\node[state, red] (q_2) [right=of q_1] {$q_2$};
\node[state, accepting, red](q_3) [right=of q_2] {$q_3$};
\path[->]
(q_0) edge node {a} (q_1)
edge [loop below] node {a,b} ()
(q_1) edge node {a,b} (q_2)
(q_2) edge node {a,b} (q_3);
\end{tikzpicture}
\end{center}
which requires at least $2^3$ states
for its subset construction.\footnote{The
red states are ``countdown states'' which count down
the number of characters needed in addition to the current
string to make a successful match.
For example, state $q_1$ indicates a match that has
gone past the $(a|b)^*$ part of $(a|b)^*a(a|b)^{\{2\}}$,
has just consumed the ``delimiter'' $a$ in the middle, and
needs to match 2 more iterations of $(a|b)$ to complete.
State $q_2$, on the other hand, can be viewed as a state
after $q_1$ has consumed 1 character, and just waits
for 1 more character to complete.
$q_3$ is the last state, requiring 0 more characters, and is accepting.
Depending on the suffix of the
input string up to the current read location,
the states $q_1$, $q_2$ and $q_3$
may or may
not be active, independently of each other.
A $\mathit{DFA}$ for such an $\mathit{NFA}$ would
contain at least $2^3$ non-equivalent states that cannot be merged,
because the subset construction during determinisation will generate
all the elements in the power set $\mathit{Pow}\{q_1, q_2, q_3\}$.
Generalizing this to regular expressions with larger
bounded repetition numbers, we have that
regexes shaped like $r^*ar^{\{n\}}$ when converted to $\mathit{DFA}$s
would require at least $2^{n+1}$ states, if $r$ itself contains
more than 1 string.
This is to represent all the different
scenarios in which ``countdown'' states are active.}


One of the most recent works in the context of lexing
%with this issue
is the Verbatim lexer by Egolf, Lasser and Fisher \cite{Verbatim}.
This is relevant work and we will compare it later on
with the derivative-based matcher we are going to present.
There is also some newer work called
Verbatim++ \cite{Verbatimpp}, which does not use derivatives,
but deterministic finite automata instead.
%An example that gives problem to automaton approaches would be
%the regular expression $(a|b)^*a(a|b)^{\{n\}}$.
%It requires at least $2^{n+1}$ states to represent
%as a DFA.


Bounded repetitions are very important because they
tend to occur frequently in practical use,
for example in the regex library RegExLib, in
the rules library of Snort \cite{Snort1999}\footnote{
Snort is a network intrusion detection (NID) tool
for monitoring network traffic.
The network security community curates a list
of malicious patterns written as regexes,
which is used by Snort's detection engine
to match against network traffic for any hostile
activities such as buffer overflow attacks.},
as well as in XML Schema definitions (XSDs).
According to Bj\"{o}rklund et al \cite{xml2015},
more than half of the
XSDs they found have bounded regular expressions in them.
Often the counters are quite large, the largest being up to ten million.
An example XSD they gave
was:
%\begin{verbatim}
%<sequence minOccurs="0" maxOccurs="65535">
% <element name="TimeIncr" type="mpeg7:MediaIncrDurationType"/>
% <element name="MotionParams" type="float" minOccurs="2" maxOccurs="12"/>
%</sequence>
%\end{verbatim}
This can be seen as the expression
$(ab^{\{2\ldots 12\}})^{\{0 \ldots 65535\}}$, where $a$ and $b$ are themselves
regular expressions
satisfying certain constraints (such as
satisfying the floating point number format).


It is therefore quite unsatisfying that
some regular expression matching libraries
impose ad hoc limits
on bounded regular expressions:
for example, in the regular expression matching library in the Go
language the regular expression $a^{\{1001\}}$ is not permitted, because no counter
can be above 1000, and in the built-in Rust regular expression library
expressions such as $a^{\{1000\}\{100\}\{5\}}$ give an error message
for being too big.
As Becchi and Crawley \cite{Becchi08} have pointed out,
the reason for these restrictions
is that these libraries simulate a non-deterministic finite
automaton (NFA) with a breadth-first search.
This way the number of active states could
be equal to the counter number.
When the counters are large,
the memory requirement could become
infeasible, and the pattern needs to be rejected straight away.
\begin{figure}[H]
\begin{center}
\begin{tikzpicture} [node distance = 2cm, on grid, auto]

\node (q0) [state, initial] {$0$};
\node (q1) [state, right = of q0] {$1$};
\node (q2) [state, right = of q1] {$2$};
\node (qdots) [right = of q2] {$\ldots$};
\node (qn) [state, right = of qdots] {$n$};
\node (qn1) [state, right = of qn] {$n+1$};
\node (qn2) [state, right = of qn1] {$n+2$};
\node (qn3) [state, accepting, right = of qn2] {$n+3$};

\path [-stealth, thick]
(q0) edge [loop above] node {a} ()
(q0) edge node {a} (q1)
(q1) edge node {.} (q2)
(q2) edge node {.} (qdots)
(qdots) edge node {.} (qn)
(qn) edge node {.} (qn1)
(qn1) edge node {b} (qn2)
(qn2) edge node {$c$} (qn3);
\end{tikzpicture}
%\begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto]
% \node[state,initial] (q_0) {$0$};
% \node[state, ] (q_1) [right=of q_0] {$1$};
% \node[state, ] (q_2) [right=of q_1] {$2$};
% \node[state,
% \node[state, accepting, ](q_3) [right=of q_2] {$3$};
% \path[->]
% (q_0) edge node {a} (q_1)
% edge [loop below] node {a,b} ()
% (q_1) edge node {a,b} (q_2)
% (q_2) edge node {a,b} (q_3);
%\end{tikzpicture}
\end{center}
\caption{The example given by Becchi and Crawley
that NFA simulation can consume large
amounts of memory: $.^*a.^{\{n\}}bc$ matching
strings of the form $aaa\ldots aaaabc$.
When traversing in a breadth-first manner,
all states from 0 up to $n+1$ will become active.}
\end{figure}
%Languages like $\mathit{Go}$ and $\mathit{Rust}$ use this
%type of $\mathit{NFA}$ simulation and guarantee a linear runtime
%in terms of input string length.
%TODO:try out these lexers
These problems can of course be solved in matching algorithms where
automata go beyond the classic notion and for instance include explicit
counters \cite{Turo_ov__2020}.
These solutions can be quite effective,
with the ability to process
gigabytes of string input per second
even with large counters \cite{Becchi08}.
But formally reasoning about these automata can be challenging
and unintuitive.
Therefore, correctness and runtime claims made about these solutions need to be
taken with a grain of salt.

In the work reported in \cite{CSL2022} and here,
we add better support using derivatives
for bounded regular expressions $r^{\{n\}}$.
The results
extend straightforwardly to
repetitions with an interval such as
$r^{\{n\ldots m\}}$.
The merit of Brzozowski derivatives (more on this later)
for this problem is that
they can be naturally extended to support bounded repetitions.
Moreover these extensions are still made up of only
inductive datatypes and recursive functions,
making them handy to deal with in theorem provers.
%The point here is that Brzozowski derivatives and the algorithms by Sulzmann and Lu can be
%straightforwardly extended to deal with bounded regular expressions
%and moreover the resulting code still consists of only simple
%recursive functions and inductive datatypes.
Finally, bounded regular expressions do not destroy our finite
boundedness property, which we shall prove later on.



\subsection{Back-References}
The other way to simulate an $\mathit{NFA}$ for matching is to choose
a single transition each time, keeping all the other options in
a queue or stack, and backtracking if that choice eventually
fails. This method, often called a ``depth-first search'',
is efficient in many cases, but can end up
with exponential run time.
The backtracking method is employed in regex libraries
that support \emph{back-references}, for example
in Java and Python.
%\section{Back-references and The Terminology Regex}

%When one constructs an $\NFA$ out of a regular expression
%there is often very little to be done in the first phase, one simply
%constructs the $\NFA$ states based on the structure of the input regular expression.

%In the lexing phase, one can simulate the $\mathit{NFA}$ running in two ways:
%one by keeping track of all active states after consuming
%a character, and update that set of states iteratively.
%This can be viewed as a breadth-first-search of the $\mathit{NFA}$
%for a path terminating
%at an accepting state.



Given a regular expression like this (the sequence
operator is omitted for brevity):
\begin{center}
	$r_1r_2r_3r_4$
\end{center}
one could label sub-expressions of interest
by parenthesizing them and giving
them a number in the order in which their opening parentheses appear.
One possible way of parenthesizing and labelling is given below:
\begin{center}
	$\underset{1}{(}r_1\underset{2}{(}r_2\underset{3}{(}r_3)\underset{4}{(}r_4)))$
\end{center}
The sub-expressions
$r_1r_2r_3r_4$, $r_2r_3r_4$, $r_3$ and $r_4$ are labelled
by 1 to 4, and can be ``referred back'' to by their respective numbers.
%These sub-expressions are called "capturing groups".
To do so, we use the syntax $\backslash i$
to denote that we want the sub-string
of the input just matched by the $i$-th
sub-expression to appear again,
exactly the same as it first appeared:
\begin{center}
$\ldots\underset{\text{i-th lparen}}{(}{r_i})\ldots
\underset{s_i \text{ which just matched} \;r_i}{\backslash i}$
\end{center}
%The backslash and number $i$ are the
%so-called "back-references".
%Let $e$ be an expression made of regular expressions
%and back-references. $e$ contains the expression $e_i$
%as its $i$-th capturing group.
%The semantics of back-reference can be recursively
%written as:
%\begin{center}
%	\begin{tabular}{c}
%		$L ( e \cdot \backslash i) = \{s @ s_i \mid s \in L (e)\quad s_i \in L(r_i)$\\
%		$s_i\; \text{match of ($e$, $s$)'s $i$-th capturing group string}\}$
%	\end{tabular}
%\end{center}
A concrete example
of back-references would be
\begin{center}
$(.^*)\backslash 1$,
\end{center}
which would match
strings that can be split into two identical halves,
for example $\mathit{foofoo}$, $\mathit{ww}$, etc.
Note that this is different from
repeating the sub-expression verbatim like
\begin{center}
$(.^*)(.^*)$,
\end{center}
which does not impose any restrictions on what strings the second
sub-expression $.^*$
might match.
Another example of back-references would be
\begin{center}
$(.)(.)\backslash 2\backslash 1$
\end{center}
which expresses four-character palindromes
like $abba$, $x??x$, etc.

Back-references are a regex construct
that programmers find quite useful.
According to Becchi and Crawley \cite{Becchi08},
6\% of Snort rules (up until 2008) make use of them.
A common use of back-references
is expressing well-formed HTML files,
where back-references are handy for expressing
a pair of opening and closing tags like
\begin{center}
$\langle html \rangle \ldots \langle / html \rangle$
\end{center}
A regex describing such a format
could be
\begin{center}
$\langle (.^+) \rangle \ldots \langle / \backslash 1 \rangle$
\end{center}
Despite being useful, back-references take the syntax and expressive power
of regexes beyond the regular language hierarchy.
In fact, they allow the regex construct to express
languages that cannot be contained in context-free
languages either.
For example, the back-reference expression $(a^*)b\backslash 1 b \backslash 1$
expresses the language $\{a^n b a^n b a^n\mid n \in \mathbb{N}\}$,
which cannot be expressed by context-free grammars \parencite{campeanu2003formal}.
Such a language is contained in the context-sensitive hierarchy
of formal languages.
Solving the matching problem for back-reference expressions
is known to be NP-complete \parencite{alfred2014algorithms}.
A non-backtracking,
efficient solution is not known to exist.
Regex libraries supporting back-references such as
PCRE \cite{pcre} therefore have to
revert to a depth-first search algorithm which backtracks.
What is unexpected is that even in the cases
not involving back-references, there is still
a (non-negligible) chance they might backtrack super-linearly,
as shown in the graphs in figure~\ref{fig:aStarStarb}.

\subsection{Summary of the Catastrophic Backtracking Problem}
Summing these up, we can categorise existing
practical regex libraries into two kinds:
(i) the ones with linear
time guarantees like Go and Rust; the cost with them is that
they impose restrictions
on the user input (not allowing back-references,
bounded repetitions cannot exceed a counter limit, etc.);
(ii) those
that allow large bounded regular expressions and back-references
at the expense of using a backtracking algorithm.
The latter could grind to a halt
on some very simple cases, opening up a vulnerability to
ReDoS attacks.


We would like to have regex engines that can
deal with the regular part (e.g.\
bounded repetitions) of regexes more
efficiently.
Also we want to make sure that they do it correctly.
It turns out that such an aim is not so easy to achieve.
%TODO: give examples such as RE2 GOLANG 1000 restriction, rust no repetitions
% For example, the Rust regex engine claims to be linear,
% but does not support lookarounds and back-references.
% The GoLang regex library does not support over 1000 repetitions.
% Java and Python both support back-references, but show
%catastrophic backtracking behaviours on inputs without back-references(
%when the language is still regular).
%TODO: test performance of Rust on (((((a*a*)b*)b){20})*)c baabaabababaabaaaaaaaaababaaaababababaaaabaaabaaaaaabaabaabababaababaaaaaaaaababaaaababababaaaaaaaaaaaaac
%TODO: verify the fact Rust does not allow 1000+ reps



%The time cost of regex matching algorithms in general
%involve two different phases, and different things can go differently wrong on
%these phases.
%$\DFA$s usually have problems in the first (construction) phase
%, whereas $\NFA$s usually run into trouble
%on the second phase.

\section{Error-prone POSIX Implementations}
When there are multiple ways of matching a string
with a regular expression, a matcher needs to
disambiguate.
The standard for which particular match to pick
is called the disambiguation strategy.
The more intuitive strategy is called POSIX,
which always chooses the longest initial match.
An alternative strategy is greedy matching,
which always ends a sub-match as early as possible.
The POSIX standard is widely adopted in many operating systems.
However, many implementations (including the C libraries
used by Linux and OS X distributions) contain bugs
or do not meet the specification they claim to adhere to.
In some cases, they either fail to generate a lexing
result when there exists a match,
or give results that are inconsistent with the $\POSIX$ standard.
A concrete example is the regex given by \cite{fowler2003}
\begin{center}
	$(aba + ab + a)^*$ and the string $ababa$.
\end{center}
The correct $\POSIX$ match for the above would be
the entire string $ababa$,
split into two Kleene star iterations, $[ab] [aba]$ at positions
$[0, 2), [2, 5)$
respectively.
But trying this out in regex101 \parencite{regex101}
with different language engines would yield
the same two fragmented matches: $[aba]$ at $[0, 3)$
and $a$ at $[4, 5)$.
Fowler \cite{fowler2003} and Kuklewicz \cite{KuklewiczHaskell}
commented that most regex libraries are not
correctly implementing the POSIX (maximum-munch)
rule of regular expression matching.
As Grathwohl \parencite{grathwohl2014crash} wrote,
\begin{quote}
	``The POSIX strategy is more complicated than the
	greedy because of the dependence on information about
	the length of matched strings in the various subexpressions.''
\end{quote}
%\noindent
The implementation complexity of POSIX rules also comes from
the specification not being very clear.
There are many informal summaries of this disambiguation
strategy, which are often quite long and delicate.
For example Kuklewicz \cite{KuklewiczHaskell}
described the POSIX rule as
\begin{quote}
	``
	\begin{itemize}
		\item
regular expressions (REs) take the leftmost starting match, and the longest match starting there;\\
		\item
earlier subpatterns have leftmost-longest priority over later subpatterns;\\
		\item
higher-level subpatterns have leftmost-longest priority over their component subpatterns;\\
		\item
REs have right associative concatenation which can be changed with parenthesis;\\
		\item
parenthesized subexpressions return the match from their last usage;\\
		\item
text of component subexpressions must be contained in the text of the
higher-level subexpressions;\\
		\item
if ``p'' and ``q'' can never match the same text then ``p|q'' and ``q|p'' are equivalent, up to trivial renumbering of captured subexpressions;\\
		\item
if ``p'' in ``p*'' is used to capture non-empty text then additional repetitions of ``p'' will not capture an empty string.''\\
	\end{itemize}
\end{quote}
The text above
is trying to capture something very precise,
and is crying out for formalising.
Ausaf et al.\ \cite{AusafDyckhoffUrban2016}
were the first to fill the gap,
not just by describing such a formalised POSIX
specification in Isabelle/HOL, but also by proving
that their specification coincides with the
POSIX specification given by Okui and Suzuki \cite{Okui10},
which is a completely
different characterisation.
They then formally proved the correctness of
a lexing algorithm by Sulzmann and Lu \cite{Sulzmann2014}
based on that specification.

In the next section we will very briefly
introduce Brzozowski derivatives and Sulzmann
and Lu's algorithm, which this thesis builds on.
We give a taste of what they
are like and why they are suitable for regular expression
matching and lexing.

\section{Our Solution--Formal Specification of POSIX Matching
and Brzozowski Derivatives}
Now we start with the central topic of the thesis: Brzozowski derivatives.
Brzozowski \cite{Brzozowski1964} first introduced the
concept of the \emph{derivative} in the 1960s.
The derivative of a regular expression $r$
with respect to a character $c$ is written as $r \backslash c$.\footnote{
	Despite having the same name, regular expression
	derivatives bear little similarity with the mathematical definition
	of derivatives on functions.
}
It tells us what $r$ would transform into
if we chop off the first character $c$
from all strings in the language of $r$ ($L \; r$).
To give a flavour of Brzozowski derivatives, we present
two straightforward clauses from the definition:
\begin{center}
	\begin{tabular}{lcl}
		$d \backslash c$     & $\dn$ &
		$\mathit{if} \;c = d\;\mathit{then}\;\ONE\;\mathit{else}\;\ZERO$\\
		$(r_1 + r_2)\backslash c$     & $\dn$ & $r_1 \backslash c \,+\, r_2 \backslash c$\\
	\end{tabular}
\end{center}
\noindent
The first clause says that for the regular expression
denoting a singleton set consisting of a single-character string $\{ d \}$,
we check the derivative character $c$ against $d$,
returning a set containing only the empty string $\{ [] \}$
if $c$ and $d$ are equal, and the empty set $\varnothing$ otherwise.
The second clause states that to obtain the regular expression
representing all strings of $r_1 + r_2$ with their head character $c$
chopped off, one simply needs to take the derivatives
of $r_1$ and $r_2$ recursively and then put them together.

Thanks to the definition, derivatives have the nice property
that $s \in L \; (r\backslash c)$ if and only if
$c::s \in L \; r$.
%This property can be used on regular expressions
%matching and lexing--to test whether a string $s$ is in $L \; r$,
%one simply takes derivatives of $r$ successively with
%respect to the characters (in the correct order) in $s$,
%and then test whether the empty string is in the last regular expression.
Derivatives give a simple solution
to the problem of matching and lexing a string $s$ with a regular
expression $r$: if the derivative of $r$ w.r.t.\ (in
succession) all the characters of the string matches the empty string,
then $r$ matches $s$ (and {\em vice versa}).
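
As an illustration, this matcher can be written down in a few lines of
Scala over the datatype sketched earlier; this is only a sketch of the
textbook algorithm, not the exact code formalised in this thesis:
\begin{lstlisting}[style=myScalastyle]
// nullable(r) tests whether r matches the empty string
def nullable(r: Rexp): Boolean = r match {
  case ZERO => false
  case ONE => true
  case CHAR(_) => false
  case ALT(r1, r2) => nullable(r1) || nullable(r2)
  case SEQ(r1, r2) => nullable(r1) && nullable(r2)
  case STAR(_) => true
}

// der(c, r) computes the Brzozowski derivative r\c
def der(c: Char, r: Rexp): Rexp = r match {
  case ZERO => ZERO
  case ONE => ZERO
  case CHAR(d) => if (c == d) ONE else ZERO
  case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))
  case SEQ(r1, r2) =>
    if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
    else SEQ(der(c, r1), r2)
  case STAR(r1) => SEQ(der(c, r1), STAR(r1))
}

// matching: take derivatives w.r.t. all characters, then test nullability
def matches(r: Rexp, s: List[Char]): Boolean = s match {
  case Nil => nullable(r)
  case c :: cs => matches(der(c, r), cs)
}
\end{lstlisting}
For example, \texttt{matches(SEQ(CHAR('a'), STAR(CHAR('b'))), "abb".toList)}
evaluates to \texttt{true}.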

This makes formal reasoning about these properties such
as correctness and complexity smooth and intuitive.
In fact, there have already been several mechanised proofs about them,
for example the one by Owens and Slind \cite{Owens2008} in HOL4,
another one by Krauss and Nipkow \cite{Nipkow98} in Isabelle/HOL, and
yet another in Coq by Coquand and Siles \cite{Coquand2012}.

In addition, one can extend the clauses to bounded repetitions
``for free'':
\begin{center}
	\begin{tabular}{lcl}
		$r^{\{n\}} \backslash c$     & $\dn$ & $r \backslash c \cdot
		r^{\{n-1\}}$\\
	\end{tabular}
\end{center}
\noindent
And experimental results suggest that, unlike DFA-based solutions,
derivatives can support
bounded regular expressions with large counters
quite well.
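
In the Scala sketch from earlier, this extension amounts to one extra
constructor and two extra clauses; note that treating $r^{\{0\}}$ like
$\ONE$ is our assumption here, since the clause above only covers $n > 0$:
\begin{lstlisting}[style=myScalastyle]
// bounded repetition r{n} as one extra constructor
case class NTIMES(r: Rexp, n: Int) extends Rexp

// extra clause for nullable:
//   case NTIMES(r1, n) => if (n == 0) true else nullable(r1)
// extra clause for der, mirroring (r{n})\c = (r\c) . r{n-1}:
//   case NTIMES(r1, n) =>
//     if (n == 0) ZERO else SEQ(der(c, r1), NTIMES(r1, n - 1))
\end{lstlisting}
The point is that no automaton needs to be unfolded: the counter is
simply carried around and decremented inside the data structure.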

There have also been
extensions to other constructs.
For example, Owens et al.\ include the derivatives
for \emph{NOT} regular expressions, which are
able to concisely express C-style comments of the form
$/* \ldots */$.
Another extension of derivatives is to
regular expressions with look-aheads, done
by Miyazaki and Minamide
\cite{Takayuki2019}.
%We therefore use Brzozowski derivatives on regular expressions
%lexing



Given the above definitions and properties of
Brzozowski derivatives, one quickly realises their potential
in generating a formally verified algorithm for lexing--the clauses and property
can be easily expressed in a functional programming language
or converted to theorem prover
code, with great extensibility.
Perhaps this is the reason why derivatives have sparked quite a bit of interest
in the functional programming and theorem prover communities in the last
fifteen or so years (
\cite{Almeidaetal10}, \cite{Berglund14}, \cite{Berglund18},
\cite{Chen12} and \cite{Coquand2012}
to name a few), despite being buried in the ``sands of time'' \cite{Owens2008}
after they were first published.


However, there are two difficulties with derivative-based matchers:
First, Brzozowski's original matcher only generates a yes/no answer
for whether a regular expression matches a string or not. This is too
little information in the context of lexing where separate tokens must
be identified and also classified (for example as keywords
or identifiers).
Second, derivative-based matchers need to be more efficient.
Elegant and beautiful
as many implementations are,
they can be excruciatingly slow.
For example, Sulzmann and Lu
claim a linear running time of their proposed algorithm,
but that was falsified by our experiments. The running time
is actually $\Omega(2^n)$ in the worst case.
A similar claim about a theoretical runtime of $O(n^2)$
is made for the Verbatim \cite{Verbatim}
%TODO: give references
lexer, which calculates POSIX matches and is based on derivatives.
They formalized the correctness of the lexer, but not the complexity.
In the performance evaluation section, they simply analyzed the run time
of matching $a$ with the string
\begin{center}
	$\underbrace{a \ldots a}_{\text{n a's}}$
\end{center}
and concluded that the algorithm is quadratic in terms of input length.
When we tried out their extracted OCaml code with our example $(a+aa)^*$,
the time it took to lex only 40 $a$'s was 5 minutes.


\subsection{Sulzmann and Lu's Algorithm}
Sulzmann and Lu~\cite{Sulzmann2014} overcame the first
difficulty by cleverly extending Brzozowski's matching
algorithm. Their extended version generates additional information on
\emph{how} a regular expression matches a string following the POSIX
rules for regular expression matching. They achieve this by adding a
second ``phase'' to Brzozowski's algorithm involving an injection
function that builds up the matching information.
In an earlier work, Ausaf et al.\ provided the formal
specification of what POSIX matching means and proved in Isabelle/HOL
the correctness
of Sulzmann and Lu's extended algorithm accordingly
\cite{AusafDyckhoffUrban2016}.

The version of the algorithm proven correct
suffers from the
second difficulty though, as the internal derivatives can
grow to arbitrarily big sizes.
For example if we start with the
regular expression $(a+aa)^*$ and take
successive derivatives according to the character $a$, we end up with
a sequence of ever-growing derivatives like

\def\ll{\stackrel{\_\backslash{} a}{\longrightarrow}}
\begin{center}
\begin{tabular}{rll}
$(a + aa)^*$ & $\ll$ & $(\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & $(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* \;+\; (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & $(\ZERO + \ZERO{}a + \ZERO) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^* \;+\; $\\
& & $\qquad(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & \ldots \hspace{15mm}(regular expressions of sizes 98, 169, 283, 468, 767, \ldots)
\end{tabular}
\end{center}

\noindent where after around 35 steps we run out of memory on a
typical computer (we shall define in the next chapter
the precise details of our
regular expressions and the derivative operation). Clearly, the
notation involving $\ZERO$s and $\ONE$s already suggests
simplification rules that can be applied to regular
expressions, for example $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r
\Rightarrow r$, $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow
r$. While such simple-minded simplifications have been proved in our
earlier work to preserve the correctness of Sulzmann and Lu's
algorithm \cite{AusafDyckhoffUrban2016}, they unfortunately do
\emph{not} help with limiting the growth of the derivatives shown
above: the growth is slowed, but the derivatives can still grow rather
quickly beyond any finite bound.
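
To make these rules concrete, here is a minimal sketch of such
simple-minded simplification over the Scala datatype from before (the
function name \texttt{simp} is our choice; this is an illustration, not
the more aggressive simplification formalised later in this thesis):
\begin{lstlisting}[style=myScalastyle]
// simple-minded simplification: 0.r => 0, 1.r => r, 0+r => r, r+r => r
def simp(r: Rexp): Rexp = r match {
  case SEQ(r1, r2) => (simp(r1), simp(r2)) match {
    case (ZERO, _) => ZERO
    case (ONE, s2) => s2
    case (s1, s2)  => SEQ(s1, s2)
  }
  case ALT(r1, r2) => (simp(r1), simp(r2)) match {
    case (ZERO, s2) => s2
    case (s1, s2)   => if (s1 == s2) s1 else ALT(s1, s2)
  }
  case r => r
}
\end{lstlisting}
Interleaving \texttt{simp} with the derivative steps, that is using
\texttt{simp(der(c, r))} at each step, slows down the growth in the
$(a+aa)^*$ example above, but, as just stated, does not bound it.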

Sulzmann and Lu overcome this ``growth problem'' in a second algorithm
\cite{Sulzmann2014} where they introduce bit-coded
regular expressions. In this version, POSIX values are
represented as bit sequences and such sequences are incrementally generated
when derivatives are calculated. The compact representation
of bit sequences and regular expressions allows them to define a more
``aggressive'' simplification method that keeps the size of the
derivatives finite no matter what the length of the string is.
They make some informal claims about the correctness and linear behaviour
of this version, but do not provide any supporting proof arguments, not
even ``pencil-and-paper'' arguments. They write about their bit-coded
\emph{incremental parsing method} (that is the algorithm to be formalised
in this dissertation)



\begin{quote}\it
  ``Correctness Claim: We further claim that the incremental parsing
  method [..] in combination with the simplification steps [..]
  yields POSIX parse trees. We have tested this claim
  extensively [..] but yet
  have to work out all proof details.'' \cite[Page 14]{Sulzmann2014}
\end{quote}
Ausaf and Urban were able to back this correctness claim with
a formal proof.

However a faster formally verified
lexing program with the optimisations
of Sulzmann and Lu's second algorithm
is still missing.
As Ausaf and Urban stated,
\begin{quote}\it
``The next step would be to implement a more aggressive simplification procedure on annotated regular expressions and then prove the corresponding algorithm generates the same values as blexer. Alas due to time constraints we are unable to do so here.''
\end{quote}
This thesis implements the aggressive simplifications envisioned
by Ausaf and Urban,
together with a formal proof of correctness in the presence of those simplifications.


%----------------------------------------------------------------------------------------
\section{Contribution}

In this thesis,
we propose a solution to catastrophic
backtracking and error-prone matchers: a formally verified
regular expression lexing algorithm
that is both fast
and correct, obtained by extending Ausaf et al.'s work.
The end result is %a regular expression lexing algorithm that comes with
\begin{itemize}
\item
an improved version of Sulzmann and Lu's bit-coded algorithm using
derivatives with simplifications,
accompanied by
a proven correctness theorem according to the POSIX specification
given by Ausaf et al.\ \cite{AusafDyckhoffUrban2016},
\item
a complexity-related property for that algorithm saying that the
internal data structure will
remain finite,
\item
and an extension to
the bounded repetition construct with the correctness and finiteness properties
maintained.
\end{itemize}

With a formal finiteness bound in place,
we can greatly reduce the attack surface of servers in terms of ReDoS attacks.
Further improvements to the algorithm with an even stronger version of
simplification are made.
Thanks to our theorem-prover-friendly approach,
we believe that
this finiteness bound can be improved to a bound
linear in the input and
cubic in the regular expression size using a technique by
Antimirov \cite{Antimirov95}.
Once formalised, this would be a guarantee for the absence of all super-linear behaviours.
We are working out the
details.


To the best of our knowledge, no lexing libraries using Brzozowski derivatives
have similar complexity-related bounds,
and claims about running time are usually speculative and backed only by empirical
evidence on a few test cases.
If a matching or lexing algorithm
does not come with certain basic complexity-related
guarantees (for example that the internal data structure size
does not grow indefinitely),
then one cannot claim with confidence to have solved the problem
of catastrophic backtracking.



\section{Structure of the thesis}
In chapter~\ref{Inj} we will introduce the concepts
and notations we
use for describing the lexing algorithm by Sulzmann and Lu,
and then give the lexing algorithm.
We will give its bit-coded variant in chapter~\ref{Bitcoded1}.
Then we illustrate in chapter~\ref{Bitcoded2}
how the algorithm without bitcodes falls short for such aggressive
simplifications and therefore introduce our version of the
bit-coded algorithm and
its correctness proof.
In chapter~\ref{Finite} we give the second guarantee
of our bitcoded algorithm, that is a finite bound on the size of any
regex's derivatives.
In chapter~\ref{Cubic} we discuss stronger simplifications that improve the finite bound
of chapter~\ref{Finite} to a polynomial one, and demonstrate how one can extend the
algorithm to include constructs such as bounded repetitions and negations.





%----------------------------------------------------------------------------------------


%----------------------------------------------------------------------------------------

%----------------------------------------------------------------------------------------

%----------------------------------------------------------------------------------------
