% Chapter 1

\chapter{Introduction} % Main chapter title

\label{Introduction} % For referencing the chapter elsewhere, use \ref{Introduction}

%----------------------------------------------------------------------------------------

% Define some commands to keep the formatting separated from the content
\newcommand{\keyword}[1]{\textbf{#1}}
\newcommand{\tabhead}[1]{\textbf{#1}}
\newcommand{\code}[1]{\texttt{#1}}
\newcommand{\file}[1]{\texttt{\bfseries#1}}
\newcommand{\option}[1]{\texttt{\itshape#1}}

%boxes
\newcommand*{\mybox}[1]{\framebox{\strut #1}}

%\newcommand{\sflataux}[1]{\textit{sflat}\_\textit{aux} \, #1}
\newcommand\sflat[1]{\llparenthesis #1 \rrparenthesis }
\newcommand{\ASEQ}[3]{\textit{ASEQ}_{#1} \, #2 \, #3}
\newcommand{\bderssimp}[2]{#1 \backslash_{bsimps} #2}
\newcommand{\rderssimp}[2]{#1 \backslash_{rsimps} #2}
\def\derssimp{\textit{ders}\_\textit{simp}}
\def\rders{\textit{rders}}
\newcommand{\bders}[2]{#1 \backslash #2}
\newcommand{\bsimp}[1]{\textit{bsimp}(#1)}
\def\bsimps{\textit{bsimp}}
\newcommand{\rsimp}[1]{\textit{rsimp}\; #1}
\newcommand{\sflataux}[1]{\llparenthesis #1 \rrparenthesis'}
\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%
\newcommand{\denote}{\stackrel{\mbox{\scriptsize denote}}{=}}%
\newcommand{\ZERO}{\mbox{\bf 0}}
\newcommand{\ONE}{\mbox{\bf 1}}
\newcommand{\AALTS}[2]{\oplus {\scriptstyle #1}\, #2}
\newcommand{\rdistinct}[2]{\textit{rdistinct} \;\; #1 \;\; #2}
\def\rdistincts{\textit{rdistinct}}
\def\rDistinct{\textit{rdistinct}}
\newcommand\hflat[1]{\llparenthesis #1 \rrparenthesis_*}
\newcommand\hflataux[1]{\llparenthesis #1 \rrparenthesis_*'}
\newcommand\createdByStar[1]{\textit{createdByStar}(#1)}

\newcommand\myequiv{\mathrel{\stackrel{\makebox[0pt]{\mbox{\normalfont\tiny equiv}}}{=}}}

\def\SEQ{\textit{SEQ}}
\def\SEQs{\textit{SEQs}}
\def\case{\textit{case}}
\def\sequal{\stackrel{\mbox{\scriptsize rsimp}}{=}}
\def\rsimpalts{\textit{rsimp}_{ALTS}}
\def\good{\textit{good}}
\def\btrue{\textit{true}}
\def\bfalse{\textit{false}}
\def\bnullable{\textit{bnullable}}
\def\bnullables{\textit{bnullables}}
\def\Some{\textit{Some}}
\def\None{\textit{None}}
\def\code{\textit{code}}
\def\decode{\textit{decode}}
\def\internalise{\textit{internalise}}
\def\lexer{\mathit{lexer}}
\def\mkeps{\textit{mkeps}}
\newcommand{\rder}[2]{#2 \backslash_r #1}

\def\rerases{\textit{rerase}}

\def\nonnested{\textit{nonnested}}
\def\AZERO{\textit{AZERO}}
\def\sizeNregex{\textit{sizeNregex}}
\def\AONE{\textit{AONE}}
\def\ACHAR{\textit{ACHAR}}

\def\simpsulz{\textit{simp}_{Sulz}}

\def\scfrewrites{\stackrel{*}{\rightsquigarrow_{scf}}}
\def\frewrite{\rightsquigarrow_f}
\def\hrewrite{\rightsquigarrow_h}
\def\grewrite{\rightsquigarrow_g}
\def\frewrites{\stackrel{*}{\rightsquigarrow_f}}
\def\hrewrites{\stackrel{*}{\rightsquigarrow_h}}
\def\grewrites{\stackrel{*}{\rightsquigarrow_g}}
\def\fuse{\textit{fuse}}
\def\bder{\textit{bder}}
\def\der{\textit{der}}
\def\POSIX{\textit{POSIX}}
\def\ALTS{\textit{ALTS}}
\def\ASTAR{\textit{ASTAR}}
\def\DFA{\textit{DFA}}
\def\NFA{\textit{NFA}}
\def\bmkeps{\textit{bmkeps}}
\def\bmkepss{\textit{bmkepss}}
\def\retrieve{\textit{retrieve}}
\def\blexer{\textit{blexer}}
\def\flex{\textit{flex}}
\def\inj{\textit{inj}}
\def\Empty{\textit{Empty}}
\def\Left{\textit{Left}}
\def\Right{\textit{Right}}
\def\Stars{\textit{Stars}}
\def\Char{\textit{Char}}
\def\Seq{\textit{Seq}}
\def\Der{\textit{Der}}
\def\Ders{\textit{Ders}}
\def\nullable{\mathit{nullable}}
\def\Z{\mathit{Z}}
\def\S{\mathit{S}}
\def\rup{r^\uparrow}
%\def\bderssimp{\mathit{bders}\_\mathit{simp}}
\def\distinctWith{\textit{distinctWith}}
\def\lf{\textit{lf}}
\def\PD{\textit{PD}}
\def\suffix{\textit{Suffix}}
\def\distinctBy{\textit{distinctBy}}
\def\starupdate{\textit{starUpdate}}
\def\starupdates{\textit{starUpdates}}

\def\size{\mathit{size}}
\def\rexp{\mathbf{rexp}}
\def\simp{\mathit{simp}}
\def\simpALTs{\mathit{simp}\_\mathit{ALTs}}
\def\map{\mathit{map}}
\def\distinct{\mathit{distinct}}
\def\blexersimp{\mathit{blexer}\_\mathit{simp}}
\def\blexerStrong{\textit{blexerStrong}}
\def\bsimpStrong{\textit{bsimpStrong}}
\def\bdersStrongs{\textit{bdersStrong}}
\newcommand{\bdersStrong}[2]{#1 \backslash_{bsimpStrongs} #2}

\def\map{\textit{map}}
\def\rrexp{\textit{rrexp}}
\newcommand\rnullable[1]{\textit{rnullable} \; #1 }
\newcommand\rsize[1]{\llbracket #1 \rrbracket_r}
\newcommand\asize[1]{\llbracket #1 \rrbracket}
\newcommand\rerase[1]{ (#1)_{\downarrow_r}}

\newcommand\ChristianComment[1]{\textcolor{blue}{#1}\\}

\def\rflts{\textit{rflts}}
\def\rrewrite{\textit{rrewrite}}
\def\bsimpalts{\textit{bsimp}_{ALTS}}
\def\bsimpaseq{\textit{bsimp}_{ASEQ}}
\def\rsimlalts{\textit{rsimp}_{ALTs}}
\def\rsimpseq{\textit{rsimp}_{SEQ}}

\def\erase{\textit{erase}}
\def\STAR{\textit{STAR}}
\def\flts{\textit{flts}}

\def\zeroable{\textit{zeroable}}
\def\nub{\textit{nub}}
\def\filter{\textit{filter}}
%\def\not{\textit{not}}

\def\RZERO{\mathbf{0}_r }
\def\RONE{\mathbf{1}_r}
\newcommand\RCHAR[1]{\mathbf{#1}_r}
\newcommand\RSEQ[2]{#1 \cdot #2}
\newcommand\RALTS[1]{\sum #1}
\newcommand\RSTAR[1]{#1^*}
\newcommand\vsuf[2]{\textit{Suffix} \;#1\;#2}

\lstdefinestyle{myScalastyle}{
  frame=tb,
  language=scala,
  aboveskip=3mm,
  belowskip=3mm,
  showstringspaces=false,
  columns=flexible,
  basicstyle={\small\ttfamily},
  numbers=none,
  numberstyle=\tiny\color{gray},
  keywordstyle=\color{blue},
  commentstyle=\color{dkgreen},
  stringstyle=\color{mauve},
  frame=single,
  breaklines=true,
  breakatwhitespace=true,
  tabsize=3,
}

%----------------------------------------------------------------------------------------
%This part is about regular expressions, Brzozowski derivatives,
%and a bit-coded lexing algorithm with proven correctness and time bounds.

%TODO: look up snort rules to use here--give readers idea of what regexes look like

Regular expressions are widely used in computer science:
be it in text editors \parencite{atomEditor} with syntax highlighting and auto-completion;
command-line tools like $\mathit{grep}$ that facilitate easy
text-processing; network intrusion
detection systems that inspect suspicious traffic; or compiler
front ends.
Given their usefulness and ubiquity, one would assume that
modern regular expression matching implementations
are mature and fully studied.
Indeed, if one supplies a popular programming language's regex engine
with regular expressions and strings,
in most cases one gets
the matching information in a very short time.
Those matchers can be blindingly fast--some
network intrusion detection systems
use regex engines that are able to process
megabytes or even gigabytes of data per second \parencite{Turo_ov__2020}.
However, those matchers can exhibit a surprising security vulnerability
under a certain class of inputs.
%However, this is not the case for $\mathbf{all}$ inputs.
%TODO: get source for SNORT/BRO's regex matching engine/speed

Consider $(a^*)^*\,b$ and ask whether
strings of the form $aa..a$ can be matched by this regular
expression. Obviously this is not the case---the expected $b$ in the last
position is missing. One would expect that modern regular expression
matching engines can find this out very quickly. Surprisingly, if one tries
this example in JavaScript, Python or Java 8, even with small strings,
say of length around 30 $a$'s,
the decision takes an absurd amount of time to finish (see the graphs in figure~\ref{fig:aStarStarb}).
This is clearly exponential behaviour, and
is triggered by some relatively simple regular expressions.
Java 9 and newer
versions improve this behaviour somewhat, but are still slow compared
with the approach we are going to use in this thesis.
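
Such timings are easy to reproduce. Below is a small Scala test harness (our own sketch, not part of the formal development; it drives the JVM's backtracking \texttt{java.util.regex} engine, and the range of $n$ is an arbitrary choice):

\begin{lstlisting}[style=myScalastyle]
// Small test harness (a sketch of ours): measures the matching
// time of the evil pattern (a*)*b against strings of n a's.
object CatastrophicDemo {
  def time[A](block: => A): (A, Double) = {
    val start = System.nanoTime()
    val result = block
    (result, (System.nanoTime() - start) / 1e9)
  }
  def main(args: Array[String]): Unit = {
    val pattern = java.util.regex.Pattern.compile("(a*)*b")
    for (n <- 20 to 30) {
      val input = "a" * n    // aa...a with no final b
      val (matched, secs) = time(pattern.matcher(input).matches())
      println(f"n = $n%2d  matched = $matched  time = $secs%.2fs")
    }
  }
}
\end{lstlisting}

\noindent
On a Java~8 JVM each additional $a$ roughly doubles the running time, in line with the Java~8 curve in figure~\ref{fig:aStarStarb}.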

This superlinear blowup in regular expression engines
has repeatedly caused grief in ``real life'', where it has
been given the name ``catastrophic backtracking'' or ``evil'' regular expressions.
For example, on 20 July 2016 one evil
regular expression brought the webpage
\href{http://stackexchange.com}{Stack Exchange} to its
knees.\footnote{\url{https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016} (Last accessed in 2019)}
In this instance, a regular expression intended to just trim white
spaces from the beginning and the end of a line actually consumed
massive amounts of CPU resources---causing web servers to grind to a
halt. In this example, the time needed to process
the string was $O(n^2)$ with respect to the string length. This
quadratic overhead was enough for the homepage of Stack Exchange to
respond so slowly that the load balancer assumed a $\mathit{DoS}$
attack and therefore stopped the servers from responding to any
requests. This made the whole site unavailable.

\begin{figure}[p]
\begin{tabular}{@{}c@{\hspace{0mm}}c@{}}
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    ylabel={time in secs},
    enlargelimits=false,
    xtick={0,5,...,30},
    xmax=33,
    ymax=35,
    ytick={0,5,...,30},
    scaled ticks=false,
    axis lines=left,
    width=5cm,
    height=4cm,
    legend entries={JavaScript},
    legend pos=north west,
    legend cell align=left]
\addplot[red,mark=*, mark options={fill=white}] table {re-js.data};
\end{axis}
\end{tikzpicture}
&
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    %ylabel={time in secs},
    enlargelimits=false,
    xtick={0,5,...,30},
    xmax=33,
    ymax=35,
    ytick={0,5,...,30},
    scaled ticks=false,
    axis lines=left,
    width=5cm,
    height=4cm,
    legend entries={Python},
    legend pos=north west,
    legend cell align=left]
\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
\end{axis}
\end{tikzpicture}\\
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    ylabel={time in secs},
    enlargelimits=false,
    xtick={0,5,...,30},
    xmax=33,
    ymax=35,
    ytick={0,5,...,30},
    scaled ticks=false,
    axis lines=left,
    width=5cm,
    height=4cm,
    legend entries={Java 8},
    legend pos=north west,
    legend cell align=left]
\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
\end{axis}
\end{tikzpicture}
&
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    %ylabel={time in secs},
    enlargelimits=false,
    xtick={0,5,...,30},
    xmax=33,
    ymax=35,
    ytick={0,5,...,30},
    scaled ticks=false,
    axis lines=left,
    width=5cm,
    height=4cm,
    legend entries={Dart},
    legend pos=north west,
    legend cell align=left]
\addplot[green,mark=*, mark options={fill=white}] table {re-dart.data};
\end{axis}
\end{tikzpicture}\\
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    ylabel={time in secs},
    enlargelimits=false,
    xtick={0,5,...,30},
    xmax=33,
    ymax=35,
    ytick={0,5,...,30},
    scaled ticks=false,
    axis lines=left,
    width=5cm,
    height=4cm,
    legend entries={Swift},
    legend pos=north west,
    legend cell align=left]
\addplot[purple,mark=*, mark options={fill=white}] table {re-swift.data};
\end{axis}
\end{tikzpicture}
&
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    %ylabel={time in secs},
    enlargelimits=true,
    %xtick={0,5000,...,40000},
    %xmax=40000,
    %ymax=35,
    restrict x to domain*=0:40000,
    restrict y to domain*=0:35,
    %ytick={0,5,...,30},
    %scaled ticks=false,
    axis lines=left,
    width=5cm,
    height=4cm,
    legend entries={Java9+},
    legend pos=north west,
    legend cell align=left]
\addplot[orange,mark=*, mark options={fill=white}] table {re-java9.data};
\end{axis}
\end{tikzpicture}\\
\multicolumn{2}{c}{Graphs}
\end{tabular}
\caption{Graphs showing runtime for matching $(a^*)^*\,b$ with strings
of the form $\protect\underbrace{aa..a}_{n}$ in various existing regular expression libraries.
The reason for their superlinear behaviour is that they do a depth-first search
using NFAs.
If the string does not match, the regular expression matching
engine starts to explore all possibilities.
}\label{fig:aStarStarb}
\end{figure}\afterpage{\clearpage}

A more recent example is a global outage of all Cloudflare servers on 2 July
2019. A poorly written regular expression exhibited catastrophic backtracking
and exhausted the CPUs that serve HTTP traffic. Although the outage
had several causes, at the heart was a regular expression that
was used to monitor network
traffic.\footnote{\url{https://blog.cloudflare.com/details-of-the-cloudflare-outage-on-july-2-2019/} (Last accessed in 2022)}
These problems with regular expressions
are not isolated events that happen
very occasionally, but are actually widespread.
They occur so often that they have a
name: Regular-Expression-Denial-Of-Service (ReDoS)
attacks.
\citeauthor{Davis18} detected more
than 1000 evil regular expressions
in Node.js, Python core libraries, npm and pypi.
They therefore concluded that evil regular expressions
are real problems rather than ``a parlour trick''.

This work aims to address this issue
with the help of formal proofs.
We describe a lexing algorithm based
on Brzozowski derivatives with verified correctness (in
Isabelle/HOL)
and a finiteness property.
Such properties
are an important step in preventing
catastrophic backtracking once and for all.
We will give more details in the next sections
on (i) why the slow cases in figure~\ref{fig:aStarStarb}
can occur in traditional regular expression engines
and (ii) why we choose our
approach based on Brzozowski derivatives and formal proofs.


\section{Preliminaries}%Regex, and the Problems with Regex Matchers}
Regular expressions and regular expression matchers
have of course been studied for many, many years.
Theoretical results in automata theory state
that basic regular expression matching should be linear
w.r.t.\ the input.
This assumes that the regular expression
$r$ was pre-processed and turned into a
deterministic finite automaton (DFA) before matching \cite{Sakarovitch2009}.
By basic we mean textbook definitions such as the one
below, involving only regular expressions for characters, alternatives,
sequences, and Kleene stars:
\[
	r ::= c \mid r_1 + r_2 \mid r_1 \cdot r_2 \mid r^*
\]
Modern regular expression matchers used by programmers,
however,
support much richer constructs, such as bounded repetitions
and back-references.
To differentiate, we use the word \emph{regex} to refer
to those expressions with richer constructs while reserving the
term \emph{regular expression}
for the more traditional meaning in formal language theory.
We follow this convention
in this thesis.
In the future, we aim to support all the popular features of regexes,
but for this work we mainly look at basic regular expressions
and bounded repetitions.


%Most modern regex libraries
%the so-called PCRE standard (Perl Compatible Regular Expressions)
%has the back-references
Regexes come with a number of constructs
that make it more convenient for
programmers to write regular expressions.
Depending on the types of constructs,
the task of matching and lexing with them
will have different levels of complexity.
Some of those constructs are just syntactic sugar, that is,
short-hand notations
that save the programmers a few keystrokes.
These will not cause problems for regex libraries.
For example the
non-binary alternative involving three or more choices just means:
\[
	(a | b | c) \stackrel{means}{=} ((a + b)+ c)
\]
Similarly, the range operator, which expresses the alternative
of all characters between its operands, is just a concise way
of writing an alternative:
\[
	[0-9]\stackrel{means}{=} (0 | 1 | \ldots | 9 )  \; \text{(all digits)}
\]
The wildcard character $.$ refers to any single character except the newline:
\[
	. \stackrel{means}{=} [0-9a-zA-Z+-()*\&\ldots]
\]

\subsection{Bounded Repetitions}
More interesting are bounded repetitions, which can
make regular expressions much
more compact.
There are
$r^{\{n\}}$, $r^{\{\ldots m\}}$, $r^{\{n\ldots \}}$ and $r^{\{n\ldots m\}}$
(where $n$ and $m$ are constant natural numbers).
Like the star regular expression, the set of strings or language
a bounded regular expression can match
is defined using the power operation on sets:
\begin{center}
	\begin{tabular}{lcl}
		$L \; r^{\{n\}}$ & $\dn$ & $(L \; r)^n$\\
		$L \; r^{\{\ldots m\}}$ & $\dn$ & $\bigcup_{0 \leq i \leq m}. (L \; r)^i$\\
		$L \; r^{\{n\ldots \}}$ & $\dn$ & $\bigcup_{n \leq i}. (L \; r)^i$\\
		$L \; r^{\{n \ldots m\}}$ & $\dn$ & $\bigcup_{n \leq i \leq m}. (L \; r)^i$
	\end{tabular}
\end{center}
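For instance, instantiating the last clause with the concrete regular expression $a \cdot b$ and the bounds $n = 1$ and $m = 2$ (a small worked example of ours) gives
\[
	L \; (a\cdot b)^{\{1\ldots 2\}} \;=\; \bigcup_{1 \leq i \leq 2}. (L \; (a\cdot b))^i \;=\; \{ab, abab\}.
\]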

The attraction of bounded repetitions is that they can be
used to avoid a blow up: for example $r^{\{n\}}$
is a shorthand for
\[
	\underbrace{r\ldots r}_\text{n copies of r}.
\]
%Therefore, a naive algorithm that simply unfolds
%them into their desugared forms
%will suffer from at least an exponential runtime increase.

The problem with matching
is that tools based on the classic notion of
automata need to expand $r^{\{n\}}$ into $n$ connected
copies of the automaton for $r$. This leads to very inefficient matching
algorithms or algorithms that consume large amounts of memory.
Implementations using $\DFA$s will
either become excruciatingly slow
(for example Verbatim++ \cite{Verbatimpp}) or run
out of memory (for example $\mathit{LEX}$ and
$\mathit{JFLEX}$\footnote{These are lexer generators
in C and Java that generate $\mathit{DFA}$-based
lexers. The user provides a set of regular expressions
and configurations, and then
gets an output program encoding a minimized $\mathit{DFA}$
that can be compiled and run.
When given a regular expression with a large bounded repetition,
already a modest counter $n$ (a few dozen) results in a
determinised automaton
with millions of states.}) for large counters.
A classic example of this phenomenon is the regular expression $(a+b)^* a (a+b)^{\{n\}}$,
for which the minimal DFA requires at least $2^{n+1}$ states.
For example, when $n$ is equal to 2,
the corresponding $\mathit{NFA}$ looks like:
\begin{center}
\begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto]
   \node[state,initial] (q_0)   {$q_0$};
   \node[state, red] (q_1) [right=of q_0] {$q_1$};
   \node[state, red] (q_2) [right=of q_1] {$q_2$};
   \node[state, accepting, red](q_3) [right=of q_2] {$q_3$};
    \path[->]
    (q_0) edge  node {a} (q_1)
    	  edge [loop below] node {a,b} ()
    (q_1) edge  node  {a,b} (q_2)
    (q_2) edge  node  {a,b} (q_3);
\end{tikzpicture}
\end{center}
This NFA, when turned into a DFA by the subset construction,
requires at least $2^3$ states.\footnote{The
red states are ``countdown states'' which count down
the number of characters needed in addition to the current
string to make a successful match.
For example, state $q_1$ indicates a match that has
gone past the $(a|b)^*$ part of $(a|b)^*a(a|b)^{\{2\}}$,
and has just consumed the ``delimiter'' $a$ in the middle; it
needs to match 2 more iterations of $(a|b)$ to complete.
State $q_2$, on the other hand, can be viewed as a state
after $q_1$ has consumed 1 character, and just waits
for 1 more character to complete.
$q_3$ is the last state, requiring 0 more characters, and is accepting.
Depending on the suffix of the
input string up to the current read location,
the states $q_1$, $q_2$ and $q_3$
may or may
not be active, independently from each other.
A $\mathit{DFA}$ for such an $\mathit{NFA}$ would
contain at least $2^3$ non-equivalent states that cannot be merged,
because the subset construction during determinisation will generate
all the elements in the power set $\mathit{Pow}\,\{q_1, q_2, q_3\}$.
Generalizing this to regular expressions with larger
bounded repetition numbers, we have that
regexes shaped like $r^*ar^{\{n\}}$ when converted to $\mathit{DFA}$s
would require at least $2^{n+1}$ states, if $r$ itself contains
more than 1 string.
This is to represent all the different
scenarios in which ``countdown'' states are active.}

Bounded repetitions are very important because they
tend to occur often in practical use,
for example in the regex library RegExLib,
in the rule library of Snort \cite{Snort1999}\footnote{
Snort is a network intrusion detection (NID) tool
for monitoring network traffic.
The network security community curates a list
of malicious patterns written as regexes,
which is used by Snort's detection engine
to match against network traffic for any hostile
activities such as buffer overflow attacks.},
as well as in XML Schema definitions (XSDs).
According to Bj\"{o}rklund et al.\ \cite{xml2015},
more than half of the
XSDs they found on the Maven.org central repository
have bounded regular expressions in them.
Often the counters are quite large, with the largest being
approximately ten million.
An example XSD they gave
is:
\begin{verbatim}
<sequence minOccurs="0" maxOccurs="65535">
 <element name="TimeIncr" type="mpeg7:MediaIncrDurationType"/>
 <element name="MotionParams" type="float" minOccurs="2" maxOccurs="12"/>
</sequence>
\end{verbatim}
This can be seen as the expression
$(a b^{\{2\ldots 12\}})^{\{0 \ldots 65535\}}$, where $a$ and $b$ are themselves
regular expressions
satisfying certain constraints (such as
conforming to the floating point number format).
It is therefore quite unsatisfying that
some regular expression matching libraries
impose ad hoc limits
on bounded regular expressions:
for example, in the regular expression matching library of the Go
language the regular expression $a^{\{1001\}}$ is not permitted, because no counter
can be above 1000; and in the built-in Rust regular expression library
expressions such as $a^{\{1000\}\{100\}\{5\}}$ give an error message
for being too big.
As Becchi and Crawley \cite{Becchi08} have pointed out,
the reason for these restrictions
is that they simulate a non-deterministic finite
automaton (NFA) with a breadth-first search.
This way the number of active states can
become as large as the counter.
When the counters are large,
the memory requirement can become
infeasible, and a regex engine
like Go's will reject such a pattern straight away.
\begin{figure}[H]
\begin{center}
\begin{tikzpicture} [node distance = 2cm, on grid, auto]

    \node (q0) [state, initial] {$0$};
    \node (q1) [state, right = of q0] {$1$};
    %\node (q2) [state, right = of q1] {$2$};
    \node (qdots) [right = of q1] {$\ldots$};
    \node (qn) [state, right = of qdots] {$n$};
    \node (qn1) [state, right = of qn] {$n+1$};
    \node (qn2) [state, right = of qn1] {$n+2$};
    \node (qn3) [state, accepting, right = of qn2] {$n+3$};

\path [-stealth, thick]
    (q0) edge [loop above] node {a} ()
    (q0) edge node {a} (q1)
    %(q1) edge node {.} (q2)
    (q1) edge node {.} (qdots)
    (qdots) edge node {.} (qn)
    (qn) edge node {.} (qn1)
    (qn1) edge node {b} (qn2)
    (qn2) edge node {$c$} (qn3);
\end{tikzpicture}
\end{center}
\caption{The example given by Becchi and Crawley
	that NFA simulation can consume large
	amounts of memory: $.^*a.^{\{n\}}bc$ matching
	strings of the form $aaa\ldots aaaabc$.
	When traversing in a breadth-first manner,
	all states from 0 to $n+1$ become active.}
\end{figure}
%Languages like $\mathit{Go}$ and $\mathit{Rust}$ use this
%type of $\mathit{NFA}$ simulation and guarantees a linear runtime
%in terms of input string length.
%TODO:try out these lexers
These problems can of course be solved in matching algorithms where
automata go beyond the classic notion and for instance include explicit
counters \cite{Turo_ov__2020}.
These solutions can be quite efficient,
with the ability to process
gigabytes of string input per second
even with large counters \cite{Becchi08}.
But formal reasoning about these automata,
especially in Isabelle,
can be challenging and unintuitive.
Therefore, we take correctness and runtime claims made about these solutions
with a grain of salt.

In the work reported in \cite{CSL2022} and here,
we add better support using derivatives
for bounded regular expressions $r^{\{n\}}$.
The results
extend straightforwardly to
repetitions with an interval such as
$r^{\{n\ldots m\}}$.
The merit of Brzozowski derivatives (more on this later)
on this problem is that
they can be naturally extended to support bounded repetitions.
Moreover, these extensions are still made up of only
inductive datatypes and recursive functions,
making them handy to deal with in theorem provers.
%The point here is that Brzozowski derivatives and the algorithms by Sulzmann and Lu can be
%straightforwardly extended to deal with bounded regular expressions
%and moreover the resulting code still consists of only simple
%recursive functions and inductive datatypes.
Finally, bounded regular expressions do not destroy our finite
boundedness property, which we shall prove later on.




\subsection{Back-References}
The other way to simulate an $\mathit{NFA}$ for matching is to choose
a single transition each time, keeping all the other options in
a queue or stack, and backtracking if that choice eventually
fails. This method, often called a ``depth-first search'',
is efficient in many cases, but can end up
with exponential run time.
The backtracking method is employed in regex libraries
that support \emph{back-references}, for example
in Java and Python.
%\section{Back-references and The Terminology Regex}

%When one constructs an $\NFA$ out of a regular expression
%there is often very little to be done in the first phase, one simply
%construct the $\NFA$ states based on the structure of the input regular expression.

%In the lexing phase, one can simulate the $\mathit{NFA}$ running in two ways:
%one by keeping track of all active states after consuming
%a character, and update that set of states iteratively.
%This can be viewed as a breadth-first-search of the $\mathit{NFA}$
%for a path terminating
%at an accepting state.

Given a regular expression like this (the sequence
operator is omitted for brevity):
\begin{center}
	$r_1r_2r_3r_4$
\end{center}
one could label the sub-expressions of interest
by parenthesizing them and giving
them a number in the order in which their opening parentheses appear.
One possible way of parenthesizing and labelling is given below:
\begin{center}
	$\underset{1}{(}r_1\underset{2}{(}r_2\underset{3}{(}r_3)\underset{4}{(}r_4)))$
\end{center}
The sub-expressions
$r_1r_2r_3r_4$, $r_1r_2r_3$, $r_3$ and $r_4$ are labelled
by 1 to 4, and can be ``referred back'' to by their respective numbers.
%These sub-expressions are called "capturing groups".
To do so, we use the syntax $\backslash i$
to denote that we want the sub-string
of the input just matched by the $i$-th
sub-expression to appear again,
exactly the same as it first appeared:
\begin{center}
$\ldots\underset{\text{i-th lparen}}{(}{r_i})\ldots
\underset{s_i \text{ which just matched} \;r_i}{\backslash i}$
\end{center}
%The backslash and number $i$ are the
%so-called "back-references".
%Let $e$ be an expression made of regular expressions
%and back-references. $e$ contains the expression $e_i$
%as its $i$-th capturing group.
%The semantics of back-reference can be recursively
%written as:
%\begin{center}
%	\begin{tabular}{c}
%		$L ( e \cdot \backslash i) = \{s @ s_i \mid s \in L (e)\quad s_i \in L(r_i)$\\
%		$s_i\; \text{match of ($e$, $s$)'s $i$-th capturing group string}\}$
%	\end{tabular}
%\end{center}
A concrete example
of back-references is
\begin{center}
	$(.^*)\backslash 1$,
\end{center}
which matches
strings that can be split into two identical halves,
for example $\mathit{foofoo}$, $\mathit{ww}$ and so on.
Note that this is different from
repeating the sub-expression verbatim like
\begin{center}
	$(.^*)(.^*)$,
\end{center}
which does not impose any restrictions on what strings the second
sub-expression $.^*$
might match.
Another example of back-references is
\begin{center}
	$(.)(.)\backslash 2\backslash 1$
\end{center}
which matches four-character palindromes
like $abba$, $x??x$ and so on.
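
Both behaviours are easy to check concretely. The following Scala snippet (a small illustration of ours; it relies on \texttt{Regex.matches} from Scala~2.13, which delegates to the JVM's backtracking engine) tries out the two expressions above:

\begin{lstlisting}[style=myScalastyle]
// Illustration only: back-references via the JVM regex engine.
object BackrefDemo {
  def main(args: Array[String]): Unit = {
    val half  = "(.*)\\1".r         // (.*)\1  -- two identical halves
    val palin = "(.)(.)\\2\\1".r    // (.)(.)\2\1 -- 4-char palindromes
    println(half.matches("foofoo")) // true
    println(half.matches("foobar")) // false
    println(palin.matches("abba"))  // true
    println(palin.matches("abcd"))  // false
  }
}
\end{lstlisting}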

Back-references are a regex construct
that programmers find quite useful.
According to Becchi and Crawley \cite{Becchi08},
6\% of Snort rules (up until 2008) use them.
The most common use of back-references
is to express well-formed HTML files,
where back-references are convenient for matching
opening and closing tags like
\begin{center}
	$\langle html \rangle \ldots \langle / html \rangle$
\end{center}
A regex describing such a format
is
\begin{center}
	$\langle (.^+) \rangle \ldots \langle / \backslash 1 \rangle$
\end{center}
Despite being useful, the expressive power of regexes
goes beyond the regular language hierarchy
once back-references are included.
In fact, they allow regexes to express
languages that cannot be contained in context-free
languages either.
For example, the back-reference $(a^*)b\backslash1 b \backslash 1$
expresses the language $\{a^n b a^n b a^n\mid n \in \mathbb{N}\}$,
which cannot be expressed by context-free grammars \parencite{campeanu2003formal}.
Such a language is contained in the context-sensitive hierarchy
of formal languages.
Also, solving the matching problem involving back-references
is known to be NP-complete \parencite{alfred2014algorithms}.
Regex libraries supporting back-references such as
PCRE \cite{pcre} therefore have to
revert to a depth-first search algorithm which backtracks.
What is unexpected is that even in cases
not involving back-references, there is still
a (non-negligible) chance they might backtrack super-linearly,
as shown in the graphs in figure~\ref{fig:aStarStarb}.

\subsection{Summary of the Catastrophic Backtracking Problem}
Summing these up, we can categorise existing
practical regex libraries into two kinds:
(i) the ones with linear
time guarantees like Go and Rust; the cost with them is that
they impose restrictions
on the user input (not allowing back-references,
bounded repetitions cannot exceed a counter limit, etc.);
(ii) those
that allow large bounded regular expressions and back-references
at the expense of using a backtracking algorithm.
The latter can grind to a halt
on some very simple cases, posing the vulnerability of
a ReDoS attack.


We would like to have regex engines that can
deal with the regular part (e.g.\
bounded repetitions) of regexes more
efficiently.
Also we want to make sure that they do it correctly.
It turns out that such an aim is not so easy to achieve.
%TODO: give examples such as RE2 GOLANG 1000 restriction, rust no repetitions
% For example, the Rust regex engine claims to be linear,
% but does not support lookarounds and back-references.
% The GoLang regex library does not support over 1000 repetitions.
% Java and Python both support back-references, but shows
%catastrophic backtracking behaviours on inputs without back-references(
%when the language is still regular).
%TODO: test performance of Rust on (((((a*a*)b*)b){20})*)c  baabaabababaabaaaaaaaaababaaaababababaaaabaaabaaaaaabaabaabababaababaaaaaaaaababaaaababababaaaaaaaaaaaaac
%TODO: verify the fact Rust does not allow 1000+ reps


%The time cost of regex matching algorithms in general
%involve two different phases, and different things can go differently wrong on
%these phases.
%$\DFA$s usually have problems in the first (construction) phase
%, whereas $\NFA$s usually run into trouble
%on the second phase.

\section{Error-prone POSIX Implementations}
When there are multiple ways of matching a string
with a regular expression, a matcher needs to
disambiguate.
The standard for which particular match to pick
is called the disambiguation strategy.
The more intuitive strategy is called POSIX,
which always chooses the longest initial match.
An alternative strategy is greedy matching,
which always ends a sub-match as early as possible.
The POSIX standard is widely adopted in many operating systems.
However, many implementations (including the C libraries
used by Linux and OS X distributions) contain bugs
or do not meet the specification they claim to adhere to.
In some cases, they either fail to generate a lexing
result when there exists a match,
or give results that are inconsistent with the $\POSIX$ standard.
A concrete example is the regex given by \cite{fowler2003}:
\begin{center}
	$(aba + ab + a)^*$ \quad matched against the string \quad $ababa$.
\end{center}
The correct $\POSIX$ match for the above is
the entire string $ababa$,
split into two Kleene star iterations, $[ab]\,[aba]$ at positions
$[0, 2), [2, 5)$
respectively.
But trying this out in regex101 \parencite{regex101}
with different language engines yields
two fragmented matches instead: $[aba]$ at $[0, 3)$
and $a$ at $[4, 5)$.
Fowler \cite{fowler2003} and Kuklewicz \cite{KuklewiczHaskell}
commented that most regex libraries are not
correctly implementing the POSIX (maximum-munch)
rule of regular expression matching.
As Grathwohl \parencite{grathwohl2014crash} wrote,
\begin{quote}
	``The POSIX strategy is more complicated than the
	greedy because of the dependence on information about
	the length of matched strings in the various subexpressions.''
\end{quote}
%\noindent
The implementation complexity of the POSIX rules also comes from
the specification not being very clear.
There are many informal summaries of this disambiguation
strategy, which are often quite long and delicate.
For example Kuklewicz \cite{KuklewiczHaskell}
described the POSIX rule as
\begin{quote}
	``
	\begin{itemize}
		\item
regular expressions (REs) take the leftmost starting match, and the longest match starting there
earlier subpatterns have leftmost-longest priority over later subpatterns\\
\item
higher-level subpatterns have leftmost-longest priority over their component subpatterns\\
\item
REs have right associative concatenation which can be changed with parenthesis\\
\item
parenthesized subexpressions return the match from their last usage\\
\item
text of component subexpressions must be contained in the text of the
higher-level subexpressions\\
\item
if "p" and "q" can never match the same text then "p|q" and "q|p" are equivalent, up to trivial renumbering of captured subexpressions\\
\item
if "p" in "p*" is used to capture non-empty text then additional repetitions of "p" will not capture an empty string\\''
\end{itemize}
\end{quote}
The text above
is trying to capture something very precise,
and is crying out for formalising.
Ausaf et al.\ \cite{AusafDyckhoffUrban2016}
were the first to fill this gap
by not just describing such a formalised POSIX
specification in Isabelle/HOL, but also proving
that their specification coincides with the
POSIX specification given by Okui and Suzuki \cite{Okui10},
which is a completely
different characterisation.
They then formally proved the correctness of
a lexing algorithm by Sulzmann and Lu \cite{Sulzmann2014}
based on that specification.

In the next section we will very briefly
introduce Brzozowski derivatives and Sulzmann
and Lu's algorithm, which this thesis builds on.
We give a taste of what they
are like and why they are suitable for regular expression
matching and lexing.

\section{Our Solution--Formal Specification of POSIX Matching
and Brzozowski Derivatives}
Now we start with the central topic of the thesis: Brzozowski derivatives.
Brzozowski \cite{Brzozowski1964} first introduced the
concept of the \emph{derivative} in the 1960s.
The derivative of a regular expression $r$
with respect to a character $c$ is written as $r \backslash c$.\footnote{
	Despite having the same name, regular expression
	derivatives bear little similarity with the mathematical definition
	of derivatives of functions.
}
It tells us what $r$ transforms into
if we chop off the first character $c$
from all strings in the language of $r$ ($L \; r$).
To give a flavour of Brzozowski derivatives, we present
two straightforward clauses from the definition:
\begin{center}
	\begin{tabular}{lcl}
		$d \backslash c$     & $\dn$ &
		$\mathit{if} \;c = d\;\mathit{then}\;\ONE\;\mathit{else}\;\ZERO$\\
$(r_1 + r_2)\backslash c$     & $\dn$ & $r_1 \backslash c \,+\, r_2 \backslash c$\\
	\end{tabular}
\end{center}
\noindent
The first clause says that for the regular expression
denoting a singleton set consisting of a single-character string $\{ d \}$,
we check the derivative character $c$ against $d$,
returning $\ONE$ (denoting the set containing only the empty string $\{ [] \}$)
if $c$ and $d$ are equal, and $\ZERO$ (denoting the empty set $\varnothing$) otherwise.
The second clause states that to obtain the regular expression
representing all strings of $r_1 + r_2$ with the head character $c$ chopped off,
one simply takes the derivatives
of $r_1$ and $r_2$ recursively and then puts them back together.

Thanks to this definition, derivatives have the nice property
that $s \in L \; (r\backslash c)$ if and only if
$c::s \in L \; r$.
%This property can be used on regular expressions
%matching and lexing--to test whether a string $s$ is in $L \; r$,
%one simply takes derivatives of $r$ successively with
%respect to the characters (in the correct order) in $s$,
%and then test whether the empty string is in the last regular expression.
Derivatives give a simple solution
to the problem of matching and lexing a string $s$ with a regular
expression $r$: if the derivative of $r$ w.r.t.\ (in
succession) all the characters of the string matches the empty string,
then $r$ matches $s$ (and {\em vice versa}).
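
This matcher is straightforward to render in a functional language. The following Scala sketch (our own illustration, using top-level definitions as in a Scala~3 file or a worksheet; the names \texttt{Rexp}, \texttt{der} and \texttt{ders} are our choices, and the clauses for sequences and stars follow the standard textbook definitions made precise in the next chapter):

\begin{lstlisting}[style=myScalastyle]
// Regular expressions as an inductive datatype (a sketch, not
// the formalised Isabelle definitions).
abstract class Rexp
case object ZERO extends Rexp                  // matches nothing
case object ONE extends Rexp                   // matches only []
case class CHAR(c: Char) extends Rexp
case class ALT(r1: Rexp, r2: Rexp) extends Rexp
case class SEQ(r1: Rexp, r2: Rexp) extends Rexp
case class STAR(r: Rexp) extends Rexp

// Can r match the empty string?
def nullable(r: Rexp): Boolean = r match {
  case ZERO => false
  case ONE => true
  case CHAR(_) => false
  case ALT(r1, r2) => nullable(r1) || nullable(r2)
  case SEQ(r1, r2) => nullable(r1) && nullable(r2)
  case STAR(_) => true
}

// The Brzozowski derivative of r w.r.t. the character c.
def der(c: Char, r: Rexp): Rexp = r match {
  case ZERO => ZERO
  case ONE => ZERO
  case CHAR(d) => if (c == d) ONE else ZERO
  case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))
  case SEQ(r1, r2) =>
    if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
    else SEQ(der(c, r1), r2)
  case STAR(r1) => SEQ(der(c, r1), STAR(r1))
}

// Successive derivatives w.r.t. a whole string, then nullability.
def ders(s: List[Char], r: Rexp): Rexp = s match {
  case Nil => r
  case c :: cs => ders(cs, der(c, r))
}
def matches(r: Rexp, s: String): Boolean = nullable(ders(s.toList, r))
\end{lstlisting}

\noindent
For example, \texttt{matches(SEQ(STAR(CHAR('a')), CHAR('b')), "aab")} evaluates to \texttt{true}.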

This makes formal reasoning about these properties, such
as correctness and complexity, smooth and intuitive.
In fact, there have already been several mechanised proofs about them,
for example the one by Owens and Slind \cite{Owens2008} in HOL4,
another one by Krauss and Nipkow \cite{Nipkow98} in Isabelle/HOL, and
yet another in Coq by Coquand and Siles \cite{Coquand2012}.

In addition, one can extend the clauses to bounded repetitions
``for free'':
\begin{center}
	\begin{tabular}{lcl}
		$r^{\{n\}} \backslash c$     & $\dn$ & $r \backslash c \cdot
		r^{\{n-1\}}$\\
	\end{tabular}
\end{center}
\noindent
And experimental results suggest that, unlike DFA-based solutions,
derivatives can support
bounded regular expressions with large counters
quite well.
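
To illustrate, the following clauses (a sketch of ours, meant to be added to the \texttt{Rexp} datatype and to the pattern matches of \texttt{nullable} and \texttt{der} in the Scala sketch given earlier) are all that is needed:

\begin{lstlisting}[style=myScalastyle]
// Bounded repetition r{n}, added to the Rexp datatype:
case class NTIMES(r: Rexp, n: Int) extends Rexp

// extra clause for nullable: r{n} accepts [] iff n = 0 or r does
case NTIMES(r1, n) => if (n == 0) true else nullable(r1)

// extra clause for der: chop c off one copy; n - 1 copies remain
case NTIMES(r1, n) =>
  if (n == 0) ZERO
  else SEQ(der(c, r1), NTIMES(r1, n - 1))
\end{lstlisting}

\noindent
Crucially, the counter is merely decremented rather than unfolded into $n$ copies, which is why large counters pose no problem for this representation.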

There have also been
extensions of derivatives to other constructs.
For example, Owens et al.\ include derivatives
for \emph{NOT} regular expressions, which can
concisely express C-style comments of the form
$/* \ldots */$.
Another extension of derivatives is
regular expressions with look-aheads, done
by Miyazaki and Minamide
\cite{Takayuki2019}.
%We therefore use Brzozowski derivatives on regular expressions
%lexing


Given the above definitions and properties of
Brzozowski derivatives, one quickly realises their potential
in generating a formally verified algorithm for lexing--the clauses and property
can be easily expressed in a functional programming language
or converted to theorem prover
code, with great extensibility.
Perhaps this is the reason why they have sparked quite a bit of interest
in the functional programming and theorem prover communities in the last
fifteen or so years (
\cite{Almeidaetal10}, \cite{Berglund14}, \cite{Berglund18},
\cite{Chen12} and \cite{Coquand2012}
to name a few), despite having been buried in the ``sands of time'' \cite{Owens2008}
after they were first published.


However, there are two difficulties with derivative-based matchers:
First, Brzozowski's original matcher only generates a yes/no answer
for whether a regular expression matches a string or not. This is too
little information in the context of lexing where separate tokens must
be identified and also classified (for example as keywords
or identifiers).
Second, derivative-based matchers need to be made more efficient.
Elegant and beautiful
as many implementations are,
they can be excruciatingly slow.
For example, Sulzmann and Lu
claim a linear running time for their proposed algorithm,
but that was falsified by our experiments: the running time
is actually $\Omega(2^n)$ in the worst case.
A similar claim about a theoretical runtime of $O(n^2)$
is made for the Verbatim lexer \cite{Verbatim},
%TODO: give references
which calculates POSIX matches and is based on derivatives.
They formalized the correctness of the lexer, but not its complexity.
In the performance evaluation section, they simply analyzed the run time
of matching $a$ with the string
\begin{center}
	$\underbrace{a \ldots a}_{\text{n a's}}$
\end{center}
and concluded that the algorithm is quadratic in terms of input length.
When we tried out their extracted OCaml code with our example $(a+aa)^*$,
the time it took to lex only 40 $a$'s was 5 minutes.


\subsection{Sulzmann and Lu's Algorithm}
Sulzmann and Lu~\cite{Sulzmann2014} overcame the first
difficulty by cleverly extending Brzozowski's matching
algorithm. Their extended version generates additional information on
\emph{how} a regular expression matches a string following the POSIX
rules for regular expression matching. They achieve this by adding a
second ``phase'' to Brzozowski's algorithm involving an injection
function; simplification of the internal data structures is then
meant to eliminate the exponential behaviours.
In an earlier work, Ausaf et al.\ provided the formal
specification of what POSIX matching means and proved in Isabelle/HOL
the correctness
of Sulzmann and Lu's extended algorithm accordingly
\cite{AusafDyckhoffUrban2016}.

The version of the algorithm proven correct
suffers from the
second difficulty though, where the internal derivatives can
grow to arbitrarily big sizes.
For example if we start with the
regular expression $(a+aa)^*$ and take
successive derivatives according to the character $a$, we end up with
a sequence of ever-growing derivatives like

\def\ll{\stackrel{\_\backslash{} a}{\longrightarrow}}
\begin{center}
\begin{tabular}{rll}
$(a + aa)^*$ & $\ll$ & $(\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & $(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* \;+\; (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & $(\ZERO + \ZERO{}a + \ZERO) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^* \;+\; $\\
& & $\qquad(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & \ldots \hspace{15mm}(regular expressions of sizes 98, 169, 283, 468, 767, \ldots)
\end{tabular}
\end{center}

\noindent where after around 35 steps we run out of memory on a
typical computer (we shall define in the next chapter
the precise details of our
regular expressions and the derivative operation). Clearly, the
notation involving $\ZERO$s and $\ONE$s already suggests
simplification rules that can be applied to regular
expressions, for example $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r
\Rightarrow r$, $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow
r$. While such simple-minded simplifications have been proved in our
earlier work to preserve the correctness of Sulzmann and Lu's
algorithm \cite{AusafDyckhoffUrban2016}, they unfortunately do
\emph{not} help with limiting the growth of the derivatives shown
above: the growth is slowed, but the derivatives can still grow rather
quickly beyond any finite bound.
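
These rules are directly implementable. A Scala sketch of ours (operating on the \texttt{Rexp} datatype from before, and including the symmetric variants of the rules) interleaves one-step simplification with the derivative:

\begin{lstlisting}[style=myScalastyle]
// Bottom-up simplification with the rules above:
// 0.r => 0, 1.r => r, 0 + r => r, r + r => r (and symmetric variants).
def simp(r: Rexp): Rexp = r match {
  case SEQ(r1, r2) => (simp(r1), simp(r2)) match {
    case (ZERO, _) => ZERO
    case (_, ZERO) => ZERO
    case (ONE, r2s) => r2s
    case (r1s, ONE) => r1s
    case (r1s, r2s) => SEQ(r1s, r2s)
  }
  case ALT(r1, r2) => (simp(r1), simp(r2)) match {
    case (ZERO, r2s) => r2s
    case (r1s, ZERO) => r1s
    case (r1s, r2s) => if (r1s == r2s) r1s else ALT(r1s, r2s)
  }
  case r => r
}

// Interleave simplification with the derivative steps.
def dersSimp(s: List[Char], r: Rexp): Rexp = s match {
  case Nil => r
  case c :: cs => dersSimp(cs, simp(der(c, r)))
}
\end{lstlisting}

\noindent
As remarked above, even with this interleaving the derivatives of $(a+aa)^*$ still grow beyond any finite bound; simplification in this simple form only slows the growth.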

Sulzmann and Lu overcome this ``growth problem'' in a second algorithm
\cite{Sulzmann2014} where they introduce bit-coded
regular expressions. In this version, POSIX values are
represented as bit sequences and such sequences are incrementally generated
when derivatives are calculated. The compact representation
of bit sequences and regular expressions allows them to define a more
``aggressive'' simplification method that keeps the size of the
derivatives finite no matter what the length of the string is.
They make some informal claims about the correctness and linear behaviour
of this version, but do not provide any supporting proof arguments, not
even ``pencil-and-paper'' arguments. They write about their bit-coded
\emph{incremental parsing method} (that is the algorithm to be formalised
in this dissertation)


\begin{quote}\it
  ``Correctness Claim: We further claim that the incremental parsing
  method [..] in combination with the simplification steps [..]
  yields POSIX parse trees. We have tested this claim
  extensively [..] but yet
  have to work out all proof details.'' \cite[Page 14]{Sulzmann2014}
\end{quote}
Ausaf and Urban were able to back this correctness claim with
a formal proof.

However, a faster formally verified
lexing program with the optimisations
of Sulzmann and Lu's second algorithm
is still missing.
As they stated,
\begin{quote}\it
``The next step would be to implement a more aggressive simplification procedure on annotated regular expressions and then prove the corresponding algorithm generates the same values as blexer. Alas due to time constraints we are unable to do so here.''
\end{quote}
This thesis implements the aggressive simplifications envisioned
by Ausaf and Urban,
together with a formal proof of the correctness of the algorithm with those simplifications.


One of the most recent works in the context of lexing
%with this issue
is the Verbatim lexer by Egolf, Lasser and Fisher \cite{Verbatim}.
This is relevant work for us and we will compare it later with
the derivative-based matcher we are going to present.
There is also some newer work called
Verbatim++ \cite{Verbatimpp}, which does not use derivatives,
but deterministic finite automata instead.
%An example that gives problem to automaton approaches would be
%the regular expression $(a|b)^*a(a|b)^{\{n\}}$.
%It requires at least $2^{n+1}$ states to represent
%as a DFA.


%----------------------------------------------------------------------------------------
\section{Contribution}

In this thesis,
we propose a solution to catastrophic
backtracking and error-prone matchers: a formally verified
regular expression lexing algorithm
that is both fast
and correct, extending Ausaf et al.'s work.
The end result is %a regular expression lexing algorithm that comes with
\begin{itemize}
\item
an improved version of Sulzmann and Lu's bit-coded algorithm using
derivatives with simplifications,
accompanied by
a proven correctness theorem according to the POSIX specification
given by Ausaf et al.\ \cite{AusafDyckhoffUrban2016},
\item
a complexity-related property for that algorithm saying that the
internal data structure will
remain finite,
\item
and an extension to
the bounded repetitions construct with the correctness and finiteness properties
maintained.
\end{itemize}


With a formal finiteness bound in place,
we can greatly reduce the attack surface of servers in terms of ReDoS attacks.
Further improvements to the algorithm with an even stronger version of
simplification are made.
Thanks to our theorem-prover-friendly approach,
we believe that
this finiteness bound can be improved to a bound
linear in the input and
cubic in the regular expression size using a technique by
Antimirov \cite{Antimirov95}.
Once formalised, this would be a guarantee for the absence of all super-linear behaviours.
We are working out the
details.


To the best of our knowledge, no lexing libraries using Brzozowski derivatives
have similar complexity-related bounds,
and claims about running time are usually speculative and backed only by empirical
evidence on a few test cases.
If a matching or lexing algorithm
does not come with certain basic complexity-related
guarantees (for example that the internal data structure size
does not grow indefinitely),
then one cannot claim with confidence to have solved the problem
of catastrophic backtracking.




\section{Structure of the thesis}
In Chapter~\ref{Inj} we will introduce the concepts
and notations we
use for describing the lexing algorithm by Sulzmann and Lu,
and then give the lexing algorithm itself.
We will give its bit-coded variant in Chapter~\ref{Bitcoded1}.
Then we illustrate in Chapter~\ref{Bitcoded2}
how the algorithm without bitcodes falls short when it comes to such aggressive
simplifications, and therefore introduce our version of the
bit-coded algorithm and
its correctness proof.
In Chapter~\ref{Finite} we give the second guarantee
of our bitcoded algorithm, namely a finite bound on the size of any
regex's derivatives.
In Chapter~\ref{Cubic} we discuss stronger simplifications that improve the finite bound
of Chapter~\ref{Finite} to a polynomial one, and demonstrate how one can extend the
algorithm to include constructs such as bounded repetitions and negations.




%----------------------------------------------------------------------------------------


%----------------------------------------------------------------------------------------

%----------------------------------------------------------------------------------------

%----------------------------------------------------------------------------------------
