% Chapter 1

\chapter{Introduction} % Main chapter title

\label{Introduction} % For referencing the chapter elsewhere, use \ref{Chapter1}

%----------------------------------------------------------------------------------------

% Define some commands to keep the formatting separated from the content
\newcommand{\keyword}[1]{\textbf{#1}}
\newcommand{\tabhead}[1]{\textbf{#1}}
\newcommand{\code}[1]{\texttt{#1}}
\newcommand{\file}[1]{\texttt{\bfseries#1}}
\newcommand{\option}[1]{\texttt{\itshape#1}}

%boxes
\newcommand*{\mybox}[1]{\framebox{\strut #1}}

%\newcommand{\sflataux}[1]{\textit{sflat}\_\textit{aux} \, #1}
\newcommand\sflat[1]{\llparenthesis #1 \rrparenthesis }
\newcommand{\ASEQ}[3]{\textit{ASEQ}_{#1} \, #2 \, #3}
\newcommand{\bderssimp}[2]{#1 \backslash_{bsimps} #2}
\newcommand{\rderssimp}[2]{#1 \backslash_{rsimps} #2}
\def\derssimp{\textit{ders}\_\textit{simp}}
\def\rders{\textit{rders}}
\newcommand{\bders}[2]{#1 \backslash #2}
\newcommand{\bsimp}[1]{\textit{bsimp}(#1)}
\def\bsimps{\textit{bsimp}}
\newcommand{\rsimp}[1]{\textit{rsimp}\; #1}
\newcommand{\sflataux}[1]{\llparenthesis #1 \rrparenthesis'}
\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%
\newcommand{\denote}{\stackrel{\mbox{\scriptsize denote}}{=}}%
\newcommand{\ZERO}{\mbox{\bf 0}}
\newcommand{\ONE}{\mbox{\bf 1}}
\newcommand{\AALTS}[2]{\oplus {\scriptstyle #1}\, #2}
\newcommand{\rdistinct}[2]{\textit{rdistinct} \;\; #1 \;\; #2}
\def\rdistincts{\textit{rdistinct}}
\def\rDistinct{\textit{rdistinct}}
\newcommand\hflat[1]{\llparenthesis #1 \rrparenthesis_*}
\newcommand\hflataux[1]{\llparenthesis #1 \rrparenthesis_*'}
\newcommand\createdByStar[1]{\textit{createdByStar}(#1)}
\def\cbn{\textit{createdByNtimes}}
\def\hpa{\textit{highestPowerAux}}
\def\hpower{\textit{highestPower}}
\def\ntset{\textit{ntset}}
\def\optermsimp{\textit{optermsimp}}
\def\optermOsimp{\textit{optermOsimp}}
\def\optermosimp{\textit{optermosimp}}
\def\opterm{\textit{opterm}}
\def\nString{\textit{nonemptyString}}

\newcommand\myequiv{\mathrel{\stackrel{\makebox[0pt]{\mbox{\normalfont\tiny equiv}}}{=}}}
\def\SEQ{\textit{SEQ}}
\def\SEQs{\textit{SEQs}}
\def\case{\textit{case}}
\def\sequal{\stackrel{\mbox{\scriptsize rsimp}}{=}}
\def\rsimpalts{\textit{rsimp}_{ALTS}}
\def\good{\textit{good}}
\def\btrue{\textit{true}}
\def\bfalse{\textit{false}}
\def\bnullable{\textit{bnullable}}
\def\bnullables{\textit{bnullables}}
\def\Some{\textit{Some}}
\def\None{\textit{None}}
\def\code{\textit{code}}
\def\decode{\textit{decode}}
\def\internalise{\textit{internalise}}
\def\lexer{\mathit{lexer}}
\def\mkeps{\textit{mkeps}}
\newcommand{\rder}[2]{#2 \backslash_r #1}

\def\rerases{\textit{rerase}}

\def\nonnested{\textit{nonnested}}
\def\AZERO{\textit{AZERO}}
\def\sizeNregex{\textit{sizeNregex}}
\def\AONE{\textit{AONE}}
\def\ACHAR{\textit{ACHAR}}

\def\simpsulz{\textit{simp}_{Sulz}}

\def\scfrewrites{\stackrel{*}{\rightsquigarrow_{scf}}}
\def\frewrite{\rightsquigarrow_f}
\def\hrewrite{\rightsquigarrow_h}
\def\grewrite{\rightsquigarrow_g}
\def\frewrites{\stackrel{*}{\rightsquigarrow_f}}
\def\hrewrites{\stackrel{*}{\rightsquigarrow_h}}
\def\grewrites{\stackrel{*}{\rightsquigarrow_g}}
\def\fuse{\textit{fuse}}
\def\bder{\textit{bder}}
\def\der{\textit{der}}
\def\POSIX{\textit{POSIX}}
\def\ALTS{\textit{ALTS}}
\def\ASTAR{\textit{ASTAR}}
\def\DFA{\textit{DFA}}
\def\NFA{\textit{NFA}}
\def\bmkeps{\textit{bmkeps}}
\def\bmkepss{\textit{bmkepss}}
\def\retrieve{\textit{retrieve}}
\def\blexer{\textit{blexer}}
\def\flex{\textit{flex}}
\def\inj{\textit{inj}}
\def\Empty{\textit{Empty}}
\def\Left{\textit{Left}}
\def\Right{\textit{Right}}
\def\Stars{\textit{Stars}}
\def\Char{\textit{Char}}
\def\Seq{\textit{Seq}}
\def\Der{\textit{Der}}
\def\Ders{\textit{Ders}}
\def\nullable{\mathit{nullable}}
\def\Z{\mathit{Z}}
\def\S{\mathit{S}}
\def\rup{r^\uparrow}
%\def\bderssimp{\mathit{bders}\_\mathit{simp}}
\def\distinctWith{\textit{distinctWith}}
\def\lf{\textit{lf}}
\def\PD{\textit{PD}}
\def\suffix{\textit{Suffix}}
\def\distinctBy{\textit{distinctBy}}
\def\starupdate{\textit{starUpdate}}
\def\starupdates{\textit{starUpdates}}
\def\nupdate{\textit{nupdate}}
\def\nupdates{\textit{nupdates}}

\def\size{\mathit{size}}
\def\rexp{\mathbf{rexp}}
\def\simp{\mathit{simp}}
\def\simpALTs{\mathit{simp}\_\mathit{ALTs}}
\def\map{\mathit{map}}
\def\distinct{\mathit{distinct}}
\def\blexersimp{\mathit{blexer}\_\mathit{simp}}
\def\blexerStrong{\textit{blexerStrong}}
\def\bsimpStrong{\textit{bsimpStrong}}
\def\bdersStrongs{\textit{bdersStrong}}
\newcommand{\bdersStrong}[2]{#1 \backslash_{bsimpStrongs} #2}

\def\map{\textit{map}}
\def\rrexp{\textit{rrexp}}
\newcommand\rnullable[1]{\textit{rnullable} \; #1 }
\newcommand\rsize[1]{\llbracket #1 \rrbracket_r}
\newcommand\asize[1]{\llbracket #1 \rrbracket}
\newcommand\rerase[1]{ (#1)_{\downarrow_r}}

\newcommand\ChristianComment[1]{\textcolor{blue}{#1}\\}
\def\rflts{\textit{rflts}}
\def\rrewrite{\textit{rrewrite}}
\def\bsimpalts{\textit{bsimp}_{ALTS}}
\def\bsimpaseq{\textit{bsimp}_{ASEQ}}
\def\rsimlalts{\textit{rsimp}_{ALTs}}
\def\rsimpseq{\textit{rsimp}_{SEQ}}

\def\erase{\textit{erase}}
\def\STAR{\textit{STAR}}
\def\flts{\textit{flts}}

\def\zeroable{\textit{zeroable}}
\def\nub{\textit{nub}}
\def\filter{\textit{filter}}
%\def\not{\textit{not}}

\def\RZERO{\mathbf{0}_r }
\def\RONE{\mathbf{1}_r}
\newcommand\RCHAR[1]{\mathbf{#1}_r}
\newcommand\RSEQ[2]{#1 \cdot #2}
\newcommand\RALTS[1]{\sum #1}
\newcommand\RSTAR[1]{#1^*}
\newcommand\vsuf[2]{\textit{Suffix} \;#1\;#2}

\lstdefinestyle{myScalastyle}{
frame=tb,
language=scala,
aboveskip=3mm,
belowskip=3mm,
showstringspaces=false,
columns=flexible,
basicstyle={\small\ttfamily},
numbers=none,
numberstyle=\tiny\color{gray},
keywordstyle=\color{blue},
commentstyle=\color{dkgreen},
stringstyle=\color{mauve},
frame=single,
breaklines=true,
breakatwhitespace=true,
tabsize=3,
}
%----------------------------------------------------------------------------------------
%This part is about regular expressions, Brzozowski derivatives,
%and a bit-coded lexing algorithm with proven correctness and time bounds.

%TODO: look up snort rules to use here--give readers idea of what regexes look like

Regular expressions are widely used in computer science:
be it in text-editors \parencite{atomEditor} with syntax highlighting and auto-completion;
command-line tools like $\mathit{grep}$ that facilitate easy
text-processing \cite{grep}; network intrusion
detection systems that inspect suspicious traffic; or compiler
front ends.
Given their usefulness and ubiquity, one would assume that
modern regular expression matching implementations
are mature and fully studied.
Indeed, given a popular programming language's regular expression engine,
one can supply it with regular expressions and strings and
in most cases obtain
the matching information in a very short time.
Those engines can be blindingly fast---some
network intrusion detection systems
use regular expression engines that are able to process
hundreds of megabytes or even gigabytes of data per second \parencite{Turo_ov__2020}.
However, those engines can sometimes exhibit a surprising security vulnerability
under a certain class of inputs.
%However, , this is not the case for $\mathbf{all}$ inputs.
%TODO: get source for SNORT/BRO's regex matching engine/speed

Consider for example the regular expression $(a^*)^*\,b$ and
strings of the form $aa..a$. These strings cannot be matched by this regular
expression: obviously the expected $b$ in the last
position is missing. One would assume that modern regular expression
matching engines can find this out very quickly. Surprisingly, if one tries
this example in JavaScript, Python or Java 8, even with small strings,
say of length of around 30 $a$'s,
the decision takes an absurd amount of time to finish (see graphs in figure \ref{fig:aStarStarb}).
This is clearly exponential behaviour, and as can be seen
is triggered by some relatively simple regular expressions.
Java 9 and newer
versions improve this behaviour somewhat, but are still slow compared
with the approach we are going to use in this thesis.

This superlinear blowup in regular expression engines
has caused grief in ``real life'' in the past where it is
given the name ``catastrophic backtracking'' or ``evil'' regular expressions.
For example, on 20 July 2016 one evil
regular expression brought the webpage
\href{http://stackexchange.com}{Stack Exchange} to its
knees.\footnote{\url{https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}(Last accessed in 2019)}
In this instance, a regular expression intended to just trim white
spaces from the beginning and the end of a line actually consumed
massive amounts of CPU resources---causing the web servers to grind to a
halt. In this example, the time needed to process
the string was
$O(n^2)$ with respect to the string length $n$. This
quadratic overhead was enough for the homepage of Stack Exchange to
respond so slowly that the load balancer assumed a $\mathit{DoS}$
attack and therefore stopped the servers from responding to any
requests. This made the whole site become unavailable.
\begin{figure}[p]
\begin{center}
\begin{tabular}{@{}c@{\hspace{0mm}}c@{}}
\begin{tikzpicture}
\begin{axis}[
xlabel={$n$},
x label style={at={(1.05,-0.05)}},
ylabel={time in secs},
enlargelimits=false,
xtick={0,5,...,30},
xmax=33,
ymax=35,
ytick={0,5,...,30},
scaled ticks=false,
axis lines=left,
width=5cm,
height=4cm,
legend entries={JavaScript},
legend pos=north west,
legend cell align=left]
\addplot[red,mark=*, mark options={fill=white}] table {re-js.data};
\end{axis}
\end{tikzpicture}
&
\begin{tikzpicture}
\begin{axis}[
xlabel={$n$},
x label style={at={(1.05,-0.05)}},
%ylabel={time in secs},
enlargelimits=false,
xtick={0,5,...,30},
xmax=33,
ymax=35,
ytick={0,5,...,30},
scaled ticks=false,
axis lines=left,
width=5cm,
height=4cm,
legend entries={Python},
legend pos=north west,
legend cell align=left]
\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
\end{axis}
\end{tikzpicture}\\
\begin{tikzpicture}
\begin{axis}[
xlabel={$n$},
x label style={at={(1.05,-0.05)}},
ylabel={time in secs},
enlargelimits=false,
xtick={0,5,...,30},
xmax=33,
ymax=35,
ytick={0,5,...,30},
scaled ticks=false,
axis lines=left,
width=5cm,
height=4cm,
legend entries={Java 8},
legend pos=north west,
legend cell align=left]
\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
\end{axis}
\end{tikzpicture}
&
\begin{tikzpicture}
\begin{axis}[
xlabel={$n$},
x label style={at={(1.05,-0.05)}},
%ylabel={time in secs},
enlargelimits=false,
xtick={0,5,...,30},
xmax=33,
ymax=35,
ytick={0,5,...,30},
scaled ticks=false,
axis lines=left,
width=5cm,
height=4cm,
legend entries={Dart},
legend pos=north west,
legend cell align=left]
\addplot[green,mark=*, mark options={fill=white}] table {re-dart.data};
\end{axis}
\end{tikzpicture}\\
\begin{tikzpicture}
\begin{axis}[
xlabel={$n$},
x label style={at={(1.05,-0.05)}},
ylabel={time in secs},
enlargelimits=false,
xtick={0,5,...,30},
xmax=33,
ymax=35,
ytick={0,5,...,30},
scaled ticks=false,
axis lines=left,
width=5cm,
height=4cm,
legend entries={Swift},
legend pos=north west,
legend cell align=left]
\addplot[purple,mark=*, mark options={fill=white}] table {re-swift.data};
\end{axis}
\end{tikzpicture}
&
\begin{tikzpicture}
\begin{axis}[
xlabel={$n$},
x label style={at={(1.05,-0.05)}},
%ylabel={time in secs},
enlargelimits=true,
%xtick={0,5000,...,40000},
%xmax=40000,
%ymax=35,
restrict x to domain*=0:40000,
restrict y to domain*=0:35,
%ytick={0,5,...,30},
%scaled ticks=false,
axis lines=left,
width=5cm,
height=4cm,
legend entries={Java9+},
legend pos=north west,
legend cell align=left]
\addplot[orange,mark=*, mark options={fill=white}] table {re-java9.data};
\end{axis}
\end{tikzpicture}\\
\multicolumn{2}{c}{Graphs}
\end{tabular}
\end{center}
\caption{Graphs showing runtime for matching $(a^*)^*\,b$ with strings
of the form $\protect\underbrace{aa..a}_{n}$ in various existing regular expression libraries.
The reason for their superlinear behaviour is that they do a depth-first-search
using NFAs.
If the string does not match, the regular expression matching
engine starts to explore all possibilities.
}\label{fig:aStarStarb}
\end{figure}\afterpage{\clearpage}
A more recent example is a global outage of all Cloudflare servers on 2 July
2019. A poorly written regular expression exhibited catastrophic backtracking
and exhausted the CPUs that serve HTTP traffic. Although the outage
had several causes, at the heart was a regular expression that
was used to monitor network
traffic.\footnote{\url{https://blog.cloudflare.com/details-of-the-cloudflare-outage-on-july-2-2019/}(Last accessed in 2022)}
These problems with regular expressions
are not isolated events that happen
very rarely,
%but actually widespread.
%They occur so often that they have a
but they occur often enough that they have a
name: Regular-Expression-Denial-Of-Service (ReDoS)
attacks.
Davis et al.~\cite{Davis18} detected more
than 1000 evil regular expressions
in Node.js, Python core libraries, npm and pypi.
They therefore concluded that evil regular expressions
are real problems rather than just ``a parlour trick''.

This work aims to address this issue
with the help of formal proofs.
We describe a lexing algorithm based
on Brzozowski derivatives with verified correctness
and a finiteness property for the size of derivatives
(all carried out in Isabelle/HOL).
Such properties %guarantee the absence of
are an important step in preventing
catastrophic backtracking once and for all.
We will give more details in the next sections
on (i) why the slow cases in figure \ref{fig:aStarStarb}
can occur in traditional regular expression engines
and (ii) why we choose our
approach based on Brzozowski derivatives and formal proofs.
\section{Preliminaries}%Regex, and the Problems with Regex Matchers}
Regular expressions and regular expression matchers
have clearly been studied for many, many years.
Theoretical results in automata theory state
that basic regular expression matching should be linear
with respect to the input.
This assumes that the regular expression
$r$ was pre-processed and turned into a
deterministic finite automaton (DFA) before matching \cite{Sakarovitch2009}.
By basic we mean textbook definitions such as the one
below, involving only regular expressions for characters, alternatives,
sequences, and Kleene stars:
\[
r ::= c | r_1 + r_2 | r_1 \cdot r_2 | r^*
\]
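
As a preview of the representations used later, such a grammar maps directly
onto a small inductive datatype. The following Scala sketch is only
illustrative (the constructor names are our own choice here, not fixed by
the grammar above); it also includes the two usual constants for the empty
language and the empty-string language:

\begin{lstlisting}[style=myScalastyle]
// A sketch of basic regular expressions as an inductive datatype;
// constructor names are illustrative only.
abstract class Rexp
case object ZERO extends Rexp                     // matches no string
case object ONE extends Rexp                      // matches the empty string
case class CHAR(c: Char) extends Rexp             // c
case class ALT(r1: Rexp, r2: Rexp) extends Rexp   // r1 + r2
case class SEQ(r1: Rexp, r2: Rexp) extends Rexp   // r1 . r2
case class STAR(r: Rexp) extends Rexp             // r*
\end{lstlisting}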
Modern regular expression matchers used by programmers,
however,
support much richer constructs, such as bounded repetitions,
negations,
and back-references.
To differentiate, we use the word \emph{regex} to refer
to those expressions with richer constructs while reserving the
term \emph{regular expression}
for the more traditional meaning in formal language theory.
We follow this convention
in this thesis.
In the future, we aim to support all the popular features of regexes,
but for this work we mainly look at basic regular expressions
and bounded repetitions.

%Most modern regex libraries
%the so-called PCRE standard (Peral Compatible Regular Expressions)
%has the back-references
Regexes come with a number of constructs
that make it more convenient for
programmers to write regular expressions.
Depending on the types of constructs
the task of matching and lexing with them
will have different levels of complexity.
Some of those constructs are syntactic sugar, that is,
simply shorthand notations
that save the programmers a few keystrokes.
These will not cause problems for regex libraries.
For example the
non-binary alternative involving three or more choices just means:
\[
(a | b | c) \stackrel{means}{=} ((a + b)+ c)
\]
Similarly, the range operator
%used to express the alternative
%of all characters between its operands,
is just a concise way
of expressing an alternative of consecutive characters:
\[
[0-9]\stackrel{means}{=} (0 | 1 | \ldots | 9 )
\]
The
wildcard character '$.$' is used to refer to any single character,
\[
. \stackrel{means}{=} [0-9a-zA-Z+-()*\&\ldots]
\]
except the newline.

\subsection{Bounded Repetitions}
More interesting are bounded repetitions, which can
make the regular expressions much
more compact.
Normally there are four kinds of bounded repetitions:
$r^{\{n\}}$, $r^{\{\ldots m\}}$, $r^{\{n\ldots \}}$ and $r^{\{n\ldots m\}}$
(where $n$ and $m$ are constant natural numbers).
Like the star regular expression, the set of strings or language
a bounded regular expression can match
is defined using the power operation on sets:
\begin{center}
\begin{tabular}{lcl}
$L \; r^{\{n\}}$ & $\dn$ & $(L \; r)^n$\\
$L \; r^{\{\ldots m\}}$ & $\dn$ & $\bigcup_{0 \leq i \leq m}. (L \; r)^i$\\
$L \; r^{\{n\ldots \}}$ & $\dn$ & $\bigcup_{n \leq i}. (L \; r)^i$\\
$L \; r^{\{n \ldots m\}}$ & $\dn$ & $\bigcup_{n \leq i \leq m}. (L \; r)^i$
\end{tabular}
\end{center}
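For instance, with these definitions the bounded repetition
$a^{\{2\ldots 3\}}$ matches exactly two strings:
\[
L \; (a^{\{2\ldots 3\}}) \;=\; \bigcup_{2 \leq i \leq 3}. (L \; a)^i \;=\; \{aa,\; aaa\}
\]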
The attraction of bounded repetitions is that they can be
used to avoid a size blow up: for example $r^{\{n\}}$
is a shorthand for
the much longer regular expression:
\[
\underbrace{r\ldots r}_\text{n copies of r}.
\]
%Therefore, a naive algorithm that simply unfolds
%them into their desugared forms
%will suffer from at least an exponential runtime increase.

The problem with matching
such bounded repetitions
is that tools based on the classic notion of
automata need to expand $r^{\{n\}}$ into $n$ connected
copies of the automaton for $r$. This leads to very inefficient matching
algorithms or algorithms that consume large amounts of memory.
Implementations using $\DFA$s will
in such situations
either become excruciatingly slow
(for example Verbatim++ \cite{Verbatimpp}) or run
out of memory (for example $\mathit{LEX}$ and
$\mathit{JFLEX}$\footnote{LEX and JFLEX are lexer generators
in C and JAVA that generate $\mathit{DFA}$-based
lexers. The user provides a set of regular expressions
and configurations, and then
gets an output program encoding a minimized $\mathit{DFA}$
that can be compiled and run.
When given the above bounded repetition $r^{\{n\}}$,
even a small $n$ (say 20) would result in a program representing a
DFA
with millions of states.}) for large counters.
A classic example of this phenomenon is the regular expression $(a+b)^* a (a+b)^{\{n\}}$
where the minimal DFA requires at least $2^{n+1}$ states.
For example, when $n$ is equal to 2,
the corresponding $\mathit{NFA}$ looks like:
\vspace{6mm}
\begin{center}
\begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto]
\node[state,initial] (q_0) {$q_0$};
\node[state, red] (q_1) [right=of q_0] {$q_1$};
\node[state, red] (q_2) [right=of q_1] {$q_2$};
\node[state, accepting, red](q_3) [right=of q_2] {$q_3$};
\path[->]
(q_0) edge node {a} (q_1)
edge [loop below] node {a,b} ()
(q_1) edge node {a,b} (q_2)
(q_2) edge node {a,b} (q_3);
\end{tikzpicture}
\end{center}
and when turned into a DFA by the subset construction
requires at least $2^3$ states.\footnote{The
red states are ``countdown states'' which count down
the number of characters needed in addition to the current
string to make a successful match.
For example, state $q_1$ indicates a match that has
gone past the $(a|b)^*$ part of $(a|b)^*a(a|b)^{\{2\}}$,
and just consumed the ``delimiter'' $a$ in the middle, and
needs to match 2 more iterations of $(a|b)$ to complete.
State $q_2$, on the other hand, can be viewed as a state
after $q_1$ has consumed 1 character, and just waits
for 1 more character to complete.
The state $q_3$ is the last (accepting) state, requiring 0
more characters.
Depending on the suffix of the
input string up to the current read location,
the states $q_1$, $q_2$ and $q_3$
may or may
not be active.
A $\mathit{DFA}$ for such an $\mathit{NFA}$ would
contain at least $2^3$ non-equivalent states that cannot be merged,
because the subset construction during determinisation will generate
all the elements in the power set $\mathit{Pow}\{q_1, q_2, q_3\}$.
Generalizing this to regular expressions with larger
bounded repetition numbers, we have that
regexes shaped like $r^*ar^{\{n\}}$ when converted to $\mathit{DFA}$s
would require at least $2^{n+1}$ states, if $r$ itself contains
more than 1 string.
This is to represent all different
scenarios in which ``countdown'' states are active.}

Bounded repetitions are important because they
tend to occur frequently in practical use,
for example in the regex library RegExLib, in
the rules library of Snort \cite{Snort1999}\footnote{
Snort is a network intrusion detection (NID) tool
for monitoring network traffic.
The network security community curates a list
of malicious patterns written as regexes,
which is used by Snort's detection engine
to match against network traffic for any hostile
activities such as buffer overflow attacks.},
as well as in XML Schema definitions (XSDs).
According to Bj\"{o}rklund et al.~\cite{xml2015},
more than half of the
XSDs they found on the Maven.org central repository
have bounded regular expressions in them.
Often the counters are quite large, with the largest being
close to ten million.
A smaller sample XSD they gave
is:
\begin{verbatim}
<sequence minOccurs="0" maxOccurs="65535">
 <element name="TimeIncr" type="mpeg7:MediaIncrDurationType"/>
 <element name="MotionParams" type="float" minOccurs="2" maxOccurs="12"/>
</sequence>
\end{verbatim}
This can be seen as the regex
$(ab^{\{2\ldots 12\}})^{\{0 \ldots 65535\}}$, where $a$ and $b$ are themselves
regular expressions
satisfying certain constraints (such as
satisfying the floating point number format).
It is therefore quite unsatisfying that
some regular expression matching libraries
impose ad hoc limits
for bounded regular expressions:
For example, in the regular expression matching library in the Go
language the regular expression $a^{1001}$ is not permitted, because no counter
can be above 1000, and in the built-in Rust regular expression library
expressions such as $a^{\{1000\}\{100\}\{5\}}$ give an error message
for being too big.
As Becchi and Crawley \cite{Becchi08} have pointed out,
the reason for these restrictions
is that they simulate a non-deterministic finite
automaton (NFA) with a breadth-first search.
This way the number of active states can
be equal to the counter number.
When the counters are large,
the memory requirement can become
infeasible, and a regex engine
like the one in Go will reject such a pattern straight away.
\begin{figure}[H]
\begin{center}
\begin{tikzpicture} [node distance = 2cm, on grid, auto]

\node (q0) [state, initial] {$0$};
\node (q1) [state, right = of q0] {$1$};
%\node (q2) [state, right = of q1] {$2$};
\node (qdots) [right = of q1] {$\ldots$};
\node (qn) [state, right = of qdots] {$n$};
\node (qn1) [state, right = of qn] {$n+1$};
\node (qn2) [state, right = of qn1] {$n+2$};
\node (qn3) [state, accepting, right = of qn2] {$n+3$};

\path [-stealth, thick]
(q0) edge [loop above] node {a} ()
(q0) edge node {a} (q1)
%(q1) edge node {.} (q2)
(q1) edge node {.} (qdots)
(qdots) edge node {.} (qn)
(qn) edge node {.} (qn1)
(qn1) edge node {b} (qn2)
(qn2) edge node {$c$} (qn3);
\end{tikzpicture}
%\begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto]
% \node[state,initial] (q_0) {$0$};
% \node[state, ] (q_1) [right=of q_0] {$1$};
% \node[state, ] (q_2) [right=of q_1] {$2$};
% \node[state,
% \node[state, accepting, ](q_3) [right=of q_2] {$3$};
% \path[->]
% (q_0) edge node {a} (q_1)
% edge [loop below] node {a,b} ()
% (q_1) edge node {a,b} (q_2)
% (q_2) edge node {a,b} (q_3);
%\end{tikzpicture}
\end{center}
\caption{The example given by Becchi and Crawley
that NFA simulation can consume large
amounts of memory: $.^*a.^{\{n\}}bc$ matching
strings of the form $aaa\ldots aaaabc$.
When traversing in a breadth-first manner,
all states from 0 till $n+1$ will become active.}
\end{figure}
%Languages like $\mathit{Go}$ and $\mathit{Rust}$ use this
%type of $\mathit{NFA}$ simulation and guarantees a linear runtime
%in terms of input string length.
%TODO:try out these lexers
These problems can of course be solved in matching algorithms where
automata go beyond the classic notion and for instance include explicit
counters \cite{Turo_ov__2020}.
These solutions can be quite efficient,
with the ability to process
gigabytes of string input per second
even with large counters \cite{Becchi08}.
These practical solutions do not come with
formal guarantees, and as pointed out by
Kuklewicz \cite{KuklewiczHaskell}, can be error-prone.
%But formal reasoning about these automata especially in Isabelle
%can be challenging
%and un-intuitive.
%Therefore, we take correctness and runtime claims made about these solutions
%with a grain of salt.

In the work reported in \cite{FoSSaCS2023} and here,
we add better support using derivatives
for bounded regular expressions $r^{\{n\}}$.
Our results
extend straightforwardly to
repetitions with intervals such as
$r^{\{n\ldots m\}}$.
The merit of Brzozowski derivatives (more on this later)
on this problem is that
they can be naturally extended to support bounded repetitions.
Moreover, these extensions are still made up of only small
inductive datatypes and recursive functions,
making it handy to deal with them in theorem provers.
%The point here is that Brzozowski derivatives and the algorithms by Sulzmann and Lu can be
%straightforwardly extended to deal with bounded regular expressions
%and moreover the resulting code still consists of only simple
%recursive functions and inductive datatypes.
Finally, bounded regular expressions do not destroy our finite
boundedness property, which we shall prove later on.


\subsection{Back-References}
The other way to simulate an $\mathit{NFA}$ for matching is choosing
a single transition each time, keeping all the other options in
a queue or stack, and backtracking if that choice eventually
fails. This method, often called a ``depth-first search'',
is efficient in many cases, but can end up
with exponential run time.
The backtracking method is employed in regex libraries
that support \emph{back-references}, for example
in Java and Python.
%\section{Back-references and The Terminology Regex}

%When one constructs an $\NFA$ out of a regular expression
%there is often very little to be done in the first phase, one simply
%construct the $\NFA$ states based on the structure of the input regular expression.

%In the lexing phase, one can simulate the $\mathit{NFA}$ running in two ways:
%one by keeping track of all active states after consuming
%a character, and update that set of states iteratively.
%This can be viewed as a breadth-first-search of the $\mathit{NFA}$
%for a path terminating
%at an accepting state.
Given a regular expression like this (the sequence
operator is omitted for brevity):
\begin{center}
$r_1r_2r_3r_4$
\end{center}
one could label sub-expressions of interest
by parenthesizing them and giving
them a number in the order in which their opening parentheses appear.
One possible way of parenthesizing and labelling is given below:
\begin{center}
$\underset{1}{(}r_1\underset{2}{(}r_2\underset{3}{(}r_3)\underset{4}{(}r_4)))$
\end{center}
The sub-expressions
$r_1r_2r_3r_4$, $r_1r_2r_3$, $r_3$ and $r_4$ are labelled
by 1 to 4, and can be ``referred back'' to by their respective numbers.
%These sub-expressions are called "capturing groups".
To do so, we use the syntax $\backslash i$
to denote that we want the sub-string
of the input just matched by the $i$-th
sub-expression to appear again,
exactly the same as it first appeared:
\begin{center}
$\ldots\underset{\text{$i$-th lparen}}{(}{r_i})\ldots
\underset{s_i \text{ which just matched} \;r_i}{\backslash i}$
\end{center}
%The backslash and number $i$ are the
%so-called "back-references".
%Let $e$ be an expression made of regular expressions
%and back-references. $e$ contains the expression $e_i$
%as its $i$-th capturing group.
%The semantics of back-reference can be recursively
%written as:
%\begin{center}
%	\begin{tabular}{c}
%		$L ( e \cdot \backslash i) = \{s @ s_i \mid s \in L (e)\quad s_i \in L(r_i)$\\
%		$s_i\; \text{match of ($e$, $s$)'s $i$-th capturing group string}\}$
%	\end{tabular}
%\end{center}
A concrete example
of back-references is
\begin{center}
$(.^*)\backslash 1$,
\end{center}
which matches
strings that can be split into two identical halves,
for example $\mathit{foofoo}$, $\mathit{ww}$ and so on.
Note that this is different from
repeating the sub-expression verbatim like
\begin{center}
$(.^*)(.^*)$,
\end{center}
which does not impose any restrictions on what strings the second
sub-expression $.^*$
might match.
Another example of back-references is
\begin{center}
$(.)(.)\backslash 2\backslash 1$
\end{center}
which matches four-character palindromes
like $abba$, $x??x$ and so on.
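To illustrate, both examples can be tried out directly in Python's \texttt{re} engine, one of the backtracking engines mentioned above (a small sketch; the particular test strings are just illustrative choices):

```python
import re

# (.*)\1 matches strings made of two identical halves
assert re.fullmatch(r'(.*)\1', 'foofoo') is not None
assert re.fullmatch(r'(.*)\1', 'foobar') is None

# (.)(.)\2\1 matches four-character palindromes
assert re.fullmatch(r'(.)(.)\2\1', 'abba') is not None
assert re.fullmatch(r'(.)(.)\2\1', 'x??x') is not None
assert re.fullmatch(r'(.)(.)\2\1', 'abcd') is None
```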

Back-references are a regex construct
that programmers find quite useful.
According to Becchi and Crawley~\cite{Becchi08},
6\% of Snort rules (up until 2008) use them.
The most common use of back-references
is to express well-formed HTML files,
where back-references are convenient for matching
opening and closing tags like
\begin{center}
$\langle html \rangle \ldots \langle / html \rangle$
\end{center}
A regex describing such a format
is
\begin{center}
$\langle (.^+) \rangle \ldots \langle / \backslash 1 \rangle$
\end{center}
Despite being useful, the expressive power of regexes
goes beyond regular languages
once back-references are included.
In fact, back-references allow regexes to express
languages that cannot be contained in context-free
languages either.
For example, the back-reference $(a^*)b\backslash1 b \backslash 1$
expresses the language $\{a^n b a^n b a^n\mid n \in \mathbb{N}\}$,
which cannot be expressed by context-free grammars \parencite{campeanu2003formal}.
Such a language is contained in the context-sensitive hierarchy
of formal languages.
Moreover, solving the matching problem involving back-references
is known to be NP-complete \parencite{alfred2014algorithms}.
Regex libraries supporting back-references such as
PCRE \cite{pcre} therefore have to
resort to a depth-first search algorithm which backtracks.
What is unexpected is that even in the cases
not involving back-references, there is still
a (non-negligible) chance they might backtrack super-linearly,
as shown in the graphs in figure~\ref{fig:aStarStarb}.

Summing these up, we can categorise existing
practical regex libraries into two kinds:
(i) the ones with linear
time guarantees like Go and Rust. The downside with them is that
they impose restrictions
on the regular expressions (not allowing back-references,
bounded repetitions cannot exceed an ad hoc limit, etc.);
and (ii) those
that allow large bounded regular expressions and back-references
at the expense of using backtracking algorithms.
They can potentially ``grind to a halt''
on some very simple cases, resulting in
ReDoS attacks if exposed to the internet.

The problems with both approaches are the motivation for us
to look again at the regular expression matching problem.
Another motivation is that regular expression matching algorithms
that follow the POSIX standard often contain errors and bugs,
as we shall explain next.

%We would like to have regex engines that can
%deal with the regular part (e.g.
%bounded repetitions) of regexes more
%efficiently.
%Also we want to make sure that they do it correctly.
%It turns out that such aim is not so easy to achieve.
%TODO: give examples such as RE2 GOLANG 1000 restriction, rust no repetitions
% For example, the Rust regex engine claims to be linear,
% but does not support lookarounds and back-references.
% The GoLang regex library does not support over 1000 repetitions.
% Java and Python both support back-references, but shows
%catastrophic backtracking behaviours on inputs without back-references(
%when the language is still regular).
%TODO: test performance of Rust on (((((a*a*)b*)b){20})*)c baabaabababaabaaaaaaaaababaaaababababaaaabaaabaaaaaabaabaabababaababaaaaaaaaababaaaababababaaaaaaaaaaaaac
%TODO: verify the fact Rust does not allow 1000+ reps

%The time cost of regex matching algorithms in general
%involve two different phases, and different things can go differently wrong on
%these phases.
%$\DFA$s usually have problems in the first (construction) phase
%, whereas $\NFA$s usually run into trouble
%on the second phase.

\section{Error-prone POSIX Implementations}
Very often there are multiple ways of matching a string
with a regular expression.
In such cases the regular expression matcher needs to
disambiguate.
The more widely used strategy is called POSIX,
which roughly speaking always chooses the longest initial match.
The POSIX strategy is widely adopted in many regular expression matchers.
However, many implementations (including the C libraries
used by Linux and OS X distributions) contain bugs
or do not meet the specification they claim to adhere to.
Kuklewicz maintains a unit test repository which lists some
problems with existing regular expression engines \cite{KuklewiczHaskell}.
In some cases, they either fail to generate a
result when there exists a match,
or give results that are inconsistent with the POSIX standard.
A concrete example is the regex
\begin{center}
$(aba + ab + a)^*$ and the string $ababa$.
\end{center}
The correct POSIX match for the above
is the entire string $ababa$,
split into two Kleene star iterations, namely $[ab], [aba]$ at positions
$[0, 2), [2, 5)$
respectively.
But trying this out in regex101 \parencite{regex101}\footnote{
regex101 is an online regular expression matcher which
provides an API for trying out regular expression
engines of multiple popular programming languages like
Java, Python, Go, etc.}
with different engines always yields
the matches $[aba]$ at $[0, 3)$
and $[a]$ at $[4, 5)$.
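This non-POSIX behaviour is easy to reproduce: Python's backtracking engine commits to the first alternative that succeeds and never revisits the choice once the star can no longer continue (a small sketch, with alternation written in Python's regex syntax):

```python
import re

# (aba|ab|a)* against "ababa": the backtracking engine matches "aba"
# in the first star iteration and then stops, whereas the POSIX
# longest-match answer is the whole string "ababa".
m = re.match(r'(aba|ab|a)*', 'ababa')
assert m.group(0) == 'aba'
```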
Fowler \cite{fowler2003} and Kuklewicz \cite{KuklewiczHaskell}
commented that most regex libraries are not
correctly implementing the central POSIX
rule, called the maximum munch rule.
Grathwohl \parencite{grathwohl2014crash} wrote,
\begin{quote}
``The POSIX strategy is more complicated than the
greedy because of the dependence on information about
the length of matched strings in the various subexpressions.''
\end{quote}
%\noindent
We think the implementation complexity of POSIX rules also comes from
the specification not being very precise.
There are many informal summaries of this disambiguation
strategy, which are often quite long and delicate.
For example Kuklewicz \cite{KuklewiczHaskell}
described the POSIX rule as (section 1, last paragraph):
\begin{quote}
\begin{itemize}
\item
regular expressions (REs) take the leftmost starting match, and the longest match starting there
earlier subpatterns have leftmost-longest priority over later subpatterns\\
\item
higher-level subpatterns have leftmost-longest priority over their component subpatterns\\
\item
REs have right associative concatenation which can be changed with parenthesis\\
\item
parenthesized subexpressions return the match from their last usage\\
\item
text of component subexpressions must be contained in the text of the
higher-level subexpressions\\
\item
if "p" and "q" can never match the same text then "p|q" and "q|p" are equivalent, up to trivial renumbering of captured subexpressions\\
\item
if "p" in "p*" is used to capture non-empty text then additional repetitions of "p" will not capture an empty string\\
\end{itemize}
\end{quote}
%The text above
%is trying to capture something very precise,
%and is crying out for formalising.
Ausaf et al.~\cite{AusafDyckhoffUrban2016}
were the first to
give a quite simple formalised POSIX
specification in Isabelle/HOL, and also prove
that their specification coincides with the
POSIX specification given by Okui and Suzuki \cite{Okui10}.
They then formally proved the correctness of
a lexing algorithm by Sulzmann and Lu \cite{Sulzmann2014}
with regard to that specification.
They also found that the informal POSIX
specification by Sulzmann and Lu does not work for the correctness proof.

In the next section we will briefly
introduce Brzozowski derivatives and Sulzmann
and Lu's algorithm, on which the main part of this thesis builds.
%We give a taste of what they
%are like and why they are suitable for regular expression
%matching and lexing.
\section{Formal Specification of POSIX Matching
and Brzozowski Derivatives}
%Now we start with the central topic of the thesis: Brzozowski derivatives.
Brzozowski \cite{Brzozowski1964} first introduced the
concept of a \emph{derivative} of a regular expression in 1964.
The derivative of a regular expression $r$
with respect to a character $c$ is written as $r \backslash c$.
This operation tells us what $r$ transforms into
if we ``chop'' off the first character $c$
from all strings in the language of $r$ (defined
later as $L \; r$).
%To give a flavour of Brzozowski derivatives, we present
%two straightforward clauses from it:
%\begin{center}
%	\begin{tabular}{lcl}
%		$d \backslash c$     & $\dn$ &
%		$\mathit{if} \;c = d\;\mathit{then}\;\ONE\;\mathit{else}\;\ZERO$\\
%$(r_1 + r_2)\backslash c$     & $\dn$ & $r_1 \backslash c \,+\, r_2 \backslash c$\\
%	\end{tabular}
%\end{center}
%\noindent
%The first clause says that for the regular expression
%denoting a singleton set consisting of a single-character string $\{ d \}$,
%we check the derivative character $c$ against $d$,
%returning a set containing only the empty string $\{ [] \}$
%if $c$ and $d$ are equal, and the empty set $\varnothing$ otherwise.
%The second clause states that to obtain the regular expression
%representing all strings' head character $c$ being chopped off
%from $r_1 + r_2$, one simply needs to recursively take derivative
%of $r_1$ and $r_2$ and then put them together.
Derivatives have the property
that $s \in L \; (r\backslash c)$ if and only if
$c::s \in L \; r$ where $::$ stands for list prepending.
%This property can be used on regular expressions
%matching and lexing--to test whether a string $s$ is in $L \; r$,
%one simply takes derivatives of $r$ successively with
%respect to the characters (in the correct order) in $s$,
%and then test whether the empty string is in the last regular expression.
With this property, derivatives give a simple solution
to the problem of matching a string $s$ with a regular
expression $r$: if the derivative of $r$ w.r.t.\ (in
succession) all the characters of the string matches the empty string,
then $r$ matches $s$ (and {\em vice versa}).
%This makes formally reasoning about these properties such
%as correctness and complexity smooth and intuitive.
There have been several mechanised proofs of this property in various theorem
provers,
for example one by Owens and Slind \cite{Owens2008} in HOL4,
another by Krauss and Nipkow \cite{Nipkow98} in Isabelle/HOL, and
yet another in Coq by Coquand and Siles \cite{Coquand2012}.
In addition, one can extend derivatives to bounded repetitions
relatively straightforwardly. For example, the derivative of
$r^{\{n\}}$ (for $n > 0$) can be defined as:
\begin{center}
	\begin{tabular}{lcl}
		$r^{\{n\}} \backslash c$ & $\dn$ & $(r \backslash c) \cdot
		r^{\{n-1\}}$\\
	\end{tabular}
\end{center}
\noindent
Experimental results suggest that unlike DFA-based solutions
for bounded regular expressions,
derivatives can cope with
large counters
quite well.
There have also been
extensions to other constructs.
For example, Owens et al.\ include the derivatives
for the \emph{NOT} regular expression, which is
able to concisely express C-style comments of the form
$/* \ldots */$ (see \cite{Owens2008}).
Another extension for derivatives is
regular expressions with look-aheads, done
by Miyazaki and Minamide
\cite{Takayuki2019}.
%We therefore use Brzozowski derivatives on regular expressions
%lexing


Given the above definitions and properties of
Brzozowski derivatives, one quickly realises their potential
in generating a formally verified algorithm for lexing---the clauses and property
can be easily expressed in a functional programming language
or converted to theorem prover
code, with great extensibility.
Perhaps this is the reason why they have sparked quite a bit of interest
in the functional programming and theorem prover communities in the last
fifteen or so years (
\cite{Almeidaetal10}, \cite{Berglund14}, \cite{Berglund18},
\cite{Chen12} and \cite{Coquand2012}
to name a few), despite being buried in the ``sands of time'' \cite{Owens2008}
after they were first published by Brzozowski.


However, there are two difficulties with derivative-based matchers:
First, Brzozowski's original matcher only generates a yes/no answer
for whether a regular expression matches a string or not. This is too
little information in the context of lexing where separate tokens must
be identified and also classified (for example as keywords
or identifiers).
Second, derivative-based matchers need to be more efficient.
Elegant and beautiful
as many implementations are,
they can be excruciatingly slow.
For example, Sulzmann and Lu
claim a linear running time for their proposed algorithm,
but that was falsified by our experiments. The running time
is actually $\Omega(2^n)$ in the worst case.
A similar claim about a theoretical runtime of $O(n^2)$
is made for the Verbatim \cite{Verbatim}
%TODO: give references
lexer, which calculates POSIX matches and is based on derivatives.
They formalized the correctness of the lexer, but not their complexity result.
In the performance evaluation section, they analyzed the run time
of matching $a$ with the string
\begin{center}
$\underbrace{a \ldots a}_{\text{$n$ $a$'s}}$.
\end{center}
\noindent
They concluded that the algorithm is quadratic in terms of
the length of the input string.
When we tried out their extracted OCaml code with our example $(a+aa)^*$,
the time it took to match a string of 40 $a$'s was approximately 5 minutes.


\subsection{Sulzmann and Lu's Algorithm}
Sulzmann and Lu~\cite{Sulzmann2014} overcame the first
problem with the yes/no answer
by cleverly extending Brzozowski's matching
algorithm. Their extended version generates additional information on
\emph{how} a regular expression matches a string following the POSIX
rules for regular expression matching. They achieve this by adding a
second ``phase'' to Brzozowski's algorithm involving an injection
function.
In earlier work, Ausaf et al.\ provided the formal
specification of what POSIX matching means and proved in Isabelle/HOL
the correctness
of this extended algorithm accordingly
\cite{AusafDyckhoffUrban2016}.

The version of the algorithm proven correct
suffers heavily from a
second difficulty, where the internal derivatives can
grow to arbitrarily large sizes.
For example, if we start with the
regular expression $(a+aa)^*$ and take
successive derivatives according to the character $a$, we end up with
a sequence of ever-growing derivatives like

\def\ll{\stackrel{\_\backslash{} a}{\longrightarrow}}
\begin{center}
\begin{tabular}{rll}
$(a + aa)^*$ & $\ll$ & $(\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & $(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* \;+\; (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & $(\ZERO + \ZERO{}a + \ZERO) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^* \;+\; $\\
& & $\qquad(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & \ldots \hspace{15mm}(regular expressions of sizes 98, 169, 283, 468, 767, \ldots)
\end{tabular}
\end{center}

\noindent where after around 35 steps we run out of memory on a
typical computer. Clearly, the
notation involving $\ZERO$s and $\ONE$s already suggests
simplification rules that can be applied to regular
expressions, for example $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r
\Rightarrow r$, $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow
r$. While such simple-minded simplifications have been proved in
the work by Ausaf et al.\ to preserve the correctness of Sulzmann and Lu's
algorithm \cite{AusafDyckhoffUrban2016}, they unfortunately do
\emph{not} help with limiting the growth of the derivatives shown
above: the growth is slowed, but the derivatives can still grow rather
quickly beyond any finite bound.

Therefore we want to look in this thesis at a second
algorithm by Sulzmann and Lu where they
overcame this ``growth problem'' \cite{Sulzmann2014}.
In this version, POSIX values are
represented as bit sequences and such sequences are incrementally generated
when derivatives are calculated. The compact representation
of bit sequences and regular expressions allows them to define a more
``aggressive'' simplification method that keeps the size of the
derivatives finite no matter what the length of the string is.
They make some informal claims about the correctness and linear behaviour
of this version, but do not provide any supporting proof arguments, not
even ``pencil-and-paper'' arguments. They write about their bit-coded
\emph{incremental parsing method} (that is the algorithm to be formalised
in this dissertation):

\begin{quote}\it
``Correctness Claim: We further claim that the incremental parsing
method [..] in combination with the simplification steps [..]
yields POSIX parse trees. We have tested this claim
extensively [..] but yet
have to work out all proof details.'' \cite[Page 14]{Sulzmann2014}
\end{quote}
Ausaf and Urban made some initial progress towards the
full correctness proof but still had to leave out the optimisation
Sulzmann and Lu proposed.
Ausaf wrote \cite{Ausaf}:
\begin{quote}\it
``The next step would be to implement a more aggressive simplification procedure on annotated regular expressions and then prove the corresponding algorithm generates the same values as blexer. Alas due to time constraints we are unable to do so here.''
\end{quote}
This thesis implements the aggressive simplifications envisioned
by Ausaf and Urban,
together with a formal proof of the correctness of those simplifications.

One of the most recent works in the context of lexing
%with this issue
is the Verbatim lexer by Egolf, Lasser and Fisher~\cite{Verbatim}.
This is relevant work for us and we will compare it later with
the derivative-based matcher we are going to present.
There is also some newer work called
Verbatim++~\cite{Verbatimpp}, which does not use derivatives,
but deterministic finite automata instead.
%An example that gives problem to automaton approaches would be
%the regular expression $(a|b)^*a(a|b)^{\{n\}}$.
%It requires at least $2^{n+1}$ states to represent
%as a DFA.

%----------------------------------------------------------------------------------------
\section{Contribution}

In this thesis,
we propose a solution to catastrophic
backtracking and error-prone matchers: a formally verified
regular expression lexing algorithm
that is both fast
and correct by extending Ausaf et al.'s work.
The end result is %a regular expression lexing algorithm that comes with
\begin{itemize}
\item
an improved version of Sulzmann and Lu's bit-coded algorithm using
derivatives with simplifications,
accompanied by
a proven correctness theorem according to the POSIX specification
given by Ausaf et al.~\cite{AusafDyckhoffUrban2016},
\item
a complexity-related property for that algorithm saying that the
internal data structure will
remain finite,
\item
and an extension to
the bounded repetitions construct with the correctness and finiteness property
maintained.
\end{itemize}
\noindent
With a formal finiteness bound in place,
we can greatly reduce the attack surface of servers in terms of ReDoS attacks.
The Isabelle/HOL code for our formalisation can be
found at
\begin{center}
\url{https://github.com/hellotommmy/posix}
\end{center}
Further improvements to the algorithm with an even stronger version of
simplification can be made. We conjecture that the resulting size of derivatives
can be bounded by a cubic bound w.r.t.\ the size of the regular expression.



\section{Structure of the thesis}
In chapter \ref{Inj} we will introduce the concepts
and notations we
use for describing regular expressions and derivatives,
and the first version of Sulzmann and Lu's lexing algorithm without bitcodes (including
its correctness proof).
We will give their second lexing algorithm with bitcodes in chapter \ref{Bitcoded1}
together with the correctness proof by Ausaf and Urban.
Then we illustrate in chapter \ref{Bitcoded2}
how Sulzmann and Lu's
simplifications fail to simplify. We therefore introduce our version of the
algorithm with simplification and
its correctness proof.
In chapter \ref{Finite} we give the second guarantee
of our bitcoded algorithm, that is a finite bound on the size of any
regular expression's derivatives.
We also show how one can extend the
algorithm to include bounded repetitions.
In chapter \ref{Cubic} we discuss stronger simplification rules which
improve the finite bound to a cubic bound.%and the NOT regular expression.




%----------------------------------------------------------------------------------------


%----------------------------------------------------------------------------------------

%----------------------------------------------------------------------------------------

%----------------------------------------------------------------------------------------