% Chapter 1

\chapter{Introduction} % Main chapter title

\label{Introduction} % For referencing the chapter elsewhere, use \ref{Introduction}
%----------------------------------------------------------------------------------------

% Define some commands to keep the formatting separated from the content
\newcommand{\keyword}[1]{\textbf{#1}}
\newcommand{\tabhead}[1]{\textbf{#1}}
\newcommand{\code}[1]{\texttt{#1}}
\newcommand{\file}[1]{\texttt{\bfseries#1}}
\newcommand{\option}[1]{\texttt{\itshape#1}}

%boxes
\newcommand*{\mybox}[1]{\framebox{\strut #1}}

%\newcommand{\sflataux}[1]{\textit{sflat}\_\textit{aux} \, #1}
\newcommand\sflat[1]{\llparenthesis #1 \rrparenthesis }
\newcommand{\ASEQ}[3]{\textit{ASEQ}_{#1} \, #2 \, #3}
\newcommand{\bderssimp}[2]{#1 \backslash_{bsimps} #2}
\newcommand{\rderssimp}[2]{#1 \backslash_{rsimp} #2}
\def\derssimp{\textit{ders}\_\textit{simp}}
\def\rders{\textit{rders}}
\newcommand{\bders}[2]{#1 \backslash #2}
\newcommand{\bsimp}[1]{\textit{bsimp}(#1)}
\newcommand{\rsimp}[1]{\textit{rsimp}\; #1}
\newcommand{\sflataux}[1]{\llparenthesis #1 \rrparenthesis'}
\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%
\newcommand{\denote}{\stackrel{\mbox{\scriptsize denote}}{=}}%
\newcommand{\ZERO}{\mbox{\bf 0}}
\newcommand{\ONE}{\mbox{\bf 1}}
\newcommand{\AALTS}[2]{\oplus {\scriptstyle #1}\, #2}
\newcommand{\rdistinct}[2]{\textit{rdistinct} \;\; #1 \;\; #2}
\def\rDistinct{\textit{rdistinct}}
\newcommand\hflat[1]{\llparenthesis #1 \rrparenthesis_*}
\newcommand\hflataux[1]{\llparenthesis #1 \rrparenthesis_*'}
\newcommand\createdByStar[1]{\textit{createdByStar}(#1)}

\newcommand\myequiv{\mathrel{\stackrel{\makebox[0pt]{\mbox{\normalfont\tiny equiv}}}{=}}}

\def\case{\textit{case}}
\def\sequal{\stackrel{\mbox{\scriptsize rsimp}}{=}}
\def\rsimpalts{\textit{rsimp}_{ALTS}}
\def\good{\textit{good}}
\def\btrue{\textit{true}}
\def\bfalse{\textit{false}}
\def\bnullable{\textit{bnullable}}
\def\bnullables{\textit{bnullables}}
\def\Some{\textit{Some}}
\def\None{\textit{None}}
\def\code{\textit{code}}
\def\decode{\textit{decode}}
\def\internalise{\textit{internalise}}
\def\lexer{\mathit{lexer}}
\def\mkeps{\textit{mkeps}}
\newcommand{\rder}[2]{#2 \backslash_r #1}

\def\rerases{\textit{rerase}}

\def\nonnested{\textit{nonnested}}
\def\AZERO{\textit{AZERO}}
\def\sizeNregex{\textit{sizeNregex}}
\def\AONE{\textit{AONE}}
\def\ACHAR{\textit{ACHAR}}

\def\simpsulz{\textit{simp}_{Sulz}}

\def\scfrewrites{\stackrel{*}{\rightsquigarrow_{scf}}}
\def\frewrite{\rightsquigarrow_f}
\def\hrewrite{\rightsquigarrow_h}
\def\grewrite{\rightsquigarrow_g}
\def\frewrites{\stackrel{*}{\rightsquigarrow_f}}
\def\hrewrites{\stackrel{*}{\rightsquigarrow_h}}
\def\grewrites{\stackrel{*}{\rightsquigarrow_g}}
\def\fuse{\textit{fuse}}
\def\bder{\textit{bder}}
\def\der{\textit{der}}
\def\POSIX{\textit{POSIX}}
\def\ALTS{\textit{ALTS}}
\def\ASTAR{\textit{ASTAR}}
\def\DFA{\textit{DFA}}
\def\NFA{\textit{NFA}}
\def\bmkeps{\textit{bmkeps}}
\def\bmkepss{\textit{bmkepss}}
\def\retrieve{\textit{retrieve}}
\def\blexer{\textit{blexer}}
\def\flex{\textit{flex}}
\def\inj{\textit{inj}}
\def\Empty{\textit{Empty}}
\def\Left{\textit{Left}}
\def\Right{\textit{Right}}
\def\Stars{\textit{Stars}}
\def\Char{\textit{Char}}
\def\Seq{\textit{Seq}}
\def\Der{\textit{Der}}
\def\Ders{\textit{Ders}}
\def\nullable{\mathit{nullable}}
\def\Z{\mathit{Z}}
\def\S{\mathit{S}}
\def\rup{r^\uparrow}
%\def\bderssimp{\mathit{bders}\_\mathit{simp}}
\def\distinctWith{\textit{distinctWith}}
\def\lf{\textit{lf}}
\def\PD{\textit{PD}}
\def\suffix{\textit{Suffix}}
\def\distinctBy{\textit{distinctBy}}
\def\starupdate{\textit{starUpdate}}
\def\starupdates{\textit{starUpdates}}

\def\size{\mathit{size}}
\def\rexp{\mathbf{rexp}}
\def\simp{\mathit{simp}}
\def\simpALTs{\mathit{simp}\_\mathit{ALTs}}
\def\distinct{\mathit{distinct}}
\def\blexersimp{\mathit{blexer}\_\mathit{simp}}
\def\blexerStrong{\textit{blexerStrong}}
\def\bsimpStrong{\textit{bsimpStrong}}
%\def\bdersStrong{\textit{bdersStrong}}
\newcommand{\bdersStrong}[2]{#1 \backslash_{bsimpStrongs} #2}

\def\map{\textit{map}}
\def\rrexp{\textit{rrexp}}
\newcommand\rnullable[1]{\textit{rnullable} \; #1 }
\newcommand\rsize[1]{\llbracket #1 \rrbracket_r}
\newcommand\asize[1]{\llbracket #1 \rrbracket}
\newcommand\rerase[1]{ (#1)_{\downarrow_r}}

\newcommand\ChristianComment[1]{\textcolor{blue}{#1}\\}

\def\rflts{\textit{rflts}}
\def\rrewrite{\textit{rrewrite}}
\def\bsimpalts{\textit{bsimp}_{ALTS}}

\def\erase{\textit{erase}}
\def\STAR{\textit{STAR}}
\def\flts{\textit{flts}}

\def\zeroable{\textit{zeroable}}
\def\nub{\textit{nub}}
\def\filter{\textit{filter}}
\def\not{\textit{not}}

\def\RZERO{\mathbf{0}_r }
\def\RONE{\mathbf{1}_r}
\newcommand\RCHAR[1]{\mathbf{#1}_r}
\newcommand\RSEQ[2]{#1 \cdot #2}
\newcommand\RALTS[1]{\sum #1}
\newcommand\RSTAR[1]{#1^*}
\newcommand\vsuf[2]{\textit{Suffix} \;#1\;#2}
\lstdefinestyle{myScalastyle}{
  language=scala,
  aboveskip=3mm,
  belowskip=3mm,
  showstringspaces=false,
  columns=flexible,
  basicstyle={\small\ttfamily},
  numbers=none,
  numberstyle=\tiny\color{gray},
  keywordstyle=\color{blue},
  commentstyle=\color{dkgreen},
  stringstyle=\color{mauve},
  frame=single,
  breaklines=true,
  breakatwhitespace=true,
  tabsize=3,
}


%----------------------------------------------------------------------------------------
%This part is about regular expressions, Brzozowski derivatives,
%and a bit-coded lexing algorithm with proven correctness and time bounds.

%TODO: look up snort rules to use here--give readers idea of what regexes look like

\begin{figure}
\centering
\begin{tabular}{@{}c@{\hspace{0mm}}c@{\hspace{0mm}}c@{}}
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    ylabel={time in secs},
    enlargelimits=false,
    xtick={0,5,...,30},
    xmax=33,
    ymax=35,
    ytick={0,5,...,30},
    scaled ticks=false,
    axis lines=left,
    width=5cm,
    height=4cm,
    legend entries={JavaScript},
    legend pos=north west,
    legend cell align=left]
\addplot[red,mark=*, mark options={fill=white}] table {re-js.data};
\end{axis}
\end{tikzpicture}
  &
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    %ylabel={time in secs},
    enlargelimits=false,
    xtick={0,5,...,30},
    xmax=33,
    ymax=35,
    ytick={0,5,...,30},
    scaled ticks=false,
    axis lines=left,
    width=5cm,
    height=4cm,
    legend entries={Python},
    legend pos=north west,
    legend cell align=left]
\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
\end{axis}
\end{tikzpicture}
  &
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    %ylabel={time in secs},
    enlargelimits=false,
    xtick={0,5,...,30},
    xmax=33,
    ymax=35,
    ytick={0,5,...,30},
    scaled ticks=false,
    axis lines=left,
    width=5cm,
    height=4cm,
    legend entries={Java 8},
    legend pos=north west,
    legend cell align=left]
\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
\end{axis}
\end{tikzpicture}\\
\multicolumn{3}{c}{Graphs: Runtime for matching $(a^*)^*\,b$ with strings
           of the form $\underbrace{aa..a}_{n}$.}
\end{tabular}
\caption{Runtime of matching $(a^*)^*\,b$ against strings of the form
$\underbrace{aa..a}_{n}$ in JavaScript, Python and Java~8.}
\label{fig:aStarStarb}
\end{figure}


Regular expressions are widely used in computer science:
be it in text-editors \parencite{atomEditor} with syntax highlighting and auto-completion;
command-line tools like $\mathit{grep}$ that facilitate easy
text-processing; network intrusion
detection systems that reject suspicious traffic; or compiler
front ends. The majority of the solutions to these tasks
involve lexing with regular expressions.
Given their usefulness and ubiquity, one would imagine that
modern regular expression matching implementations
are mature and fully studied.
Indeed, supplying a popular programming language's regex engine
with regular expressions and strings, one can
obtain rich matching information in a very short time.
Some network intrusion detection systems
use regex engines that are able to process
megabytes or even gigabytes of data per second \parencite{Turo_ov__2020}.
Unfortunately, this is not the case for \textbf{all} inputs.
%TODO: get source for SNORT/BRO's regex matching engine/speed


Take $(a^*)^*\,b$ and ask whether
strings of the form $aa..a$ match this regular
expression. Obviously this is not the case---the expected $b$ in the last
position is missing. One would expect that modern regular expression
matching engines can find this out very quickly. Alas, if one tries
this example in JavaScript, Python or Java 8, even with strings of a small
length, say around 30 $a$'s, one discovers that
this decision takes an unreasonably long time given the
simplicity of the problem.
This is clearly exponential behaviour, and
is triggered by some relatively simple regex patterns, as the graphs
in Figure~\ref{fig:aStarStarb} show.
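
This behaviour is easy to reproduce on the JVM. The following short
Scala sketch (our own illustration; it assumes nothing beyond the
standard library) times Java's built-in backtracking engine on exactly
this example; the measured runtime roughly doubles with each
additional $a$:
\begin{lstlisting}[style=myScalastyle]
import java.util.regex.Pattern

object CatastrophicBacktracking {
  def main(args: Array[String]): Unit = {
    val evil = Pattern.compile("(a*)*b")
    for (n <- 20 to 30) {
      val input = "a" * n      // n a's and no final b: no match exists
      val start = System.nanoTime()
      evil.matcher(input).matches()
      val secs  = (System.nanoTime() - start) / 1e9
      println(f"n = $n%2d  time = $secs%.2f s")
    }
  }
}
\end{lstlisting}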

This superlinear blowup in regular expression engines
has repeatedly caused grief in real life.
For example, on 20 July 2016 one evil
regular expression brought the webpage
\href{http://stackexchange.com}{Stack Exchange} to its
knees.\footnote{\url{https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016} (Last accessed in 2019)}
In this instance, a regular expression intended to just trim white
spaces from the beginning and the end of a line actually consumed
massive amounts of CPU resources---causing the web servers to grind to a
halt. In this example, the time needed to process
the string was $O(n^2)$ with respect to the string length. This
quadratic overhead was enough for the homepage of Stack Exchange to
respond so slowly that the load balancer assumed a $\mathit{DoS}$
attack and therefore stopped the servers from responding to any
requests. This made the whole site unavailable.
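
The quadratic behaviour of such trimming regexes is easy to
demonstrate. Below is a small Scala sketch (ours; the pattern is an
illustrative trailing-whitespace regex, not the exact expression used
by Stack Exchange). On a line of $n$ spaces followed by a non-space,
the backtracking engine restarts the match attempt at every position,
giving $O(n^2)$ time:
\begin{lstlisting}[style=myScalastyle]
import java.util.regex.Pattern

object QuadraticTrim {
  def main(args: Array[String]): Unit = {
    val trim = Pattern.compile("\\s+$")  // trim trailing whitespace
    for (n <- List(10000, 20000, 40000)) {
      val line  = " " * n + "x"          // n spaces, then a non-space
      val start = System.nanoTime()
      trim.matcher(line).find()
      val secs  = (System.nanoTime() - start) / 1e9
      println(f"n = $n%6d  time = $secs%.2f s")
    }
  }
}
\end{lstlisting}
Doubling $n$ roughly quadruples the measured time.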

A more recent example is a global outage of all Cloudflare servers on 2 July
2019. A poorly written regular expression exhibited exponential
behaviour and exhausted the CPUs that serve HTTP traffic. Although the outage
had several causes, at its heart was a regular expression that
was used to monitor network
traffic.\footnote{\url{https://blog.cloudflare.com/details-of-the-cloudflare-outage-on-july-2-2019/} (Last accessed in 2022)}
%TODO: data points for some new versions of languages
These problems with regular expressions
are not isolated events that happen
only occasionally; they are actually widespread.
They occur so often that they have been given a
name: Regular-Expression-Denial-Of-Service (ReDoS)
attacks.
\citeauthor{Davis18} detected more
than 1000 super-linear (SL) regular expressions
in Node.js, Python core libraries, and npm and pypi.
They therefore concluded that evil regular expressions
are a problem ``more than a parlour trick'', one that
requires more research attention.


But the problems are not limited to slowness on certain
cases.
Another problem with these libraries is that they
offer no correctness guarantee.
In some cases, they either fail to generate a lexing result when there exists a match,
or give results that are inconsistent with the $\POSIX$ standard.
A concrete example would be
the regex
\begin{verbatim}
(aba|ab|a)*
\end{verbatim}
and the string
\begin{verbatim}
ababa
\end{verbatim}
The correct $\POSIX$ match for the above is
the entire string $ababa$,
split into two Kleene star iterations, $ab$ and $aba$, at positions
$[0, 2)$ and $[2, 5)$
respectively.
But trying this out in regex101 \parencite{regex101}
with different language engines yields
the same two fragmented matches: $aba$ at $[0, 3)$
and $a$ at $[4, 5)$.

Kuklewicz \parencite{KuklewiczHaskell} commented that most regex libraries are not
correctly implementing the POSIX (maximum-munch)
rule of regular expression matching.

As Grathwohl \parencite{grathwohl2014crash} commented,
\begin{center}
	``The POSIX strategy is more complicated than the greedy because of the dependence on information about the length of matched strings in the various subexpressions.''
\end{center}


To summarise the above: regular expressions are important.
They are popular, and programming languages' library functions
for them are very fast on non-catastrophic cases.
But there are two problems with current practical implementations.
The first is that the running time might blow up.
The second is that they might give wrong results on certain
very simple cases.
In the next part of the chapter, we will look into the reasons why
certain regex engines run horribly slowly on ``catastrophic''
cases, and propose a solution that addresses both of these problems,
based on the work of Brzozowski and of Sulzmann and Lu.


\section{Why are current regex engines slow?}

%find literature/find out for yourself that REGEX->DFA on basic regexes
%does not blow up the size
Shouldn't regular expression matching be linear?
How can one explain the super-linear behaviour of the
regex matching engines we have?
The time cost of regex matching algorithms in general
involves two different phases, and different things can go wrong in
each of them:
$\DFA$s usually have problems in the first (construction) phase,
whereas $\NFA$s usually run into trouble
in the second phase.

\subsection{Different Phases of a Matching/Lexing Algorithm}


Most lexing algorithms can be roughly divided into
two phases during their run.
The first phase is the ``construction'' phase,
in which the algorithm builds some
suitable data structure from the input regex $r$, so that
it can be easily operated on later.
We denote
the time cost of this phase by $P_1(r)$.
The second phase is the lexing phase, in which the input string
$s$ is read and the data structure
representing the regex $r$ is operated on.
We denote the time
this phase takes by $P_2(r, s)$.\\

For a $\mathit{DFA}$,
we have $P_2(r, s) = O(|s|)$,
because we take at most $|s|$ steps,
and each step takes
at most one transition:
a deterministic finite automaton
by definition has at most one state active and takes at most one
transition upon receiving an input symbol.
But unfortunately, in the worst case
$P_1(r) = O(2^{|r|})$. An example will be given later.


For $\mathit{NFA}$s, we have $P_1(r) = O(|r|)$ if we do not unfold
expressions like $r^n$ into $\underbrace{r \cdots r}_{n \text{ copies of } r}$.
$P_2(r, s)$ is bounded by $O(|r|\cdot|s|)$, if we do not backtrack.
On the other hand, if backtracking is used, the worst-case time bound bloats
to $O(|r| \cdot 2^{|s|})$.
%on the input
%And when calculating the time complexity of the matching algorithm,
%we are assuming that each input reading step requires constant time.
%which translates to that the number of
%states active and transitions taken each time is bounded by a
%constant $C$.
%But modern regex libraries in popular language engines
% often want to support much richer constructs than just
% sequences and Kleene stars,
%such as negation, intersection,
%bounded repetitions and back-references.
%And de-sugaring these "extended" regular expressions
%into basic ones might bloat the size exponentially.
%TODO: more reference for exponential size blowup on desugaring.

\subsection{Why $\mathit{DFA}$s can be slow in the first phase}


The good thing about $\mathit{DFA}$s is that, once
generated, they are fast and stable, unlike
backtracking algorithms.
However, they do not scale well with bounded repetitions.

\subsubsection{Problems with Bounded Repetitions}
A bounded repetition, usually written in the form
$r^{\{c\}}$ (where $c$ is a constant natural number),
denotes a regular expression accepting strings
that can be divided into $c$ substrings, where each
substring is in $r$.
For the regular expression $(a|b)^*a(a|b)^{\{2\}}$,
an $\mathit{NFA}$ describing it would look like:
\begin{center}
\begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto]
   \node[state,initial] (q_0)   {$q_0$};
   \node[state, red] (q_1) [right=of q_0] {$q_1$};
   \node[state, red] (q_2) [right=of q_1] {$q_2$};
   \node[state, accepting, red](q_3) [right=of q_2] {$q_3$};
    \path[->]
    (q_0) edge  node {a} (q_1)
    	  edge [loop below] node {a,b} ()
    (q_1) edge  node  {a,b} (q_2)
    (q_2) edge  node  {a,b} (q_3);
\end{tikzpicture}
\end{center}
The red states are ``countdown states'', which count down
the number of characters needed in addition to the current
string to make a successful match.
For example, state $q_1$ indicates a match that has
gone past the $(a|b)^*$ part of $(a|b)^*a(a|b)^{\{2\}}$,
has just consumed the ``delimiter'' $a$ in the middle, and
needs to match two more iterations of $(a|b)$ to complete.
State $q_2$, on the other hand, can be viewed as a state
after $q_1$ has consumed one character, and just waits
for one more character to complete.
$q_3$ is the last state; it requires no more characters and is accepting.
Depending on the part of the
input string that has been read so far,
the states $q_1$, $q_2$ and $q_3$
may or may
not be active, independently of each other.
A $\mathit{DFA}$ for such an $\mathit{NFA}$ would
contain at least $2^3$ non-equivalent states that cannot be merged,
because the subset construction during determinisation will generate
all the elements in the power set $\mathit{Pow}\{q_1, q_2, q_3\}$.
Generalising this to regular expressions with a larger
bounded repetition number, we have that
regexes of the shape $r^*ar^{\{n\}}$, when converted to $\mathit{DFA}$s,
require at least $2^{n+1}$ states if $r$ contains
more than one string.
These states are needed to represent all the different
scenarios in which ``countdown'' states are active.
For those regexes, tools that use $\DFA$s will run
out of memory.
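
The blowup is easy to observe computationally. The following Scala
sketch (our own illustration, not part of the formal development) runs
the subset construction on the countdown $\mathit{NFA}$ for
$(a|b)^*a(a|b)^{\{n\}}$, encoded with state $0$ looping on $a,b$ and
moving to $1$ on $a$, and states $1$ to $n$ counting down; it counts
the reachable $\mathit{DFA}$ states, which grow roughly as $2^{n+1}$:
\begin{lstlisting}[style=myScalastyle]
object CountdownBlowup {
  // one NFA step for the countdown automaton with states 0..n+1
  def step(states: Set[Int], c: Char, n: Int): Set[Int] =
    states.flatMap { q =>
      if (q == 0) { if (c == 'a') Set(0, 1) else Set(0) }
      else if (q <= n) Set(q + 1)
      else Set[Int]()                  // n+1 is the accepting state
    }

  // subset construction: count the reachable sets of NFA states
  def dfaStates(n: Int): Int = {
    var seen     = Set(Set(0))
    var frontier = List(Set(0))
    while (frontier.nonEmpty) {
      val next = for (s <- frontier; c <- List('a', 'b'))
                 yield step(s, c, n)
      frontier = next.filterNot(seen).distinct
      seen     = seen ++ frontier
    }
    seen.size
  }

  def main(args: Array[String]): Unit =
    for (n <- 1 to 10)
      println(s"n = $n  reachable DFA states = ${dfaStates(n)}")
}
\end{lstlisting}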

\subsubsection{Tools that use $\mathit{DFA}$s}
%TODO:more tools that use DFAs?
$\mathit{LEX}$ and $\mathit{JFLEX}$ are tools
in C and Java that generate $\mathit{DFA}$-based
lexers. The user provides a set of regular expressions
and configurations to such lexer generators, and then
gets an output program encoding a minimized $\mathit{DFA}$
that can be compiled and run.
When given the above countdown regular expression,
already a small number $n$ would result in a determinised automaton
with millions of states.

For this reason, regex libraries that support
bounded repetitions often choose to use the $\mathit{NFA}$
approach.




\subsection{Why $\mathit{NFA}$s can be slow in the second phase}
When one constructs an $\NFA$ out of a regular expression
there is often very little to be done in the first phase: one simply
constructs the $\NFA$ states based on the structure of the input regular expression.

In the lexing phase, one can simulate the $\mathit{NFA}$ in two ways.
The first is to keep track of all active states after consuming
a character, and to update that set of states iteratively.
This can be viewed as a breadth-first search of the $\mathit{NFA}$
for a path terminating
at an accepting state.
Languages like $\mathit{Go}$ and $\mathit{Rust}$ use this
type of $\mathit{NFA}$ simulation, and it guarantees a linear runtime
in terms of the input string length.
%TODO:try out these lexers
The other way to use an $\mathit{NFA}$ for matching is to choose
a single transition each time, keeping all the other options in
a queue or stack, and backtracking if that choice eventually
fails. This method, often called ``depth-first search'',
is efficient in a lot of cases, but can end up
with exponential run time.
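
The breadth-first simulation is only a few lines of code. The
following sketch (a simplification we give for illustration, with
character-labelled transitions and no $\epsilon$-edges) advances a
whole set of states per input character, so each of the $|s|$ steps
touches at most as many states as the automaton has:
\begin{lstlisting}[style=myScalastyle]
object NfaSimulation {
  // an NFA given by start states, transitions and an accepting test
  case class Nfa(starts: Set[Int],
                 delta: (Int, Char) => Set[Int],
                 accepts: Int => Boolean)

  // breadth-first: fold over the input, keeping all active states
  def matches(nfa: Nfa, s: String): Boolean =
    s.foldLeft(nfa.starts) { (active, c) =>
      active.flatMap(q => nfa.delta(q, c))
    }.exists(nfa.accepts)

  def main(args: Array[String]): Unit = {
    // the countdown NFA for (a|b)*a(a|b){2} from the previous section
    val delta = (q: Int, c: Char) =>
      if (q == 0) { if (c == 'a') Set(0, 1) else Set(0) }
      else if (q <= 2) Set(q + 1)
      else Set[Int]()
    val nfa = Nfa(Set(0), delta, _ == 3)
    println(matches(nfa, "abaab"))   // true
    println(matches(nfa, "abba"))    // false
  }
}
\end{lstlisting}
No matter how many paths through the $\mathit{NFA}$ exist, this
simulation does a bounded amount of work per input character.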
%TODO:COMPARE java python lexer speed with Rust and Go
The reason that backtracking is used in languages like
Java and Python is that they need to support back-references.

\subsubsection{Back References}
If we have a regular expression like this (the sequence
operator is omitted for brevity):
\begin{center}
	$r_1(r_2(r_3r_4))$
\end{center}
we could label sub-expressions of interest
by parenthesizing them and giving
them a number in the order in which their opening parentheses appear.
One possible way of parenthesizing and labelling is given below:
\begin{center}
	$\underset{1}{(}r_1\underset{2}{(}r_2\underset{3}{(}r_3)\underset{4}{(}r_4)))$
\end{center}
The sub-expressions $r_1r_2r_3r_4$, $r_2r_3r_4$, $r_3$ and $r_4$ are labelled
by 1 to 4: $1$ refers to the entire expression
$(r_1(r_2(r_3)(r_4)))$, $2$ refers to $r_2(r_3)(r_4)$, etc.
These sub-expressions are called ``capturing groups''.
We can use the following syntax to denote that we want a string just matched by a
sub-expression (capturing group) to appear at a certain location again,
exactly as it was:
\begin{center}
$\ldots\underset{\text{i-th lparen}}{(}{r_i})\ldots
\underset{s_i \text{ which just matched} \;r_i}{\backslash i}$
\end{center}
The backslash and number $i$ are used to denote such
so-called ``back-references''.
Let $e$ be an expression made of regular expressions
and back-references, and let $e$ contain the expression $e_i$
as its $i$-th capturing group.
The semantics of back-references can be recursively
written as:
\begin{center}
	\begin{tabular}{c}
		$L(e \cdot \backslash i) = \{s @ s_i \mid s \in L(e),\; s_i \in L(e_i),$\\
		$s_i \;\text{is the string matched by the $i$-th capturing group of $e$ on $s$}\}$
	\end{tabular}
\end{center}
As a concrete example, the regex
$((a|b|c|\ldots|z)^*)\backslash 1$
matches strings like $\mathit{bobo}$, $\mathit{weewee}$, etc.\\
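In mainstream engine syntax this is written with a backslash-escaped
group number. For instance, the following sketch (ours) shows the JVM
engine accepting exactly such ``squared'' strings:
\begin{lstlisting}[style=myScalastyle]
import java.util.regex.Pattern

object BackrefDemo {
  def main(args: Array[String]): Unit = {
    // ((a|b|...|z)*)\1 : some lowercase word repeated twice
    val squared = Pattern.compile("([a-z]*)\\1")
    println(squared.matcher("bobo").matches())   // true, group 1 = "bo"
    println(squared.matcher("weewee").matches()) // true, group 1 = "wee"
    println(squared.matcher("bob").matches())    // false
  }
}
\end{lstlisting}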
+ − 599
Back-reference is a construct in the "regex" standard
+ − 600
that programmers found useful, but not exactly
+ − 601
regular any more.
+ − 602
In fact, that allows the regex construct to express
+ − 603
languages that cannot be contained in context-free
+ − 604
languages either.
+ − 605
For example, the back-reference $((a^*)b\backslash1 b \backslash 1$
+ − 606
expresses the language $\{a^n b a^n b a^n\mid n \in \mathbb{N}\}$,
+ − 607
which cannot be expressed by context-free grammars\parencite{campeanu2003formal}.
+ − 608
Such a language is contained in the context-sensitive hierarchy
+ − 609
of formal languages.
+ − 610
Solving the back-reference expressions matching problem
+ − 611
is NP-complete\parencite{alfred2014algorithms} and a non-bactracking,
+ − 612
efficient solution is not known to exist.
+ − 613
%TODO:read a bit more about back reference algorithms

It seems that languages like Java and Python made the trade-off
to support back-references at the expense of having to backtrack,
even in the case of regexes not involving back-references.\\
Summing these up, we can categorise existing
practical regex libraries into ones with linear
time guarantees, like those of Go and Rust, which impose restrictions
on the user input (not allowing back-references,
bounded repetitions cannot exceed 1000, etc.), and ones
that allow the programmer much freedom, but grind to a halt
on some non-negligible portion of cases.
%TODO: give examples such as RE2 GOLANG 1000 restriction, rust no repetitions
% For example, the Rust regex engine claims to be linear,
% but does not support lookarounds and back-references.
% The GoLang regex library does not support over 1000 repetitions.
% Java and Python both support back-references, but shows
%catastrophic backtracking behaviours on inputs without back-references(
%when the language is still regular).
%TODO: test performance of Rust on (((((a*a*)b*)b){20})*)c  baabaabababaabaaaaaaaaababaaaababababaaaabaaabaaaaaabaabaabababaababaaaaaaaaababaaaababababaaaaaaaaaaaaac
%TODO: verify the fact Rust does not allow 1000+ reps
So we have practical implementations
of regular expression matching/lexing which are fast
but do not come with any guarantees that they will not grind to a halt
or give wrong answers.
Our goal is to have a regex lexing algorithm that comes with
\begin{itemize}
\item
proven correctness
\item
proven non-catastrophic properties
\item
easy extensions to
constructs like
 bounded repetitions, negation, lookarounds, and even back-references.
 \end{itemize}

\section{Our Solution: Formal Specification of POSIX Matching and Brzozowski Derivatives}
We propose Brzozowski derivatives of regular expressions as
a solution to these problems.
In the last fifteen or so years, Brzozowski's derivatives of regular
expressions have sparked quite a bit of interest in the functional
programming and theorem prover communities.

\subsection{Motivation}

Derivatives give a simple solution
to the problem of matching a string $s$ with a regular
expression $r$: if the derivative of $r$ w.r.t.\ (in
succession) all the characters of the string matches the empty string,
then $r$ matches $s$ (and {\em vice versa}).
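
This slogan translates almost verbatim into code. Below is a minimal
sketch in Scala (the datatype and function names are ours for this
illustration; the thesis fixes its own notation later):
\begin{lstlisting}[style=myScalastyle]
// basic regular expressions
abstract class Rexp
case object ZERO extends Rexp                    // matches nothing
case object ONE extends Rexp                     // matches only ""
case class CHAR(c: Char) extends Rexp
case class ALT(r1: Rexp, r2: Rexp) extends Rexp
case class SEQ(r1: Rexp, r2: Rexp) extends Rexp
case class STAR(r: Rexp) extends Rexp

object Matcher {
  // can r match the empty string?
  def nullable(r: Rexp): Boolean = r match {
    case ZERO        => false
    case ONE         => true
    case CHAR(_)     => false
    case ALT(r1, r2) => nullable(r1) || nullable(r2)
    case SEQ(r1, r2) => nullable(r1) && nullable(r2)
    case STAR(_)     => true
  }

  // the derivative of r w.r.t. the character c
  def der(c: Char, r: Rexp): Rexp = r match {
    case ZERO        => ZERO
    case ONE         => ZERO
    case CHAR(d)     => if (c == d) ONE else ZERO
    case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))
    case SEQ(r1, r2) =>
      if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
      else SEQ(der(c, r1), r2)
    case STAR(r1)    => SEQ(der(c, r1), STAR(r1))
  }

  // r matches s iff the iterated derivative is nullable
  def matches(r: Rexp, s: String): Boolean =
    nullable(s.foldLeft(r)((r1, c) => der(c, r1)))
}
\end{lstlisting}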

The beauty of
Brzozowski's derivatives \parencite{Brzozowski1964} is that they are neatly
expressible in any functional language, and easily definable and
reasoned about in theorem provers---the definitions just consist of
inductive datatypes and simple recursive functions.
Moreover, an algorithm based on them by
Sulzmann and Lu \parencite{Sulzmann2014} allows easy extension
to include extended regular expressions and
simplification of the internal data structures,
eliminating the exponential behaviours.

However, two difficulties with derivative-based matchers exist:
\subsubsection{Problems with Current Brzozowski Matchers}
First, Brzozowski's original matcher only generates a yes/no answer
for whether a regular expression matches a string or not.  This is too
little information in the context of lexing where separate tokens must
be identified and also classified (for example as keywords
or identifiers).  Sulzmann and Lu~\cite{Sulzmann2014} overcome this
difficulty by cleverly extending Brzozowski's matching
algorithm. Their extended version generates additional information on
\emph{how} a regular expression matches a string following the POSIX
rules for regular expression matching. They achieve this by adding a
second ``phase'' to Brzozowski's algorithm involving an injection
function.  In our own earlier work, we provided the formal
specification of what POSIX matching means and proved in Isabelle/HOL
the correctness
of Sulzmann and Lu's extended algorithm accordingly
\cite{AusafDyckhoffUrban2016}.
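
To give a flavour of this second phase: once all derivatives for a
matching string have been taken, a \emph{value} recording how the
regex matched is first built for the empty string by a function
usually called $\mkeps$, and then transformed back through the
derivatives by the injection function. A sketch of the value datatype
and of $\mkeps$, building on the $\textit{Rexp}$ datatype above
(constructor names are ours; $\textit{LeftV}$/$\textit{RightV}$
merely avoid clashing with Scala's built-in $\textit{Either}$), is:
\begin{lstlisting}[style=myScalastyle]
// values record how a regular expression matched a string
abstract class Val
case object Empty extends Val
case class Chr(c: Char) extends Val
case class Sequ(v1: Val, v2: Val) extends Val
case class LeftV(v: Val) extends Val
case class RightV(v: Val) extends Val
case class Stars(vs: List[Val]) extends Val

object Lexing {
  import Matcher._    // nullable, from the sketch above

  // how a regex matches the empty string; only ever called on
  // nullable regexes, hence the partial pattern match
  def mkeps(r: Rexp): Val = r match {
    case ONE         => Empty
    case ALT(r1, r2) =>
      if (nullable(r1)) LeftV(mkeps(r1)) else RightV(mkeps(r2))
    case SEQ(r1, r2) => Sequ(mkeps(r1), mkeps(r2))
    case STAR(_)     => Stars(Nil)
  }
}
\end{lstlisting}
The injection function then adds back, one character at a time, the
characters consumed by the derivatives; it is part of the algorithm
described in Chapter~\ref{Inj}.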

The second difficulty is that Brzozowski's derivatives can
grow to arbitrarily big sizes. For example if we start with the
regular expression $(a+aa)^*$ and take
successive derivatives according to the character $a$, we end up with
a sequence of ever-growing derivatives like

\def\ll{\stackrel{\_\backslash{} a}{\longrightarrow}}
\begin{center}
\begin{tabular}{rll}
$(a + aa)^*$ & $\ll$ & $(\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & $(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* \;+\; (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & $(\ZERO + \ZERO{}a + \ZERO) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^* \;+\; $\\
& & $\qquad(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & \ldots \hspace{15mm}(regular expressions of sizes 98, 169, 283, 468, 767, \ldots)
\end{tabular}
\end{center}

\noindent where after around 35 steps we run out of memory on a
typical computer (we shall define shortly the precise details of our
regular expressions and the derivative operation).  Clearly, the
notation involving $\ZERO$s and $\ONE$s already suggests
simplification rules that can be applied to regular
expressions, for example $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r
\Rightarrow r$, $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow
r$. While such simple-minded simplifications have been proved in our
earlier work to preserve the correctness of Sulzmann and Lu's
algorithm \cite{AusafDyckhoffUrban2016}, they unfortunately do
\emph{not} help with limiting the growth of the derivatives shown
above: the growth is slowed, but the derivatives can still grow rather
quickly beyond any finite bound.
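
These rules are easy to bolt onto the derivative matcher sketched
earlier: apply them bottom-up after each derivative step. A minimal
version (our illustration; not yet the simplification studied in this
thesis) looks as follows:
\begin{lstlisting}[style=myScalastyle]
object SimpleSimp {
  import Matcher._  // der and nullable; Rexp is defined at top level

  // 0 r => 0, r 0 => 0, 1 r => r, r 1 => r, 0 + r => r, r + r => r
  def simp(r: Rexp): Rexp = r match {
    case SEQ(r1, r2) => (simp(r1), simp(r2)) match {
      case (ZERO, _)  => ZERO
      case (_, ZERO)  => ZERO
      case (ONE, s2)  => s2
      case (s1, ONE)  => s1
      case (s1, s2)   => SEQ(s1, s2)
    }
    case ALT(r1, r2) => (simp(r1), simp(r2)) match {
      case (ZERO, s2) => s2
      case (s1, ZERO) => s1
      case (s1, s2)   => if (s1 == s2) s1 else ALT(s1, s2)
    }
    case _ => r
  }

  // interleave simplification with the derivative steps
  def matchesSimp(r: Rexp, s: String): Boolean =
    nullable(s.foldLeft(r)((r1, c) => simp(der(c, r1))))
}
\end{lstlisting}
With these rules the derivatives of $(a+aa)^*$ grow much more slowly,
but, as stated above, they still grow beyond any finite bound.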


Sulzmann and Lu overcome this ``growth problem'' in a second algorithm
\cite{Sulzmann2014} where they introduce bit-coded
regular expressions. In this version, POSIX values are
represented as bit sequences and such sequences are incrementally generated
when derivatives are calculated. The compact representation
of bit sequences and regular expressions allows them to define a more
``aggressive'' simplification method that keeps the size of the
derivatives finite no matter what the length of the string is.
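
The rough shape of these bit-coded regular expressions can be sketched
as a datatype in which every node carries a sequence of bits recording
the matching decisions made so far (a sketch with names of our
choosing; the precise definitions appear later in this thesis):
\begin{lstlisting}[style=myScalastyle]
object Bitcoded {
  type Bits = List[Boolean]

  // annotated regular expressions: like Rexp, but every
  // constructor carries the bits accumulated for it
  abstract class ARexp
  case object AZERO extends ARexp
  case class AONE(bs: Bits) extends ARexp
  case class ACHAR(bs: Bits, c: Char) extends ARexp
  case class AALTS(bs: Bits, rs: List[ARexp]) extends ARexp
  case class ASEQ(bs: Bits, r1: ARexp, r2: ARexp) extends ARexp
  case class ASTAR(bs: Bits, r: ARexp) extends ARexp
}
\end{lstlisting}
After the whole string has been consumed, the accumulated bits of a
nullable derivative can be decoded into a POSIX value.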
They make some informal claims about the correctness and linear behaviour
of this version, but do not provide any supporting proof arguments, not
even ``pencil-and-paper'' arguments. They write about their bit-coded
\emph{incremental parsing method} (that is the algorithm to be formalised
in this dissertation):



\begin{quote}\it
  ``Correctness Claim: We further claim that the incremental parsing
  method [..] in combination with the simplification steps [..]
  yields POSIX parse trees. We have tested this claim
  extensively [..] but yet
  have to work out all proof details.'' \cite[Page 14]{Sulzmann2014}
\end{quote}

Ausaf and Urban were able to back this correctness claim with
a formal proof.

But as they stated,
\begin{quote}\it
The next step would be to implement a more aggressive simplification procedure on annotated regular expressions and then prove the corresponding algorithm generates the same values as blexer. Alas due to time constraints we are unable to do so here.
\end{quote}

This thesis implements the aggressive simplifications envisioned
by Ausaf and Urban,
and gives a formal proof of correctness in the presence of those simplifications.


%----------------------------------------------------------------------------------------
\section{Contribution}



This work addresses the vulnerability of super-linear and
buggy regex implementations by the combination
of Brzozowski's derivatives and interactive theorem proving.
We give an
improved version of Sulzmann and Lu's bit-coded algorithm using
derivatives, which comes with a formal guarantee in terms of correctness and
running time as an Isabelle/HOL proof.
Further improvements to the algorithm are made with an even stronger version of
simplification.
We have not yet formalised a proof for this stronger version, but believe that
it leads to a time bound linear in the input length and
cubic in the regular expression size, using a technique by
Antimirov \cite{Antimirov}.


The main contributions of this thesis are
\begin{itemize}
\item
a proven-correct lexing algorithm
\item
with formalised finite bounds on the sizes of its internal data structures.
\end{itemize}

To the best of our knowledge, no lexing libraries using Brzozowski derivatives
have a provable time guarantee,
and claims about running time are usually speculative and backed by thin empirical
evidence.
%TODO: give references
For example, Sulzmann and Lu had proposed an algorithm in which they
claim a linear running time.
But that claim was falsified by our experiments: the running time
is actually $\Omega(2^n)$ in the worst case.
A similar claim about a theoretical runtime of $O(n^2)$ is made for the Verbatim
%TODO: give references
lexer, which calculates POSIX matches and is based on derivatives.
They formalized the correctness of the lexer, but not its complexity.
In the performance evaluation section, they simply analysed the run time
of matching $a$ with the string $\underbrace{a \ldots a}_{n\; a\text{'s}}$
and concluded that the algorithm is quadratic in terms of input length.
When we tried out their extracted OCaml code with our example $(a+aa)^*$,
the time it took to lex only 40 $a$'s was 5 minutes.



\subsection{Related Work}
We are aware
of a mechanised correctness proof of Brzozowski's derivative-based matcher in HOL4 by
Owens and Slind~\parencite{Owens2008}. Another one in Isabelle/HOL is part
of the work by Krauss and Nipkow \parencite{Krauss2011}. And another one
in Coq is given by Coquand and Siles \parencite{Coquand2012}.
Also Ribeiro and Du Bois give one in Agda \parencite{RibeiroAgda2017}.


When a regular expression does not behave as intended,
people usually try to rewrite the regex to some equivalent form
or they try to avoid the possibly problematic patterns completely,
for which many false positives exist \parencite{Davis18}.
Animated tools to ``debug'' regular expressions such as
\parencite{regexploit2021} and \parencite{regex101} are also popular.
We are also aware of static analysis work on regular expressions that
aims to detect potentially exponential regex patterns. Rathnayake and Thielecke
\parencite{Rathnayake2014StaticAF} proposed an algorithm
that detects regular expressions triggering exponential
behaviour on backtracking matchers.
Weideman \parencite{Weideman2017Static} came up with
non-linear polynomial worst-time estimates
for regexes, attack strings that exploit the worst-case
scenario, and ``attack automata'' that generate
attack strings.




\section{Structure of the thesis}
In Chapter~\ref{Inj} we will introduce the concepts
and notations we
use for describing the lexing algorithm by Sulzmann and Lu,
and then give the lexing algorithm.
We will give its variant in Chapter~\ref{Bitcoded1}.
Then we illustrate in Chapter~\ref{Bitcoded2}
how the algorithm without bitcodes falls short for such aggressive
simplifications and therefore introduce our version of the
bit-coded algorithm and
its correctness proof.
In Chapter~\ref{Finite} we give the second guarantee
of our bitcoded algorithm, that is, a finite bound on the size of any
regex's derivatives.
In Chapter~\ref{Cubic} we discuss stronger simplifications to improve the finite bound
in Chapter~\ref{Finite} to a polynomial one, and demonstrate how one can extend the
algorithm to include constructs such as bounded repetitions and negations.




%----------------------------------------------------------------------------------------


%----------------------------------------------------------------------------------------

%----------------------------------------------------------------------------------------

%----------------------------------------------------------------------------------------