% Chapter 1

\chapter{Introduction} % Main chapter title

\label{Introduction} % For referencing the chapter elsewhere, use \ref{Introduction}
%----------------------------------------------------------------------------------------

% Define some commands to keep the formatting separated from the content
\newcommand{\keyword}[1]{\textbf{#1}}
\newcommand{\tabhead}[1]{\textbf{#1}}
\newcommand{\code}[1]{\texttt{#1}}
\newcommand{\file}[1]{\texttt{\bfseries#1}}
\newcommand{\option}[1]{\texttt{\itshape#1}}

%boxes
\newcommand*{\mybox}[1]{\framebox{\strut #1}}

%\newcommand{\sflataux}[1]{\textit{sflat}\_\textit{aux} \, #1}
\newcommand\sflat[1]{\llparenthesis #1 \rrparenthesis }
\newcommand{\ASEQ}[3]{\textit{ASEQ}_{#1} \, #2 \, #3}
\newcommand{\bderssimp}[2]{#1 \backslash_{bsimps} #2}
\newcommand{\rderssimp}[2]{#1 \backslash_{rsimp} #2}
\def\derssimp{\textit{ders}\_\textit{simp}}
\def\rders{\textit{rders}}
\newcommand{\bders}[2]{#1 \backslash #2}
\newcommand{\bsimp}[1]{\textit{bsimp}(#1)}
\newcommand{\rsimp}[1]{\textit{rsimp}\; #1}
\newcommand{\sflataux}[1]{\llparenthesis #1 \rrparenthesis'}
\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%
\newcommand{\denote}{\stackrel{\mbox{\scriptsize denote}}{=}}%
\newcommand{\ZERO}{\mbox{\bf 0}}
\newcommand{\ONE}{\mbox{\bf 1}}
\newcommand{\AALTS}[2]{\oplus {\scriptstyle #1}\, #2}
\newcommand{\rdistinct}[2]{\textit{rdistinct} \;\; #1 \;\; #2}
\def\rDistinct{\textit{rdistinct}}
\newcommand\hflat[1]{\llparenthesis #1 \rrparenthesis_*}
\newcommand\hflataux[1]{\llparenthesis #1 \rrparenthesis_*'}
\newcommand\createdByStar[1]{\textit{createdByStar}(#1)}

\newcommand\myequiv{\mathrel{\stackrel{\makebox[0pt]{\mbox{\normalfont\tiny equiv}}}{=}}}

\def\case{\textit{case}}
\def\sequal{\stackrel{\mbox{\scriptsize rsimp}}{=}}
\def\rsimpalts{\textit{rsimp}_{ALTS}}
\def\good{\textit{good}}
\def\btrue{\textit{true}}
\def\bfalse{\textit{false}}
\def\bnullable{\textit{bnullable}}
\def\bnullables{\textit{bnullables}}
\def\Some{\textit{Some}}
\def\None{\textit{None}}
\def\code{\textit{code}}
\def\decode{\textit{decode}}
\def\internalise{\textit{internalise}}
\def\lexer{\mathit{lexer}}
\def\mkeps{\textit{mkeps}}
\newcommand{\rder}[2]{#2 \backslash_r #1}
\def\nonnested{\textit{nonnested}}
\def\AZERO{\textit{AZERO}}
\def\sizeNregex{\textit{sizeNregex}}
\def\AONE{\textit{AONE}}
\def\ACHAR{\textit{ACHAR}}

\def\scfrewrites{\stackrel{*}{\rightsquigarrow_{scf}}}
\def\frewrite{\rightsquigarrow_f}
\def\hrewrite{\rightsquigarrow_h}
\def\grewrite{\rightsquigarrow_g}
\def\frewrites{\stackrel{*}{\rightsquigarrow_f}}
\def\hrewrites{\stackrel{*}{\rightsquigarrow_h}}
\def\grewrites{\stackrel{*}{\rightsquigarrow_g}}
\def\fuse{\textit{fuse}}
\def\bder{\textit{bder}}
\def\der{\textit{der}}
\def\POSIX{\textit{POSIX}}
\def\ALTS{\textit{ALTS}}
\def\ASTAR{\textit{ASTAR}}
\def\DFA{\textit{DFA}}
\def\NFA{\textit{NFA}}
\def\bmkeps{\textit{bmkeps}}
\def\bmkepss{\textit{bmkepss}}
\def\retrieve{\textit{retrieve}}
\def\blexer{\textit{blexer}}
\def\flex{\textit{flex}}
\def\inj{\textit{inj}}
\def\Empty{\textit{Empty}}
\def\Left{\textit{Left}}
\def\Right{\textit{Right}}
\def\Stars{\textit{Stars}}
\def\Char{\textit{Char}}
\def\Seq{\textit{Seq}}
\def\Der{\textit{Der}}
\def\Ders{\textit{Ders}}
\def\nullable{\mathit{nullable}}
\def\Z{\mathit{Z}}
\def\S{\mathit{S}}
\def\rup{r^\uparrow}
%\def\bderssimp{\mathit{bders}\_\mathit{simp}}
\def\distinctWith{\textit{distinctWith}}
\def\lf{\textit{lf}}
\def\PD{\textit{PD}}
\def\suffix{\textit{Suffix}}
\def\distinctBy{\textit{distinctBy}}
\def\starupdate{\textit{starUpdate}}
\def\starupdates{\textit{starUpdates}}

\def\size{\mathit{size}}
\def\rexp{\mathbf{rexp}}
\def\simp{\mathit{simp}}
\def\simpALTs{\mathit{simp}\_\mathit{ALTs}}
\def\map{\mathit{map}}
\def\distinct{\mathit{distinct}}
\def\blexersimp{\mathit{blexer}\_\mathit{simp}}
\def\map{\textit{map}}
\def\rrexp{\textit{rrexp}}
\newcommand\rnullable[1]{\textit{rnullable} \; #1 }
\newcommand\rsize[1]{\llbracket #1 \rrbracket_r}
\newcommand\asize[1]{\llbracket #1 \rrbracket}
\newcommand\rerase[1]{ (#1)_{\downarrow_r}}

\newcommand\ChristianComment[1]{\textcolor{blue}{#1}\\}

\def\rflts{\textit{rflts}}
\def\rrewrite{\textit{rrewrite}}
\def\bsimpalts{\textit{bsimp}_{ALTS}}

\def\erase{\textit{erase}}
\def\STAR{\textit{STAR}}
\def\flts{\textit{flts}}

\def\zeroable{\textit{zeroable}}
\def\nub{\textit{nub}}
\def\filter{\textit{filter}}
\def\not{\textit{not}}

\def\RZERO{\mathbf{0}_r}
\def\RONE{\mathbf{1}_r}
\newcommand\RCHAR[1]{\mathbf{#1}_r}
\newcommand\RSEQ[2]{#1 \cdot #2}
\newcommand\RALTS[1]{\sum #1}
\newcommand\RSTAR[1]{#1^*}
\newcommand\vsuf[2]{\textit{Suffix} \;#1\;#2}
\pgfplotsset{
  myplotstyle/.style={
    legend style={draw=none, font=\small},
    legend cell align=left,
    legend pos=north east,
    ylabel style={align=center, font=\bfseries\boldmath},
    xlabel style={align=center, font=\bfseries\boldmath},
    x tick label style={font=\bfseries\boldmath},
    y tick label style={font=\bfseries\boldmath},
    scaled ticks=true,
    every axis plot/.append style={thick},
  },
}
%----------------------------------------------------------------------------------------
%This part is about regular expressions, Brzozowski derivatives,
%and a bit-coded lexing algorithm with proven correctness and time bounds.

%TODO: look up snort rules to use here--give readers idea of what regexes look like

\begin{figure}
\centering
\begin{tabular}{@{}c@{\hspace{0mm}}c@{\hspace{0mm}}c@{}}
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    ylabel={time in secs},
    enlargelimits=false,
    xtick={0,5,...,30},
    xmax=33,
    ymax=35,
    ytick={0,5,...,30},
    scaled ticks=false,
    axis lines=left,
    width=5cm,
    height=4cm,
    legend entries={JavaScript},
    legend pos=north west,
    legend cell align=left]
\addplot[red,mark=*, mark options={fill=white}] table {re-js.data};
\end{axis}
\end{tikzpicture}
&
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    %ylabel={time in secs},
    enlargelimits=false,
    xtick={0,5,...,30},
    xmax=33,
    ymax=35,
    ytick={0,5,...,30},
    scaled ticks=false,
    axis lines=left,
    width=5cm,
    height=4cm,
    legend entries={Python},
    legend pos=north west,
    legend cell align=left]
\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
\end{axis}
\end{tikzpicture}
&
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    %ylabel={time in secs},
    enlargelimits=false,
    xtick={0,5,...,30},
    xmax=33,
    ymax=35,
    ytick={0,5,...,30},
    scaled ticks=false,
    axis lines=left,
    width=5cm,
    height=4cm,
    legend entries={Java 8},
    legend pos=north west,
    legend cell align=left]
\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
\end{axis}
\end{tikzpicture}\\
\multicolumn{3}{c}{Graphs: Runtime for matching $(a^*)^*\,b$ with strings
           of the form $\underbrace{aa..a}_{n}$.}
\end{tabular}
\caption{Runtime of JavaScript, Python and Java~8 when matching $(a^*)^*\,b$ against strings of $n$ $a$'s.}\label{fig:aStarStarb}
\end{figure}
Regular expressions are widely used in computer science:
be it in text-editors \parencite{atomEditor} with syntax highlighting and auto-completion;
command-line tools like $\mathit{grep}$ that facilitate easy
text-processing; network intrusion
detection systems that reject suspicious traffic; or compiler
front ends. The majority of the solutions to these tasks
involve lexing with regular
expressions.
Given their usefulness and ubiquity, one would imagine that
modern regular expression matching implementations
are mature and fully studied.
Indeed, supplying the regex engine of a popular programming language
with a regular expression and a string, one can usually
obtain rich matching information in a very short time.
Some network intrusion detection systems
use regex engines that are able to process
megabytes or even gigabytes of data per second \parencite{Turo_ov__2020}.
Unfortunately, this is not the case for \emph{all} inputs.
%TODO: get source for SNORT/BRO's regex matching engine/speed

Take $(a^*)^*\,b$ and ask whether
strings of the form $aa..a$ match this regular
expression. Obviously this is not the case---the expected $b$ in the last
position is missing. One would expect that modern regular expression
matching engines can find this out very quickly. Alas, if one tries
this example in JavaScript, Python or Java 8, even with strings of a small
length, say around 30 $a$'s, one discovers that
this decision takes an unreasonably long time given the simplicity of the problem.
This is clearly exponential behaviour, and
is triggered by some relatively simple regex patterns, as the graphs
in Figure~\ref{fig:aStarStarb} show.
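
The blowup is easy to reproduce. The following Python script is a
minimal sketch that times the failing match attempts with CPython's
backtracking \texttt{re} module for growing $n$; on a typical machine
the runtime roughly doubles with each additional $a$:
\begin{verbatim}
import re
import time

# (a*)*b cannot match a string consisting only of a's: the
# required final b is missing, so a backtracking engine ends
# up exploring exponentially many ways of splitting the a's
# between the inner and the outer star.
pattern = re.compile(r'(a*)*b')

for n in range(20, 31):
    start = time.time()
    pattern.match('a' * n)        # returns None, eventually
    print(n, time.time() - start)
\end{verbatim}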

\ChristianComment{Superlinear: I just leave out the explanation,
which I find, once used, would distract the flow. Plus if I just say exponential
here, the 2016 StackExchange event was not exponential, but just quadratic, so it would be
inaccurate.}

This superlinear blowup in regular expression engines
has repeatedly caused grief in real life.
For example, on 20 July 2016 one evil
regular expression brought the webpage
\href{http://stackexchange.com}{Stack Exchange} to its
knees.\footnote{\url{https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016} (Last accessed in 2019)}
In this instance, a regular expression intended to just trim white
spaces from the beginning and the end of a line actually consumed
massive amounts of CPU resources---causing web servers to grind to a
halt. In this example, the time needed to process
the string was $O(n^2)$ with respect to the string length. This
quadratic overhead was enough for the homepage of Stack Exchange to
respond so slowly that the load balancer assumed a $\mathit{DoS}$
attack and therefore stopped the servers from responding to any
requests. This made the whole site unavailable.

A more recent example is a global outage of all Cloudflare servers on 2 July
2019. A poorly written regular expression exhibited exponential
behaviour and exhausted the CPUs that serve HTTP traffic. Although the outage
had several causes, at the heart was a regular expression that
was used to monitor network
traffic.\footnote{\url{https://blog.cloudflare.com/details-of-the-cloudflare-outage-on-july-2-2019/} (Last accessed in 2022)}
%TODO: data points for some new versions of languages
These problems with regular expressions
are not isolated events that happen
only occasionally, but are actually widespread.
They occur so often that they have been given a
name: Regular-expression Denial of Service (ReDoS)
attacks.
\citeauthor{Davis18} detected more
than 1000 super-linear (SL) regular expressions
in Node.js, Python core libraries, and npm and pypi.
They therefore concluded that evil regular expressions
are a problem ``more than a parlour trick'', but one that
requires
more research attention.

But the problems are not limited to slowness on certain
inputs.
Another problem with these libraries is that there
is no correctness guarantee.
In some cases, they either fail to generate a lexing result when there exists a match,
or give results that are inconsistent with the $\POSIX$ standard.
A concrete example would be
the regex
\begin{verbatim}
(aba|ab|a)*
\end{verbatim}
and the string
\begin{verbatim}
ababa
\end{verbatim}
The correct $\POSIX$ match for the above is
the entire string $ababa$,
split into two Kleene star iterations, $[ab]$ and $[aba]$, at positions
$[0, 2)$ and $[2, 5)$
respectively.
But trying this out in regex101 \parencite{regex101}
with different language engines yields
two fragmented matches instead: $[aba]$ at $[0, 3)$
and $[a]$ at $[4, 5)$.
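
The first of these fragments can be reproduced with a few lines of
Python, a sketch relying on CPython's greedy (rather than POSIX)
backtracking engine:
\begin{verbatim}
import re

# A POSIX (longest-match) engine would match the entire
# string "ababa" with (aba|ab|a)*; CPython's greedy engine
# commits to "aba" in the first iteration, gets stuck on
# the remaining "ba", and stops with only the prefix.
m = re.match(r'(aba|ab|a)*', 'ababa')
print(m.span())   # prints (0, 3), not the POSIX (0, 5)
\end{verbatim}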

Kuklewicz \parencite{KuklewiczHaskell} commented that most regex libraries are not
correctly implementing the POSIX (maximum-munch)
rule of regular expression matching.

As Grathwohl \parencite{grathwohl2014crash} commented,
\begin{center}
	``The POSIX strategy is more complicated than the greedy because of the dependence on information about the length of matched strings in the various subexpressions.''
\end{center}


To summarise the above: regular expressions are important.
They are popular, and programming languages' library functions
for them are very fast on non-catastrophic cases.
But there are problems with current practical implementations.
The first is that the running time might blow up.
The second is that they might be error-prone, even on certain
very simple cases.
In the next part of the chapter, we will look into reasons why
certain regex engines run horribly slowly on the ``catastrophic''
cases, and propose a solution that addresses both of these problems,
based on the work of Brzozowski, and of Sulzmann and Lu.


\section{Why are current regex engines slow?}

%find literature/find out for yourself that REGEX->DFA on basic regexes
%does not blow up the size
Shouldn't regular expression matching be linear?
How can one explain the super-linear behaviour of the
regex matching engines we have?
The time cost of regex matching algorithms in general
involves two different phases, and different things can go wrong in
each of them:
$\DFA$s usually have problems in the first (construction) phase,
whereas $\NFA$s usually run into trouble
in the second phase.

\subsection{Different Phases of a Matching/Lexing Algorithm}


Most lexing algorithms can be roughly divided into
two phases.
The first phase is the ``construction'' phase,
in which the algorithm builds some
suitable data structure from the input regex $r$, so that
it can be easily operated on later.
We denote
the time cost of this phase by $P_1(r)$.
The second phase is the lexing phase, in which the input string
$s$ is read and the data structure
representing the regex $r$ is operated on.
We denote the time
it takes by $P_2(r, s)$.\\

For $\mathit{DFA}$s,
we have $P_2(r, s) = O(|s|)$,
because we take at most $|s|$ steps,
and each step takes
at most one transition: a deterministic finite automaton
by definition has at most one active state and takes at most one
transition upon receiving an input symbol.
But unfortunately in the worst case
$P_1(r) = O(2^{|r|})$. An example will be given later.
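
To see why the lexing phase is linear, consider the following
table-driven matcher sketch in Python (the automaton is a
hand-written, hypothetical example recognising $(a|b)^*a$, i.e.\
strings over $a,b$ ending in $a$): once the transition table exists,
matching is a single pass over the string.
\begin{verbatim}
# One table lookup per input character: O(|s|) overall.
delta = {(0, 'a'): 1, (0, 'b'): 0,
         (1, 'a'): 1, (1, 'b'): 0}
accepting = {1}

def dfa_match(s):
    state = 0
    for c in s:                  # exactly |s| steps
        state = delta[(state, c)]
    return state in accepting

print(dfa_match('abba'))         # True
print(dfa_match('abab'))         # False
\end{verbatim}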

For $\mathit{NFA}$s, we have $P_1(r) = O(|r|)$ if we do not unfold
expressions like $r^n$ into $\underbrace{r \cdots r}_{\text{$n$ copies of $r$}}$.
$P_2(r, s)$ is bounded by $O(|r|\cdot|s|)$ if we do not backtrack.
On the other hand, if backtracking is used, the worst-case time bound bloats
to $O(|r| \cdot 2^{|s|})$.
%on the input
%And when calculating the time complexity of the matching algorithm,
%we are assuming that each input reading step requires constant time.
%which translates to that the number of
%states active and transitions taken each time is bounded by a
%constant $C$.
%But modern regex libraries in popular language engines
% often want to support much richer constructs than just
% sequences and Kleene stars,
%such as negation, intersection,
%bounded repetitions and back-references.
%And de-sugaring these "extended" regular expressions
%into basic ones might bloat the size exponentially.
%TODO: more reference for exponential size blowup on desugaring.

\subsection{Why $\mathit{DFA}$s can be slow in the first phase}


The good thing about $\mathit{DFA}$s is that once
generated, they are fast and stable, unlike
backtracking algorithms.
However, they do not scale well with bounded repetitions.

\subsubsection{Problems with Bounded Repetitions}
A bounded repetition, usually written in the form
$r^{\{c\}}$ (where $c$ is a constant natural number),
denotes a regular expression accepting strings
that can be divided into $c$ substrings, where each
substring matches $r$.
For the regular expression $(a|b)^*a(a|b)^{\{2\}}$,
an $\mathit{NFA}$ describing it would look like:
\begin{center}
\begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto]
   \node[state,initial] (q_0)   {$q_0$};
   \node[state, red] (q_1) [right=of q_0] {$q_1$};
   \node[state, red] (q_2) [right=of q_1] {$q_2$};
   \node[state, accepting, red](q_3) [right=of q_2] {$q_3$};
    \path[->]
    (q_0) edge  node {a} (q_1)
          edge [loop below] node {a,b} ()
    (q_1) edge  node  {a,b} (q_2)
    (q_2) edge  node  {a,b} (q_3);
\end{tikzpicture}
\end{center}
The red states are ``countdown states'', which count down
the number of characters still needed
to complete a successful match.
For example, state $q_1$ indicates a match that has
gone past the $(a|b)^*$ part of $(a|b)^*a(a|b)^{\{2\}}$,
has just consumed the ``delimiter'' $a$ in the middle, and
needs to match two more iterations of $(a|b)$ to complete.
State $q_2$, on the other hand, can be viewed as a state
after $q_1$ has consumed one character, and it waits
for one more character to complete.
$q_3$ is the last state, requiring no more characters, and is accepting.
Depending on the suffix of the
input string up to the current read location,
the states $q_1$, $q_2$ and $q_3$
may or may
not be active, independently of each other.
A $\mathit{DFA}$ for such an $\mathit{NFA}$ would
contain at least $2^3$ non-equivalent states that cannot be merged,
because the subset construction during determinisation will generate
all the elements in the power set $\mathit{Pow}\{q_1, q_2, q_3\}$.
Generalizing this to regular expressions with larger
bounded repetition numbers, we have that
regexes shaped like $r^*ar^{\{n\}}$, when converted to $\mathit{DFA}$s,
would require at least $2^{n+1}$ states if $r$ contains
more than one string.
This is needed to represent all the different
scenarios in which ``countdown'' states are active.
For those regexes, tools that use $\DFA$s will run
out of memory.
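
This blowup can be checked directly with a small subset-construction
experiment; the following sketch hard-codes the $\mathit{NFA}$ for
$(a|b)^*a(a|b)^{\{n\}}$ in the style of the picture above and counts
the reachable $\mathit{DFA}$ states:
\begin{verbatim}
def nfa_for(n):
    # state 0 loops on a,b and guesses the delimiter a;
    # states 1..n+1 form the countdown chain.
    trans = {(0, 'a'): {0, 1}, (0, 'b'): {0}}
    for i in range(1, n + 1):
        trans[(i, 'a')] = {i + 1}
        trans[(i, 'b')] = {i + 1}
    return trans

def dfa_size(n):
    # breadth-first subset construction
    trans = nfa_for(n)
    seen, todo = {frozenset({0})}, [frozenset({0})]
    while todo:
        S = todo.pop()
        for c in 'ab':
            T = frozenset(q for p in S
                            for q in trans.get((p, c), ()))
            if T not in seen:
                seen.add(T)
                todo.append(T)
    return len(seen)

for n in range(1, 11):
    print(n, dfa_size(n))    # 4, 8, 16, ...: 2^{n+1} states
\end{verbatim}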

\subsubsection{Tools that use $\mathit{DFA}$s}
%TODO:more tools that use DFAs?
$\mathit{LEX}$ and $\mathit{JFLEX}$ are tools
in C and Java that generate $\mathit{DFA}$-based
lexers. The user provides a set of regular expressions
and configurations to such lexer generators, and then
gets an output program encoding a minimized $\mathit{DFA}$
that can be compiled and run.
When given the above countdown regular expression,
already a small number $n$ would result in a determinised automaton
with millions of states.

For this reason, regex libraries that support
bounded repetitions often choose to use the $\mathit{NFA}$
approach.
\subsection{Why $\mathit{NFA}$s can be slow in the second phase}
When one constructs an $\NFA$ out of a regular expression,
there is often very little to be done in the first phase: one simply
constructs the $\NFA$ states based on the structure of the input regular expression.

In the lexing phase, one can simulate the $\mathit{NFA}$ in two ways.
The first is to keep track of all active states after consuming
a character, and to update that set of states iteratively.
This can be viewed as a breadth-first search of the $\mathit{NFA}$
for a path terminating
at an accepting state.
Languages like Go and Rust use this
type of $\mathit{NFA}$ simulation and guarantee a linear runtime
in terms of input string length.
%TODO:try out these lexers
The other way to use an $\mathit{NFA}$ for matching is to choose
a single transition each time, keeping all the other options in
a queue or stack, and backtracking if that choice eventually
fails. This method, often called a ``depth-first search'',
is efficient in a lot of cases, but can end up
with exponential run time.\\
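
The set-of-states style of simulation can be sketched in a few lines
of Python (the dictionary encoding of the $\NFA$ below is a
hypothetical example, not the representation used by Go or Rust); the
work per input character is bounded by the fixed number of states,
which gives the linear runtime in $|s|$:
\begin{verbatim}
# Epsilon-free NFA for (a|b)*a(a|b): state 0 loops on a and b
# and guesses the "delimiter" a; state 2 is accepting.
trans = {(0, 'a'): {0, 1}, (0, 'b'): {0},
         (1, 'a'): {2},    (1, 'b'): {2}}
accepting = {2}

def nfa_match(s):
    active = {0}
    for c in s:   # advance the whole set of active states
        active = {q2 for q in active
                     for q2 in trans.get((q, c), set())}
    return bool(active & accepting)

print(nfa_match('aab'))   # True
print(nfa_match('ab'))    # True
print(nfa_match('aba'))   # False
\end{verbatim}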
%TODO:COMPARE java python lexer speed with Rust and Go
The reason behind the use of backtracking algorithms in languages like
Java and Python is that they support back-references.
\subsubsection{Back References}
If we have a regular expression like this (the sequence
operator is omitted for brevity):
\begin{center}
	$r_1(r_2(r_3r_4))$
\end{center}
we could label the sub-expressions of interest
by parenthesizing them and giving
them a number in the order in which their opening parentheses appear.
One possible way of parenthesizing and labelling is given below:
\begin{center}
	$\underset{1}{(}r_1\underset{2}{(}r_2\underset{3}{(}r_3)\underset{4}{(}r_4)))$
\end{center}
Here $r_1r_2r_3r_4$, $r_2r_3r_4$, $r_3$ and $r_4$ are labelled
by 1 to 4, so $1$ refers to the entire expression
$(r_1(r_2(r_3)(r_4)))$, $2$ refers to $r_2(r_3)(r_4)$, and so on.
These sub-expressions are called ``capturing groups''.
We can use the following syntax to denote that we want a string just matched by a
sub-expression (capturing group) to appear at a certain location again,
exactly as it was:
\begin{center}
$\ldots\underset{\text{i-th lparen}}{(}{r_i})\ldots
\underset{s_i \text{ which just matched} \;r_i}{\backslash i}$
\end{center}
The backslash and number $i$ are used to denote such
so-called ``back-references''.
Let $e$ be an expression made of regular expressions
and back-references, which contains the expression $e_i$
as its $i$-th capturing group.
The semantics of back-references can be recursively
written as:
\begin{center}
	\begin{tabular}{c}
		$L ( e \cdot \backslash i) = \{s @ s_i \mid s \in L (e),\; s_i \in L(e_i),$\\
		$s_i\; \text{is the match of the $i$-th capturing group of $e$ on $s$}\}$
	\end{tabular}
\end{center}
As a concrete example, the regex
$((a|b|c|\ldots|z)^*)\backslash 1$
matches strings like $\mathit{bobo}$, $\mathit{weewee}$, etc.\\
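
This construct is directly supported by practical engines; a quick
sketch with CPython's \texttt{re} module:
\begin{verbatim}
import re

# \1 refers back to the string captured by group 1, so the
# pattern matches exactly the "squares" ww of lowercase words w.
pattern = re.compile(r'([a-z]*)\1')

print(bool(pattern.fullmatch('bobo')))     # True  (w = "bo")
print(bool(pattern.fullmatch('weewee')))   # True  (w = "wee")
print(bool(pattern.fullmatch('bobbo')))    # False
\end{verbatim}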
Back-references are a construct in the ``regex'' standard
that programmers find useful, but they are not exactly
regular any more.
In fact, they allow regexes to express
languages that cannot be captured by context-free
grammars either.
For example, the back-reference expression $(a^*)b\backslash 1 b \backslash 1$
expresses the language $\{a^n b a^n b a^n\mid n \in \mathbb{N}\}$,
which cannot be expressed by context-free grammars \parencite{campeanu2003formal}.
Such a language is contained in the context-sensitive hierarchy
of formal languages.
Solving the matching problem for back-reference expressions
is NP-complete \parencite{alfred2014algorithms}, and a non-backtracking,
efficient solution is not known to exist.
%TODO:read a bit more about back reference algorithms

It seems that languages like Java and Python made the trade-off
to support back-references at the expense of having to backtrack,
even in the case of regexes not involving back-references.\\
Summing these up, we can categorise existing
practical regex libraries into ones with linear
time guarantees, like Go and Rust, which impose restrictions
on the user input (not allowing back-references,
bounded repetitions cannot exceed 1000, etc.), and ones
that allow the programmer much freedom, but grind to a halt
on some non-negligible portion of cases.
%TODO: give examples such as RE2 GOLANG 1000 restriction, rust no repetitions
% For example, the Rust regex engine claims to be linear,
% but does not support lookarounds and back-references.
% The GoLang regex library does not support over 1000 repetitions.
% Java and Python both support back-references, but shows
%catastrophic backtracking behaviours on inputs without back-references(
%when the language is still regular).
%TODO: test performance of Rust on (((((a*a*)b*)b){20})*)c  baabaabababaabaaaaaaaaababaaaababababaaaabaaabaaaaaabaabaabababaababaaaaaaaaababaaaababababaaaaaaaaaaaaac
%TODO: verify the fact Rust does not allow 1000+ reps
\ChristianComment{Comment required: Java 17 updated graphs? Is it ok to still use Java 8 graphs?}

So we have practical implementations
of regular expression matching/lexing which are fast
but do not come with any guarantee that they will not grind to a halt
or give wrong answers.
Our goal is to have a regex lexing algorithm that comes with
\begin{itemize}
\item
proven correctness,
\item
proven non-catastrophic properties, and
\item
easy extensions to
constructs like
bounded repetitions, negation, lookarounds, and even back-references.
\end{itemize}

\section{Our Solution: a Formal Specification of POSIX Matching and Brzozowski Derivatives}
We propose Brzozowski derivatives of regular expressions as
a solution to these problems.
In the last fifteen or so years, Brzozowski's derivatives of regular
expressions have sparked quite a bit of interest in the functional
programming and theorem prover communities.

\subsection{Motivation}

Derivatives give a simple solution
to the problem of matching a string $s$ with a regular
expression $r$: if the derivative of $r$ w.r.t.\ (in
succession) all the characters of the string matches the empty string,
then $r$ matches $s$ (and {\em vice versa}).
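
This matcher fits in a few lines of functional code. The following
Python sketch follows the textbook definitions of \textit{nullable}
and \textit{der} (the tuple encoding is ours, purely for
illustration):
\begin{verbatim}
# Regular expressions encoded as tuples:
# ('zero',), ('one',), ('char', c),
# ('alt', r1, r2), ('seq', r1, r2), ('star', r)

def nullable(r):
    """Does r match the empty string?"""
    tag = r[0]
    if tag == 'alt':  return nullable(r[1]) or nullable(r[2])
    if tag == 'seq':  return nullable(r[1]) and nullable(r[2])
    return tag in ('one', 'star')

def der(c, r):
    """Brzozowski derivative of r with respect to character c."""
    tag = r[0]
    if tag in ('zero', 'one'):
        return ('zero',)
    if tag == 'char':
        return ('one',) if r[1] == c else ('zero',)
    if tag == 'alt':
        return ('alt', der(c, r[1]), der(c, r[2]))
    if tag == 'seq':
        d = ('seq', der(c, r[1]), r[2])
        return ('alt', d, der(c, r[2])) if nullable(r[1]) else d
    if tag == 'star':
        return ('seq', der(c, r[1]), r)

def matches(r, s):
    for c in s:            # derivative w.r.t. each character
        r = der(c, r)
    return nullable(r)     # match iff the final regex is nullable

# (a + aa)* matched against "aaa"
r = ('star', ('alt', ('char', 'a'),
              ('seq', ('char', 'a'), ('char', 'a'))))
print(matches(r, 'aaa'))   # True
\end{verbatim}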

The beauty of
Brzozowski's derivatives \parencite{Brzozowski1964} is that they are neatly
expressible in any functional language, and easily definable and
reasoned about in theorem provers---the definitions just consist of
inductive datatypes and simple recursive functions.
And an algorithm based on them by
Sulzmann and Lu \parencite{Sulzmann2014} allows easy extension
to include extended regular expressions and
simplification of internal data structures,
eliminating the exponential behaviours.

However, two difficulties with derivative-based matchers exist:
\subsubsection{Problems with Current Brzozowski Matchers}
First, Brzozowski's original matcher only generates a yes/no answer
for whether a regular expression matches a string or not.  This is too
little information in the context of lexing where separate tokens must
be identified and also classified (for example as keywords
or identifiers).  Sulzmann and Lu~\cite{Sulzmann2014} overcome this
difficulty by cleverly extending Brzozowski's matching
algorithm. Their extended version generates additional information on
\emph{how} a regular expression matches a string following the POSIX
rules for regular expression matching. They achieve this by adding a
second ``phase'' to Brzozowski's algorithm involving an injection
function.  In our own earlier work, we provided the formal
specification of what POSIX matching means and proved in Isabelle/HOL
the correctness
of Sulzmann and Lu's extended algorithm accordingly
\cite{AusafDyckhoffUrban2016}.

The second difficulty is that Brzozowski's derivatives can
grow to arbitrarily big sizes. For example if we start with the
regular expression $(a+aa)^*$ and take
successive derivatives according to the character $a$, we end up with
a sequence of ever-growing derivatives like

\def\ll{\stackrel{\_\backslash{} a}{\longrightarrow}}
\begin{center}
\begin{tabular}{rll}
$(a + aa)^*$ & $\ll$ & $(\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & $(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* \;+\; (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & $(\ZERO + \ZERO{}a + \ZERO) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^* \;+\; $\\
& & $\qquad(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
& $\ll$ & \ldots \hspace{15mm}(regular expressions of sizes 98, 169, 283, 468, 767, \ldots)
\end{tabular}
\end{center}

\noindent where after around 35 steps we run out of memory on a
typical computer (we shall define shortly the precise details of our
regular expressions and the derivative operation).  Clearly, the
notation involving $\ZERO$s and $\ONE$s already suggests
simplification rules that can be applied to regular
expressions, for example $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r
\Rightarrow r$, $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow
r$. While such simple-minded simplifications have been proved in our
earlier work to preserve the correctness of Sulzmann and Lu's
algorithm \cite{AusafDyckhoffUrban2016}, they unfortunately do
\emph{not} help with limiting the growth of the derivatives shown
above: the growth is slowed, but the derivatives can still grow rather
quickly beyond any finite bound.
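
These rules translate directly into a bottom-up simplification
function; a sketch, reusing the tuple encoding from the derivative
example above (the symmetric variants $r\,\ZERO \Rightarrow \ZERO$
and $r + \ZERO \Rightarrow r$ are included as well):
\begin{verbatim}
def simp(r):
    tag = r[0]
    if tag == 'seq':
        r1, r2 = simp(r[1]), simp(r[2])
        if ('zero',) in (r1, r2):
            return ('zero',)          # 0 . r => 0,  r . 0 => 0
        if r1 == ('one',):
            return r2                 # 1 . r => r
        return ('seq', r1, r2)
    if tag == 'alt':
        r1, r2 = simp(r[1]), simp(r[2])
        if r1 == ('zero',): return r2 # 0 + r => r
        if r2 == ('zero',): return r1 # r + 0 => r
        if r1 == r2:        return r1 # r + r => r
        return ('alt', r1, r2)
    return r

def matches_simp(r, s):
    for c in s:
        r = simp(der(c, r))           # simplify after each step
    return nullable(r)
\end{verbatim}
On the $(a+aa)^*$ example these rules remove the $\ZERO$s, but they
cannot remove duplicate alternatives that are not syntactically
adjacent in the nested sums, which is why the derivatives still grow.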


Sulzmann and Lu overcome this ``growth problem'' in a second algorithm
\cite{Sulzmann2014} where they introduce bit-coded
regular expressions. In this version, POSIX values are
represented as bit sequences and such sequences are incrementally generated
when derivatives are calculated. The compact representation
of bit sequences and regular expressions allows them to define a more
``aggressive'' simplification method that keeps the size of the
derivatives finite no matter what the length of the string is.
They make some informal claims about the correctness and linear behaviour
of this version, but do not provide any supporting proof arguments, not
even ``pencil-and-paper'' arguments. They write about their bit-coded
\emph{incremental parsing method} (that is the algorithm to be formalised
in this dissertation):

\begin{quote}\it
  ``Correctness Claim: We further claim that the incremental parsing
  method [..] in combination with the simplification steps [..]
  yields POSIX parse trees. We have tested this claim
  extensively [..] but yet
  have to work out all proof details.'' \cite[Page 14]{Sulzmann2014}
\end{quote}

Ausaf and Urban were able to back this correctness claim with
a formal proof.

But as they stated,
\begin{quote}\it
``The next step would be to implement a more aggressive simplification procedure on annotated regular expressions and then prove the corresponding algorithm generates the same values as blexer. Alas due to time constraints we are unable to do so here.''
\end{quote}

This thesis implements the aggressive simplifications envisioned
by Ausaf and Urban,
and gives a formal proof of correctness with those simplifications.


%----------------------------------------------------------------------------------------
\section{Contribution}



This work addresses the vulnerability of super-linear and
buggy regex implementations by combining
Brzozowski's derivatives and interactive theorem proving.
We give an
improved version of Sulzmann and Lu's bit-coded algorithm using
derivatives, which comes with a formal guarantee in terms of correctness and
running time, in the form of an Isabelle/HOL proof.
Further improvements to the algorithm are made with an even stronger
version of simplification.
We have not yet come up with a formal proof for this version, but believe
that such a proof, giving a time bound linear in the input length and
cubic in the regular expression size, can be obtained using a technique by
Antimirov \cite{Antimirov}.


The main contributions of this thesis are
\begin{itemize}
\item
a proven correct lexing algorithm, and
\item
formalized finite bounds on the sizes of its internal data structures.
\end{itemize}

To the best of our knowledge, no lexing libraries using Brzozowski derivatives
have a provable time guarantee,
and claims about running time are usually speculative and backed by thin empirical
evidence.
%TODO: give references
For example, Sulzmann and Lu had proposed an algorithm for which they
claimed a linear running time.
But that claim was falsified by our experiments: the running time
is actually $\Omega(2^n)$ in the worst case.
A similar claim about a theoretical runtime of $O(n^2)$ is made for the Verbatim
%TODO: give references
lexer, which calculates POSIX matches and is based on derivatives.
They formalized the correctness of the lexer, but not the complexity.
In the performance evaluation section, they simply analyzed the run time
of matching $a$ with the string $\underbrace{a \ldots a}_{\text{$n$ $a$'s}}$
and concluded that the algorithm is quadratic in terms of input length.
When we tried out their extracted OCaml code with our example $(a+aa)^*$,
the time it took to lex only 40 $a$'s was 5 minutes.



\subsection{Related Work}
We are aware
of a mechanised correctness proof of Brzozowski's derivative-based matcher in HOL4 by
Owens and Slind~\parencite{Owens2008}. Another one in Isabelle/HOL is part
of the work by Krauss and Nipkow \parencite{Krauss2011}. And another one
in Coq is given by Coquand and Siles \parencite{Coquand2012}.
Also Ribeiro and Du Bois give one in Agda \parencite{RibeiroAgda2017}.


When a regular expression does not behave as intended,
people usually try to rewrite the regex to some equivalent form,
or they try to avoid the possibly problematic patterns completely,
for which many false positives exist \parencite{Davis18}.
Animated tools to ``debug'' regular expressions such as
\parencite{regexploit2021} and \parencite{regex101} are also popular.
We are also aware of static analysis work on regular expressions that
aims to detect potentially exponential regex patterns. Rathnayake and Thielecke
\parencite{Rathnayake2014StaticAF} proposed an algorithm
that detects regular expressions triggering exponential
behaviour on backtracking matchers.
Weideman \parencite{Weideman2017Static} came up with
non-linear polynomial worst-time estimates
for regexes, attack strings that exploit the worst-case
scenario, and ``attack automata'' that generate
attack strings.



\section{Structure of the thesis}
In Chapter~\ref{Inj} we will introduce the concepts
and notations we
use for describing the lexing algorithm by Sulzmann and Lu,
and then give the lexing algorithm itself.
We will give its variant in Chapter~\ref{Bitcoded1}.
Then we illustrate in Chapter~\ref{Bitcoded2}
how the algorithm without bitcodes falls short of such aggressive
simplifications, and therefore introduce our version of the
bit-coded algorithm and
its correctness proof.
In Chapter~\ref{Finite} we give the second guarantee
of our bitcoded algorithm, that is a finite bound on the size of any
regex's derivatives.
In Chapter~\ref{Cubic} we discuss stronger simplifications to improve the finite bound
in Chapter~\ref{Finite} to a polynomial one, and demonstrate how one can extend the
algorithm to include constructs such as bounded repetitions and negations.




%----------------------------------------------------------------------------------------