lexing: ChengsongTanPhdThesis/Chapters/Introduction.tex@62f8fa03863e (annotated)

532 cc54ce075db5 restructured Chengsong parents: diff changeset	1	% Chapter 1
cc54ce075db5 restructured Chengsong parents: diff changeset	2
cc54ce075db5 restructured Chengsong parents: diff changeset	3	\chapter{Introduction} % Main chapter title
cc54ce075db5 restructured Chengsong parents: diff changeset	4
cc54ce075db5 restructured Chengsong parents: diff changeset	5	\label{Introduction} % For referencing the chapter elsewhere, use \ref{Introduction}
cc54ce075db5 restructured Chengsong parents: diff changeset	6
cc54ce075db5 restructured Chengsong parents: diff changeset	7	%----------------------------------------------------------------------------------------
cc54ce075db5 restructured Chengsong parents: diff changeset	8
cc54ce075db5 restructured Chengsong parents: diff changeset	9	% Define some commands to keep the formatting separated from the content
% Text-formatting helpers, each taking one argument: \keyword and \tabhead set it in bold,
% \code in typewriter, \file in bold typewriter and \option in italic typewriter.
% NOTE(review): \code is redefined further down in this file by \def\code{\textit{code}} (no argument);
% from that point on the one-argument typewriter version below is no longer in effect.
cc54ce075db5 restructured Chengsong parents: diff changeset	10	\newcommand{\keyword}[1]{\textbf{#1}}
cc54ce075db5 restructured Chengsong parents: diff changeset	11	\newcommand{\tabhead}[1]{\textbf{#1}}
cc54ce075db5 restructured Chengsong parents: diff changeset	12	\newcommand{\code}[1]{\texttt{#1}}
cc54ce075db5 restructured Chengsong parents: diff changeset	13	\newcommand{\file}[1]{\texttt{\bfseries#1}}
cc54ce075db5 restructured Chengsong parents: diff changeset	14	\newcommand{\option}[1]{\texttt{\itshape#1}}
cc54ce075db5 restructured Chengsong parents: diff changeset	15
cc54ce075db5 restructured Chengsong parents: diff changeset	16	%boxes
% \mybox{x}: typesets x inside a \framebox; the leading \strut is placed inside the frame before x.
cc54ce075db5 restructured Chengsong parents: diff changeset	17	\newcommand*{\mybox}[1]{\framebox{\strut #1}}
cc54ce075db5 restructured Chengsong parents: diff changeset	18
% Notation macros for the regular-expression operations used in the thesis.
% Macros taking arguments typeset an applied form (e.g. \bsimp{r} gives "bsimp(r)"),
% whereas the argument-less \def variants give only the italic name (e.g. \bsimps gives "bsimp").
cc54ce075db5 restructured Chengsong parents: diff changeset	19	%\newcommand{\sflataux}[1]{\textit{sflat}\_\textit{aux} \, #1}
cc54ce075db5 restructured Chengsong parents: diff changeset	20	\newcommand\sflat[1]{\llparenthesis #1 \rrparenthesis }
cc54ce075db5 restructured Chengsong parents: diff changeset	21	\newcommand{\ASEQ}[3]{\textit{ASEQ}_{#1} \, #2 \, #3}
% The subscripts "bsimps" and "rsimp" below are literal letters in math mode, not calls to the \bsimps / \rsimp macros.
543 b2bea5968b89 thesis_thys Chengsong parents: 542 diff changeset	22	\newcommand{\bderssimp}[2]{#1 \backslash_{bsimps} #2}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	23	\newcommand{\rderssimp}[2]{#1 \backslash_{rsimp} #2}
564 3cbcd7cda0a9 more Chengsong parents: 558 diff changeset	24	\def\derssimp{\textit{ders}\_\textit{simp}}
557 812e5d112f49 more changes Chengsong parents: 556 diff changeset	25	\def\rders{\textit{rders}}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	26	\newcommand{\bders}[2]{#1 \backslash #2}
cc54ce075db5 restructured Chengsong parents: diff changeset	27	\newcommand{\bsimp}[1]{\textit{bsimp}(#1)}
591 b2d0de6aee18 more polishing integrated comments chap2 Chengsong parents: 590 diff changeset	28	\def\bsimps{\textit{bsimp}}
554 15d182ffbc76 more Chengsong parents: 543 diff changeset	29	\newcommand{\rsimp}[1]{\textit{rsimp}\; #1}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	30	\newcommand{\sflataux}[1]{\llparenthesis #1 \rrparenthesis'}
% \dn and \denote: an equals sign with the small label "def" / "denote" stacked on top.
cc54ce075db5 restructured Chengsong parents: diff changeset	31	\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%
cc54ce075db5 restructured Chengsong parents: diff changeset	32	\newcommand{\denote}{\stackrel{\mbox{\scriptsize denote}}{=}}%
% Bold 0 and 1, boxed so that they can be used in both text and math.
cc54ce075db5 restructured Chengsong parents: diff changeset	33	\newcommand{\ZERO}{\mbox{\bf 0}}
cc54ce075db5 restructured Chengsong parents: diff changeset	34	\newcommand{\ONE}{\mbox{\bf 1}}
cc54ce075db5 restructured Chengsong parents: diff changeset	35	\newcommand{\AALTS}[2]{\oplus {\scriptstyle #1}\, #2}
555 aecf1ddf3541 more Chengsong parents: 554 diff changeset	36	\newcommand{\rdistinct}[2]{\textit{rdistinct} \;\; #1 \;\; #2}
% NOTE(review): \rdistincts and \rDistinct have identical expansions; presumably one of them is redundant.
594 62f8fa03863e more Chengsong parents: 591 diff changeset	37	\def\rdistincts{\textit{rdistinct}}
556 c27f04bb2262 hello Chengsong parents: 555 diff changeset	38	\def\rDistinct{\textit{rdistinct}}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	39	\newcommand\hflat[1]{\llparenthesis #1 \rrparenthesis_*}
cc54ce075db5 restructured Chengsong parents: diff changeset	40	\newcommand\hflataux[1]{\llparenthesis #1 \rrparenthesis_*'}
cc54ce075db5 restructured Chengsong parents: diff changeset	41	\newcommand\createdByStar[1]{\textit{createdByStar}(#1)}
cc54ce075db5 restructured Chengsong parents: diff changeset	42
% \myequiv: an equals sign, spaced as a relation, with a tiny "equiv" label stacked on top.
% The zero-width \makebox keeps the label from widening the symbol.
cc54ce075db5 restructured Chengsong parents: diff changeset	43	\newcommand\myequiv{\mathrel{\stackrel{\makebox[0pt]{\mbox{\normalfont\tiny equiv}}}{=}}}
cc54ce075db5 restructured Chengsong parents: diff changeset	44
% Italic names for functions, constructors and predicates, plus two composed symbols (\sequal, \rder).
% These use \def, which overwrites any existing command of the same name without warning.
564 3cbcd7cda0a9 more Chengsong parents: 558 diff changeset	45	\def\case{\textit{case}}
% \sequal: an equals sign with the small label "rsimp" stacked on top.
554 15d182ffbc76 more Chengsong parents: 543 diff changeset	46	\def\sequal{\stackrel{\mbox{\scriptsize rsimp}}{=}}
15d182ffbc76 more Chengsong parents: 543 diff changeset	47	\def\rsimpalts{\textit{rsimp}_{ALTS}}
15d182ffbc76 more Chengsong parents: 543 diff changeset	48	\def\good{\textit{good}}
15d182ffbc76 more Chengsong parents: 543 diff changeset	49	\def\btrue{\textit{true}}
15d182ffbc76 more Chengsong parents: 543 diff changeset	50	\def\bfalse{\textit{false}}
542 a7344c9afbaf chapter3 finished Chengsong parents: 538 diff changeset	51	\def\bnullable{\textit{bnullable}}
543 b2bea5968b89 thesis_thys Chengsong parents: 542 diff changeset	52	\def\bnullables{\textit{bnullables}}
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	53	\def\Some{\textit{Some}}
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	54	\def\None{\textit{None}}
% NOTE(review): this replaces the one-argument \code{...} (typewriter) defined near the top of this file;
% from here on \code prints the italic word "code" and takes no argument. Confirm this is intended.
537 50e590823220 more Chengsong parents: 532 diff changeset	55	\def\code{\textit{code}}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	56	\def\decode{\textit{decode}}
cc54ce075db5 restructured Chengsong parents: diff changeset	57	\def\internalise{\textit{internalise}}
% \lexer uses \mathit (a math-mode command), unlike its \textit neighbours.
cc54ce075db5 restructured Chengsong parents: diff changeset	58	\def\lexer{\mathit{lexer}}
cc54ce075db5 restructured Chengsong parents: diff changeset	59	\def\mkeps{\textit{mkeps}}
% \rder{a}{b} prints its SECOND argument first: "b \backslash_r a".
557 812e5d112f49 more changes Chengsong parents: 556 diff changeset	60	\newcommand{\rder}[2]{#2 \backslash_r #1}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	61
% \rerases: the bare italic name "rerase" (note: the printed name has no trailing "s").
585 4969ef817d92 chap4 more Chengsong parents: 579 diff changeset	62	\def\rerases{\textit{rerase}}
4969ef817d92 chap4 more Chengsong parents: 579 diff changeset	63
% Italic names: the predicate "nonnested", the constructors AZERO / AONE / ACHAR, and "sizeNregex".
554 15d182ffbc76 more Chengsong parents: 543 diff changeset	64	\def\nonnested{\textit{nonnested}}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	65	\def\AZERO{\textit{AZERO}}
558 671a83abccf3 haha Chengsong parents: 557 diff changeset	66	\def\sizeNregex{\textit{sizeNregex}}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	67	\def\AONE{\textit{AONE}}
cc54ce075db5 restructured Chengsong parents: diff changeset	68	\def\ACHAR{\textit{ACHAR}}
cc54ce075db5 restructured Chengsong parents: diff changeset	69
% \simpsulz: italic "simp" with subscript "Sulz"; the subscript requires math mode.
585 4969ef817d92 chap4 more Chengsong parents: 579 diff changeset	70	\def\simpsulz{\textit{simp}_{Sulz}}
4969ef817d92 chap4 more Chengsong parents: 579 diff changeset	71
% Rewrite-relation arrows and italic names for functions, constructors and automata.
% \Xrewrite is a squiggly arrow with subscript X; \Xrewrites is the same arrow with a star stacked on top.
557 812e5d112f49 more changes Chengsong parents: 556 diff changeset	72	\def\scfrewrites{\stackrel{*}{\rightsquigarrow_{scf}}}
555 aecf1ddf3541 more Chengsong parents: 554 diff changeset	73	\def\frewrite{\rightsquigarrow_f}
aecf1ddf3541 more Chengsong parents: 554 diff changeset	74	\def\hrewrite{\rightsquigarrow_h}
aecf1ddf3541 more Chengsong parents: 554 diff changeset	75	\def\grewrite{\rightsquigarrow_g}
aecf1ddf3541 more Chengsong parents: 554 diff changeset	76	\def\frewrites{\stackrel{*}{\rightsquigarrow_f}}
aecf1ddf3541 more Chengsong parents: 554 diff changeset	77	\def\hrewrites{\stackrel{*}{\rightsquigarrow_h}}
aecf1ddf3541 more Chengsong parents: 554 diff changeset	78	\def\grewrites{\stackrel{*}{\rightsquigarrow_g}}
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	79	\def\fuse{\textit{fuse}}
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	80	\def\bder{\textit{bder}}
542 a7344c9afbaf chapter3 finished Chengsong parents: 538 diff changeset	81	\def\der{\textit{der}}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	82	\def\POSIX{\textit{POSIX}}
cc54ce075db5 restructured Chengsong parents: diff changeset	83	\def\ALTS{\textit{ALTS}}
cc54ce075db5 restructured Chengsong parents: diff changeset	84	\def\ASTAR{\textit{ASTAR}}
cc54ce075db5 restructured Chengsong parents: diff changeset	85	\def\DFA{\textit{DFA}}
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	86	\def\NFA{\textit{NFA}}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	87	\def\bmkeps{\textit{bmkeps}}
543 b2bea5968b89 thesis_thys Chengsong parents: 542 diff changeset	88	\def\bmkepss{\textit{bmkepss}}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	89	\def\retrieve{\textit{retrieve}}
cc54ce075db5 restructured Chengsong parents: diff changeset	90	\def\blexer{\textit{blexer}}
cc54ce075db5 restructured Chengsong parents: diff changeset	91	\def\flex{\textit{flex}}
573 454ced557605 chapter2 finished polishing Chengsong parents: 567 diff changeset	92	\def\inj{\textit{inj}}
564 3cbcd7cda0a9 more Chengsong parents: 558 diff changeset	93	\def\Empty{\textit{Empty}}
567 28cb8089ec36 more updaates Chengsong parents: 564 diff changeset	94	\def\Left{\textit{Left}}
28cb8089ec36 more updaates Chengsong parents: 564 diff changeset	95	\def\Right{\textit{Right}}
573 454ced557605 chapter2 finished polishing Chengsong parents: 567 diff changeset	96	\def\Stars{\textit{Stars}}
454ced557605 chapter2 finished polishing Chengsong parents: 567 diff changeset	97	\def\Char{\textit{Char}}
454ced557605 chapter2 finished polishing Chengsong parents: 567 diff changeset	98	\def\Seq{\textit{Seq}}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	99	\def\Der{\textit{Der}}
cc54ce075db5 restructured Chengsong parents: diff changeset	100	\def\Ders{\textit{Ders}}
% \nullable, \Z and \S use \mathit and are therefore meant for math mode.
cc54ce075db5 restructured Chengsong parents: diff changeset	101	\def\nullable{\mathit{nullable}}
cc54ce075db5 restructured Chengsong parents: diff changeset	102	\def\Z{\mathit{Z}}
% NOTE(review): \S is normally LaTeX's section-sign command; this \def replaces it with an italic "S"
% for the rest of the current scope. Confirm no later text relies on the section sign.
cc54ce075db5 restructured Chengsong parents: diff changeset	103	\def\S{\mathit{S}}
cc54ce075db5 restructured Chengsong parents: diff changeset	104	\def\rup{r^\uparrow}
cc54ce075db5 restructured Chengsong parents: diff changeset	105	%\def\bderssimp{\mathit{bders}\_\mathit{simp}}
cc54ce075db5 restructured Chengsong parents: diff changeset	106	\def\distinctWith{\textit{distinctWith}}
cc54ce075db5 restructured Chengsong parents: diff changeset	107	\def\lf{\textit{lf}}
cc54ce075db5 restructured Chengsong parents: diff changeset	108	\def\PD{\textit{PD}}
% \suffix prints the capitalised name "Suffix".
cc54ce075db5 restructured Chengsong parents: diff changeset	109	\def\suffix{\textit{Suffix}}
543 b2bea5968b89 thesis_thys Chengsong parents: 542 diff changeset	110	\def\distinctBy{\textit{distinctBy}}
558 671a83abccf3 haha Chengsong parents: 557 diff changeset	111	\def\starupdate{\textit{starUpdate}}
671a83abccf3 haha Chengsong parents: 557 diff changeset	112	\def\starupdates{\textit{starUpdates}}
671a83abccf3 haha Chengsong parents: 557 diff changeset	113
532 cc54ce075db5 restructured Chengsong parents: diff changeset	114
% Math-italic names (\mathit, so math mode only) followed by text-italic names for the "Strong" variants.
cc54ce075db5 restructured Chengsong parents: diff changeset	115	\def\size{\mathit{size}}
cc54ce075db5 restructured Chengsong parents: diff changeset	116	\def\rexp{\mathbf{rexp}}
cc54ce075db5 restructured Chengsong parents: diff changeset	117	\def\simp{\mathit{simp}}
cc54ce075db5 restructured Chengsong parents: diff changeset	118	\def\simpALTs{\mathit{simp}\_\mathit{ALTs}}
% NOTE(review): \map is defined again a few lines further down with \textit; that later definition is the one in effect afterwards.
cc54ce075db5 restructured Chengsong parents: diff changeset	119	\def\map{\mathit{map}}
cc54ce075db5 restructured Chengsong parents: diff changeset	120	\def\distinct{\mathit{distinct}}
cc54ce075db5 restructured Chengsong parents: diff changeset	121	\def\blexersimp{\mathit{blexer}\_\mathit{simp}}
590 988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	122	\def\blexerStrong{\textit{blexerStrong}}
988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	123	\def\bsimpStrong{\textit{bsimpStrong}}
% \bdersStrongs is the bare name; \bdersStrong{r}{s} is the applied form "r \backslash_{bsimpStrongs} s"
% (the subscript is literal math-italic letters, not a macro call).
591 b2d0de6aee18 more polishing integrated comments chap2 Chengsong parents: 590 diff changeset	124	\def\bdersStrongs{\textit{bdersStrong}}
590 988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	125	\newcommand{\bdersStrong}[2]{#1 \backslash_{bsimpStrongs} #2}
988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	126
% Helpers for the "r"-regular expressions: names, size brackets and the erase subscript.
% NOTE(review): \map was already defined above with \mathit; this \textit definition overrides it.
532 cc54ce075db5 restructured Chengsong parents: diff changeset	127	\def\map{\textit{map}}
cc54ce075db5 restructured Chengsong parents: diff changeset	128	\def\rrexp{\textit{rrexp}}
554 15d182ffbc76 more Chengsong parents: 543 diff changeset	129	\newcommand\rnullable[1]{\textit{rnullable} \; #1 }
% \rsize{r} and \asize{r}: r enclosed in double square brackets, with and without the subscript r.
532 cc54ce075db5 restructured Chengsong parents: diff changeset	130	\newcommand\rsize[1]{\llbracket #1 \rrbracket_r}
cc54ce075db5 restructured Chengsong parents: diff changeset	131	\newcommand\asize[1]{\llbracket #1 \rrbracket}
% \rerase{r}: r in parentheses with a subscript down-arrow that itself carries subscript r.
543 b2bea5968b89 thesis_thys Chengsong parents: 542 diff changeset	132	\newcommand\rerase[1]{ (#1)_{\downarrow_r}}
b2bea5968b89 thesis_thys Chengsong parents: 542 diff changeset	133
% \ChristianComment{text}: prints the text in blue, followed by a forced line break (\\).
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	134	\newcommand\ChristianComment[1]{\textcolor{blue}{#1}\\}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	135
543 b2bea5968b89 thesis_thys Chengsong parents: 542 diff changeset	136
% Italic names "rflts" and "rrewrite", and "bsimp" with subscript ALTS (the subscript requires math mode).
b2bea5968b89 thesis_thys Chengsong parents: 542 diff changeset	137	\def\rflts{\textit{rflts}}
b2bea5968b89 thesis_thys Chengsong parents: 542 diff changeset	138	\def\rrewrite{\textit{rrewrite}}
b2bea5968b89 thesis_thys Chengsong parents: 542 diff changeset	139	\def\bsimpalts{\textit{bsimp}_{ALTS}}
b2bea5968b89 thesis_thys Chengsong parents: 542 diff changeset	140
% Italic names "erase", "STAR" and "flts".
532 cc54ce075db5 restructured Chengsong parents: diff changeset	141	\def\erase{\textit{erase}}
cc54ce075db5 restructured Chengsong parents: diff changeset	142	\def\STAR{\textit{STAR}}
cc54ce075db5 restructured Chengsong parents: diff changeset	143	\def\flts{\textit{flts}}
cc54ce075db5 restructured Chengsong parents: diff changeset	144
cc54ce075db5 restructured Chengsong parents: diff changeset	145
% Italic names "zeroable", "nub", "filter" and "not".
579 35df9cdd36ca more chap3 Chengsong parents: 573 diff changeset	146	\def\zeroable{\textit{zeroable}}
35df9cdd36ca more chap3 Chengsong parents: 573 diff changeset	147	\def\nub{\textit{nub}}
35df9cdd36ca more chap3 Chengsong parents: 573 diff changeset	148	\def\filter{\textit{filter}}
% NOTE(review): \not is normally the math negation overlay (the standard kernel builds \neq from it);
% this \def replaces it with the italic word "not" for the rest of the current scope.
% Confirm that no later formula relies on \not or \neq.
35df9cdd36ca more chap3 Chengsong parents: 573 diff changeset	149	\def\not{\textit{not}}
35df9cdd36ca more chap3 Chengsong parents: 573 diff changeset	150
35df9cdd36ca more chap3 Chengsong parents: 573 diff changeset	151
35df9cdd36ca more chap3 Chengsong parents: 573 diff changeset	152
% Constructors of the "r"-regular expressions, all for math mode:
% bold 0 / 1 / character with subscript r, sequence as a centred dot, alternatives as a \sum sign, star as a superscript.
532 cc54ce075db5 restructured Chengsong parents: diff changeset	153	\def\RZERO{\mathbf{0}_r }
cc54ce075db5 restructured Chengsong parents: diff changeset	154	\def\RONE{\mathbf{1}_r}
cc54ce075db5 restructured Chengsong parents: diff changeset	155	\newcommand\RCHAR[1]{\mathbf{#1}_r}
cc54ce075db5 restructured Chengsong parents: diff changeset	156	\newcommand\RSEQ[2]{#1 \cdot #2}
558 671a83abccf3 haha Chengsong parents: 557 diff changeset	157	\newcommand\RALTS[1]{\sum #1}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	158	\newcommand\RSTAR[1]{#1^*}
% \vsuf{a}{b}: the italic name "Suffix" applied to two arguments (same printed name as \suffix).
558 671a83abccf3 haha Chengsong parents: 557 diff changeset	159	\newcommand\vsuf[2]{\textit{Suffix} \;#1\;#2}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	160
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	161
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	162
590 988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	163
% Listings style "myScalastyle" for Scala code: small typewriter font, flexible columns, no line numbers,
% blue keywords, line breaking at whitespace, and 3mm of space above and below each listing.
% NOTE(review): "frame" is set twice (frame=tb, then frame=single); presumably the later value wins — confirm and drop the other.
% NOTE(review): the colours dkgreen and mauve are not defined in this part of the file; they are assumed to be defined elsewhere.
988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	164	\lstdefinestyle{myScalastyle}{
988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	165	frame=tb,
988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	166	language=scala,
988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	167	aboveskip=3mm,
988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	168	belowskip=3mm,
988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	169	showstringspaces=false,
988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	170	columns=flexible,
988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	171	basicstyle={\small\ttfamily},
988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	172	numbers=none,
988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	173	numberstyle=\tiny\color{gray},
988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	174	keywordstyle=\color{blue},
988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	175	commentstyle=\color{dkgreen},
988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	176	stringstyle=\color{mauve},
988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	177	frame=single,
988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	178	breaklines=true,
988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	179	breakatwhitespace=true,
988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	180	tabsize=3,
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	181	}
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	182
590 988e92a70704 more chap5 and chap6 bsimp_idem Chengsong parents: 585 diff changeset	183
532 cc54ce075db5 restructured Chengsong parents: diff changeset	184	%----------------------------------------------------------------------------------------
cc54ce075db5 restructured Chengsong parents: diff changeset	185	%This part is about regular expressions, Brzozowski derivatives,
cc54ce075db5 restructured Chengsong parents: diff changeset	186	%and a bit-coded lexing algorithm with proven correctness and time bounds.
cc54ce075db5 restructured Chengsong parents: diff changeset	187
cc54ce075db5 restructured Chengsong parents: diff changeset	188	%TODO: look up snort rules to use here--give readers idea of what regexes look like
cc54ce075db5 restructured Chengsong parents: diff changeset	189
cc54ce075db5 restructured Chengsong parents: diff changeset	190	\begin{figure}
cc54ce075db5 restructured Chengsong parents: diff changeset	191	\centering
cc54ce075db5 restructured Chengsong parents: diff changeset	192	\begin{tabular}{@{}c@{\hspace{0mm}}c@{\hspace{0mm}}c@{}}
cc54ce075db5 restructured Chengsong parents: diff changeset	193	\begin{tikzpicture}
cc54ce075db5 restructured Chengsong parents: diff changeset	194	\begin{axis}[
cc54ce075db5 restructured Chengsong parents: diff changeset	195	xlabel={$n$},
cc54ce075db5 restructured Chengsong parents: diff changeset	196	x label style={at={(1.05,-0.05)}},
cc54ce075db5 restructured Chengsong parents: diff changeset	197	ylabel={time in secs},
cc54ce075db5 restructured Chengsong parents: diff changeset	198	enlargelimits=false,
cc54ce075db5 restructured Chengsong parents: diff changeset	199	xtick={0,5,...,30},
cc54ce075db5 restructured Chengsong parents: diff changeset	200	xmax=33,
cc54ce075db5 restructured Chengsong parents: diff changeset	201	ymax=35,
cc54ce075db5 restructured Chengsong parents: diff changeset	202	ytick={0,5,...,30},
cc54ce075db5 restructured Chengsong parents: diff changeset	203	scaled ticks=false,
cc54ce075db5 restructured Chengsong parents: diff changeset	204	axis lines=left,
cc54ce075db5 restructured Chengsong parents: diff changeset	205	width=5cm,
cc54ce075db5 restructured Chengsong parents: diff changeset	206	height=4cm,
cc54ce075db5 restructured Chengsong parents: diff changeset	207	legend entries={JavaScript},
cc54ce075db5 restructured Chengsong parents: diff changeset	208	legend pos=north west,
cc54ce075db5 restructured Chengsong parents: diff changeset	209	legend cell align=left]
cc54ce075db5 restructured Chengsong parents: diff changeset	210	\addplot[red,mark=*, mark options={fill=white}] table {re-js.data};
cc54ce075db5 restructured Chengsong parents: diff changeset	211	\end{axis}
cc54ce075db5 restructured Chengsong parents: diff changeset	212	\end{tikzpicture}
cc54ce075db5 restructured Chengsong parents: diff changeset	213	&
cc54ce075db5 restructured Chengsong parents: diff changeset	214	\begin{tikzpicture}
cc54ce075db5 restructured Chengsong parents: diff changeset	215	\begin{axis}[
cc54ce075db5 restructured Chengsong parents: diff changeset	216	xlabel={$n$},
cc54ce075db5 restructured Chengsong parents: diff changeset	217	x label style={at={(1.05,-0.05)}},
cc54ce075db5 restructured Chengsong parents: diff changeset	218	%ylabel={time in secs},
cc54ce075db5 restructured Chengsong parents: diff changeset	219	enlargelimits=false,
cc54ce075db5 restructured Chengsong parents: diff changeset	220	xtick={0,5,...,30},
cc54ce075db5 restructured Chengsong parents: diff changeset	221	xmax=33,
cc54ce075db5 restructured Chengsong parents: diff changeset	222	ymax=35,
cc54ce075db5 restructured Chengsong parents: diff changeset	223	ytick={0,5,...,30},
cc54ce075db5 restructured Chengsong parents: diff changeset	224	scaled ticks=false,
cc54ce075db5 restructured Chengsong parents: diff changeset	225	axis lines=left,
cc54ce075db5 restructured Chengsong parents: diff changeset	226	width=5cm,
cc54ce075db5 restructured Chengsong parents: diff changeset	227	height=4cm,
cc54ce075db5 restructured Chengsong parents: diff changeset	228	legend entries={Python},
cc54ce075db5 restructured Chengsong parents: diff changeset	229	legend pos=north west,
cc54ce075db5 restructured Chengsong parents: diff changeset	230	legend cell align=left]
cc54ce075db5 restructured Chengsong parents: diff changeset	231	\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
cc54ce075db5 restructured Chengsong parents: diff changeset	232	\end{axis}
cc54ce075db5 restructured Chengsong parents: diff changeset	233	\end{tikzpicture}
cc54ce075db5 restructured Chengsong parents: diff changeset	234	&
cc54ce075db5 restructured Chengsong parents: diff changeset	235	\begin{tikzpicture}
cc54ce075db5 restructured Chengsong parents: diff changeset	236	\begin{axis}[
cc54ce075db5 restructured Chengsong parents: diff changeset	237	xlabel={$n$},
cc54ce075db5 restructured Chengsong parents: diff changeset	238	x label style={at={(1.05,-0.05)}},
cc54ce075db5 restructured Chengsong parents: diff changeset	239	%ylabel={time in secs},
cc54ce075db5 restructured Chengsong parents: diff changeset	240	enlargelimits=false,
cc54ce075db5 restructured Chengsong parents: diff changeset	241	xtick={0,5,...,30},
cc54ce075db5 restructured Chengsong parents: diff changeset	242	xmax=33,
cc54ce075db5 restructured Chengsong parents: diff changeset	243	ymax=35,
cc54ce075db5 restructured Chengsong parents: diff changeset	244	ytick={0,5,...,30},
cc54ce075db5 restructured Chengsong parents: diff changeset	245	scaled ticks=false,
cc54ce075db5 restructured Chengsong parents: diff changeset	246	axis lines=left,
cc54ce075db5 restructured Chengsong parents: diff changeset	247	width=5cm,
cc54ce075db5 restructured Chengsong parents: diff changeset	248	height=4cm,
cc54ce075db5 restructured Chengsong parents: diff changeset	249	legend entries={Java 8},
cc54ce075db5 restructured Chengsong parents: diff changeset	250	legend pos=north west,
cc54ce075db5 restructured Chengsong parents: diff changeset	251	legend cell align=left]
cc54ce075db5 restructured Chengsong parents: diff changeset	252	\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
cc54ce075db5 restructured Chengsong parents: diff changeset	253	\end{axis}
cc54ce075db5 restructured Chengsong parents: diff changeset	254	\end{tikzpicture}\\
cc54ce075db5 restructured Chengsong parents: diff changeset	255	\multicolumn{3}{c}{Graphs: Runtime for matching $(a^*)^*\,b$ with strings
cc54ce075db5 restructured Chengsong parents: diff changeset	256	of the form $\underbrace{aa..a}_{n}$.}
cc54ce075db5 restructured Chengsong parents: diff changeset	257	\end{tabular}
cc54ce075db5 restructured Chengsong parents: diff changeset	258	\caption{aStarStarb} \label{fig:aStarStarb}
cc54ce075db5 restructured Chengsong parents: diff changeset	259	\end{figure}
cc54ce075db5 restructured Chengsong parents: diff changeset	260
cc54ce075db5 restructured Chengsong parents: diff changeset	261
cc54ce075db5 restructured Chengsong parents: diff changeset	262
cc54ce075db5 restructured Chengsong parents: diff changeset	263
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	264
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	265	Regular expressions are widely used in computer science:
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	266	be it in text-editors \parencite{atomEditor} with syntax highlighting and auto-completion;
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	267	command-line tools like \textit{grep} that facilitate easy
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	268	text-processing; network intrusion
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	269	detection systems that reject suspicious traffic; or compiler
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	270	front ends---the majority of the solutions to these tasks
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	271	involve lexing with regular
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	272	expressions.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	273	Given its usefulness and ubiquity, one would imagine that
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	274	modern regular expression matching implementations
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	275	are mature and fully studied.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	276	Indeed, in a popular programming language's regex engine,
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	277	supplying it with regular expressions and strings, one can
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	278	get rich matching information in a very short time.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	279	Some network intrusion detection systems
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	280	use regex engines that are able to process
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	281	megabytes or even gigabytes of data per second \parencite{Turo_ov__2020}.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	282	Unfortunately, this is not the case for \textbf{all} inputs.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	283	%TODO: get source for SNORT/BRO's regex matching engine/speed
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	284
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	285
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	286	Take $(a^*)^*\,b$ and ask whether
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	287	strings of the form $aa..a$ match this regular
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	288	expression. Obviously this is not the case---the expected $b$ in the last
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	289	position is missing. One would expect that modern regular expression
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	290	matching engines can find this out very quickly. Alas, if one tries
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	291	this example in JavaScript, Python or Java 8, even with strings of a small
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	292	length, say around 30 $a$'s, one discovers that
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	293	this decision takes an absurdly long time to finish given the simplicity of the problem.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	294	This is clearly exponential behaviour, and
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	295	is triggered by some relatively simple regex patterns, as the graphs
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	296	in Figure~\ref{fig:aStarStarb} show.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	297
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	298
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	299
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	300
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	301	\ChristianComment{Superlinear: I just leave out the explanation,
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	302	which I find would distract from the flow once used. Plus, if I just say exponential
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	303	here, the 2016 event in StackExchange was not exponential but just quadratic, so it would be
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	304	inaccurate}
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	305
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	306	This superlinear blowup in regular expression engines
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	307	has repeatedly caused grief in real life.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	308	For example, on 20 July 2016 one evil
532 cc54ce075db5 restructured Chengsong parents: diff changeset	309	regular expression brought the webpage
cc54ce075db5 restructured Chengsong parents: diff changeset	310	\href{http://stackexchange.com}{Stack Exchange} to its
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	311	knees.\footnote{\url{https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016} (last accessed in 2019)}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	312	In this instance, a regular expression intended to just trim white
cc54ce075db5 restructured Chengsong parents: diff changeset	313	spaces from the beginning and the end of a line actually consumed
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	314	massive amounts of CPU resources---causing web servers to grind to a
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	315	halt. In this example, the time needed to process
532 cc54ce075db5 restructured Chengsong parents: diff changeset	316	the string was $O(n^2)$ with respect to the string length. This
cc54ce075db5 restructured Chengsong parents: diff changeset	317	quadratic overhead was enough for the homepage of Stack Exchange to
cc54ce075db5 restructured Chengsong parents: diff changeset	318	respond so slowly that the load balancer assumed a \textit{DoS}
cc54ce075db5 restructured Chengsong parents: diff changeset	319	attack and therefore stopped the servers from responding to any
cc54ce075db5 restructured Chengsong parents: diff changeset	320	requests. This made the whole site become unavailable.
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	321
532 cc54ce075db5 restructured Chengsong parents: diff changeset	322	A more recent example is a global outage of all Cloudflare servers on 2 July
cc54ce075db5 restructured Chengsong parents: diff changeset	323	2019. A poorly written regular expression exhibited exponential
cc54ce075db5 restructured Chengsong parents: diff changeset	324	behaviour and exhausted CPUs that serve HTTP traffic. Although the outage
cc54ce075db5 restructured Chengsong parents: diff changeset	325	had several causes, at the heart was a regular expression that
cc54ce075db5 restructured Chengsong parents: diff changeset	326	was used to monitor network
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	327	traffic.\footnote{\url{https://blog.cloudflare.com/details-of-the-cloudflare-outage-on-july-2-2019/} (Last accessed in 2022)}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	328	%TODO: data points for some new versions of languages
cc54ce075db5 restructured Chengsong parents: diff changeset	329	These problems with regular expressions
cc54ce075db5 restructured Chengsong parents: diff changeset	330	are not isolated events that happen
cc54ce075db5 restructured Chengsong parents: diff changeset	331	very occasionally, but actually widespread.
cc54ce075db5 restructured Chengsong parents: diff changeset	332	They occur so often that they have been given a
cc54ce075db5 restructured Chengsong parents: diff changeset	333	name---Regular-Expression-Denial-Of-Service (ReDoS)
cc54ce075db5 restructured Chengsong parents: diff changeset	334	attacks.
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	335	\citeauthor{Davis18} detected more
532 cc54ce075db5 restructured Chengsong parents: diff changeset	336	than 1000 super-linear (SL) regular expressions
cc54ce075db5 restructured Chengsong parents: diff changeset	337	in Node.js, Python core libraries, and npm and pypi.
cc54ce075db5 restructured Chengsong parents: diff changeset	338	They therefore concluded that evil regular expressions
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	339	are ``more than a parlour trick'': they are a problem that
532 cc54ce075db5 restructured Chengsong parents: diff changeset	340	requires
cc54ce075db5 restructured Chengsong parents: diff changeset	341	more research attention.
cc54ce075db5 restructured Chengsong parents: diff changeset	342
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	343
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	344	But the problems are not limited to slowness on certain
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	345	cases.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	346	Another thing about these libraries is that there
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	347	is no correctness guarantee.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	348	In some cases, they either fail to generate a lexing result when there exists a match,
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	349	or give results that are inconsistent with the $\POSIX$ standard.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	350	A concrete example would be
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	351	the regex
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	352	\begin{verbatim}
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	353	(aba|ab|a)*
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	354	\end{verbatim}
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	355	and the string
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	356	\begin{verbatim}
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	357	ababa
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	358	\end{verbatim}
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	359	The correct $\POSIX$ match for the above would be
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	360	with the entire string $ababa$,
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	361	split into two Kleene star iterations, $[ab] [aba]$ at positions
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	362	$[0, 2), [2, 5)$
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	363	respectively.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	364	But trying this out in regex101~\parencite{regex101}
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	365	with different language engines would yield
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	366	the same two fragmented matches: $[aba]$ at $[0, 3)$
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	367	and $a$ at $[4, 5)$.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	368
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	369	Kuklewicz~\parencite{KuklewiczHaskell} commented that most regex libraries do not
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	370	correctly implement the POSIX (maximum-munch)
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	371	rule of regular expression matching.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	372
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	373	As Grathwohl~\parencite{grathwohl2014crash} commented,
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	374	\begin{center}
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	375	``The POSIX strategy is more complicated than the greedy because of the dependence on information about the length of matched strings in the various subexpressions.''
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	376	\end{center}
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	377
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	378
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	379	To summarise the above, regular expressions are important.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	380	They are popular and programming languages' library functions
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	381	for them are very fast on non-catastrophic cases.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	382	But there are problems with current practical implementations.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	383	The first problem is that the running time might blow up.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	384	The second problem is that they might produce incorrect results even in certain
543 b2bea5968b89 thesis_thys Chengsong parents: 542 diff changeset	385	very simple cases.
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	386	In the next part of the chapter, we will look into the reasons why
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	387	certain regex engines run horribly slowly on the ``catastrophic''
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	388	cases and propose a solution that addresses both of these problems
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	389	based on Brzozowski and Sulzmann and Lu's work.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	390
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	391
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	392	\section{Why are current regex engines slow?}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	393
cc54ce075db5 restructured Chengsong parents: diff changeset	394	%find literature/find out for yourself that REGEX->DFA on basic regexes
cc54ce075db5 restructured Chengsong parents: diff changeset	395	%does not blow up the size
cc54ce075db5 restructured Chengsong parents: diff changeset	396	Shouldn't regular expression matching be linear?
cc54ce075db5 restructured Chengsong parents: diff changeset	397	How can one explain the super-linear behaviour of the
cc54ce075db5 restructured Chengsong parents: diff changeset	398	regex matching engines we have?
cc54ce075db5 restructured Chengsong parents: diff changeset	399	The time cost of regex matching algorithms in general
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	400	involves two different phases, and different things can go wrong in each of
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	401	these phases.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	402	$\DFA$s usually have problems in the first (construction) phase,
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	403	whereas $\NFA$s usually run into trouble
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	404	in the second phase.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	405
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	406	\subsection{Different Phases of a Matching/Lexing Algorithm}
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	407
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	408
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	409	Most lexing algorithms can be roughly divided into
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	410	two phases during their run.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	411	The first phase is the ``construction'' phase,
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	412	in which the algorithm builds some
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	413	suitable data structure from the input regex $r$, so that
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	414	it can be easily operated on later.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	415	We denote
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	416	the time cost for such a phase by $P_1(r)$.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	417	The second phase is the lexing phase, when the input string
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	418	$s$ is read and the data structure
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	419	representing that regex $r$ is being operated on.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	420	We represent the time
532 cc54ce075db5 restructured Chengsong parents: diff changeset	421	it takes by $P_2(r, s)$.
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	422
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	423	For $\mathit{DFA}$,
532 cc54ce075db5 restructured Chengsong parents: diff changeset	424	we have $P_2(r, s) = O( \|s\| )$,
cc54ce075db5 restructured Chengsong parents: diff changeset	425	because we take at most $\|s\|$ steps,
cc54ce075db5 restructured Chengsong parents: diff changeset	426	and each step takes
cc54ce075db5 restructured Chengsong parents: diff changeset	427	at most one transition---
cc54ce075db5 restructured Chengsong parents: diff changeset	428	a deterministic finite automaton
cc54ce075db5 restructured Chengsong parents: diff changeset	429	by definition has at most one state active and at most one
cc54ce075db5 restructured Chengsong parents: diff changeset	430	transition upon receiving an input symbol.
cc54ce075db5 restructured Chengsong parents: diff changeset	431	But unfortunately, in the worst case,
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	432	$P_1(r) = O(\mathit{exp}^{\|r\|})$. An example will be given later.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	433
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	434
532 cc54ce075db5 restructured Chengsong parents: diff changeset	435	For $\mathit{NFA}$s, we have $P_1(r) = O(\|r\|)$ if we do not unfold
cc54ce075db5 restructured Chengsong parents: diff changeset	436	expressions like $r^n$ into $\underbrace{r \cdots r}_{\text{$n$ copies of $r$}}$.
cc54ce075db5 restructured Chengsong parents: diff changeset	437	The time $P_2(r, s)$ is bounded by $\|r\|\cdot\|s\|$ if we do not backtrack.
cc54ce075db5 restructured Chengsong parents: diff changeset	438	On the other hand, if backtracking is used, the worst-case time bound bloats
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	439	to $\|r\| \cdot 2^{\|s\|}$.
532 cc54ce075db5 restructured Chengsong parents: diff changeset	440	%on the input
cc54ce075db5 restructured Chengsong parents: diff changeset	441	%And when calculating the time complexity of the matching algorithm,
cc54ce075db5 restructured Chengsong parents: diff changeset	442	%we are assuming that each input reading step requires constant time.
cc54ce075db5 restructured Chengsong parents: diff changeset	443	%which translates to that the number of
cc54ce075db5 restructured Chengsong parents: diff changeset	444	%states active and transitions taken each time is bounded by a
cc54ce075db5 restructured Chengsong parents: diff changeset	445	%constant $C$.
cc54ce075db5 restructured Chengsong parents: diff changeset	446	%But modern regex libraries in popular language engines
cc54ce075db5 restructured Chengsong parents: diff changeset	447	% often want to support much richer constructs than just
cc54ce075db5 restructured Chengsong parents: diff changeset	448	% sequences and Kleene stars,
cc54ce075db5 restructured Chengsong parents: diff changeset	449	%such as negation, intersection,
cc54ce075db5 restructured Chengsong parents: diff changeset	450	%bounded repetitions and back-references.
cc54ce075db5 restructured Chengsong parents: diff changeset	451	%And de-sugaring these "extended" regular expressions
cc54ce075db5 restructured Chengsong parents: diff changeset	452	%into basic ones might bloat the size exponentially.
cc54ce075db5 restructured Chengsong parents: diff changeset	453	%TODO: more reference for exponential size blowup on desugaring.
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	454
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	455	\subsection{Why $\mathit{DFA}$s can be slow in the first phase}
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	456
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	457
532 cc54ce075db5 restructured Chengsong parents: diff changeset	458	The good thing about $\mathit{DFA}$s is that once
cc54ce075db5 restructured Chengsong parents: diff changeset	459	generated, they are fast and stable, unlike
cc54ce075db5 restructured Chengsong parents: diff changeset	460	backtracking algorithms.
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	461	However, they do not scale well with bounded repetitions.
532 cc54ce075db5 restructured Chengsong parents: diff changeset	462
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	463	\subsubsection{Problems with Bounded Repetitions}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	464	A bounded repetition, usually written in the form
cc54ce075db5 restructured Chengsong parents: diff changeset	465	$r^{\{c\}}$ (where $c$ is a constant natural number),
cc54ce075db5 restructured Chengsong parents: diff changeset	466	denotes a regular expression accepting strings
cc54ce075db5 restructured Chengsong parents: diff changeset	467	that can be divided into $c$ substrings, where each
cc54ce075db5 restructured Chengsong parents: diff changeset	468	substring is in the language of $r$.
cc54ce075db5 restructured Chengsong parents: diff changeset	469	For the regular expression $(a\|b)^*a(a\|b)^{\{2\}}$,
cc54ce075db5 restructured Chengsong parents: diff changeset	470	an $\mathit{NFA}$ describing it would look like:
cc54ce075db5 restructured Chengsong parents: diff changeset	471	\begin{center}
cc54ce075db5 restructured Chengsong parents: diff changeset	472	\begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto]
cc54ce075db5 restructured Chengsong parents: diff changeset	473	\node[state,initial] (q_0) {$q_0$};
cc54ce075db5 restructured Chengsong parents: diff changeset	474	\node[state, red] (q_1) [right=of q_0] {$q_1$};
cc54ce075db5 restructured Chengsong parents: diff changeset	475	\node[state, red] (q_2) [right=of q_1] {$q_2$};
cc54ce075db5 restructured Chengsong parents: diff changeset	476	\node[state, accepting, red](q_3) [right=of q_2] {$q_3$};
cc54ce075db5 restructured Chengsong parents: diff changeset	477	\path[->]
cc54ce075db5 restructured Chengsong parents: diff changeset	478	(q_0) edge node {a} (q_1)
cc54ce075db5 restructured Chengsong parents: diff changeset	479	edge [loop below] node {a,b} ()
cc54ce075db5 restructured Chengsong parents: diff changeset	480	(q_1) edge node {a,b} (q_2)
cc54ce075db5 restructured Chengsong parents: diff changeset	481	(q_2) edge node {a,b} (q_3);
cc54ce075db5 restructured Chengsong parents: diff changeset	482	\end{tikzpicture}
cc54ce075db5 restructured Chengsong parents: diff changeset	483	\end{center}
cc54ce075db5 restructured Chengsong parents: diff changeset	484	The red states are ``countdown states'', which count down
cc54ce075db5 restructured Chengsong parents: diff changeset	485	the number of characters needed in addition to the current
cc54ce075db5 restructured Chengsong parents: diff changeset	486	string to make a successful match.
cc54ce075db5 restructured Chengsong parents: diff changeset	487	For example, state $q_1$ indicates a match that has
cc54ce075db5 restructured Chengsong parents: diff changeset	488	gone past the $(a\|b)^*$ part of $(a\|b)^*a(a\|b)^{\{2\}}$,
cc54ce075db5 restructured Chengsong parents: diff changeset	489	has just consumed the ``delimiter'' $a$ in the middle, and
cc54ce075db5 restructured Chengsong parents: diff changeset	490	needs to match 2 more iterations of $(a\|b)$ to complete.
cc54ce075db5 restructured Chengsong parents: diff changeset	491	State $q_2$ on the other hand, can be viewed as a state
cc54ce075db5 restructured Chengsong parents: diff changeset	492	after $q_1$ has consumed 1 character, and just waits
cc54ce075db5 restructured Chengsong parents: diff changeset	493	for 1 more character to complete.
cc54ce075db5 restructured Chengsong parents: diff changeset	494	$q_3$ is the last state, requiring 0 more characters, and is accepting.
cc54ce075db5 restructured Chengsong parents: diff changeset	495	Depending on the suffix of the
cc54ce075db5 restructured Chengsong parents: diff changeset	496	input string up to the current read location,
cc54ce075db5 restructured Chengsong parents: diff changeset	497	the states $q_1$, $q_2$ and $q_3$
cc54ce075db5 restructured Chengsong parents: diff changeset	498	may or may
cc54ce075db5 restructured Chengsong parents: diff changeset	499	not be active, independently of each other.
cc54ce075db5 restructured Chengsong parents: diff changeset	500	A $\mathit{DFA}$ for such an $\mathit{NFA}$ would
cc54ce075db5 restructured Chengsong parents: diff changeset	501	contain at least $2^3$ non-equivalent states that cannot be merged,
cc54ce075db5 restructured Chengsong parents: diff changeset	502	because the subset construction during determinisation will generate
cc54ce075db5 restructured Chengsong parents: diff changeset	503	all the elements in the power set $\mathit{Pow}\{q_1, q_2, q_3\}$.
cc54ce075db5 restructured Chengsong parents: diff changeset	504	Generalizing this to regular expressions with larger
cc54ce075db5 restructured Chengsong parents: diff changeset	505	bounded repetition numbers, we have that
cc54ce075db5 restructured Chengsong parents: diff changeset	506	regexes shaped like $r^*ar^{\{n\}}$ when converted to $\mathit{DFA}$s
cc54ce075db5 restructured Chengsong parents: diff changeset	507	would require at least $2^{n+1}$ states, if the language of $r$ contains
cc54ce075db5 restructured Chengsong parents: diff changeset	508	more than 1 string.
cc54ce075db5 restructured Chengsong parents: diff changeset	509	This is to represent all the different
cc54ce075db5 restructured Chengsong parents: diff changeset	510	scenarios in which ``countdown'' states are active.
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	511	For those regexes, tools that use $\DFA$s will get
532 cc54ce075db5 restructured Chengsong parents: diff changeset	512	out-of-memory errors.
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	513
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	514	\subsubsection{Tools that use $\mathit{DFA}$s}
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	515	%TODO:more tools that use DFAs?
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	516	$\mathit{LEX}$ and $\mathit{JFLEX}$ are tools
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	517	in $C$ and $\mathit{JAVA}$ that generate $\mathit{DFA}$-based
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	518	lexers. The user provides a set of regular expressions
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	519	and configurations to such lexer generators, and then
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	520	gets an output program encoding a minimized $\mathit{DFA}$
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	521	that can be compiled and run.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	522	When given the above countdown regular expression,
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	523	a small number $n$ would result in a determinised automaton
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	524	with millions of states.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	525
532 cc54ce075db5 restructured Chengsong parents: diff changeset	526	For this reason, regex libraries that support
cc54ce075db5 restructured Chengsong parents: diff changeset	527	bounded repetitions often choose to use the $\mathit{NFA}$
cc54ce075db5 restructured Chengsong parents: diff changeset	528	approach.
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	529
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	530
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	531
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	532
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	533
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	534
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	535
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	536
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	537	\subsection{Why $\mathit{NFA}$s can be slow in the second phase}
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	538	When one constructs an $\NFA$ out of a regular expression,
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	539	there is often very little to be done in the first phase: one simply
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	540	constructs the $\NFA$ states based on the structure of the input regular expression.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	541
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	542	In the lexing phase, one can simulate the $\mathit{NFA}$ running in two ways:
532 cc54ce075db5 restructured Chengsong parents: diff changeset	543	one by keeping track of all active states after consuming
cc54ce075db5 restructured Chengsong parents: diff changeset	544	a character, and updating that set of states iteratively.
cc54ce075db5 restructured Chengsong parents: diff changeset	545	This can be viewed as a breadth-first search of the $\mathit{NFA}$
cc54ce075db5 restructured Chengsong parents: diff changeset	546	for a path terminating
cc54ce075db5 restructured Chengsong parents: diff changeset	547	at an accepting state.
cc54ce075db5 restructured Chengsong parents: diff changeset	548	Languages like $\mathit{Go}$ and $\mathit{Rust}$ use this
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	549	type of $\mathit{NFA}$ simulation and guarantee a linear runtime
532 cc54ce075db5 restructured Chengsong parents: diff changeset	550	in terms of input string length.
cc54ce075db5 restructured Chengsong parents: diff changeset	551	%TODO:try out these lexers
cc54ce075db5 restructured Chengsong parents: diff changeset	552	The other way to use $\mathit{NFA}$ for matching is choosing
cc54ce075db5 restructured Chengsong parents: diff changeset	553	a single transition each time, keeping all the other options in
cc54ce075db5 restructured Chengsong parents: diff changeset	554	a queue or stack, and backtracking if that choice eventually
cc54ce075db5 restructured Chengsong parents: diff changeset	555	fails. This method, often called a ``depth-first search'',
cc54ce075db5 restructured Chengsong parents: diff changeset	556	is efficient in a lot of cases, but could end up
cc54ce075db5 restructured Chengsong parents: diff changeset	557	with exponential run time.\\
cc54ce075db5 restructured Chengsong parents: diff changeset	558	%TODO:COMPARE java python lexer speed with Rust and Go
cc54ce075db5 restructured Chengsong parents: diff changeset	559	The reason behind backtracking algorithms in languages like
cc54ce075db5 restructured Chengsong parents: diff changeset	560	Java and Python is that they support back-references.
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	561	\subsubsection{Back References}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	562	If we have a regular expression like this (the sequence
cc54ce075db5 restructured Chengsong parents: diff changeset	563	operator is omitted for brevity):
cc54ce075db5 restructured Chengsong parents: diff changeset	564	\begin{center}
cc54ce075db5 restructured Chengsong parents: diff changeset	565	$r_1(r_2(r_3r_4))$
cc54ce075db5 restructured Chengsong parents: diff changeset	566	\end{center}
cc54ce075db5 restructured Chengsong parents: diff changeset	567	We could label sub-expressions of interest
cc54ce075db5 restructured Chengsong parents: diff changeset	568	by parenthesizing them and giving
cc54ce075db5 restructured Chengsong parents: diff changeset	569	them a number by the order in which their opening parentheses appear.
cc54ce075db5 restructured Chengsong parents: diff changeset	570	One possible way of parenthesizing and labelling is given below:
cc54ce075db5 restructured Chengsong parents: diff changeset	571	\begin{center}
cc54ce075db5 restructured Chengsong parents: diff changeset	572	$\underset{1}{(}r_1\underset{2}{(}r_2\underset{3}{(}r_3)\underset{4}{(}r_4)))$
cc54ce075db5 restructured Chengsong parents: diff changeset	573	\end{center}
cc54ce075db5 restructured Chengsong parents: diff changeset	574	$r_1r_2r_3r_4$, $r_2r_3r_4$, $r_3$, $r_4$ are labelled
cc54ce075db5 restructured Chengsong parents: diff changeset	575	by 1 to 4. $1$ would refer to the entire expression
cc54ce075db5 restructured Chengsong parents: diff changeset	576	$(r_1(r_2(r_3)(r_4)))$, $2$ would refer to $r_2(r_3)(r_4)$, etc.
cc54ce075db5 restructured Chengsong parents: diff changeset	577	These sub-expressions are called ``capturing groups''.
cc54ce075db5 restructured Chengsong parents: diff changeset	578	We can use the following syntax to denote that we want a string just matched by a
cc54ce075db5 restructured Chengsong parents: diff changeset	579	sub-expression (capturing group) to appear at a certain location again,
cc54ce075db5 restructured Chengsong parents: diff changeset	580	exactly as it was:
cc54ce075db5 restructured Chengsong parents: diff changeset	581	\begin{center}
cc54ce075db5 restructured Chengsong parents: diff changeset	582	$\ldots\underset{\text{i-th lparen}}{(}{r_i})\ldots
cc54ce075db5 restructured Chengsong parents: diff changeset	583	\underset{s_i \text{ which just matched} \;r_i}{\backslash i}$
cc54ce075db5 restructured Chengsong parents: diff changeset	584	\end{center}
cc54ce075db5 restructured Chengsong parents: diff changeset	585	The backslash and number $i$ are used to denote such
cc54ce075db5 restructured Chengsong parents: diff changeset	586	so-called ``back-references''.
cc54ce075db5 restructured Chengsong parents: diff changeset	587	Let $e$ be an expression made of regular expressions
cc54ce075db5 restructured Chengsong parents: diff changeset	588	and back-references. $e$ contains the expression $e_i$
cc54ce075db5 restructured Chengsong parents: diff changeset	589	as its $i$-th capturing group.
cc54ce075db5 restructured Chengsong parents: diff changeset	590	The semantics of back-reference can be recursively
cc54ce075db5 restructured Chengsong parents: diff changeset	591	written as:
cc54ce075db5 restructured Chengsong parents: diff changeset	592	\begin{center}
cc54ce075db5 restructured Chengsong parents: diff changeset	593	\begin{tabular}{c}
cc54ce075db5 restructured Chengsong parents: diff changeset	594	$L ( e \cdot \backslash i) = \{s @ s_i \mid s \in L (e)\quad s_i \in L(r_i)$\\
cc54ce075db5 restructured Chengsong parents: diff changeset	595	$s_i\; \text{match of ($e$, $s$)'s $i$-th capturing group string}\}$
cc54ce075db5 restructured Chengsong parents: diff changeset	596	\end{tabular}
cc54ce075db5 restructured Chengsong parents: diff changeset	597	\end{center}
cc54ce075db5 restructured Chengsong parents: diff changeset	598	The concrete example
cc54ce075db5 restructured Chengsong parents: diff changeset	599	$((a\|b\|c\|\ldots\|z)^*)\backslash 1$
cc54ce075db5 restructured Chengsong parents: diff changeset	600	would match strings like $\mathit{bobo}$, $\mathit{weewee}$, etc.\\
cc54ce075db5 restructured Chengsong parents: diff changeset	601	Back-references are a construct in the ``regex'' standard
cc54ce075db5 restructured Chengsong parents: diff changeset	602	that programmers find useful, but they are not exactly
cc54ce075db5 restructured Chengsong parents: diff changeset	603	regular any more.
cc54ce075db5 restructured Chengsong parents: diff changeset	604	In fact, they allow regexes to express
cc54ce075db5 restructured Chengsong parents: diff changeset	605	languages that are not contained in the class of context-free
cc54ce075db5 restructured Chengsong parents: diff changeset	606	languages either.
cc54ce075db5 restructured Chengsong parents: diff changeset	607	For example, the back-reference expression $(a^*)b\backslash 1 b \backslash 1$
cc54ce075db5 restructured Chengsong parents: diff changeset	608	expresses the language $\{a^n b a^n b a^n\mid n \in \mathbb{N}\}$,
cc54ce075db5 restructured Chengsong parents: diff changeset	609	which cannot be expressed by context-free grammars~\parencite{campeanu2003formal}.
cc54ce075db5 restructured Chengsong parents: diff changeset	610	Such a language is contained in the context-sensitive hierarchy
cc54ce075db5 restructured Chengsong parents: diff changeset	611	of formal languages.
cc54ce075db5 restructured Chengsong parents: diff changeset	612	Solving the back-reference expression matching problem
cc54ce075db5 restructured Chengsong parents: diff changeset	613	is NP-complete~\parencite{alfred2014algorithms} and a non-backtracking,
cc54ce075db5 restructured Chengsong parents: diff changeset	614	efficient solution is not known to exist.
cc54ce075db5 restructured Chengsong parents: diff changeset	615	%TODO:read a bit more about back reference algorithms
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	616
532 cc54ce075db5 restructured Chengsong parents: diff changeset	617	It seems that languages like Java and Python made the trade-off
cc54ce075db5 restructured Chengsong parents: diff changeset	618	to support back-references at the expense of having to backtrack,
cc54ce075db5 restructured Chengsong parents: diff changeset	619	even in the case of regexes not involving back-references.\\
cc54ce075db5 restructured Chengsong parents: diff changeset	620	Summing these up, we can categorise existing
cc54ce075db5 restructured Chengsong parents: diff changeset	621	practical regex libraries into the ones with linear
cc54ce075db5 restructured Chengsong parents: diff changeset	622	time guarantees like Go and Rust, which impose restrictions
cc54ce075db5 restructured Chengsong parents: diff changeset	623	on the user input (not allowing back-references,
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	624	bounded repetitions cannot exceed 1000 etc.), and ones
532 cc54ce075db5 restructured Chengsong parents: diff changeset	625	that allow the programmer much freedom, but grind to a halt
cc54ce075db5 restructured Chengsong parents: diff changeset	626	in some non-negligible portion of cases.
cc54ce075db5 restructured Chengsong parents: diff changeset	627	%TODO: give examples such as RE2 GOLANG 1000 restriction, rust no repetitions
cc54ce075db5 restructured Chengsong parents: diff changeset	628	% For example, the Rust regex engine claims to be linear,
cc54ce075db5 restructured Chengsong parents: diff changeset	629	% but does not support lookarounds and back-references.
cc54ce075db5 restructured Chengsong parents: diff changeset	630	% The GoLang regex library does not support over 1000 repetitions.
cc54ce075db5 restructured Chengsong parents: diff changeset	631	% Java and Python both support back-references, but shows
cc54ce075db5 restructured Chengsong parents: diff changeset	632	%catastrophic backtracking behaviours on inputs without back-references(
cc54ce075db5 restructured Chengsong parents: diff changeset	633	%when the language is still regular).
cc54ce075db5 restructured Chengsong parents: diff changeset	634	%TODO: test performance of Rust on (((((aa)b)b){20}))c baabaabababaabaaaaaaaaababaaaababababaaaabaaabaaaaaabaabaabababaababaaaaaaaaababaaaababababaaaaaaaaaaaaac
cc54ce075db5 restructured Chengsong parents: diff changeset	635	%TODO: verify the fact Rust does not allow 1000+ reps
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	636	\ChristianComment{Comment required: Java 17 updated graphs? Is it ok to still use Java 8 graphs?}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	637
cc54ce075db5 restructured Chengsong parents: diff changeset	638
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	639	So we have practical implementations
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	640	on regular expression matching/lexing which are fast
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	641	but do not come with any guarantees that they will not grind to a halt
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	642	or give wrong answers.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	643	Our goal is to have a regex lexing algorithm that comes with
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	644	\begin{itemize}
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	645	\item
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	646	proven correctness
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	647	\item
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	648	proven non-catastrophic properties
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	649	\item
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	650	easy extensions to
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	651	constructs like
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	652	bounded repetitions, negation, lookarounds, and even back-references.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	653	\end{itemize}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	654
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	655	\section{Our Solution--Formal Specification of POSIX and Brzozowski Derivatives}
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	656	We propose Brzozowski derivatives on regular expressions as
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	657	a solution to this.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	658	In the last fifteen or so years, Brzozowski's derivatives of regular
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	659	expressions have sparked quite a bit of interest in the functional
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	660	programming and theorem prover communities.
532 cc54ce075db5 restructured Chengsong parents: diff changeset	661
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	662	\subsection{Motivation}
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	663
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	664	Derivatives give a simple solution
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	665	to the problem of matching a string $s$ with a regular
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	666	expression $r$: if the derivative of $r$ w.r.t.\ (in
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	667	succession) all the characters of the string matches the empty string,
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	668	then $r$ matches $s$ (and {\em vice versa}).
532 cc54ce075db5 restructured Chengsong parents: diff changeset	669
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	670	The beauty of
532 cc54ce075db5 restructured Chengsong parents: diff changeset	671	Brzozowski's derivatives \parencite{Brzozowski1964} is that they are neatly
cc54ce075db5 restructured Chengsong parents: diff changeset	672	expressible in any functional language, and easily definable and
cc54ce075db5 restructured Chengsong parents: diff changeset	673	reasoned about in theorem provers---the definitions just consist of
cc54ce075db5 restructured Chengsong parents: diff changeset	674	inductive datatypes and simple recursive functions.
cc54ce075db5 restructured Chengsong parents: diff changeset	675	Moreover, an algorithm based on them by
cc54ce075db5 restructured Chengsong parents: diff changeset	676	Sulzmann and Lu \parencite{Sulzmann2014} allows easy extension
cc54ce075db5 restructured Chengsong parents: diff changeset	677	to include extended regular expressions and
cc54ce075db5 restructured Chengsong parents: diff changeset	678	simplification of internal data structures
cc54ce075db5 restructured Chengsong parents: diff changeset	679	eliminating the exponential behaviours.
cc54ce075db5 restructured Chengsong parents: diff changeset	680
cc54ce075db5 restructured Chengsong parents: diff changeset	681	However, two difficulties with derivative-based matchers exist:
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	682	\subsubsection{Problems with Current Brzozowski Matchers}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	683	First, Brzozowski's original matcher only generates a yes/no answer
cc54ce075db5 restructured Chengsong parents: diff changeset	684	for whether a regular expression matches a string or not. This is too
cc54ce075db5 restructured Chengsong parents: diff changeset	685	little information in the context of lexing where separate tokens must
cc54ce075db5 restructured Chengsong parents: diff changeset	686	be identified and also classified (for example as keywords
cc54ce075db5 restructured Chengsong parents: diff changeset	687	or identifiers). Sulzmann and Lu~\cite{Sulzmann2014} overcome this
cc54ce075db5 restructured Chengsong parents: diff changeset	688	difficulty by cleverly extending Brzozowski's matching
cc54ce075db5 restructured Chengsong parents: diff changeset	689	algorithm. Their extended version generates additional information on
cc54ce075db5 restructured Chengsong parents: diff changeset	690	\emph{how} a regular expression matches a string following the POSIX
cc54ce075db5 restructured Chengsong parents: diff changeset	691	rules for regular expression matching. They achieve this by adding a
cc54ce075db5 restructured Chengsong parents: diff changeset	692	second ``phase'' to Brzozowski's algorithm involving an injection
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	693	function. In our own earlier work, we provided the formal
532 cc54ce075db5 restructured Chengsong parents: diff changeset	694	specification of what POSIX matching means and proved in Isabelle/HOL
cc54ce075db5 restructured Chengsong parents: diff changeset	695	the correctness
cc54ce075db5 restructured Chengsong parents: diff changeset	696	of Sulzmann and Lu's extended algorithm accordingly
cc54ce075db5 restructured Chengsong parents: diff changeset	697	\cite{AusafDyckhoffUrban2016}.
cc54ce075db5 restructured Chengsong parents: diff changeset	698
cc54ce075db5 restructured Chengsong parents: diff changeset	699	The second difficulty is that Brzozowski's derivatives can
cc54ce075db5 restructured Chengsong parents: diff changeset	700	grow to arbitrarily big sizes. For example if we start with the
cc54ce075db5 restructured Chengsong parents: diff changeset	701	regular expression $(a+aa)^*$ and take
cc54ce075db5 restructured Chengsong parents: diff changeset	702	successive derivatives according to the character $a$, we end up with
cc54ce075db5 restructured Chengsong parents: diff changeset	703	a sequence of ever-growing derivatives like
cc54ce075db5 restructured Chengsong parents: diff changeset	704
cc54ce075db5 restructured Chengsong parents: diff changeset	705	\def\ll{\stackrel{\_\backslash{} a}{\longrightarrow}}
cc54ce075db5 restructured Chengsong parents: diff changeset	706	\begin{center}
cc54ce075db5 restructured Chengsong parents: diff changeset	707	\begin{tabular}{rll}
cc54ce075db5 restructured Chengsong parents: diff changeset	708	$(a + aa)^*$ & $\ll$ & $(\ONE + \ONE{}a) \cdot (a + aa)^*$\\
cc54ce075db5 restructured Chengsong parents: diff changeset	709	& $\ll$ & $(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* \;+\; (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
cc54ce075db5 restructured Chengsong parents: diff changeset	710	& $\ll$ & $(\ZERO + \ZERO{}a + \ZERO) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^* \;+\; $\\
cc54ce075db5 restructured Chengsong parents: diff changeset	711	& & $\qquad(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
cc54ce075db5 restructured Chengsong parents: diff changeset	712	& $\ll$ & \ldots \hspace{15mm}(regular expressions of sizes 98, 169, 283, 468, 767, \ldots)
cc54ce075db5 restructured Chengsong parents: diff changeset	713	\end{tabular}
cc54ce075db5 restructured Chengsong parents: diff changeset	714	\end{center}
cc54ce075db5 restructured Chengsong parents: diff changeset	715
cc54ce075db5 restructured Chengsong parents: diff changeset	716	\noindent where after around 35 steps we run out of memory on a
cc54ce075db5 restructured Chengsong parents: diff changeset	717	typical computer (we shall define shortly the precise details of our
cc54ce075db5 restructured Chengsong parents: diff changeset	718	regular expressions and the derivative operation). Clearly, the
cc54ce075db5 restructured Chengsong parents: diff changeset	719	notation involving $\ZERO$s and $\ONE$s already suggests
cc54ce075db5 restructured Chengsong parents: diff changeset	720	simplification rules that can be applied to regular
cc54ce075db5 restructured Chengsong parents: diff changeset	721	expressions, for example $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r
cc54ce075db5 restructured Chengsong parents: diff changeset	722	\Rightarrow r$, $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow
cc54ce075db5 restructured Chengsong parents: diff changeset	723	r$. While such simple-minded simplifications have been proved in our
cc54ce075db5 restructured Chengsong parents: diff changeset	724	earlier work to preserve the correctness of Sulzmann and Lu's
cc54ce075db5 restructured Chengsong parents: diff changeset	725	algorithm \cite{AusafDyckhoffUrban2016}, they unfortunately do
cc54ce075db5 restructured Chengsong parents: diff changeset	726	\emph{not} help with limiting the growth of the derivatives shown
cc54ce075db5 restructured Chengsong parents: diff changeset	727	above: the growth is slowed, but the derivatives can still grow rather
cc54ce075db5 restructured Chengsong parents: diff changeset	728	quickly beyond any finite bound.
cc54ce075db5 restructured Chengsong parents: diff changeset	729
cc54ce075db5 restructured Chengsong parents: diff changeset	730
cc54ce075db5 restructured Chengsong parents: diff changeset	731	Sulzmann and Lu overcome this ``growth problem'' in a second algorithm
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	732	\cite{Sulzmann2014} where they introduce bit-coded
532 cc54ce075db5 restructured Chengsong parents: diff changeset	733	regular expressions. In this version, POSIX values are
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	734	represented as bit sequences and such sequences are incrementally generated
532 cc54ce075db5 restructured Chengsong parents: diff changeset	735	when derivatives are calculated. The compact representation
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	736	of bit sequences and regular expressions allows them to define a more
532 cc54ce075db5 restructured Chengsong parents: diff changeset	737	``aggressive'' simplification method that keeps the size of the
cc54ce075db5 restructured Chengsong parents: diff changeset	738	derivatives finite no matter what the length of the string is.
cc54ce075db5 restructured Chengsong parents: diff changeset	739	They make some informal claims about the correctness and linear behaviour
cc54ce075db5 restructured Chengsong parents: diff changeset	740	of this version, but do not provide any supporting proof arguments, not
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	741	even ``pencil-and-paper'' arguments. They write about their bit-coded
532 cc54ce075db5 restructured Chengsong parents: diff changeset	742	\emph{incremental parsing method} (that is the algorithm to be formalised
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	743	in this dissertation):
532 cc54ce075db5 restructured Chengsong parents: diff changeset	744
cc54ce075db5 restructured Chengsong parents: diff changeset	745
cc54ce075db5 restructured Chengsong parents: diff changeset	746
cc54ce075db5 restructured Chengsong parents: diff changeset	747	\begin{quote}\it
cc54ce075db5 restructured Chengsong parents: diff changeset	748	``Correctness Claim: We further claim that the incremental parsing
cc54ce075db5 restructured Chengsong parents: diff changeset	749	method [..] in combination with the simplification steps [..]
cc54ce075db5 restructured Chengsong parents: diff changeset	750	yields POSIX parse trees. We have tested this claim
cc54ce075db5 restructured Chengsong parents: diff changeset	751	extensively [..] but yet
cc54ce075db5 restructured Chengsong parents: diff changeset	752	have to work out all proof details.'' \cite[Page 14]{Sulzmann2014}
cc54ce075db5 restructured Chengsong parents: diff changeset	753	\end{quote}
cc54ce075db5 restructured Chengsong parents: diff changeset	754
cc54ce075db5 restructured Chengsong parents: diff changeset	755	Ausaf and Urban were able to back this correctness claim with
cc54ce075db5 restructured Chengsong parents: diff changeset	756	a formal proof.
cc54ce075db5 restructured Chengsong parents: diff changeset	757
cc54ce075db5 restructured Chengsong parents: diff changeset	758	But as they stated,
cc54ce075db5 restructured Chengsong parents: diff changeset	759	\begin{quote}\it
cc54ce075db5 restructured Chengsong parents: diff changeset	760	The next step would be to implement a more aggressive simplification procedure on annotated regular expressions and then prove the corresponding algorithm generates the same values as blexer. Alas due to time constraints we are unable to do so here.
cc54ce075db5 restructured Chengsong parents: diff changeset	761	\end{quote}
cc54ce075db5 restructured Chengsong parents: diff changeset	762
cc54ce075db5 restructured Chengsong parents: diff changeset	763	This thesis implements the aggressive simplifications envisioned
cc54ce075db5 restructured Chengsong parents: diff changeset	764	by Ausaf and Urban,
cc54ce075db5 restructured Chengsong parents: diff changeset	765	and gives a formal proof of the correctness with those simplifications.
cc54ce075db5 restructured Chengsong parents: diff changeset	766
cc54ce075db5 restructured Chengsong parents: diff changeset	767
cc54ce075db5 restructured Chengsong parents: diff changeset	768	%----------------------------------------------------------------------------------------
cc54ce075db5 restructured Chengsong parents: diff changeset	769	\section{Contribution}
cc54ce075db5 restructured Chengsong parents: diff changeset	770
cc54ce075db5 restructured Chengsong parents: diff changeset	771
cc54ce075db5 restructured Chengsong parents: diff changeset	772
cc54ce075db5 restructured Chengsong parents: diff changeset	773	This work addresses the vulnerability of super-linear and
cc54ce075db5 restructured Chengsong parents: diff changeset	774	buggy regex implementations by the combination
cc54ce075db5 restructured Chengsong parents: diff changeset	775	of Brzozowski's derivatives and interactive theorem proving.
cc54ce075db5 restructured Chengsong parents: diff changeset	776	We give an
cc54ce075db5 restructured Chengsong parents: diff changeset	777	improved version of Sulzmann and Lu's bit-coded algorithm using
cc54ce075db5 restructured Chengsong parents: diff changeset	778	derivatives, which come with a formal guarantee in terms of correctness and
cc54ce075db5 restructured Chengsong parents: diff changeset	779	running time as an Isabelle/HOL proof.
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	780	Further improvements to the algorithm with an even stronger version of
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	781	simplification are made.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	782	We have not yet come up with one, but believe that it leads to a
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	783	formalised proof with a time bound linear to input and
532 cc54ce075db5 restructured Chengsong parents: diff changeset	784	cubic to regular expression size using a technique by
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	785	Antimirov\cite{Antimirov}.
532 cc54ce075db5 restructured Chengsong parents: diff changeset	786
cc54ce075db5 restructured Chengsong parents: diff changeset	787
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	788	The main contribution of this thesis is
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	789	\begin{itemize}
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	790	\item
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	791	a proven correct lexing algorithm
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	792	\item
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	793	with formalized finite bounds on internal data structures' sizes.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	794	\end{itemize}
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	795
532 cc54ce075db5 restructured Chengsong parents: diff changeset	796	To the best of our knowledge, no lexing libraries using Brzozowski derivatives
cc54ce075db5 restructured Chengsong parents: diff changeset	797	have a provable time guarantee,
cc54ce075db5 restructured Chengsong parents: diff changeset	798	and claims about running time are usually speculative and backed by thin empirical
cc54ce075db5 restructured Chengsong parents: diff changeset	799	evidence.
cc54ce075db5 restructured Chengsong parents: diff changeset	800	%TODO: give references
cc54ce075db5 restructured Chengsong parents: diff changeset	801	For example, Sulzmann and Lu had proposed an algorithm in which they
cc54ce075db5 restructured Chengsong parents: diff changeset	802	claim a linear running time.
cc54ce075db5 restructured Chengsong parents: diff changeset	803	But that was falsified by our experiments and the running time
cc54ce075db5 restructured Chengsong parents: diff changeset	804	is actually $\Omega(2^n)$ in the worst case.
cc54ce075db5 restructured Chengsong parents: diff changeset	805	A similar claim about a theoretical runtime of $O(n^2)$ is made for the Verbatim
cc54ce075db5 restructured Chengsong parents: diff changeset	806	%TODO: give references
cc54ce075db5 restructured Chengsong parents: diff changeset	807	lexer, which calculates POSIX matches and is based on derivatives.
cc54ce075db5 restructured Chengsong parents: diff changeset	808	They formalized the correctness of the lexer, but not the complexity.
cc54ce075db5 restructured Chengsong parents: diff changeset	809	In the performance evaluation section, they simply analyzed the run time
cc54ce075db5 restructured Chengsong parents: diff changeset	810	of matching $a$ with the string $\underbrace{a \ldots a}_{\text{n a's}}$
cc54ce075db5 restructured Chengsong parents: diff changeset	811	and concluded that the algorithm is quadratic in terms of input length.
cc54ce075db5 restructured Chengsong parents: diff changeset	812	When we tried out their extracted OCaml code with our example $(a+aa)^*$,
cc54ce075db5 restructured Chengsong parents: diff changeset	813	the time it took to lex only 40 $a$'s was 5 minutes.
cc54ce075db5 restructured Chengsong parents: diff changeset	814
cc54ce075db5 restructured Chengsong parents: diff changeset	815
cc54ce075db5 restructured Chengsong parents: diff changeset	816
cc54ce075db5 restructured Chengsong parents: diff changeset	817	\subsection{Related Work}
cc54ce075db5 restructured Chengsong parents: diff changeset	818	We are aware
cc54ce075db5 restructured Chengsong parents: diff changeset	819	of a mechanised correctness proof of Brzozowski's derivative-based matcher in HOL4 by
cc54ce075db5 restructured Chengsong parents: diff changeset	820	Owens and Slind~\parencite{Owens2008}. Another one in Isabelle/HOL is part
cc54ce075db5 restructured Chengsong parents: diff changeset	821	of the work by Krauss and Nipkow \parencite{Krauss2011}. And another one
cc54ce075db5 restructured Chengsong parents: diff changeset	822	in Coq is given by Coquand and Siles \parencite{Coquand2012}.
cc54ce075db5 restructured Chengsong parents: diff changeset	823	Also Ribeiro and Du Bois give one in Agda \parencite{RibeiroAgda2017}.
cc54ce075db5 restructured Chengsong parents: diff changeset	824
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	825
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	826	When a regular expression does not behave as intended,
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	827	people usually try to rewrite the regex to some equivalent form
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	828	or they try to avoid the possibly problematic patterns completely,
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	829	for which many false positives exist\parencite{Davis18}.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	830	Animated tools to ``debug'' regular expressions such as
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	831	\parencite{regexploit2021} \parencite{regex101} are also popular.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	832	We are also aware of static analysis work on regular expressions that
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	833	aims to detect potentially exponential regex patterns. Rathnayake and Thielecke
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	834	\parencite{Rathnayake2014StaticAF} proposed an algorithm
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	835	that detects regular expressions triggering exponential
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	836	behaviours on backtracking matchers.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	837	Weideman \parencite{Weideman2017Static} came up with
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	838	non-linear polynomial worst-time estimates
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	839	for regexes, attack strings that exploit the worst-time
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	840	scenario, and ``attack automata'' that generate
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	841	attack strings.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	842
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	843
532 cc54ce075db5 restructured Chengsong parents: diff changeset	844
cc54ce075db5 restructured Chengsong parents: diff changeset	845
cc54ce075db5 restructured Chengsong parents: diff changeset	846	\section{Structure of the thesis}
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	847	In Chapter~\ref{Inj} we will introduce the concepts
532 cc54ce075db5 restructured Chengsong parents: diff changeset	848	and notations we
cc54ce075db5 restructured Chengsong parents: diff changeset	849	use for describing the lexing algorithm by Sulzmann and Lu,
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	850	and then give the lexing algorithm.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	851	We will give its variant in \ref{Bitcoded1}.
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	852	Then we illustrate in \ref{Bitcoded2}
532 cc54ce075db5 restructured Chengsong parents: diff changeset	853	how the algorithm without bitcodes falls short for such aggressive
cc54ce075db5 restructured Chengsong parents: diff changeset	854	simplifications and therefore introduce our version of the
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	855	bit-coded algorithm and
532 cc54ce075db5 restructured Chengsong parents: diff changeset	856	its correctness proof.
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	857	In \ref{Finite} we give the second guarantee
532 cc54ce075db5 restructured Chengsong parents: diff changeset	858	of our bitcoded algorithm, that is a finite bound on the size of any
cc54ce075db5 restructured Chengsong parents: diff changeset	859	regex's derivatives.
538 8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	860	In \ref{Cubic} we discuss stronger simplifications to improve the finite bound
8016a2480704 intro and chap2 Chengsong parents: 537 diff changeset	861	in \ref{Finite} to a polynomial one, and demonstrate how one can extend the
532 cc54ce075db5 restructured Chengsong parents: diff changeset	862	algorithm to include constructs such as bounded repetitions and negations.
cc54ce075db5 restructured Chengsong parents: diff changeset	863
cc54ce075db5 restructured Chengsong parents: diff changeset	864
cc54ce075db5 restructured Chengsong parents: diff changeset	865
cc54ce075db5 restructured Chengsong parents: diff changeset	866
cc54ce075db5 restructured Chengsong parents: diff changeset	867
cc54ce075db5 restructured Chengsong parents: diff changeset	868	%----------------------------------------------------------------------------------------
cc54ce075db5 restructured Chengsong parents: diff changeset	869
cc54ce075db5 restructured Chengsong parents: diff changeset	870
cc54ce075db5 restructured Chengsong parents: diff changeset	871	%----------------------------------------------------------------------------------------
cc54ce075db5 restructured Chengsong parents: diff changeset	872
cc54ce075db5 restructured Chengsong parents: diff changeset	873	%----------------------------------------------------------------------------------------
cc54ce075db5 restructured Chengsong parents: diff changeset	874
cc54ce075db5 restructured Chengsong parents: diff changeset	875	%----------------------------------------------------------------------------------------
cc54ce075db5 restructured Chengsong parents: diff changeset	876
cc54ce075db5 restructured Chengsong parents: diff changeset	877

author	Chengsong
	Fri, 02 Sep 2022 19:18:50 +0100
changeset 594	62f8fa03863e
parent 591	b2d0de6aee18
child 596	b306628a0eab
permissions	-rwxr-xr-x