cst_tests: ninems/ninems.tex@cab5eab1f6f1 (annotated)

45 60cb82639691 spell check Christian Urban <urbanc@in.tum.de> parents: 44 diff changeset	1	\documentclass[a4paper,UKenglish]{lipics}
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	2	\usepackage{graphic}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	3	\usepackage{data}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	4	\usepackage{tikz-cd}
63 d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	5	%\usepackage{algorithm}
35 f70e9ab4e680 psuedocode added Chengsong parents: 34 diff changeset	6	\usepackage{amsmath}
f70e9ab4e680 psuedocode added Chengsong parents: 34 diff changeset	7	\usepackage[noend]{algpseudocode}
42 2a469388c989 s Chengsong parents: 41 diff changeset	8	\usepackage{enumitem}
70 cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	9	\usepackage{nccmath}
63 d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	10
d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	11	\definecolor{darkblue}{rgb}{0,0,0.6}
d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	12	\hypersetup{colorlinks=true,allcolors=darkblue}
70 cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	13	\newcommand{\comment}[1]%
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	14	{{\color{red}$\Rightarrow$}\marginpar{\raggedright\small{\bf\color{red}#1}}}
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	15
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	16	% \documentclass{article}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	17	%\usepackage[utf8]{inputenc}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	18	%\usepackage[english]{babel}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	19	%\usepackage{listings}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	20	% \usepackage{amsthm}
63 d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	21	%\usepackage{hyperref}
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	22	% \usepackage[margin=0.5in]{geometry}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	23	%\usepackage{pmboxdraw}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	24
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	25	\title{POSIX Regular Expression Matching and Lexing}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	26	\author{Chengsong Tan}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	27	\affil{King's College London\\
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	28	London, UK\\
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	29	\texttt{chengsong.tan@kcl.ac.uk}}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	30	\authorrunning{Chengsong Tan}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	31	\Copyright{Chengsong Tan}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	32
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	33	\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	34	\newcommand{\ZERO}{\mbox{\bf 0}}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	35	\newcommand{\ONE}{\mbox{\bf 1}}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	36	\def\lexer{\mathit{lexer}}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	37	\def\mkeps{\mathit{mkeps}}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	38	\def\inj{\mathit{inj}}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	39	\def\Empty{\mathit{Empty}}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	40	\def\Left{\mathit{Left}}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	41	\def\Right{\mathit{Right}}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	42	\def\Stars{\mathit{Stars}}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	43	\def\Char{\mathit{Char}}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	44	\def\Seq{\mathit{Seq}}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	45	\def\Der{\mathit{Der}}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	46	\def\nullable{\mathit{nullable}}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	47	\def\Z{\mathit{Z}}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	48	\def\S{\mathit{S}}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	49
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	50	%\theoremstyle{theorem}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	51	%\newtheorem{theorem}{Theorem}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	52	%\theoremstyle{lemma}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	53	%\newtheorem{lemma}{Lemma}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	54	%\newcommand{\lemmaautorefname}{Lemma}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	55	%\theoremstyle{definition}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	56	%\newtheorem{definition}{Definition}
35 f70e9ab4e680 psuedocode added Chengsong parents: 34 diff changeset	57	\algnewcommand\algorithmicswitch{\textbf{switch}}
f70e9ab4e680 psuedocode added Chengsong parents: 34 diff changeset	58	\algnewcommand\algorithmiccase{\textbf{case}}
f70e9ab4e680 psuedocode added Chengsong parents: 34 diff changeset	59	\algnewcommand\algorithmicassert{\texttt{assert}}
f70e9ab4e680 psuedocode added Chengsong parents: 34 diff changeset	60	\algnewcommand\Assert[1]{\State \algorithmicassert(#1)}%
f70e9ab4e680 psuedocode added Chengsong parents: 34 diff changeset	61	% New "environments"
f70e9ab4e680 psuedocode added Chengsong parents: 34 diff changeset	62	\algdef{SE}[SWITCH]{Switch}{EndSwitch}[1]{\algorithmicswitch\ #1\ \algorithmicdo}{\algorithmicend\ \algorithmicswitch}%
f70e9ab4e680 psuedocode added Chengsong parents: 34 diff changeset	63	\algdef{SE}[CASE]{Case}{EndCase}[1]{\algorithmiccase\ #1}{\algorithmicend\ \algorithmiccase}%
f70e9ab4e680 psuedocode added Chengsong parents: 34 diff changeset	64	\algtext*{EndSwitch}%
f70e9ab4e680 psuedocode added Chengsong parents: 34 diff changeset	65	\algtext*{EndCase}%
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	66
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	67
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	68	\begin{document}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	69
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	70	\maketitle
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	71
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	72	\begin{abstract}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	73	Brzozowski introduced in 1964 a beautifully simple algorithm for
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	74	regular expression matching based on the notion of derivatives of
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	75	regular expressions. In 2014, Sulzmann and Lu extended this
40 8063792920ef proof reading Christian Urban <urbanc@in.tum.de> parents: 39 diff changeset	76	algorithm to not just give a YES/NO answer for whether or not a
64 afd0d702a4fe some changes Chengsong parents: 63 diff changeset	77	regular expression matches a string, but in case it does also
58 f0360e17080e proofread Christian Urban <urbanc@in.tum.de> parents: 57 diff changeset	78	answers with \emph{how} it matches the string. This is important for
40 8063792920ef proof reading Christian Urban <urbanc@in.tum.de> parents: 39 diff changeset	79	applications such as lexing (tokenising a string). The problem is to
8063792920ef proof reading Christian Urban <urbanc@in.tum.de> parents: 39 diff changeset	80	make the algorithm by Sulzmann and Lu fast on all inputs without
8063792920ef proof reading Christian Urban <urbanc@in.tum.de> parents: 39 diff changeset	81	breaking its correctness. We have already developed some
59 8ff7b7508824 changes1 Chengsong parents: 58 diff changeset	82	simplification rules for this, but have not yet proved that they
40 8063792920ef proof reading Christian Urban <urbanc@in.tum.de> parents: 39 diff changeset	83	preserve the correctness of the algorithm. We also have not yet
8063792920ef proof reading Christian Urban <urbanc@in.tum.de> parents: 39 diff changeset	84	looked at extended regular expressions, such as bounded repetitions,
8063792920ef proof reading Christian Urban <urbanc@in.tum.de> parents: 39 diff changeset	85	negation and back-references.
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	86	\end{abstract}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	87
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	88	\section{Introduction}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	89
70 cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	90
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	91	This PhD-project is about regular expression matching and
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	92	lexing. Given the maturity of this topic, the reader might wonder:
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	93	Surely, regular expressions must have already been studied to death?
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	94	What could possibly be \emph{not} known in this area? And surely all
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	95	implemented algorithms for regular expression matching are blindingly
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	96	fast?
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	97
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	98	Unfortunately these preconceptions are not supported by evidence: Take
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	99	for example the regular expression $(a^)^\,b$ and ask whether
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	100	strings of the form $aa..a$ match this regular
64 afd0d702a4fe some changes Chengsong parents: 63 diff changeset	101	expression. Obviously not---the expected $b$ in the last
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	102	position is missing. One would expect that modern regular expression
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	103	matching engines can find this out very quickly. Alas, if one tries
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	104	this example in JavaScript, Python or Java 8 with strings like 28
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	105	$a$'s, one discovers that this decision takes around 30 seconds and
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	106	takes considerably longer when adding a few more $a$'s, as the graphs
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	107	below show:
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	108
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	109	\begin{center}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	110	\begin{tabular}{@{}c@{\hspace{0mm}}c@{\hspace{0mm}}c@{}}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	111	\begin{tikzpicture}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	112	\begin{axis}[
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	113	xlabel={$n$},
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	114	x label style={at={(1.05,-0.05)}},
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	115	ylabel={time in secs},
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	116	enlargelimits=false,
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	117	xtick={0,5,...,30},
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	118	xmax=33,
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	119	ymax=35,
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	120	ytick={0,5,...,30},
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	121	scaled ticks=false,
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	122	axis lines=left,
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	123	width=5cm,
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	124	height=4cm,
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	125	legend entries={JavaScript},
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	126	legend pos=north west,
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	127	legend cell align=left]
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	128	\addplot[red,mark=*, mark options={fill=white}] table {re-js.data};
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	129	\end{axis}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	130	\end{tikzpicture}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	131	&
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	132	\begin{tikzpicture}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	133	\begin{axis}[
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	134	xlabel={$n$},
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	135	x label style={at={(1.05,-0.05)}},
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	136	%ylabel={time in secs},
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	137	enlargelimits=false,
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	138	xtick={0,5,...,30},
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	139	xmax=33,
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	140	ymax=35,
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	141	ytick={0,5,...,30},
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	142	scaled ticks=false,
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	143	axis lines=left,
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	144	width=5cm,
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	145	height=4cm,
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	146	legend entries={Python},
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	147	legend pos=north west,
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	148	legend cell align=left]
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	149	\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	150	\end{axis}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	151	\end{tikzpicture}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	152	&
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	153	\begin{tikzpicture}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	154	\begin{axis}[
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	155	xlabel={$n$},
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	156	x label style={at={(1.05,-0.05)}},
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	157	%ylabel={time in secs},
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	158	enlargelimits=false,
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	159	xtick={0,5,...,30},
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	160	xmax=33,
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	161	ymax=35,
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	162	ytick={0,5,...,30},
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	163	scaled ticks=false,
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	164	axis lines=left,
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	165	width=5cm,
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	166	height=4cm,
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	167	legend entries={Java 8},
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	168	legend pos=north west,
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	169	legend cell align=left]
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	170	\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	171	\end{axis}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	172	\end{tikzpicture}\\
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	173	\multicolumn{3}{c}{Graphs: Runtime for matching $(a^)^\,b$ with strings
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	174	of the form $\underbrace{aa..a}_{n}$.}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	175	\end{tabular}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	176	\end{center}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	177
58 f0360e17080e proofread Christian Urban <urbanc@in.tum.de> parents: 57 diff changeset	178	\noindent These are clearly abysmal and possibly surprising results. One
64 afd0d702a4fe some changes Chengsong parents: 63 diff changeset	179	would expect these systems to do much better than that---after all,
58 f0360e17080e proofread Christian Urban <urbanc@in.tum.de> parents: 57 diff changeset	180	given a DFA and a string, deciding whether a string is matched by this
f0360e17080e proofread Christian Urban <urbanc@in.tum.de> parents: 57 diff changeset	181	DFA should be linear.
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	182
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	183	Admittedly, the regular expression $(a^)^\,b$ is carefully chosen to
64 afd0d702a4fe some changes Chengsong parents: 63 diff changeset	184	exhibit this exponential behaviour. Unfortunately, such regular
afd0d702a4fe some changes Chengsong parents: 63 diff changeset	185	expressions are not just a few outliers. They are actually
afd0d702a4fe some changes Chengsong parents: 63 diff changeset	186	frequent enough to have a separate name created for
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	187	them---\emph{evil regular expressions}. In empiric work, Davis et al
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	188	report that they have found thousands of such evil regular expressions
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	189	in the JavaScript and Python ecosystems \cite{Davis18}.
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	190
58 f0360e17080e proofread Christian Urban <urbanc@in.tum.de> parents: 57 diff changeset	191	This exponential blowup in matching algorithms sometimes causes
63 d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	192	considerable grief in real life: for example on 20 July 2016 one evil
58 f0360e17080e proofread Christian Urban <urbanc@in.tum.de> parents: 57 diff changeset	193	regular expression brought the webpage
f0360e17080e proofread Christian Urban <urbanc@in.tum.de> parents: 57 diff changeset	194	\href{http://stackexchange.com}{Stack Exchange} to its
63 d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	195	knees.\footnote{\url{https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}}
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	196	In this instance, a regular expression intended to just trim white
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	197	spaces from the beginning and the end of a line actually consumed
64 afd0d702a4fe some changes Chengsong parents: 63 diff changeset	198	massive amounts of CPU-resources---causing web servers to
afd0d702a4fe some changes Chengsong parents: 63 diff changeset	199	grind to a halt. This happened when a post with 20,000 white spaces was
58 f0360e17080e proofread Christian Urban <urbanc@in.tum.de> parents: 57 diff changeset	200	submitted, but importantly the white spaces were neither at the
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	201	beginning nor at the end. As a result, the regular expression matching
58 f0360e17080e proofread Christian Urban <urbanc@in.tum.de> parents: 57 diff changeset	202	engine needed to backtrack over many choices. The underlying problem is
f0360e17080e proofread Christian Urban <urbanc@in.tum.de> parents: 57 diff changeset	203	that many ``real life'' regular expression matching engines do not use
f0360e17080e proofread Christian Urban <urbanc@in.tum.de> parents: 57 diff changeset	204	DFAs for matching. This is because they support regular expressions that
f0360e17080e proofread Christian Urban <urbanc@in.tum.de> parents: 57 diff changeset	205	are not covered by the classical automata theory, and in this more
f0360e17080e proofread Christian Urban <urbanc@in.tum.de> parents: 57 diff changeset	206	general setting there are quite a few research questions still
f0360e17080e proofread Christian Urban <urbanc@in.tum.de> parents: 57 diff changeset	207	unanswered and fast algorithms still need to be developed (for example
63 d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	208	how to treat bounded repetitions, negation and back-references
d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	209	efficiently).
64 afd0d702a4fe some changes Chengsong parents: 63 diff changeset	210	%question: dfa can have exponential states. isn't this the actual reason why they do not use dfas?
afd0d702a4fe some changes Chengsong parents: 63 diff changeset	211	%how do they avoid dfas exponential states if they use them for fast matching?
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	212	There is also another under-researched problem to do with regular
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	213	expressions and lexing, i.e.~the process of breaking up strings into
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	214	sequences of tokens according to some regular expressions. In this
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	215	setting one is not just interested in whether or not a regular
64 afd0d702a4fe some changes Chengsong parents: 63 diff changeset	216	expression matches a string, but also in \emph{how}. Consider for example a regular expression
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	217	$r_{key}$ for recognising keywords such as \textit{if}, \textit{then}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	218	and so on; and a regular expression $r_{id}$ for recognising
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	219	identifiers (say, a single character followed by characters or
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	220	numbers). One can then form the compound regular expression
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	221	$(r_{key} + r_{id})^*$ and use it to tokenise strings. But then how
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	222	should the string \textit{iffoo} be tokenised? It could be tokenised
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	223	as a keyword followed by an identifier, or the entire string as a
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	224	single identifier. Similarly, how should the string \textit{if} be
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	225	tokenised? Both regular expressions, $r_{key}$ and $r_{id}$, would
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	226	``fire''---so is it an identifier or a keyword? While in applications
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	227	there is a well-known strategy to decide these questions, called POSIX
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	228	matching, only relatively recently precise definitions of what POSIX
64 afd0d702a4fe some changes Chengsong parents: 63 diff changeset	229	matching actually means has been formalised
46 9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	230	\cite{AusafDyckhoffUrban2016,OkuiSuzuki2010,Vansummeren2006}.
9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	231	Such a definition has also been given by Sulzmann and Lu \cite{Sulzmann2014}, but the
9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	232	corresponding correctness proof turned out to be faulty \cite{AusafDyckhoffUrban2016}.
9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	233	Roughly, POSIX matching means matching the longest initial substring.
64 afd0d702a4fe some changes Chengsong parents: 63 diff changeset	234	In the case of a tie, the initial sub-match is chosen according to some priorities attached to the
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	235	regular expressions (e.g.~keywords have a higher priority than
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	236	identifiers). This sounds rather simple, but according to Grathwohl et
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	237	al \cite[Page 36]{CrashCourse2014} this is not the case. They wrote:
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	238
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	239	\begin{quote}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	240	\it{}``The POSIX strategy is more complicated than the greedy because of
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	241	the dependence on information about the length of matched strings in the
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	242	various subexpressions.''
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	243	\end{quote}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	244
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	245	\noindent
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	246	This is also supported by evidence collected by Kuklewicz
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	247	\cite{Kuklewicz} who noticed that a number of POSIX regular expression
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	248	matchers calculate incorrect results.
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	249
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	250	Our focus is on an algorithm introduced by Sulzmann and Lu in 2014 for
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	251	regular expression matching according to the POSIX strategy
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	252	\cite{Sulzmann2014}. Their algorithm is based on an older algorithm by
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	253	Brzozowski from 1964 where he introduced the notion of derivatives of
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	254	regular expressions \cite{Brzozowski1964}. We shall briefly explain
58 f0360e17080e proofread Christian Urban <urbanc@in.tum.de> parents: 57 diff changeset	255	this algorithm next.
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	256
46 9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	257	\section{The Algorithm by Brzozowski based on Derivatives of Regular
9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	258	Expressions}
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	259
40 8063792920ef proof reading Christian Urban <urbanc@in.tum.de> parents: 39 diff changeset	260	Suppose (basic) regular expressions are given by the following grammar:
38 b5363c0dcd99 half easy changes Chengsong parents: 37 diff changeset	261	\[ r ::= \ZERO \mid \ONE
b5363c0dcd99 half easy changes Chengsong parents: 37 diff changeset	262	\mid c
b5363c0dcd99 half easy changes Chengsong parents: 37 diff changeset	263	\mid r_1 \cdot r_2
b5363c0dcd99 half easy changes Chengsong parents: 37 diff changeset	264	\mid r_1 + r_2
b5363c0dcd99 half easy changes Chengsong parents: 37 diff changeset	265	\mid r^*
b5363c0dcd99 half easy changes Chengsong parents: 37 diff changeset	266	\]
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	267
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	268	\noindent
46 9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	269	The intended meaning of the constructors is as follows: $\ZERO$
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	270	cannot match any string, $\ONE$ can match the empty string, the
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	271	character regular expression $c$ can match the character $c$, and so
40 8063792920ef proof reading Christian Urban <urbanc@in.tum.de> parents: 39 diff changeset	272	on.
8063792920ef proof reading Christian Urban <urbanc@in.tum.de> parents: 39 diff changeset	273
58 f0360e17080e proofread Christian Urban <urbanc@in.tum.de> parents: 57 diff changeset	274	The ingenious contribution by Brzozowski is the notion of
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	275	\emph{derivatives} of regular expressions. The idea behind this
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	276	notion is as follows: suppose a regular expression $r$ can match a
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	277	string of the form $c\!::\! s$ (that is a list of characters starting
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	278	with $c$), what does the regular expression look like that can match
40 8063792920ef proof reading Christian Urban <urbanc@in.tum.de> parents: 39 diff changeset	279	just $s$? Brzozowski gave a neat answer to this question. He started
8063792920ef proof reading Christian Urban <urbanc@in.tum.de> parents: 39 diff changeset	280	with the definition of $nullable$:
36 97c9ac95194d chages Chengsong parents: 35 diff changeset	281	\begin{center}
97c9ac95194d chages Chengsong parents: 35 diff changeset	282	\begin{tabular}{lcl}
97c9ac95194d chages Chengsong parents: 35 diff changeset	283	$\nullable(\ZERO)$ & $\dn$ & $\mathit{false}$ \\
97c9ac95194d chages Chengsong parents: 35 diff changeset	284	$\nullable(\ONE)$ & $\dn$ & $\mathit{true}$ \\
97c9ac95194d chages Chengsong parents: 35 diff changeset	285	$\nullable(c)$ & $\dn$ & $\mathit{false}$ \\
97c9ac95194d chages Chengsong parents: 35 diff changeset	286	$\nullable(r_1 + r_2)$ & $\dn$ & $\nullable(r_1) \vee \nullable(r_2)$ \\
97c9ac95194d chages Chengsong parents: 35 diff changeset	287	$\nullable(r_1\cdot r_2)$ & $\dn$ & $\nullable(r_1) \wedge \nullable(r_2)$ \\
97c9ac95194d chages Chengsong parents: 35 diff changeset	288	$\nullable(r^*)$ & $\dn$ & $\mathit{true}$ \\
97c9ac95194d chages Chengsong parents: 35 diff changeset	289	\end{tabular}
97c9ac95194d chages Chengsong parents: 35 diff changeset	290	\end{center}
38 b5363c0dcd99 half easy changes Chengsong parents: 37 diff changeset	291	This function simply tests whether the empty string is in $L(r)$.
36 97c9ac95194d chages Chengsong parents: 35 diff changeset	292	He then defined
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	293	the following operation on regular expressions, written
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	294	$r\backslash c$ (the derivative of $r$ w.r.t.~the character $c$):
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	295
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	296	\begin{center}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	297	\begin{tabular}{lcl}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	298	$\ZERO \backslash c$ & $\dn$ & $\ZERO$\\
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	299	$\ONE \backslash c$ & $\dn$ & $\ZERO$\\
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	300	$d \backslash c$ & $\dn$ &
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	301	$\mathit{if} \;c = d\;\mathit{then}\;\ONE\;\mathit{else}\;\ZERO$\\
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	302	$(r_1 + r_2)\backslash c$ & $\dn$ & $r_1 \backslash c \,+\, r_2 \backslash c$\\
36 97c9ac95194d chages Chengsong parents: 35 diff changeset	303	$(r_1 \cdot r_2)\backslash c$ & $\dn$ & $\mathit{if} \, nullable(r_1)$\\
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	304	& & $\mathit{then}\;(r_1\backslash c) \cdot r_2 \,+\, r_2\backslash c$\\
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	305	& & $\mathit{else}\;(r_1\backslash c) \cdot r_2$\\
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	306	$(r^)\backslash c$ & $\dn$ & $(r\backslash c) \cdot r^$\\
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	307	\end{tabular}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	308	\end{center}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	309
46 9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	310	%Assuming the classic notion of a
9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	311	%\emph{language} of a regular expression, written $L(\_)$, t
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	312
40 8063792920ef proof reading Christian Urban <urbanc@in.tum.de> parents: 39 diff changeset	313	\noindent
8063792920ef proof reading Christian Urban <urbanc@in.tum.de> parents: 39 diff changeset	314	The main property of the derivative operation is that
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	315
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	316	\begin{center}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	317	$c\!::\!s \in L(r)$ holds
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	318	if and only if $s \in L(r\backslash c)$.
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	319	\end{center}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	320
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	321	\noindent
46 9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	322	For us the main advantage is that derivatives can be
38 b5363c0dcd99 half easy changes Chengsong parents: 37 diff changeset	323	straightforwardly implemented in any functional programming language,
b5363c0dcd99 half easy changes Chengsong parents: 37 diff changeset	324	and are easily definable and reasoned about in theorem provers---the
b5363c0dcd99 half easy changes Chengsong parents: 37 diff changeset	325	definitions just consist of inductive datatypes and simple recursive
b5363c0dcd99 half easy changes Chengsong parents: 37 diff changeset	326	functions. Moreover, the notion of derivatives can be easily
b5363c0dcd99 half easy changes Chengsong parents: 37 diff changeset	327	generalised to cover extended regular expression constructors such as
b5363c0dcd99 half easy changes Chengsong parents: 37 diff changeset	328	the not-regular expression, written $\neg\,r$, or bounded repetitions
b5363c0dcd99 half easy changes Chengsong parents: 37 diff changeset	329	(for example $r^{\{n\}}$ and $r^{\{n..m\}}$), which cannot be so
b5363c0dcd99 half easy changes Chengsong parents: 37 diff changeset	330	straightforwardly realised within the classic automata approach.
b5363c0dcd99 half easy changes Chengsong parents: 37 diff changeset	331	For the moment however, we focus only on the usual basic regular expressions.
b5363c0dcd99 half easy changes Chengsong parents: 37 diff changeset	332
b5363c0dcd99 half easy changes Chengsong parents: 37 diff changeset	333
40 8063792920ef proof reading Christian Urban <urbanc@in.tum.de> parents: 39 diff changeset	334	Now if we want to find out whether a string $s$ matches with a regular
8063792920ef proof reading Christian Urban <urbanc@in.tum.de> parents: 39 diff changeset	335	expression $r$, build the derivatives of $r$ w.r.t.\ (in succession)
8063792920ef proof reading Christian Urban <urbanc@in.tum.de> parents: 39 diff changeset	336	all the characters of the string $s$. Finally, test whether the
8063792920ef proof reading Christian Urban <urbanc@in.tum.de> parents: 39 diff changeset	337	resulting regular expression can match the empty string. If yes, then
8063792920ef proof reading Christian Urban <urbanc@in.tum.de> parents: 39 diff changeset	338	$r$ matches $s$, and no in the negative case. To implement this idea
8063792920ef proof reading Christian Urban <urbanc@in.tum.de> parents: 39 diff changeset	339	we can generalise the derivative operation to strings like this:
46 9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	340
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	341	\begin{center}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	342	\begin{tabular}{lcl}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	343	$r \backslash (c\!::\!s) $ & $\dn$ & $(r \backslash c) \backslash s$ \\
40 8063792920ef proof reading Christian Urban <urbanc@in.tum.de> parents: 39 diff changeset	344	$r \backslash [\,] $ & $\dn$ & $r$
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	345	\end{tabular}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	346	\end{center}
40 8063792920ef proof reading Christian Urban <urbanc@in.tum.de> parents: 39 diff changeset	347
37 17d8e7599a01 new changes Chengsong parents: 36 diff changeset	348	\noindent
46 9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	349	and then define as regular-expression matching algorithm:
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	350	\[
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	351	match\;s\;r \;\dn\; nullable(r\backslash s)
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	352	\]
40 8063792920ef proof reading Christian Urban <urbanc@in.tum.de> parents: 39 diff changeset	353
8063792920ef proof reading Christian Urban <urbanc@in.tum.de> parents: 39 diff changeset	354	\noindent
64 afd0d702a4fe some changes Chengsong parents: 63 diff changeset	355	This algorithm looks graphically as follows:
46 9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	356	\begin{equation}\label{graph:*}
9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	357	\begin{tikzcd}
9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	358	r_0 \arrow[r, "\backslash c_0"] & r_1 \arrow[r, "\backslash c_1"] & r_2 \arrow[r, dashed] & r_n \arrow[r,"\textit{nullable}?"] & \;\textrm{YES}/\textrm{NO}
38 b5363c0dcd99 half easy changes Chengsong parents: 37 diff changeset	359	\end{tikzcd}
46 9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	360	\end{equation}
40 8063792920ef proof reading Christian Urban <urbanc@in.tum.de> parents: 39 diff changeset	361
8063792920ef proof reading Christian Urban <urbanc@in.tum.de> parents: 39 diff changeset	362	\noindent
46 9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	363	where we start with a regular expression $r_0$, build successive
9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	364	derivatives until we exhaust the string and then use \textit{nullable}
9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	365	to test whether the result can match the empty string. It can be
9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	366	relatively easily shown that this matcher is correct (that is given
64 afd0d702a4fe some changes Chengsong parents: 63 diff changeset	367	an $s = c_0...c_{n-1}$ and an $r_0$, it generates YES if and only if $s \in L(r_0)$).
46 9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	368
9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	369
9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	370	\section{Values and the Algorithm by Sulzmann and Lu}
38 b5363c0dcd99 half easy changes Chengsong parents: 37 diff changeset	371
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	372	One limitation, however, of Brzozowski's algorithm is that it only
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	373	produces a YES/NO answer for whether a string is being matched by a
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	374	regular expression. Sulzmann and Lu~\cite{Sulzmann2014} extended this
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	375	algorithm to allow generation of an actual matching, called a
46 9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	376	\emph{value}. Values and regular expressions correspond to each
9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	377	other as illustrated in the following table:
9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	378
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	379
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	380	\begin{center}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	381	\begin{tabular}{c@{\hspace{20mm}}c}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	382	\begin{tabular}{@{}rrl@{}}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	383	\multicolumn{3}{@{}l}{\textbf{Regular Expressions}}\medskip\\
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	384	$r$ & $::=$ & $\ZERO$\\
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	385	& $\mid$ & $\ONE$ \\
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	386	& $\mid$ & $c$ \\
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	387	& $\mid$ & $r_1 \cdot r_2$\\
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	388	& $\mid$ & $r_1 + r_2$ \\
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	389	\\
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	390	& $\mid$ & $r^*$ \\
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	391	\end{tabular}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	392	&
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	393	\begin{tabular}{@{\hspace{0mm}}rrl@{}}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	394	\multicolumn{3}{@{}l}{\textbf{Values}}\medskip\\
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	395	$v$ & $::=$ & \\
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	396	& & $\Empty$ \\
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	397	& $\mid$ & $\Char(c)$ \\
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	398	& $\mid$ & $\Seq\,v_1\, v_2$\\
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	399	& $\mid$ & $\Left(v)$ \\
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	400	& $\mid$ & $\Right(v)$ \\
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	401	& $\mid$ & $\Stars\,[v_1,\ldots\,v_n]$ \\
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	402	\end{tabular}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	403	\end{tabular}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	404	\end{center}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	405
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	406	\noindent
64 afd0d702a4fe some changes Chengsong parents: 63 diff changeset	407	No value corresponds to $\ZERO$; $\Empty$ corresponds to
63 d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	408	$\ONE$; $\Char$ to the character regular expression; $\Seq$ to the
d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	409	sequence regular expression and so on. The idea of values is to encode
64 afd0d702a4fe some changes Chengsong parents: 63 diff changeset	410	parse trees for how the sub-parts of a regular expression matches
afd0d702a4fe some changes Chengsong parents: 63 diff changeset	411	the sub-parts of a string. To see this, suppose a \emph{flatten} operation, written
63 d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	412	$\|v\|$. We use this function to extract the underlying string of a value
d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	413	$v$. For example, $\|\mathit{Seq} \, (\textit{Char x}) \, (\textit{Char
d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	414	y})\|$ is the string $xy$. Using flatten, we can describe how values
64 afd0d702a4fe some changes Chengsong parents: 63 diff changeset	415	encode parse trees: $\Seq\,v_1\, v_2$ encodes a tree with 2 children nodes
afd0d702a4fe some changes Chengsong parents: 63 diff changeset	416	that tells how the string $\|v_1\| @
63 d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	417	\|v_2\|$ matches the regex $r_1 \cdot r_2$ whereby $r_1$ matches the
d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	418	substring $\|v_1\|$ and, respectively, $r_2$ matches the substring
d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	419	$\|v_2\|$. Exactly how these two are matched is contained in the
64 afd0d702a4fe some changes Chengsong parents: 63 diff changeset	420	children nodes $v_1$ and $v_2$ of parent $\textit{Seq}$ .
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	421
59 8ff7b7508824 changes1 Chengsong parents: 58 diff changeset	422	To give a concrete example of how value works, consider the string $xy$
46 9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	423	and the regular expression $(x + (y + xy))^*$. We can view this regular
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	424	expression as a tree and if the string $xy$ is matched by two Star
46 9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	425	``iterations'', then the $x$ is matched by the left-most alternative in
9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	426	this tree and the $y$ by the right-left alternative. This suggests to
9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	427	record this matching as
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	428
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	429	\begin{center}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	430	$\Stars\,[\Left\,(\Char\,x), \Right(\Left(\Char\,y))]$
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	431	\end{center}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	432
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	433	\noindent
64 afd0d702a4fe some changes Chengsong parents: 63 diff changeset	434	where $\Stars [\ldots]$ records all the
afd0d702a4fe some changes Chengsong parents: 63 diff changeset	435	iterations; and $\Left$, respectively $\Right$, which
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	436	alternative is used. The value for
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	437	matching $xy$ in a single ``iteration'', i.e.~the POSIX value,
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	438	would look as follows
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	439
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	440	\begin{center}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	441	$\Stars\,[\Seq\,(\Char\,x)\,(\Char\,y)]$
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	442	\end{center}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	443
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	444	\noindent
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	445	where $\Stars$ has only a single-element list for the single iteration
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	446	and $\Seq$ indicates that $xy$ is matched by a sequence regular
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	447	expression.
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	448
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	449	The contribution of Sulzmann and Lu is an extension of Brzozowski's
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	450	algorithm by a second phase (the first phase being building successive
46 9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	451	derivatives---see \eqref{graph:*}). In this second phase, a POSIX value
9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	452	is generated assuming the regular expression matches the string.
54 4bfd28722884 minor changes Chengsong parents: 53 diff changeset	453	Pictorially, the algorithm is as follows:
46 9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	454
70 cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	455	\begin{ceqn}
59 8ff7b7508824 changes1 Chengsong parents: 58 diff changeset	456	\begin{equation}\label{graph:2}
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	457	\begin{tikzcd}
36 97c9ac95194d chages Chengsong parents: 35 diff changeset	458	r_0 \arrow[r, "\backslash c_0"] \arrow[d] & r_1 \arrow[r, "\backslash c_1"] \arrow[d] & r_2 \arrow[r, dashed] \arrow[d] & r_n \arrow[d, "mkeps" description] \\
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	459	v_0 & v_1 \arrow[l,"inj_{r_0} c_0"] & v_2 \arrow[l, "inj_{r_1} c_1"] & v_n \arrow[l, dashed]
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	460	\end{tikzcd}
59 8ff7b7508824 changes1 Chengsong parents: 58 diff changeset	461	\end{equation}
70 cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	462	\end{ceqn}
37 17d8e7599a01 new changes Chengsong parents: 36 diff changeset	463
46 9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	464	\noindent
59 8ff7b7508824 changes1 Chengsong parents: 58 diff changeset	465	For convenience, we shall employ the following notations: the regular expression we
58 f0360e17080e proofread Christian Urban <urbanc@in.tum.de> parents: 57 diff changeset	466	start with is $r_0$, and the given string $s$ is composed of characters $c_0 c_1
64 afd0d702a4fe some changes Chengsong parents: 63 diff changeset	467	\ldots c_{n-1}$. In the first phase, we build the derivatives $r_1$, $r_2$, \ldots according to
46 9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	468	the characters $c_0$, $c_1$,\ldots until we exhaust the string and
58 f0360e17080e proofread Christian Urban <urbanc@in.tum.de> parents: 57 diff changeset	469	obtain at the derivative $r_n$. We test whether this derivative is
46 9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	470	$\textit{nullable}$ or not. If not, we know the string does not match
9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	471	$r$ and no value needs to be generated. If yes, we start building the
9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	472	parse tree incrementally by \emph{injecting} back the characters into
58 f0360e17080e proofread Christian Urban <urbanc@in.tum.de> parents: 57 diff changeset	473	the values $v_n, \ldots, v_0$. For this we first call the function
46 9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	474	$\textit{mkeps}$, which builds the parse tree for how the empty string
58 f0360e17080e proofread Christian Urban <urbanc@in.tum.de> parents: 57 diff changeset	475	has matched the (nullable) regular expression $r_n$. This function is defined
46 9b48724ec609 proofread Christian Urban <urbanc@in.tum.de> parents: 45 diff changeset	476	as
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	477
51 5df7faf69238 added mkeps and pder, still have not proof read it Chengsong parents: 50 diff changeset	478	\begin{center}
5df7faf69238 added mkeps and pder, still have not proof read it Chengsong parents: 50 diff changeset	479	\begin{tabular}{lcl}
5df7faf69238 added mkeps and pder, still have not proof read it Chengsong parents: 50 diff changeset	480	$\mkeps(\ONE)$ & $\dn$ & $\Empty$ \\
5df7faf69238 added mkeps and pder, still have not proof read it Chengsong parents: 50 diff changeset	481	$\mkeps(r_{1}+r_{2})$ & $\dn$
5df7faf69238 added mkeps and pder, still have not proof read it Chengsong parents: 50 diff changeset	482	& \textit{if} $\nullable(r_{1})$\\
5df7faf69238 added mkeps and pder, still have not proof read it Chengsong parents: 50 diff changeset	483	& & \textit{then} $\Left(\mkeps(r_{1}))$\\
5df7faf69238 added mkeps and pder, still have not proof read it Chengsong parents: 50 diff changeset	484	& & \textit{else} $\Right(\mkeps(r_{2}))$\\
5df7faf69238 added mkeps and pder, still have not proof read it Chengsong parents: 50 diff changeset	485	$\mkeps(r_1\cdot r_2)$ & $\dn$ & $\Seq\,(\mkeps\,r_1)\,(\mkeps\,r_2)$\\
5df7faf69238 added mkeps and pder, still have not proof read it Chengsong parents: 50 diff changeset	486	$mkeps(r^*)$ & $\dn$ & $\Stars\,[]$
5df7faf69238 added mkeps and pder, still have not proof read it Chengsong parents: 50 diff changeset	487	\end{tabular}
5df7faf69238 added mkeps and pder, still have not proof read it Chengsong parents: 50 diff changeset	488	\end{center}
41 a1f90febbc7f example Chengsong parents: 40 diff changeset	489
59 8ff7b7508824 changes1 Chengsong parents: 58 diff changeset	490
8ff7b7508824 changes1 Chengsong parents: 58 diff changeset	491	\noindent There are no cases for $\ZERO$ and $c$, since
8ff7b7508824 changes1 Chengsong parents: 58 diff changeset	492	these regular expression cannot match the empty string. Note
8ff7b7508824 changes1 Chengsong parents: 58 diff changeset	493	also that in case of alternatives we give preference to the
8ff7b7508824 changes1 Chengsong parents: 58 diff changeset	494	regular expression on the left-hand side. This will become
8ff7b7508824 changes1 Chengsong parents: 58 diff changeset	495	important later on.
8ff7b7508824 changes1 Chengsong parents: 58 diff changeset	496
8ff7b7508824 changes1 Chengsong parents: 58 diff changeset	497	After this, we inject back the characters one by one in order to build
63 d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	498	the parse tree $v_i$ for how the regex $r_i$ matches the string $s_i$
64 afd0d702a4fe some changes Chengsong parents: 63 diff changeset	499	($s_i = c_i \ldots c_{n-1}$ ) from the previous parse tree $v_{i+1}$. After
63 d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	500	injecting back $n$ characters, we get the parse tree for how $r_0$
d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	501	matches $s$. For this Sulzmann and Lu defined a function that reverses
d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	502	the ``chopping off'' of characters during the derivative phase. The
d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	503	corresponding function is called $\textit{inj}$; it takes three
d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	504	arguments: the first one is a regular expression ${r_{i-1}}$, before the
d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	505	character is chopped off, the second is a character ${c_{i-1}}$, the
d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	506	character we want to inject and the third argument is the value
64 afd0d702a4fe some changes Chengsong parents: 63 diff changeset	507	${v_i}$, into which one wants to inject the character (it
63 d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	508	corresponds to the regular expression after the character has been
d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	509	chopped off). The result of this function is a new value. The definition
d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	510	of $\textit{inj}$ is as follows:
59 8ff7b7508824 changes1 Chengsong parents: 58 diff changeset	511
8ff7b7508824 changes1 Chengsong parents: 58 diff changeset	512	\begin{center}
8ff7b7508824 changes1 Chengsong parents: 58 diff changeset	513	\begin{tabular}{l@{\hspace{1mm}}c@{\hspace{1mm}}l}
8ff7b7508824 changes1 Chengsong parents: 58 diff changeset	514	$\textit{inj}\,(c)\,c\,Empty$ & $\dn$ & $Char\,c$\\
8ff7b7508824 changes1 Chengsong parents: 58 diff changeset	515	$\textit{inj}\,(r_1 + r_2)\,c\,\Left(v)$ & $\dn$ & $\Left(\textit{inj}\,r_1\,c\,v)$\\
8ff7b7508824 changes1 Chengsong parents: 58 diff changeset	516	$\textit{inj}\,(r_1 + r_2)\,c\,Right(v)$ & $\dn$ & $Right(\textit{inj}\,r_2\,c\,v)$\\
8ff7b7508824 changes1 Chengsong parents: 58 diff changeset	517	$\textit{inj}\,(r_1 \cdot r_2)\,c\,Seq(v_1,v_2)$ & $\dn$ & $Seq(\textit{inj}\,r_1\,c\,v_1,v_2)$\\
8ff7b7508824 changes1 Chengsong parents: 58 diff changeset	518	$\textit{inj}\,(r_1 \cdot r_2)\,c\,\Left(Seq(v_1,v_2))$ & $\dn$ & $Seq(\textit{inj}\,r_1\,c\,v_1,v_2)$\\
8ff7b7508824 changes1 Chengsong parents: 58 diff changeset	519	$\textit{inj}\,(r_1 \cdot r_2)\,c\,Right(v)$ & $\dn$ & $Seq(\textit{mkeps}(r_1),\textit{inj}\,r_2\,c\,v)$\\
8ff7b7508824 changes1 Chengsong parents: 58 diff changeset	520	$\textit{inj}\,(r^*)\,c\,Seq(v,Stars\,vs)$ & $\dn$ & $Stars((\textit{inj}\,r\,c\,v)\,::\,vs)$\\
8ff7b7508824 changes1 Chengsong parents: 58 diff changeset	521	\end{tabular}
8ff7b7508824 changes1 Chengsong parents: 58 diff changeset	522	\end{center}
8ff7b7508824 changes1 Chengsong parents: 58 diff changeset	523
63 d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	524	\noindent This definition is by recursion on the ``shape'' of regular
d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	525	expressions and values. To understands this definition better consider
d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	526	the situation when we build the derivative on regular expression $r_{i-1}$.
d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	527	For this we chop off a character from $r_{i-1}$ to form $r_i$. This leaves a
64 afd0d702a4fe some changes Chengsong parents: 63 diff changeset	528	``hole'' in $r_i$ and its corresponding value $v_i$
afd0d702a4fe some changes Chengsong parents: 63 diff changeset	529	. To calculate $v_{i-1}$, we need to
afd0d702a4fe some changes Chengsong parents: 63 diff changeset	530	locate where that hole is and fill it.
afd0d702a4fe some changes Chengsong parents: 63 diff changeset	531	We can find this location by
63 d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	532	comparing $r_{i-1}$ and $v_i$. For instance, if $r_{i-1}$ is of shape
64 afd0d702a4fe some changes Chengsong parents: 63 diff changeset	533	$r_a \cdot r_b$, and $v_i$ is of shape $\Left(Seq(v_1,v_2))$, we know immediately that
63 d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	534	%
d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	535	\[ (r_a \cdot r_b)\backslash c = (r_a\backslash c) \cdot r_b \,+\, r_b\backslash c,\]
d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	536
d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	537	\noindent
59 8ff7b7508824 changes1 Chengsong parents: 58 diff changeset	538	otherwise if $r_a$ is not nullable,
63 d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	539	\[ (r_a \cdot r_b)\backslash c = (r_a\backslash c) \cdot r_b,\]
d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	540
d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	541	\noindent
64 afd0d702a4fe some changes Chengsong parents: 63 diff changeset	542	the value $v_i$ should be $\Seq(\ldots)$, contradicting the fact that
afd0d702a4fe some changes Chengsong parents: 63 diff changeset	543	$v_i$ is actually of shape $\Left(\ldots)$. Furthermore, since $v_i$ is of shape
63 d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	544	$\Left(\ldots)$ instead of $\Right(\ldots)$, we know that the left
64 afd0d702a4fe some changes Chengsong parents: 63 diff changeset	545	branch of \[ (r_a \cdot r_b)\backslash c =
afd0d702a4fe some changes Chengsong parents: 63 diff changeset	546	\bold{\underline{ (r_a\backslash c) \cdot r_b} }\,+\, r_b\backslash c,\](underlined)
afd0d702a4fe some changes Chengsong parents: 63 diff changeset	547	is taken instead of the right one. This means $c$ is chopped off
afd0d702a4fe some changes Chengsong parents: 63 diff changeset	548	from $r_a$ rather than $r_b$.
afd0d702a4fe some changes Chengsong parents: 63 diff changeset	549	We have therefore found out
63 d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	550	that the hole will be on $r_a$. So we recursively call $\inj\,
64 afd0d702a4fe some changes Chengsong parents: 63 diff changeset	551	r_a\,c\,v_a$ to fill that hole in $v_a$. After injection, the value
63 d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	552	$v_i$ for $r_i = r_a \cdot r_b$ should be $\Seq\,(\inj\,r_a\,c\,v_a)\,v_b$.
60 c737a0259194 sorry not all done, need a few more mins for last few changes Chengsong parents: 59 diff changeset	553	Other clauses can be understood in a similar way.
59 8ff7b7508824 changes1 Chengsong parents: 58 diff changeset	554
70 cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	555	The following example gives a \comment{Other word: insight?}taste of $\textit{inj}$'s effect
65 df2e0faccb23 moew changes Chengsong parents: 64 diff changeset	556	and how Sulzmann and Lu's algorithm works as a whole.
df2e0faccb23 moew changes Chengsong parents: 64 diff changeset	557	Suppose we have a
df2e0faccb23 moew changes Chengsong parents: 64 diff changeset	558	regular expression $((((a+b)+ab)+c)+abc)^*$, and want to match it against
63 d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	559	the string $abc$ (when $abc$ is written as a regular expression, the most
d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	560	standard way of expressing it should be $a \cdot (b \cdot c)$. We omit
65 df2e0faccb23 moew changes Chengsong parents: 64 diff changeset	561	the parentheses and dots here for readability).
df2e0faccb23 moew changes Chengsong parents: 64 diff changeset	562	This algorithm returns a POSIX value, which means it
df2e0faccb23 moew changes Chengsong parents: 64 diff changeset	563	will go for the longest matching, i.e.~it should match the string
df2e0faccb23 moew changes Chengsong parents: 64 diff changeset	564	$abc$ in one star iteration, using the longest alternative $abc$ in the
df2e0faccb23 moew changes Chengsong parents: 64 diff changeset	565	sub-expression $((((a+b)+ab)+c)+abc)$ (we use $r$ to denote this sub-expression
64 afd0d702a4fe some changes Chengsong parents: 63 diff changeset	566	for conciseness).
65 df2e0faccb23 moew changes Chengsong parents: 64 diff changeset	567	Before $\textit{inj}$ comes into play,
df2e0faccb23 moew changes Chengsong parents: 64 diff changeset	568	our lexer first builds derivative using string $abc$ (we simplified some regular expressions like
df2e0faccb23 moew changes Chengsong parents: 64 diff changeset	569	$0 \cdot b$ to $0$ for conciseness; we also omit parentheses if
63 d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	570	they are clear from the context):
60 c737a0259194 sorry not all done, need a few more mins for last few changes Chengsong parents: 59 diff changeset	571	%Similarly, we allow
c737a0259194 sorry not all done, need a few more mins for last few changes Chengsong parents: 59 diff changeset	572	%$\textit{ALT}$ to take a list of regular expressions as an argument
c737a0259194 sorry not all done, need a few more mins for last few changes Chengsong parents: 59 diff changeset	573	%instead of just 2 operands to reduce the nested depth of
c737a0259194 sorry not all done, need a few more mins for last few changes Chengsong parents: 59 diff changeset	574	%$\textit{ALT}$
c737a0259194 sorry not all done, need a few more mins for last few changes Chengsong parents: 59 diff changeset	575	\begin{center}
63 d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	576	\begin{tabular}{lcl}
d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	577	$r^$ & $\xrightarrow{\backslash a}$ & $r_1 = (1+0+1 \cdot b + 0 + 1 \cdot b \cdot c) \cdot r^$\\
d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	578	& $\xrightarrow{\backslash b}$ & $r_2 = (0+0+1 \cdot 1 + 0 + 1 \cdot 1 \cdot c) \cdot r^* +(0+1+0 + 0 + 0) \cdot r^*$\\
d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	579	& $\xrightarrow{\backslash c}$ & $r_3 = ((0+0+0 + 0 + 1 \cdot 1 \cdot 1) \cdot r^* + (0+0+0 + 1 + 0) \cdot r^*) + $\\
d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	580	& & $\phantom{r_3 = (} ((0+1+0 + 0 + 0) \cdot r^* + (0+0+0 + 1 + 0) \cdot r^* )$
d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	581	\end{tabular}
60 c737a0259194 sorry not all done, need a few more mins for last few changes Chengsong parents: 59 diff changeset	582	\end{center}
63 d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	583
d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	584	\noindent
70 cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	585	In case $r_3$ is nullable, we can call $mkeps$
60 c737a0259194 sorry not all done, need a few more mins for last few changes Chengsong parents: 59 diff changeset	586	to construct a parse tree for how $r_3$ matched the string $abc$.
c737a0259194 sorry not all done, need a few more mins for last few changes Chengsong parents: 59 diff changeset	587	$mkeps$ gives the following value $v_3$:
65 df2e0faccb23 moew changes Chengsong parents: 64 diff changeset	588	\begin{center}
df2e0faccb23 moew changes Chengsong parents: 64 diff changeset	589	$\Left(\Left(\Seq(\Right(\Seq(\Empty, \Seq(\Empty,\Empty))), \Stars [])))$
df2e0faccb23 moew changes Chengsong parents: 64 diff changeset	590	\end{center}
df2e0faccb23 moew changes Chengsong parents: 64 diff changeset	591	The outer $\Left(\Left(\ldots))$ tells us the leftmost nullable part of $r_3$(underlined):
70 cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	592
65 df2e0faccb23 moew changes Chengsong parents: 64 diff changeset	593	\begin{center}
df2e0faccb23 moew changes Chengsong parents: 64 diff changeset	594	$( \underline{(0+0+0 + 0 + 1 \cdot 1 \cdot 1) \cdot r^*} + (0+0+0 + 1 + 0)
df2e0faccb23 moew changes Chengsong parents: 64 diff changeset	595	\cdot r^) +((0+1+0 + 0 + 0) \cdot r^+(0+0+0 + 1 + 0) \cdot r^* ).$
df2e0faccb23 moew changes Chengsong parents: 64 diff changeset	596	\end{center}
70 cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	597
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	598	\noindent
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	599	Note that the leftmost location of term $((0+0+0 + 0 + 1 \cdot 1 \cdot
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	600	1) \cdot r^*$ (which corresponds to the initial sub-match $abc$) allows
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	601	$mkeps$ to pick it up because $mkeps$ is defined to always choose the
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	602	left one when it is nullable. In the case of this example, $abc$ is
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	603	preferred over $a$ or $ab$. This $\Left(\Left(\ldots))$ location is
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	604	naturally generated by two applications of the splitting clause
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	605
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	606	\begin{center}
65 df2e0faccb23 moew changes Chengsong parents: 64 diff changeset	607	$(r_1 \cdot r_2)\backslash c (when \; r_1 \; nullable) \, = (r_1\backslash c) \cdot r_2 \,+\, r_2\backslash c.$
70 cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	608	\end{center}
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	609
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	610	\noindent
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	611	By this clause, we put $r_1 \backslash c \cdot r_2 $ at the
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	612	$\textit{front}$ and $r_2 \backslash c$ at the $\textit{back}$. This
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	613	allows $mkeps$ to always pick up among two matches the one with a longer
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	614	initial sub-match. Removing the outside $\Left(\Left(...))$, the inside
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	615	sub-value
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	616
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	617	\begin{center}
65 df2e0faccb23 moew changes Chengsong parents: 64 diff changeset	618	$\Seq(\Right(\Seq(\Empty, \Seq(\Empty, \Empty))), \Stars [])$
70 cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	619	\end{center}
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	620
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	621	\noindent
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	622	tells us how the empty string $[]$ is matched with $(0+0+0 + 0 + 1 \cdot
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	623	1 \cdot 1) \cdot r^*$. We match $[]$ by a sequence of 2 nullable regular
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	624	expressions. The first one is an alternative, we take the rightmost
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	625	alternative---whose language contains the empty string. The second
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	626	nullable regular expression is a Kleene star. $\Stars$ tells us how it
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	627	generates the nullable regular expression: by 0 iterations to form
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	628	$\epsilon$. Now $\textit{inj}$ injects characters back and incrementally
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	629	builds a parse tree based on $v_3$. Using the value $v_3$, the character
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	630	c, and the regular expression $r_2$, we can recover how $r_2$ matched
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	631	the string $[c]$ : $\textit{inj} \; r_2 \; c \; v_3$ gives us
65 df2e0faccb23 moew changes Chengsong parents: 64 diff changeset	632	\begin{center}
df2e0faccb23 moew changes Chengsong parents: 64 diff changeset	633	$v_2 = \Left(\Seq(\Right(\Seq(\Empty, \Seq(\Empty, c))), \Stars [])),$
df2e0faccb23 moew changes Chengsong parents: 64 diff changeset	634	\end{center}
df2e0faccb23 moew changes Chengsong parents: 64 diff changeset	635	which tells us how $r_2$ matched $[c]$. After this we inject back the character $b$, and get
df2e0faccb23 moew changes Chengsong parents: 64 diff changeset	636	\begin{center}
df2e0faccb23 moew changes Chengsong parents: 64 diff changeset	637	$v_1 = \Seq(\Right(\Seq(\Empty, \Seq(b, c))), \Stars [])$
df2e0faccb23 moew changes Chengsong parents: 64 diff changeset	638	\end{center}
61 580c7b84f900 hi finished Chengsong parents: 60 diff changeset	639	for how
65 df2e0faccb23 moew changes Chengsong parents: 64 diff changeset	640	\begin{center}
df2e0faccb23 moew changes Chengsong parents: 64 diff changeset	641	$r_1= (1+0+1 \cdot b + 0 + 1 \cdot b \cdot c) \cdot r*$
df2e0faccb23 moew changes Chengsong parents: 64 diff changeset	642	\end{center}
61 580c7b84f900 hi finished Chengsong parents: 60 diff changeset	643	matched the string $bc$ before it split into 2 pieces.
580c7b84f900 hi finished Chengsong parents: 60 diff changeset	644	Finally, after injecting character $a$ back to $v_1$,
65 df2e0faccb23 moew changes Chengsong parents: 64 diff changeset	645	we get the parse tree
df2e0faccb23 moew changes Chengsong parents: 64 diff changeset	646	\begin{center}
df2e0faccb23 moew changes Chengsong parents: 64 diff changeset	647	$v_0= \Stars [\Right(\Seq(a, \Seq(b, c)))]$
df2e0faccb23 moew changes Chengsong parents: 64 diff changeset	648	\end{center}
df2e0faccb23 moew changes Chengsong parents: 64 diff changeset	649	for how $r$ matched $abc$. This completes the algorithm.
df2e0faccb23 moew changes Chengsong parents: 64 diff changeset	650
61 580c7b84f900 hi finished Chengsong parents: 60 diff changeset	651	%We omit the details of injection function, which is provided by Sulzmann and Lu's paper \cite{Sulzmann2014}.
580c7b84f900 hi finished Chengsong parents: 60 diff changeset	652	Readers might have noticed that the parse tree information
580c7b84f900 hi finished Chengsong parents: 60 diff changeset	653	is actually already available when doing derivatives.
580c7b84f900 hi finished Chengsong parents: 60 diff changeset	654	For example, immediately after the operation $\backslash a$ we know that if we want to match a string that starts with $a$,
580c7b84f900 hi finished Chengsong parents: 60 diff changeset	655	we can either take the initial match to be
65 df2e0faccb23 moew changes Chengsong parents: 64 diff changeset	656	\begin{center}
42 2a469388c989 s Chengsong parents: 41 diff changeset	657	\begin{enumerate}
2a469388c989 s Chengsong parents: 41 diff changeset	658	\item[1)] just $a$ or
2a469388c989 s Chengsong parents: 41 diff changeset	659	\item[2)] string $ab$ or
2a469388c989 s Chengsong parents: 41 diff changeset	660	\item[3)] string $abc$.
2a469388c989 s Chengsong parents: 41 diff changeset	661	\end{enumerate}
65 df2e0faccb23 moew changes Chengsong parents: 64 diff changeset	662	\end{center}
70 cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	663
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	664	\noindent
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	665	In order to differentiate between these choices, we just need to
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	666	remember their positions--$a$ is on the left, $ab$ is in the middle ,
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	667	and $abc$ is on the right. Which one of these alternatives is chosen
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	668	later does not affect their relative position because our algorithm does
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	669	not change this order. If this parsing information can be determined and
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	670	does not change because of later derivatives, there is no point in
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	671	traversing this information twice. This leads to an optimisation---if we
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	672	store the information for parse trees inside the regular expression,
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	673	update it when we do derivative on them, and collect the information
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	674	when finished with derivatives and call $mkeps$ for deciding which
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	675	branch is POSIX, we can generate the parse tree in one pass, instead of
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	676	doing the rest $n$ injections. This leads to Sulzmann and Lu's novel
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	677	idea of using bit-codes in derivatives.
42 2a469388c989 s Chengsong parents: 41 diff changeset	678
63 d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	679	In the next section, we shall focus on the bit-coded algorithm and the
d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	680	process of simplification of regular expressions. This is needed in
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	681	order to obtain \emph{fast} versions of the Brzozowski's, and Sulzmann
63 d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	682	and Lu's algorithms. This is where the PhD-project aims to advance the
d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	683	state-of-the-art.
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	684
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	685
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	686	\section{Simplification of Regular Expressions}
63 d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	687
70 cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	688	Using bitcodes to guide parsing is not a novel idea. It was applied to
63 d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	689	context free grammars and then adapted by Henglein and Nielson for
70 cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	690	efficient regular expression parsing using DFAs~\cite{nielson11bcre}.
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	691	Sulzmann and Lu took this idea of bitcodes a step further by integrating
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	692	bitcodes into derivatives. The reason why we want to use bitcodes in
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	693	this project is that we want to introduce more aggressive
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	694	simplifications in order to keep the size of derivatives small
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	695	throughout. This is because the main drawback of building successive
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	696	derivatives according to Brzozowski's definition is that they can grow
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	697	very quickly in size. This is mainly due to the fact that the derivative
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	698	operation generates often ``useless'' $\ZERO$s and $\ONE$s in
63 d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	699	derivatives. As a result, if implemented naively both algorithms by
70 cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	700	Brzozowski and by Sulzmann and Lu are excruciatingly slow. For example
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	701	when starting with the regular expression $(a + aa)^*$ and building 12
63 d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	702	successive derivatives w.r.t.~the character $a$, one obtains a
d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	703	derivative regular expression with more than 8000 nodes (when viewed as
d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	704	a tree). Operations like derivative and $\nullable$ need to traverse
d3c22f809dde more proof-reading Christian Urban <urbanc@in.tum.de> parents: 62 diff changeset	705	such trees and consequently the bigger the size of the derivative the
66 8c8c82c0515f good to proofread Chengsong parents: 65 diff changeset	706	slower the algorithm.
35 f70e9ab4e680 psuedocode added Chengsong parents: 34 diff changeset	707
70 cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	708	Fortunately, one can simplify regular expressions after each derivative
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	709	step. Various simplifications of regular expressions are possible, such
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	710	as the simplifications of $\ZERO + r$, $r + \ZERO$, $\ONE\cdot r$, $r
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	711	\cdot \ONE$, and $r + r$ to just $r$. These simplifications do not
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	712	affect the answer for whether a regular expression matches a string or
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	713	not, but fortunately also do not affect the POSIX strategy of how
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	714	regular expressions match strings---although the latter is much harder
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	715	to establish. \comment{Does not make sense.} The argument for
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	716	complicating the data structures from basic regular expressions to those
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	717	with bitcodes is that we can introduce simplification without making the
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	718	algorithm crash or overly complex to reason about. The latter is crucial
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	719	for a correctness proof. Some initial results in this regard have been
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	720	obtained in \cite{AusafDyckhoffUrban2016}.
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	721
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	722	Unfortunately, the simplification rules outlined above are not
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	723	sufficient to prevent an explosion for all regular expression. We
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	724	believe a tighter bound can be achieved that prevents an explosion in
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	725	all cases. Such a tighter bound is suggested by work of Antimirov who
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	726	proved that (partial) derivatives can be bound by the number of
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	727	characters contained in the initial regular expression
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	728	\cite{Antimirov95}. He defined the \emph{partial derivatives} of regular
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	729	expressions as follows:
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	730
52 25bbbb8b0e90 just in case of some accidents from erasing my work Chengsong parents: 51 diff changeset	731	\begin{center}
25bbbb8b0e90 just in case of some accidents from erasing my work Chengsong parents: 51 diff changeset	732	\begin{tabular}{lcl}
25bbbb8b0e90 just in case of some accidents from erasing my work Chengsong parents: 51 diff changeset	733	$\textit{pder} \; c \; 0$ & $\dn$ & $\emptyset$\\
25bbbb8b0e90 just in case of some accidents from erasing my work Chengsong parents: 51 diff changeset	734	$\textit{pder} \; c \; 1$ & $\dn$ & $\emptyset$ \\
25bbbb8b0e90 just in case of some accidents from erasing my work Chengsong parents: 51 diff changeset	735	$\textit{pder} \; c \; d$ & $\dn$ & $\textit{if} \; c \,=\, d \; \{ 1 \} \; \textit{else} \; \emptyset$ \\
25bbbb8b0e90 just in case of some accidents from erasing my work Chengsong parents: 51 diff changeset	736	$\textit{pder} \; c \; r_1+r_2$ & $\dn$ & $pder \; c \; r_1 \cup pder \; c \; r_2$ \\
70 cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	737	$\textit{pder} \; c \; r_1 \cdot r_2$ & $\dn$ & $\textit{if} \; nullable \; r_1 $\\
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	738	& & $\textit{then} \; \{ r \cdot r_2 \mid r \in pder \; c \; r_1 \} \cup pder \; c \; r_2 \;$\\
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	739	& & $\textit{else} \; \{ r \cdot r_2 \mid r \in pder \; c \; r_1 \} $ \\
52 25bbbb8b0e90 just in case of some accidents from erasing my work Chengsong parents: 51 diff changeset	740	$\textit{pder} \; c \; r^$ & $\dn$ & $ \{ r' \cdot r^ \mid r' \in pder \; c \; r \} $ \\
25bbbb8b0e90 just in case of some accidents from erasing my work Chengsong parents: 51 diff changeset	741	\end{tabular}
25bbbb8b0e90 just in case of some accidents from erasing my work Chengsong parents: 51 diff changeset	742	\end{center}
70 cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	743
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	744	\noindent
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	745	A partial derivative of a regular expression $r$ is essentially a set of
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	746	regular expressions that are either $r$'s children expressions or a
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	747	concatenation of them. Antimirov has proved a tight bound of the size of
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	748	partial derivatives. \comment{That looks too preliminary to me.} Roughly
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	749	speaking the size will not exceed the fourth power of the number of
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	750	nodes in that regular expression. \comment{Improve: which
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	751	simplifications?}Interestingly, we observed from experiment that after
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	752	the simplification step, our regular expression has the same size or is
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	753	smaller than the partial derivatives. This allows us to prove a tight
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	754	bound on the size of regular expression during the running time of the
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	755	algorithm if we can establish the connection between our simplification
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	756	rules and partial derivatives.
35 f70e9ab4e680 psuedocode added Chengsong parents: 34 diff changeset	757
f70e9ab4e680 psuedocode added Chengsong parents: 34 diff changeset	758	%We believe, and have generated test
f70e9ab4e680 psuedocode added Chengsong parents: 34 diff changeset	759	%data, that a similar bound can be obtained for the derivatives in
f70e9ab4e680 psuedocode added Chengsong parents: 34 diff changeset	760	%Sulzmann and Lu's algorithm. Let us give some details about this next.
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	761
43 52a3cec0a5c7 s? Chengsong parents: 42 diff changeset	762	Bit-codes look like this:
67 e974c5477a8c anotehr update Chengsong parents: 66 diff changeset	763	\begin{center}
e974c5477a8c anotehr update Chengsong parents: 66 diff changeset	764	$b ::= S \mid Z \; \;\;
43 52a3cec0a5c7 s? Chengsong parents: 42 diff changeset	765	bs ::= [] \mid b:bs
67 e974c5477a8c anotehr update Chengsong parents: 66 diff changeset	766	$
e974c5477a8c anotehr update Chengsong parents: 66 diff changeset	767	\end{center}
68 3c73d95cbfef more upd Chengsong parents: 67 diff changeset	768	They are just a string of bits, the names $S$ and $Z$ here are quite arbitrary, we can use 0 and 1 or any other set of binary symbols to substitute them. Bit-codes(or bit-sequences) are a compact form of parse trees.
43 52a3cec0a5c7 s? Chengsong parents: 42 diff changeset	769	Here is how values and bit-codes are related:
52a3cec0a5c7 s? Chengsong parents: 42 diff changeset	770	Bitcodes are essentially incomplete values.
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	771	This can be straightforwardly seen in the following transformation:
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	772	\begin{center}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	773	\begin{tabular}{lcl}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	774	$\textit{code}(\Empty)$ & $\dn$ & $[]$\\
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	775	$\textit{code}(\Char\,c)$ & $\dn$ & $[]$\\
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	776	$\textit{code}(\Left\,v)$ & $\dn$ & $\Z :: code(v)$\\
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	777	$\textit{code}(\Right\,v)$ & $\dn$ & $\S :: code(v)$\\
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	778	$\textit{code}(\Seq\,v_1\,v_2)$ & $\dn$ & $code(v_1) \,@\, code(v_2)$\\
68 3c73d95cbfef more upd Chengsong parents: 67 diff changeset	779	$\textit{code}(\Stars\,[])$ & $\dn$ & $[\Z]$\\
3c73d95cbfef more upd Chengsong parents: 67 diff changeset	780	$\textit{code}(\Stars\,(v\!::\!vs))$ & $\dn$ & $\S :: code(v) \;@\;
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	781	code(\Stars\,vs)$
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	782	\end{tabular}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	783	\end{center}
70 cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	784
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	785	Here code encodes a value into a bit-sequence by converting Left into
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	786	$\Z$, Right into $\S$, the start point of a non-empty star iteration
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	787	into $\S$, and the border where a local star terminates into $\Z$. This
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	788	conversion is apparently lossy, as it throws away the character
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	789	information, and does not decode the boundary between the two operands
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	790	of the sequence constructor. Moreover, with only the bitcode we cannot
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	791	even tell whether the $\S$s and $\Z$s are for $\Left/\Right$ or
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	792	$\Stars$. The reason for choosing this compact way of storing
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	793	information is that the relatively small size of bits can be easily
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	794	moved around. In order to recover the bitcode back into values, we will
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	795	need the regular expression as the extra information and decode it back
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	796	into value:\\
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	797
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	798
37 17d8e7599a01 new changes Chengsong parents: 36 diff changeset	799	%\begin{definition}[Bitdecoding of Values]\mbox{}
36 97c9ac95194d chages Chengsong parents: 35 diff changeset	800	\begin{center}
97c9ac95194d chages Chengsong parents: 35 diff changeset	801	\begin{tabular}{@{}l@{\hspace{1mm}}c@{\hspace{1mm}}l@{}}
97c9ac95194d chages Chengsong parents: 35 diff changeset	802	$\textit{decode}'\,bs\,(\ONE)$ & $\dn$ & $(\Empty, bs)$\\
97c9ac95194d chages Chengsong parents: 35 diff changeset	803	$\textit{decode}'\,bs\,(c)$ & $\dn$ & $(\Char\,c, bs)$\\
97c9ac95194d chages Chengsong parents: 35 diff changeset	804	$\textit{decode}'\,(\Z\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
97c9ac95194d chages Chengsong parents: 35 diff changeset	805	$\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}\;
97c9ac95194d chages Chengsong parents: 35 diff changeset	806	(\Left\,v, bs_1)$\\
97c9ac95194d chages Chengsong parents: 35 diff changeset	807	$\textit{decode}'\,(\S\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
97c9ac95194d chages Chengsong parents: 35 diff changeset	808	$\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_2\;\textit{in}\;
97c9ac95194d chages Chengsong parents: 35 diff changeset	809	(\Right\,v, bs_1)$\\
97c9ac95194d chages Chengsong parents: 35 diff changeset	810	$\textit{decode}'\,bs\;(r_1\cdot r_2)$ & $\dn$ &
97c9ac95194d chages Chengsong parents: 35 diff changeset	811	$\textit{let}\,(v_1, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}$\\
97c9ac95194d chages Chengsong parents: 35 diff changeset	812	& & $\textit{let}\,(v_2, bs_2) = \textit{decode}'\,bs_1\,r_2$\\
97c9ac95194d chages Chengsong parents: 35 diff changeset	813	& & \hspace{35mm}$\textit{in}\;(\Seq\,v_1\,v_2, bs_2)$\\
97c9ac95194d chages Chengsong parents: 35 diff changeset	814	$\textit{decode}'\,(\Z\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\
97c9ac95194d chages Chengsong parents: 35 diff changeset	815	$\textit{decode}'\,(\S\!::\!bs)\,(r^*)$ & $\dn$ &
97c9ac95194d chages Chengsong parents: 35 diff changeset	816	$\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r\;\textit{in}$\\
97c9ac95194d chages Chengsong parents: 35 diff changeset	817	& & $\textit{let}\,(\Stars\,vs, bs_2) = \textit{decode}'\,bs_1\,r^*$\\
97c9ac95194d chages Chengsong parents: 35 diff changeset	818	& & \hspace{35mm}$\textit{in}\;(\Stars\,v\!::\!vs, bs_2)$\bigskip\\
97c9ac95194d chages Chengsong parents: 35 diff changeset	819
97c9ac95194d chages Chengsong parents: 35 diff changeset	820	$\textit{decode}\,bs\,r$ & $\dn$ &
97c9ac95194d chages Chengsong parents: 35 diff changeset	821	$\textit{let}\,(v, bs') = \textit{decode}'\,bs\,r\;\textit{in}$\\
97c9ac95194d chages Chengsong parents: 35 diff changeset	822	& & $\textit{if}\;bs' = []\;\textit{then}\;\textit{Some}\,v\;
97c9ac95194d chages Chengsong parents: 35 diff changeset	823	\textit{else}\;\textit{None}$
97c9ac95194d chages Chengsong parents: 35 diff changeset	824	\end{tabular}
97c9ac95194d chages Chengsong parents: 35 diff changeset	825	\end{center}
37 17d8e7599a01 new changes Chengsong parents: 36 diff changeset	826	%\end{definition}
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	827
43 52a3cec0a5c7 s? Chengsong parents: 42 diff changeset	828
70 cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	829	Sulzmann and Lu's integrated the bitcodes into annotated regular
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	830	expressions by attaching them to the head of every substructure of a
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	831	regular expression\cite{Sulzmann2014}. They are defined by the following
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	832	grammar:
43 52a3cec0a5c7 s? Chengsong parents: 42 diff changeset	833
52a3cec0a5c7 s? Chengsong parents: 42 diff changeset	834	\begin{center}
52a3cec0a5c7 s? Chengsong parents: 42 diff changeset	835	\begin{tabular}{lcl}
52a3cec0a5c7 s? Chengsong parents: 42 diff changeset	836	$\textit{a}$ & $::=$ & $\textit{ZERO}$\\
52a3cec0a5c7 s? Chengsong parents: 42 diff changeset	837	& $\mid$ & $\textit{ONE}\;\;bs$\\
52a3cec0a5c7 s? Chengsong parents: 42 diff changeset	838	& $\mid$ & $\textit{CHAR}\;\;bs\,c$\\
52a3cec0a5c7 s? Chengsong parents: 42 diff changeset	839	& $\mid$ & $\textit{ALTS}\;\;bs\,as$\\
52a3cec0a5c7 s? Chengsong parents: 42 diff changeset	840	& $\mid$ & $\textit{SEQ}\;\;bs\,a_1\,a_2$\\
52a3cec0a5c7 s? Chengsong parents: 42 diff changeset	841	& $\mid$ & $\textit{STAR}\;\;bs\,a$
52a3cec0a5c7 s? Chengsong parents: 42 diff changeset	842	\end{tabular}
52a3cec0a5c7 s? Chengsong parents: 42 diff changeset	843	\end{center}
52a3cec0a5c7 s? Chengsong parents: 42 diff changeset	844
52a3cec0a5c7 s? Chengsong parents: 42 diff changeset	845	\noindent
52a3cec0a5c7 s? Chengsong parents: 42 diff changeset	846	where $bs$ stands for bitsequences, and $as$ (in \textit{ALTS}) for a
52a3cec0a5c7 s? Chengsong parents: 42 diff changeset	847	list of annotated regular expressions. These bitsequences encode
52a3cec0a5c7 s? Chengsong parents: 42 diff changeset	848	information about the (POSIX) value that should be generated by the
52a3cec0a5c7 s? Chengsong parents: 42 diff changeset	849	Sulzmann and Lu algorithm.
52a3cec0a5c7 s? Chengsong parents: 42 diff changeset	850
70 cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	851	To do lexing using annotated regular expressions, we shall first
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	852	transform the usual (un-annotated) regular expressions into annotated
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	853	regular expressions. This operation is called \emph{internalisation} and
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	854	defined as follows:
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	855
37 17d8e7599a01 new changes Chengsong parents: 36 diff changeset	856	%\begin{definition}
36 97c9ac95194d chages Chengsong parents: 35 diff changeset	857	\begin{center}
97c9ac95194d chages Chengsong parents: 35 diff changeset	858	\begin{tabular}{lcl}
97c9ac95194d chages Chengsong parents: 35 diff changeset	859	$(\ZERO)^\uparrow$ & $\dn$ & $\textit{ZERO}$\\
97c9ac95194d chages Chengsong parents: 35 diff changeset	860	$(\ONE)^\uparrow$ & $\dn$ & $\textit{ONE}\,[]$\\
97c9ac95194d chages Chengsong parents: 35 diff changeset	861	$(c)^\uparrow$ & $\dn$ & $\textit{CHAR}\,[]\,c$\\
97c9ac95194d chages Chengsong parents: 35 diff changeset	862	$(r_1 + r_2)^\uparrow$ & $\dn$ &
97c9ac95194d chages Chengsong parents: 35 diff changeset	863	$\textit{ALT}\;[]\,(\textit{fuse}\,[\Z]\,r_1^\uparrow)\,
97c9ac95194d chages Chengsong parents: 35 diff changeset	864	(\textit{fuse}\,[\S]\,r_2^\uparrow)$\\
97c9ac95194d chages Chengsong parents: 35 diff changeset	865	$(r_1\cdot r_2)^\uparrow$ & $\dn$ &
97c9ac95194d chages Chengsong parents: 35 diff changeset	866	$\textit{SEQ}\;[]\,r_1^\uparrow\,r_2^\uparrow$\\
97c9ac95194d chages Chengsong parents: 35 diff changeset	867	$(r^*)^\uparrow$ & $\dn$ &
97c9ac95194d chages Chengsong parents: 35 diff changeset	868	$\textit{STAR}\;[]\,r^\uparrow$\\
97c9ac95194d chages Chengsong parents: 35 diff changeset	869	\end{tabular}
97c9ac95194d chages Chengsong parents: 35 diff changeset	870	\end{center}
37 17d8e7599a01 new changes Chengsong parents: 36 diff changeset	871	%\end{definition}
44 4d674a971852 another changes. have written more. but havent typed them. tomorrow will continue. Chengsong parents: 43 diff changeset	872
70 cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	873	\noindent
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	874	In the fourth clause, $fuse$ is an auxiliary function that helps to attach bits to the
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	875	front of an annotated regular expression. Its definition is as follows:
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	876
44 4d674a971852 another changes. have written more. but havent typed them. tomorrow will continue. Chengsong parents: 43 diff changeset	877	\begin{center}
4d674a971852 another changes. have written more. but havent typed them. tomorrow will continue. Chengsong parents: 43 diff changeset	878	\begin{tabular}{lcl}
4d674a971852 another changes. have written more. but havent typed them. tomorrow will continue. Chengsong parents: 43 diff changeset	879	$\textit{fuse}\,bs\,(\textit{ZERO})$ & $\dn$ & $\textit{ZERO}$\\
4d674a971852 another changes. have written more. but havent typed them. tomorrow will continue. Chengsong parents: 43 diff changeset	880	$\textit{fuse}\,bs\,(\textit{ONE}\,bs')$ & $\dn$ &
4d674a971852 another changes. have written more. but havent typed them. tomorrow will continue. Chengsong parents: 43 diff changeset	881	$\textit{ONE}\,(bs\,@\,bs')$\\
4d674a971852 another changes. have written more. but havent typed them. tomorrow will continue. Chengsong parents: 43 diff changeset	882	$\textit{fuse}\,bs\,(\textit{CHAR}\,bs'\,c)$ & $\dn$ &
4d674a971852 another changes. have written more. but havent typed them. tomorrow will continue. Chengsong parents: 43 diff changeset	883	$\textit{CHAR}\,(bs\,@\,bs')\,c$\\
4d674a971852 another changes. have written more. but havent typed them. tomorrow will continue. Chengsong parents: 43 diff changeset	884	$\textit{fuse}\,bs\,(\textit{ALT}\,bs'\,a_1\,a_2)$ & $\dn$ &
4d674a971852 another changes. have written more. but havent typed them. tomorrow will continue. Chengsong parents: 43 diff changeset	885	$\textit{ALT}\,(bs\,@\,bs')\,a_1\,a_2$\\
4d674a971852 another changes. have written more. but havent typed them. tomorrow will continue. Chengsong parents: 43 diff changeset	886	$\textit{fuse}\,bs\,(\textit{SEQ}\,bs'\,a_1\,a_2)$ & $\dn$ &
4d674a971852 another changes. have written more. but havent typed them. tomorrow will continue. Chengsong parents: 43 diff changeset	887	$\textit{SEQ}\,(bs\,@\,bs')\,a_1\,a_2$\\
4d674a971852 another changes. have written more. but havent typed them. tomorrow will continue. Chengsong parents: 43 diff changeset	888	$\textit{fuse}\,bs\,(\textit{STAR}\,bs'\,a)$ & $\dn$ &
4d674a971852 another changes. have written more. but havent typed them. tomorrow will continue. Chengsong parents: 43 diff changeset	889	$\textit{STAR}\,(bs\,@\,bs')\,a$
4d674a971852 another changes. have written more. but havent typed them. tomorrow will continue. Chengsong parents: 43 diff changeset	890	\end{tabular}
4d674a971852 another changes. have written more. but havent typed them. tomorrow will continue. Chengsong parents: 43 diff changeset	891	\end{center}
4d674a971852 another changes. have written more. but havent typed them. tomorrow will continue. Chengsong parents: 43 diff changeset	892
70 cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	893	\noindent
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	894	After internalise we do successive derivative operations on the
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	895	annotated regular expression. This derivative operation is the same as
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	896	what we previously have for the simple regular expressions, except that
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	897	we take special care of the bits :\\
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	898
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	899	%\begin{definition}{bder}
36 97c9ac95194d chages Chengsong parents: 35 diff changeset	900	\begin{center}
97c9ac95194d chages Chengsong parents: 35 diff changeset	901	\begin{tabular}{@{}lcl@{}}
97c9ac95194d chages Chengsong parents: 35 diff changeset	902	$(\textit{ZERO})\backslash c$ & $\dn$ & $\textit{ZERO}$\\
97c9ac95194d chages Chengsong parents: 35 diff changeset	903	$(\textit{ONE}\;bs)\backslash c$ & $\dn$ & $\textit{ZERO}$\\
97c9ac95194d chages Chengsong parents: 35 diff changeset	904	$(\textit{CHAR}\;bs\,d)\backslash c$ & $\dn$ &
97c9ac95194d chages Chengsong parents: 35 diff changeset	905	$\textit{if}\;c=d\; \;\textit{then}\;
97c9ac95194d chages Chengsong parents: 35 diff changeset	906	\textit{ONE}\;bs\;\textit{else}\;\textit{ZERO}$\\
97c9ac95194d chages Chengsong parents: 35 diff changeset	907	$(\textit{ALT}\;bs\,a_1\,a_2)\backslash c$ & $\dn$ &
97c9ac95194d chages Chengsong parents: 35 diff changeset	908	$\textit{ALT}\,bs\,(a_1\backslash c)\,(a_2\backslash c)$\\
97c9ac95194d chages Chengsong parents: 35 diff changeset	909	$(\textit{SEQ}\;bs\,a_1\,a_2)\backslash c$ & $\dn$ &
97c9ac95194d chages Chengsong parents: 35 diff changeset	910	$\textit{if}\;\textit{bnullable}\,a_1$\\
97c9ac95194d chages Chengsong parents: 35 diff changeset	911	& &$\textit{then}\;\textit{ALT}\,bs\,(\textit{SEQ}\,[]\,(a_1\backslash c)\,a_2)$\\
97c9ac95194d chages Chengsong parents: 35 diff changeset	912	& &$\phantom{\textit{then}\;\textit{ALT}\,bs\,}(\textit{fuse}\,(\textit{bmkeps}\,a_1)\,(a_2\backslash c))$\\
97c9ac95194d chages Chengsong parents: 35 diff changeset	913	& &$\textit{else}\;\textit{SEQ}\,bs\,(a_1\backslash c)\,a_2$\\
97c9ac95194d chages Chengsong parents: 35 diff changeset	914	$(\textit{STAR}\,bs\,a)\backslash c$ & $\dn$ &
97c9ac95194d chages Chengsong parents: 35 diff changeset	915	$\textit{SEQ}\;bs\,(\textit{fuse}\, [\Z] (r\backslash c))\,
97c9ac95194d chages Chengsong parents: 35 diff changeset	916	(\textit{STAR}\,[]\,r)$
97c9ac95194d chages Chengsong parents: 35 diff changeset	917	\end{tabular}
97c9ac95194d chages Chengsong parents: 35 diff changeset	918	\end{center}
37 17d8e7599a01 new changes Chengsong parents: 36 diff changeset	919	%\end{definition}
44 4d674a971852 another changes. have written more. but havent typed them. tomorrow will continue. Chengsong parents: 43 diff changeset	920	For instance, when we unfold $STAR \; bs \; a$ into a sequence, we attach an additional bit Z to the front of $r \backslash c$ to indicate that there is one more star iteration.
4d674a971852 another changes. have written more. but havent typed them. tomorrow will continue. Chengsong parents: 43 diff changeset	921	The other example, the $SEQ$ clause is more subtle-- when $a_1$ is $bnullable$(here bnullable is exactly the same as nullable, except that it is for annotated regular expressions, therefore we omit the definition).
4d674a971852 another changes. have written more. but havent typed them. tomorrow will continue. Chengsong parents: 43 diff changeset	922	Assume that $bmkeps$ correctly extracts the bitcode for how $a_1$ matches the string prior to character c(more on this later), then the right branch of $ALTS$, which is $fuse \; bmkeps \; a_1 (a_2 \backslash c)$ will collapse the regular expression $a_1$(as it has already been fully matched) and store the parsing information at the head of the regular expression $a_2 \backslash c$ by fusing to it. The bitsequence $bs$, which was initially attached to the head of $SEQ$, has now been elevated to the top-level of ALT,
4d674a971852 another changes. have written more. but havent typed them. tomorrow will continue. Chengsong parents: 43 diff changeset	923	as this information will be needed whichever way the $SEQ$ is matched--no matter whether c belongs to $a_1$ or $ a_2$.
4d674a971852 another changes. have written more. but havent typed them. tomorrow will continue. Chengsong parents: 43 diff changeset	924	After carefully doing these derivatives and maintaining all the parsing information, we complete the parsing by collecting the bits using a special $mkeps$ function for annotated regular expressions--$bmkeps$:
4d674a971852 another changes. have written more. but havent typed them. tomorrow will continue. Chengsong parents: 43 diff changeset	925
4d674a971852 another changes. have written more. but havent typed them. tomorrow will continue. Chengsong parents: 43 diff changeset	926
37 17d8e7599a01 new changes Chengsong parents: 36 diff changeset	927	%\begin{definition}[\textit{bmkeps}]\mbox{}
36 97c9ac95194d chages Chengsong parents: 35 diff changeset	928	\begin{center}
97c9ac95194d chages Chengsong parents: 35 diff changeset	929	\begin{tabular}{lcl}
97c9ac95194d chages Chengsong parents: 35 diff changeset	930	$\textit{bmkeps}\,(\textit{ONE}\,bs)$ & $\dn$ & $bs$\\
97c9ac95194d chages Chengsong parents: 35 diff changeset	931	$\textit{bmkeps}\,(\textit{ALT}\,bs\,a_1\,a_2)$ & $\dn$ &
97c9ac95194d chages Chengsong parents: 35 diff changeset	932	$\textit{if}\;\textit{bnullable}\,a_1$\\
97c9ac95194d chages Chengsong parents: 35 diff changeset	933	& &$\textit{then}\;bs\,@\,\textit{bmkeps}\,a_1$\\
97c9ac95194d chages Chengsong parents: 35 diff changeset	934	& &$\textit{else}\;bs\,@\,\textit{bmkeps}\,a_2$\\
97c9ac95194d chages Chengsong parents: 35 diff changeset	935	$\textit{bmkeps}\,(\textit{SEQ}\,bs\,a_1\,a_2)$ & $\dn$ &
97c9ac95194d chages Chengsong parents: 35 diff changeset	936	$bs \,@\,\textit{bmkeps}\,a_1\,@\, \textit{bmkeps}\,a_2$\\
97c9ac95194d chages Chengsong parents: 35 diff changeset	937	$\textit{bmkeps}\,(\textit{STAR}\,bs\,a)$ & $\dn$ &
97c9ac95194d chages Chengsong parents: 35 diff changeset	938	$bs \,@\, [\S]$
97c9ac95194d chages Chengsong parents: 35 diff changeset	939	\end{tabular}
97c9ac95194d chages Chengsong parents: 35 diff changeset	940	\end{center}
37 17d8e7599a01 new changes Chengsong parents: 36 diff changeset	941	%\end{definition}
70 cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	942
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	943	\noindent
44 4d674a971852 another changes. have written more. but havent typed them. tomorrow will continue. Chengsong parents: 43 diff changeset	944	This function completes the parse tree information by
45 60cb82639691 spell check Christian Urban <urbanc@in.tum.de> parents: 44 diff changeset	945	travelling along the path on the regular expression that corresponds to a POSIX value snd collect all the bits, and
44 4d674a971852 another changes. have written more. but havent typed them. tomorrow will continue. Chengsong parents: 43 diff changeset	946	using S to indicate the end of star iterations. If we take the bitsproduced by $bmkeps$ and decode it,
4d674a971852 another changes. have written more. but havent typed them. tomorrow will continue. Chengsong parents: 43 diff changeset	947	we get the parse tree we need, the working flow looks like this:\\
37 17d8e7599a01 new changes Chengsong parents: 36 diff changeset	948	\begin{center}
17d8e7599a01 new changes Chengsong parents: 36 diff changeset	949	\begin{tabular}{lcl}
17d8e7599a01 new changes Chengsong parents: 36 diff changeset	950	$\textit{blexer}\;r\,s$ & $\dn$ &
17d8e7599a01 new changes Chengsong parents: 36 diff changeset	951	$\textit{let}\;a = (r^\uparrow)\backslash s\;\textit{in}$\\
17d8e7599a01 new changes Chengsong parents: 36 diff changeset	952	& & $\;\;\textit{if}\; \textit{bnullable}(a)$\\
17d8e7599a01 new changes Chengsong parents: 36 diff changeset	953	& & $\;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,a)\,r$\\
17d8e7599a01 new changes Chengsong parents: 36 diff changeset	954	& & $\;\;\textit{else}\;\textit{None}$
17d8e7599a01 new changes Chengsong parents: 36 diff changeset	955	\end{tabular}
17d8e7599a01 new changes Chengsong parents: 36 diff changeset	956	\end{center}
53 3ec403f650a8 readable version Chengsong parents: 52 diff changeset	957	Here $(r^\uparrow)\backslash s$ is similar to what we have previously defined for
3ec403f650a8 readable version Chengsong parents: 52 diff changeset	958	$r\backslash s$.
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	959
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	960	The main point of the bitsequences and annotated regular expressions
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	961	is that we can apply rather aggressive (in terms of size)
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	962	simplification rules in order to keep derivatives small.
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	963
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	964	We have
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	965	developed such ``aggressive'' simplification rules and generated test
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	966	data that show that the expected bound can be achieved. Obviously we
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	967	could only partially cover the search space as there are infinitely
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	968	many regular expressions and strings. One modification we introduced
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	969	is to allow a list of annotated regular expressions in the
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	970	\textit{ALTS} constructor. This allows us to not just delete
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	971	unnecessary $\ZERO$s and $\ONE$s from regular expressions, but also
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	972	unnecessary ``copies'' of regular expressions (very similar to
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	973	simplifying $r + r$ to just $r$, but in a more general
35 f70e9ab4e680 psuedocode added Chengsong parents: 34 diff changeset	974	setting).
49 d256aabe88f3 still wont comiple hhh Chengsong parents: 48 diff changeset	975	Another modification is that we use simplification rules
d256aabe88f3 still wont comiple hhh Chengsong parents: 48 diff changeset	976	inspired by Antimirov's work on partial derivatives. They maintain the
d256aabe88f3 still wont comiple hhh Chengsong parents: 48 diff changeset	977	idea that only the first ``copy'' of a regular expression in an
d256aabe88f3 still wont comiple hhh Chengsong parents: 48 diff changeset	978	alternative contributes to the calculation of a POSIX value. All
d256aabe88f3 still wont comiple hhh Chengsong parents: 48 diff changeset	979	subsequent copies can be pruned from the regular expression.
d256aabe88f3 still wont comiple hhh Chengsong parents: 48 diff changeset	980
52 25bbbb8b0e90 just in case of some accidents from erasing my work Chengsong parents: 51 diff changeset	981	A recursive definition of simplification function that looks similar to scala code is given below:\\
25bbbb8b0e90 just in case of some accidents from erasing my work Chengsong parents: 51 diff changeset	982	\begin{center}
25bbbb8b0e90 just in case of some accidents from erasing my work Chengsong parents: 51 diff changeset	983	\begin{tabular}{@{}lcl@{}}
25bbbb8b0e90 just in case of some accidents from erasing my work Chengsong parents: 51 diff changeset	984	$\textit{simp} \; a$ & $\dn$ & $\textit{a} \; \textit{if} \; a = (\textit{ONE} \; bs) \; or\; (\textit{CHAR} \, bs \; c) \; or\; (\textit{STAR}\; bs\; a_1)$\\
25bbbb8b0e90 just in case of some accidents from erasing my work Chengsong parents: 51 diff changeset	985
25bbbb8b0e90 just in case of some accidents from erasing my work Chengsong parents: 51 diff changeset	986	$\textit{simp} \; \textit{SEQ}\;bs\,a_1\,a_2$ & $\dn$ & $ (\textit{simp} \; a_1, \textit{simp} \; a_2) \; \textit{match} $ \\
25bbbb8b0e90 just in case of some accidents from erasing my work Chengsong parents: 51 diff changeset	987	&&$\textit{case} \; (0, \_) \Rightarrow 0$ \\
25bbbb8b0e90 just in case of some accidents from erasing my work Chengsong parents: 51 diff changeset	988	&&$ \textit{case} \; (\_, 0) \Rightarrow 0$ \\
25bbbb8b0e90 just in case of some accidents from erasing my work Chengsong parents: 51 diff changeset	989	&&$ \textit{case} \; (1, a_2') \Rightarrow \textit{fuse} \; bs \; a_2'$ \\
25bbbb8b0e90 just in case of some accidents from erasing my work Chengsong parents: 51 diff changeset	990	&&$ \textit{case} \; (a_1', 1) \Rightarrow \textit{fuse} \; bs \; a_1'$ \\
25bbbb8b0e90 just in case of some accidents from erasing my work Chengsong parents: 51 diff changeset	991	&&$ \textit{case} \; (a_1', a_2') \Rightarrow \textit{SEQ} \; bs \; a_1' \; a_2'$ \\
25bbbb8b0e90 just in case of some accidents from erasing my work Chengsong parents: 51 diff changeset	992
25bbbb8b0e90 just in case of some accidents from erasing my work Chengsong parents: 51 diff changeset	993	$\textit{simp} \; \textit{ALT}\;bs\,as$ & $\dn$ & $\textit{ distinct}( \textit{flatten} ( \textit{map simp as})) \; \textit{match} $ \\
25bbbb8b0e90 just in case of some accidents from erasing my work Chengsong parents: 51 diff changeset	994	&&$\textit{case} \; [] \Rightarrow 0$ \\
25bbbb8b0e90 just in case of some accidents from erasing my work Chengsong parents: 51 diff changeset	995	&&$ \textit{case} \; a :: [] \Rightarrow \textit{fuse bs a}$ \\
25bbbb8b0e90 just in case of some accidents from erasing my work Chengsong parents: 51 diff changeset	996	&&$ \textit{case} \; as' \Rightarrow \textit{ALT bs as'}$
25bbbb8b0e90 just in case of some accidents from erasing my work Chengsong parents: 51 diff changeset	997	\end{tabular}
25bbbb8b0e90 just in case of some accidents from erasing my work Chengsong parents: 51 diff changeset	998	\end{center}
47 d2a7e87ea6e1 will not compile, just text Chengsong parents: 46 diff changeset	999
d2a7e87ea6e1 will not compile, just text Chengsong parents: 46 diff changeset	1000	The simplification does a pattern matching on the regular expression. When it detected that
d2a7e87ea6e1 will not compile, just text Chengsong parents: 46 diff changeset	1001	the regular expression is an alternative or sequence, it will try to simplify its children regular expressions
d2a7e87ea6e1 will not compile, just text Chengsong parents: 46 diff changeset	1002	recursively and then see if one of the children turn into 0 or 1, which might trigger further simplification
d2a7e87ea6e1 will not compile, just text Chengsong parents: 46 diff changeset	1003	at the current level. The most involved part is the ALTS clause, where we use two auxiliary functions
d2a7e87ea6e1 will not compile, just text Chengsong parents: 46 diff changeset	1004	flatten and distinct to open up nested ALT and reduce as many duplicates as possible.
48 bbefcf7351f2 still will not compile Chengsong parents: 47 diff changeset	1005	Function distinct keeps the first occurring copy only and remove all later ones when detected duplicates.
bbefcf7351f2 still will not compile Chengsong parents: 47 diff changeset	1006	Function flatten opens up nested ALT. Its recursive definition is given below:
53 3ec403f650a8 readable version Chengsong parents: 52 diff changeset	1007	\begin{center}
3ec403f650a8 readable version Chengsong parents: 52 diff changeset	1008	\begin{tabular}{@{}lcl@{}}
70 cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	1009	$\textit{flatten} \; (\textit{ALT}\;bs\,as) :: as'$ & $\dn$ & $(\textit{map} \;
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	1010	(\textit{fuse}\;bs)\; \textit{as}) \; @ \; \textit{flatten} \; as' $ \\
53 3ec403f650a8 readable version Chengsong parents: 52 diff changeset	1011	$\textit{flatten} \; \textit{ZERO} :: as'$ & $\dn$ & $ \textit{flatten} \; as' $ \\
70 cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	1012	$\textit{flatten} \; a :: as'$ & $\dn$ & $a :: \textit{flatten} \; as'$ \quad(otherwise)
53 3ec403f650a8 readable version Chengsong parents: 52 diff changeset	1013	\end{tabular}
3ec403f650a8 readable version Chengsong parents: 52 diff changeset	1014	\end{center}
3ec403f650a8 readable version Chengsong parents: 52 diff changeset	1015
70 cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	1016	\noindent
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	1017	\comment{No: functional flatten does not remove ZEROs}Here flatten behaves like the traditional functional programming flatten function,
48 bbefcf7351f2 still will not compile Chengsong parents: 47 diff changeset	1018	what it does is basically removing parentheses like changing $a+(b+c)$ into $a+b+c$.
47 d2a7e87ea6e1 will not compile, just text Chengsong parents: 46 diff changeset	1019
53 3ec403f650a8 readable version Chengsong parents: 52 diff changeset	1020	Suppose we apply simplification after each derivative step,
3ec403f650a8 readable version Chengsong parents: 52 diff changeset	1021	and view these two operations as an atomic one: $a \backslash_{simp} c \dn \textit{simp}(a \backslash c)$.
3ec403f650a8 readable version Chengsong parents: 52 diff changeset	1022	Then we can use the previous natural extension from derivative w.r.t character to derivative w.r.t string:
3ec403f650a8 readable version Chengsong parents: 52 diff changeset	1023
3ec403f650a8 readable version Chengsong parents: 52 diff changeset	1024	\begin{center}
3ec403f650a8 readable version Chengsong parents: 52 diff changeset	1025	\begin{tabular}{lcl}
3ec403f650a8 readable version Chengsong parents: 52 diff changeset	1026	$r \backslash_{simp} (c\!::\!s) $ & $\dn$ & $(r \backslash_{simp} c) \backslash_{simp} s$ \\
3ec403f650a8 readable version Chengsong parents: 52 diff changeset	1027	$r \backslash [\,] $ & $\dn$ & $r$
3ec403f650a8 readable version Chengsong parents: 52 diff changeset	1028	\end{tabular}
3ec403f650a8 readable version Chengsong parents: 52 diff changeset	1029	\end{center}
3ec403f650a8 readable version Chengsong parents: 52 diff changeset	1030
3ec403f650a8 readable version Chengsong parents: 52 diff changeset	1031	we get an optimized version of the algorithm:
3ec403f650a8 readable version Chengsong parents: 52 diff changeset	1032	\begin{center}
3ec403f650a8 readable version Chengsong parents: 52 diff changeset	1033	\begin{tabular}{lcl}
3ec403f650a8 readable version Chengsong parents: 52 diff changeset	1034	$\textit{blexer\_simp}\;r\,s$ & $\dn$ &
3ec403f650a8 readable version Chengsong parents: 52 diff changeset	1035	$\textit{let}\;a = (r^\uparrow)\backslash_{simp} s\;\textit{in}$\\
3ec403f650a8 readable version Chengsong parents: 52 diff changeset	1036	& & $\;\;\textit{if}\; \textit{bnullable}(a)$\\
3ec403f650a8 readable version Chengsong parents: 52 diff changeset	1037	& & $\;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,a)\,r$\\
3ec403f650a8 readable version Chengsong parents: 52 diff changeset	1038	& & $\;\;\textit{else}\;\textit{None}$
3ec403f650a8 readable version Chengsong parents: 52 diff changeset	1039	\end{tabular}
3ec403f650a8 readable version Chengsong parents: 52 diff changeset	1040	\end{center}
48 bbefcf7351f2 still will not compile Chengsong parents: 47 diff changeset	1041
bbefcf7351f2 still will not compile Chengsong parents: 47 diff changeset	1042	This algorithm effectively keeps the regular expression size small, for example,
bbefcf7351f2 still will not compile Chengsong parents: 47 diff changeset	1043	with this simplification our previous $(a + aa)^*$ example's 8000 nodes will be reduced to only 6 and stay constant, however long the input string is.
35 f70e9ab4e680 psuedocode added Chengsong parents: 34 diff changeset	1044
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1045
35 f70e9ab4e680 psuedocode added Chengsong parents: 34 diff changeset	1046
70 cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	1047	\section{Current Work}
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	1048
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	1049	We are currently engaged in two tasks related to this algorithm.
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	1050
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	1051	\begin{itemize}
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	1052	\item
49 d256aabe88f3 still wont comiple hhh Chengsong parents: 48 diff changeset	1053	The first one is proving that our simplification rules
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1054	actually do not affect the POSIX value that should be generated by the
49 d256aabe88f3 still wont comiple hhh Chengsong parents: 48 diff changeset	1055	algorithm according to the specification of a POSIX value
d256aabe88f3 still wont comiple hhh Chengsong parents: 48 diff changeset	1056	and furthermore obtain a much
d256aabe88f3 still wont comiple hhh Chengsong parents: 48 diff changeset	1057	tighter bound on the sizes of derivatives. The result is that our
d256aabe88f3 still wont comiple hhh Chengsong parents: 48 diff changeset	1058	algorithm should be correct and faster on all inputs. The original
d256aabe88f3 still wont comiple hhh Chengsong parents: 48 diff changeset	1059	blow-up, as observed in JavaScript, Python and Java, would be excluded
d256aabe88f3 still wont comiple hhh Chengsong parents: 48 diff changeset	1060	from happening in our algorithm.For
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1061	this proof we use the theorem prover Isabelle. Once completed, this
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1062	result will advance the state-of-the-art: Sulzmann and Lu wrote in
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1063	their paper \cite{Sulzmann2014} about the bitcoded ``incremental
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1064	parsing method'' (that is the matching algorithm outlined in this
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1065	section):
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1066
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1067	\begin{quote}\it
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1068	``Correctness Claim: We further claim that the incremental parsing
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1069	method in Figure~5 in combination with the simplification steps in
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1070	Figure 6 yields POSIX parse trees. We have tested this claim
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1071	extensively by using the method in Figure~3 as a reference but yet
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1072	have to work out all proof details.''
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1073	\end{quote}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1074
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1075	\noindent
49 d256aabe88f3 still wont comiple hhh Chengsong parents: 48 diff changeset	1076	We would settle the correctness claim.
70 cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	1077	It is relatively straightforward to establish that after one simplification step, the part of derivative that corresponds to a POSIX value remains intact and can still be collected, in other words,
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	1078	\comment{that only holds when r is nullable}bmkeps r = bmkeps simp r
58 f0360e17080e proofread Christian Urban <urbanc@in.tum.de> parents: 57 diff changeset	1079	as this basically comes down to proving actions like removing the additional $r$ in $r+r$ does not delete important POSIX information in a regular expression.
49 d256aabe88f3 still wont comiple hhh Chengsong parents: 48 diff changeset	1080	The hardcore of this problem is to prove that
d256aabe88f3 still wont comiple hhh Chengsong parents: 48 diff changeset	1081	bmkeps bders r = bmkeps bders simp r
62 5b10d83b0834 spellcheck Christian Urban <urbanc@in.tum.de> parents: 61 diff changeset	1082	That is, if we do derivative on regular expression r and the simplified version for, they can still prove the same POSIX value if there is one . This is not as straightforward as the previous proposition, as the two regular expression r and simp r might become very different regular expressions after repeated application ofd simp and derivative.
70 cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	1083	The crucial point is to find the \comment{What?}"gene" of a regular expression and how it is kept intact during simplification.
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	1084	To aid this, we are use the helping function retrieve described by Sulzmann and Lu:
49 d256aabe88f3 still wont comiple hhh Chengsong parents: 48 diff changeset	1085	\\definition of retrieve\\
58 f0360e17080e proofread Christian Urban <urbanc@in.tum.de> parents: 57 diff changeset	1086	This function assembled the bitcode that corresponds to a parse tree for how the current derivative matches the suffix of the string(the characters that have not yet appeared, but is stored in the value).
49 d256aabe88f3 still wont comiple hhh Chengsong parents: 48 diff changeset	1087	Sulzmann and Lu used this to connect the bit-coded algorithm to the older algorithm by the following equation:\\
53 3ec403f650a8 readable version Chengsong parents: 52 diff changeset	1088	$inj \;a\; c \; v = \textit{decode} \; (\textit{retrieve}\; ((\textit{internalise}\; r)\backslash_{simp} c) v)$\\
3ec403f650a8 readable version Chengsong parents: 52 diff changeset	1089	A little fact that needs to be stated to help comprehension:\\
3ec403f650a8 readable version Chengsong parents: 52 diff changeset	1090	$r^\uparrow = a$($a$ stands for $annotated$).\\
58 f0360e17080e proofread Christian Urban <urbanc@in.tum.de> parents: 57 diff changeset	1091	Fahad and Christian also used this fact to prove the correctness of bit-coded algorithm without simplification.
50 866eda9ba66a now will compile Chengsong parents: 49 diff changeset	1092	Our purpose of using this, however, is try to establish \\
53 3ec403f650a8 readable version Chengsong parents: 52 diff changeset	1093	$ \textit{retrieve} \; a \; v \;=\; \textit{retrieve} \; \textit{simp}(a) \; v'.$\\
3ec403f650a8 readable version Chengsong parents: 52 diff changeset	1094	The idea is that using $v'$,
58 f0360e17080e proofread Christian Urban <urbanc@in.tum.de> parents: 57 diff changeset	1095	a simplified version of $v$ that possibly had gone through the same simplification step as $\textit{simp}(a)$ we are still able to extract the bitsequence that gives the same parsing information as the unsimplified one.
53 3ec403f650a8 readable version Chengsong parents: 52 diff changeset	1096	After establishing this, we might be able to finally bridge the gap of proving\\
3ec403f650a8 readable version Chengsong parents: 52 diff changeset	1097	$\textit{retrieve} \; r \backslash s \; v = \;\textit{retrieve} \; \textit{simp}(r) \backslash s \; v'$\\
3ec403f650a8 readable version Chengsong parents: 52 diff changeset	1098	and subsequently\\
3ec403f650a8 readable version Chengsong parents: 52 diff changeset	1099	$\textit{retrieve} \; r \backslash s \; v\; = \; \textit{retrieve} \; r \backslash_{simp} s \; v'$.\\
58 f0360e17080e proofread Christian Urban <urbanc@in.tum.de> parents: 57 diff changeset	1100	This proves that our simplified version of regular expression still contains all the bitcodes needed.
49 d256aabe88f3 still wont comiple hhh Chengsong parents: 48 diff changeset	1101
70 cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	1102	\item
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	1103	The second task is to speed up the more aggressive simplification.
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	1104	Currently it is slower than a naive simplification(the naive version as
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	1105	implemented in ADU of course can explode in some cases). So it needs to
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	1106	be explored how to make it faster. Our possibility would be to explore
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	1107	again the connection to DFAs. This is very much work in progress.
cab5eab1f6f1 more proofreading Christian Urban <urbanc@in.tum.de> parents: 69 diff changeset	1108	\end{itemize}
30 bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1109
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1110	\section{Conclusion}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1111
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1112	In this PhD-project we are interested in fast algorithms for regular
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1113	expression matching. While this seems to be a ``settled'' area, in
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1114	fact interesting research questions are popping up as soon as one steps
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1115	outside the classic automata theory (for example in terms of what kind
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1116	of regular expressions are supported). The reason why it is
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1117	interesting for us to look at the derivative approach introduced by
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1118	Brzozowski for regular expression matching, and then much further
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1119	developed by Sulzmann and Lu, is that derivatives can elegantly deal
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1120	with some of the regular expressions that are of interest in ``real
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1121	life''. This includes the not-regular expression, written $\neg\,r$
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1122	(that is all strings that are not recognised by $r$), but also bounded
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1123	regular expressions such as $r^{\{n\}}$ and $r^{\{n..m\}}$). There is
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1124	also hope that the derivatives can provide another angle for how to
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1125	deal more efficiently with back-references, which are one of the
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1126	reasons why regular expression engines in JavaScript, Python and Java
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1127	choose to not implement the classic automata approach of transforming
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1128	regular expressions into NFAs and then DFAs---because we simply do not
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1129	know how such back-references can be represented by DFAs.
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1130
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1131
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1132	\bibliographystyle{plain}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1133	\bibliography{root}
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1134
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1135
bd9eb959dbce changed file name to ninems Chengsong parents: diff changeset	1136	\end{document}

author	Christian Urban <urbanc@in.tum.de>
	Mon, 08 Jul 2019 11:10:32 +0100
changeset 70	cab5eab1f6f1
parent 69	4c7173b7ddca
child 71	0573615e41a3
permissions	-rw-r--r--