\documentclass[a4paper,UKenglish]{lipics}
\usepackage{graphicx}
\usepackage{pgfplots}
\usepackage{tikz-cd}
\usepackage{algorithm}
\usepackage{amsmath}
\usepackage[noend]{algpseudocode}
\usepackage{enumitem}
% \documentclass{article}
%\usepackage[utf8]{inputenc}
%\usepackage[english]{babel}
%\usepackage{listings}
% \usepackage{amsthm}
% \usepackage{hyperref}
% \usepackage[margin=0.5in]{geometry}
%\usepackage{pmboxdraw}

\title{POSIX Regular Expression Matching and Lexing}
\author{Chengsong Tan}
\affil{King's College London\\
London, UK\\
\texttt{chengsong.tan@kcl.ac.uk}}
\authorrunning{Chengsong Tan}
\Copyright{Chengsong Tan}

\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%
\newcommand{\ZERO}{\mbox{\bf 0}}
\newcommand{\ONE}{\mbox{\bf 1}}
\def\lexer{\mathit{lexer}}
\def\mkeps{\mathit{mkeps}}
\def\inj{\mathit{inj}}
\def\Empty{\mathit{Empty}}
\def\Left{\mathit{Left}}
\def\Right{\mathit{Right}}
\def\Stars{\mathit{Stars}}
\def\Char{\mathit{Char}}
\def\Seq{\mathit{Seq}}
\def\Der{\mathit{Der}}
\def\nullable{\mathit{nullable}}
\def\Z{\mathit{Z}}
\def\S{\mathit{S}}

%\theoremstyle{theorem}
%\newtheorem{theorem}{Theorem}
%\theoremstyle{lemma}
%\newtheorem{lemma}{Lemma}
%\newcommand{\lemmaautorefname}{Lemma}
%\theoremstyle{definition}
%\newtheorem{definition}{Definition}
\algnewcommand\algorithmicswitch{\textbf{switch}}
\algnewcommand\algorithmiccase{\textbf{case}}
\algnewcommand\algorithmicassert{\texttt{assert}}
\algnewcommand\Assert[1]{\State \algorithmicassert(#1)}%
% New "environments"
\algdef{SE}[SWITCH]{Switch}{EndSwitch}[1]{\algorithmicswitch\ #1\ \algorithmicdo}{\algorithmicend\ \algorithmicswitch}%
\algdef{SE}[CASE]{Case}{EndCase}[1]{\algorithmiccase\ #1}{\algorithmicend\ \algorithmiccase}%
\algtext*{EndSwitch}%
\algtext*{EndCase}%

\begin{document}

\maketitle

\begin{abstract}
Brzozowski introduced in 1964 a beautifully simple algorithm for
regular expression matching based on the notion of derivatives of
regular expressions. In 2014, Sulzmann and Lu extended this
algorithm to not just give a YES/NO answer for whether or not a
regular expression matches a string, but in case it matches to also
answer \emph{how} it matches the string. This is important for
applications such as lexing (tokenising a string). The problem is to
make the algorithm by Sulzmann and Lu fast on all inputs without
breaking its correctness. We have already developed some
simplification rules for this, but have not yet proved that they
preserve the correctness of the algorithm. We also have not yet
looked at extended regular expressions, such as bounded repetitions,
negation and back-references.
\end{abstract}

\section{Introduction}

This PhD-project is about regular expression matching and
lexing. Given the maturity of this topic, the reader might wonder:
Surely, regular expressions must have already been studied to death?
What could possibly be \emph{not} known in this area? And surely all
implemented algorithms for regular expression matching are blindingly
fast?

Unfortunately these preconceptions are not supported by evidence: Take
for example the regular expression $(a^*)^*\,b$ and ask whether
strings of the form $aa..a$ match this regular
expression. Obviously they do not match---the expected $b$ in the last
position is missing. One would expect that modern regular expression
matching engines can find this out very quickly. Alas, if one tries
this example in JavaScript, Python or Java 8 with strings of just 28
$a$'s, one discovers that this decision takes around 30 seconds and
takes considerably longer when adding a few more $a$'s, as the graphs
below show:

\begin{center}
\begin{tabular}{@{}c@{\hspace{0mm}}c@{\hspace{0mm}}c@{}}
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    ylabel={time in secs},
    enlargelimits=false,
    xtick={0,5,...,30},
    xmax=33,
    ymax=35,
    ytick={0,5,...,30},
    scaled ticks=false,
    axis lines=left,
    width=5cm,
    height=4cm,
    legend entries={JavaScript},
    legend pos=north west,
    legend cell align=left]
\addplot[red,mark=*, mark options={fill=white}] table {re-js.data};
\end{axis}
\end{tikzpicture}
&
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    %ylabel={time in secs},
    enlargelimits=false,
    xtick={0,5,...,30},
    xmax=33,
    ymax=35,
    ytick={0,5,...,30},
    scaled ticks=false,
    axis lines=left,
    width=5cm,
    height=4cm,
    legend entries={Python},
    legend pos=north west,
    legend cell align=left]
\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
\end{axis}
\end{tikzpicture}
&
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    %ylabel={time in secs},
    enlargelimits=false,
    xtick={0,5,...,30},
    xmax=33,
    ymax=35,
    ytick={0,5,...,30},
    scaled ticks=false,
    axis lines=left,
    width=5cm,
    height=4cm,
    legend entries={Java 8},
    legend pos=north west,
    legend cell align=left]
\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
\end{axis}
\end{tikzpicture}\\
\multicolumn{3}{c}{Graphs: Runtime for matching $(a^*)^*\,b$ with strings
of the form $\underbrace{aa..a}_{n}$.}
\end{tabular}
\end{center}

\noindent These are clearly abysmal and possibly surprising results.
One would expect these systems to do much better than that---after
all, given a DFA and a string, deciding whether the string is matched
by this DFA should be linear in the size of the string.
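
This behaviour is easy to reproduce. The following small Python sketch (our own test harness, not taken from the cited experiments) measures Python's backtracking \texttt{re} engine on the same evil regular expression; absolute timings are machine-dependent, but the growth is clearly visible:

```python
# Sketch: matching (a*)*b against a^n with Python's backtracking
# `re` engine; the time roughly doubles with each additional `a`.
import re
import time

def time_match(n):
    s = 'a' * n
    start = time.perf_counter()
    m = re.match(r'(a*)*b', s)   # can never match: the final b is missing
    return m, time.perf_counter() - start

for n in (10, 14, 18):
    m, t = time_match(n)
    assert m is None             # the answer is correct ...
    print(n, f"{t:.4f}s")        # ... but the time grows super-linearly
```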

Admittedly, the regular expression $(a^*)^*\,b$ is carefully chosen to
exhibit this ``exponential behaviour''. Unfortunately, such regular
expressions are not just a few ``outliers'': they are frequent enough
that a separate name has been coined for
them---\emph{evil regular expressions}. In empirical work, Davis et
al.\ report that they have found thousands of such evil regular
expressions in the JavaScript and Python ecosystems \cite{Davis18}.

This exponential blowup sometimes causes real pain in real life:
for example on 20 July 2016 one evil regular expression brought the
webpage \href{http://stackexchange.com}{Stack Exchange} to its
knees\footnote{\url{https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}}.
In this instance, a regular expression intended to just trim white
spaces from the beginning and the end of a line actually consumed
massive amounts of CPU resources, and because of this the web servers
ground to a halt. This happened when a post with 20,000 white spaces
was submitted, but importantly the white spaces were neither at the
beginning nor at the end. As a result, the regular expression matching
engine needed to backtrack over many choices.

The underlying problem is that many ``real life'' regular expression
matching engines do not use DFAs for matching. This is because they
support regular expressions that are not covered by the classical
automata theory, and in this more general setting there are quite a
few research questions still unanswered and fast algorithms still need
to be developed.

There is also another under-researched problem to do with regular
expressions and lexing, i.e.~the process of breaking up strings into
sequences of tokens according to some regular expressions. In this
setting one is not just interested in whether or not a regular
expression matches a string, but if it matches, also in \emph{how} it
matches the string. Consider for example a regular expression
$r_{key}$ for recognising keywords such as \textit{if}, \textit{then}
and so on; and a regular expression $r_{id}$ for recognising
identifiers (say, a single character followed by characters or
numbers). One can then form the compound regular expression
$(r_{key} + r_{id})^*$ and use it to tokenise strings. But then how
should the string \textit{iffoo} be tokenised? It could be tokenised
as a keyword followed by an identifier, or the entire string as a
single identifier. Similarly, how should the string \textit{if} be
tokenised? Both regular expressions, $r_{key}$ and $r_{id}$, would
``fire''---so is it an identifier or a keyword? While in applications
there is a well-known strategy to decide these questions, called POSIX
matching, only relatively recently have precise definitions of what
POSIX matching actually means been formalised
\cite{AusafDyckhoffUrban2016,OkuiSuzuki2010,Vansummeren2006}. Roughly,
POSIX matching means matching the longest initial substring. In the
case of a tie, the initial submatch is chosen according to some
priorities attached to the regular expressions (e.g.~keywords have a
higher priority than identifiers). This sounds rather simple, but
according to Grathwohl et al.\ \cite[Page 36]{CrashCourse2014} this is
not the case. They wrote:

\begin{quote}
\it{}``The POSIX strategy is more complicated than the greedy because of
the dependence on information about the length of matched strings in the
various subexpressions.''
\end{quote}

\noindent
This is also supported by evidence collected by Kuklewicz
\cite{Kuklewicz} who noticed that a number of POSIX regular expression
matchers calculate incorrect results.

Our focus is on an algorithm introduced by Sulzmann and Lu in 2014 for
regular expression matching according to the POSIX strategy
\cite{Sulzmann2014}. Their algorithm is based on an older algorithm by
Brzozowski from 1964 where he introduced the notion of derivatives of
regular expressions \cite{Brzozowski1964}. We shall briefly explain
the algorithms next.

\section{The Algorithms by Brzozowski, and Sulzmann and Lu}

Suppose (basic) regular expressions are given by the following grammar:
\[ r ::= \ZERO \mid \ONE
   \mid c
   \mid r_1 \cdot r_2
   \mid r_1 + r_2
   \mid r^*
\]

\noindent
The intended meaning of the constructors is as usual: $\ZERO$
cannot match any string, $\ONE$ can match the empty string, the
character regular expression $c$ can match the character $c$, and so
on.

The brilliant contribution by Brzozowski is the notion of
\emph{derivatives} of regular expressions. The idea behind this
notion is as follows: suppose a regular expression $r$ can match a
string of the form $c\!::\! s$ (that is a list of characters starting
with $c$). What does the regular expression look like that can match
just $s$? Brzozowski gave a neat answer to this question. He started
with the definition of $\nullable$:
\begin{center}
\begin{tabular}{lcl}
$\nullable(\ZERO)$ & $\dn$ & $\mathit{false}$ \\
$\nullable(\ONE)$  & $\dn$ & $\mathit{true}$ \\
$\nullable(c)$     & $\dn$ & $\mathit{false}$ \\
$\nullable(r_1 + r_2)$ & $\dn$ & $\nullable(r_1) \vee \nullable(r_2)$ \\
$\nullable(r_1\cdot r_2)$ & $\dn$ & $\nullable(r_1) \wedge \nullable(r_2)$ \\
$\nullable(r^*)$ & $\dn$ & $\mathit{true}$ \\
\end{tabular}
\end{center}

This function simply tests whether the empty string is in $L(r)$,
the language matched by $r$. He then defined
the following operation on regular expressions, written
$r\backslash c$ (the derivative of $r$ w.r.t.~the character $c$):

\begin{center}
\begin{tabular}{lcl}
$\ZERO \backslash c$ & $\dn$ & $\ZERO$\\
$\ONE \backslash c$  & $\dn$ & $\ZERO$\\
$d \backslash c$     & $\dn$ &
  $\mathit{if} \;c = d\;\mathit{then}\;\ONE\;\mathit{else}\;\ZERO$\\
$(r_1 + r_2)\backslash c$ & $\dn$ & $r_1 \backslash c \,+\, r_2 \backslash c$\\
$(r_1 \cdot r_2)\backslash c$ & $\dn$ & $\mathit{if} \, \nullable(r_1)$\\
  & & $\mathit{then}\;(r_1\backslash c) \cdot r_2 \,+\, r_2\backslash c$\\
  & & $\mathit{else}\;(r_1\backslash c) \cdot r_2$\\
$(r^*)\backslash c$ & $\dn$ & $(r\backslash c) \cdot r^*$\\
\end{tabular}
\end{center}

\noindent
The main property of the derivative operation is that

\begin{center}
$c\!::\!s \in L(r)$ holds
if and only if $s \in L(r\backslash c)$.
\end{center}

\noindent
For us the main advantage is that derivatives can be
straightforwardly implemented in any functional programming language,
and are easily definable and reasoned about in theorem provers---the
definitions just consist of inductive datatypes and simple recursive
functions. Moreover, the notion of derivatives can be easily
generalised to cover extended regular expression constructors such as
the not-regular expression, written $\neg\,r$, or bounded repetitions
(for example $r^{\{n\}}$ and $r^{\{n..m\}}$), which cannot be so
straightforwardly realised within the classic automata approach.
For the moment however, we focus only on the usual basic regular expressions.

Now if we want to find out whether a string $s$ matches with a regular
expression $r$, we build the derivatives of $r$ w.r.t.\ (in succession)
all the characters of the string $s$. Finally, we test whether the
resulting regular expression can match the empty string. If yes, then
$r$ matches $s$, otherwise it does not. To implement this idea
we can generalise the derivative operation to strings like this:

\begin{center}
\begin{tabular}{lcl}
$r \backslash (c\!::\!s) $ & $\dn$ & $(r \backslash c) \backslash s$ \\
$r \backslash [\,] $ & $\dn$ & $r$
\end{tabular}
\end{center}

\noindent
Using this definition we obtain a simple and elegant regular
expression matching algorithm:
\[
match\;s\;r \;\dn\; nullable(r\backslash s)
\]

\noindent
Pictorially this algorithm can be illustrated as follows:

\begin{center}
\begin{tikzcd}\label{graph:*}
r_0 \arrow[r, "\backslash c_0"] & r_1 \arrow[r, "\backslash c_1"] & r_2 \arrow[r, dashed] & r_n \arrow[r,"nullable?"] & Yes/No
\end{tikzcd}
\end{center}

\noindent
where we start with a regular expression $r_0$, build successive derivatives
until we exhaust the string and then use \textit{nullable} to test whether the
result can match the empty string. It can be relatively easily shown that this
matcher is correct.
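
This matcher is short enough to sketch directly in code. The following Python transcription (the tuple encoding and the constructor names are our own choice) implements $\nullable$, the derivative and the matcher:

```python
# A sketch of Brzozowski's matcher; regular expressions are encoded
# as tuples whose first component is a constructor tag.
ZERO = ('ZERO',)                       # matches no string
ONE  = ('ONE',)                        # matches the empty string
def CHAR(c):     return ('CHAR', c)
def ALT(r1, r2): return ('ALT', r1, r2)
def SEQ(r1, r2): return ('SEQ', r1, r2)
def STAR(r):     return ('STAR', r)

def nullable(r):
    # can r match the empty string?
    t = r[0]
    if t == 'ALT': return nullable(r[1]) or nullable(r[2])
    if t == 'SEQ': return nullable(r[1]) and nullable(r[2])
    return t in ('ONE', 'STAR')

def der(c, r):
    # the derivative of r w.r.t. the character c
    t = r[0]
    if t in ('ZERO', 'ONE'): return ZERO
    if t == 'CHAR': return ONE if r[1] == c else ZERO
    if t == 'ALT':  return ALT(der(c, r[1]), der(c, r[2]))
    if t == 'SEQ':
        if nullable(r[1]):
            return ALT(SEQ(der(c, r[1]), r[2]), der(c, r[2]))
        return SEQ(der(c, r[1]), r[2])
    if t == 'STAR': return SEQ(der(c, r[1]), r)

def ders(s, r):
    # derivative w.r.t. a whole string
    for c in s:
        r = der(c, r)
    return r

def match(s, r):
    return nullable(ders(s, r))

# (a*)* b -- the evil expression from the introduction
evil = SEQ(STAR(STAR(CHAR('a'))), CHAR('b'))
assert match('aaab', evil)
assert not match('aaaa', evil)
```

Note that, without simplification, the intermediate derivatives built by this sketch can grow quickly in size; this is precisely the problem taken up in the simplification section below.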

One limitation, however, of Brzozowski's algorithm is that it only
produces a YES/NO answer for whether a string is being matched by a
regular expression. Sulzmann and Lu~\cite{Sulzmann2014} extended this
algorithm to allow generation of an actual matching, called a
\emph{value}.

\begin{center}
\begin{tabular}{c@{\hspace{20mm}}c}
\begin{tabular}{@{}rrl@{}}
\multicolumn{3}{@{}l}{\textbf{Regular Expressions}}\medskip\\
$r$ & $::=$  & $\ZERO$\\
    & $\mid$ & $\ONE$\\
    & $\mid$ & $c$\\
    & $\mid$ & $r_1 \cdot r_2$\\
    & $\mid$ & $r_1 + r_2$\\
    & $\mid$ & $r^*$\\
\end{tabular}
&
\begin{tabular}{@{\hspace{0mm}}rrl@{}}
\multicolumn{3}{@{}l}{\textbf{Values}}\medskip\\
$v$ & $::=$  & $\Empty$\\
    & $\mid$ & $\Char(c)$\\
    & $\mid$ & $\Seq\,v_1\, v_2$\\
    & $\mid$ & $\Left(v)$\\
    & $\mid$ & $\Right(v)$\\
    & $\mid$ & $\Stars\,[v_1,\ldots\,v_n]$\\
\end{tabular}
\end{tabular}
\end{center}

\noindent
Values and regular expressions correspond to each other, as indicated
by placing each value constructor next to its regular expression
counterpart. The idea of values is to express parse trees. A flatten
operation, written $|v|$, extracts the underlying string of a value
$v$. For example, $|\Seq \, (\Char\,c) \, (\Char\,d)| = cd$. We omit
this straightforward definition. Using this flatten notation, we can
now elaborate how values express parse trees: $\Seq\,v_1\, v_2$ tells
us how the string $|v_1| \cdot |v_2|$ matches the regular expression
$r_1 \cdot r_2$, namely $r_1$ matches $|v_1|$ and $r_2$ matches
$|v_2|$. Exactly how these two are matched is contained in the
sub-structures of $v_1$ and $v_2$.
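
The flatten operation fits in a few lines of code. In the sketch below, values are encoded as Python tuples (an encoding of our own choosing):

```python
# Values as tuples, plus the flatten operation |v| that extracts
# the underlying string of a value.
Empty = ('Empty',)
def Char(c):     return ('Char', c)
def Seq(v1, v2): return ('Seq', v1, v2)
def Left(v):     return ('Left', v)
def Right(v):    return ('Right', v)
def Stars(vs):   return ('Stars', vs)

def flatten(v):
    t = v[0]
    if t == 'Empty': return ''
    if t == 'Char':  return v[1]
    if t == 'Seq':   return flatten(v[1]) + flatten(v[2])
    if t in ('Left', 'Right'): return flatten(v[1])
    return ''.join(flatten(w) for w in v[1])   # Stars

# the example from the text: |Seq (Char c) (Char d)| = cd
assert flatten(Seq(Char('c'), Char('d'))) == 'cd'
```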

To give a concrete example of how values work, consider the string $xy$
and the regular expression $(x + (y + xy))^*$. We can view this regular
expression as a tree and if the string $xy$ is matched by two Star
``iterations'', then the $x$ is matched by the left-most alternative
in this tree and the $y$ by the left alternative of the right
branch. This suggests to record this matching as

\begin{center}
$\Stars\,[\Left\,(\Char\,x), \Right(\Left(\Char\,y))]$
\end{center}

\noindent
where $\Stars$ records how many
iterations were used; and $\Left$, respectively $\Right$, which
alternative is used. The value for
matching $xy$ in a single ``iteration'', i.e.~the POSIX value,
would look as follows

\begin{center}
$\Stars\,[\Seq\,(\Char\,x)\,(\Char\,y)]$
\end{center}

\noindent
where $\Stars$ has only a single-element list for the single iteration
and $\Seq$ indicates that $xy$ is matched by a sequence regular
expression.

The contribution of Sulzmann and Lu is an extension of Brzozowski's
algorithm by a second phase (the first phase being building successive
derivatives). In this second phase, for every successful match the
corresponding POSIX value is computed. Pictorially, the Sulzmann and Lu
algorithm works as in the following diagram (the flow of the simple
matching algorithm that just gives a YES/NO answer was given before in
\ref{graph:*}):\\
\begin{tikzcd}
r_0 \arrow[r, "\backslash c_0"]  \arrow[d] & r_1 \arrow[r, "\backslash c_1"] \arrow[d] & r_2 \arrow[r, dashed] \arrow[d] & r_n \arrow[d, "mkeps" description] \\
v_0           & v_1 \arrow[l,"inj_{r_0} c_0"]                & v_2 \arrow[l, "inj_{r_1} c_1"]              & v_n \arrow[l, dashed]
\end{tikzcd}

We shall briefly explain this interesting process. For the convenience
of explanation, we fix the following notation: the regular expression
$r$ used for matching is also called $r_0$ and the string $s$ is
composed of $n$ characters $c_0 c_1 \ldots c_{n-1}$. First, we build
the successive derivatives $r_1$, $r_2$, \ldots, using the characters
$c_0$, $c_1$, \ldots, until we obtain the final derivative $r_n$. We
then test whether $r_n$ is $\nullable$ or not. If not, we know
immediately that the string does not match the regular expression. If
yes, we start building the parse tree incrementally. We first call
$\mkeps$ (which stands for ``make epsilon''---it makes the parse tree
for how the empty string is matched by a nullable regular expression)
to construct the parse tree $v_n$ for how the final derivative $r_n$
matches the empty string:

\begin{center}
\begin{tabular}{lcl}
$\mkeps(\ONE)$ & $\dn$ & $\Empty$\\
$\mkeps(r_1 + r_2)$ & $\dn$ & $\textit{if}\;\nullable(r_1)\;\textit{then}\;\Left(\mkeps(r_1))\;\textit{else}\;\Right(\mkeps(r_2))$\\
$\mkeps(r_1 \cdot r_2)$ & $\dn$ & $\Seq\,(\mkeps(r_1))\,(\mkeps(r_2))$\\
$\mkeps(r^*)$ & $\dn$ & $\Stars\,[]$\\
\end{tabular}
\end{center}

After this, we inject back the characters one by one, in reverse order
as they were chopped off, to build the parse tree $v_i$ for how the
regular expression $r_i$ matches the string $s_i$ ($s_i$ means the
string $s$ with the first $i$ characters chopped off) from the previous
parse tree $v_{i+1}$. After $n$ such transformations, we get the parse
tree for how $r_0$ matches $s$, exactly as we wanted.
An inductive proof can be routinely established.
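
The whole two-phase process can be condensed into a small program. The sketch below transcribes Sulzmann and Lu's definitions of $\mkeps$ and $\inj$ into Python; the tuple encodings for regular expressions and values are our own choice:

```python
# A self-contained sketch of the two-phase lexer (derivatives,
# then mkeps and injection), following Sulzmann and Lu.
ZERO = ('ZERO',)
ONE  = ('ONE',)
def CHAR(c):     return ('CHAR', c)
def ALT(r1, r2): return ('ALT', r1, r2)
def SEQ(r1, r2): return ('SEQ', r1, r2)
def STAR(r):     return ('STAR', r)

def nullable(r):
    t = r[0]
    if t == 'ALT': return nullable(r[1]) or nullable(r[2])
    if t == 'SEQ': return nullable(r[1]) and nullable(r[2])
    return t in ('ONE', 'STAR')

def der(c, r):
    t = r[0]
    if t in ('ZERO', 'ONE'): return ZERO
    if t == 'CHAR': return ONE if r[1] == c else ZERO
    if t == 'ALT':  return ALT(der(c, r[1]), der(c, r[2]))
    if t == 'SEQ':
        if nullable(r[1]):
            return ALT(SEQ(der(c, r[1]), r[2]), der(c, r[2]))
        return SEQ(der(c, r[1]), r[2])
    if t == 'STAR': return SEQ(der(c, r[1]), r)

def mkeps(r):
    # the parse tree for how a nullable r matches the empty string
    t = r[0]
    if t == 'ONE': return ('Empty',)
    if t == 'ALT':
        return ('Left', mkeps(r[1])) if nullable(r[1]) \
               else ('Right', mkeps(r[2]))
    if t == 'SEQ': return ('Seq', mkeps(r[1]), mkeps(r[2]))
    if t == 'STAR': return ('Stars', [])

def inj(r, c, v):
    # transform a value for der(c, r) into a value for r
    t = r[0]
    if t == 'CHAR': return ('Char', r[1])                   # v = Empty
    if t == 'ALT':
        if v[0] == 'Left': return ('Left', inj(r[1], c, v[1]))
        return ('Right', inj(r[2], c, v[1]))
    if t == 'SEQ':
        if v[0] == 'Seq':  return ('Seq', inj(r[1], c, v[1]), v[2])
        if v[0] == 'Left':                                  # Left (Seq v1 v2)
            return ('Seq', inj(r[1], c, v[1][1]), v[1][2])
        return ('Seq', mkeps(r[1]), inj(r[2], c, v[1]))     # Right v
    if t == 'STAR':                                         # v = Seq v1 (Stars vs)
        return ('Stars', [inj(r[1], c, v[1])] + v[2][1])

def flatten(v):
    t = v[0]
    if t == 'Empty': return ''
    if t == 'Char':  return v[1]
    if t == 'Seq':   return flatten(v[1]) + flatten(v[2])
    if t in ('Left', 'Right'): return flatten(v[1])
    return ''.join(flatten(w) for w in v[1])                # Stars

def lexer(r, s):
    if s == '':
        return mkeps(r) if nullable(r) else None
    v = lexer(der(s[0], r), s[1:])
    return None if v is None else inj(r, s[0], v)

# the running example below: (a+b+ab+c+abc)* matched against abc
r = STAR(ALT(CHAR('a'), ALT(CHAR('b'),
        ALT(SEQ(CHAR('a'), CHAR('b')),
        ALT(CHAR('c'), SEQ(CHAR('a'), SEQ(CHAR('b'), CHAR('c'))))))))
v = lexer(r, 'abc')
assert v is not None and v[0] == 'Stars'
assert flatten(v) == 'abc'
```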

It is instructive to see how this works on a small example. Suppose we
have the regular expression $(a+b+ab+c+abc)^*$ and we want to match it
against the string $abc$. By the POSIX rules the lexer should go for
the longest matching, i.e.~it should match the string $abc$ in one
star iteration, using the longest alternative $abc$ in the
sub-expression $a+b+ab+c+abc$ (we use $r$ to denote this
sub-expression for conciseness). Here is how the lexer achieves the
parse tree for this matching. First, we build successive derivatives
until we exhaust the string, as illustrated here (we omit some
parentheses for better readability):

\[ r^* \xrightarrow{\backslash a} r_1 = (1+0+1 \cdot b + 0 + 1 \cdot b \cdot c) \cdot r^* \xrightarrow{\backslash b}\]
\[r_2 = (0+0+1 \cdot 1 + 0 + 1 \cdot 1 \cdot c) \cdot r^* +(0+1+0 + 0 + 0) \cdot r^* \xrightarrow{\backslash c}\]
\[r_3 = ((0+0+0 + 0 + 1 \cdot 1 \cdot 1) \cdot r^* + (0+0+0 + 1 + 0) \cdot r^*) +((0+1+0 + 0 + 0) \cdot r^*+(0+0+0 + 1 + 0) \cdot r^* )
\]

Now instead of using $\nullable$ to give a YES answer, we call $\mkeps$
to construct a parse tree for how $r_3$ matched the string $abc$.
$\mkeps$ gives the following value $v_3$:\\
$Left(Left(Seq(Right(Right(Right(Seq(Empty, Seq(Empty, Empty))))), Stars [])))$\\
This corresponds to the leftmost term
$(0+0+0 + 0 + 1 \cdot 1 \cdot 1) \cdot r^*$ in $r_3$. Note that its
leftmost position allows $\mkeps$ to choose it as the first candidate
that meets the requirement of being $\nullable$. This position is a
consequence of the sequence clause of the derivative,
\[(r_1 \cdot r_2)\backslash c \;=\; (r_1\backslash c) \cdot r_2 \,+\, r_2\backslash c \qquad \text{(when $r_1$ is nullable),}\]
which puts $(r_1 \backslash c) \cdot r_2$ at the front and
$r_2 \backslash c$ at the back. This allows $\mkeps$ to always pick,
among two matches, the one with the longer prefix. The value $v_3$
tells us how the empty string is matched by the final regular
expression after doing all the derivatives: among the regular
expressions $((0+0+0 + 0 + 1 \cdot 1 \cdot 1) \cdot r^* + (0+0+0 + 1 + 0) \cdot r^*) +((0+1+0 + 0 + 0) \cdot r^*+(0+0+0 + 1 + 0) \cdot r^* )$
we choose the leftmost nullable one, which is composed of a sequence of
a nested alternative and a folded star that iterates zero times. In
that nested alternative we take the rightmost alternative.
+ − 460
42
+ − 461
Using the value $v_3$, the character c, and the regular expression $r_2$, we can recover how $r_2$ matched the string $[c]$ : we inject $c$ back to $v_3$, and get \\ $v_2 = Left(Seq(Right(Right(Right(Seq(Empty, Seq(Empty, c)))))), Stars [])$, which tells us how $r_2$ matched $c$. After this we inject back the character $b$, and get\\ $v_1 = Seq(Right(Right(Right(Seq(Empty, Seq(b, c)))))), Stars [])$ for how $r_1= (1+0+1 \cdot b + 0 + 1 \cdot b \cdot c) \cdot r*$ matched the string $bc$ before it split into 2 pieces. Finally, after injecting character a back to $v_1$, we get the parse tree $v_0= Stars [Right(Right(Right(Seq(a, Seq(b, c)))))]$ for how r matched $abc$.
+ − 462
We omit the details of injection function, which is provided by Sulzmann and Lu's paper \cite{Sulzmann2014}.

Readers might have noticed that the parse tree information is actually
already available when doing derivatives. For example, immediately
after the operation $\backslash a$ we know that if we want to match a
string that starts with $a$, we can take the initial match to be
\begin{enumerate}
\item[1)] just $a$, or
\item[2)] the string $ab$, or
\item[3)] the string $abc$.
\end{enumerate}
In order to differentiate between these choices, we just need to
remember their positions: $a$ is on the left, $ab$ is in the middle,
and $abc$ is on the right. Which one of these alternatives is chosen
later does not affect their relative position, because our algorithm
does not change this order. There is no need to traverse this
information twice. This leads to a new approach to lexing: if we store
the information for parse trees in the corresponding regular expression
pieces, update this information when we perform derivative operations
on them, and collect the information when finished with the derivatives
and call $\mkeps$ for deciding which branch is POSIX, then we can
generate the parse tree in one pass, instead of doing an $n$-step
backward transformation. This leads to Sulzmann and Lu's novel idea of
using bit-codes in derivatives.

In the next section, we shall focus on the bit-coded algorithm and the
natural process of simplification of regular expressions using
bit-codes, which is needed in order to obtain \emph{fast} versions of
Brzozowski's, and Sulzmann and Lu's algorithms. This is where the
PhD-project hopes to advance the state-of-the-art.


\section{Simplification of Regular Expressions}

Using bit-codes to guide parsing is not a new idea.

The main drawback of building successive derivatives according to
Brzozowski's definition is that they can grow very quickly in size.
This is mainly due to the fact that the derivative operation often
generates ``useless'' $\ZERO$s and $\ONE$s in derivatives. As a result,
if implemented naively both algorithms by Brzozowski and by Sulzmann
and Lu are excruciatingly slow. For example when starting with the
regular expression $(a + aa)^*$ and building 12 successive derivatives
w.r.t.~the character $a$, one obtains a derivative regular expression
with more than 8000 nodes (when viewed as a tree). Operations like
derivative and $\nullable$ need to traverse such trees and
consequently the bigger the size of the derivative the slower the
algorithm. Fortunately, one can simplify regular expressions after
each derivative step. Various simplifications of regular expressions
are possible, such as the simplifications of $\ZERO + r$,
$r + \ZERO$, $\ONE\cdot r$, $r \cdot \ONE$, and $r + r$ to just
$r$. These simplifications do not affect the answer for whether a
regular expression matches a string or not, but fortunately also do
not affect the POSIX strategy of how regular expressions match
strings---although the latter is much harder to establish. Some
initial results in this regard have been obtained in
\cite{AusafDyckhoffUrban2016}. However, what has not been achieved yet
is a very tight bound for the size. Such a tight bound is suggested by
work of Antimirov who proved that (partial) derivatives can be bounded
by the number of characters contained in the initial regular
expression \cite{Antimirov95}.

Antimirov defined the \emph{partial derivatives} of regular expressions
as follows:
%TODO definition of partial derivatives
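
One version of the definition, transcribed from Antimirov~\cite{Antimirov95} into our notation (the function name $\textit{pder}$ is our choice; it takes a character and a regular expression and returns a \emph{set} of regular expressions):

\begin{center}
\begin{tabular}{lcl}
$\textit{pder}\,c\,(\ZERO)$ & $\dn$ & $\emptyset$\\
$\textit{pder}\,c\,(\ONE)$ & $\dn$ & $\emptyset$\\
$\textit{pder}\,c\,(d)$ & $\dn$ & $\textit{if}\;c = d\;\textit{then}\;\{\ONE\}\;\textit{else}\;\emptyset$\\
$\textit{pder}\,c\,(r_1 + r_2)$ & $\dn$ & $\textit{pder}\,c\,r_1 \,\cup\, \textit{pder}\,c\,r_2$\\
$\textit{pder}\,c\,(r_1 \cdot r_2)$ & $\dn$ & $\textit{if}\;\nullable(r_1)$\\
& & $\textit{then}\;\{r'\cdot r_2 \mid r' \in \textit{pder}\,c\,r_1\} \,\cup\, \textit{pder}\,c\,r_2$\\
& & $\textit{else}\;\{r'\cdot r_2 \mid r' \in \textit{pder}\,c\,r_1\}$\\
$\textit{pder}\,c\,(r^*)$ & $\dn$ & $\{r'\cdot r^* \mid r' \in \textit{pder}\,c\,r\}$\\
\end{tabular}
\end{center}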

They are essentially sets of regular expressions that come from the
sub-structure of the original regular expression. Antimirov proved a
nice bound on the size of partial derivatives: roughly speaking, the
overall size will not exceed the fourth power of the number of nodes in
the original regular expression. Interestingly, we observed in
experiments that after the simplification step, our regular expressions
have the same size as, or are smaller than, the partial derivatives.
This allows us to prove a tight bound on the size of the regular
expressions during the running of the algorithm, provided we can
establish the connection between our simplification rules and partial
derivatives.

%We believe, and have generated test
%data, that a similar bound can be obtained for the derivatives in
%Sulzmann and Lu's algorithm. Let us give some details about this next.

We first followed Sulzmann and Lu's idea of introducing
\emph{annotated regular expressions}~\cite{Sulzmann2014}. They are
defined by the following grammar:

\begin{center}
\begin{tabular}{lcl}
$\textit{a}$ & $::=$  & $\textit{ZERO}$\\
             & $\mid$ & $\textit{ONE}\;\;bs$\\
             & $\mid$ & $\textit{CHAR}\;\;bs\,c$\\
             & $\mid$ & $\textit{ALTS}\;\;bs\,as$\\
             & $\mid$ & $\textit{SEQ}\;\;bs\,a_1\,a_2$\\
             & $\mid$ & $\textit{STAR}\;\;bs\,a$
\end{tabular}
\end{center}

\noindent
where $bs$ stands for bitsequences, and $as$ (in \textit{ALTS}) for a
list of annotated regular expressions. These bitsequences encode
information about the (POSIX) value that should be generated by the
Sulzmann and Lu algorithm. Bitcodes are essentially incomplete values.
This can be seen in the following encoding function:

\begin{center}
\begin{tabular}{lcl}
$\textit{code}(\Empty)$ & $\dn$ & $[]$\\
$\textit{code}(\Char\,c)$ & $\dn$ & $[]$\\
$\textit{code}(\Left\,v)$ & $\dn$ & $\Z :: code(v)$\\
$\textit{code}(\Right\,v)$ & $\dn$ & $\S :: code(v)$\\
$\textit{code}(\Seq\,v_1\,v_2)$ & $\dn$ & $code(v_1) \,@\, code(v_2)$\\
$\textit{code}(\Stars\,[])$ & $\dn$ & $[\S]$\\
$\textit{code}(\Stars\,(v\!::\!vs))$ & $\dn$ & $\Z :: code(v) \;@\;
                                                code(\Stars\,vs)$
\end{tabular}
\end{center}

\noindent
where $\Z$ and $\S$ are arbitrary names for the bits in the
bitsequences. The function $\textit{code}$ encodes a value into a
bitsequence by converting $\Left$ into $\Z$, $\Right$ into $\S$, the
start of a non-empty star iteration into $\Z$, and the end of a star
into $\S$. This conversion is apparently lossy: it throws away the
character information and does not encode where the first operand of a
sequence ends and the second begins. Moreover, with only the bitcode we
cannot even tell whether the $\S$s and $\Z$s are for $\Left/\Right$ or
for $\Stars$. The reason for choosing this compact way of storing
information is that the relatively small bitsequences can be easily
moved around during the lexing process. In order to recover values from
bitcodes, we need the regular expression as extra information and
decode the bitsequence back into a value:\\
37
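The encoding can be rendered executably as follows. This is our own
illustrative sketch (not part of the formal development), representing
values as nested tuples such as \texttt{('Left', v)} and
\texttt{('Stars', [v, ...])}:

```python
# An executable sketch (our illustration) of the code function above.
# Values are nested tuples: ('Empty',), ('Char', c), ('Left', v),
# ('Right', v), ('Seq', v1, v2), ('Stars', [v, ...]).
Z, S = '0', '1'          # arbitrary names for the two bits

def code(v):
    t = v[0]
    if t == 'Empty': return []
    if t == 'Char':  return []           # the character is thrown away
    if t == 'Left':  return [Z] + code(v[1])
    if t == 'Right': return [S] + code(v[1])
    if t == 'Seq':   return code(v[1]) + code(v[2])
    if t == 'Stars':
        vs = v[1]
        if not vs:
            return [S]                   # marks the end of the star
        return [Z] + code(vs[0]) + code(('Stars', vs[1:]))
    raise ValueError(t)
```

For example, the value $\Seq\,(\Left\,(\Char\,a))\,(\Stars\,[\Char\,b])$
is encoded as the three bits $\Z\Z\S$, with both characters discarded.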
%\begin{definition}[Bitdecoding of Values]\mbox{}
\begin{center}
\begin{tabular}{@{}l@{\hspace{1mm}}c@{\hspace{1mm}}l@{}}
  $\textit{decode}'\,bs\,(\ONE)$ & $\dn$ & $(\Empty, bs)$\\
  $\textit{decode}'\,bs\,(c)$ & $\dn$ & $(\Char\,c, bs)$\\
  $\textit{decode}'\,(\Z\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
     $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}\;
       (\Left\,v, bs_1)$\\
  $\textit{decode}'\,(\S\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
     $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_2\;\textit{in}\;
       (\Right\,v, bs_1)$\\
  $\textit{decode}'\,bs\;(r_1\cdot r_2)$ & $\dn$ &
     $\textit{let}\,(v_1, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}$\\
  & & $\textit{let}\,(v_2, bs_2) = \textit{decode}'\,bs_1\,r_2$\\
  & & \hspace{35mm}$\textit{in}\;(\Seq\,v_1\,v_2, bs_2)$\\
  $\textit{decode}'\,(\S\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\
  $\textit{decode}'\,(\Z\!::\!bs)\,(r^*)$ & $\dn$ &
     $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r\;\textit{in}$\\
  & & $\textit{let}\,(\Stars\,vs, bs_2) = \textit{decode}'\,bs_1\,r^*$\\
  & & \hspace{35mm}$\textit{in}\;(\Stars\,v\!::\!vs, bs_2)$\bigskip\\

  $\textit{decode}\,bs\,r$ & $\dn$ &
     $\textit{let}\,(v, bs') = \textit{decode}'\,bs\,r\;\textit{in}$\\
  & & $\textit{if}\;bs' = []\;\textit{then}\;\textit{Some}\,v\;
       \textit{else}\;\textit{None}$
\end{tabular}
\end{center}
%\end{definition}
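This decoding can be rendered executably as follows (our illustrative
sketch with hypothetical helper names; \texttt{decode1} plays the role
of $\textit{decode}'$, and the star cases follow the convention of the
\textit{code} function, where $\S$ marks the end of a star):

```python
# An executable sketch of decode'/decode (our illustration, not the
# authors' Isabelle definitions).  Regular expressions are tuples:
# ('ONE',), ('CHAR', c), ('ALT', r1, r2), ('SEQ', r1, r2), ('STAR', r).
Z, S = '0', '1'          # arbitrary names for the two bits

def decode1(bs, r):      # plays the role of decode': (value, leftover bits)
    t = r[0]
    if t == 'ONE':
        return ('Empty',), bs
    if t == 'CHAR':
        return ('Char', r[1]), bs
    if t == 'ALT':       # first bit selects the alternative
        if bs[0] == Z:
            v, bs1 = decode1(bs[1:], r[1])
            return ('Left', v), bs1
        v, bs1 = decode1(bs[1:], r[2])
        return ('Right', v), bs1
    if t == 'SEQ':       # no bit needed: decode r1, then r2
        v1, bs1 = decode1(bs, r[1])
        v2, bs2 = decode1(bs1, r[2])
        return ('Seq', v1, v2), bs2
    if t == 'STAR':      # S ends the star, Z starts one more iteration
        if bs[0] == S:
            return ('Stars', []), bs[1:]
        v, bs1 = decode1(bs[1:], r[1])
        (_, vs), bs2 = decode1(bs1, r)
        return ('Stars', [v] + vs), bs2
    raise ValueError(t)

def decode(bs, r):       # succeeds only if all bits are consumed
    v, rest = decode1(bs, r)
    return v if rest == [] else None
```

For example, decoding the bits $\Z\Z\Z\S\S$ against $(a + b)^*$
recovers the value $\Stars\,[\Left\,(\Char\,a), \Right\,(\Char\,b)]$.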

To do lexing using annotated regular expressions, we shall first transform the
usual (un-annotated) regular expressions into annotated regular
expressions:\\
%\begin{definition}
\begin{center}
\begin{tabular}{lcl}
  $(\ZERO)^\uparrow$ & $\dn$ & $\textit{ZERO}$\\
  $(\ONE)^\uparrow$ & $\dn$ & $\textit{ONE}\,[]$\\
  $(c)^\uparrow$ & $\dn$ & $\textit{CHAR}\,[]\,c$\\
  $(r_1 + r_2)^\uparrow$ & $\dn$ &
     $\textit{ALT}\;[]\,(\textit{fuse}\,[\Z]\,r_1^\uparrow)\,
                        (\textit{fuse}\,[\S]\,r_2^\uparrow)$\\
  $(r_1\cdot r_2)^\uparrow$ & $\dn$ &
     $\textit{SEQ}\;[]\,r_1^\uparrow\,r_2^\uparrow$\\
  $(r^*)^\uparrow$ & $\dn$ &
     $\textit{STAR}\;[]\,r^\uparrow$\\
\end{tabular}
\end{center}
%\end{definition}
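As an illustration (not the formal development), internalisation can
be sketched executably together with the auxiliary \textit{fuse}
function, which prepends a bitsequence onto the bits of an annotated
regular expression:

```python
# An executable sketch (our illustration) of the internalisation
# function (.)^up above.  Annotated regexes are tuples: ('AZERO',),
# ('AONE', bs), ('ACHAR', bs, c), ('AALT', bs, a1, a2),
# ('ASEQ', bs, a1, a2), ('ASTAR', bs, a).
Z, S = '0', '1'

def fuse(bs, a):
    if a[0] == 'AZERO':
        return a                         # ZERO carries no bits
    return (a[0], bs + a[1]) + a[2:]     # attach bs in front of a's bits

def internalise(r):                      # the (.)^up function
    t = r[0]
    if t == 'ZERO': return ('AZERO',)
    if t == 'ONE':  return ('AONE', [])
    if t == 'CHAR': return ('ACHAR', [], r[1])
    if t == 'ALT':
        return ('AALT', [], fuse([Z], internalise(r[1])),
                            fuse([S], internalise(r[2])))
    if t == 'SEQ':
        return ('ASEQ', [], internalise(r[1]), internalise(r[2]))
    if t == 'STAR':
        return ('ASTAR', [], internalise(r[1]))
    raise ValueError(t)
```

Note how the two branches of an alternative are stamped with $\Z$ and
$\S$ respectively, recording up front which branch a later match will
have taken.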

Then we perform successive derivative operations on the annotated regular expression. This derivative operation is the same as before for plain regular expressions, except that we now take special care of the bits needed to store the parse tree information:\\
%\begin{definition}{bder}
\begin{center}
\begin{tabular}{@{}lcl@{}}
  $(\textit{ZERO})\backslash c$ & $\dn$ & $\textit{ZERO}$\\
  $(\textit{ONE}\;bs)\backslash c$ & $\dn$ & $\textit{ZERO}$\\
  $(\textit{CHAR}\;bs\,d)\backslash c$ & $\dn$ &
     $\textit{if}\;c=d\; \;\textit{then}\;
      \textit{ONE}\;bs\;\textit{else}\;\textit{ZERO}$\\
  $(\textit{ALT}\;bs\,a_1\,a_2)\backslash c$ & $\dn$ &
     $\textit{ALT}\,bs\,(a_1\backslash c)\,(a_2\backslash c)$\\
  $(\textit{SEQ}\;bs\,a_1\,a_2)\backslash c$ & $\dn$ &
     $\textit{if}\;\textit{bnullable}\,a_1$\\
  & &$\textit{then}\;\textit{ALT}\,bs\,(\textit{SEQ}\,[]\,(a_1\backslash c)\,a_2)$\\
  & &$\phantom{\textit{then}\;\textit{ALT}\,bs\,}(\textit{fuse}\,(\textit{bmkeps}\,a_1)\,(a_2\backslash c))$\\
  & &$\textit{else}\;\textit{SEQ}\,bs\,(a_1\backslash c)\,a_2$\\
  $(\textit{STAR}\,bs\,a)\backslash c$ & $\dn$ &
     $\textit{SEQ}\;bs\,(\textit{fuse}\, [\Z] (a\backslash c))\,
      (\textit{STAR}\,[]\,a)$
\end{tabular}
\end{center}
37
+ − 622
%\end{definition}
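This derivative can be sketched executably as follows (our
illustration, not the authors' Isabelle code; it relies on
\textit{bmkeps}, defined in the text below, and on
\textit{bnullable}, the obvious lifting of \textit{nullable} to
annotated regular expressions, so both are included to keep the
snippet self-contained):

```python
# An executable sketch of the derivative on annotated regular
# expressions (our illustration).  Annotated regexes are tuples:
# ('AZERO',), ('AONE', bs), ('ACHAR', bs, c), ('AALT', bs, a1, a2),
# ('ASEQ', bs, a1, a2), ('ASTAR', bs, a).
Z, S = '0', '1'

def fuse(bs, a):                 # prepend bits bs onto annotated regex a
    return a if a[0] == 'AZERO' else (a[0], bs + a[1]) + a[2:]

def bnullable(a):
    t = a[0]
    return (t in ('AONE', 'ASTAR')
            or (t == 'AALT' and (bnullable(a[2]) or bnullable(a[3])))
            or (t == 'ASEQ' and bnullable(a[2]) and bnullable(a[3])))

def bmkeps(a):                   # as defined in the text below
    t = a[0]
    if t == 'AONE':  return a[1]
    if t == 'AALT':
        return a[1] + (bmkeps(a[2]) if bnullable(a[2]) else bmkeps(a[3]))
    if t == 'ASEQ':  return a[1] + bmkeps(a[2]) + bmkeps(a[3])
    if t == 'ASTAR': return a[1] + [S]

def bder(a, c):
    t = a[0]
    if t in ('AZERO', 'AONE'):
        return ('AZERO',)
    if t == 'ACHAR':
        return ('AONE', a[1]) if a[2] == c else ('AZERO',)
    if t == 'AALT':
        return ('AALT', a[1], bder(a[2], c), bder(a[3], c))
    if t == 'ASEQ':
        bs, a1, a2 = a[1], a[2], a[3]
        if bnullable(a1):        # a1 may match empty: two possibilities
            return ('AALT', bs, ('ASEQ', [], bder(a1, c), a2),
                                fuse(bmkeps(a1), bder(a2, c)))
        return ('ASEQ', bs, bder(a1, c), a2)
    if t == 'ASTAR':             # unroll one iteration, Z marks it
        return ('ASEQ', a[1], fuse([Z], bder(a[2], c)), ('ASTAR', [], a[2]))
```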

This way, we do not need an injection function and a second phase; instead we only have to collect the bits while running $\textit{bmkeps}$:
%\begin{definition}[\textit{bmkeps}]\mbox{}
\begin{center}
\begin{tabular}{lcl}
  $\textit{bmkeps}\,(\textit{ONE}\,bs)$ & $\dn$ & $bs$\\
  $\textit{bmkeps}\,(\textit{ALT}\,bs\,a_1\,a_2)$ & $\dn$ &
     $\textit{if}\;\textit{bnullable}\,a_1$\\
  & &$\textit{then}\;bs\,@\,\textit{bmkeps}\,a_1$\\
  & &$\textit{else}\;bs\,@\,\textit{bmkeps}\,a_2$\\
  $\textit{bmkeps}\,(\textit{SEQ}\,bs\,a_1\,a_2)$ & $\dn$ &
     $bs \,@\,\textit{bmkeps}\,a_1\,@\, \textit{bmkeps}\,a_2$\\
  $\textit{bmkeps}\,(\textit{STAR}\,bs\,a)$ & $\dn$ &
     $bs \,@\, [\S]$
\end{tabular}
\end{center}
%\end{definition}
and then decode the bits using the regular expression. After putting these pieces together, the whole process looks like this:\\
\begin{center}
\begin{tabular}{lcl}
  $\textit{blexer}\;r\,s$ & $\dn$ &
      $\textit{let}\;a = (r^\uparrow)\backslash s\;\textit{in}$\\
  & & $\;\;\textit{if}\; \textit{bnullable}(a)$\\
  & & $\;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,a)\,r$\\
  & & $\;\;\textit{else}\;\textit{None}$
\end{tabular}
\end{center}
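Putting the pieces together, the whole pipeline can be sketched as one
runnable illustration (our own tuple encoding with binary
alternatives and hypothetical names, not the formal Isabelle
development):

```python
# A runnable end-to-end sketch of blexer (our illustration).
Z, S = '0', '1'

def fuse(bs, a):
    return a if a[0] == 'AZERO' else (a[0], bs + a[1]) + a[2:]

def internalise(r):                  # (.)^up
    t = r[0]
    if t == 'ZERO': return ('AZERO',)
    if t == 'ONE':  return ('AONE', [])
    if t == 'CHAR': return ('ACHAR', [], r[1])
    if t == 'ALT':  return ('AALT', [], fuse([Z], internalise(r[1])),
                                        fuse([S], internalise(r[2])))
    if t == 'SEQ':  return ('ASEQ', [], internalise(r[1]), internalise(r[2]))
    if t == 'STAR': return ('ASTAR', [], internalise(r[1]))

def bnullable(a):
    t = a[0]
    return (t in ('AONE', 'ASTAR')
            or (t == 'AALT' and (bnullable(a[2]) or bnullable(a[3])))
            or (t == 'ASEQ' and bnullable(a[2]) and bnullable(a[3])))

def bmkeps(a):
    t = a[0]
    if t == 'AONE':  return a[1]
    if t == 'AALT':  return a[1] + (bmkeps(a[2]) if bnullable(a[2])
                                    else bmkeps(a[3]))
    if t == 'ASEQ':  return a[1] + bmkeps(a[2]) + bmkeps(a[3])
    if t == 'ASTAR': return a[1] + [S]

def bder(a, c):
    t = a[0]
    if t in ('AZERO', 'AONE'): return ('AZERO',)
    if t == 'ACHAR': return ('AONE', a[1]) if a[2] == c else ('AZERO',)
    if t == 'AALT':  return ('AALT', a[1], bder(a[2], c), bder(a[3], c))
    if t == 'ASEQ':
        if bnullable(a[2]):
            return ('AALT', a[1], ('ASEQ', [], bder(a[2], c), a[3]),
                                  fuse(bmkeps(a[2]), bder(a[3], c)))
        return ('ASEQ', a[1], bder(a[2], c), a[3])
    if t == 'ASTAR':
        return ('ASEQ', a[1], fuse([Z], bder(a[2], c)), ('ASTAR', [], a[2]))

def decode1(bs, r):                  # decode' (S ends a star, Z iterates)
    t = r[0]
    if t == 'ONE':  return ('Empty',), bs
    if t == 'CHAR': return ('Char', r[1]), bs
    if t == 'ALT':
        v, bs1 = decode1(bs[1:], r[1] if bs[0] == Z else r[2])
        return (('Left', v) if bs[0] == Z else ('Right', v)), bs1
    if t == 'SEQ':
        v1, bs1 = decode1(bs, r[1])
        v2, bs2 = decode1(bs1, r[2])
        return ('Seq', v1, v2), bs2
    if t == 'STAR':
        if bs[0] == S: return ('Stars', []), bs[1:]
        v, bs1 = decode1(bs[1:], r[1])
        (_, vs), bs2 = decode1(bs1, r)
        return ('Stars', [v] + vs), bs2

def blexer(r, s):
    a = internalise(r)
    for c in s:                      # successive derivatives
        a = bder(a, c)
    if bnullable(a):
        v, rest = decode1(bmkeps(a), r)
        return v if rest == [] else None
    return None
```

For instance, running \textit{blexer} with $a^*$ on the string $aa$
yields the value $\Stars\,[\Char\,a, \Char\,a]$.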

The main point of the bitsequences and annotated regular expressions
is that we can apply rather aggressive (in terms of size)
simplification rules in order to keep derivatives small.

We have developed such ``aggressive'' simplification rules and
generated test data showing that the expected bound can be achieved.
Obviously we can only partially cover the search space, as there are
infinitely many regular expressions and strings. One modification we
introduced is to allow a list of annotated regular expressions in the
\textit{ALTS} constructor. This allows us to not just delete
unnecessary $\ZERO$s and $\ONE$s from regular expressions, but also
unnecessary ``copies'' of regular expressions (very similar to
simplifying $r + r$ to just $r$, but in a more general setting).
A pseudocode version of our algorithm is given below:\\
\begin{algorithm}
\caption{Simplification of annotated regular expressions}\label{euclid}
\begin{algorithmic}[1]
\Procedure{$Simp$}{$areg$}
  \Switch{$areg$}
    \Case{$ALTS(bs, rs)$}
      \For{\textit{rs[i] in array rs}}
        \State $\textit{rs[i]} \gets$ \textit{Simp(rs[i])}
      \EndFor
      \For{\textit{rs[i] in array rs}}
        \If{$rs[i] == ALTS(bs', rs')$}
          \State $rs'' \gets$ attach bits $bs'$ to all elements in $rs'$
          \State Insert $rs''$ into $rs$ at position $i$ ($rs[i]$ is destroyed and replaced by its list of children regular expressions)
        \EndIf
      \EndFor
      \State Remove all duplicates in $rs$, keeping only the first copy of multiple occurrences of the same regular expression
      \State Remove all $ZERO$s in $rs$
      \If{$rs.length == 0$} \Return $ZERO$ \EndIf
      \If{$rs.length == 1$} \Return $rs[0]$ \EndIf
    \EndCase
    \Case{$SEQ(bs, r_1, r_2)$}
      \If{$r_1$ or $r_2$ is $ZERO$} \Return $ZERO$ \EndIf
      \State update $r_1$ and $r_2$ by attaching $bs$ to their front
      \If{$r_1$ or $r_2$ is $ONE(bs')$} \Return $r_2$ or $r_1$ \EndIf
    \EndCase
    \Case{$Others$}
      \Return $areg$ as it is
    \EndCase
  \EndSwitch
\EndProcedure
\end{algorithmic}
\end{algorithm}

With this simplification, the 8000 nodes in our previous $(a + aa)^*$ example are reduced to only 6.
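The simplification can be sketched executably as follows (our
illustration with hypothetical names, using the list-based
\textit{ALTS} constructor; for simplicity the sequence case only
eliminates a $\textit{ONE}$ appearing on the left, and duplicates are
compared including their bits):

```python
# An executable sketch of the simplification procedure (our
# illustration).  Alternatives are ('AALTS', bs, [a1, a2, ...]).
def fuse(bs, a):
    return a if a[0] == 'AZERO' else (a[0], bs + a[1]) + a[2:]

def simp(a):
    t = a[0]
    if t == 'AALTS':
        bs, rs = a[1], [simp(r) for r in a[2]]
        flat = []
        for r in rs:
            if r[0] == 'AALTS':      # flatten nested alternatives,
                flat += [fuse(r[1], c) for c in r[2]]   # fusing their bits
            elif r[0] != 'AZERO':    # drop ZEROs
                flat.append(r)
        seen, out = set(), []
        for r in flat:               # keep the first copy of duplicates
            if repr(r) not in seen:
                seen.add(repr(r))
                out.append(r)
        if not out:
            return ('AZERO',)
        if len(out) == 1:
            return fuse(bs, out[0])
        return ('AALTS', bs, out)
    if t == 'ASEQ':
        bs, a1, a2 = a[1], simp(a[2]), simp(a[3])
        if a1[0] == 'AZERO' or a2[0] == 'AZERO':
            return ('AZERO',)
        if a1[0] == 'AONE':          # a ONE on the left disappears
            return fuse(bs + a1[1], a2)
        return ('ASEQ', bs, a1, a2)
    return a                         # all other cases unchanged
```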

Another modification is that we use simplification rules inspired by
Antimirov's work on partial derivatives. They maintain the idea that
only the first ``copy'' of a regular expression in an alternative
contributes to the calculation of a POSIX value. All subsequent copies
can be pruned from the regular expression.

We are currently engaged in proving that our simplification rules
actually do not affect the POSIX value that should be generated by the
algorithm according to the specification of a POSIX value, and
furthermore that the derivatives stay small for all inputs. For this
proof we use the theorem prover Isabelle. Once completed, this result
will advance the state-of-the-art: Sulzmann and Lu wrote in their
paper \cite{Sulzmann2014} about the bitcoded ``incremental parsing
method'' (that is the matching algorithm outlined in this section):

\begin{quote}\it
``Correctness Claim: We further claim that the incremental parsing
method in Figure~5 in combination with the simplification steps in
Figure 6 yields POSIX parse trees. We have tested this claim
extensively by using the method in Figure~3 as a reference but yet
have to work out all proof details.''
\end{quote}

\noindent
We would settle this correctness claim and furthermore obtain a much
tighter bound on the sizes of derivatives. The result is that our
algorithm should be correct and fast on all inputs. The exponential
blow-up, as observed in JavaScript, Python and Java, cannot happen in
our algorithm.

\section{Conclusion}

In this PhD-project we are interested in fast algorithms for regular
expression matching. While this seems to be a ``settled'' area, in
fact interesting research questions are popping up as soon as one
steps outside the classic automata theory (for example in terms of
what kind of regular expressions are supported). The reason why it is
interesting for us to look at the derivative approach introduced by
Brzozowski for regular expression matching, and then much further
developed by Sulzmann and Lu, is that derivatives can elegantly deal
with some of the regular expressions that are of interest in ``real
life''. This includes the not-regular expression, written $\neg\,r$
(that is, all strings that are not recognised by $r$), but also
bounded regular expressions such as $r^{\{n\}}$ and $r^{\{n..m\}}$.
There is also hope that the derivatives can provide another angle for
how to deal more efficiently with back-references, which are one of
the reasons why regular expression engines in JavaScript, Python and
Java choose to not implement the classic automata approach of
transforming regular expressions into NFAs and then DFAs---because we
simply do not know how such back-references can be represented by
DFAs.


\bibliographystyle{plain}
\bibliography{root}


\end{document}