\documentclass[a4paper,UKenglish]{lipics}
\usepackage{graphicx}
%\usepackage{data}
\usepackage{tikz-cd}
\usepackage{tikz}
\usetikzlibrary{graphs}
\usetikzlibrary{graphdrawing}
\usegdlibrary{trees}
%\usepackage{algorithm}
\usepackage{amsmath}
\usepackage{xcolor}
\usepackage[noend]{algpseudocode}
\usepackage{enumitem}
\usepackage{nccmath}
\usepackage{soul}

\definecolor{darkblue}{rgb}{0,0,0.6}
\hypersetup{colorlinks=true,allcolors=darkblue}
\newcommand{\comment}[1]%
{{\color{red}$\Rightarrow$}\marginpar{\raggedright\small{\bf\color{red}#1}}}

% \documentclass{article}
%\usepackage[utf8]{inputenc}
%\usepackage[english]{babel}
%\usepackage{listings}
% \usepackage{amsthm}
%\usepackage{hyperref}
% \usepackage[margin=0.5in]{geometry}
%\usepackage{pmboxdraw}

\title{POSIX Regular Expression Matching and Lexing}
\author{Chengsong Tan}
\affil{King's College London\\
London, UK\\
\texttt{chengsong.tan@kcl.ac.uk}}
\authorrunning{Chengsong Tan}
\Copyright{Chengsong Tan}

\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%
\newcommand{\ZERO}{\mbox{\bf 0}}
\newcommand{\ONE}{\mbox{\bf 1}}
\def\erase{\textit{erase}}
\def\bders{\textit{bders}}
\def\lexer{\mathit{lexer}}
\def\blexer{\textit{blexer}}
\def\fuse{\textit{fuse}}
\def\flatten{\textit{flatten}}
\def\map{\textit{map}}
\def\blexers{\mathit{blexer\_simp}}
\def\simp{\mathit{simp}}
\def\mkeps{\mathit{mkeps}}
\def\bmkeps{\textit{bmkeps}}
\def\inj{\mathit{inj}}
\def\Empty{\mathit{Empty}}
\def\Left{\mathit{Left}}
\def\Right{\mathit{Right}}
\def\Stars{\mathit{Stars}}
\def\Char{\mathit{Char}}
\def\Seq{\mathit{Seq}}
\def\Der{\mathit{Der}}
\def\nullable{\mathit{nullable}}
\def\Z{\mathit{Z}}
\def\S{\mathit{S}}
\def\flex{\textit{flex}}
\def\rup{r^\uparrow}
\def\retrieve{\textit{retrieve}}
\def\AALTS{\textit{AALTS}}
\def\AONE{\textit{AONE}}
%\theoremstyle{theorem}
%\newtheorem{theorem}{Theorem}
%\theoremstyle{lemma}
%\newtheorem{lemma}{Lemma}
%\newcommand{\lemmaautorefname}{Lemma}
%\theoremstyle{definition}
%\newtheorem{definition}{Definition}
\algnewcommand\algorithmicswitch{\textbf{switch}}
\algnewcommand\algorithmiccase{\textbf{case}}
\algnewcommand\algorithmicassert{\texttt{assert}}
\algnewcommand\Assert[1]{\State \algorithmicassert(#1)}%
% New "environments"
\algdef{SE}[SWITCH]{Switch}{EndSwitch}[1]{\algorithmicswitch\ #1\ \algorithmicdo}{\algorithmicend\ \algorithmicswitch}%
\algdef{SE}[CASE]{Case}{EndCase}[1]{\algorithmiccase\ #1}{\algorithmicend\ \algorithmiccase}%
\algtext*{EndSwitch}%
\algtext*{EndCase}%


\begin{document}

\maketitle

\begin{abstract}
Brzozowski introduced in 1964 a beautifully simple algorithm for
regular expression matching based on the notion of derivatives of
regular expressions. In 2014, Sulzmann and Lu extended this
algorithm to not just give a YES/NO answer for whether or not a
regular expression matches a string, but in case it does also
answer with \emph{how} it matches the string. This is important for
applications such as lexing (tokenising a string). The problem is to
make the algorithm by Sulzmann and Lu fast on all inputs without
breaking its correctness. Being fast depends on a complete set of
simplification rules, some of which
have been put forward by Sulzmann and Lu. We have extended their
rules in order to obtain a tight bound on the size of regular expressions.
We have tested these extended rules, but have not
formally established their correctness. We have also not yet looked
at extended regular expressions, such as bounded repetitions,
negation and back-references.
\end{abstract}


\section{Introduction}
%Regular expressions' derivatives, which have received
%renewed interest in the new millennium, is a beautiful....
While we believe derivatives of regular expressions, written
$r\backslash s$, are a beautiful concept (in terms of ease of
implementing them in functional programming languages and in terms of
reasoning about them formally), they have one major drawback: every
derivative step can make regular expressions grow drastically in
size. This in turn has a negative effect on the runtime of the
corresponding lexing algorithms. Consider for example the regular
expression $(a+aa)^*$ and the short string $aaaaaaaaaaaa$. The
corresponding derivative already contains 8668 nodes when the
derivative is given as a tree. The reason for the poor runtime of
the derivative-based lexing algorithms is that they need to traverse
such trees over and over again. The solution is to find a complete set
of simplification rules that keep the sizes of derivatives uniformly
small.

This has been partially addressed by the function $\blexer_{simp}$,
which after simplification reduces the roughly 8000 nodes of the
$(a+aa)^*$ example to just 6, a size that stays constant in each
derivative step.
The part that still needs more work is the correctness proof of this
function, namely,
\begin{equation}\label{mainthm}
\blexers \; r \; s = \blexer \;r\;s
\end{equation}

\noindent
and this is what this report is mainly about. A condensed
version of the last report will be provided in the next section
to help the reader understand the report, and the attempts
on the problem will follow.


\section{Recapitulation of Concepts From the Last Report}

\subsection*{Regular Expressions and Derivatives}
Suppose (basic) regular expressions are given by the following grammar:

\[ r ::= \ZERO \mid \ONE
   \mid c
   \mid r_1 \cdot r_2
   \mid r_1 + r_2
   \mid r^*
\]

\noindent
The ingenious contribution of Brzozowski is the notion of \emph{derivatives} of
regular expressions, written~$\_ \backslash \_$. It uses the auxiliary notion of
$\nullable$ defined below.

\begin{center}
\begin{tabular}{lcl}
$\nullable(\ZERO)$     & $\dn$ & $\mathit{false}$ \\
$\nullable(\ONE)$      & $\dn$ & $\mathit{true}$ \\
$\nullable(c)$         & $\dn$ & $\mathit{false}$ \\
$\nullable(r_1 + r_2)$ & $\dn$ & $\nullable(r_1) \vee \nullable(r_2)$ \\
$\nullable(r_1\cdot r_2)$ & $\dn$ & $\nullable(r_1) \wedge \nullable(r_2)$ \\
$\nullable(r^*)$       & $\dn$ & $\mathit{true}$ \\
\end{tabular}
\end{center}

\begin{center}
\begin{tabular}{lcl}
$\ZERO \backslash c$ & $\dn$ & $\ZERO$\\
$\ONE \backslash c$  & $\dn$ & $\ZERO$\\
$d \backslash c$     & $\dn$ &
$\mathit{if} \;c = d\;\mathit{then}\;\ONE\;\mathit{else}\;\ZERO$\\
$(r_1 + r_2)\backslash c$     & $\dn$ & $r_1 \backslash c \,+\, r_2 \backslash c$\\
$(r_1 \cdot r_2)\backslash c$ & $\dn$ & $\mathit{if} \, \nullable(r_1)$\\
  & & $\mathit{then}\;(r_1\backslash c) \cdot r_2 \,+\, r_2\backslash c$\\
  & & $\mathit{else}\;(r_1\backslash c) \cdot r_2$\\
$(r^*)\backslash c$           & $\dn$ & $(r\backslash c) \cdot r^*$\\
\end{tabular}
\end{center}
\noindent
This definition describes how a regular expression evolves into a new
regular expression after a head character $c$ is chopped off all the
strings it matches.

The main property of the derivative operation is that

\begin{center}
$c\!::\!s \in L(r)$ holds
if and only if $s \in L(r\backslash c)$.
\end{center}

\noindent
We can generalise the derivative operation shown above for single characters
to strings as follows:

\begin{center}
\begin{tabular}{lcl}
$r \backslash (c\!::\!s) $ & $\dn$ & $(r \backslash c) \backslash s$ \\
$r \backslash [\,] $ & $\dn$ & $r$
\end{tabular}
\end{center}

\noindent
and then define Brzozowski's regular-expression matching algorithm as:

\[
\textit{match}\;s\;r \;\dn\; \nullable(r\backslash s)
\]

\noindent
Assuming a string is given as a sequence of characters, say $c_0c_1 \ldots c_{n-1}$,
this algorithm can be presented graphically as follows:

\begin{equation}\label{graph:*}
\begin{tikzcd}
r_0 \arrow[r, "\backslash c_0"]  & r_1 \arrow[r, "\backslash c_1"] & r_2 \arrow[r, dashed]  & r_n  \arrow[r,"\textit{nullable}?"] & \;\textrm{YES}/\textrm{NO}
\end{tikzcd}
\end{equation}

\noindent
where we start with a regular expression $r_0$, build successive
derivatives until we exhaust the string and then use \textit{nullable}
to test whether the result can match the empty string. It can be
relatively easily shown that this matcher is correct (that is given
an $s = c_0...c_{n-1}$ and an $r_0$, it generates YES if and only if $s \in L(r_0)$).
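To make this concrete, the matcher can be sketched in a few lines of Python (our own illustration with our own encoding, not the formalisation discussed in this report); regular expressions are represented as tagged tuples:

```python
# Regular expressions as tagged tuples:
# ('0',) = ZERO, ('1',) = ONE, ('c', ch) = character,
# ('+', r1, r2) = alternative, ('.', r1, r2) = sequence, ('*', r) = star.

def nullable(r):
    t = r[0]
    if t == '1' or t == '*': return True
    if t == '+': return nullable(r[1]) or nullable(r[2])
    if t == '.': return nullable(r[1]) and nullable(r[2])
    return False                          # ZERO and characters

def der(c, r):
    t = r[0]
    if t in ('0', '1'): return ('0',)
    if t == 'c': return ('1',) if r[1] == c else ('0',)
    if t == '+': return ('+', der(c, r[1]), der(c, r[2]))
    if t == '.':
        if nullable(r[1]):
            return ('+', ('.', der(c, r[1]), r[2]), der(c, r[2]))
        return ('.', der(c, r[1]), r[2])
    if t == '*': return ('.', der(c, r[1]), r)

def ders(s, r):
    for c in s:
        r = der(c, r)
    return r

def matches(s, r):
    return nullable(ders(s, r))

def size(r):
    # number of nodes when the regex is viewed as a tree
    return 1 + sum(size(x) for x in r[1:] if isinstance(x, tuple))

a = ('c', 'a')
r = ('*', ('+', a, ('.', a, a)))          # (a + aa)*
print(matches('aaaaaa', r))               # True
print(size(ders('a' * 12, r)))            # derivatives blow up quickly
```

Running this on the $(a+aa)^*$ example reproduces the size explosion mentioned above: twelve derivative steps already yield a tree with thousands of nodes.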


\subsection*{Values and the Lexing Algorithm by Sulzmann and Lu}

One limitation of Brzozowski's algorithm is that it only produces a
YES/NO answer for whether a string is being matched by a regular
expression. Sulzmann and Lu~\cite{Sulzmann2014} extended this algorithm
to allow generation of an actual matching, called a \emph{value} or
sometimes also \emph{lexical value}. These values and regular
expressions correspond to each other as illustrated in the following
table:


\begin{center}
\begin{tabular}{c@{\hspace{20mm}}c}
\begin{tabular}{@{}rrl@{}}
\multicolumn{3}{@{}l}{\textbf{Regular Expressions}}\medskip\\
$r$ & $::=$  & $\ZERO$\\
    & $\mid$ & $\ONE$   \\
    & $\mid$ & $c$          \\
    & $\mid$ & $r_1 \cdot r_2$\\
    & $\mid$ & $r_1 + r_2$   \\
\\
    & $\mid$ & $r^*$         \\
\end{tabular}
&
\begin{tabular}{@{\hspace{0mm}}rrl@{}}
\multicolumn{3}{@{}l}{\textbf{Values}}\medskip\\
$v$ & $::=$  & \\
    &        & $\Empty$   \\
    & $\mid$ & $\Char(c)$          \\
    & $\mid$ & $\Seq\,v_1\, v_2$\\
    & $\mid$ & $\Left(v)$   \\
    & $\mid$ & $\Right(v)$  \\
    & $\mid$ & $\Stars\,[v_1,\ldots\,v_n]$  \\
\end{tabular}
\end{tabular}
\end{center}

\noindent
The contribution of Sulzmann and Lu is an extension of Brzozowski's
algorithm by a second phase (the first phase being building successive
derivatives---see \eqref{graph:*}). In this second phase, a POSIX value
is generated in case the regular expression matches the string.
Pictorially, the Sulzmann and Lu algorithm is as follows:

\begin{ceqn}
\begin{equation}\label{graph:2}
\begin{tikzcd}
r_0 \arrow[r, "\backslash c_0"]  \arrow[d] & r_1 \arrow[r, "\backslash c_1"] \arrow[d] & r_2 \arrow[r, dashed] \arrow[d] & r_n \arrow[d, "mkeps" description] \\
v_0           & v_1 \arrow[l,"inj_{r_0} c_0"]                & v_2 \arrow[l, "inj_{r_1} c_1"]              & v_n \arrow[l, dashed]
\end{tikzcd}
\end{equation}
\end{ceqn}

\noindent
For convenience, we shall employ the following notations: the regular
expression we start with is $r_0$, and the given string $s$ is composed
of characters $c_0 c_1 \ldots c_{n-1}$. In the first phase from the
left to right, we build the derivatives $r_1$, $r_2$, \ldots according
to the characters $c_0$, $c_1$ until we exhaust the string and obtain
the derivative $r_n$. We test whether this derivative is
$\textit{nullable}$ or not. If not, we know the string does not match
$r$ and no value needs to be generated. If yes, we start building the
values incrementally by \emph{injecting} back the characters into the
earlier values $v_n, \ldots, v_0$. This is the second phase of the
algorithm from the right to left. For the first value $v_n$, we call the
function $\textit{mkeps}$, which builds the lexical value
for how the empty string has been matched by the (nullable) regular
expression $r_n$. This function is defined as

\begin{center}
\begin{tabular}{lcl}
$\mkeps(\ONE)$        & $\dn$ & $\Empty$ \\
$\mkeps(r_{1}+r_{2})$ & $\dn$
  & \textit{if} $\nullable(r_{1})$\\
  & & \textit{then} $\Left(\mkeps(r_{1}))$\\
  & & \textit{else} $\Right(\mkeps(r_{2}))$\\
$\mkeps(r_1\cdot r_2)$ & $\dn$ & $\Seq\,(\mkeps\,r_1)\,(\mkeps\,r_2)$\\
$\mkeps(r^*)$          & $\dn$ & $\Stars\,[]$
\end{tabular}
\end{center}


\noindent
After the $\mkeps$-call, we inject back the characters one by one in order to build
the lexical value $v_i$ for how the regex $r_i$ matches the string $s_i$
($s_i = c_i \ldots c_{n-1}$) from the previous lexical value $v_{i+1}$.
After injecting back $n$ characters, we get the lexical value for how $r_0$
matches $s$. For this Sulzmann and Lu defined a function that reverses
the ``chopping off'' of characters during the derivative phase. The
corresponding function is called \emph{injection}, written
$\textit{inj}$; it takes three arguments: the first one is a regular
expression ${r_{i-1}}$, before the character is chopped off, the second
is a character ${c_{i-1}}$, the character we want to inject and the
third argument is the value ${v_i}$, into which one wants to inject the
character (it corresponds to the regular expression after the character
has been chopped off). The result of this function is a new value. The
definition of $\textit{inj}$ is as follows:

\begin{center}
\begin{tabular}{l@{\hspace{1mm}}c@{\hspace{1mm}}l}
  $\textit{inj}\,(c)\,c\,\Empty$            & $\dn$ & $\Char\,c$\\
  $\textit{inj}\,(r_1 + r_2)\,c\,\Left(v)$ & $\dn$ & $\Left(\textit{inj}\,r_1\,c\,v)$\\
  $\textit{inj}\,(r_1 + r_2)\,c\,\Right(v)$ & $\dn$ & $\Right(\textit{inj}\,r_2\,c\,v)$\\
  $\textit{inj}\,(r_1 \cdot r_2)\,c\,\Seq(v_1,v_2)$ & $\dn$  & $\Seq(\textit{inj}\,r_1\,c\,v_1,v_2)$\\
  $\textit{inj}\,(r_1 \cdot r_2)\,c\,\Left(\Seq(v_1,v_2))$ & $\dn$  & $\Seq(\textit{inj}\,r_1\,c\,v_1,v_2)$\\
  $\textit{inj}\,(r_1 \cdot r_2)\,c\,\Right(v)$ & $\dn$  & $\Seq(\textit{mkeps}(r_1),\textit{inj}\,r_2\,c\,v)$\\
  $\textit{inj}\,(r^*)\,c\,\Seq(v,\Stars\,vs)$ & $\dn$  & $\Stars((\textit{inj}\,r\,c\,v)\,::\,vs)$\\
\end{tabular}
\end{center}

\noindent This definition is by recursion on the ``shape'' of regular
expressions and values.
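The two phases can be sketched together in Python (again our own tuple encoding, not the verified definitions); values are tagged tuples such as \texttt{('Left', v)} or \texttt{('Stars', [\dots])}:

```python
# A compact sketch of the two-phase lexer: derivatives forward, then
# mkeps once, then inj backwards. Regexes: ('0',) ('1',) ('c',ch)
# ('+',r1,r2) ('.',r1,r2) ('*',r). Values: ('Empty',) ('Char',c)
# ('Left',v) ('Right',v) ('Seq',v1,v2) ('Stars',[v,...]).

def nullable(r):
    t = r[0]
    if t == '1' or t == '*': return True
    if t == '+': return nullable(r[1]) or nullable(r[2])
    if t == '.': return nullable(r[1]) and nullable(r[2])
    return False

def der(c, r):
    t = r[0]
    if t in ('0', '1'): return ('0',)
    if t == 'c': return ('1',) if r[1] == c else ('0',)
    if t == '+': return ('+', der(c, r[1]), der(c, r[2]))
    if t == '.':
        if nullable(r[1]):
            return ('+', ('.', der(c, r[1]), r[2]), der(c, r[2]))
        return ('.', der(c, r[1]), r[2])
    if t == '*': return ('.', der(c, r[1]), r)

def mkeps(r):
    t = r[0]
    if t == '1': return ('Empty',)
    if t == '+':
        return ('Left', mkeps(r[1])) if nullable(r[1]) else ('Right', mkeps(r[2]))
    if t == '.': return ('Seq', mkeps(r[1]), mkeps(r[2]))
    if t == '*': return ('Stars', [])

def inj(r, c, v):
    if r[0] == 'c' and v[0] == 'Empty': return ('Char', c)
    if r[0] == '+' and v[0] == 'Left':  return ('Left', inj(r[1], c, v[1]))
    if r[0] == '+' and v[0] == 'Right': return ('Right', inj(r[2], c, v[1]))
    if r[0] == '.' and v[0] == 'Seq':   return ('Seq', inj(r[1], c, v[1]), v[2])
    if r[0] == '.' and v[0] == 'Left':  # value is Left(Seq(v1, v2))
        return ('Seq', inj(r[1], c, v[1][1]), v[1][2])
    if r[0] == '.' and v[0] == 'Right': # r1 matched empty, c went into r2
        return ('Seq', mkeps(r[1]), inj(r[2], c, v[1]))
    if r[0] == '*' and v[0] == 'Seq':   # value is Seq(v, Stars vs)
        return ('Stars', [inj(r[1], c, v[1])] + v[2][1])

def lexer(r, s):
    if s == "":
        return mkeps(r) if nullable(r) else None
    v = lexer(der(s[0], r), s[1:])
    return None if v is None else inj(r, s[0], v)

a = ('c', 'a')
r = ('*', ('+', a, ('.', a, a)))   # (a + aa)*
print(lexer(r, 'aa'))              # a single star iteration matching "aa"
```

On $(a+aa)^*$ and the string $aa$ this produces $\Stars\,[\Right(\Seq(\Char\,a,\Char\,a))]$, i.e.\ the POSIX choice of one iteration consuming both characters rather than two iterations of $a$.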



\subsection*{Simplification of Regular Expressions}

The main drawback of building successive derivatives according
to Brzozowski's definition is that they can grow very quickly in size.
This is mainly due to the fact that the derivative operation often
generates ``useless'' $\ZERO$s and $\ONE$s in derivatives. As a result, if
implemented naively both algorithms by Brzozowski and by Sulzmann and Lu
are excruciatingly slow. For example when starting with the regular
expression $(a + aa)^*$ and building 12 successive derivatives
w.r.t.~the character $a$, one obtains a derivative regular expression
with more than 8000 nodes (when viewed as a tree). Operations like
$\textit{der}$ and $\nullable$ need to traverse such trees and
consequently the bigger the size of the derivative the slower the
algorithm.

Fortunately, one can simplify regular expressions after each derivative
step. Various simplifications of regular expressions are possible, such
as the simplification of $\ZERO + r$, $r + \ZERO$, $\ONE\cdot r$, $r
\cdot \ONE$, and $r + r$ to just $r$. These simplifications do not
affect the answer for whether a regular expression matches a string or
not, but fortunately also do not affect the POSIX strategy of how
regular expressions match strings---although the latter is much harder
to establish. Some initial results in this regard have been
obtained in \cite{AusafDyckhoffUrban2016}.

If we want the size of derivatives in Sulzmann and Lu's algorithm to
stay below this bound, we would need more aggressive simplifications.
Essentially we need to delete useless $\ZERO$s and $\ONE$s, as well as
delete duplicates whenever possible. For example, the parentheses in
$(a+b) \cdot c + b\cdot c$ can be opened up to get $a\cdot c + b \cdot c + b
\cdot c$, and then simplified to just $a \cdot c + b \cdot c$. Another
example is simplifying $(a^*+a) + (a^*+ \ONE) + (a +\ONE)$ to just
$a^*+a+\ONE$. Adding these more aggressive simplification rules helps us
to achieve the same size bound as that of the partial derivatives.
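A sketch of this kind of simplification in Python (our own encoding; the exact node counts depend on the representation and are not the formally verified bound): alternatives carry a tuple of children so that nested alternatives can be flattened, $\ZERO$s removed and duplicates deleted.

```python
# Regexes as tagged tuples; alternatives are n-ary: ('+', (r, ...)).
# ('0',) ('1',) ('c',ch) ('+',(r,...)) ('.',r1,r2) ('*',r)

def nullable(r):
    t = r[0]
    if t == '1' or t == '*': return True
    if t == '+': return any(nullable(x) for x in r[1])
    if t == '.': return nullable(r[1]) and nullable(r[2])
    return False

def der(c, r):
    t = r[0]
    if t in ('0', '1'): return ('0',)
    if t == 'c': return ('1',) if r[1] == c else ('0',)
    if t == '+': return ('+', tuple(der(c, x) for x in r[1]))
    if t == '.':
        head = ('.', der(c, r[1]), r[2])
        return ('+', (head, der(c, r[2]))) if nullable(r[1]) else head
    if t == '*': return ('.', der(c, r[1]), r)

def simp(r):
    t = r[0]
    if t == '.':
        s1, s2 = simp(r[1]), simp(r[2])
        if s1 == ('0',) or s2 == ('0',): return ('0',)   # 0 . r = 0
        if s1 == ('1',): return s2                       # 1 . r = r
        if s2 == ('1',): return s1                       # r . 1 = r
        return ('.', s1, s2)
    if t == '+':
        flat, seen = [], set()
        for x in map(simp, r[1]):
            xs = x[1] if x[0] == '+' else (x,)   # flatten nested alternatives
            for y in xs:
                if y != ('0',) and y not in seen:  # drop ZEROs and duplicates
                    seen.add(y); flat.append(y)
        if not flat: return ('0',)
        return flat[0] if len(flat) == 1 else ('+', tuple(flat))
    return r

def ders_simp(s, r):
    for c in s:
        r = simp(der(c, r))
    return r

def size(r):
    t = r[0]
    if t == '+': return 1 + sum(size(x) for x in r[1])
    if t == '.': return 1 + size(r[1]) + size(r[2])
    if t == '*': return 1 + size(r[1])
    return 1

a = ('c', 'a')
r = ('*', ('+', (a, ('.', a, a))))   # (a + aa)*
print(size(ders_simp('a' * 12, r)))  # stays at a small constant size
```

With these rules the derivatives of $(a+aa)^*$ no longer blow up: the size stabilises at a small constant after a couple of steps, whereas without simplification it grows into the thousands.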


Suppose we apply simplification after each derivative step, and view
these two operations as an atomic one: $a \backslash_{simp}\,c \dn
\textit{simp}(a \backslash c)$. Then we can use the previous natural
extension from derivative w.r.t.~character to derivative
w.r.t.~string:%\comment{simp in the [] case?}

\begin{center}
\begin{tabular}{lcl}
$r \backslash_{simp} (c\!::\!s) $ & $\dn$ & $(r \backslash_{simp}\, c) \backslash_{simp}\, s$ \\
$r \backslash_{simp} [\,] $ & $\dn$ & $r$
\end{tabular}
\end{center}

\noindent
With this we obtain an optimised version of the algorithm:

\begin{center}
\begin{tabular}{lcl}
  $\textit{blexer\_simp}\;r\,s$ & $\dn$ &
      $\textit{let}\;a = (r^\uparrow)\backslash_{simp}\, s\;\textit{in}$\\
  & & $\;\;\textit{if}\; \textit{bnullable}(a)$\\
  & & $\;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,a)\,r$\\
  & & $\;\;\textit{else}\;\textit{None}$
\end{tabular}
\end{center}

\noindent
This algorithm keeps the regular expression size small: with this
simplification the roughly 8000 nodes of our previous $(a + aa)^*$
example are reduced to just 6, a size that stays constant no matter how
long the input string is.


In order to implement the idea of ``spilling out alternatives'' and to
make them compatible with the $\textit{inj}$-mechanism, we use
\emph{bitcodes}. They were first introduced by Sulzmann and Lu.
Here bits and bitcodes (lists of bits) are defined as:

\begin{center}
$b ::=   1 \mid  0 \qquad
bs ::= [] \mid b::bs
$
\end{center}

\noindent
The $1$ and $0$ are not in bold in order to avoid
confusion with the regular expressions $\ZERO$ and $\ONE$. Bitcodes (or
bit-lists) can be used to encode values (or potentially incomplete values) in a
compact form. This can be straightforwardly seen in the following
coding function from values to bitcodes:

\begin{center}
\begin{tabular}{lcl}
  $\textit{code}(\Empty)$ & $\dn$ & $[]$\\
  $\textit{code}(\Char\,c)$ & $\dn$ & $[]$\\
  $\textit{code}(\Left\,v)$ & $\dn$ & $0 :: code(v)$\\
  $\textit{code}(\Right\,v)$ & $\dn$ & $1 :: code(v)$\\
  $\textit{code}(\Seq\,v_1\,v_2)$ & $\dn$ & $code(v_1) \,@\, code(v_2)$\\
  $\textit{code}(\Stars\,[])$ & $\dn$ & $[0]$\\
  $\textit{code}(\Stars\,(v\!::\!vs))$ & $\dn$ & $1 :: code(v) \;@\;
                                                 code(\Stars\,vs)$
\end{tabular}
\end{center}

\noindent
Here $\textit{code}$ encodes a value into a bitcode by converting
$\Left$ into $0$, $\Right$ into $1$, and marking the start of a non-empty
star iteration by $1$. The border where a local star terminates
is marked by $0$. This coding is lossy, as it throws away the information about
characters, and also does not encode the ``boundary'' between two
sequence values. Moreover, with only the bitcode we cannot even tell
whether the $1$s and $0$s are for $\Left/\Right$ or $\Stars$. The
reason for choosing this compact way of storing information is that the
relatively small size of bits can be easily manipulated and ``moved
around'' in a regular expression. In order to recover values, we will
need the corresponding regular expression as extra information. This
means the decoding function is defined as:

%\begin{definition}[Bitdecoding of Values]\mbox{}
\begin{center}
\begin{tabular}{@{}l@{\hspace{1mm}}c@{\hspace{1mm}}l@{}}
  $\textit{decode}'\,bs\,(\ONE)$ & $\dn$ & $(\Empty, bs)$\\
  $\textit{decode}'\,bs\,(c)$ & $\dn$ & $(\Char\,c, bs)$\\
  $\textit{decode}'\,(0\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
     $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}\;
       (\Left\,v, bs_1)$\\
  $\textit{decode}'\,(1\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
     $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_2\;\textit{in}\;
       (\Right\,v, bs_1)$\\
  $\textit{decode}'\,bs\;(r_1\cdot r_2)$ & $\dn$ &
        $\textit{let}\,(v_1, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}$\\
  & &   $\textit{let}\,(v_2, bs_2) = \textit{decode}'\,bs_1\,r_2$\\
  & &   \hspace{35mm}$\textit{in}\;(\Seq\,v_1\,v_2, bs_2)$\\
  $\textit{decode}'\,(0\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\
  $\textit{decode}'\,(1\!::\!bs)\,(r^*)$ & $\dn$ &
         $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r\;\textit{in}$\\
  & &   $\textit{let}\,(\Stars\,vs, bs_2) = \textit{decode}'\,bs_1\,r^*$\\
  & &   \hspace{35mm}$\textit{in}\;(\Stars\,v\!::\!vs, bs_2)$\bigskip\\

  $\textit{decode}\,bs\,r$ & $\dn$ &
     $\textit{let}\,(v, bs') = \textit{decode}'\,bs\,r\;\textit{in}$\\
  & & $\textit{if}\;bs' = []\;\textit{then}\;\textit{Some}\,v\;
       \textit{else}\;\textit{None}$
\end{tabular}
\end{center}
%\end{definition}
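The coding and decoding functions can be sketched directly in Python (our own tuple encoding of values and regular expressions, mirroring the definitions above):

```python
# Values:  ('Empty',) ('Char',c) ('Left',v) ('Right',v)
#          ('Seq',v1,v2) ('Stars',[v,...])
# Regexes: ('1',) ('c',ch) ('+',r1,r2) ('.',r1,r2) ('*',r)

def code(v):
    t = v[0]
    if t in ('Empty', 'Char'): return []      # lossy: characters dropped
    if t == 'Left':  return [0] + code(v[1])
    if t == 'Right': return [1] + code(v[1])
    if t == 'Seq':   return code(v[1]) + code(v[2])
    if t == 'Stars':
        bs = []
        for x in v[1]:
            bs += [1] + code(x)               # 1 marks one more iteration
        return bs + [0]                       # 0 marks the end of the star

def decode1(bs, r):
    # decode', returning (value, remaining bits); the regex r supplies
    # the structure that the bitcode alone cannot express
    t = r[0]
    if t == '1': return ('Empty',), bs
    if t == 'c': return ('Char', r[1]), bs
    if t == '+':
        v, rest = decode1(bs[1:], r[1] if bs[0] == 0 else r[2])
        return (('Left', v) if bs[0] == 0 else ('Right', v)), rest
    if t == '.':
        v1, bs1 = decode1(bs, r[1])
        v2, bs2 = decode1(bs1, r[2])
        return ('Seq', v1, v2), bs2
    if t == '*':
        if bs[0] == 0: return ('Stars', []), bs[1:]
        v, bs1 = decode1(bs[1:], r[1])
        stars, bs2 = decode1(bs1, r)
        return ('Stars', [v] + stars[1]), bs2

def decode(bs, r):
    v, rest = decode1(bs, r)
    return v if rest == [] else None

a = ('c', 'a')
r = ('*', ('+', a, ('.', a, a)))                       # (a + aa)*
v = ('Stars', [('Right', ('Seq', ('Char','a'), ('Char','a')))])
print(code(v))                                         # [1, 1, 0]
print(decode(code(v), r) == v)                         # True
```

The round trip illustrates the lossiness: the three bits $[1,1,0]$ say nothing about characters or sequence boundaries, yet together with the regular expression $(a+aa)^*$ they determine the value uniquely.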

Sulzmann and Lu integrated the bitcodes into regular expressions to
create annotated regular expressions \cite{Sulzmann2014}.
\emph{Annotated regular expressions} are defined by the following
grammar:%\comment{ALTS should have an $as$ in the definitions, not just $a_1$ and $a_2$}

\begin{center}
\begin{tabular}{lcl}
  $\textit{a}$ & $::=$  & $\ZERO$\\
               & $\mid$ & $_{bs}\ONE$\\
               & $\mid$ & $_{bs}{\bf c}$\\
               & $\mid$ & $_{bs}\oplus\,as$\\
               & $\mid$ & $_{bs}a_1\cdot a_2$\\
               & $\mid$ & $_{bs}a^*$
\end{tabular}
\end{center}
%(in \textit{ALTS})

\noindent
where $bs$ stands for bitcodes, $a$ for $\textbf{a}$nnotated regular
expressions and $as$ for a list of annotated regular expressions.
The alternative constructor ($\oplus$) has been generalised to
accept a list of annotated regular expressions rather than just two.
We will show that these bitcodes encode information about
the (POSIX) value that should be generated by the Sulzmann and Lu
algorithm.


To do lexing using annotated regular expressions, we shall first
transform the usual (un-annotated) regular expressions into annotated
regular expressions. This operation is called \emph{internalisation} and
defined as follows:

%\begin{definition}
\begin{center}
\begin{tabular}{lcl}
  $(\ZERO)^\uparrow$ & $\dn$ & $\ZERO$\\
  $(\ONE)^\uparrow$ & $\dn$ & $_{[]}\ONE$\\
  $(c)^\uparrow$ & $\dn$ & $_{[]}{\bf c}$\\
  $(r_1 + r_2)^\uparrow$ & $\dn$ &
         $_{[]}\oplus[\textit{fuse}\,[0]\,r_1^\uparrow,\,
                      \textit{fuse}\,[1]\,r_2^\uparrow]$\\
  $(r_1\cdot r_2)^\uparrow$ & $\dn$ &
         $_{[]}r_1^\uparrow \cdot r_2^\uparrow$\\
  $(r^*)^\uparrow$ & $\dn$ &
         $_{[]}(r^\uparrow)^*$\\
\end{tabular}
\end{center}
%\end{definition}

\noindent
We use up arrows here to indicate that the basic un-annotated regular
expressions are ``lifted up'' into something slightly more complex. In the
fourth clause, $\textit{fuse}$ is an auxiliary function that helps to
attach bits to the front of an annotated regular expression. Its
definition is as follows:

\begin{center}
\begin{tabular}{lcl}
  $\textit{fuse}\;bs \; \ZERO$ & $\dn$ & $\ZERO$\\
  $\textit{fuse}\;bs\; _{bs'}\ONE$ & $\dn$ &
     $_{bs @ bs'}\ONE$\\
  $\textit{fuse}\;bs\;_{bs'}{\bf c}$ & $\dn$ &
     $_{bs@bs'}{\bf c}$\\
  $\textit{fuse}\;bs\,_{bs'}\oplus\textit{as}$ & $\dn$ &
     $_{bs@bs'}\oplus\textit{as}$\\
  $\textit{fuse}\;bs\; _{bs'}a_1\cdot a_2$ & $\dn$ &
     $_{bs@bs'}a_1 \cdot a_2$\\
  $\textit{fuse}\;bs\,_{bs'}a^*$ & $\dn$ &
     $_{bs @ bs'}a^*$
\end{tabular}
\end{center}
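Internalisation and $\textit{fuse}$ can be sketched in Python as follows (the constructor names such as \texttt{'AALTS'} and \texttt{'ASEQ'} are our own encoding, not the report's Isabelle datatypes; plain regexes use the tuple scheme of the earlier sketches):

```python
# Annotated regexes as tagged tuples carrying a bit-list:
# ('AZERO',) ('AONE',bs) ('ACHAR',bs,c) ('AALTS',bs,[a,...])
# ('ASEQ',bs,a1,a2) ('ASTAR',bs,a)

def fuse(bs, a):
    if a[0] == 'AZERO': return a          # ZERO carries no bits
    return (a[0], bs + a[1]) + a[2:]      # prepend bs to the annotation

def internalise(r):
    t = r[0]
    if t == '0': return ('AZERO',)
    if t == '1': return ('AONE', [])
    if t == 'c': return ('ACHAR', [], r[1])
    if t == '+':
        # fuse 0 onto the left branch, 1 onto the right branch
        return ('AALTS', [], [fuse([0], internalise(r[1])),
                              fuse([1], internalise(r[2]))])
    if t == '.':
        return ('ASEQ', [], internalise(r[1]), internalise(r[2]))
    if t == '*':
        return ('ASTAR', [], internalise(r[1]))

a = ('c', 'a')
r = ('+', a, ('.', a, a))        # a + aa
print(internalise(r))
```

Note how only the alternative clause plants bits: choosing the left or right branch later corresponds exactly to the $0$/$1$ that $\textit{code}$ would emit for a $\Left$/$\Right$ value.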

\noindent
After internalising the regular expression, we perform successive
derivative operations on the annotated regular expressions. This
derivative operation is the same as what we had previously for the
basic regular expressions, except that we need to take care of
the bitcodes:


\iffalse
%\begin{definition}{bder}
\begin{center}
  \begin{tabular}{@{}lcl@{}}
  $(\textit{ZERO})\,\backslash c$ & $\dn$ & $\textit{ZERO}$\\
  $(\textit{ONE}\;bs)\,\backslash c$ & $\dn$ & $\textit{ZERO}$\\
  $(\textit{CHAR}\;bs\,d)\,\backslash c$ & $\dn$ &
        $\textit{if}\;c=d\; \;\textit{then}\;
         \textit{ONE}\;bs\;\textit{else}\;\textit{ZERO}$\\
  $(\textit{ALTS}\;bs\,as)\,\backslash c$ & $\dn$ &
     $\textit{ALTS}\;bs\,(as.map(\backslash c))$\\
  $(\textit{SEQ}\;bs\,a_1\,a_2)\,\backslash c$ & $\dn$ &
     $\textit{if}\;\textit{bnullable}\,a_1$\\
  & &$\textit{then}\;\textit{ALTS}\,bs\,List((\textit{SEQ}\,[]\,(a_1\,\backslash c)\,a_2),$\\
  & &$\phantom{\textit{then}\;\textit{ALTS}\,bs\,}(\textit{fuse}\,(\textit{bmkeps}\,a_1)\,(a_2\,\backslash c)))$\\
  & &$\textit{else}\;\textit{SEQ}\,bs\,(a_1\,\backslash c)\,a_2$\\
  $(\textit{STAR}\,bs\,a)\,\backslash c$ & $\dn$ &
      $\textit{SEQ}\;bs\,(\textit{fuse}\, [\Z] (r\,\backslash c))\,
       (\textit{STAR}\,[]\,r)$
\end{tabular}
\end{center}
%\end{definition}

\begin{center}
  \begin{tabular}{@{}lcl@{}}
  $(\textit{ZERO})\,\backslash c$ & $\dn$ & $\textit{ZERO}$\\
  $(_{bs}\textit{ONE})\,\backslash c$ & $\dn$ & $\textit{ZERO}$\\
  $(_{bs}\textit{CHAR}\;d)\,\backslash c$ & $\dn$ &
        $\textit{if}\;c=d\; \;\textit{then}\;
         _{bs}\textit{ONE}\;\textit{else}\;\textit{ZERO}$\\
  $(_{bs}\textit{ALTS}\;\textit{as})\,\backslash c$ & $\dn$ &
     $_{bs}\textit{ALTS}\;(\textit{as}.\textit{map}(\backslash c))$\\
  $(_{bs}\textit{SEQ}\;a_1\,a_2)\,\backslash c$ & $\dn$ &
     $\textit{if}\;\textit{bnullable}\,a_1$\\
  & &$\textit{then}\;_{bs}\textit{ALTS}\,List((_{[]}\textit{SEQ}\,(a_1\,\backslash c)\,a_2),$\\
  & &$\phantom{\textit{then}\;_{bs}\textit{ALTS}\,}(\textit{fuse}\,(\textit{bmkeps}\,a_1)\,(a_2\,\backslash c)))$\\
  & &$\textit{else}\;_{bs}\textit{SEQ}\,(a_1\,\backslash c)\,a_2$\\
  $(_{bs}\textit{STAR}\,a)\,\backslash c$ & $\dn$ &
      $_{bs}\textit{SEQ}\;(\textit{fuse}\, [0] \; r\,\backslash c )\,
       (_{bs}\textit{STAR}\,[]\,r)$
\end{tabular}
\end{center}
%\end{definition}
\fi

\begin{center}
  \begin{tabular}{@{}lcl@{}}
  $(\ZERO)\,\backslash c$ & $\dn$ & $\ZERO$\\
  $(_{bs}\ONE)\,\backslash c$ & $\dn$ & $\ZERO$\\
  $(_{bs}{\bf d})\,\backslash c$ & $\dn$ &
        $\textit{if}\;c=d\; \;\textit{then}\;
         _{bs}\ONE\;\textit{else}\;\ZERO$\\
  $(_{bs}\oplus \;\textit{as})\,\backslash c$ & $\dn$ &
     $_{bs}\oplus\;(\textit{as.map}(\backslash c))$\\
  $(_{bs}\;a_1\cdot a_2)\,\backslash c$ & $\dn$ &
     $\textit{if}\;\textit{bnullable}\,a_1$\\
  & &$\textit{then}\;_{bs}\oplus\,[(_{[]}\,(a_1\,\backslash c)\cdot\,a_2),$\\
  & &$\phantom{\textit{then},\;_{bs}\oplus\,}(\textit{fuse}\,(\textit{bmkeps}\,a_1)\,(a_2\,\backslash c))]$\\
  & &$\textit{else}\;_{bs}\,(a_1\,\backslash c)\cdot a_2$\\
  $(_{bs}a^*)\,\backslash c$ & $\dn$ &
      $_{bs}(\textit{fuse}\, [0] \; (a\,\backslash c))\cdot
       (_{[]}a^*)$
\end{tabular}
\end{center}

%\end{definition}
\noindent
For instance, when we take the derivative of $_{bs}a^*$ with respect to $c$,
we need to unfold it into a sequence,
and attach an additional bit $0$ to the front of $a \backslash c$
to indicate that there is one more star iteration. The sequence clause
is more subtle. The interesting case is when $a_1$ is $\textit{bnullable}$ (here
\textit{bnullable} is exactly the same as $\textit{nullable}$, except
that it is for annotated regular expressions; we therefore omit the
definition). Assume that $\textit{bmkeps}$ correctly extracts the bitcode for how
$a_1$ matches the string prior to character $c$ (more on this later),
then the right branch of the alternative, which is $\textit{fuse} \; (\bmkeps \; a_1) \; (a_2
\backslash c)$, will collapse the regular expression $a_1$ (as it has
already been fully matched) and store the parsing information at the
head of the regular expression $a_2 \backslash c$ by fusing to it. The
bitsequence $\textit{bs}$, which was initially attached to the
first element of the sequence $a_1 \cdot a_2$, has
now been elevated to the top-level of $\oplus$, as this information will be
needed whichever way the sequence is matched---no matter whether $c$ belongs
to $a_1$ or $a_2$. After building these derivatives and maintaining all
the lexing information, we complete the lexing by collecting the
bitcodes using a generalised version of the $\textit{mkeps}$ function
for annotated regular expressions, called $\textit{bmkeps}$:
|
|
660 |
|
|
661 |
|
|
662 |
%\begin{definition}[\textit{bmkeps}]\mbox{}
|
|
663 |
\begin{center}
\begin{tabular}{lcl}
$\textit{bmkeps}\,(_{bs}\ONE)$ & $\dn$ & $bs$\\
$\textit{bmkeps}\,(_{bs}\oplus a::\textit{as})$ & $\dn$ &
$\textit{if}\;\textit{bnullable}\,a$\\
& &$\textit{then}\;bs\,@\,\textit{bmkeps}\,a$\\
& &$\textit{else}\;\textit{bmkeps}\,(_{bs}\oplus \textit{as})$\\
$\textit{bmkeps}\,(_{bs} a_1 \cdot a_2)$ & $\dn$ &
$bs \,@\,\textit{bmkeps}\,a_1\,@\, \textit{bmkeps}\,a_2$\\
$\textit{bmkeps}\,(_{bs}a^*)$ & $\dn$ &
$bs \,@\, [0]$
\end{tabular}
\end{center}
%\end{definition}

\noindent
This function completes the value information by travelling along the
path of the regular expression that corresponds to a POSIX value and
collecting all the bitcodes, using the bit $0$ to indicate the end of the star
iterations. If we take the bitcodes produced by $\textit{bmkeps}$ and
decode them, we get the value we expect. The corresponding lexing
algorithm looks as follows:

\begin{center}
\begin{tabular}{lcl}
$\textit{blexer}\;r\,s$ & $\dn$ &
$\textit{let}\;a = (r^\uparrow)\backslash s\;\textit{in}$\\
& & $\;\;\textit{if}\; \textit{bnullable}(a)$\\
& & $\;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,a)\,r$\\
& & $\;\;\textit{else}\;\textit{None}$
\end{tabular}
\end{center}

\noindent
In this definition $\_\backslash s$ is the generalisation of the derivative
operation from characters to strings (just like the derivatives for un-annotated
regular expressions).
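To make these definitions concrete, here is a small, self-contained Python sketch of the bitcoded lexer (the report's own prototype is written in Scala; the tuple encoding and all names below are our own choices, and for the star clause we follow the convention of the worked example later in this report, where the bit $1$ marks one more iteration and $0$ marks the end):

```python
ZERO = ('ZERO',)          # annotated regexes are tuples: (tag, bits, ...)

def fuse(bs, a):
    """Attach the bitsequence bs to the front of the annotated regex a."""
    return a if a[0] == 'ZERO' else (a[0], bs + a[1]) + a[2:]

def internalise(r):
    """r^up: translate a plain regex into its annotated version."""
    tag = r[0]
    if tag == 'zero': return ZERO
    if tag == 'one':  return ('ONE', [])
    if tag == 'char': return ('CHAR', [], r[1])
    if tag == 'alt':  return ('ALTS', [], [fuse([0], internalise(r[1])),
                                           fuse([1], internalise(r[2]))])
    if tag == 'seq':  return ('SEQ', [], internalise(r[1]), internalise(r[2]))
    return ('STAR', [], internalise(r[1]))                  # star

def bnullable(a):
    tag = a[0]
    if tag in ('ZERO', 'CHAR'): return False
    if tag in ('ONE', 'STAR'):  return True
    if tag == 'ALTS': return any(bnullable(x) for x in a[2])
    return bnullable(a[2]) and bnullable(a[3])              # SEQ

def bmkeps(a):
    tag = a[0]
    if tag == 'ONE': return a[1]
    if tag == 'ALTS':
        first = a[2][0]
        return a[1] + bmkeps(first) if bnullable(first) \
               else bmkeps(('ALTS', a[1], a[2][1:]))
    if tag == 'SEQ': return a[1] + bmkeps(a[2]) + bmkeps(a[3])
    return a[1] + [0]                                       # STAR: 0 ends the iterations

def bder(c, a):
    tag = a[0]
    if tag in ('ZERO', 'ONE'): return ZERO
    if tag == 'CHAR': return ('ONE', a[1]) if a[2] == c else ZERO
    if tag == 'ALTS': return ('ALTS', a[1], [bder(c, x) for x in a[2]])
    if tag == 'SEQ':
        bs, a1, a2 = a[1], a[2], a[3]
        if bnullable(a1):
            return ('ALTS', bs, [('SEQ', [], bder(c, a1), a2),
                                 fuse(bmkeps(a1), bder(c, a2))])
        return ('SEQ', bs, bder(c, a1), a2)
    return ('SEQ', a[1], fuse([1], bder(c, a[2])), ('STAR', [], a[2]))  # STAR

def decode_aux(r, bs):
    """Rebuild a value for the plain regex r from the bitsequence bs."""
    tag = r[0]
    if tag == 'one':  return ('Empty',), bs
    if tag == 'char': return ('Chr', r[1]), bs
    if tag == 'alt':
        v, bs1 = decode_aux(r[1] if bs[0] == 0 else r[2], bs[1:])
        return (('Left', v) if bs[0] == 0 else ('Right', v)), bs1
    if tag == 'seq':
        v1, bs1 = decode_aux(r[1], bs)
        v2, bs2 = decode_aux(r[2], bs1)
        return ('Seq', v1, v2), bs2
    vs = []                                                 # star: 1 = iterate, 0 = stop
    while bs[0] == 1:
        v, bs = decode_aux(r[1], bs[1:]); vs.append(v)
    return ('Stars', vs), bs[1:]

def blexer(r, s):
    a = internalise(r)
    for c in s:
        a = bder(c, a)
    return decode_aux(r, bmkeps(a))[0] if bnullable(a) else None
```

For instance, for $r = (a+b)\cdot a^*$ the call \texttt{blexer(r, "baa")} yields the tuple form of the value $\Seq\,(\Right\,b)\,(\Stars\,[a,a])$.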

\subsection*{Our Simplification Rules}

The main point of the bitcodes and annotated regular expressions is that
we can apply rather aggressive (in terms of size) simplification rules
in order to keep derivatives small. We have developed such
``aggressive'' simplification rules and generated test data that show
that the expected bound can be achieved. Obviously we could only
partially cover the search space as there are infinitely many regular
expressions and strings.

One modification we introduced is to allow a list of annotated regular
expressions in the $\oplus$ constructor. This allows us to not just
delete unnecessary $\ZERO$s and $\ONE$s from regular expressions, but
also unnecessary ``copies'' of regular expressions (very similar to
simplifying $r + r$ to just $r$, but in a more general setting). Another
modification is that we use simplification rules inspired by Antimirov's
work on partial derivatives. They maintain the idea that only the first
``copy'' of a regular expression in an alternative contributes to the
calculation of a POSIX value. All subsequent copies can be pruned away from
the regular expression. A recursive definition of our simplification function
that looks somewhat similar to our Scala code is given below:
%\comment{Use $\ZERO$, $\ONE$ and so on.
%Is it $ALTS$ or $ALTS$?}\\

\begin{center}
\begin{tabular}{@{}lcl@{}}

$\textit{simp} \; (_{bs}a_1\cdot a_2)$ & $\dn$ & $ (\textit{simp} \; a_1, \textit{simp} \; a_2) \; \textit{match} $ \\
&&$\quad\textit{case} \; (\ZERO, \_) \Rightarrow \ZERO$ \\
&&$\quad\textit{case} \; (\_, \ZERO) \Rightarrow \ZERO$ \\
&&$\quad\textit{case} \; (_{bs'}\ONE, a_2') \Rightarrow \textit{fuse} \; (bs\,@\,bs') \; a_2'$ \\
&&$\quad\textit{case} \; (a_1', _{bs'}\ONE) \Rightarrow \textit{fuse} \; (bs\,@\,bs') \; a_1'$ \\
&&$\quad\textit{case} \; (a_1', a_2') \Rightarrow _{bs}a_1' \cdot a_2'$ \\

$\textit{simp} \; (_{bs}\oplus \textit{as})$ & $\dn$ & $\textit{distinct}( \textit{flatten} ( \textit{map} \; \textit{simp} \; \textit{as})) \; \textit{match} $ \\
&&$\quad\textit{case} \; [] \Rightarrow \ZERO$ \\
&&$\quad\textit{case} \; a :: [] \Rightarrow \textit{fuse} \; bs \; a$ \\
&&$\quad\textit{case} \; as' \Rightarrow _{bs}\oplus \textit{as'}$\\

$\textit{simp} \; a$ & $\dn$ & $\textit{a} \qquad \textit{otherwise}$
\end{tabular}
\end{center}

\noindent
The simplification does a pattern match on the regular expression.
When it detects that the regular expression is an alternative or
a sequence, it tries to simplify its children regular expressions
recursively and then sees if one of the children turns into $\ZERO$ or
$\ONE$, which might trigger further simplification at the current level.
The most involved part is the $\oplus$ clause, where we use two
auxiliary functions $\textit{flatten}$ and $\textit{distinct}$ to open up nested
alternatives and reduce as many duplicates as possible. Function
$\textit{distinct}$ keeps only the first occurring copy and removes all
later duplicates. Function $\textit{flatten}$ opens up nested $\oplus$s.
Its recursive definition is given below:

\begin{center}
\begin{tabular}{@{}lcl@{}}
$\textit{flatten} \; []$ & $\dn$ & $[]$\\
$\textit{flatten} \; ((_{bs}\oplus \textit{as}) :: \textit{as'})$ & $\dn$ & $(\textit{map} \;
(\textit{fuse}\;bs)\; \textit{as}) \; @ \; \textit{flatten} \; \textit{as'} $ \\
$\textit{flatten} \; (\ZERO :: \textit{as'})$ & $\dn$ & $ \textit{flatten} \; \textit{as'} $ \\
$\textit{flatten} \; (a :: \textit{as'})$ & $\dn$ & $a :: \textit{flatten} \; \textit{as'}$ \quad(otherwise)
\end{tabular}
\end{center}

\noindent
Here $\textit{flatten}$ behaves like the traditional functional programming flatten
function, except that it also removes $\ZERO$s. In terms of regular expressions, it
removes parentheses, for example changing $a+(b+c)$ into $a+b+c$.
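As an illustration, the $\oplus$ clause of $\textit{simp}$ can be sketched in Python over a small tuple encoding of annotated regular expressions (the encoding and names are our own; we also make the assumption that $\textit{distinct}$ compares the bit-erased regular expressions, so that two ``copies'' carrying different bits still count as duplicates):

```python
ZERO = ('ZERO',)

def fuse(bs, a):
    # attach the bitsequence bs to the front of the annotated regex a
    return a if a[0] == 'ZERO' else (a[0], bs + a[1]) + a[2:]

def erase(a):
    # strip all bitsequences, keeping only the regex structure
    tag = a[0]
    if tag in ('ZERO', 'ONE'): return (tag,)
    if tag == 'CHAR': return ('CHAR', a[2])
    if tag == 'ALTS': return ('ALTS', tuple(erase(x) for x in a[2]))
    if tag == 'SEQ':  return ('SEQ', erase(a[2]), erase(a[3]))
    return ('STAR', erase(a[2]))

def flatten(rs):
    # open up nested ALTS (fusing their bits inwards) and drop ZEROs
    out = []
    for r in rs:
        if r[0] == 'ZERO':
            continue
        if r[0] == 'ALTS':
            out.extend(fuse(r[1], x) for x in r[2])
        else:
            out.append(r)
    return out

def distinct(rs):
    # keep only the first copy of each (bit-erased) regex
    seen, out = set(), []
    for r in rs:
        e = erase(r)
        if e not in seen:
            seen.add(e)
            out.append(r)
    return out

def simp_alts(bs, children):
    # the ALTS clause of simp, applied to already-simplified children
    rs = distinct(flatten(children))
    if not rs:
        return ZERO
    if len(rs) == 1:
        return fuse(bs, rs[0])
    return ('ALTS', bs, rs)
```

For example, a nested alternative such as $\ZERO + (_0(_0a + {}_1a)) + a$ flattens to three fused copies of $a$, of which $\textit{distinct}$ keeps only the first.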
\section{Current Work and Progress}
For reasons beyond this report, it turns out that a complete set of
simplification rules depends on values being encoded as
bitsequences.\footnote{Values are the results the lexing algorithms
generate; they encode how a regular expression matched a string.} We
already know that the lexing algorithm using bitsequences but
\emph{without} simplification is correct, albeit horribly
slow. Therefore in the past six months I have been trying to prove that the
algorithm using bitsequences plus our simplification rules is
also correct. Formally this amounts to showing that

\begin{equation}\label{mainthm}
\blexers \; r \; s = \blexer \;r\;s
\end{equation}

\noindent
whereby $\blexers$ simplifies (makes derivatives smaller) in each
step, whereas with $\blexer$ the size can grow exponentially. This
would be an important milestone for my thesis, because we already
have a very good idea how to establish that our set of simplification
rules keeps the size of derivatives below a relatively tight bound.

In order to prove the main theorem in \eqref{mainthm}, we need to prove that the
two functions produce the same output. The definitions of these two functions
are shown below.

\begin{center}
\begin{tabular}{lcl}
$\textit{blexer}\;r\,s$ & $\dn$ &
$\textit{let}\;a = (r^\uparrow)\backslash s\;\textit{in}$\\
& & $\;\;\textit{if}\; \textit{bnullable}(a)$\\
& & $\;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,a)\,r$\\
& & $\;\;\textit{else}\;\textit{None}$
\end{tabular}
\end{center}

\begin{center}
\begin{tabular}{lcl}
$\blexers \; r \, s$ & $\dn$ &
$\textit{let} \; a = (r^\uparrow)\backslash_{simp}\, s\; \textit{in}$\\
& & $\; \; \textit{if} \; \textit{bnullable}(a)$\\
& & $\; \; \textit{then} \; \textit{decode}\,(\textit{bmkeps}\,a)\,r$\\
& & $\;\; \textit{else}\;\textit{None}$
\end{tabular}
\end{center}
\noindent
In these definitions $(r^\uparrow)$ is the function that internalises a
regular expression, and it is the same in each case; similarly, \textit{decode}
and $\textit{bmkeps}$ are the same in each case. Our main
theorem~\eqref{mainthm} therefore boils down to proving the following
two propositions (depending on which branch the if-else clause
takes). They establish that the derivatives \emph{with} simplification
do not change the computed result:

\begin{itemize}
\item{(a)} If a string $s$ is in the language $L(r)$, then \\
$\textit{bmkeps}\, ((r^\uparrow)\backslash_{simp}\,s) = \textit{bmkeps}\, ((r^\uparrow)\backslash s)$,\\
\item{(b)} If a string $s$ is \emph{not} in the language $L(r)$, then
$\rup \backslash_{simp} \,s$ is not nullable.
\end{itemize}

\noindent
We have already proved the second part in Isabelle. This is actually
not too difficult because we can show that simplification does not
change the language of a regular expression.

If we can prove the first part, that is, that the bitsequence algorithm with
simplification produces the same result as the one without
simplification, then we are done. Unfortunately that part requires
more effort, because simplification must not only preserve the
language, but also preserve the value (that is, the
computed result).

%\bigskip\noindent\rule[1.5ex]{\linewidth}{5pt}
%Do you want to keep this? You essentially want to say that the old
%method used retrieve, which unfortunately cannot be adopted to
%the simplification rules. You could just say that and give an example.
%However you have to think about how you give the example....nobody knows
%about AZERO etc yet. Maybe it might be better to use normal regexes
%like $a + aa$, but annotate bitsequences as subscript like $_1(_0a + _1aa)$.

%\bigskip\noindent\rule[1.5ex]{\linewidth}{5pt}
%REPLY:\\
%Yes, I am essentially saying that the old method
%cannot be adopted without adjustments.
%But this does not mean we should skip
%the proof of the bit-coded algorithm
%as it is still the main direction we are looking into
%to prove things. We are trying to modify
%the old proof to suit our needs, but not give
%up it totally, that is why i believe the old
%proof is fundamental in understanding
%what we are doing in the past 6 months.
%\bigskip\noindent\rule[1.5ex]{\linewidth}{5pt}

\subsubsection*{Existing Proof}

For this we have started by looking at the original proof that
established that the bitsequence algorithm produces the same result
as the algorithm not using bitsequences. Formally this proof
established

\begin{equation}\label{lexer}
\blexer \; r \; s = \lexer \;r \;s
\end{equation}

%\noindent
%might provide us insight into proving
%\begin{center}
%$\blexer \; r^\uparrow \;s = \blexers \; r^\uparrow \;s$
%\end{center}

\noindent
The proof uses two ``tricks''. One is that it uses a $\flex$-function

\begin{center}
\begin{tabular}{lcl}
$\textit{flex} \;r\; f\; (c\!::\!s) $ & $\dn$ & $\textit{flex} \; (r\backslash c) \;(\lambda v. f (\inj \; r \; c \; v)) \;s$ \\
$\textit{flex} \;r\; f\; [\,] $ & $\dn$ & $f$
\end{tabular}
\end{center}

\noindent
and then proves for the right-hand side in \eqref{lexer}

\begin{center}
$\lexer \;r\; s = \flex \;r\;\textit{id}\;s \;(\mkeps \; (r\backslash s))$.
\end{center}

\noindent
The $\flex$-function essentially does lexing by
stacking up injection functions while doing derivatives,
explicitly showing the order in which the characters are
injected back in each step.
With $\flex$ we can write $\lexer$ this way:

\begin{center}
$\lexer \;r\; s = \flex \;r\;\textit{id}\;s \;(\mkeps \; (r\backslash s))$
\end{center}

%\noindent
%$\flex$ focuses on the injections instead of the derivatives ,
%compared to the original definition of $\lexer$, which puts equal
%amount of emphasis on injection and derivative with respect to each
%character:

%\begin{center}
%\begin{tabular}{lcl}
%$\textit{lexer} \; r\; (c\!::\!s) $ & $\dn$ & $\textit{case} \; \lexer \; (r\backslash c) \;s \; %\textit{of}$ \\
% & & $\textit{None} \; \Longrightarrow \; \textit{None}$\\
% & & $\textbar \; v \; \Longrightarrow \; \inj \; r\;c\;v$\\
%$\textit{lexer} \; r\; [\,] $ & $\dn$ & $\textit{if} \; \nullable (r) \; \textit{then} \; \mkeps% (r) \; \textit{else} \;None$
%\end{tabular}
%\end{center}

\noindent
The crux of the existing proof is how $\flex$ relates to injection, namely

\begin{center}
$\flex \; r \; id \; (s@[c]) \; v = \flex \; r \; id \; s \; (\inj \; (r\backslash s) \; c\; v)$.
\end{center}
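To see what this property gives us, consider unfolding $\flex$ on a two-character string $[c_1,c_2]$ (our own illustration, using the definition of $\flex$ above):

\begin{center}
\begin{tabular}{lcl}
$\flex \; r \; id \; [c_1,c_2] \; v$ & $=$ & $\flex \; r \; id \; [c_1] \; (\inj \; (r\backslash c_1) \; c_2 \; v)$\\
 & $=$ & $\inj \; r \; c_1 \; (\inj \; (r\backslash c_1) \; c_2 \; v)$
\end{tabular}
\end{center}

\noindent
so the injection for the last character is applied first and the injection for the first character is applied last.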

\noindent
This property allows one to rewrite an induction hypothesis like

\begin{center}
$ \flex \; r\; id\; (s@[c])\; v = \textit{decode} \;( \textit{retrieve}\; (\rup \backslash s) \; (\inj \; (r\backslash s) \;c\;v)\;)\; r$
\end{center}

\subsubsection{Retrieve Function}
The crucial point is to find the
$\textit{POSIX}$ information of a regular expression and how it is modified,
augmented and propagated
during simplification, in parallel with the regular expression that
has not been simplified in the subsequent derivative operations. To aid this,
we use the helper function \textit{retrieve} described by Sulzmann and Lu:
\begin{center}
\begin{tabular}{@{}l@{\hspace{2mm}}c@{\hspace{2mm}}l@{}}
$\textit{retrieve}\,(_{bs}\ONE)\,\Empty$ & $\dn$ & $bs$\\
$\textit{retrieve}\,(_{bs}{\bf c})\,(\Char\,c)$ & $\dn$ & $bs$\\
$\textit{retrieve}\,(_{bs}\oplus a::as)\,(\Left\,v)$ & $\dn$ &
$bs \,@\, \textit{retrieve}\,a\,v$\\
$\textit{retrieve}\,(_{bs}\oplus a::as)\,(\Right\,v)$ & $\dn$ &
$\textit{bs} \,@\, \textit{retrieve}\,(_{[]}\oplus as)\,v$\\
$\textit{retrieve}\,(_{bs}a_1\cdot a_2)\,(\Seq\,v_1\,v_2)$ & $\dn$ &
$bs \,@\,\textit{retrieve}\,a_1\,v_1\,@\, \textit{retrieve}\,a_2\,v_2$\\
$\textit{retrieve}\,(_{bs}a^*)\,(\Stars\,[])$ & $\dn$ &
$bs \,@\, [0]$\\
$\textit{retrieve}\,(_{bs}a^*)\,(\Stars\,(v\!::\!vs))$ & $\dn$ &\\
\multicolumn{3}{l}{
\hspace{3cm}$bs \,@\, [1] \,@\, \textit{retrieve}\,a\,v\,@\,
\textit{retrieve}\,(_{[]}a^*)\,(\Stars\,vs)$}\\
\end{tabular}
\end{center}
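The table above translates almost line-by-line into Python; the following sketch uses a tuple encoding of annotated regular expressions and values that is our own choice (values are tuples such as \texttt{('Left', v)} or \texttt{('Stars', [v1, ...])}):

```python
def retrieve(a, v):
    """Assemble the bitcode from the annotated regex a and a matching value v."""
    tag = a[0]
    if tag in ('ONE', 'CHAR'):            # v = ('Empty',) or ('Chr', c)
        return a[1]
    if tag == 'ALTS':                     # v = ('Left', v') or ('Right', v')
        if v[0] == 'Left':
            return a[1] + retrieve(a[2][0], v[1])
        return a[1] + retrieve(('ALTS', [], a[2][1:]), v[1])
    if tag == 'SEQ':                      # v = ('Seq', v1, v2)
        return a[1] + retrieve(a[2], v[1]) + retrieve(a[3], v[2])
    # STAR, v = ('Stars', [v1, ..., vn]); 1 = one more iteration, 0 = end
    if not v[1]:
        return a[1] + [0]
    return (a[1] + [1] + retrieve(a[2], v[1][0])
            + retrieve(('STAR', [], a[2]), ('Stars', v[1][1:])))
```

For example, retrieving from $_0(_1a + {}_0a)$ with the value $\Left\,(\Char\,a)$ yields the bits $[0,1]$.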
%\comment{Did not read further}\\
This function assembles the bitcode
%that corresponds to a lexical value for how
%the current derivative matches the suffix of the string(the characters that
%have not yet appeared, but will appear as the successive derivatives go on.
%How do we get this "future" information? By the value $v$, which is
%computed by a pass of the algorithm that uses
%$inj$ as described in the previous section).
using information from both the derivative regular expression and the
value. Sulzmann and Lu proposed this function, but did not prove
anything about it. Ausaf and Urban used it to connect the bitcoded
algorithm to the older algorithm by the following equation:

\begin{center} $\inj \;r\; c \; v = \textit{decode} \; (\textit{retrieve}\;
((r^\uparrow)\backslash c)\,v)\,r$
\end{center}

\noindent
whereby $r^\uparrow$ stands for the internalised version of $r$. Ausaf
and Urban also used this fact to prove the correctness of the bitcoded
algorithm without simplification. Our purpose in using it, however,
is to establish

\begin{center}
$ \textit{retrieve} \;
a \; v \;=\; \textit{retrieve} \; (\textit{simp}\,a) \; v'.$
\end{center}
The idea is that using $v'$, a simplified version of $v$ that has gone
through the same simplification steps as $\textit{simp}(a)$, we are able
to extract the bitcode that gives the same parsing information as the
unsimplified one. However, we noticed that constructing such a $v'$
from $v$ is not so straightforward. The point of this is that we might
be able to finally bridge the gap by proving it.

\noindent
By using a property of $\retrieve$, the $\textit{RHS}$ of the above equality is
$\textit{decode}\,(\textit{retrieve}\, (\rup \backslash(s @ [c]))\, v)\, r$, and this gives the
main lemma result:

\begin{center}
$ \flex \;r\; id \; (s@[c]) \; v =\textit{decode}\,(\textit{retrieve}\, (\rup \backslash (s@[c])) \;v)\, r$
\end{center}

1020 |
\noindent
|
|
1021 |
To use this lemma result for our
|
|
1022 |
correctness proof, simply replace the $v$ in the
|
|
1023 |
$\textit{RHS}$ of the above equality with
|
|
1024 |
$\mkeps\;(r\backslash (s@[c]))$, and apply the lemma that
|
|
1025 |
|
|
1026 |
\begin{center}
$\textit{decode} \; (\bmkeps \; \rup) \; r = \textit{decode} \; (\textit{retrieve} \; \rup \; (\mkeps\; r)) \;r$
\end{center}
\noindent
We get the correctness of our bit-coded algorithm:
\begin{center}
$\flex \;r\; id \; s \; (\mkeps \; (r\backslash s)) = \textit{decode} \; (\bmkeps \; (\rup\backslash s)) \; r$
\end{center}
\noindent
The bridge between the above chain of equalities
is the use of $\retrieve$.
If we want to use a similar technique for the
simplified version of the algorithm,
we face the problem that in the above
equalities
$\retrieve \; a \; v$ is not always defined.
For example,
$\retrieve \; _0(_1a+_0a) \; \Left(\Empty)$
is defined, but $\retrieve \; (_{01}a) \;\Left(\Empty)$ is not,
though we can extract the same POSIX
bits from the two annotated regular expressions.
The latter case might occur when we try to retrieve from
a simplified regular expression using the same value
as for the unsimplified one.
This is because $\Left(\Empty)$ corresponds to
the regular expression structure $\ONE+r_2$ instead of
$\ONE$.
That means, if we
want to prove that
\begin{center}
$\textit{decode} \; (\bmkeps \; (\rup\backslash s)) \; r = \textit{decode} \; (\bmkeps \; (\rup\backslash_{simp} s)) \; r$
\end{center}
\noindent
holds by using $\retrieve$,
we probably need to prove an equality like the one below:
\begin{center}
%$\retrieve \; \rup\backslash_{simp} s \; \mkeps(r\backslash_{simp} s)=\textit{retrieve} \; \rup\backslash s \; \mkeps(r\backslash s)$
$\retrieve \; (\rup\backslash_{simp} s) \; (\mkeps(f(r\backslash s)))=\textit{retrieve} \; (\rup\backslash s) \; (\mkeps(r\backslash s))$
\end{center}
\noindent
Here $f$ rectifies $r\backslash s$ so that the value $\mkeps(f(r\backslash s))$ becomes
something simpler
for which the retrieve function is defined.\\
\subsubsection{Ways to Rectify the Value}
One way to do this is to prove the following:
\begin{center}
$\retrieve \; (\rup\backslash_{simp} s) \; (\mkeps(\simp(r\backslash s)))=\textit{retrieve} \; (\rup\backslash s) \; (\mkeps(r\backslash s))$
\end{center}
\noindent
The reason we choose $\simp$ as $f$ is that
$\rup\backslash_{simp} \, s$ and $\simp(\rup\backslash \, s)$
have the same shape:
\begin{center}
$\erase (\rup\backslash_{simp} \, s) = \erase(\simp(\rup\backslash s))$
\end{center}

\noindent
$\erase$ in the above equality removes the bit-codes
from an annotated regular expression and keeps only the original
regular expression (just like ``erasing'' the bits). Its definition is omitted.
$\rup\backslash_{simp} \, s$ and $\simp(\rup\backslash s)$
are very closely related, but not identical.
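Although the definition of $\erase$ is omitted above, its clauses are the evident ones (our own reconstruction):

\begin{center}
\begin{tabular}{lcl}
$\erase(\ZERO)$ & $\dn$ & $\ZERO$\\
$\erase(_{bs}\ONE)$ & $\dn$ & $\ONE$\\
$\erase(_{bs}{\bf c})$ & $\dn$ & ${\bf c}$\\
$\erase(_{bs}\oplus\,[a_1,\,\ldots,\,a_n])$ & $\dn$ & $\erase(a_1) + \ldots + \erase(a_n)$\\
$\erase(_{bs}a_1\cdot a_2)$ & $\dn$ & $\erase(a_1)\cdot\erase(a_2)$\\
$\erase(_{bs}a^*)$ & $\dn$ & $(\erase(a))^*$
\end{tabular}
\end{center}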
\subsubsection{Example for $\rup\backslash_{simp} \, s \neq \simp(\rup\backslash s)$}
For example, let $r$ be the regular expression
$(a+b)(a+a^*)$ and $s$ be the string $aa$; then
both $\erase (\rup\backslash_{simp} \, s)$ and $\erase (\simp (\rup\backslash s))$
are $\ONE + a^*$. However, without $\erase$,
\begin{center}
$\rup\backslash_{simp} \, s$ is equal to $_0(_0\ONE +_{11}a^*)$
\end{center}
\noindent
whereas
\begin{center}
$\simp(\rup\backslash s)$ is equal to $(_{00}\ONE +_{011}a^*)$
\end{center}
\noindent
(For the sake of visual simplicity, we use numbers to denote the bits
in bitcodes as we have previously defined for annotated
regular expressions: $\S$ is replaced by
the subscript $_1$ and $\Z$ by $_0$.)

What makes the difference?

%Two "rules" might be inferred from the above example.

%First, after erasing the bits the two regular expressions
%are exactly the same: both become $1+a^*$. Here the
%function $\simp$ exhibits the "one in the end equals many times
%at the front"
%property: one simplification in the end causes the
%same regular expression structure as
%successive simplifications done alongside derivatives.
%$\rup\backslash_{simp} \, s$ unfolds to
%$\simp((\simp(r\backslash a))\backslash a)$
%and $\simp(\rup\backslash s)$ unfolds to
%$\simp((r\backslash a)\backslash a)$. The one simplification
%in the latter causes the resulting regular expression to
%become $1+a^*$, exactly the same as the former with
%two simplifications.

%Second, the bit-codes are different, but they are essentially
%the same: if we push the outmost bits ${\bf_0}(_0\ONE +_{11}a^*)$ of $\rup\backslash_{simp} \, s$
%inside then we get $(_{00}\ONE +_{011}a^*)$, exactly the
%same as that of $\rup\backslash \, s$. And this difference
%does not matter when we try to apply $\bmkeps$ or $\retrieve$
%to it. This seems a good news if we want to use $\retrieve$
%to prove things.

%If we look into the difference above, we could see that the
%difference is not fundamental: the bits are just being moved
%around in a way that does not hurt the correctness.
During the first derivative operation,
$\rup\backslash a=(_0\ONE + \ZERO)(_0a + _1a^*)$ is
in the form of a sequence regular expression with
two components, the first
one, $_0\ONE + \ZERO$, being nullable.
Recall the simplification function definition:
\begin{center}
\begin{tabular}{@{}lcl@{}}

$\textit{simp} \; (\textit{SEQ}\;bs\,a_1\,a_2)$ & $\dn$ & $ (\textit{simp} \; a_1, \textit{simp} \; a_2) \; \textit{match} $ \\
&&$\quad\textit{case} \; (\ZERO, \_) \Rightarrow \ZERO$ \\
&&$\quad\textit{case} \; (\_, \ZERO) \Rightarrow \ZERO$ \\
&&$\quad\textit{case} \; (\textit{ONE}\;bs', a_2') \Rightarrow \textit{fuse} \; (bs\,@\,bs') \; a_2'$ \\
&&$\quad\textit{case} \; (a_1', \textit{ONE}\;bs') \Rightarrow \textit{fuse} \; (bs\,@\,bs') \; a_1'$ \\
&&$\quad\textit{case} \; (a_1', a_2') \Rightarrow \textit{SEQ} \; bs \; a_1' \; a_2'$ \\

$\textit{simp} \; (\textit{ALTS}\;bs\,as)$ & $\dn$ & $\textit{distinct}( \textit{flatten} ( \textit{map simp as})) \; \textit{match} $ \\
&&$\quad\textit{case} \; [] \Rightarrow \ZERO$ \\
&&$\quad\textit{case} \; a :: [] \Rightarrow \textit{fuse}\;bs\;a$ \\
&&$\quad\textit{case} \; as' \Rightarrow \textit{ALTS}\;bs\;as'$\\

$\textit{simp} \; a$ & $\dn$ & $\textit{a} \qquad \textit{otherwise}$
\end{tabular}
\end{center}

\noindent

and the definition of $\flatten$:
\begin{center}
\begin{tabular}{c c c}
$\flatten \; []$ & $\dn$ & $[]$\\
$\flatten \; (\ZERO::rs)$ & $\dn$ & $\flatten(rs)$\\
$\flatten \;((_{\textit{bs}_1}\oplus \textit{rs}_1) ::rs)$ & $\dn$ & $(\map \, (\fuse \, \textit{bs}_1) \,\textit{rs}_1) \,@\, \flatten(rs)$\\
$\flatten \; (r :: rs)$ & $\dn$ & $r::\flatten(rs)$
\end{tabular}
\end{center}
\noindent
If we call $\simp$ on $\rup\backslash a$, just as $\backslash_{simp}$
requires, then we go through the $\ONE$ clause of
the sequence case.
The $\ZERO$ in $(_0\ONE + \ZERO)$ is thrown away
by $\flatten$, and
$_0\ONE$ is merged into $(_0a + _1a^*)$ by simply
putting its bits ($_0$) to the front of the second component:
${\bf_0}(_0a + _1a^*)$.
After a second derivative operation,
namely, $(_0(_0a + _1a^*))\backslash a$, we get
$
_0(_0 \ONE + _1(_1\ONE \cdot a^*))
$, and this simplifies to $_0(_0 \ONE + _{11} a^*)$
by the third clause of the alternative case:
$\quad\textit{case} \; as' \Rightarrow \textit{ALTS}\;bs\;as'$.
The outermost bit $_0$ stays with
the outermost regular expression, rather than being fused to
its child regular expressions, as we will later see happens
to $\simp(\rup\backslash \, s)$.
If we choose not to simplify in the midst of derivative operations,
but only do it at the end after the string has been exhausted,
namely, $\simp(\rup\backslash \, s)=\simp((\rup\backslash a)\backslash a)$,
then at the {\bf second} derivative of
$(\rup\backslash a)\,{\bf \backslash a}$
we go through the following clause of $\backslash$:
\begin{center}
\begin{tabular}{lcl}
$(\textit{SEQ}\;bs\,a_1\,a_2)\,\backslash c$ & $\dn$ &
$(\textit{when} \; \textit{bnullable}\,a_1)$\\
& &$\textit{ALTS}\,bs\,List(\;\;(\textit{SEQ}\,[]\,(a_1\,\backslash c)\,a_2),$\\
& &$\phantom{\textit{ALTS}\,bs\,List(\;\;}(\textit{fuse}\,(\textit{bmkeps}\,a_1)\,(a_2\,\backslash c))\;\;)$\\
\end{tabular}
\end{center}

because
$\rup\backslash a = (_0\ONE + \ZERO)(_0a + _1a^*)$
is a sequence
with the first component being nullable
(unsimplified, unlike in the first round of running $\backslash_{simp}$).
Therefore $((_0\ONE + \ZERO)(_0a + _1a^*))\backslash a$ splits into
$([(\ZERO + \ZERO)\cdot(_0a + _1a^*)] + _0( _0\ONE + _1[_1\ONE \cdot a^*]))$.
After these two successive derivatives without simplification,
we apply $\simp$ to this regular expression, which goes through
the alternative clause, and each component of
$([(\ZERO + \ZERO)\cdot(_0a + _1a^*)] + _0( _0\ONE + _1[_1\ONE \cdot a^*]))$
is simplified, giving us the list $[\ZERO,\; _0(_0\ONE + _{11}a^*)]$.
This list is then ``flattened''---$\ZERO$ is
thrown away by $\textit{flatten}$, while $ _0(_0\ONE + _{11}a^*)$
is opened up to make a list consisting of two separate elements,
$_{00}\ONE$ and $_{011}a^*$; note that $\flatten$
$\fuse$s the bit $_0$ to the front of $_0\ONE$ and $_{11}a^*$.
The order of simplification, which impacts the order in which alternatives
are opened up, causes
the bits to be moved differently.
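The divergence just described can be replayed mechanically. The following self-contained Python sketch (our own tuple encoding of annotated regular expressions, with the bit $1$ marking one more star iteration as in the example) computes both $\rup\backslash_{simp}\,s$ and $\simp(\rup\backslash s)$ for $r=(a+b)(a+a^*)$ and $s=aa$:

```python
ZERO = ('ZERO',)

def fuse(bs, a):
    return a if a[0] == 'ZERO' else (a[0], bs + a[1]) + a[2:]

def internalise(r):
    tag = r[0]
    if tag == 'char': return ('CHAR', [], r[1])
    if tag == 'alt':  return ('ALTS', [], [fuse([0], internalise(r[1])),
                                           fuse([1], internalise(r[2]))])
    if tag == 'seq':  return ('SEQ', [], internalise(r[1]), internalise(r[2]))
    return ('STAR', [], internalise(r[1]))          # star

def bnullable(a):
    tag = a[0]
    if tag in ('ZERO', 'CHAR'): return False
    if tag in ('ONE', 'STAR'):  return True
    if tag == 'ALTS': return any(bnullable(x) for x in a[2])
    return bnullable(a[2]) and bnullable(a[3])      # SEQ

def bmkeps(a):
    tag = a[0]
    if tag == 'ONE': return a[1]
    if tag == 'ALTS':
        first = a[2][0]
        return a[1] + bmkeps(first) if bnullable(first) \
               else bmkeps(('ALTS', a[1], a[2][1:]))
    if tag == 'SEQ': return a[1] + bmkeps(a[2]) + bmkeps(a[3])
    return a[1] + [0]                               # STAR

def bder(c, a):
    tag = a[0]
    if tag in ('ZERO', 'ONE'): return ZERO
    if tag == 'CHAR': return ('ONE', a[1]) if a[2] == c else ZERO
    if tag == 'ALTS': return ('ALTS', a[1], [bder(c, x) for x in a[2]])
    if tag == 'SEQ':
        bs, a1, a2 = a[1], a[2], a[3]
        if bnullable(a1):
            return ('ALTS', bs, [('SEQ', [], bder(c, a1), a2),
                                 fuse(bmkeps(a1), bder(c, a2))])
        return ('SEQ', bs, bder(c, a1), a2)
    return ('SEQ', a[1], fuse([1], bder(c, a[2])), ('STAR', [], a[2]))

def erase(a):
    tag = a[0]
    if tag in ('ZERO', 'ONE'): return (tag,)
    if tag == 'CHAR': return ('CHAR', a[2])
    if tag == 'ALTS': return ('ALTS', tuple(erase(x) for x in a[2]))
    if tag == 'SEQ':  return ('SEQ', erase(a[2]), erase(a[3]))
    return ('STAR', erase(a[2]))

def flatten(rs):
    out = []
    for r in rs:
        if r[0] == 'ZERO': continue
        if r[0] == 'ALTS': out.extend(fuse(r[1], x) for x in r[2])
        else: out.append(r)
    return out

def distinct(rs):
    seen, out = set(), []
    for r in rs:
        e = erase(r)
        if e not in seen:
            seen.add(e); out.append(r)
    return out

def simp(a):
    tag = a[0]
    if tag == 'SEQ':
        bs, s1, s2 = a[1], simp(a[2]), simp(a[3])
        if s1[0] == 'ZERO' or s2[0] == 'ZERO': return ZERO
        if s1[0] == 'ONE': return fuse(bs + s1[1], s2)   # keep the ONE's bits
        if s2[0] == 'ONE': return fuse(bs + s2[1], s1)
        return ('SEQ', bs, s1, s2)
    if tag == 'ALTS':
        rs = distinct(flatten([simp(x) for x in a[2]]))
        if not rs: return ZERO
        if len(rs) == 1: return fuse(a[1], rs[0])
        return ('ALTS', a[1], rs)
    return a

r = ('seq', ('alt', ('char', 'a'), ('char', 'b')),
            ('alt', ('char', 'a'), ('star', ('char', 'a'))))
rup = internalise(r)
lhs = simp(bder('a', simp(bder('a', rup))))   # \rup\backslash_simp aa
rhs = simp(bder('a', bder('a', rup)))         # simp(\rup\backslash aa)
```

Under this sketch \texttt{lhs} and \texttt{rhs} erase to the same regular expression $\ONE + a^*$ but carry their bits in different places, matching the two displays above.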

\subsubsection{A Failed Attempt To Remedy the Problem Above}
A simple class of pairs $(r, s)$ of regular expressions and strings
that trigger the difference between
$\rup\backslash_{simp} \, s$
and $\simp(\rup\backslash s)$ can be deduced from the above example:
\begin{center}
\begin{tabular}{lcl}
$D =\{ (r_1 \cdot r_2,\; [c_1c_2]) \mid $ & $\simp(r_2) = r_2, \simp(r_1 \backslash c_1) = \ONE,$\\
$r_1 \; \textit{not} \; \textit{nullable}, c_2 \in L(r_2),$ & $\exists \textit{rs},\textit{bs}.\; r_2 \backslash c_2 = \;_{bs}{\oplus rs}$\\
$\exists \textit{rs}_1. \; \simp(r_2 \backslash c_2) = \;_{bs}{\oplus \textit{rs}_1}$ & $\textit{and} \;\simp(r_1 \backslash [c_1c_2]) = \ZERO\}$\\
\end{tabular}
\end{center}
We take a pair $(r, \;s)$ from the set $D$.
|
|
1244 |
|
|
1245 |
Now we compute ${\bf \rup \backslash_{simp} s}$ and get:
\begin{center}
\begin{tabular}{lcl}
$(r_1\cdot r_2)\backslash_{simp} \, [c_1c_2]$ & $= \simp\left[ \big(\simp\left[ \left( r_1\cdot r_2 \right) \backslash c_1\right] \big)\backslash c_2\right]$\\
& $= \simp\left[ \big(\simp \left[ \left(r_1 \backslash c_1\right) \cdot r_2 \right] \big) \backslash c_2 \right]$\\
& $= \simp \left[ (\fuse \; \bmkeps(r_1\backslash c_1) \; \simp(r_2) ) \backslash c_2 \right]$\\
& $= \simp \left[ (\fuse \; \bmkeps(r_1\backslash c_1) \; r_2 ) \backslash c_2 \right]$
\end{tabular}
\end{center}
\noindent
From the definition of $D$ we know that $r_1 \backslash c_1$ is nullable; therefore
$\bmkeps(r_1\backslash c_1)$ returns a bitcode, which we shall call
$\textit{bs}_2$.
The above term can then be rewritten as
\begin{center}
$ \simp \left[ \fuse \; \textit{bs}_2\; r_2 \backslash c_2 \right]$,
\end{center}
which is equal to
\begin{center}
$\simp \left[ \fuse \; \textit{bs}_2 \; _{bs}{\oplus rs} \right]$\\
$=\simp \left[ \; _{bs_2++bs}{\oplus rs} \right]$\\
$= \; _{bs_2++bs}{\oplus \textit{rs}_1} $
\end{center}
\noindent
by using the properties from the set $D$ again
and again. (The reason why we impose so many conditions
on the pair $(r,s)$ is that they allow us to
rewrite the terms easily and thereby construct the difference.)

Now we compute ${\bf \simp(\rup \backslash s)}$:
\begin{center}
$\simp \big[(r_1\cdot r_2) \backslash [c_1c_2] \big]= \simp \left[ ((r_1 \cdot r_2 )\backslash c_1) \backslash c_2 \right]$
\end{center}
\noindent
Again, using the properties above, we obtain
the following chain of equalities:
\begin{center}
$\simp(\rup \backslash s)= \simp \left[ ((r_1 \cdot r_2 )\backslash c_1) \backslash c_2 \right]= \simp\left[ \big( \left(r_1 \backslash c_1\right) \cdot r_2 \big) \backslash c_2 \right]$\\
$= \simp \left[ \oplus[\big( \left(r_1 \backslash c_1\right) \backslash c_2 \big) \cdot r_2 \; , \; \fuse \; (\bmkeps \;r_1\backslash c_1) \; r_2 \backslash c_2 ] \right]$,
\end{center}
\noindent
where, as before, we call the bitcode returned by
$\bmkeps(r_1\backslash c_1)$
$\textit{bs}_2$.
Also, $\simp(r_2 \backslash c_2)$ is
$_{bs}\oplus \textit{rs}_1$,
and $( \left(r_1 \backslash c_1\right) \backslash c_2 \cdot r_2)$
simplifies to $\ZERO$,
so the above term can be expanded as
\begin{center}
\begin{tabular}{l}
$\textit{distinct}(\flatten[\ZERO\;, \; _{\textit{bs}_2++\textit{bs}}\oplus \textit{rs}_1] ) \; \textit{match} $ \\
$\quad\textit{case} \; [] \Rightarrow \ZERO$ \\
$\quad\textit{case} \; a :: [] \Rightarrow \fuse \; \textit{bs} \; a$ \\
$\quad\textit{case} \; as' \Rightarrow \;_{[]}\oplus as'$\\
\end{tabular}
\end{center}
\noindent
Applying the definition of $\flatten$, we get
\begin{center}
$_{[]}\oplus (\textit{map} \; (\fuse \; (\textit{bs}_2 ++ \textit{bs})) \; \textit{rs}_1)$
\end{center}
\noindent
compared to the result
\begin{center}
$ \; _{bs_2++bs}{\oplus \textit{rs}_1} $
\end{center}
\noindent
Note how these two regular expressions
differ only in the position of the bits
$\textit{bs}_2++\textit{bs}$; they are otherwise the same.
What caused this difference?
The culprit is the $\flatten$ function, which spills
out the bitcodes of the inner alternatives whenever
there is an outer alternative.
Note how the absence of simplification
caused $\simp(\rup \backslash s)$ to
generate the nested alternatives structure
\begin{center}
$ \oplus[\ZERO \;, \; _{bs}\oplus \textit{rs} ]$
\end{center}
and this will always trigger $\flatten$ to
spill out the inner alternative's bitcode $\textit{bs}$,
whereas when
simplification is done along the way,
the structure of nested alternatives is never created (we can
actually prove that the simplification function never allows nested
alternatives to arise; more on this later).

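To make the bit-spilling behaviour of $\flatten$ concrete, here is a small Python sketch of $\fuse$ and $\flatten$ over a toy tuple encoding of annotated regular expressions. The encoding and function names are our own illustrative assumptions, not the Isabelle definitions:

```python
# Toy encoding: ('ZERO', bits), ('ONE', bits), ('CHAR', bits, c),
# ('STAR', bits, r), ('ALTS', bits, [r1, r2, ...]); bits are lists of 0/1.

def fuse(bs, r):
    """Prepend the bitcodes bs onto the top-level annotation of r."""
    tag, bs0, *rest = r
    return (tag, bs + bs0, *rest)

def flatten(rs):
    """Throw away ZEROs and open up nested alternatives, fusing the
    inner alternative's own bits onto each of its children."""
    out = []
    for r in rs:
        if r[0] == 'ZERO':
            continue                                   # ZERO is thrown away
        elif r[0] == 'ALTS':
            out.extend(fuse(r[1], ri) for ri in r[2])  # bits are spilled here
        else:
            out.append(r)
    return out

# The example from the text: flattening [ZERO, _0(+[_0 ONE, _11 a*])]
# yields [_00 ONE, _011 a*] -- the bit 0 is fused onto both children.
inner = ('ALTS', [0], [('ONE', [0]), ('STAR', [1, 1], ('CHAR', [], 'a'))])
assert flatten([('ZERO', []), inner]) == \
    [('ONE', [0, 0]), ('STAR', [0, 1, 1], ('CHAR', [], 'a'))]
```

This makes the dependence on simplification order visible: whichever pass first produces the nested alternative decides where the bits end up.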
What if we do not allow the function $\simp$
to fuse out the bits when it is unnecessary?
For instance, for the above regular expression, we might
just delete the outer layer of the alternative
\begin{center}
\st{$ {\oplus[\ZERO \;,}$} $_{bs}\oplus \textit{rs}$ \st{$]$}
\end{center}
and get $_{bs}\oplus \textit{rs}$ instead, without
fusing the bits $\textit{bs}$ onto every element
of $\textit{rs}$.
This idea can be realized by making the following
changes to the $\simp$-function:
\begin{center}
\begin{tabular}{@{}lcl@{}}
$\textit{simp}' \; (_{\textit{bs}}(a_1 \cdot a_2))$ & $\dn$ & $\textit{as} \; \simp \; \textit{was} \; \textit{before} $ \\
$\textit{simp}' \; (_{bs}\oplus as)$ & $\dn$ & \st{$\textit{distinct}( \textit{flatten} ( \textit{map} \; \simp \; \textit{as})) \; \textit{match} $} \\
&&\st{$\quad\textit{case} \; [] \Rightarrow \ZERO$} \\
&&\st{$\quad\textit{case} \; a :: [] \Rightarrow \textit{fuse bs a}$} \\
&&\st{$\quad\textit{case} \; as' \Rightarrow \textit{ALTS}\;bs\;as'$}\\
&&$\textit{if}(\textit{hollowAlternatives}( \textit{map} \; \simp \; \textit{as}))$\\
&&$\textit{then} \; \fuse \; \textit{bs}\; \textit{extractAlt}(\textit{map} \; \simp \; \textit{as} )$\\
&&$\textit{else} \; \simp(_{bs} \oplus \textit{as})$\\
$\textit{simp}' \; a$ & $\dn$ & $\textit{a} \qquad \textit{otherwise}$
\end{tabular}
\end{center}

\noindent
given the following definitions of $\textit{hollowAlternatives}$ and $\textit{extractAlt}$:
\begin{center}
$\textit{hollowAlternatives}( \textit{rs}) \dn
\exists r = (_{\textit{bs}_1}\oplus \textit{rs}_1) \in \textit{rs}. \; \forall r' \in \textit{rs}, \;
\textit{either} \; r' = \ZERO \; \textit{or} \; r' = r $\\
$\textit{extractAlt}( \textit{rs}) \dn \textit{if}\big(
\exists r = (_{\textit{bs}_1}\oplus \textit{rs}_1) \in \textit{rs}. \; \forall r' \in \textit{rs}, \;
\textit{either} \; r' = \ZERO \; \textit{or} \; r' = r \big)\; \textit{then} \; \textit{return} \; r$
\end{center}
\noindent
Basically, $\textit{hollowAlternatives}$ captures the shape of
a list of regular expressions such as
\begin{center}
$ \oplus[\ZERO \;, \; _{bs}\oplus \textit{rs} ]$
\end{center}
which means we can simply elevate the
inner regular expression $_{bs}\oplus \textit{rs}$
to the outermost level,
throwing away the useless $\ZERO$s and
the outer $\oplus$ wrapper.
With this new definition of $\simp$,
in the example where $r$ is the regular expression
$(a+b)(a+a^*)$ and $s$ is the string $aa$,
the problem of $\rup\backslash_{simp} \, s \neq \simp(\rup\backslash s)$
is resolved.

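The two helper functions can be sketched in Python over the same kind of toy tuple encoding (our own illustration, not the verified definitions):

```python
# Toy encoding: ('ZERO', bits), ('ONE', bits), ('CHAR', bits, c),
# ('ALTS', bits, [r1, ...]); bits are lists of 0/1.

def fuse(bs, r):
    """Prepend the bitcodes bs onto the top-level annotation of r."""
    tag, bs0, *rest = r
    return (tag, bs + bs0, *rest)

def hollow_alternatives(rs):
    """True iff rs contains some alternative and every element is either
    ZERO or that same alternative."""
    non_zero = [r for r in rs if r[0] != 'ZERO']
    return (non_zero != []
            and all(r[0] == 'ALTS' for r in non_zero)
            and all(r == non_zero[0] for r in non_zero))

def extract_alt(rs):
    """Return the unique non-ZERO alternative in a hollow list."""
    return next(r for r in rs if r[0] != 'ZERO')

# For +[ZERO, _0(+[ONE, a])] the inner alternative is elevated as a whole:
# the outer bits are fused once onto the alternative, not onto every child.
rs = [('ZERO', []), ('ALTS', [0], [('ONE', []), ('CHAR', [], 'a')])]
assert hollow_alternatives(rs)
assert fuse([1], extract_alt(rs)) == \
    ('ALTS', [1, 0], [('ONE', []), ('CHAR', [], 'a')])
```

The design point is precisely that `fuse` is applied to the extracted alternative as a whole, so the bits stay at the top instead of being distributed into the children.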
Unfortunately this causes new problems:
for the counterexample where
\begin{center}
$r$ is the regular expression
$(ab+(a^*+aa))$ and $s$ is the string $aa$,
\end{center}

\noindent
$\rup\backslash_{simp} \, s$ is equal to
$ _1(_{011}a^* + _1\ONE) $ whereas
$ \simp(\rup\backslash s) = (_{1011}a^* + _{11}\ONE)$.
This discrepancy does not appear for the old
version of $\simp$.

Why?

During the first derivative operation,
\begin{center}
$\rup\backslash a=( _0[ \ONE\cdot {\bf b}] + _1( _0[ _1\ONE \cdot {\bf a}^*] + [ \ONE \cdot {\bf a}]) )$,
\end{center}
\noindent
and the second derivative gives us
\begin{center}
$\rup\backslash aa=(_0( [\ZERO\cdot {\bf b}] + \ZERO) + _1( _0( [\ZERO\cdot {\bf a}^*] + _1[ _1\ONE \cdot {\bf a}^*]) + _1( [\ZERO \cdot {\bf a}] + \ONE) ))$,
\end{center}

\noindent
which simplifies to
\begin{center}
$ _1(_{011}{\bf a}^* + _1\ONE) $
\end{center}

If instead we apply simplification after the first derivative, we get
$(_0{\bf b} + _{101}{\bf a}^* + _{11}{\bf a} )$,
and doing another derivative gives
$(\ZERO + (_{101}(\ONE \cdot _1{\bf a}^*)+_{11}\ONE))$,
which simplifies to
\begin{center}
$ (_{1011}a^* + _{11}\ONE) $
\end{center}

We have changed the algorithm to suppress the old
counterexample, but this gives rise to new counterexamples.
This dilemma means the amendment is not a successful
attempt to make $\rup\backslash_{simp} \, s = \simp(\rup\backslash s)$
hold for every possible regular expression and string.
\subsection{Properties of the Function $\simp$}

We have proved quite a few properties
of the $\simp$-function in Isabelle, which help the proof to go forward,
and we list them here to aid comprehension.

To start, we need a bit of auxiliary notation,
which is quite basic and is written here only
for clarity.

$\textit{sub}(r)$ computes the set of
sub-regular-expressions of $r$:
\begin{center}
$\textit{sub}(\ONE) \dn \{\ONE\}$\\
$\textit{sub}(r_1 \cdot r_2) \dn \textit{sub}(r_1) \cup \textit{sub}(r_2) \cup \{r_1 \cdot r_2\}$\\
$\textit{sub}(r_1 + r_2) \dn \textit{sub}(r_1) \cup \textit{sub}(r_2) \cup \{r_1+r_2\}$\\
\end{center}
$\textit{good}(r) \dn r \neq \ZERO \, \land \,
\forall r' \in \textit{sub}(r), \; \textit{if} \; r' = \;_{bs_1}\oplus \textit{rs}_1 \;
\textit{then} \; \nexists r'' \in \textit{rs}_1. \;
r'' = \;_{bs_2}\oplus \textit{rs}_2 $

The properties are mainly the ones below:
\begin{itemize}
\item
\begin{center}
$\simp(\simp(r)) = \simp(r)$
\end{center}
\item
\begin{center}
$\textit{if} \; r = \simp(r') \; \textit{then} \; \textit{good}(r) $
\end{center}
\end{itemize}
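The $\textit{sub}$ and $\textit{good}$ definitions can be phrased directly in Python; in this sketch the bit annotations are omitted, since $\textit{good}$ only constrains the shape (the tuple encoding is our own assumption):

```python
# Shape-only encoding: ('ZERO',), ('ONE',), ('CHAR', c),
# ('SEQ', r1, r2), ('ALTS', (r1, ...)); tuples so sets can hold them.

def sub(r):
    """Set of sub-regular-expressions of r (including r itself)."""
    out = {r}
    if r[0] == 'SEQ':
        out |= sub(r[1]) | sub(r[2])
    elif r[0] == 'ALTS':
        for ri in r[1]:
            out |= sub(ri)
    return out

def good(r):
    """r is not ZERO and no alternative inside r directly contains
    another alternative."""
    if r == ('ZERO',):
        return False
    return not any(rp[0] == 'ALTS' and any(ri[0] == 'ALTS' for ri in rp[1])
                   for rp in sub(r))

flat = ('ALTS', (('ONE',), ('CHAR', 'a')))
nested = ('ALTS', (('ONE',), ('ALTS', (('CHAR', 'a'),))))
assert good(flat) and not good(nested)
```

The second listed property then says: everything in the image of $\simp$ satisfies this `good` predicate, i.e. simplification never leaves a nested alternative behind.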
\subsection{The Contains Relation}
$\retrieve$ is too strong a relation in that
it extracts only one bitcode instead of a set of them.
Therefore we try to define another relation (a predicate)
to capture the fact that the regular expression has bits
being moved around but still has all the bits needed.
The contains symbol, written $\gg$, is a relation that
takes two arguments in infix form
and returns a truth value.

In other words, from the set of regular expression and
value pairs
$\textit{RV} = \{(r, v) \mid \text{$r$ is a regular expression, $v$ is a value}\}$,
those that satisfy the following requirements are in the set
$\textit{RV}_{\textit{Contains}}$.
Unlike the $\retrieve$
function, which takes two arguments $r$ and $v$ and
produces a unique answer $\textit{bs}$, this relation takes only
one argument $r$ and relates it to the set of bitcodes that
can be generated by $r$.
\begin{center}
\begin{tabular}{llclll}
& & & $_{bs}\ONE$ & $\gg$ & $\textit{bs}$\\
& & & $_{bs}{\bf c}$ & $\gg$ & $\textit{bs}$\\
$\textit{if} \; r_1 \gg \textit{bs}_1$ & $\textit{and} \; r_2 \; \gg \textit{bs}_2$
& $\textit{then}$ &
$_{bs}{r_1 \cdot r_2}$ &
$\gg$ &
$\textit{bs} @ \textit{bs}_1 @ \textit{bs}_2$\\
$\textit{if} \; r \gg \textit{bs}_1$ & & $\textit{then}$ &
$_{bs}{\oplus(r :: \textit{rs})}$ &
$\gg$ &
$\textit{bs} @ \textit{bs}_1 $\\
$\textit{if} \; _{bs}(\oplus \textit{rs}) \gg \textit{bs} @ \textit{bs}_1$ & & $\textit{then}$ &
$_{bs}{\oplus(r :: \textit{rs})}$ &
$\gg$ &
$\textit{bs} @ \textit{bs}_1 $\\
$\textit{if} \; r \gg \textit{bs}_1\; \textit{and}$ & $_{bs}r^* \gg \textit{bs} @ \textit{bs}_2$ & $\textit{then}$ &
$_{bs}r^* $ &
$\gg$ &
$\textit{bs} @ [0] @ \textit{bs}_1@ \textit{bs}_2 $\\
& & & $_{bs}r^*$ & $\gg$ & $\textit{bs} @ [1]$\\
\end{tabular}
\end{center}
It is a predicate in the sense that given
a regular expression and a bitcode, it
returns true or false, depending on whether
or not the regular expression can actually produce that
bitcode. If the predicate returns true, then
we say that the regular expression $r$ contains
the bitcode $\textit{bs}$, written
$r \gg \textit{bs}$.
The $\gg$ relation with the
regular expression $r$ may also be seen as a
machine that takes the derivative of the regular expression
with respect to all strings simultaneously, collecting
the bits by going through the regular expression tree
structure in a depth-first manner, regardless of whether
the part being traversed is nullable or not.
It puts all possible bits that can be produced by such a traversal
into a set.
For example, if we are given the regular expression
$((a+b)(c+d))^*$, the tree structure may be written as
\begin{center}
\begin{tikzpicture}
\tikz[tree layout]\graph[nodes={draw, circle}] {
* ->
{@-> {
{+1 ->
{a , b}
},
{+ ->
{c , d }
}
}
}
};
\end{tikzpicture}
\end{center}
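Read as generation rules, the clauses for $\gg$ describe the set of bitcodes a regular expression contains. A Python sketch of that reading follows (the encoding is our own; the star case is unrolled only to a bounded depth, since the full set is infinite):

```python
# Encoding: ('ONE', bits), ('CHAR', bits, c), ('SEQ', bits, r1, r2),
# ('ALTS', bits, [r1, ...]), ('STAR', bits, r); bitcodes are tuples of 0/1.

def bits(r, depth=2):
    """All bitcodes bs with r >> bs, unrolling stars at most `depth` times."""
    tag, bs = r[0], tuple(r[1])
    if tag in ('ONE', 'CHAR'):
        return {bs}                           # _bs 1 >> bs and _bs c >> bs
    if tag == 'SEQ':
        return {bs + b1 + b2                  # bs @ bs1 @ bs2
                for b1 in bits(r[2], depth)
                for b2 in bits(r[3], depth)}
    if tag == 'ALTS':                         # pick any child: bs @ bs1
        return {bs + b for ri in r[2] for b in bits(ri, depth)}
    if tag == 'STAR':
        tails = {(1,)}                        # base case: _bs r* >> bs @ [1]
        for _ in range(depth):                # add one more [0] @ bs1 iteration
            tails |= {(0,) + b + t for b in bits(r[2], depth) for t in tails}
        return {bs + t for t in tails}

# (a+b) with bits 0/1 fused onto the children contains exactly {[0], [1]}:
ab = ('ALTS', [], [('CHAR', [0], 'a'), ('CHAR', [1], 'b')])
assert bits(ab) == {(0,), (1,)}
# a*: the empty iteration [1], plus [0,1] after one unrolling.
assert bits(('STAR', [], ('CHAR', [], 'a')), depth=1) == {(1,), (0, 1)}
```

This matches the depth-first "derivative on all strings at once" intuition: the traversal ignores nullability and simply records every bit sequence the structure can emit.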
\subsection{The $\textit{ders}_2$ Function}
If we want to prove the result
\begin{center}
$ \textit{blexer\_simp}(r, \; s) = \textit{blexer}(r, \; s)$
\end{center}
inductively
on the structure of the regular expression,
then we need to deal with the case $r_1 \cdot r_2$,
and it would be good if we could express $(r_1 \cdot r_2) \backslash s$
in terms of $r_1 \backslash s$ and $r_2 \backslash s$.
This naturally induces the $\textit{ders}_2$ function,
which performs a ``convolution'' of $r_1$ and $r_2$ over the string
$s$.
It is based on the observation that the derivative of $r_1 \cdot r_2$
with respect to a string $s$ can actually be written in an ``explicit form''
composed of the derivatives of $r_1$ and $r_2$ with respect to substrings of $s$.
This can be illustrated by the following unfolding:
\begin{center}
$ (r_1 \cdot r_2) \backslash [c_1c_2] = (\textit{if} \; \nullable(r_1)\; \textit{then} \; ((r_1 \backslash c_1) \cdot r_2 + r_2 \backslash c_1) \; \textit{else} \; (r_1\backslash c_1) \cdot r_2) \backslash c_2$
\end{center}
which can also be written in a ``convoluted sum''
format:
\begin{center}
$ (r_1 \cdot r_2) \backslash [c_1c_2] = \sum{r_1 \backslash s_i \cdot r_2 \backslash s_j}$
\end{center}
More precisely, the sum ranges over the splits of the string:
\begin{center}
$ (r_1 \cdot r_2) \backslash s = \sum_{s_i @ s_j = s}{(r_1 \backslash s_i) \cdot (r_2 \backslash s_j)}$
\end{center}
Note that this differs from the previous form in the sense that
it calculates the results $r_1\backslash s_i$ and $r_2 \backslash s_j$ first, and then glues them together
through nested alternatives, whereas the $(r_1 \cdot r_2) \backslash s$ procedure,
used by the algorithm $\lexer$, can only produce the components of the
resulting alternatives regular expression all together, rather than
generating each of the children nodes one by one
in a single recursive call that is only for generating that
very expression itself.
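Our reading of the convoluted-sum form can be checked on plain (unannotated) regular expressions: the sketch below builds $(r_1\cdot r_2)\backslash s$ as the sum of $(r_1\backslash s)\cdot r_2$ and those $r_2\backslash s_j$ whose prefix $s_i$ makes $r_1\backslash s_i$ nullable. The encoding and helper names are our own assumptions:

```python
# Plain regexes: ('ZERO',), ('ONE',), ('CHAR', c), ('ALT', r1, r2), ('SEQ', r1, r2)

def nullable(r):
    t = r[0]
    if t == 'ONE':
        return True
    if t in ('ZERO', 'CHAR'):
        return False
    if t == 'ALT':
        return nullable(r[1]) or nullable(r[2])
    return nullable(r[1]) and nullable(r[2])          # SEQ

def der(c, r):
    t = r[0]
    if t in ('ZERO', 'ONE'):
        return ('ZERO',)
    if t == 'CHAR':
        return ('ONE',) if r[1] == c else ('ZERO',)
    if t == 'ALT':
        return ('ALT', der(c, r[1]), der(c, r[2]))
    r1, r2 = r[1], r[2]                               # SEQ
    if nullable(r1):
        return ('ALT', ('SEQ', der(c, r1), r2), der(c, r2))
    return ('SEQ', der(c, r1), r2)

def ders(s, r):
    for c in s:
        r = der(c, r)
    return r

def ders2(s, r1, r2):
    """(r1.r2)\\s as a sum over the splits s = s[:i] @ s[i:]."""
    out = ('SEQ', ders(s, r1), r2)                    # r1 still consuming s
    for i in range(len(s)):                           # r1 finished after s[:i]
        if nullable(ders(s[:i], r1)):
            out = ('ALT', out, ders(s[i:], r2))
    return out

# ders2 agrees with the step-by-step derivative up to matching behaviour:
r1 = ('ALT', ('ONE',), ('CHAR', 'a'))                 # a + 1
r2 = ('SEQ', ('CHAR', 'a'), ('CHAR', 'b'))            # ab
for s in ['a', 'ab', 'aa', 'b']:
    for t in ['', 'a', 'b', 'ab']:
        assert (nullable(ders(t, ders(s, ('SEQ', r1, r2))))
                == nullable(ders(t, ders2(s, r1, r2))))
```

The two results differ in the shape of the alternatives (nested here, produced all at once by the lexer), but they contain the same components, which is what the equivalence check above exercises.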
\section{Conclusion}
Based on our exhaustive tests, we believe the main
result holds, yet a proof still seems elusive.
We have tried out different approaches and
found a number of properties of the function $\simp$.
The counterexamples where $\rup\backslash_{simp} \, s \neq \simp(\rup\backslash s)$
are also valuable in the sense that
through them we understand better why the two sides are not equal and what
the subtle differences are between a
regular expression simplified at every step and a
regular expression that is simplified only at the final moment.
We are almost there, but a last step is needed to make the proof work.
Hopefully in the next few weeks we will be able to find one.
%CONSTRUCTION SITE HERE

$\fuse$ is the culprit: it causes the order in which alternatives
are opened up to be remembered, and in the end the difference
appears between $\simp(\rup \backslash s)$ and $\rup \backslash_{simp} \,s$.
But we have to use $\fuse$, as it is essential to the simplification:
$\flatten$ needs it.

\subsection{Another Proof Strategy}
The $\textit{vsimp}$ function, defined as follows,
tries to simplify the value in lockstep with the
regular expression:\\

The problem here is that
we used $\retrieve$ for the key induction:
\begin{center}
$\textit{decode} \; (\textit{retrieve}\; (r\backslash (s @ [c])) \; v) \; r $\\
$\textit{decode} \; (\textit{retrieve}\; (r\backslash s) \; (\inj \; (r\backslash s) \; c\; v)) \; r$
\end{center}
Here, $\textit{decode}$ recovers a value that corresponds to a match (possibly partial)
from bits, and the bits are extracted by $\textit{retrieve}$;
the key value $v$ that guides $\textit{retrieve}$ is
$\mkeps (r\backslash s)$, $\inj \; r \; c \; (\mkeps (r\backslash s))$, $\inj (\inj (v))$, and so on.
The problem is that
we need $\textit{vsimp}$ to make a value that is suitable for decoding:
\begin{center}
$\textit{Some}(\textit{flex} \; r\; \textit{id}\; (s@[c])\; v) = \textit{Some}(\textit{flex} \; r\; \textit{id}\; s\; (\inj \; (r\backslash s)\; c\; v))$
\end{center}
Another way, which Christian came up with and which might circumvent the
problem of finding a suitable value, is to not state a $\textit{vsimp}$
function at all, but instead to collect all possible values that a regular expression is able to produce into a set,
and to prove that both $r$ and $\textit{sr}$ are able to produce the bits that correspond to the POSIX value
produced by feeding the same initial regular expression $r$ and string $s$ to the
two functions $\textit{ders}$ and $\textit{ders\_simp}$.
If we define the equivalence relation $\sim_{m\epsilon}$ between two regular expressions
$r_1$ and $r_2$ as follows:
$r_1 \sim_{m\epsilon} r_2 \iff \bmkeps(r_1)= \bmkeps(r_2)$
(in other words, $r_1$ and $r_2$ produce the same output under the function $\bmkeps$),
then the first goal
might be restated as
$(r^\uparrow)\backslash_{simp}\, s \sim_{m\epsilon} (r^\uparrow)\backslash s$.
I tried to establish such equivalence relations between regular expressions
like $dddr$ and $dddsr$, and so on,
but right now I am only able to establish the one between $dsr$ and $dr$, using structural induction on $r$.
Those involving multiple derivative operations are harder to prove.
Two attempts have been made:
(1) induction on the number of derivative operations (or, in other words, the length of the string $s$),
where the inductive hypothesis was initially specified as
``For an arbitrary regular expression $r$,
for all strings $s$ in the language of $r$ whose length does not exceed
the number $n$, $\textit{ders} \; s \; r \sim_{m\epsilon} \textit{ders\_simp} \; s \; r$''
and the proof goal may be stated as
``For an arbitrary regular expression $r$,
for all strings $s$ in the language of $r$ whose length does not exceed
the number $n+1$, $\textit{ders} \; s \; r \sim_{m\epsilon} \textit{ders\_simp} \; s \; r$''.
The problem here is that although we can easily break down
a string $s$ of length $n+1$ into $s_1@[c]$, it is not that easy
to use the induction hypothesis as a stepping stone to prove anything, because $s_1$ may well not be
in the language $L(r)$. This inhibits us from obtaining the fact that
$\textit{ders} \; s_1 \; r \sim_{m\epsilon} \textit{ders\_simp} \; s_1 \; r$.
Further exploration is needed to amend this hypothesis so that it also covers the
situation when $\textit{ders} \; s_1 \; r$ is not nullable.
For example, what information (bits?
values?) can be extracted
from the regular expression $\textit{ders}(s_1,r)$, so that we can compute or predict the possible
result of $\bmkeps$ after another derivative operation? What function $f$ can be used to
carry out the task? The possible way of exploration can be
more directly perceived through the sketch below:
Find a function
$f$
such that
\begin{center}
$f(\textit{bders} \; s_1 \; r) = \textit{re}_1$\\
$f(\textit{bderss} \; s_1 \; r) = \textit{re}_2$\\
$\bmkeps(\textit{bders} \; s \; r) = g(\textit{re}_1,c)$\\
$\bmkeps(\textit{bderssimp} \; s \; r) = g(\textit{re}_2,c)$\\
and $g(\textit{re}_1,c) = g(\textit{re}_2,c)$.
\end{center}
The inductive hypothesis would be
``For all strings $s_1$ of length $\leq n$,
$f(\textit{bders} \; s_1 \; r) = \textit{re}_1$ and
$f(\textit{bderss} \; s_1 \; r) = \textit{re}_2$''.
Proving this would be a lemma for the main proof;
the main proof would be
``$\bmkeps(\textit{bders} \; s \; r) = g(\textit{re}_1,c)$ and
$\bmkeps(\textit{bderssimp} \; s \; r) = g(\textit{re}_2,c)$
for $s = s_1@[c]$''.
And $f$ needs to be a recursive property for the lemma to be provable:
it needs to store not only the ``after one char nullable'' information,
but also the ``after two chars nullable'' information,
and so on, so that it is able to predict what $f$ will compute after a derivative operation;
in other words, it needs to be ``infinitely recursive''.\\
To prove the lemma, in other words, to get\\
``For all strings $s_1$ of length $\leq n+1$,
$f(\textit{bders} \; s_1 \; r) = \textit{re}_3$ and
$f(\textit{bderss} \; s_1 \; r) = \textit{re}_4$''\\
from\\
``For all strings $s_1$ of length $\leq n$,
$f(\textit{bders} \; s_1 \; r) = \textit{re}_1$ and
$f(\textit{bderss} \; s_1 \; r) = \textit{re}_2$'',\\
it might be best to construct an auxiliary function $h$ such that\\
$h(\textit{re}_1, c) = \textit{re}_3$\\
$h(\textit{re}_2, c) = \textit{re}_4$\\
with $\textit{re}_3 = f(\textit{bder} \; c \; (\textit{bders} \; s_1 \; r))$ and\\
$\textit{re}_4 = f(\simp(\textit{bder} \; c \; (\textit{bderss} \; s_1 \; r)))$.
The key point here is that we are not only interested in what $\textit{bders} \; s \; r$ will produce under
$\bmkeps$, but also in how it will behave after one derivative operation and then $\bmkeps$, after two
derivative operations, and so on. In essence, we are preserving the regular expression
itself under the function $f$, in a less compact way than the regular expression: we are
not just recording but also interpreting what the regular expression matches.
In other words, we need to prove properties of $\textit{bderss} \; s \; r$ beyond the $\bmkeps$ result,
i.e., not just the nullable ones, but also those containing remaining characters.\\
(2) We observed the fact that
$\erase(\textit{sdddddr}) = \erase(\textit{sdsdsdsr})$;
that is to say, despite the bits being moved around on the regular expression
(a difference in bits), the structure of the (unannotated) regular expression
after one simplification is exactly the same after the
same sequence of derivative operations,
regardless of whether we did simplification
along the way.
However, without $\erase$ the above equality does not hold:
for the regular expression
$(a+b)(a+a^*)$,
if we take the derivative with respect to the string $aa$,
we get
%TODO
$\textit{sdddddr}$ not equal to $\textit{sdsdsdsr}$ sometimes.\\
For example,

this equivalence-class method might still have the potential of proving this,
but not yet.
In parallel, I tried another method using $\retrieve$.\\
\noindent\rule[0.5ex]{\linewidth}{1pt}

\bibliographystyle{plain}
\bibliography{root}

\end{document}