lexing: thys3/Paper.thy@3831621d7b14 (annotated)

496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1	(<)
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	2	theory Paper
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	3	imports
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	4	"Posix.LexerSimp"
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	5	"Posix.FBound"
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	6	"HOL-Library.LaTeXsugar"
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	7	begin
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	8
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	9	declare [[show_question_marks = false]]
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	10
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	11	notation (latex output)
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	12	If ("(\<^latex>\<open>\\textrm{\<close>if\<^latex>\<open>}\<close> (_)/ \<^latex>\<open>\\textrm{\<close>then\<^latex>\<open>}\<close> (_)/ \<^latex>\<open>\\textrm{\<close>else\<^latex>\<open>}\<close> (_))" 10) and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	13	Cons ("_\<^latex>\<open>\\mbox{$\\,$}\<close>::\<^latex>\<open>\\mbox{$\\,$}\<close>_" [75,73] 73)
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	14
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	15
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	16	abbreviation
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	17	"der_syn r c \<equiv> der c r"
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	18	abbreviation
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	19	"ders_syn r s \<equiv> ders s r"
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	20	abbreviation
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	21	"bder_syn r c \<equiv> bder c r"
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	22
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	23	notation (latex output)
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	24	der_syn ("_\\_" [79, 1000] 76) and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	25	ders_syn ("_\\_" [79, 1000] 76) and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	26	bder_syn ("_\\_" [79, 1000] 76) and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	27	bders ("_\\_" [79, 1000] 76) and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	28	bders_simp ("_\\\<^sub>b\<^sub>s\<^sub>i\<^sub>m\<^sub>p _" [79, 1000] 76) and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	29
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	30	ZERO ("\<^bold>0" 81) and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	31	ONE ("\<^bold>1" 81) and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	32	CH ("_" [1000] 80) and
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	33	ALT ("_ + _" [76,76] 77) and
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	34	SEQ ("_ \<cdot> _" [78,78] 78) and
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	35	STAR ("_\<^sup>*" [79] 78) and
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	36	NTIMES ("_\<^sup>{\<^bsup>_\<^esup>\<^sup>}" [79] 78) and
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	37
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	38	val.Void ("Empty" 78) and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	39	val.Char ("Char _" [1000] 78) and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	40	val.Left ("Left _" [79] 78) and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	41	val.Right ("Right _" [1000] 78) and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	42	val.Seq ("Seq _ _" [79,79] 78) and
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	43	val.Stars ("Stars _" [1000] 78) and
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	44
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	45	Prf ("\<turnstile> _ : _" [75,75] 75) and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	46	Posix ("'(_, _') \<rightarrow> _" [63,75,75] 75) and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	47
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	48	flat ("\|_\|" [75] 74) and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	49	flats ("\|_\|" [72] 74) and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	50	injval ("inj _ _ _" [79,77,79] 76) and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	51	mkeps ("mkeps _" [79] 76) and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	52	length ("len _" [73] 73) and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	53	set ("_" [73] 73) and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	54
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	55	AZERO ("ZERO" 81) and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	56	AONE ("ONE _" [79] 78) and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	57	ACHAR ("CHAR _ _" [79, 79] 80) and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	58	AALTs ("ALTs _ _" [77,77] 78) and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	59	ASEQ ("SEQ _ _ _" [79, 79,79] 78) and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	60	ASTAR ("STAR _ _" [79, 79] 78) and
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	61	ANTIMES ("NT _ _ _" [79, 79, 79] 78) and
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	62
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	63	code ("code _" [79] 74) and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	64	intern ("_\<^latex>\<open>\\mbox{$^\\uparrow$}\<close>" [900] 80) and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	65	erase ("_\<^latex>\<open>\\mbox{$^\\downarrow$}\<close>" [1000] 74) and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	66	bnullable ("bnullable _" [1000] 80) and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	67	bsimp_AALTs ("bsimpALT _ _" [10,1000] 80) and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	68	bsimp_ASEQ ("bsimpSEQ _ _ _" [10,1000,1000] 80) and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	69	bmkeps ("bmkeps _" [1000] 80) and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	70
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	71	eq1 ("_ \<approx> _" [77,77] 76) and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	72
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	73	srewrite ("_\<^latex>\<open>\\mbox{$\\,\\stackrel{s}{\\leadsto}$}\<close> _" [71, 71] 80) and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	74	rrewrites ("_ \<^latex>\<open>\\mbox{$\\,\\leadsto^*$}\<close> _" [71, 71] 80) and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	75	srewrites ("_ \<^latex>\<open>\\mbox{$\\,\\stackrel{s}{\\leadsto}^*$}\<close> _" [71, 71] 80) and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	76	blexer_simp ("blexer\<^sup>+" 1000)
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	77
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	78
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	79	lemma better_retrieve:
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	80	shows "rs \<noteq> Nil ==> retrieve (AALTs bs (r#rs)) (Left v) = bs @ retrieve r v"
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	81	and "rs \<noteq> Nil ==> retrieve (AALTs bs (r#rs)) (Right v) = bs @ retrieve (AALTs [] rs) v"
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	82	apply (metis list.exhaust retrieve.simps(4))
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	83	by (metis list.exhaust retrieve.simps(5))
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	84
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	85
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	86	lemma better_retrieve2:
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	87	shows "retrieve (ANTIMES bs r (n + 1)) (Stars (v#vs)) =
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	88	bs @ [Z] @ retrieve r v @ retrieve (ANTIMES [] r n) (Stars vs)"
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	89	by auto
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	90	(>)
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	91
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	92	section {* Introduction *}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	93
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	94	text {*
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	95
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	96	In the last fifteen or so years, Brzozowski's derivatives of regular
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	97	expressions have sparked quite a bit of interest in the functional
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	98	programming and theorem prover communities.
563 c92a41d9c4da updated paper and theories to include n-times Christian Urban <christian.urban@kcl.ac.uk> parents: 502 diff changeset	99	Derivatives of a
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	100	regular expressions, written @{term "der c r"}, give a simple solution
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	101	to the problem of matching a string @{term s} with a regular
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	102	expression @{term r}: if the derivative of @{term r} w.r.t.\ (in
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	103	succession) all the characters of the string matches the empty string,
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	104	then @{term r} matches @{term s} (and {\em vice versa}).
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	105	The beauty of
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	106	Brzozowski's derivatives \cite{Brzozowski1964} is that they are neatly
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	107	expressible in any functional language, and easily definable and
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	108	reasoned about in theorem provers---the definitions just consist of
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	109	inductive datatypes and simple recursive functions. Another attractive
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	110	feature of derivatives is that they can be easily extended to \emph{bounded}
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	111	regular expressions, such as @{term "r"}$^{\{@{text n}\}}$ or @{term r}$^{\{..@{text n}\}}$, where numbers or
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	112	intervals of numbers specify how many times a regular expression should be used
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	113	during matching.
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	114
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	115
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	116
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	117
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	118	However, there are two difficulties with derivative-based matchers:
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	119	First, Brzozowski's original matcher only generates a yes/no answer
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	120	for whether a regular expression matches a string or not. This is too
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	121	little information in the context of lexing where separate tokens must
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	122	be identified and also classified (for example as keywords
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	123	or identifiers). Sulzmann and Lu~\cite{Sulzmann2014} overcome this
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	124	difficulty by cleverly extending Brzozowski's matching
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	125	algorithm. Their extended version generates additional information on
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	126	\emph{how} a regular expression matches a string following the POSIX
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	127	rules for regular expression matching. They achieve this by adding a
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	128	second ``phase'' to Brzozowski's algorithm involving an injection
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	129	function. In our own earlier work we provided the formal
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	130	specification of what POSIX matching means and proved in Isabelle/HOL
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	131	the correctness
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	132	of Sulzmann and Lu's extended algorithm accordingly
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	133	\cite{AusafDyckhoffUrban2016}.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	134
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	135	The second difficulty is that Brzozowski's derivatives can
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	136	grow to arbitrarily big sizes. For example if we start with the
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	137	regular expression \mbox{@{text "(a + aa)\<^sup>*"}} and take
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	138	successive derivatives according to the character $a$, we end up with
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	139	a sequence of ever-growing derivatives like
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	140
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	141	\def\ll{\stackrel{\_\backslash{} a}{\longrightarrow}}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	142	\begin{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	143	\begin{tabular}{rll}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	144	$(a + aa)^$ & $\ll$ & $(\ONE + \ONE{}a) \cdot (a + aa)^$\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	145	& $\ll$ & $(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* \;+\; (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	146	& $\ll$ & $(\ZERO + \ZERO{}a + \ZERO) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^* \;+\; $\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	147	& & $\qquad(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	148	& $\ll$ & \ldots \hspace{15mm}(regular expressions of sizes 98, 169, 283, 468, 767, \ldots)
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	149	\end{tabular}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	150	\end{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	151
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	152	\noindent where after around 35 steps we run out of memory on a
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	153	typical computer (we shall define shortly the precise details of our
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	154	regular expressions and the derivative operation). Clearly, the
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	155	notation involving $\ZERO$s and $\ONE$s already suggests
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	156	simplification rules that can be applied to regular regular
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	157	expressions, for example $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	158	\Rightarrow r$, $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	159	r$. While such simple-minded simplifications have been proved in our
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	160	earlier work to preserve the correctness of Sulzmann and Lu's
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	161	algorithm \cite{AusafDyckhoffUrban2016}, they unfortunately do
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	162	\emph{not} help with limiting the growth of the derivatives shown
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	163	above: the growth is slowed, but some derivatives can still grow rather
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	164	quickly beyond any finite bound.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	165
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	166
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	167	Sulzmann and Lu address this ``growth problem'' in a second algorithm
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	168	\cite{Sulzmann2014} where they introduce bitcoded
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	169	regular expressions. In this version, POSIX values are
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	170	represented as bitsequences and such sequences are incrementally generated
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	171	when derivatives are calculated. The compact representation
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	172	of bitsequences and regular expressions allows them to define a more
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	173	``aggressive'' simplification method that keeps the size of the
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	174	derivatives finitely bounded no matter what the length of the string is.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	175	They make some informal claims about the correctness and linear behaviour
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	176	of this version, but do not provide any supporting proof arguments, not
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	177	even ``pencil-and-paper'' arguments. They write about their bitcoded
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	178	\emph{incremental parsing method} (that is the algorithm to be fixed and formalised
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	179	in this paper):
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	180	%
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	181	\begin{quote}\it
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	182	``Correctness Claim: We further claim that the incremental parsing
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	183	method [..] in combination with the simplification steps [..]
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	184	yields POSIX parse trees. We have tested this claim
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	185	extensively [..] but yet
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	186	have to work out all proof details.'' \cite[Page 14]{Sulzmann2014}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	187	\end{quote}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	188
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	189	\noindent{}\textbf{Contributions:} We fill this gap by implementing in Isabelle/HOL
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	190	our version of the derivative-based lexing algorithm of Sulzmann and Lu
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	191	\cite{Sulzmann2014} where regular expressions are annotated with
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	192	bitsequences. We define the crucial simplification function as a
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	193	recursive function, without the need of a fixpoint operation. One objective of
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	194	the simplification function is to remove duplicates of regular
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	195	expressions. For this Sulzmann and Lu use in their paper the standard
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	196	@{text nub} function from Haskell's list library, but this function
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	197	does not achieve the intended objective with bitcoded regular expressions. The
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	198	reason is that in the bitcoded setting, each copy generally has a
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	199	different bitcode annotation---so @{text nub} would never ``fire''.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	200	Inspired by Scala's library for lists, we shall instead use a @{text
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	201	distinctWith} function that finds duplicates under an ``erasing'' function
616 8907d4b6316d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 615 diff changeset	202	that deletes bitcodes before comparing regular expressions.
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	203	We shall also introduce our \emph{own} arguments and definitions for
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	204	establishing the correctness of the bitcoded algorithm when
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	205	simplifications are included. Finally we
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	206	establish that the size of derivatives can be finitely bounded.\medskip%\footnote{ In this paper, we shall first briefly introduce the basic notions
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	207	%of regular expressions and describe the definition
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	208	%of POSIX lexing from our earlier work \cite{AusafDyckhoffUrban2016}. This serves
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	209	%as a reference point for what correctness means in our Isabelle/HOL proofs. We shall then prove
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	210	%the correctness for the bitcoded algorithm without simplification, and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	211	%after that extend the proof to include simplification.}\smallskip
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	212	%\mbox{}\\[-10mm]
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	213
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	214	\noindent In this paper, we shall first briefly introduce the basic notions
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	215	of regular expressions and describe the definition
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	216	of POSIX lexing from our earlier work \cite{AusafDyckhoffUrban2016}. This serves
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	217	as a reference point for what correctness means in our Isabelle/HOL proofs. We shall then prove
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	218	the correctness for the bitcoded algorithm without simplification, and
643 9580bae0500d updated Christian Urban <christian.urban@kcl.ac.uk> parents: 642 diff changeset	219	after that extend the proof to include simplification.
9580bae0500d updated Christian Urban <christian.urban@kcl.ac.uk> parents: 642 diff changeset	220	Our Isabelle code including the results from Sec.~5 is available
9580bae0500d updated Christian Urban <christian.urban@kcl.ac.uk> parents: 642 diff changeset	221	from \textcolor{darkblue}{\url{https://github.com/urbanchr/posix}}.
9580bae0500d updated Christian Urban <christian.urban@kcl.ac.uk> parents: 642 diff changeset	222	\mbox{}\\[-6mm]
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	223
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	224	*}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	225
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	226	section {* Background *}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	227
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	228	text {*
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	229	In our Isabelle/HOL formalisation strings are lists of characters with
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	230	the empty string being represented by the empty list, written $[]$,
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	231	and list-cons being written as $\_\!::\!\_\,$; string
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	232	concatenation is $\_ \,@\, \_\,$. We often use the usual
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	233	bracket notation for lists also for strings; for example a string
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	234	consisting of just a single character $c$ is written $[c]$.
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	235	Our regular expressions are defined as the following inductive
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	236	datatype:\\[-4mm]
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	237	%
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	238	\begin{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	239	@{text "r ::="} \;
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	240	@{const "ZERO"} $\mid$
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	241	@{const "ONE"} $\mid$
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	242	@{term "CH c"} $\mid$
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	243	@{term "ALT r\<^sub>1 r\<^sub>2"} $\mid$
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	244	@{term "SEQ r\<^sub>1 r\<^sub>2"} $\mid$
563 c92a41d9c4da updated paper and theories to include n-times Christian Urban <christian.urban@kcl.ac.uk> parents: 502 diff changeset	245	@{term "STAR r"} $\mid$
c92a41d9c4da updated paper and theories to include n-times Christian Urban <christian.urban@kcl.ac.uk> parents: 502 diff changeset	246	@{term "NTIMES r n"}
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	247	\end{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	248
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	249	\noindent where @{const ZERO} stands for the regular expression that
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	250	does not match any string, @{const ONE} for the regular expression
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	251	that matches only the empty string and @{term c} for matching a
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	252	character literal. The constructors $+$ and $\cdot$ represent
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	253	alternatives and sequences, respectively. We sometimes omit the
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	254	$\cdot$ in a sequence regular expression for brevity. The
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	255	\emph{language} of a regular expression, written $L(r)$, is defined
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	256	as usual and we omit giving the definition here (see for example
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	257	\cite{AusafDyckhoffUrban2016}).
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	258
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	259	In our work here we also add to the usual ``basic'' regular
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	260	expressions the \emph{bounded} regular expression @{term "NTIMES r
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	261	n"} where the @{term n} specifies that @{term r} should match
643 9580bae0500d updated Christian Urban <christian.urban@kcl.ac.uk> parents: 642 diff changeset	262	exactly @{term n}-times (it is not included in Sulzmann and Lu's original work). For brevity we omit the other bounded
644 9f984ff20020 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 643 diff changeset	263	regular expressions @{text "r"}$^{\{..\textit{n}\}}$,
9f984ff20020 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 643 diff changeset	264	@{text "r"}$^{\{\textit{n}..\}}$
9f984ff20020 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 643 diff changeset	265	and @{text "r"}$^{\{\textit{n}..\textit{m}\}}$ which specify intervals for how many
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	266	times @{text r} should match. The results presented in this paper
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	267	extend straightforwardly to them too. The importance of the bounded
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	268	regular expressions is that they are often used in practical
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	269	applications, such as Snort (a system for detecting network
599 a5f666410101 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 578 diff changeset	270	intrusions) and also in XML Schema definitions. According to Bj\"{o}rklund et
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	271	al~\cite{BjorklundMartensTimm2015}, bounded regular expressions
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	272	occur frequently in the latter and can have counters of up to
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	273	ten million. The problem is that tools based on the classic notion
644 9f984ff20020 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 643 diff changeset	274	of automata need to expand @{text "r"}$^{\{\textit{n}\}}$ into @{text n}
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	275	connected copies of the automaton for @{text r}. This leads to very
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	276	inefficient matching algorithms or algorithms that consume large
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	277	amounts of memory. A classic example is the regular expression
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	278	\mbox{@{term "SEQ (SEQ (STAR (ALT a b)) a) (NTIMES (ALT a b) n)"}}
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	279	where the minimal DFA requires at least $2^{n + 1}$ states (see
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	280	\cite{CountingSet2020}). Therefore regular expression matching
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	281	libraries that rely on the classic notion of DFAs often impose
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	282	adhoc limits for bounded regular expressions: For example in the
571 a76458dd9e4c added paper about counting automata Christian Urban <christian.urban@kcl.ac.uk> parents: 569 diff changeset	283	regular expression matching library in the Go language and also in Google's RE2 library the regular expression
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	284	@{term "NTIMES a 1001"} is not permitted, because no counter can be
643 9580bae0500d updated Christian Urban <christian.urban@kcl.ac.uk> parents: 642 diff changeset	285	above 1000; and in the regular expression library in Rust
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	286	expressions such as @{text "a\<^bsup>{1000}{100}{5}\<^esup>"} give an error
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	287	message for being too big. Up until recently,\footnote{up until version 1.5.4 of the regex
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	288	library in Rust; see also CVE-2022-24713.} Rust
643 9580bae0500d updated Christian Urban <christian.urban@kcl.ac.uk> parents: 642 diff changeset	289	however happily generated automata for regular expressions such as
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	290	@{text "a\<^bsup>{0}{4294967295}\<^esup>"}. This was due to a bug
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	291	in the algorithm that decides when a regular expression is acceptable
643 9580bae0500d updated Christian Urban <christian.urban@kcl.ac.uk> parents: 642 diff changeset	292	or too big according to Rust's classification (it did not account for the fact that @{text "a\<^bsup>{0}\<^esup>"} and similar examples can match the empty string). We shall come back to
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	293	this example later in the paper.
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	294	These problems can of course be solved in matching
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	295	algorithms where automata go beyond the classic notion and for
643 9580bae0500d updated Christian Urban <christian.urban@kcl.ac.uk> parents: 642 diff changeset	296	instance include explicit counters (e.g.~\cite{CountingSet2020}).
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	297	The point here is that Brzozowski derivatives and the algorithms by
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	298	Sulzmann and Lu can be straightforwardly extended to deal with
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	299	bounded regular expressions and moreover the resulting code
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	300	still consists of only simple recursive functions and inductive
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	301	datatypes. Finally, bounded regular expressions
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	302	do not destroy our finite boundedness property, which we shall
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	303	prove later on.
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	304
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	305	%, because during the lexing process counters will only be
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	306	%decremented.
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	307
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	308
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	309	Central to Brzozowski's regular expression matcher are two functions
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	310	called @{text nullable} and \emph{derivative}. The latter is written
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	311	$r\backslash c$ for the derivative of the regular expression $r$
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	312	w.r.t.~the character $c$. Both functions are defined by recursion over
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	313	regular expressions.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	314	%
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	315	\begin{center}
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	316	\begin{tabular}{c @ {\hspace{-9mm}}c}
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	317	\begin{tabular}{r@ {\hspace{2mm}}c@ {\hspace{2mm}}l}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	318	@{thm (lhs) der.simps(1)} & $\dn$ & @{thm (rhs) der.simps(1)}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	319	@{thm (lhs) der.simps(2)} & $\dn$ & @{thm (rhs) der.simps(2)}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	320	@{thm (lhs) der.simps(3)} & $\dn$ & @{thm (rhs) der.simps(3)}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	321	@{thm (lhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	322	@{thm (lhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{text "if"} @{term "nullable(r\<^sub>1)"}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	323	& & @{text "then"} @{term "ALT (SEQ (der c r\<^sub>1) r\<^sub>2) (der c r\<^sub>2)"}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	324	& & @{text "else"} @{term "SEQ (der c r\<^sub>1) r\<^sub>2"}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	325	% & & @{thm (rhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]}\\
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	326	@{thm (lhs) der.simps(6)} & $\dn$ & @{thm (rhs) der.simps(6)}\\
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	327	@{thm (lhs) der.simps(7)} & $\dn$ & @{thm (rhs) der.simps(7)}
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	328	\end{tabular}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	329	&
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	330	\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	331	@{thm (lhs) nullable.simps(1)} & $\dn$ & @{thm (rhs) nullable.simps(1)}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	332	@{thm (lhs) nullable.simps(2)} & $\dn$ & @{thm (rhs) nullable.simps(2)}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	333	@{thm (lhs) nullable.simps(3)} & $\dn$ & @{thm (rhs) nullable.simps(3)}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	334	@{thm (lhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	335	@{thm (lhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	336	@{thm (lhs) nullable.simps(6)} & $\dn$ & @{thm (rhs) nullable.simps(6)}\\
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	337	@{thm (lhs) nullable.simps(7)} & $\dn$ & @{text "if"} @{text "n = 0"}\\
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	338	& & @{text "then"} @{const "True"}\\
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	339	& & @{text "else"} @{term "nullable r"}
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	340	\end{tabular}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	341	\end{tabular}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	342	\end{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	343
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	344	\noindent
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	345	We can extend this definition to give derivatives w.r.t.~strings,
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	346	namely as @{thm (lhs) ders.simps(1)}$\dn$@{thm (rhs) ders.simps(1)}
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	347	and @{thm (lhs) ders.simps(2)}$\dn$@{thm (rhs) ders.simps(2)}.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	348	Using @{text nullable} and the derivative operation, we can
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	349	define a simple regular expression matcher, namely
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	350	$@{text "match s r"} \dn @{term nullable}(r\backslash s)$.
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	351	This is essentially Brzozowski's algorithm from 1964. Its
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	352	main virtue is that the algorithm can be easily implemented as a
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	353	functional program (either in a functional programming language or in
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	354	a theorem prover). The correctness of @{text match} amounts to
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	355	establishing the property:%\footnote{It is a fun exercise to formally prove this property in a theorem prover.}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	356	%
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	357	\begin{proposition}\label{matchcorr}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	358	@{text "match s r"} \;\;\text{if and only if}\;\; $s \in L(r)$
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	359	\end{proposition}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	360
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	361	\noindent
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	362	It is a fun exercise to formally prove this property in a theorem prover.
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	363	We are aware
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	364	of a mechanised correctness proof of Brzozowski's derivative-based matcher in HOL4 by
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	365	Owens and Slind~\cite{Owens2008}. Another one in Isabelle/HOL is part
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	366	of the work by Krauss and Nipkow~\cite{Krauss2011}. And another one
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	367	in Coq is given by Coquand and Siles \cite{Coquand2012}.
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	368	Also Ribeiro and Du Bois give one in Agda~\cite{RibeiroAgda2017}.
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	369
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	370	The novel idea of Sulzmann and Lu is to extend this algorithm for
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	371	lexing, where it is important to find out which part of the string
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	372	is matched by which part of the regular expression.
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	373	For this Sulzmann and Lu presented two lexing algorithms in their
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	374	paper~\cite{Sulzmann2014}. The first algorithm consists of two phases: first a
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	375	matching phase (which is Brzozowski's algorithm) and then a value
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	376	construction phase. The values encode \emph{how} a regular expression
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	377	matches a string. \emph{Values} are defined as the inductive datatype
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	378	%
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	379	\begin{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	380	@{text "v ::="}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	381	@{const "Void"} $\mid$
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	382	@{term "val.Char c"} $\mid$
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	383	@{term "Left v"} $\mid$
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	384	@{term "Right v"} $\mid$
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	385	@{term "Seq v\<^sub>1 v\<^sub>2"} $\mid$
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	386	@{term "Stars vs"}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	387	\end{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	388
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	389	\noindent where we use @{term vs} to stand for a list of values. The
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	390	string underlying a value can be calculated by a @{const flat}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	391	function, written @{term "flat DUMMY"}. It traverses a value and
616 8907d4b6316d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 615 diff changeset	392	collects the characters contained in it (see \cite{AusafDyckhoffUrban2016}).
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	393
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	394
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	395	Sulzmann and Lu also define inductively an
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	396	inhabitation relation that associates values to regular expressions. Our
616 8907d4b6316d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 615 diff changeset	397	version of this relation is defined by the following six rules:
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	398	%
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	399	\begin{center}
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	400	\begin{tabular}{@ {}l@ {}}
644 9f984ff20020 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 643 diff changeset	401	@{thm[mode=Axiom] Prf.intros(4)}\qquad
9f984ff20020 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 643 diff changeset	402	@{thm[mode=Rule] Prf.intros(2)[of "v\<^sub>1" "r\<^sub>1" "r\<^sub>2"]}\qquad
9f984ff20020 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 643 diff changeset	403	@{thm[mode=Rule] Prf.intros(3)[of "v\<^sub>2" "r\<^sub>2" "r\<^sub>1"]}\qquad
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	404	@{thm[mode=Rule] Prf.intros(1)[of "v\<^sub>1" "r\<^sub>1" "v\<^sub>2" "r\<^sub>2"]}\medskip\\
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	405	@{thm[mode=Axiom] Prf.intros(5)[of "c"]}\qquad
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	406	@{thm[mode=Rule] Prf.intros(6)[of "vs" "r"]}\qquad
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	407	$\mprset{flushleft}\inferrule{
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	408	@{thm (prem 1) Prf.intros(7)[of "vs\<^sub>1" "r" "vs\<^sub>2" "n"]}\\\\
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	409	@{thm (prem 2) Prf.intros(7)[of "vs\<^sub>1" "r" "vs\<^sub>2" "n"]}\quad
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	410	@{thm (prem 3) Prf.intros(7)[of "vs\<^sub>1" "r" "vs\<^sub>2" "n"]}
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	411	}
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	412	{@{thm (concl) Prf.intros(7)[of "vs\<^sub>1" "r" "vs\<^sub>2" "n"]}}
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	413	$
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	414	\end{tabular}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	415	\end{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	416
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	417	\noindent Note that no values are associated with the regular expression
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	418	@{term ZERO}, since it cannot match any string. Interesting is our version
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	419	of the rule for @{term "STAR r"} where we require that each value
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	420	in @{term vs} flattens to a non-empty string. This means if @{term "STAR r"} ``fires''
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	421	one or more times, then each copy needs to match a non-empty string.
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	422	Similarly, in the rule
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	423	for @{term "NTIMES r n"} we require that the length of the list
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	424	@{term "vs\<^sub>1 @ vs\<^sub>2"} equals @{term n} (meaning the regular expression
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	425	@{text r} matches @{text n}-times) and that the first segment of this list
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	426	contains values that flatten to non-empty strings followed by a segment that
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	427	only contains values that flatten to the empty string.
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	428	It is routine to establish how values ``inhabiting'' a regular
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	429	expression correspond to the language of a regular expression, namely
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	430	@{thm L_flat_Prf}.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	431
616 8907d4b6316d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 615 diff changeset	432	In general there is more than one value inhabiting a regular
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	433	expression (meaning regular expressions can typically match more
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	434	than one string). But even when fixing a string from the language of the
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	435	regular expression, there are generally more than one way of how the
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	436	regular expression can match this string. POSIX lexing is about
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	437	identifying the unique value for a given regular expression and a
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	438	string that satisfies the informal POSIX rules (see
499 6a100d32314c updated the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 498 diff changeset	439	\cite{POSIX,Kuklewicz,OkuiSuzuki2010,Sulzmann2014,Vansummeren2006}).
6a100d32314c updated the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 498 diff changeset	440	%\footnote{POSIX
6a100d32314c updated the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 498 diff changeset	441	% lexing acquired its name from the fact that the corresponding
6a100d32314c updated the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 498 diff changeset	442	% rules were described as part of the POSIX specification for
6a100d32314c updated the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 498 diff changeset	443	% Unix-like operating systems \cite{POSIX}.}
6a100d32314c updated the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 498 diff changeset	444	Sometimes these
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	445	informal rules are called \emph{maximal munch rule} and \emph{rule priority}.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	446	One contribution of our earlier paper is to give a convenient
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	447	specification for what POSIX values are (the inductive rules are shown in
616 8907d4b6316d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 615 diff changeset	448	Fig~\ref{POSIXrules}).
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	449
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	450	\begin{figure}[t]
499 6a100d32314c updated the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 498 diff changeset	451	\begin{center}\small%
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	452	\begin{tabular}{@ {\hspace{-2mm}}c@ {}}
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	453	\\[-8.5mm]
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	454	@{thm[mode=Axiom] Posix.intros(1)}\<open>P\<close>@{term "ONE"} \quad
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	455	@{thm[mode=Axiom] Posix.intros(2)}\<open>P\<close>@{term "c"}\quad
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	456	@{thm[mode=Rule] Posix.intros(3)[of "s" "r\<^sub>1" "v" "r\<^sub>2"]}\<open>P+L\<close>\quad
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	457	@{thm[mode=Rule] Posix.intros(4)[of "s" "r\<^sub>2" "v" "r\<^sub>1"]}\<open>P+R\<close>\medskip\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	458	$\mprset{flushleft}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	459	\inferrule
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	460	{@{thm (prem 1) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \qquad
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	461	@{thm (prem 2) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \qquad
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	462	@{thm (prem 3) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	463	{@{thm (concl) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}$\<open>PS\<close>\medskip\\
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	464	@{thm[mode=Axiom] Posix.intros(7)}\<open>P[]\<close>\qquad
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	465	$\mprset{flushleft}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	466	\inferrule
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	467	{@{thm (prem 1) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	468	@{thm (prem 2) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	469	@{thm (prem 3) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \\\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	470	@{thm (prem 4) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	471	{@{thm (concl) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}$\<open>P\<star>\<close>\medskip\\
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	472	\mprset{sep=4mm}
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	473	@{thm[mode=Rule] Posix.intros(9)}\<open>Pn[]\<close>\quad\;
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	474	$\mprset{flushleft}
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	475	\inferrule
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	476	{@{thm (prem 1) Posix.intros(8)[of "s\<^sub>1" "r" "v" "s\<^sub>2" n "vs"]} \qquad
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	477	@{thm (prem 2) Posix.intros(8)[of "s\<^sub>1" "r" "v" "s\<^sub>2" n "vs"]} \qquad
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	478	@{thm (prem 3) Posix.intros(8)[of "s\<^sub>1" "r" "v" "s\<^sub>2" n "vs"]} \\\\
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	479	@{thm (prem 4) Posix.intros(8)[of "s\<^sub>1" "r" "v" "s\<^sub>2" n "vs"]}}
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	480	{@{thm (concl) Posix.intros(8)[of "s\<^sub>1" "r" "v" "s\<^sub>2" n "vs"]}}$\<open>Pn+\<close>
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	481	\\[-4mm]
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	482	\end{tabular}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	483	\end{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	484	\caption{The inductive definition of POSIX values taken from our earlier paper \cite{AusafDyckhoffUrban2016}. The ternary relation, written $(s, r) \rightarrow v$, formalises the notion
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	485	of given a string $s$ and a regular
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	486	expression $r$ what is the unique value $v$ that satisfies the informal POSIX constraints for
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	487	regular expression matching.\smallskip}\label{POSIXrules}
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	488	\end{figure}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	489
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	490	The clever idea by Sulzmann and Lu \cite{Sulzmann2014} in their first algorithm is to define
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	491	an injection function on values that mirrors (but inverts) the
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	492	construction of the derivative on regular expressions. Essentially it
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	493	injects back a character into a value.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	494	For this they define two functions called @{text mkeps} and @{text inj}:
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	495	%
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	496	\begin{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	497	\begin{tabular}{@ {}l@ {}}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	498	\begin{tabular}{@ {}lcl@ {}}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	499	@{thm (lhs) mkeps.simps(1)} & $\dn$ & @{thm (rhs) mkeps.simps(1)}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	500	@{thm (lhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	501	@{thm (lhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	502	@{thm (lhs) mkeps.simps(4)} & $\dn$ & @{thm (rhs) mkeps.simps(4)}\\
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	503	@{thm (lhs) mkeps.simps(5)} & $\dn$ & @{thm (rhs) mkeps.simps(5)}
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	504	\end{tabular}
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	505	%\end{tabular}
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	506	%\end{center}
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	507	\smallskip\\
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	508
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	509	%\begin{center}
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	510	%\begin{tabular}{@ {}cc@ {}}
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	511	\begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}@ {}}
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	512	@{thm (lhs) injval.simps(1)} & $\dn$ & @{thm (rhs) injval.simps(1)[of "c"]}\\
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	513	@{thm (lhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]} & $\dn$ &
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	514	@{thm (rhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	515	@{thm (lhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$ &
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	516	@{thm (rhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	517	%!
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	518	@{thm (lhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	519	& @{thm (rhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	520	@{thm (lhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	521	& @{thm (rhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	522	@{thm (lhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$ &
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	523	@{thm (rhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	524	%!
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	525	@{thm (lhs) injval.simps(7)[of "r" "c" "v" "vs"]} & $\dn$
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	526	& @{thm (rhs) injval.simps(7)[of "r" "c" "v" "vs"]}\\
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	527	@{thm (lhs) injval.simps(8)[of "r" "n" "c" "v" "vs"]} & $\dn$
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	528	& @{thm (rhs) injval.simps(8)[of "r" "n" "c" "v" "vs"]}
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	529	\end{tabular}
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	530	%!&
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	531	%!\begin{tabular}{@ {\hspace{-3mm}}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}@ {}}
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	532
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	533	%!\end{tabular}
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	534	%!\end{tabular}
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	535	\end{tabular}%!\smallskip
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	536	\end{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	537
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	538	\noindent
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	539	The function @{text mkeps} is run when the last derivative is nullable, that is
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	540	the string to be matched is in the language of the regular expression. It generates
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	541	a value for how the last derivative can match the empty string. In case
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	542	of @{term "NTIMES r n"} we use the function @{term replicate} in order to generate
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	543	a list of exactly @{term n} copies, which is the length of the list we expect in this
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	544	case. The injection function\footnote{While the character argument @{text c} is not
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	545	strictly necessary in the @{text inj}-function for the fragment of regular expressions we
644 9f984ff20020 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 643 diff changeset	546	use in this paper, it is necessary for extended regular expressions. For example for the range regular expression of the form @{text "[a-z]"}.
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	547	We therefore keep this argument from the original formulation of @{text inj} by Sulzmann and Lu.}
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	548	then calculates the corresponding value for each intermediate derivative until
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	549	a value for the original regular expression is generated.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	550	Graphically the algorithm by
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	551	Sulzmann and Lu can be illustrated by the following picture %in Figure~\ref{Sulz}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	552	where the path from the left to the right involving @{term derivatives}/@{const
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	553	nullable} is the first phase of the algorithm (calculating successive
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	554	\Brz's derivatives) and @{const mkeps}/@{text inj}, the path from right to
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	555	left, the second phase.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	556	%
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	557	\begin{center}
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	558	\begin{tikzpicture}[scale=0.85,node distance=8mm,
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	559	every node/.style={minimum size=6mm}]
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	560	\node (r1) {@{term "r\<^sub>1"}};
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	561	\node (r2) [right=of r1]{@{term "r\<^sub>2"}};
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	562	\draw[->,line width=1mm](r1)--(r2) node[above,midway] {@{term "der a DUMMY"}};
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	563	\node (r3) [right=of r2]{@{term "r\<^sub>3"}};
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	564	\draw[->,line width=1mm](r2)--(r3) node[above,midway] {@{term "der b DUMMY"}};
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	565	\node (r4) [right=of r3]{@{term "r\<^sub>4"}};
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	566	\draw[->,line width=1mm](r3)--(r4) node[above,midway] {@{term "der c DUMMY"}};
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	567	\draw (r4) node[anchor=west] {\;\raisebox{3mm}{@{term nullable}}};
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	568	\node (v4) [below=of r4]{@{term "v\<^sub>4"}};
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	569	\draw[->,line width=1mm](r4) -- (v4);
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	570	\node (v3) [left=of v4] {@{term "v\<^sub>3"}};
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	571	\draw[->,line width=1mm](v4)--(v3) node[below,midway] {\<open>inj r\<^sub>3 c\<close>};
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	572	\node (v2) [left=of v3]{@{term "v\<^sub>2"}};
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	573	\draw[->,line width=1mm](v3)--(v2) node[below,midway] {\<open>inj r\<^sub>2 b\<close>};
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	574	\node (v1) [left=of v2] {@{term "v\<^sub>1"}};
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	575	\draw[->,line width=1mm](v2)--(v1) node[below,midway] {\<open>inj r\<^sub>1 a\<close>};
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	576	\draw (r4) node[anchor=north west] {\;\raisebox{-5mm}{@{term "mkeps"}}};
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	577	\end{tikzpicture}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	578	\end{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	579	%
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	580	\noindent
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	581	The picture shows the steps required when a
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	582	regular expression, say @{text "r\<^sub>1"}, matches the string @{term
499 6a100d32314c updated the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 498 diff changeset	583	"[a,b,c]"}. The first lexing algorithm by Sulzmann and Lu can be defined as:%\\[-8mm]
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	584
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	585	% \begin{figure}[t]
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	586	%\begin{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	587	%\begin{tikzpicture}[scale=1,node distance=1cm,
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	588	% every node/.style={minimum size=6mm}]
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	589	%\node (r1) {@{term "r\<^sub>1"}};
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	590	%\node (r2) [right=of r1]{@{term "r\<^sub>2"}};
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	591	%\draw[->,line width=1mm](r1)--(r2) node[above,midway] {@{term "der a DUMMY"}};
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	592	%\node (r3) [right=of r2]{@{term "r\<^sub>3"}};
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	593	%\draw[->,line width=1mm](r2)--(r3) node[above,midway] {@{term "der b DUMMY"}};
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	594	%\node (r4) [right=of r3]{@{term "r\<^sub>4"}};
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	595	%\draw[->,line width=1mm](r3)--(r4) node[above,midway] {@{term "der c DUMMY"}};
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	596	%\draw (r4) node[anchor=west] {\;\raisebox{3mm}{@{term nullable}}};
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	597	%\node (v4) [below=of r4]{@{term "v\<^sub>4"}};
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	598	%\draw[->,line width=1mm](r4) -- (v4);
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	599	%\node (v3) [left=of v4] {@{term "v\<^sub>3"}};
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	600	%\draw[->,line width=1mm](v4)--(v3) node[below,midway] {\<open>inj r\<^sub>3 c\<close>};
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	601	%\node (v2) [left=of v3]{@{term "v\<^sub>2"}};
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	602	%\draw[->,line width=1mm](v3)--(v2) node[below,midway] {\<open>inj r\<^sub>2 b\<close>};
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	603	%\node (v1) [left=of v2] {@{term "v\<^sub>1"}};
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	604	%\draw[->,line width=1mm](v2)--(v1) node[below,midway] {\<open>inj r\<^sub>1 a\<close>};
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	605	%\draw (r4) node[anchor=north west] {\;\raisebox{-8mm}{@{term "mkeps"}}};
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	606	%\end{tikzpicture}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	607	%\end{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	608	%\mbox{}\\[-13mm]
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	609	%
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	610	%\caption{The two phases of the first algorithm by Sulzmann \& Lu \cite{Sulzmann2014},
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	611	%matching the string @{term "[a,b,c]"}. The first phase (the arrows from
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	612	%left to right) is \Brz's matcher building successive derivatives. If the
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	613	%last regular expression is @{term nullable}, then the functions of the
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	614	%second phase are called (the top-down and right-to-left arrows): first
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	615	%@{term mkeps} calculates a value @{term "v\<^sub>4"} witnessing
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	616	%how the empty string has been recognised by @{term "r\<^sub>4"}. After
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	617	%that the function @{term inj} ``injects back'' the characters of the string into
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	618	%the values. The value @{term "v\<^sub>1"} is the result of the algorithm representing
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	619	%the POSIX value for this string and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	620	%regular expression.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	621	%\label{Sulz}}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	622	%\end{figure}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	623
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	624
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	625
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	626	\begin{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	627	\begin{tabular}{lcl}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	628	@{thm (lhs) lexer.simps(1)} & $\dn$ & @{thm (rhs) lexer.simps(1)}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	629	@{thm (lhs) lexer.simps(2)} & $\dn$ & @{text "case"} @{term "lexer (der c r) s"} @{text of}
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	630	@{term "None"} @{text "\<Rightarrow>"} @{term None}
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	631	%%& & \hspace{27mm}
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	632	$\|\;$ @{term "Some v"} @{text "\<Rightarrow>"} @{term "Some (injval r c v)"}
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	633	\end{tabular}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	634	\end{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	635
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	636
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	637	We have shown in our earlier paper \cite{AusafDyckhoffUrban2016} that
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	638	this algorithm is correct, that is it generates POSIX values. The
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	639	central property we established relates the derivative operation to the
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	640	injection function.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	641
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	642	\begin{proposition}\label{Posix2}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	643	\textit{If} $(s,\; r\backslash c) \rightarrow v$ \textit{then} $(c :: s,\; r) \rightarrow$ \textit{inj} $r\; c\; v$.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	644	\end{proposition}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	645
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	646	\noindent
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	647	With this in place we were able to prove:
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	648
499 6a100d32314c updated the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 498 diff changeset	649	\begin{proposition}\mbox{}\label{lexercorrect}
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	650	\textrm{(1)}\; @{thm (lhs) lexer_correct_None} if and only if @{thm (rhs) lexer_correct_None}.\\
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	651	\mbox{\hspace{29.5mm}}\textrm{(2)}\; @{thm (lhs) lexer_correct_Some} if and only if @{thm (rhs) lexer_correct_Some}.
499 6a100d32314c updated the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 498 diff changeset	652	%
6a100d32314c updated the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 498 diff changeset	653	% \smallskip\\
6a100d32314c updated the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 498 diff changeset	654	%\begin{tabular}{ll}
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	655	%(1) & @{thm (lhs) lexer_correct_None} \;if and only if\; @{thm (rhs) lexer_correct_None}\\
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	656	%(2) & @{thm (lhs) lexer_correct_Some} \;if and only if\; @{thm (rhs) lexer_correct_Some}\\
499 6a100d32314c updated the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 498 diff changeset	657	%\end{tabular}
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	658	\end{proposition}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	659
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	660	\noindent
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	661	In fact we have shown that, in the success case, the generated POSIX value $v$ is
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	662	unique and in the failure case that there is no POSIX value $v$ that satisfies
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	663	$(s, r) \rightarrow v$. While the algorithm is correct, it is excruciatingly
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	664	slow in cases where the derivatives grow arbitrarily (recall the example from the
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	665	Introduction). However it can be used as a convenient reference point for the correctness
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	666	proof of the second algorithm by Sulzmann and Lu, which we shall describe next.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	667
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	668	*}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	669
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	670	section {* Bitcoded Regular Expressions and Derivatives *}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	671
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	672	text {*
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	673
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	674	In the second part of their paper \cite{Sulzmann2014},
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	675	Sulzmann and Lu describe another algorithm that also generates POSIX
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	676	values but dispenses with the second phase where characters are
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	677	injected ``back'' into values. For this they annotate bitcodes to
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	678	regular expressions, which we define in Isabelle/HOL as the datatype\medskip
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	679
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	680	\begin{center}
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	681	%\noindent
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	682	%\begin{minipage}{0.9\textwidth}
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	683	\,@{term breg} $\,::=\,$ @{term "AZERO"}
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	684	$\,\mid$ @{term "AONE bs"}
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	685	$\,\mid$ @{term "ACHAR bs c"}
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	686	$\,\mid$ @{term "AALTs bs rs"}
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	687	$\,\mid$ @{term "ASEQ bs r\<^sub>1 r\<^sub>2"}
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	688	$\,\mid$ @{term "ASTAR bs r"}
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	689	$\,\mid$ \mbox{@{term "ANTIMES bs r n"}}%\hfill\mbox{}
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	690	%\end{minipage}\medskip
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	691	\end{center}
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	692
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	693	\noindent where @{text bs} stands for bitsequences; @{text r},
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	694	@{text "r\<^sub>1"} and @{text "r\<^sub>2"} for bitcoded regular
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	695	expressions; and @{text rs} for lists of bitcoded regular
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	696	expressions. The binary alternative @{text "ALT bs r\<^sub>1 r\<^sub>2"}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	697	is just an abbreviation for \mbox{@{text "ALTs bs [r\<^sub>1, r\<^sub>2]"}}.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	698	For bitsequences we use lists made up of the
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	699	constants @{text Z} and @{text S}. The idea with bitcoded regular
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	700	expressions is to incrementally generate the value information (for
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	701	example @{text Left} and @{text Right}) as bitsequences. For this
578 e71a6e2aca2d updated paper and literature Christian Urban <christian.urban@kcl.ac.uk> parents: 571 diff changeset	702	Sulzmann and Lu follow Nielsen and Henglein \cite{NielsenHenglein2011}
e71a6e2aca2d updated paper and literature Christian Urban <christian.urban@kcl.ac.uk> parents: 571 diff changeset	703	and define a coding
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	704	function for how values can be coded into bitsequences.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	705
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	706	\begin{center}
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	707	\begin{tabular}{c@ {\hspace{5mm}}c}
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	708	\begin{tabular}{lcl}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	709	@{thm (lhs) code.simps(1)} & $\dn$ & @{thm (rhs) code.simps(1)}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	710	@{thm (lhs) code.simps(2)} & $\dn$ & @{thm (rhs) code.simps(2)}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	711	@{thm (lhs) code.simps(3)} & $\dn$ & @{thm (rhs) code.simps(3)}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	712	@{thm (lhs) code.simps(4)} & $\dn$ & @{thm (rhs) code.simps(4)}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	713	\end{tabular}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	714	&
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	715	\begin{tabular}{lcl}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	716	@{thm (lhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]} & $\dn$ & @{thm (rhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	717	@{thm (lhs) code.simps(6)} & $\dn$ & @{thm (rhs) code.simps(6)}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	718	@{thm (lhs) code.simps(7)} & $\dn$ & @{thm (rhs) code.simps(7)}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	719	\mbox{\phantom{XX}}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	720	\end{tabular}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	721	\end{tabular}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	722	\end{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	723
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	724	\noindent
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	725	As can be seen, this coding is ``lossy'' in the sense that it does not
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	726	record explicitly character values and also not sequence values (for
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	727	them it just appends two bitsequences). However, the
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	728	different alternatives for @{text Left}, respectively @{text Right}, are recorded as @{text Z} and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	729	@{text S} followed by some bitsequence. Similarly, we use @{text Z} to indicate
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	730	if there is still a value coming in the list of @{text Stars}, whereas @{text S}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	731	indicates the end of the list. The lossiness makes the process of
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	732	decoding a bit more involved, but the point is that if we have a
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	733	regular expression \emph{and} a bitsequence of a corresponding value,
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	734	then we can always decode the value accurately~(see Fig.~\ref{decode}).
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	735	The function \textit{decode} checks whether all of the bitsequence is
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	736	consumed and returns the corresponding value as @{term "Some v"}; otherwise
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	737	it fails with @{text "None"}. We can establish that for a value $v$
616 8907d4b6316d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 615 diff changeset	738	inhabiting a regular expression $r$, the decoding of its
578 e71a6e2aca2d updated paper and literature Christian Urban <christian.urban@kcl.ac.uk> parents: 571 diff changeset	739	bitsequence never fails (see also \cite{NielsenHenglein2011}).
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	740
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	741	%The decoding can be
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	742	%defined by using two functions called $\textit{decode}'$ and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	743	%\textit{decode}:
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	744
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	745	\begin{figure}[t]
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	746	\begin{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	747	\begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
599 a5f666410101 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 578 diff changeset	748	$\textit{decode}'\,bs\,(\ONE)$ & $\;\dn\;$ & $(\Empty, bs)$\\
a5f666410101 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 578 diff changeset	749	$\textit{decode}'\,bs\,(c)$ & $\;\dn\;$ & $(\Char\,c, bs)$\\
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	750	$\textit{decode}'\,(\Z\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	751	$\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}\;
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	752	(\Left\,v, bs_1)$\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	753	$\textit{decode}'\,(\S\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	754	$\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_2\;\textit{in}\;
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	755	(\Right\,v, bs_1)$\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	756	$\textit{decode}'\,bs\;(r_1\cdot r_2)$ & $\dn$ &
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	757	$\textit{let}\,(v_1, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}$\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	758	& & $\textit{let}\,(v_2, bs_2) = \textit{decode}'\,bs_1\,r_2$
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	759	\hspace{2mm}$\textit{in}\;(\Seq\,v_1\,v_2, bs_2)$\\
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	760	$\textit{decode}'\,(\S\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	761	$\textit{decode}'\,(\Z\!::\!bs)\,(r^*)$ & $\dn$ &
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	762	$\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r\;\textit{in}$\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	763	& & $\textit{let}\,(\Stars\,vs, bs_2) = \textit{decode}'\,bs_1\,r^*$
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	764	\hspace{2mm}$\textit{in}\;(\Stars\,v\!::\!vs, bs_2)$\\
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	765	$\textit{decode}'\,bs\,(r^{\{n\}})$ & $\dn$ & $\textit{decode}'\,bs\,r^*$\smallskip\smallskip\\
599 a5f666410101 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 578 diff changeset	766
a5f666410101 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 578 diff changeset	767	$\textit{decode}\,bs\,r$ & $\dn$ &
a5f666410101 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 578 diff changeset	768	$\textit{let}\,(v, bs') = \textit{decode}'\,bs\,r\;\textit{in}$\\
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	769	& & $\;\;\;\,\textit{if}\;bs' = []\;\textit{then}\;\textit{Some}\,v\;\textit{else}\;\textit{None}$\\[-5mm]
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	770	\end{tabular}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	771	\end{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	772	\caption{Two functions, called $\textit{decode}'$ and \textit{decode}, for decoding a value from a bitsequence with the help of a regular expression.\\[-5mm]}\label{decode}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	773	\end{figure}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	774
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	775	%\noindent
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	776	%The function \textit{decode} checks whether all of the bitsequence is
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	777	%consumed and returns the corresponding value as @{term "Some v"}; otherwise
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	778	%it fails with @{text "None"}. We can establish that for a value $v$
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	779	%inhabited by a regular expression $r$, the decoding of its
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	780	%bitsequence never fails.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	781
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	782	\begin{lemma}\label{codedecode}\it
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	783	If $\;\vdash v : r$ then
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	784	$\;\textit{decode}\,(\textit{code}\, v)\,r = \textit{Some}\, v$.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	785	\end{lemma}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	786
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	787
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	788
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	789	Sulzmann and Lu define the function \emph{internalise}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	790	in order to transform (standard) regular expressions into annotated
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	791	regular expressions. We write this operation as $r^\uparrow$.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	792	This internalisation uses the following
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	793	\emph{fuse} function.\medskip
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	794
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	795	%
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	796	%!\begin{center}
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	797	\noindent\begin{minipage}{\textwidth}
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	798	\begin{tabular}{@ {}c@ {\hspace{3mm}}c@ {}}
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	799	\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	800	$\textit{fuse}\,bs\,(\textit{ZERO})$ & $\dn$ & $\textit{ZERO}$\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	801	$\textit{fuse}\,bs\,(\textit{ONE}\,bs')$ & $\dn$ &
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	802	$\textit{ONE}\,(bs\,@\,bs')$\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	803	$\textit{fuse}\,bs\,(\textit{CHAR}\,bs'\,c)$ & $\dn$ &
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	804	$\textit{CHAR}\,(bs\,@\,bs')\,c$
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	805	\end{tabular}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	806	&
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	807	\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	808	$\textit{fuse}\,bs\,(\textit{ALTs}\,bs'\,rs)$ & $\dn$ &
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	809	$\textit{ALTs}\,(bs\,@\,bs')\,rs$\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	810	$\textit{fuse}\,bs\,(\textit{SEQ}\,bs'\,r_1\,r_2)$ & $\dn$ &
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	811	$\textit{SEQ}\,(bs\,@\,bs')\,r_1\,r_2$\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	812	$\textit{fuse}\,bs\,(\textit{STAR}\,bs'\,r)$ & $\dn$ &
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	813	$\textit{STAR}\,(bs\,@\,bs')\,r$\\
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	814	$\textit{fuse}\,bs\,(\textit{NT}\,bs'\,r\,n)$ & $\dn$ &
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	815	$\textit{NT}\,(bs\,@\,bs')\,r\,n$
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	816	\end{tabular}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	817	\end{tabular}
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	818	\end{minipage}\medskip
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	819	%!\end{center}
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	820
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	821	\noindent
616 8907d4b6316d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 615 diff changeset	822	This function ``fuses'' a bitsequence to the topmost constructor of an bitcoded regular expressions.
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	823	A regular expression can then be \emph{internalised} into a bitcoded
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	824	regular expression as follows:
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	825	%
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	826	%!\begin{center}
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	827	%!\begin{tabular}{@ {}c@ {\hspace{2mm}}c@ {\hspace{2mm}}c@ {}}
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	828	%!\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	829	%!$(\ZERO)^\uparrow$ & $\dn$ & $\textit{ZERO}$\\
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	830	%!$(\ONE)^\uparrow$ & $\dn$ & $\textit{ONE}\,[]$
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	831	%!\end{tabular}
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	832	%!&
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	833	%!\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	834	%!$(c)^\uparrow$ & $\dn$ & $\textit{CHAR}\,[]\,c$\\
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	835	%!$(r^*)^\uparrow$ & $\dn$ & $\textit{STAR}\;[]\,r^\uparrow$\\
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	836	%!$(r^{\{n\}})^\uparrow$ & $\dn$ &
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	837	%! $\textit{NT}\;[]\,r^\uparrow\,n$
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	838	%!\end{tabular}
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	839	%!&
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	840	%!\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	841	%!$(r_1 + r_2)^\uparrow$ & $\dn$ &
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	842	%! $\textit{ALT}\;[]\,(\textit{fuse}\,[\Z]\,r_1^\uparrow)\,
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	843	%! (\textit{fuse}\,[\S]\,r_2^\uparrow)$\\
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	844	%!$(r_1\cdot r_2)^\uparrow$ & $\dn$ &
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	845	%! $\textit{SEQ}\;[]\,r_1^\uparrow\,r_2^\uparrow$
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	846	%!\end{tabular}
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	847	%!\end{tabular}
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	848	%!\end{center}
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	849	%!
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	850	\begin{center}
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	851	\begin{tabular}{@ {}c@ {\hspace{7mm}}c@ {}}
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	852	\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	853	$(\ZERO)^\uparrow$ & $\dn$ & $\textit{ZERO}$\\
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	854	$(\ONE)^\uparrow$ & $\dn$ & $\textit{ONE}\,[]$\\
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	855	$(c)^\uparrow$ & $\dn$ & $\textit{CHAR}\,[]\,c$\\
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	856	$(r^*)^\uparrow$ & $\dn$ & $\textit{STAR}\;[]\,r^\uparrow$\\
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	857	\end{tabular}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	858	&
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	859	\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	860	$(r_1 + r_2)^\uparrow$ & $\dn$ &
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	861	$\textit{ALT}\;[]\,(\textit{fuse}\,[\Z]\,r_1^\uparrow)\,
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	862	(\textit{fuse}\,[\S]\,r_2^\uparrow)$\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	863	$(r_1\cdot r_2)^\uparrow$ & $\dn$ &
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	864	$\textit{SEQ}\;[]\,r_1^\uparrow\,r_2^\uparrow$\\
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	865	$(r^{\{n\}})^\uparrow$ & $\dn$ &
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	866	$\textit{NT}\;[]\,r^\uparrow\,n$
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	867	\end{tabular}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	868	\end{tabular}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	869	\end{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	870
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	871	\noindent
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	872	There is also an \emph{erase}-function, written $r^\downarrow$, which
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	873	transforms a bitcoded regular expression into a (standard) regular
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	874	expression by just erasing the annotated bitsequences. We omit the
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	875	straightforward definition. For defining the algorithm, we also need
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	876	the functions \textit{bnullable} and \textit{bmkeps}(\textit{s}), which are the
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	877	``lifted'' versions of \textit{nullable} and \textit{mkeps} acting on
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	878	bitcoded regular expressions.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	879	%
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	880	\begin{center}
643 9580bae0500d updated Christian Urban <christian.urban@kcl.ac.uk> parents: 642 diff changeset	881	\begin{tabular}{@ {\hspace{-1mm}}c@ {\hspace{10mm}}c@ {}}
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	882	\begin{tabular}{@ {}l@ {\hspace{0.5mm}}c@ {\hspace{1mm}}l}
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	883	$\textit{bnullable}\,(\textit{ZERO})$ & $\dn$ & $\textit{False}$\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	884	$\textit{bnullable}\,(\textit{ONE}\,bs)$ & $\dn$ & $\textit{True}$\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	885	$\textit{bnullable}\,(\textit{CHAR}\,bs\,c)$ & $\dn$ & $\textit{False}$\\
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	886	$\textit{bnullable}\,(\textit{ALTs}\,bs\,\rs)$ & $\dn$ &\\
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	887	\multicolumn{3}{l}{$\quad\exists\, r \in \rs. \,\textit{bnullable}\,r$}\\
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	888	$\textit{bnullable}\,(\textit{SEQ}\,bs\,r_1\,r_2)$ & $\dn$ &\\
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	889	\multicolumn{3}{l}{$\quad\textit{bnullable}\,r_1\wedge \textit{bnullable}\,r_2$}\\
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	890	$\textit{bnullable}\,(\textit{STAR}\,bs\,r)$ & $\dn$ &
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	891	$\textit{True}$\\
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	892	$\textit{bnullable}\,(\textit{NT}\,bs\,r\,n)$ & $\dn$ &\\
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	893	\multicolumn{3}{l}{$\quad\textit{if}\;n = 0\;\textit{then}\;\textit{True}\;
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	894	\textit{else}\;\textit{bnullable}\,r$}\\
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	895	\end{tabular}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	896	&
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	897	\begin{tabular}{@ {}l@ {\hspace{0.5mm}}c@ {\hspace{1mm}}l@ {}}
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	898	$\textit{bmkeps}\,(\textit{ONE}\,bs)$ & $\dn$ & $bs$\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	899	$\textit{bmkeps}\,(\textit{ALTs}\,bs\,\rs)$ & $\dn$ &
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	900	$bs\,@\,\textit{bmkepss}\,\rs$\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	901	$\textit{bmkeps}\,(\textit{SEQ}\,bs\,r_1\,r_2)$ & $\dn$ &\\
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	902	\multicolumn{3}{l}{$\quad{}bs \,@\,\textit{bmkeps}\,r_1\,@\, \textit{bmkeps}\,r_2$}\\
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	903	$\textit{bmkeps}\,(\textit{STAR}\,bs\,r)$ & $\dn$ &
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	904	$bs \,@\, [\S]$\\
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	905	$\textit{bmkeps}\,(\textit{NT}\,bs\,r\,n)$ & $\dn$ &\\
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	906	\multicolumn{3}{l@ {}}{$\quad\textit{if}\;n=0\;\textit{then}\;bs \,@\, [\S]$}\\
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	907	\multicolumn{3}{l@ {}}{$\quad\textit{else}\,bs \,@\, [\Z] \,@\, \textit{bmkeps}\,r\,@\,
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	908	\textit{bmkeps}\,(@{term "ANTIMES [] r (n - 1)"})$}\\
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	909	$\textit{bmkepss}\,(r\!::\!\rs)$ & $\dn$ &\\
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	910	\multicolumn{3}{l}{$\quad\textit{if}\;\textit{bnullable}\,r\,\textit{then}\;\textit{bmkeps}\,r\;
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	911	\textit{else}\;\textit{bmkepss}\,\rs$}
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	912	\end{tabular}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	913	\end{tabular}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	914	\end{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	915
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	916
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	917	\noindent
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	918	The key function in the bitcoded algorithm is the derivative of a
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	919	bitcoded regular expression. This derivative function calculates the
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	920	derivative but at the same time also the incremental part of the bitsequences
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	921	that contribute to constructing a POSIX value.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	922	%
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	923	\begin{center}
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	924	\begin{tabular}{@ {}l@ {\hspace{-1mm}}cl@ {}}
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	925	$(\textit{ZERO})\backslash c$ & $\;\dn\;$ & $\textit{ZERO}$\\
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	926	$(\textit{ONE}\;bs)\backslash c$ & $\;\dn\;$ & $\textit{ZERO}$\\
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	927	$(\textit{CHAR}\;bs\,d)\backslash c$ & $\dn$ &
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	928	$\textit{if}\;c=d\; \;\textit{then}\;
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	929	\textit{ONE}\;bs\;\textit{else}\;\textit{ZERO}$\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	930	$(\textit{ALTs}\;bs\,\rs)\backslash c$ & $\dn$ &
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	931	$\textit{ALTs}\,bs\,(\mathit{map}\,(\_\backslash c)\,\rs)$\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	932	$(\textit{SEQ}\;bs\,r_1\,r_2)\backslash c$ & $\dn$ &
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	933	$\textit{if}\;\textit{bnullable}\,r_1$\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	934	& &$\textit{then}\;\textit{ALT}\,bs\;(\textit{SEQ}\,[]\,(r_1\backslash c)\,r_2)$
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	935	$\;(\textit{fuse}\,(\textit{bmkeps}\,r_1)\,(r_2\backslash c))$\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	936	& &$\textit{else}\;\textit{SEQ}\,bs\,(r_1\backslash c)\,r_2$\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	937	$(\textit{STAR}\,bs\,r)\backslash c$ & $\dn$ &
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	938	$\textit{SEQ}\;(bs\,@\,[\Z])\,(r\backslash c)\,
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	939	(\textit{STAR}\,[]\,r)$\\
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	940	$(\textit{NT}\,bs\,r\,n)\backslash c$ & $\dn$ &
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	941	$\textit{if}\;n = 0\; \textit{then}\;\textit{ZERO}\;\textit{else}\;
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	942	\textit{SEQ}\;(bs\,@\,[\Z])\,(r\backslash c)\,
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	943	(\textit{NT}\,[]\,r\,(n - 1))$
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	944	\end{tabular}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	945	\end{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	946
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	947
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	948	\noindent
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	949	This function can also be extended to strings, written $r\backslash s$,
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	950	just like the standard derivative. We omit the details. Finally we
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	951	can define Sulzmann and Lu's bitcoded lexer, which we call \textit{blexer}:
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	952	%
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	953	\begin{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	954	\begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	955	$\textit{blexer}\;r\,s$ & $\dn$ &
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	956	$\textit{let}\;r_{der} = (r^\uparrow)\backslash s\;\textit{in}$\\
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	957	& & $\;\;\;\textit{if}\; \textit{bnullable}(r_{der})$
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	958	$\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,r_{der})\,r$
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	959	$\;\textit{else}\;\textit{None}$
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	960	\end{tabular}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	961	\end{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	962
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	963	\noindent
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	964	This bitcoded lexer first internalises the regular expression $r$ and then
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	965	builds the bitcoded derivative according to $s$. If the derivative is
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	966	(b)nullable the string is in the language of $r$ and it extracts the bitsequence using the
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	967	$\textit{bmkeps}$ function. Finally it decodes the bitsequence into a value. If
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	968	the derivative is \emph{not} nullable, then $\textit{None}$ is
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	969	returned. We can show that this way of calculating a value
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	970	generates the same result as \textit{lexer}.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	971
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	972	Before we can proceed we need to define a helper function, called
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	973	\textit{retrieve}, which Sulzmann and Lu introduced for the correctness proof.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	974	%
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	975	\begin{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	976	\begin{tabular}{lcl}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	977	@{thm (lhs) retrieve.simps(1)} & $\dn$ & @{thm (rhs) retrieve.simps(1)}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	978	@{thm (lhs) retrieve.simps(2)} & $\dn$ & @{thm (rhs) retrieve.simps(2)}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	979	@{thm (lhs) retrieve.simps(3)} & $\dn$ & @{thm (rhs) retrieve.simps(3)}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	980	@{thm (lhs) better_retrieve(1)} & $\dn$ & @{thm (rhs) better_retrieve(1)}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	981	@{thm (lhs) better_retrieve(2)} & $\dn$ & @{thm (rhs) better_retrieve(2)}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	982	@{thm (lhs) retrieve.simps(6)[of _ "r\<^sub>1" "r\<^sub>2" "v\<^sub>1" "v\<^sub>2"]}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	983	& $\dn$ & @{thm (rhs) retrieve.simps(6)[of _ "r\<^sub>1" "r\<^sub>2" "v\<^sub>1" "v\<^sub>2"]}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	984	@{thm (lhs) retrieve.simps(7)} & $\dn$ & @{thm (rhs) retrieve.simps(7)}\\
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	985	@{thm (lhs) retrieve.simps(8)} & $\dn$ & @{thm (rhs) retrieve.simps(8)}\\
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	986	@{thm (lhs) retrieve.simps(9)} & $\dn$ & @{thm (rhs) retrieve.simps(9)}\\
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	987	@{thm (lhs) better_retrieve2} & $\dn$ & @{thm (rhs) better_retrieve2}
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	988	\end{tabular}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	989	\end{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	990
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	991	\noindent
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	992	The idea behind this function is to retrieve a possibly partial
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	993	bitsequence from a bitcoded regular expression, where the retrieval is
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	994	guided by a value. For example if the value is $\Left$ then we
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	995	descend into the left-hand side of an alternative in order to
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	996	assemble the bitcode. Similarly for
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	997	$\Right$. The property we can show is that for a given $v$ and $r$
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	998	with $\vdash v : r$, the retrieved bitsequence from the internalised
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	999	regular expression is equal to the bitcoded version of $v$.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1000
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1001	\begin{lemma}\label{retrievecode}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1002	If $\vdash v : r$ then $\textit{code}\, v = \textit{retrieve}\,(r^\uparrow)\,v$.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1003	\end{lemma}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1004
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1005	\noindent
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1006	We also need some auxiliary facts about how the bitcoded operations
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1007	relate to the ``standard'' operations on regular expressions. For
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1008	example if we build a bitcoded derivative and erase the result, this
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1009	is the same as if we first erase the bitcoded regular expression and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1010	then perform the ``standard'' derivative operation.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1011
499 6a100d32314c updated the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 498 diff changeset	1012	\begin{lemma}\label{bnullable}%\mbox{}\smallskip\\
6a100d32314c updated the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 498 diff changeset	1013	\textit{(1)} $(r\backslash s)^\downarrow = (r^\downarrow)\backslash s$\\
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1014	\mbox{\hspace{21.5mm}}\textit{(2)} $\textit{bnullable}(r)$ iff $\textit{nullable}(r^\downarrow)$\\
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1015	\mbox{\hspace{21.5mm}}\textit{(3)} $\textit{bmkeps}(r) = \textit{retrieve}\,r\,(\textit{mkeps}\,(r^\downarrow))$ provided $\textit{nullable}(r^\downarrow)$
499 6a100d32314c updated the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 498 diff changeset	1016	%\begin{tabular}{ll}
6a100d32314c updated the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 498 diff changeset	1017	%\textit{(1)} & $(r\backslash s)^\downarrow = (r^\downarrow)\backslash s$\\
6a100d32314c updated the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 498 diff changeset	1018	%\textit{(2)} & $\textit{bnullable}(r)$ iff $\textit{nullable}(r^\downarrow)$\\
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1019	%\textit{(3)} & $\textit{bmkeps}(r) = \textit{retrieve}\,r\,(\textit{mkeps}\,(r^\downarrow))$ \;provided\; $\textit{nullable}(r^\downarrow)$.
499 6a100d32314c updated the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 498 diff changeset	1020	%\end{tabular}
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1021	\end{lemma}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1022
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1023	%\begin{proof}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1024	% All properties are by induction on annotated regular expressions.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1025	% %There are no interesting cases.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1026	%\end{proof}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1027
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1028	\noindent
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1029	The only difficulty left for the correctness proof is that the bitcoded algorithm
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1030	has only a ``forward phase'' where POSIX values are generated incrementally.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1031	We can achieve the same effect with @{text lexer} (which has two phases) by stacking up injection
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1032	functions during the forward phase. An auxiliary function, called $\textit{flex}$,
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1033	allows us to recast the rules of $\lexer$ in terms of a single
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1034	phase and stacked up injection functions.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1035	%
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1036	\begin{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1037	\begin{tabular}{@ {}l@ {}c@ {}l@ {\hspace{10mm}}l@ {}c@ {}l@ {}}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1038	$\textit{flex}\;r\,f\,[]$ & $\dn$ & $f$ &
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1039	$\textit{flex}\;r\,f\,(c\!::\!s)$ & $\dn$ &
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1040	$\textit{flex}\,(r\backslash c)\,(\lambda v.\,f\,(\inj\,r\,c\,v))\,s$
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1041	\end{tabular}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1042	\end{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1043
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1044	\noindent
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1045	The point of this function is that when
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1046	reaching the end of the string, we just need to apply the stacked up
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1047	injection functions to the value generated by @{text mkeps}.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1048	Using this function we can recast the success case in @{text lexer}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1049	as follows:
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1050
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1051	\begin{lemma}\label{flex}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1052	If @{text "lexer r s = Some v"} \;then\; @{text "v = "}$\,\textit{flex}\,r\,id\,s\,
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1053	(\textit{mkeps}\,(r\backslash s))$.
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1054	\end{lemma}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1055
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1056	\noindent
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1057	Note we did not redefine \textit{lexer}, we just established that the
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1058	value generated by \textit{lexer} can also be obtained by a different
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1059	method. While this different method is not efficient (we essentially
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1060	need to traverse the string $s$ twice, once for building the
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1061	derivative $r\backslash s$ and another time for stacking up injection
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1062	functions), it helps us with proving
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1063	that incrementally building up values in @{text blexer} generates the same result.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1064
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1065	This brings us to our main lemma in this section: if we calculate a
616 8907d4b6316d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 615 diff changeset	1066	derivative, say $r\backslash s$, and have a value, say $v$, inhabiting
8907d4b6316d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 615 diff changeset	1067	this derivative, then we can produce the result @{text lexer} generates
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1068	by applying this value to the stacked-up injection functions
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1069	that $\textit{flex}$ assembles. The lemma establishes that this is the same
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1070	value as if we build the annotated derivative $r^\uparrow\backslash s$
616 8907d4b6316d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 615 diff changeset	1071	and then retrieve the bitcoded version of @{text v}, followed by a
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1072	decoding step.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1073
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	1074	\begin{lemma}[Main Lemma]\label{mainlemma}\mbox{}\\\it
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1075	If $\vdash v : r\backslash s$ then
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1076	$\textit{Some}\,(\textit{flex}\,r\,\textit{id}\,s\,v) =
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1077	\textit{decode}(\textit{retrieve}\,(r^\uparrow \backslash s)\,v)\,r$
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1078	\end{lemma}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1079
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1080
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1081
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1082	\noindent
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1083	%With this lemma in place,
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1084	We can then prove the correctness of \textit{blexer}---it indeed
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1085	produces the same result as \textit{lexer}.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1086
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1087
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1088	\begin{theorem}\label{thmone}
599 a5f666410101 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 578 diff changeset	1089	$\textit{blexer}\,r\,s = \textit{lexer}\,r\,s$
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1090	\end{theorem}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1091
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1092
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1093
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1094	\noindent This establishes that the bitcoded algorithm \emph{without}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1095	simplification produces correct results. This was
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1096	only conjectured by Sulzmann and Lu in their paper
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1097	\cite{Sulzmann2014}. The next step is to add simplifications.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1098
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1099	*}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1100
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1101
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1102	section {* Simplification *}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1103
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1104	text {*
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1105
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1106	Derivatives as calculated by Brzozowski's method are usually more
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1107	complex regular expressions than the initial one; the result is
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1108	that derivative-based matching and lexing algorithms are
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1109	often abysmally slow if the ``growth problem'' is not addressed. As Sulzmann and Lu wrote, various
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1110	optimisations are possible, such as the simplifications
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1111	$\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r \Rightarrow r$,
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1112	$\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow r$. While these
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1113	simplifications can considerably speed up the two algorithms in many
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1114	cases, they do not solve fundamentally the growth problem with
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1115	derivatives. To see this let us return to the example from the
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1116	Introduction that shows the derivatives for \mbox{@{text "(a + aa)\<^sup>*"}}.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1117	If we delete in the 3rd step all $\ZERO{}s$ and $\ONE$s according to
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1118	the simplification rules shown above we obtain
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1119	%
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1120	%%
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1121	%
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1122	\begin{equation}\def\xll{\xrightarrow{\_\backslash{} [a, a, a]}}\label{derivex}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1123	(a + aa)^* \quad\xll\quad
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1124	\underbrace{\mbox{$(\ONE + a) \cdot (a + aa)^*$}}_{r} \;+\;
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1125	((a + aa)^* + \underbrace{\mbox{$(\ONE + a) \cdot (a + aa)^*$}}_{r})
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1126	\end{equation}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1127
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1128	\noindent This is a simpler derivative, but unfortunately we
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1129	cannot make any further simplifications. This is a problem because
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1130	the outermost alternatives contains two copies of the same
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1131	regular expression (underlined with $r$). These copies will
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1132	spawn new copies in later derivative steps and they in turn even more copies. This
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1133	destroys any hope of taming the size of the derivatives. But the
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1134	second copy of $r$ in~\eqref{derivex} will never contribute to a
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1135	value, because POSIX lexing will always prefer matching a string
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1136	with the first copy. So it could be safely removed without affecting the correctness of the algorithm.
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	1137	The issue with the simple-minded
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1138	simplification rules above is that the rule $r + r \Rightarrow r$
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1139	will never be applicable because as can be seen in this example the
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1140	regular expressions are not next to each other but separated by another regular expression.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1141
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1142	But here is where Sulzmann and Lu's representation of generalised
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1143	alternatives in the bitcoded algorithm shines: in @{term
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1144	"ALTs bs rs"} we can define a more aggressive simplification by
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1145	recursively simplifying all regular expressions in @{text rs} and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1146	then analyse the resulting list and remove any duplicates.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1147	Another advantage with the bitsequences in bitcoded regular
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1148	expressions is that they can be easily modified such that simplification does not
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1149	interfere with the value constructions. For example we can ``flatten'', or
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1150	de-nest, or spill out, @{text ALTs} as follows
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1151	%
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1152	\[
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1153	@{text "ALTs bs\<^sub>1 ((ALTs bs\<^sub>2 rs\<^sub>2) :: rs\<^sub>1)"}
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1154	\quad\xrightarrow{bsimp}\quad
616 8907d4b6316d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 615 diff changeset	1155	@{text "ALTs bs\<^sub>1 ((map (fuse bs\<^sub>2) rs\<^sub>2) @ rs\<^sub>1)"}
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1156	\]
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1157
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1158	\noindent
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1159	where we just need to fuse the bitsequence that has accumulated in @{text "bs\<^sub>2"}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1160	to the alternatives in @{text "rs\<^sub>2"}. As we shall show below this will
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1161	ensure that the correct value corresponding to the original (unsimplified)
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1162	regular expression can still be extracted. %In this way the value construction
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1163	%is not affected by simplification.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1164
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1165	However there is one problem with the definition for the more
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1166	aggressive simplification rules described by Sulzmann and Lu. Recasting
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1167	their definition with our syntax they define the step of removing
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1168	duplicates as
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1169	%
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1170	\[ @{text "bsimp (ALTs bs rs)"} \dn @{text "ALTs
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1171	bs (nub (map bsimp rs))"}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1172	\]
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1173
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1174	\noindent where they first recursively simplify the regular
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1175	expressions in @{text rs} (using @{text map}) and then use
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1176	Haskell's @{text nub}-function to remove potential
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1177	duplicates. While this makes sense when considering the example
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1178	shown in \eqref{derivex}, @{text nub} is the inappropriate
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1179	function in the case of bitcoded regular expressions. The reason
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1180	is that in general the elements in @{text rs} will have a
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1181	different annotated bitsequence and in this way @{text nub}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1182	will never find a duplicate to be removed. One correct way to
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1183	handle this situation is to first \emph{erase} the regular
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1184	expressions when comparing potential duplicates. This is inspired
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1185	by Scala's list functions of the form \mbox{@{text "distinctWith rs eq
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1186	acc"}} where @{text eq} is an user-defined equivalence relation that
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1187	compares two elements in @{text rs}. We define this function in
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1188	Isabelle/HOL as
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1189
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1190	\begin{center}
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	1191	\begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1192	@{thm (lhs) distinctWith.simps(1)} & $\dn$ & @{thm (rhs) distinctWith.simps(1)}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1193	@{thm (lhs) distinctWith.simps(2)} & $\dn$ &
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1194	@{text "if (\<exists> y \<in> acc. eq x y)"}
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1195	@{text "then distinctWith xs eq acc"}\\
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	1196	& & @{text "else x :: distinctWith xs eq ({x} \<union> acc)"}
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1197	\end{tabular}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1198	\end{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1199
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1200	\noindent where we scan the list from left to right (because we
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1201	have to remove later copies). In @{text distinctWith}, @{text eq} is intended to be an
616 8907d4b6316d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 615 diff changeset	1202	equivalence relation for bitcoded regular expressions and @{text acc} is an accumulator for bitcoded regular
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1203	expressions---essentially a set of regular expressions that we have already seen
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1204	while scanning the list. Therefore we delete an element, say @{text x},
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1205	from the list provided a @{text "y"} with @{text "y"} being equivalent to @{text x} is already in the accumulator;
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1206	otherwise we keep @{text x} and scan the rest of the list but
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1207	add @{text "x"} as another ``seen'' element to @{text acc}. We will use
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1208	@{term distinctWith} where @{text eq} is an equivalence that deletes bitsequences from bitcoded regular expressions
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1209	before comparing the components. One way to define this in Isabelle/HOL is by the following recursive function from
616 8907d4b6316d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 615 diff changeset	1210	bitcoded regular expressions to @{text bool}:
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1211	%
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1212	\begin{center}
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1213	\begin{tabular}{@ {}cc@ {}}
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1214	\begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {\hspace{1mm}}l@ {\hspace{1mm}}}
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1215	@{thm (lhs) eq1.simps(1)} & $\dn$ & @{thm (rhs) eq1.simps(1)}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1216	@{thm (lhs) eq1.simps(2)[of DUMMY DUMMY]} & $\dn$ & @{thm (rhs) eq1.simps(2)[of DUMMY DUMMY]}\\
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1217	@{thm (lhs) eq1.simps(7)[of DUMMY "r\<^sub>1" DUMMY "r\<^sub>2"]} & $\dn$ & @{thm (rhs) eq1.simps(7)[of DUMMY "r\<^sub>1" DUMMY "r\<^sub>2"]}\\
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1218	@{thm (lhs) eq1.simps(5)[of DUMMY DUMMY]} & $\dn$ & @{thm (rhs) eq1.simps(5)[of DUMMY DUMMY]}\\
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1219	\mbox{}
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1220	\end{tabular}
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1221	&
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1222	\begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {\hspace{1mm}}l@ {}}
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1223	@{thm (lhs) eq1.simps(3)[of DUMMY "c" DUMMY "d"]} & $\dn$ & @{thm (rhs) eq1.simps(3)[of DUMMY "c" DUMMY "d"]}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1224	@{thm (lhs) eq1.simps(4)[of DUMMY "r\<^sub>1\<^sub>1" "r\<^sub>1\<^sub>2" DUMMY "r\<^sub>2\<^sub>1" "r\<^sub>2\<^sub>2"]} & $\dn$ &
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1225	@{thm (rhs) eq1.simps(4)[of DUMMY "r\<^sub>1\<^sub>1" "r\<^sub>1\<^sub>2" DUMMY "r\<^sub>2\<^sub>1" "r\<^sub>2\<^sub>2"]}\\
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1226	@{thm (lhs) eq1.simps(8)[of DUMMY "r\<^sub>1" "n\<^sub>1" DUMMY "r\<^sub>2" "n\<^sub>2"]} & $\dn$ & @{thm (rhs) eq1.simps(8)[of DUMMY "r\<^sub>1" "n\<^sub>1" DUMMY "r\<^sub>2" "n\<^sub>2"]}\\
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1227	@{thm (lhs) eq1.simps(6)[of DUMMY "r\<^sub>1" "rs\<^sub>1" DUMMY "r\<^sub>2" "rs\<^sub>2"]} & $\dn$ & \\
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1228	\multicolumn{3}{r}{@{thm (rhs) eq1.simps(6)[of DUMMY "r\<^sub>1" "rs\<^sub>1" DUMMY "r\<^sub>2" "rs\<^sub>2"]}}
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1229	\end{tabular}
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1230	\end{tabular}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1231	\end{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1232
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1233	\noindent
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1234	where all other cases are set to @{text False}.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1235	This equivalence is clearly a computationally more expensive operation than @{text nub},
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1236	but is needed in order to make the removal of unnecessary copies
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1237	to work properly.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1238
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1239	Our simplification function depends on three more helper functions, one is called
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1240	@{text flts} and analyses lists of regular expressions coming from alternatives.
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1241	It is defined by four clauses as follows:
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1242
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1243	\begin{center}
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1244	\begin{tabular}{c}
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1245	@{thm (lhs) flts.simps(1)} $\dn$ @{thm (rhs) flts.simps(1)} \qquad
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1246	@{thm (lhs) flts.simps(2)} $\dn$ @{thm (rhs) flts.simps(2)} \qquad
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1247	@{text "flts ((ALTs bs' rs') :: rs"}
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1248	%@{ thm (lhs) flts.simps(3)[of "bs'" "rs'"]}
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1249	$\dn$ @{thm (rhs) flts.simps(3)[of "bs'" "rs'"]}\smallskip\\
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1250	@{text "flts (r :: rs)"} $\dn$ @{text "r :: flts rs"}\hspace{5mm}(otherwise)
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1251	\end{tabular}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1252	\end{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1253
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1254	\noindent
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1255	The second clause of @{text flts} removes all instances of @{text ZERO} in alternatives and
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1256	the third ``de-nests'' alternatives (but retains the
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1257	bitsequence @{text "bs'"} accumulated in the inner alternative). There are
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1258	some corner cases to be considered when the resulting list inside an alternative is
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1259	empty or a singleton list. We take care of those cases in the
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1260	@{text "bsimpALTs"} function; similarly we define a helper function that simplifies
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1261	sequences according to the usual rules about @{text ZERO}s and @{text ONE}s:
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1262
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1263	\begin{center}
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	1264	\begin{tabular}{@ {}c@ {\hspace{4mm}}c@ {}}
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	1265	\begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1266	@{text "bsimpALTs bs []"} & $\dn$ & @{text "ZERO"}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1267	@{text "bsimpALTs bs [r]"} & $\dn$ & @{text "fuse bs r"}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1268	@{text "bsimpALTs bs rs"} & $\dn$ & @{text "ALTs bs rs"}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1269	\mbox{}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1270	\end{tabular}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1271	&
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	1272	\begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1273	@{text "bsimpSEQ bs _ ZERO"} & $\dn$ & @{text "ZERO"}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1274	@{text "bsimpSEQ bs ZERO _"} & $\dn$ & @{text "ZERO"}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1275	@{text "bsimpSEQ bs\<^sub>1 (ONE bs\<^sub>2) r\<^sub>2"}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1276	& $\dn$ & @{text "fuse (bs\<^sub>1 @ bs\<^sub>2) r\<^sub>2"}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1277	@{text "bsimpSEQ bs r\<^sub>1 r\<^sub>2"} & $\dn$ & @{text "SEQ bs r\<^sub>1 r\<^sub>2"}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1278	\end{tabular}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1279	\end{tabular}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1280	\end{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1281
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1282	\noindent
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1283	With this in place we can define our simplification function as
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1284
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1285	\begin{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1286	\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1287	@{thm (lhs) bsimp.simps(1)[of "bs" "r\<^sub>1" "r\<^sub>2"]} & $\dn$ &
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1288	@{thm (rhs) bsimp.simps(1)[of "bs" "r\<^sub>1" "r\<^sub>2"]}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1289	@{thm (lhs) bsimp.simps(2)[of "bs" _]} & $\dn$ & @{text "bsimpALTs bs (distinctWith (flts (map bsimp rs)) \<approx> \<emptyset>)"}\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1290	@{text "bsimp r"} & $\dn$ & @{text r}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1291	\end{tabular}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1292	\end{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1293
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1294	\noindent
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1295	We believe our recursive function @{term bsimp} simplifies bitcoded regular
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1296	expressions as intended by Sulzmann and Lu with the small addition of removing ``useless'' @{text ONE}s
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1297	in sequence regular
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1298	expressions. There is no point in applying the
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1299	@{text bsimp} function repeatedly (like the simplification in their paper which needs to be
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1300	applied until a fixpoint is reached) because we can show that @{term bsimp} is idempotent,
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1301	that is
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1302
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1303	\begin{proposition}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1304	@{term "bsimp (bsimp r) = bsimp r"}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1305	\end{proposition}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1306
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1307	\noindent
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1308	This can be proved by induction on @{text r} but requires a detailed analysis
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1309	that the de-nesting of alternatives always results in a flat list of regular
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1310	expressions. We omit the details since it does not concern the correctness proof.
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1311	%It might be interesting to not that we do not simplify inside @{term STAR} and
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1312	%@{text NT}: the reason is that we want to keep the
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1313
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1314	Next we can include simplification after each derivative step leading to the
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1315	following notion of bitcoded derivatives:
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1316
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1317	\begin{center}
616 8907d4b6316d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 615 diff changeset	1318	\begin{tabular}{c@ {\hspace{10mm}}c}
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1319	\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1320	@{thm (lhs) bders_simp.simps(1)} & $\dn$ & @{thm (rhs) bders_simp.simps(1)}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1321	\end{tabular}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1322	&
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1323	\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1324	@{thm (lhs) bders_simp.simps(2)} & $\dn$ & @{thm (rhs) bders_simp.simps(2)}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1325	\end{tabular}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1326	\end{tabular}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1327	\end{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1328
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1329	\noindent
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1330	and use it in the improved lexing algorithm defined as
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1331
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1332	\begin{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1333	\begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1334	$\textit{blexer}^+\;r\,s$ & $\dn$ &
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	1335	$\textit{let}\;r_{der} = (r^\uparrow)\backslash_{bsimp}\, s\;\textit{in}$\\
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1336	& & $\;\;\;\textit{if}\; \textit{bnullable}(r_{der})$
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1337	$\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,r_{der})\,r$ $\;\textit{else}\;\textit{None}$
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1338	\end{tabular}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1339	\end{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1340
616 8907d4b6316d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 615 diff changeset	1341	\noindent
8907d4b6316d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 615 diff changeset	1342	Note that in $\textit{blexer}^+$ the derivative $r_{der}$ is calculated
8907d4b6316d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 615 diff changeset	1343	using the simplifying derivative $\_\,\backslash_{bsimp}\,\_$.
8907d4b6316d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 615 diff changeset	1344	The remaining task is to show that @{term blexer} and
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1345	@{term "blexer_simp"} generate the same answers.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1346
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1347	When we first
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1348	attempted this proof we encountered a problem with the idea
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1349	in Sulzmann and Lu's paper where the argument seems to be to appeal
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1350	again to the @{text retrieve}-function defined for the unsimplified version
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1351	of the algorithm. But
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1352	this does not work, because desirable properties such as
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1353	%
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1354	\[
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1355	@{text "retrieve r v = retrieve (bsimp r) v"}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1356	\]
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1357
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1358	\noindent do not hold under simplification---this property
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1359	essentially purports that we can retrieve the same value from a
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1360	simplified version of the regular expression. To start with @{text retrieve}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1361	depends on the fact that the value @{text v} corresponds to the
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1362	structure of the regular expression @{text r}---but the whole point of simplification
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1363	is to ``destroy'' this structure by making the regular expression simpler.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1364	To see this consider the regular expression @{term "r = ALT r' ZERO"} and a corresponding
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1365	value @{text "v = Left v'"}. If we annotate bitcodes to @{text "r"}, then
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1366	we can use @{text retrieve} with @{text r} and @{text v} in order to extract a corresponding
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1367	bitsequence. The reason that this works is that @{text r} is an alternative
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1368	regular expression and @{text v} a corresponding @{text "Left"}-value. However, if we simplify
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1369	@{text r}, then @{text v} does not correspond to the shape of the regular
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1370	expression anymore. So unless one can somehow
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1371	synchronise the change in the simplified regular expressions with
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1372	the original POSIX value, there is no hope of appealing to @{text retrieve} in the
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1373	correctness argument for @{term blexer_simp}.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1374
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1375	For our proof we found it more helpful to introduce the rewriting systems shown in
616 8907d4b6316d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 615 diff changeset	1376	Fig~\ref{SimpRewrites}. The idea is to generate
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1377	simplified regular expressions in small steps (unlike the @{text bsimp}-function which
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1378	does the same in a big step), and show that each of
616 8907d4b6316d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 615 diff changeset	1379	the small steps preserves the bitcodes that lead to the POSIX value.
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1380	The rewrite system is organised such that $\leadsto$ is for bitcoded regular
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1381	expressions and $\stackrel{s}{\leadsto}$ for lists of bitcoded regular
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1382	expressions. The former essentially implements the simplifications of
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1383	@{text "bsimpSEQ"} and @{text flts}; while the latter implements the
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1384	simplifications in @{text "bsimpALTs"}. We can show that any bitcoded
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1385	regular expression reduces in zero or more steps to the simplified
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1386	regular expression generated by @{text bsimp}:
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1387
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1388	\begin{lemma}\label{lemone}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1389	@{thm[mode=IfThen] rewrites_to_bsimp}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1390	\end{lemma}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1391
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1392
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1393
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1394	\noindent
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1395	We can also show that this rewrite system preserves @{term bnullable}, that
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1396	is simplification does not affect nullability:
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1397
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1398	\begin{lemma}\label{lembnull}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1399	@{thm[mode=IfThen] bnullable0(1)[of "r\<^sub>1" "r\<^sub>2"]}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1400	\end{lemma}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1401
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1402
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1403
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1404	\noindent
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1405	From this, we can show that @{text bmkeps} will produce the same bitsequence
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1406	as long as one of the bitcoded regular expressions in $\leadsto$ is nullable (this lemma
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1407	establishes the missing fact we were not able to establish using @{text retrieve}, as suggested
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1408	in the paper by Sulzmannn and Lu).
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1409
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1410
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1411	\begin{lemma}\label{lemthree}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1412	@{thm[mode=IfThen] rewrite_bmkeps_aux(1)[of "r\<^sub>1" "r\<^sub>2"]}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1413	\end{lemma}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1414
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1415
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1416
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1417	\noindent
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1418	Crucial is also the fact that derivative steps and simplification steps can be interleaved,
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1419	which is shown by the fact that $\leadsto$ is preserved under derivatives.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1420
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1421	\begin{lemma}\label{bderlem}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1422	@{thm[mode=IfThen] rewrite_preserves_bder(1)[of "r\<^sub>1" "r\<^sub>2"]}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1423	\end{lemma}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1424
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1425
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1426
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1427
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1428	\noindent
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1429	Using this fact together with Lemma~\ref{lemone} allows us to prove the central lemma
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1430	that the unsimplified
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1431	derivative (with a string @{term s}) reduces to the simplified derivative (with the same string).
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1432
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1433	\begin{lemma}\label{lemtwo}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1434	@{thm[mode=IfThen] central}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1435	\end{lemma}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1436
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1437	%\begin{proof}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1438	%By reverse induction on @{term s} generalising over @{text r}.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1439	%\end{proof}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1440
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1441	\noindent
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1442	With these lemmas in place we can finally establish that @{term "blexer_simp"} and @{term "blexer"}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1443	generate the same value, and using Theorem~\ref{thmone} from the previous section that this value
599 a5f666410101 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 578 diff changeset	1444	is indeed the POSIX value as generated by \textit{lexer}.
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1445
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1446	\begin{theorem}
599 a5f666410101 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 578 diff changeset	1447	@{thm[mode=IfThen] main_blexer_simp[symmetric]} \; (@{text "= lexer r s"}\; by Thm.~\ref{thmone})
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1448	\end{theorem}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1449
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1450	%\begin{proof}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1451	%By unfolding the definitions and using Lemmas~\ref{lemtwo} and \ref{lemthree}.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1452	%\end{proof}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1453
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1454	\noindent
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1455	This means that if the algorithm is called with a regular expression @{term r} and a string
498 ab626b60ee64 fixed tiny typo in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 496 diff changeset	1456	@{term s} with $@{term s} \in L(@{term r})$, it will return @{term "Some v"} for the unique
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1457	@{term v} we defined by the POSIX relation $(@{term s}, @{term r}) \rightarrow @{term v}$; otherwise
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1458	the algorithm returns @{term "None"} when $s \not\in L(r)$ and no such @{text v} exists.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1459	This completes the correctness proof for the second POSIX lexing algorithm by Sulzmann and Lu.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1460	The interesting point of this algorithm is that the sizes of derivatives do not grow arbitrarily big but
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1461	can be finitely bounded, which
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1462	we shall show next.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1463
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1464	\begin{figure}[t]
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1465	\begin{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1466	\begin{tabular}{@ {\hspace{-8mm}}c@ {}}
643 9580bae0500d updated Christian Urban <christian.urban@kcl.ac.uk> parents: 642 diff changeset	1467	\\[-8mm]
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1468	@{thm[mode=Axiom] bs1[of _ "r\<^sub>2"]}$S\ZERO{}_l$\quad
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1469	@{thm[mode=Axiom] bs2[of _ "r\<^sub>1"]}$S\ZERO{}_r$\quad
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1470	@{thm[mode=Axiom] bs3[of "bs\<^sub>1" "bs\<^sub>2"]}$S\ONE$\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1471	@{thm[mode=Rule] bs4[of "r\<^sub>1" "r\<^sub>2" _ "r\<^sub>3"]}SL\qquad
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1472	@{thm[mode=Rule] bs5[of "r\<^sub>3" "r\<^sub>4" _ "r\<^sub>1"]}SR\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1473	@{thm[mode=Axiom] bs6}$A0$\quad
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1474	@{thm[mode=Axiom] bs7}$A1$\quad
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1475	@{thm[mode=Rule] bs10[of "rs\<^sub>1" "rs\<^sub>2"]}$AL$\\
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1476	@{thm[mode=Rule] rrewrite_srewrite.ss2[of "rs\<^sub>1" "rs\<^sub>2"]}$LT$\quad
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1477	@{thm[mode=Rule] ss3[of "r\<^sub>1" "r\<^sub>2"]}$LH$\quad
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1478	@{thm[mode=Axiom] ss4}$L\ZERO$\quad
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1479	@{thm[mode=Axiom] ss5[of "bs" "rs\<^sub>1" "rs\<^sub>2"]}$LS$\medskip\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1480	@{thm[mode=Rule] ss6[of "r\<^sub>2" "r\<^sub>1" "rs\<^sub>1" "rs\<^sub>2" "rs\<^sub>3"]}$LD$\\[-5mm]
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1481	\end{tabular}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1482	\end{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1483	\caption{The rewrite rules that generate simplified regular expressions
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1484	in small steps: @{term "rrewrite r\<^sub>1 r\<^sub>2"} is for bitcoded regular
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1485	expressions and @{term "srewrite rs\<^sub>1 rs\<^sub>2"} for \emph{lists} of bitcoded
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1486	expressions. Interesting is the $LD$ rule that allows copies of regular
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1487	expressions to be removed provided a regular expression earlier in the list can
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1488	match the same strings.}\label{SimpRewrites}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1489	\end{figure}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1490	*}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1491
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	1492	section {* Finite Bound for the Size of Derivatives *}
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1493
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1494	text {*
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1495
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1496	In this section let us sketch our argument for why the size of the simplified
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1497	derivatives with the aggressive simplification function can be finitely bounded. Suppose
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1498	we have a size function for bitcoded regular expressions, written
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1499	$\llbracket r\rrbracket$, which counts the number of nodes if we regard $r$ as a tree
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1500	(we omit the precise definition; ditto for lists $\llbracket r\!s\rrbracket$). For this we show that for every $r$
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1501	there exists a bound $N$
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1502	such that
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1503
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1504	\begin{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1505	$\forall s. \; \llbracket@{term "bders_simp r s"}\rrbracket \leq N$
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1506	\end{center}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1507
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1508	\noindent
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1509	Note that the bound $N$ is a bound for \emph{all} strings, no matter how long they are.
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1510	We establish this bound by induction on $r$. The base cases for @{term AZERO},
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1511	@{term "AONE bs"} and @{term "ACHAR bs c"} are straightforward. The interesting case is
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1512	for sequences of the form @{term "ASEQ bs r\<^sub>1 r\<^sub>2"}. In this case our induction
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1513	hypotheses state $\exists N_1. \forall s. \; \llbracket{}@{term "bders_simp r\<^sub>1 s"}\rrbracket \leq N_1$ and
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	1514	$\exists N_2. \forall s. \; \llbracket@{term "bders_simp r\<^sub>2 s"}\rrbracket \leq N_2$. We can reason as follows\medskip
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1515	%
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	1516	%!\begin{center}
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	1517
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	1518	\noindent\begin{minipage}{\textwidth}
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1519	\begin{tabular}{lcll}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1520	& & $ \llbracket@{term "bders_simp (ASEQ bs r\<^sub>1 r\<^sub>2) s"}\rrbracket$\\
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1521	& $ = $ & $\llbracket bsimp\,(\textit{ALTs}\;bs\;((@{term "ASEQ [] (bders_simp r\<^sub>1 s) r\<^sub>2"}) ::
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	1522	[@{term "bders_simp r\<^sub>2 s'"} \;\|\; s' \in \textit{Suf}(@{text r}_1, s)]))\rrbracket $ & (1) \\
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1523	& $\leq$ &
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1524	$\llbracket\textit{distinctWith}\,((@{term "ASEQ [] (bders_simp r\<^sub>1 s) r\<^sub>2"}) ::
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	1525	[@{term "bders_simp r\<^sub>2 s'"} \;\|\; s' \in \textit{Suf}(@{text r}_1, s)])\,\approx\;@{term "{}"}\rrbracket + 1 $ & (2) \\
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1526	& $\leq$ & $\llbracket@{term "ASEQ [] (bders_simp r\<^sub>1 s) r\<^sub>2"}\rrbracket +
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	1527	\llbracket\textit{distinctWith}\,[@{term "bders_simp r\<^sub>2 s'"} \;\|\; s' \in \textit{Suf}(@{text r}_1, s)]\,\approx\;@{term "{}"}\rrbracket + 1 $ & (3) \\
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1528	& $\leq$ & $N_1 + \llbracket r_2\rrbracket + 2 +
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	1529	\llbracket\textit{distinctWith}\,[@{term "bders_simp r\<^sub>2 s'"} \;\|\; s' \in \textit{Suf}(@{text r}_1, s)]\,\approx\;@{term "{}"}\rrbracket$ & (4)\\
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1530	& $\leq$ & $N_1 + \llbracket r_2\rrbracket + 2 + l_{N_{2}} * N_{2}$ & (5)
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1531	\end{tabular}
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	1532	\end{minipage}\medskip
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	1533	%!\end{center}
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1534
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1535	% tell Chengsong about Indian paper of closed forms of derivatives
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1536
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1537	\noindent
616 8907d4b6316d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 615 diff changeset	1538	where in (1) the set $\textit{Suf}(@{text "r"}_1, s)$ are all the suffixes of $s$ where @{term "bders_simp r\<^sub>1 s'"} is nullable ($s'$ being a suffix of $s$).
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1539	In (3) we know that $\llbracket@{term "ASEQ [] (bders_simp r\<^sub>1 s) r\<^sub>2"}\rrbracket$ is
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1540	bounded by $N_1 + \llbracket{}r_2\rrbracket + 1$. In (5) we know the list comprehension contains only regular expressions of size smaller
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1541	than $N_2$. The list length after @{text distinctWith} is bounded by a number, which we call $l_{N_2}$. It stands
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1542	for the number of distinct regular expressions smaller than $N_2$ (there can only be finitely many of them).
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1543	We reason similarly for @{text STAR} and @{text NT}.\smallskip
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1544
499 6a100d32314c updated the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 498 diff changeset	1545
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1546	Clearly we give in this finiteness argument (Step (5)) a very loose bound that is
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1547	far from the actual bound we can expect. We can do better than this, but this does not improve
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1548	the finiteness property we are proving. If we are interested in a polynomial bound,
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1549	one would hope to obtain a similar tight bound as for partial
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1550	derivatives introduced by Antimirov \cite{Antimirov95}. After all the idea with
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1551	@{text distinctWith} is to maintain a ``set'' of alternatives (like the sets in
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1552	partial derivatives). Unfortunately to obtain the exact same bound would mean
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1553	we need to introduce simplifications, such as
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1554	$(r_1 + r_2) \cdot r_3 \longrightarrow (r_1 \cdot r_3) + (r_2 \cdot r_3)$,
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1555	which exist for partial derivatives. However, if we introduce them in our
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1556	setting we would lose the POSIX property of our calculated values. For example
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1557	given the regular expressions @{term "SEQ (ALT a ab) (ALT b ONE)"} and the string $[a, b]$, then our
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1558	algorithm generates the following correct POSIX value
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1559	%
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1560	\[
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1561	@{term "Seq (Right (Seq (Char a) (Char b))) (Right Empty)"}
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1562	\]
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1563
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1564	\noindent
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1565	Essentially it matches the string with the longer @{text "Right"}-alternative in the
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1566	first sequence (and then the `rest' with the empty regular expression @{const ONE} from the second sequence).
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1567	If we add the simplification above, then we obtain the following value
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1568	@{term "Seq (Left (Char a)) (Left (Char b))"}
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1569	where the @{text "Left"}-alternatives get priority. However, this violates
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1570	the POSIX rules and we have not been able to
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	1571	reconcile this problem. Therefore we leave better bounds for future work.%!\\[-6.5mm]
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	1572
644 9f984ff20020 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 643 diff changeset	1573	Note also that while Antimirov was able to give a bound on the \emph{size}
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	1574	of his partial derivatives~\cite{Antimirov95}, Brzozowski gave a bound
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	1575	on the \emph{number} of derivatives, provided they are quotient via
643 9580bae0500d updated Christian Urban <christian.urban@kcl.ac.uk> parents: 642 diff changeset	1576	ACI rules \cite{Brzozowski1964}. Brzozowski's result is crucial when one
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1577	uses his derivatives for obtaining a DFA (it essentially bounds
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	1578	the number of states). However, this result does \emph{not}
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	1579	transfer to our setting where we are interested in the \emph{size} of the
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1580	derivatives. For example it is \emph{not} true for our derivatives that the
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1581	set of derivatives $r \backslash s$ for a given $r$ and all strings
617 3ea6a38c33b5 small polish Christian Urban <christian.urban@kcl.ac.uk> parents: 616 diff changeset	1582	$s$ is finite (even when simplified). This is because for our annotated regular expressions
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	1583	the bitcode annotation is dependent on the number of iterations that
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1584	are needed for @{text STAR}-regular expressions. This is not a problem for us: Since we intend to do lexing
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	1585	by calculating (as fast as possible) derivatives, the bound on the size
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1586	of the derivatives is important, not their number. % of derivatives.
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	1587
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1588
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1589	*}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1590
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1591
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1592	section {* Conclusion *}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1593
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1594	text {*
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1595
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1596	We set out in this work to prove in Isabelle/HOL the correctness of
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1597	the second POSIX lexing algorithm by Sulzmann and Lu
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1598	\cite{Sulzmann2014}. This follows earlier work where we established
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1599	the correctness of the first algorithm
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1600	\cite{AusafDyckhoffUrban2016}. In the earlier work we needed to
643 9580bae0500d updated Christian Urban <christian.urban@kcl.ac.uk> parents: 642 diff changeset	1601	introduce our own specification for POSIX values,
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1602	because the informal definition given by Sulzmann and Lu did not
616 8907d4b6316d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 615 diff changeset	1603	stand up to formal proof. Also for the second algorithm we needed
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1604	to introduce our own definitions and proof ideas in order to
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1605	establish the correctness. Our interest in the second algorithm
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1606	lies in the fact that by using bitcoded regular expressions and an
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1607	aggressive simplification method there is a chance that the
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1608	derivatives can be kept universally small (we established in this
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1609	paper that for a given $r$ they can be kept finitely bounded for
643 9580bae0500d updated Christian Urban <christian.urban@kcl.ac.uk> parents: 642 diff changeset	1610	\emph{all} strings). Our formalisation is approximately 7500 lines of
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1611	Isabelle code. A little more than half of this code concerns the finiteness bound
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1612	obtained in Section 5. This slight ``bloat'' in the latter part is because
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1613	we had to introduce an intermediate datatype for annotated regular expressions and repeat many
643 9580bae0500d updated Christian Urban <christian.urban@kcl.ac.uk> parents: 642 diff changeset	1614	definitions for this intermediate datatype. But overall we think this made our
9580bae0500d updated Christian Urban <christian.urban@kcl.ac.uk> parents: 642 diff changeset	1615	formalisation work smoother. The code of our formalisation can be found at
9580bae0500d updated Christian Urban <christian.urban@kcl.ac.uk> parents: 642 diff changeset	1616	\textcolor{darkblue}{\url{https://github.com/urbanchr/posix}}.
499 6a100d32314c updated the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 498 diff changeset	1617	%This is important if one is after
6a100d32314c updated the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 498 diff changeset	1618	%an efficient POSIX lexing algorithm based on derivatives.
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1619
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1620	Having proved the correctness of the POSIX lexing algorithm, which
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1621	lessons have we learned? Well, we feel this is a very good example
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1622	where formal proofs give further insight into the matter at
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1623	hand. For example it is very hard to see a problem with @{text nub}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1624	vs @{text distinctWith} with only experimental data---one would still
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1625	see the correct result but find that simplification does not simplify in well-chosen, but not
499 6a100d32314c updated the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 498 diff changeset	1626	obscure, examples.
6a100d32314c updated the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 498 diff changeset	1627	%We found that from an implementation
6a100d32314c updated the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 498 diff changeset	1628	%point-of-view it is really important to have the formal proofs of
6a100d32314c updated the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 498 diff changeset	1629	%the corresponding properties at hand.
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1630
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1631	With the results reported here, we can of course only make a claim
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1632	about the correctness of the algorithm and the sizes of the
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1633	derivatives, not about the efficiency or runtime of our version of
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1634	Sulzmann and Lu's algorithm. But we found the size is an important
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1635	first indicator about efficiency: clearly if the derivatives can
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1636	grow to arbitrarily big sizes and the algorithm needs to traverse
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1637	the derivatives possibly several times, then the algorithm will be
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1638	slow---excruciatingly slow that is. Other works seem to make
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1639	stronger claims, but during our formalisation work we have
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1640	developed a healthy suspicion when for example experimental data is
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1641	used to back up efficiency claims. For instance Sulzmann and Lu
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1642	write about their equivalent of @{term blexer_simp} \textit{``...we
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1643	can incrementally compute bitcoded parse trees in linear time in
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1644	the size of the input''} \cite[Page 14]{Sulzmann2014}. Given the
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1645	growth of the derivatives in some cases even after aggressive
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1646	simplification, this is a hard to believe claim. A similar claim
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1647	about a theoretical runtime of @{text "O(n\<^sup>2)"} for one
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1648	specific list of regular expressions is made for the Verbatim
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1649	lexer, which calculates tokens according to POSIX
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1650	rules~\cite{verbatim}. For this, Verbatim uses Brzozowski's
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1651	derivatives like in our work. About their empirical data, the authors write:
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1652	\textit{``The results of our empirical tests [..] confirm that
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1653	Verbatim has @{text "O(n\<^sup>2)"} time complexity.''}
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1654	\cite[Section~VII]{verbatim}. While their correctness proof for
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1655	Verbatim is formalised in Coq, the claim about the runtime
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1656	complexity is only supported by some empirical evidence obtained by
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1657	using the code extraction facilities of Coq. Given our observation
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1658	with the ``growth problem'' of derivatives, this runtime bound
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1659	is unlikely to hold universally: indeed we tried out their extracted OCaml
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1660	code with the example \mbox{@{text "(a + aa)\<^sup>*"}} as a single
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1661	lexing rule, and it took for us around 5 minutes to tokenise a
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1662	string of 40 $a$'s and that increased to approximately 19 minutes
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1663	when the string is 50 $a$'s long. Taking into account that
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1664	derivatives are not simplified in the Verbatim lexer, such numbers
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1665	are not surprising. Clearly our result of having finite
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1666	derivatives might sound rather weak in this context but we think
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1667	such efficiency claims really require further scrutiny. The
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1668	contribution of this paper is to make sure derivatives do not grow
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1669	arbitrarily big (universally). In the example \mbox{@{text "(a +
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1670	aa)\<^sup>*"}}, \emph{all} derivatives have a size of 17 or
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1671	less. The result is that lexing a string of, say, 50\,000 a's with
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1672	the regular expression \mbox{@{text "(a + aa)\<^sup>*"}} takes
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1673	approximately 10 seconds with our Scala implementation of the
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1674	presented algorithm.
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1675
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1676	Finally, let us come back to the point about bounded regular
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1677	expressions. We have in this paper only shown that @{term "NTIMES r
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1678	n"} can be included, but all our results extend straightforwardly
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1679	also to the other bounded regular expressions. We find bounded
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1680	regular expressions fit naturally into the setting of Brzozowski
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1681	derivatives and the bitcoded regular expressions by Sulzmann and Lu.
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1682	In contrast bounded regular expressions are often the Achilles'
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1683	heel in regular expression matchers that use the traditional
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1684	automata-based approach to lexing, primarily because they need to expand the
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1685	counters of bounded regular expressions into $n$-connected copies
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1686	of an automaton. This is not needed in Sulzmann and Lu's algorithm.
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1687	To see the difference consider for example the regular expression @{term "SEQ (NTIMES a
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1688	1001) (STAR a)"}, which is not permitted in the Go language because
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1689	the counter is too big. In contrast we have no problem with
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1690	matching this regular expression with, say 50\,000 a's, because the
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1691	counters can be kept compact. In fact, the overall size of the
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1692	derivatives is never greater than 5 in this example. Even in the
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1693	example from Section 2, where Rust raises an error message, namely
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1694	\mbox{@{text "a\<^bsup>{1000}{100}{5}\<^esup>"}}, the maximum size for
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1695	our derivatives is a moderate 14.
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1696
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1697	Let us also return to the example @{text
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1698	"a\<^bsup>{0}{4294967295}\<^esup>"} which until recently Rust
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1699	deemed acceptable. But this was due to a bug. It turns out that it took Rust
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1700	more than 11 minutes to generate an automaton for this regular
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1701	expression and then to determine that a string of just one(!)
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1702	@{text a} does \emph{not} match this regular expression. Therefore
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1703	it is probably a wise choice that in newer versions of Rust's
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1704	regular expression library such regular expressions are declared as
643 9580bae0500d updated Christian Urban <christian.urban@kcl.ac.uk> parents: 642 diff changeset	1705	``too big'' and raise an error message. While this is clearly
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1706	a contrived example, the safety guaranties Rust wants to provide necessitate
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1707	this conservative approach.
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1708	However, with the derivatives and simplifications we have shown
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1709	in this paper, the example can be solved with ease:
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1710	it essentially only involves operations on
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1711	integers and our Scala implementation takes only a few seconds to
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1712	find out that this string, or even much larger strings, do not match.
6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1713
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1714	Let us also compare our work to the verified Verbatim++ lexer where
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1715	the authors of the Verbatim lexer introduced a number of
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1716	improvements and optimisations, for example memoisation
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1717	\cite{verbatimpp}. However, unlike Verbatim, which works with
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1718	derivatives like in our work, Verbatim++ compiles first a regular
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1719	expression into a DFA. While this makes lexing fast in many cases,
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1720	with examples of bounded regular expressions like
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1721	\mbox{@{text "a\<^bsup>{100}{5}\<^esup>"}}
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1722	one needs to represent them as
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1723	sequences of $a \cdot a \cdot \ldots \cdot a$ (500 a's in sequence). We have run
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1724	their extracted code with such a regular expression as a
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1725	single lexing rule and a string of 50\,000 a's---lexing in this case
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1726	takes approximately 5~minutes. We are not aware of any better
616 8907d4b6316d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 615 diff changeset	1727	translation using the traditional notion of DFAs so that we can improve on this. Therefore we
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1728	prefer to stick with calculating derivatives, but attempt to make
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1729	this calculation (in the future) as fast as possible. What we can guaranty
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1730	with the presented work is that the maximum size of the derivatives
616 8907d4b6316d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 615 diff changeset	1731	for \mbox{@{text "a\<^bsup>{100}{5}\<^esup>"}$\,\cdot\,$@{term "STAR a"}} is never bigger than 9. This means our Scala
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1732	implementation again only needs a few seconds for this example and matching 50\,000 a's, say.
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1733	%
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1734	%
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1735	%Possible ideas are
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1736	%zippers which have been used in the context of parsing of
5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1737	%context-free grammars \cite{zipperparser}.
643 9580bae0500d updated Christian Urban <christian.urban@kcl.ac.uk> parents: 642 diff changeset	1738	%\\[-5mm] %\smallskip
642 6c13f76c070b updated Christian Urban <christian.urban@kcl.ac.uk> parents: 617 diff changeset	1739
643 9580bae0500d updated Christian Urban <christian.urban@kcl.ac.uk> parents: 642 diff changeset	1740	%\noindent
9580bae0500d updated Christian Urban <christian.urban@kcl.ac.uk> parents: 642 diff changeset	1741	%Our Isabelle code including the results from Sec.~5 is available from \url{https://github.com/urbanchr/posix}.
9580bae0500d updated Christian Urban <christian.urban@kcl.ac.uk> parents: 642 diff changeset	1742	%%\\[-10mm]
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1743
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1744
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1745	%%\bibliographystyle{plain}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1746	\bibliography{root}
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	1747	*}
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1748
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	1749	(<)
8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	1750	text {*
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1751	\newpage
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1752	\appendix
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1753
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1754	\section{Some Proofs}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1755
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1756	While we have formalised \emph{all} results in Isabelle/HOL, below we shall give some
569 5af61c89f51e updated paper and corresponding theories Christian Urban <christian.urban@kcl.ac.uk> parents: 563 diff changeset	1757	rough details of our reasoning in ``English'' if this is helpful for reviewing.
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1758
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1759	\begin{proof}[Proof of Lemma~\ref{codedecode}]
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1760	This follows from the property that
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1761	$\textit{decode}'\,((\textit{code}\,v) \,@\, bs)\,r = (v, bs)$ holds
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1762	for any bit-sequence $bs$ and $\vdash v : r$. This property can be
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1763	easily proved by induction on $\vdash v : r$.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1764	\end{proof}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1765
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1766	\begin{proof}[Proof of Lemma~\ref{mainlemma}]
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1767	This can be proved by induction on $s$ and generalising over
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1768	$v$. The interesting point is that we need to prove this in the
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1769	reverse direction for $s$. This means instead of cases $[]$ and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1770	$c\!::\!s$, we have cases $[]$ and $s\,@\,[c]$ where we unravel the
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1771	string from the back.\footnote{Isabelle/HOL provides an induction principle
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1772	for this way of performing the induction.}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1773
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1774	The case for $[]$ is routine using Lemmas~\ref{codedecode}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1775	and~\ref{retrievecode}. In the case $s\,@\,[c]$, we can infer from
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1776	the assumption that $\vdash v : (r\backslash s)\backslash c$
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1777	holds. Hence by Prop.~\ref{Posix2} we know that
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1778	(*) $\vdash \inj\,(r\backslash s)\,c\,v : r\backslash s$ holds too.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1779	By definition of $\textit{flex}$ we can unfold the left-hand side
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1780	to be
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1781	\[
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1782	\textit{Some}\,(\textit{flex}\;r\,\textit{id}\,(s\,@\,[c])\,v) =
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1783	\textit{Some}\,(\textit{flex}\;r\,\textit{id}\,s\,(\inj\,(r\backslash s)\,c\,v))
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1784	\]
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1785
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1786	\noindent
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1787	By IH and (*) we can rewrite the right-hand side to
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1788	$\textit{decode}\,(\textit{retrieve}\,(r^\uparrow\backslash s)\;
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1789	(\inj\,(r\backslash s)\,c\,\,v))\,r$ which is equal to
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1790	$\textit{decode}\,(\textit{retrieve}\, (r^\uparrow\backslash
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1791	(s\,@\,[c]))\,v)\,r$ as required. The last rewrite step is possible
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1792	because we generalised over $v$ in our induction.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1793	\end{proof}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1794
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1795	\begin{proof}[Proof of Theorem~\ref{thmone}]
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1796	We can first expand both sides using Lem.~\ref{flex} and the
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1797	definition of \textit{blexer}. This gives us two
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1798	\textit{if}-statements, which we need to show to be equal. By
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1799	Lemma~\ref{bnullable}\textit{(2)} we know the \textit{if}-tests coincide:
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1800	$\textit{bnullable}(r^\uparrow\backslash s) \;\textit{iff}\;
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1801	\nullable(r\backslash s)$.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1802	For the \textit{if}-branch suppose $r_d \dn r^\uparrow\backslash s$ and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1803	$d \dn r\backslash s$. We have (*) @{text "nullable d"}. We can then show
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1804	by Lemma~\ref{bnullable}\textit{(3)} that
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1805	%
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1806	\[
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1807	\textit{decode}(\textit{bmkeps}\:r_d)\,r =
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1808	\textit{decode}(\textit{retrieve}\,r_d\,(\textit{mkeps}\,d))\,r
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1809	\]
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1810
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1811	\noindent
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1812	where the right-hand side is equal to
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1813	$\textit{Some}\,(\textit{flex}\,r\,\textit{id}\,s\,(\textit{mkeps}\,
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1814	d))$ by Lemma~\ref{mainlemma} (we know
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1815	$\vdash \textit{mkeps}\,d : d$ by (*)). This shows the
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1816	\textit{if}-branches return the same value. In the
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1817	\textit{else}-branches both \textit{lexer} and \textit{blexer} return
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1818	\textit{None}. Therefore we can conclude the proof.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1819	\end{proof}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1820
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1821	\begin{proof}[Proof of Lemma~\ref{lemone}]
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1822	By induction on @{text r}. For this we can use the properties
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1823	@{thm fltsfrewrites} and @{text "rs"}$\;\stackrel{s}{\leadsto}^*$@{text "distinctWith rs \<approx>"} @{term "{}"}. The latter uses
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1824	repeated applications of the $LD$ rule which allows the removal
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1825	of duplicates that can recognise the same strings.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1826	\end{proof}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1827
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1828	\begin{proof}[Proof of Lemma~\ref{lembnull}]
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1829	Straightforward mutual induction on the definition of $\leadsto$ and $\stackrel{s}{\leadsto}$.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1830	The only interesting case is the rule $LD$ where the property holds since by the side-conditions of that rule the empty string will
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1831	be in both @{text "L (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ [r\<^sub>2] @ rs\<^sub>c)"} and
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1832	@{text "L (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ rs\<^sub>c)"}.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1833	\end{proof}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1834
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1835	\begin{proof}[Proof of Lemma \ref{lemthree}]
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1836	By straightforward mutual induction on the definition of $\leadsto$ and $\stackrel{s}{\leadsto}$.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1837	Again the only interesting case is the rule $LD$ where we need to ensure that
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1838	@{text "bmkeps (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ [r\<^sub>2] @ rs\<^sub>c) =
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1839	bmkeps (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ rs\<^sub>c)"} holds.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1840	This is indeed the case because according to the POSIX rules the
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1841	generated bitsequence is determined by the first alternative that can match the
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1842	string (in this case being nullable).
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1843	\end{proof}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1844
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1845	\begin{proof}[Proof of Lemma~\ref{bderlem}]
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1846	By straightforward mutual induction on the definition of $\leadsto$ and $\stackrel{s}{\leadsto}$.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1847	The case for $LD$ holds because @{term "L (erase (bder c r\<^sub>2)) \<subseteq> L (erase (bder c r\<^sub>1))"}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1848	if and only if @{term "L (erase (r\<^sub>2)) \<subseteq> L (erase (r\<^sub>1))"}.
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1849	\end{proof}
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1850	*}
615 8881a09a06fd updated paper for FoSSaCS Christian Urban <christian.urban@kcl.ac.uk> parents: 599 diff changeset	1851	(>)
496 f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1852	(<)
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1853	end
f493a20feeb3 updated to include the paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1854	(>)

author	Chengsong
	Wed, 23 Aug 2023 03:02:31 +0100 (18 months ago)
changeset 668	3831621d7b14
parent 644	9f984ff20020
permissions	-rw-r--r--