lexing: thys2/Paper/Paper.thy@222333d2bdc2 (annotated)

396 cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1	(*<*)
cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	2	theory Paper
cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	3	imports
cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	4	"../Lexer"
cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	5	"../Simplifying"
cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	6	"../Positions"
cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	7	"../SizeBound4"
cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	8	"HOL-Library.LaTeXsugar"
cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	9	begin
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	10
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	11	declare [[show_question_marks = false]]
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	12
398 dac6d27c99c6 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 397 diff changeset	13	notation (latex output)
dac6d27c99c6 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 397 diff changeset	14	If ("(\<^latex>\<open>\\textrm{\<close>if\<^latex>\<open>}\<close> (_)/ \<^latex>\<open>\\textrm{\<close>then\<^latex>\<open>}\<close> (_)/ \<^latex>\<open>\\textrm{\<close>else\<^latex>\<open>}\<close> (_))" 10) and
dac6d27c99c6 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 397 diff changeset	15	Cons ("_\<^latex>\<open>\\mbox{$\\,$}\<close>::\<^latex>\<open>\\mbox{$\\,$}\<close>_" [75,73] 73)
dac6d27c99c6 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 397 diff changeset	16
dac6d27c99c6 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 397 diff changeset	17
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	18	abbreviation
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	19	"der_syn r c \<equiv> der c r"
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	20	abbreviation
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	21	"ders_syn r s \<equiv> ders s r"
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	22	abbreviation
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	23	"bder_syn r c \<equiv> bder c r"
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	24
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	25	notation (latex output)
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	26	der_syn ("_\\_" [79, 1000] 76) and
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	27	ders_syn ("_\\_" [79, 1000] 76) and
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	28	bder_syn ("_\\_" [79, 1000] 76) and
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	29	bders ("_\\_" [79, 1000] 76) and
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	30	bders_simp ("_\\\<^sub>b\<^sub>s\<^sub>i\<^sub>m\<^sub>p _" [79, 1000] 76) and
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	31
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	32	ZERO ("\<^bold>0" 81) and
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	33	ONE ("\<^bold>1" 81) and
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	34	CH ("_" [1000] 80) and
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	35	ALT ("_ + _" [77,77] 78) and
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	36	SEQ ("_ \<cdot> _" [77,77] 78) and
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	37	STAR ("_\<^sup>*" [79] 78) and
402 1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	38
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	39	val.Void ("Empty" 78) and
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	40	val.Char ("Char _" [1000] 78) and
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	41	val.Left ("Left _" [79] 78) and
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	42	val.Right ("Right _" [1000] 78) and
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	43	val.Seq ("Seq _ _" [79,79] 78) and
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	44	val.Stars ("Stars _" [79] 78) and
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	45
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	46	Prf ("\<turnstile> _ : _" [75,75] 75) and
402 1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	47	Posix ("'(_, _') \<rightarrow> _" [63,75,75] 75) and
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	48
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	49	flat ("\|_\|" [75] 74) and
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	50	flats ("\|_\|" [72] 74) and
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	51	injval ("inj _ _ _" [79,77,79] 76) and
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	52	mkeps ("mkeps _" [79] 76) and
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	53	length ("len _" [73] 73) and
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	54	set ("_" [73] 73) and
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	55
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	56	AZERO ("ZERO" 81) and
405 3cfea5bb5e23 updated some of the text and cardinality proof Christian Urban <christian.urban@kcl.ac.uk> parents: 402 diff changeset	57	AONE ("ONE _" [79] 78) and
402 1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	58	ACHAR ("CHAR _ _" [79, 79] 80) and
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	59	AALTs ("ALTs _ _" [77,77] 78) and
405 3cfea5bb5e23 updated some of the text and cardinality proof Christian Urban <christian.urban@kcl.ac.uk> parents: 402 diff changeset	60	ASEQ ("SEQ _ _ _" [79, 79,79] 78) and
402 1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	61	ASTAR ("STAR _ _" [79, 79] 78) and
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	62
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	63	code ("code _" [79] 74) and
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	64	intern ("_\<^latex>\<open>\\mbox{$^\\uparrow$}\<close>" [900] 80) and
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	65	erase ("_\<^latex>\<open>\\mbox{$^\\downarrow$}\<close>" [1000] 74) and
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	66	bnullable ("bnullable _" [1000] 80) and
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	67	bsimp_AALTs ("bsimpALT _ _" [10,1000] 80) and
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	68	bsimp_ASEQ ("bsimpSEQ _ _ _" [10,1000,1000] 80) and
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	69	bmkeps ("bmkeps _" [1000] 80) and
405 3cfea5bb5e23 updated some of the text and cardinality proof Christian Urban <christian.urban@kcl.ac.uk> parents: 402 diff changeset	70
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	71	srewrite ("_\<^latex>\<open>\\mbox{$\\,\\stackrel{s}{\\leadsto}$}\<close> _" [71, 71] 80) and
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	72	rrewrites ("_ \<^latex>\<open>\\mbox{$\\,\\leadsto^*$}\<close> _" [71, 71] 80) and
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	73	srewrites ("_ \<^latex>\<open>\\mbox{$\\,\\stackrel{s}{\\leadsto}^*$}\<close> _" [71, 71] 80) and
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	74	blexer_simp ("blexer\<^sup>+" 1000)
402 1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	75
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	76
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	77	lemma better_retrieve:
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	78	shows "rs \<noteq> Nil ==> retrieve (AALTs bs (r#rs)) (Left v) = bs @ retrieve r v"
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	79	and "rs \<noteq> Nil ==> retrieve (AALTs bs (r#rs)) (Right v) = bs @ retrieve (AALTs [] rs) v"
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	80	apply (metis list.exhaust retrieve.simps(4))
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	81	by (metis list.exhaust retrieve.simps(5))
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	82
396 cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	83	(*>*)
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	84
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	85	section {* Introduction *}
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	86
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	87	text {*
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	88
400 46e5566ad4ba updated Christian Urban <christian.urban@kcl.ac.uk> parents: 398 diff changeset	89	In the last fifteen or so years, Brzozowski's derivatives of regular
46e5566ad4ba updated Christian Urban <christian.urban@kcl.ac.uk> parents: 398 diff changeset	90	expressions have sparked quite a bit of interest in the functional
46e5566ad4ba updated Christian Urban <christian.urban@kcl.ac.uk> parents: 398 diff changeset	91	programming and theorem prover communities. The beauty of
46e5566ad4ba updated Christian Urban <christian.urban@kcl.ac.uk> parents: 398 diff changeset	92	Brzozowski's derivatives \cite{Brzozowski1964} is that they are neatly
46e5566ad4ba updated Christian Urban <christian.urban@kcl.ac.uk> parents: 398 diff changeset	93	expressible in any functional language, and easily definable and
46e5566ad4ba updated Christian Urban <christian.urban@kcl.ac.uk> parents: 398 diff changeset	94	reasoned about in theorem provers---the definitions just consist of
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	95	inductive datatypes and simple recursive functions. Derivatives of a
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	96	regular expression, written @{term "der c r"}, give a simple solution
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	97	to the problem of matching a string @{term s} with a regular
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	98	expression @{term r}: if the derivative of @{term r} w.r.t.\ (in
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	99	succession) all the characters of the string matches the empty string,
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	100	then @{term r} matches @{term s} (and {\em vice versa}). We are aware
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	101	of a mechanised correctness proof of Brzozowski's derivative-based matcher in HOL4 by
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	102	Owens and Slind~\cite{Owens2008}. Another one in Isabelle/HOL is part
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	103	of the work by Krauss and Nipkow \cite{Krauss2011}. And another one
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	104	in Coq is given by Coquand and Siles \cite{Coquand2012}.
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	105	Also Ribeiro and Du Bois give one in Agda \cite{RibeiroAgda2017}.
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	106
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	107
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	108	However, there are two difficulties with derivative-based matchers:
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	109	First, Brzozowski's original matcher only generates a yes/no answer
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	110	for whether a regular expression matches a string or not. This is too
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	111	little information in the context of lexing where separate tokens must
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	112	be identified and also classified (for example as keywords
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	113	or identifiers). Sulzmann and Lu~\cite{Sulzmann2014} overcome this
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	114	difficulty by cleverly extending Brzozowski's matching
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	115	algorithm. Their extended version generates additional information on
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	116	\emph{how} a regular expression matches a string following the POSIX
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	117	rules for regular expression matching. They achieve this by adding a
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	118	second ``phase'' to Brzozowski's algorithm involving an injection
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	119	function. In our own earlier work we provided the formal
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	120	specification of what POSIX matching means and proved in Isabelle/HOL
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	121	the correctness
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	122	of Sulzmann and Lu's extended algorithm accordingly
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	123	\cite{AusafDyckhoffUrban2016}.
400 46e5566ad4ba updated Christian Urban <christian.urban@kcl.ac.uk> parents: 398 diff changeset	124
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	125	The second difficulty is that Brzozowski's derivatives can
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	126	grow to arbitrarily big sizes. For example, if we start with the
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	127	regular expression \mbox{@{text "(a + aa)\<^sup>*"}} and take
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	128	successive derivatives according to the character $a$, we end up with
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	129	a sequence of ever-growing derivatives like
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	130
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	131	\def\ll{\stackrel{\_\backslash{} a}{\longrightarrow}}
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	132	\begin{center}
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	133	\begin{tabular}{rll}
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	134	$(a + aa)^*$ & $\ll$ & $(\ONE + \ONE{}a) \cdot (a + aa)^*$\\
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	135	& $\ll$ & $(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* \;+\; (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	136	& $\ll$ & $(\ZERO + \ZERO{}a + \ZERO) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^* \;+\; $\\
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	137	& & $\qquad(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	138	& $\ll$ & \ldots \hspace{15mm}(regular expressions of sizes 98, 169, 283, 468, 767, \ldots)
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	139	\end{tabular}
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	140	\end{center}
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	141
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	142	\noindent where after around 35 steps we run out of memory on a
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	143	typical computer (we shall define shortly the precise details of our
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	144	regular expressions and the derivative operation). Clearly, the
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	145	notation involving $\ZERO$s and $\ONE$s already suggests
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	146	simplification rules that can be applied to regular
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	147	expressions, for example $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	148	\Rightarrow r$, $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	149	r$. While such simple-minded simplifications have been proved in our
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	150	earlier work to preserve the correctness of Sulzmann and Lu's
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	151	algorithm \cite{AusafDyckhoffUrban2016}, they unfortunately do
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	152	\emph{not} help with limiting the growth of the derivatives shown
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	153	above: the growth is slowed, but the derivatives can still grow rather
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	154	quickly beyond any finite bound.
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	155
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	156
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	157	Sulzmann and Lu overcome this ``growth problem'' in a second algorithm
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	158	\cite{Sulzmann2014} where they introduce bitcoded
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	159	regular expressions. In this version, POSIX values are
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	160	represented as bitsequences and such sequences are incrementally generated
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	161	when derivatives are calculated. The compact representation
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	162	of bitsequences and regular expressions allows them to define a more
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	163	``aggressive'' simplification method that keeps the size of the
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	164	derivatives finite no matter what the length of the string is.
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	165	They make some informal claims about the correctness and linear behaviour
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	166	of this version, but do not provide any supporting proof arguments, not
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	167	even ``pencil-and-paper'' arguments. They write about their bitcoded
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	168	\emph{incremental parsing method} (that is the algorithm to be formalised
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	169	in this paper):
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	170
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	171	\begin{quote}\it
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	172	``Correctness Claim: We further claim that the incremental parsing
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	173	method [..] in combination with the simplification steps [..]
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	174	yields POSIX parse trees. We have tested this claim
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	175	extensively [..] but yet
426 5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	176	have to work out all proof details.'' \cite[Page 14]{Sulzmann2014}
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	177	\end{quote}
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	178
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	179	\noindent{}\textbf{Contributions:} We have implemented in Isabelle/HOL
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	180	the derivative-based lexing algorithm of Sulzmann and Lu
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	181	\cite{Sulzmann2014} where regular expressions are annotated with
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	182	bitsequences. We define the crucial simplification function as a
426 5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	183	recursive function, without the need of a fix-point operation. One objective of
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	184	the simplification function is to remove duplicates of regular
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	185	expressions. For this Sulzmann and Lu use in their paper the standard
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	186	@{text nub} function from Haskell's list library, but this function
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	187	does not achieve the intended objective with bitcoded regular expressions. The
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	188	reason is that in the bitcoded setting, each copy generally has a
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	189	different bitcode annotation---so @{text nub} would never ``fire''.
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	190	Inspired by Scala's library for lists, we shall instead use a @{text
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	191	distinctBy} function that finds duplicates under an erasing function
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	192	which deletes bitcodes.
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	193	We shall also introduce our own argument and definitions for
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	194	establishing the correctness of the bitcoded algorithm when
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	195	simplifications are included.\medskip
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	196
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	197	\noindent In this paper, we shall first briefly introduce the basic notions
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	198	of regular expressions and describe the basic definitions
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	199	of POSIX lexing from our earlier work \cite{AusafDyckhoffUrban2016}. This serves
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	200	as a reference point for what correctness means in our Isabelle/HOL proofs. We shall then prove
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	201	the correctness for the bitcoded algorithm without simplification, and
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	202	after that extend the proof to include simplification.
400 46e5566ad4ba updated Christian Urban <christian.urban@kcl.ac.uk> parents: 398 diff changeset	203
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	204	*}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	205
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	206	section {* Background *}
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	207
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	208	text {*
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	209	In our Isabelle/HOL formalisation strings are lists of characters with
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	210	the empty string being represented by the empty list, written $[]$,
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	211	and list-cons being written as $\_\!::\!\_\,$; string
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	212	concatenation is $\_ \,@\, \_\,$. We often use the usual
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	213	bracket notation for lists also for strings; for example a string
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	214	consisting of just a single character $c$ is written $[c]$.
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	215	Our regular expressions are defined as usual as the elements of the following inductive
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	216	datatype:
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	217
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	218	\begin{center}
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	219	@{text "r ::="} \;
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	220	@{const "ZERO"} $\mid$
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	221	@{const "ONE"} $\mid$
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	222	@{term "CH c"} $\mid$
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	223	@{term "ALT r\<^sub>1 r\<^sub>2"} $\mid$
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	224	@{term "SEQ r\<^sub>1 r\<^sub>2"} $\mid$
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	225	@{term "STAR r"}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	226	\end{center}
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	227
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	228	\noindent where @{const ZERO} stands for the regular expression that does
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	229	not match any string, @{const ONE} for the regular expression that matches
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	230	only the empty string and @{term c} for matching a character literal.
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	231	The constructors $+$ and $\cdot$ represent alternatives and sequences, respectively.
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	232	The
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	233	\emph{language} of a regular expression, written $L$, is defined as usual
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	234	and we omit giving the definition here (see for example \cite{AusafDyckhoffUrban2016}).
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	235
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	236	Central to Brzozowski's regular expression matcher are two functions
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	237	called @{text nullable} and \emph{derivative}. The latter is written
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	238	$r\backslash c$ for the derivative of the regular expression $r$
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	239	w.r.t.~the character $c$. Both functions are defined by recursion over
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	240	regular expressions.
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	241
402 1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	242	\begin{center}
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	243	\begin{tabular}{cc}
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	244	\begin{tabular}{r@ {\hspace{2mm}}c@ {\hspace{2mm}}l}
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	245	@{thm (lhs) der.simps(1)} & $\dn$ & @{thm (rhs) der.simps(1)}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	246	@{thm (lhs) der.simps(2)} & $\dn$ & @{thm (rhs) der.simps(2)}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	247	@{thm (lhs) der.simps(3)} & $\dn$ & @{thm (rhs) der.simps(3)}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	248	@{thm (lhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	249	@{thm (lhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{text "if"} @{term "nullable(r\<^sub>1)"}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	250	& & @{text "then"} @{term "ALT (SEQ (der c r\<^sub>1) r\<^sub>2) (der c r\<^sub>2)"}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	251	& & @{text "else"} @{term "SEQ (der c r\<^sub>1) r\<^sub>2"}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	252	% & & @{thm (rhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	253	@{thm (lhs) der.simps(6)} & $\dn$ & @{thm (rhs) der.simps(6)}
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	254	\end{tabular}
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	255	&
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	256	\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	257	@{thm (lhs) nullable.simps(1)} & $\dn$ & @{thm (rhs) nullable.simps(1)}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	258	@{thm (lhs) nullable.simps(2)} & $\dn$ & @{thm (rhs) nullable.simps(2)}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	259	@{thm (lhs) nullable.simps(3)} & $\dn$ & @{thm (rhs) nullable.simps(3)}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	260	@{thm (lhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	261	@{thm (lhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	262	@{thm (lhs) nullable.simps(6)} & $\dn$ & @{thm (rhs) nullable.simps(6)}\medskip\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	263	\end{tabular}
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	264	\end{tabular}
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	265	\end{center}
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	266
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	267	\noindent
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	268	We can extend this definition to give derivatives w.r.t.~strings:
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	269
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	270	\begin{center}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	271	\begin{tabular}{cc}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	272	\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	273	@{thm (lhs) ders.simps(1)} & $\dn$ & @{thm (rhs) ders.simps(1)}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	274	\end{tabular}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	275	&
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	276	\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	277	@{thm (lhs) ders.simps(2)} & $\dn$ & @{thm (rhs) ders.simps(2)}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	278	\end{tabular}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	279	\end{tabular}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	280	\end{center}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	281
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	282	\noindent
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	283	Using @{text nullable} and the derivative operation, we can
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	284	define the following simple regular expression matcher:
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	285	%
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	286	\[
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	287	@{text "match s r"} \;\dn\; @{term nullable}(r\backslash s)
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	288	\]
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	289
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	290	\noindent This is essentially Brzozowski's algorithm from 1964. Its
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	291	main virtue is that the algorithm can be easily implemented as a
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	292	functional program (either in a functional programming language or in
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	293	a theorem prover). The correctness proof for @{text match} amounts to
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	294	establishing the property
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	295	%
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	296	\begin{proposition}\label{matchcorr}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	297	@{text "match s r"} \;\;\text{if and only if}\;\; $s \in L(r)$
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	298	\end{proposition}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	299
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	300	\noindent
426 5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	301	It is a fun exercise to formally prove this property in a theorem prover.
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	302
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	303	The novel idea of Sulzmann and Lu is to extend this algorithm for
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	304	lexing, where it is important to find out which part of the string
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	305	is matched by which part of the regular expression.
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	306	For this Sulzmann and Lu presented two lexing algorithms in their paper
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	307	\cite{Sulzmann2014}. The first algorithm consists of two phases: first a
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	308	matching phase (which is Brzozowski's algorithm) and then a value
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	309	construction phase. The values encode \emph{how} a regular expression
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	310	matches a string. \emph{Values} are defined as the inductive datatype
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	311
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	312	\begin{center}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	313	@{text "v ::="}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	314	@{const "Void"} $\mid$
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	315	@{term "val.Char c"} $\mid$
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	316	@{term "Left v"} $\mid$
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	317	@{term "Right v"} $\mid$
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	318	@{term "Seq v\<^sub>1 v\<^sub>2"} $\mid$
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	319	@{term "Stars vs"}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	320	\end{center}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	321
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	322	\noindent where we use @{term vs} to stand for a list of values. The
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	323	string underlying a value can be calculated by a @{const flat}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	324	function, written @{term "flat DUMMY"}. It traverses a value and
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	325	collects the characters contained in it. Sulzmann and Lu also define inductively an
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	326	inhabitation relation that associates values to regular expressions:
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	327
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	328	\begin{center}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	329	\begin{tabular}{c}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	330	\\[-8mm]
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	331	@{thm[mode=Axiom] Prf.intros(4)} \qquad
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	332	@{thm[mode=Axiom] Prf.intros(5)[of "c"]}\\[4mm]
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	333	@{thm[mode=Rule] Prf.intros(2)[of "v\<^sub>1" "r\<^sub>1" "r\<^sub>2"]} \qquad
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	334	@{thm[mode=Rule] Prf.intros(3)[of "v\<^sub>2" "r\<^sub>1" "r\<^sub>2"]}\\[4mm]
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	335	@{thm[mode=Rule] Prf.intros(1)[of "v\<^sub>1" "r\<^sub>1" "v\<^sub>2" "r\<^sub>2"]} \qquad
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	336	@{thm[mode=Rule] Prf.intros(6)[of "vs" "r"]}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	337	\end{tabular}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	338	\end{center}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	339
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	340	\noindent Note that no values are associated with the regular expression
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	341	@{term ZERO}, since it cannot match any string.
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	342	It is routine to establish how values ``inhabiting'' a regular
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	343	expression correspond to the language of a regular expression, namely
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	344
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	345	\begin{proposition}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	346	@{thm L_flat_Prf}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	347	\end{proposition}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	348
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	349	In general there is more than one value inhabiting a regular
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	350	expression (meaning regular expressions can typically match more
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	351	than one string). But even when fixing a string from the language of the
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	352	regular expression, there is generally more than one way in which the
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	353	regular expression can match this string. POSIX lexing is about
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	354	identifying the unique value for a given regular expression and a
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	355	string that satisfies the informal POSIX rules (see
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	356	\cite{POSIX,Kuklewicz,OkuiSuzuki2010,Sulzmann2014,Vansummeren2006}).\footnote{POSIX
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	357	lexing acquired its name from the fact that the corresponding
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	358	rules were described as part of the POSIX specification for
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	359	Unix-like operating systems \cite{POSIX}.} Sometimes these
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	360	informal rules are called \emph{maximal munch rule} and \emph{rule priority}.
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	361	One contribution of our earlier paper is to give a convenient
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	362	specification for what POSIX values are (the inductive rules are shown in
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	363	Figure~\ref{POSIXrules}).
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	364
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	365	\begin{figure}[t]
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	366	\begin{center}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	367	\begin{tabular}{c}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	368	@{thm[mode=Axiom] Posix.intros(1)}\<open>P\<close>@{term "ONE"} \qquad
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	369	@{thm[mode=Axiom] Posix.intros(2)}\<open>P\<close>@{term "c"}\medskip\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	370	@{thm[mode=Rule] Posix.intros(3)[of "s" "r\<^sub>1" "v" "r\<^sub>2"]}\<open>P+L\<close>\qquad
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	371	@{thm[mode=Rule] Posix.intros(4)[of "s" "r\<^sub>2" "v" "r\<^sub>1"]}\<open>P+R\<close>\medskip\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	372	$\mprset{flushleft}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	373	\inferrule
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	374	{@{thm (prem 1) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \qquad
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	375	@{thm (prem 2) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \\\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	376	@{thm (prem 3) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	377	{@{thm (concl) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}$\<open>PS\<close>\medskip\smallskip\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	378	@{thm[mode=Axiom] Posix.intros(7)}\<open>P[]\<close>\qquad
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	379	$\mprset{flushleft}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	380	\inferrule
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	381	{@{thm (prem 1) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	382	@{thm (prem 2) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	383	@{thm (prem 3) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \\\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	384	@{thm (prem 4) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	385	{@{thm (concl) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}$\<open>P\<star>\<close>\\[-4mm]
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	386	\end{tabular}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	387	\end{center}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	388	\caption{The inductive definition of POSIX values taken from our earlier paper \cite{AusafDyckhoffUrban2016}. The ternary relation, written $(s, r) \rightarrow v$, formalises the notion
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	389	of which value $v$ is, for a given string $s$ and regular
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	390	expression $r$, the unique value that satisfies the informal POSIX constraints for
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	391	regular expression matching.}\label{POSIXrules}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	392	\end{figure}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	393
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	394	The clever idea by Sulzmann and Lu \cite{Sulzmann2014} in their first algorithm is to define
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	395	an injection function on values that mirrors (but inverts) the
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	396	construction of the derivative on regular expressions. Essentially it
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	397	injects back a character into a value.
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	398	For this they define two functions called @{text mkeps} and @{text inj}:
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	399
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	400	\begin{center}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	401	\begin{tabular}{l}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	402	\begin{tabular}{lcl}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	403	@{thm (lhs) mkeps.simps(1)} & $\dn$ & @{thm (rhs) mkeps.simps(1)}\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	404	@{thm (lhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]}\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	405	@{thm (lhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]}\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	406	@{thm (lhs) mkeps.simps(4)} & $\dn$ & @{thm (rhs) mkeps.simps(4)}\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	407	\end{tabular}\smallskip\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	408
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	409	\begin{tabular}{lcl}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	410	@{thm (lhs) injval.simps(1)} & $\dn$ & @{thm (rhs) injval.simps(1)}\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	411	@{thm (lhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]} & $\dn$ &
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	412	@{thm (rhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]}\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	413	@{thm (lhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$ &
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	414	@{thm (rhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	415	@{thm (lhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	416	& @{thm (rhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	417	@{thm (lhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	418	& @{thm (rhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	419	@{thm (lhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	420	& @{thm (rhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	421	@{thm (lhs) injval.simps(7)[of "r" "c" "v" "vs"]} & $\dn$
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	422	& @{thm (rhs) injval.simps(7)[of "r" "c" "v" "vs"]}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	423	\end{tabular}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	424	\end{tabular}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	425	\end{center}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	426
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	427	\noindent
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	428	The function @{text mkeps} is run when the last derivative is nullable, that is,
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	429	the string to be matched is in the language of the regular expression. It generates
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	430	a value for how the last derivative can match the empty string. The injection function
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	431	then calculates the corresponding value for each intermediate derivative until
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	432	a value for the original regular expression is generated.
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	433	Graphically the algorithm by
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	434	Sulzmann and Lu can be illustrated by the picture in Figure~\ref{Sulz}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	435	where the path from the left to the right involving @{term derivatives}/@{const
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	436	nullable} is the first phase of the algorithm (calculating successive
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	437	\Brz's derivatives) and @{const mkeps}/@{text inj}, the path from right to
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	438	left, the second phase. The picture shows the steps required when a
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	439	regular expression, say @{text "r\<^sub>1"}, matches the string @{term
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	440	"[a,b,c]"}. The first lexing algorithm by Sulzmann and Lu can be defined as:
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	441
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	442	\begin{figure}[t]
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	443	\begin{center}
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	444	\begin{tikzpicture}[scale=2,node distance=1.3cm,
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	445	every node/.style={minimum size=6mm}]
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	446	\node (r1) {@{term "r\<^sub>1"}};
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	447	\node (r2) [right=of r1]{@{term "r\<^sub>2"}};
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	448	\draw[->,line width=1mm](r1)--(r2) node[above,midway] {@{term "der a DUMMY"}};
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	449	\node (r3) [right=of r2]{@{term "r\<^sub>3"}};
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	450	\draw[->,line width=1mm](r2)--(r3) node[above,midway] {@{term "der b DUMMY"}};
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	451	\node (r4) [right=of r3]{@{term "r\<^sub>4"}};
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	452	\draw[->,line width=1mm](r3)--(r4) node[above,midway] {@{term "der c DUMMY"}};
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	453	\draw (r4) node[anchor=west] {\;\raisebox{3mm}{@{term nullable}}};
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	454	\node (v4) [below=of r4]{@{term "v\<^sub>4"}};
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	455	\draw[->,line width=1mm](r4) -- (v4);
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	456	\node (v3) [left=of v4] {@{term "v\<^sub>3"}};
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	457	\draw[->,line width=1mm](v4)--(v3) node[below,midway] {\<open>inj r\<^sub>3 c\<close>};
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	458	\node (v2) [left=of v3]{@{term "v\<^sub>2"}};
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	459	\draw[->,line width=1mm](v3)--(v2) node[below,midway] {\<open>inj r\<^sub>2 b\<close>};
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	460	\node (v1) [left=of v2] {@{term "v\<^sub>1"}};
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	461	\draw[->,line width=1mm](v2)--(v1) node[below,midway] {\<open>inj r\<^sub>1 a\<close>};
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	462	\draw (r4) node[anchor=north west] {\;\raisebox{-8mm}{@{term "mkeps"}}};
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	463	\end{tikzpicture}
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	464	\end{center}
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	465	\mbox{}\\[-13mm]
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	466
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	467	\caption{The two phases of the first algorithm by Sulzmann \& Lu \cite{Sulzmann2014},
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	468	matching the string @{term "[a,b,c]"}. The first phase (the arrows from
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	469	left to right) is \Brz's matcher building successive derivatives. If the
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	470	last regular expression is @{term nullable}, then the functions of the
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	471	second phase are called (the top-down and right-to-left arrows): first
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	472	@{term mkeps} calculates a value @{term "v\<^sub>4"} witnessing
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	473	how the empty string has been recognised by @{term "r\<^sub>4"}. After
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	474	that the function @{term inj} ``injects back'' the characters of the string into
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	475	the values. The value @{term "v\<^sub>1"} is the result of the algorithm representing
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	476	the POSIX value for this string and
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	477	regular expression.
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	478	\label{Sulz}}
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	479	\end{figure}
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	480
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	481
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	482
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	483	\begin{center}
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	484	\begin{tabular}{lcl}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	485	@{thm (lhs) lexer.simps(1)} & $\dn$ & @{thm (rhs) lexer.simps(1)}\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	486	@{thm (lhs) lexer.simps(2)} & $\dn$ & @{text "case"} @{term "lexer (der c r) s"} @{text of}\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	487	& & \phantom{$\|$} @{term "None"} @{text "\<Rightarrow>"} @{term None}\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	488	& & $\|$ @{term "Some v"} @{text "\<Rightarrow>"} @{term "Some (injval r c v)"}
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	489	\end{tabular}
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	490	\end{center}
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	491
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	492
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	493	We have shown in our earlier paper \cite{AusafDyckhoffUrban2016} that
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	494	this algorithm is correct, that is, it generates POSIX values. The
426 5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	495	central property we established relates the derivative operation to the
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	496	injection function.
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	497
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	498	\begin{proposition}\label{Posix2}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	499	\textit{If} $(s,\; r\backslash c) \rightarrow v$ \textit{then} $(c :: s,\; r) \rightarrow$ \textit{inj} $r\; c\; v$.
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	500	\end{proposition}
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	501
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	502	\noindent
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	503	With this in place we were able to prove:
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	504
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	505
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	506	\begin{proposition}\mbox{}\smallskip\\\label{lexercorrect}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	507	\begin{tabular}{ll}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	508	(1) & @{thm (lhs) lexer_correct_None} if and only if @{thm (rhs) lexer_correct_None}\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	509	(2) & @{thm (lhs) lexer_correct_Some} if and only if @{thm (rhs) lexer_correct_Some}\\
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	510	\end{tabular}
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	511	\end{proposition}
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	512
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	513	\noindent
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	514	In fact we have shown that in the success case the generated POSIX value $v$ is
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	515	unique and in the failure case that there is no POSIX value $v$ that satisfies
426 5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	516	$(s, r) \rightarrow v$. While the algorithm is correct, it is excruciatingly
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	517	slow in cases where the derivatives grow arbitrarily (recall the example from the
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	518	Introduction). However, it can be used as a convenient reference point for the correctness
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	519	proof of the second algorithm by Sulzmann and Lu, which we shall describe next.
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	520
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	521	*}
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	522
402 1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	523	section {* Bitcoded Regular Expressions and Derivatives *}
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	524
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	525	text {*
405 3cfea5bb5e23 updated some of the text and cardinality proof Christian Urban <christian.urban@kcl.ac.uk> parents: 402 diff changeset	526
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	527	In the second part of their paper \cite{Sulzmann2014},
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	528	Sulzmann and Lu describe another algorithm that also generates POSIX
426 5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	529	values but dispenses with the second phase where characters are
405 3cfea5bb5e23 updated some of the text and cardinality proof Christian Urban <christian.urban@kcl.ac.uk> parents: 402 diff changeset	530	injected ``back'' into values. For this they annotate bitcodes to
3cfea5bb5e23 updated some of the text and cardinality proof Christian Urban <christian.urban@kcl.ac.uk> parents: 402 diff changeset	531	regular expressions, which we define in Isabelle/HOL as the datatype
402 1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	532
405 3cfea5bb5e23 updated some of the text and cardinality proof Christian Urban <christian.urban@kcl.ac.uk> parents: 402 diff changeset	533	\begin{center}
3cfea5bb5e23 updated some of the text and cardinality proof Christian Urban <christian.urban@kcl.ac.uk> parents: 402 diff changeset	534	\begin{tabular}{lcl}
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	535	@{term breg} & $::=$ & @{term "AZERO"} $\quad\mid\quad$ @{term "AONE bs"}\\
405 3cfea5bb5e23 updated some of the text and cardinality proof Christian Urban <christian.urban@kcl.ac.uk> parents: 402 diff changeset	536	& $\mid$ & @{term "ACHAR bs c"}\\
3cfea5bb5e23 updated some of the text and cardinality proof Christian Urban <christian.urban@kcl.ac.uk> parents: 402 diff changeset	537	& $\mid$ & @{term "AALTs bs rs"}\\
3cfea5bb5e23 updated some of the text and cardinality proof Christian Urban <christian.urban@kcl.ac.uk> parents: 402 diff changeset	538	& $\mid$ & @{term "ASEQ bs r\<^sub>1 r\<^sub>2"}\\
3cfea5bb5e23 updated some of the text and cardinality proof Christian Urban <christian.urban@kcl.ac.uk> parents: 402 diff changeset	539	& $\mid$ & @{term "ASTAR bs r"}
3cfea5bb5e23 updated some of the text and cardinality proof Christian Urban <christian.urban@kcl.ac.uk> parents: 402 diff changeset	540	\end{tabular}
3cfea5bb5e23 updated some of the text and cardinality proof Christian Urban <christian.urban@kcl.ac.uk> parents: 402 diff changeset	541	\end{center}
3cfea5bb5e23 updated some of the text and cardinality proof Christian Urban <christian.urban@kcl.ac.uk> parents: 402 diff changeset	542
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	543	\noindent where @{text bs} stands for bitsequences; @{text r},
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	544	@{text "r\<^sub>1"} and @{text "r\<^sub>2"} for bitcoded regular
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	545	expressions; and @{text rs} for lists of bitcoded regular
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	546	expressions. The binary alternative @{text "ALT bs r\<^sub>1 r\<^sub>2"}
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	547	is just an abbreviation for \mbox{@{text "ALTs bs [r\<^sub>1, r\<^sub>2]"}}.
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	548	For bitsequences we use lists made up of the
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	549	constants @{text Z} and @{text S}. The idea with bitcoded regular
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	550	expressions is to incrementally generate the value information (for
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	551	example @{text Left} and @{text Right}) as bitsequences. For this
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	552	Sulzmann and Lu define a coding
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	553	function for how values can be coded into bitsequences.
402 1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	554
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	555	\begin{center}
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	556	\begin{tabular}{cc}
402 1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	557	\begin{tabular}{lcl}
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	558	@{thm (lhs) code.simps(1)} & $\dn$ & @{thm (rhs) code.simps(1)}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	559	@{thm (lhs) code.simps(2)} & $\dn$ & @{thm (rhs) code.simps(2)}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	560	@{thm (lhs) code.simps(3)} & $\dn$ & @{thm (rhs) code.simps(3)}\\
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	561	@{thm (lhs) code.simps(4)} & $\dn$ & @{thm (rhs) code.simps(4)}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	562	\end{tabular}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	563	&
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	564	\begin{tabular}{lcl}
402 1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	565	@{thm (lhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]} & $\dn$ & @{thm (rhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	566	@{thm (lhs) code.simps(6)} & $\dn$ & @{thm (rhs) code.simps(6)}\\
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	567	@{thm (lhs) code.simps(7)} & $\dn$ & @{thm (rhs) code.simps(7)}\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	568	\mbox{\phantom{XX}}\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	569	\end{tabular}
402 1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	570	\end{tabular}
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	571	\end{center}
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	572
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	573	\noindent
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	574	As can be seen, this coding is ``lossy'' in the sense that we do not
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	575	explicitly record character values, nor sequence values (for
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	576	them we just append two bitsequences). However, the
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	577	different alternatives for @{text Left}, respectively @{text Right}, are recorded as @{text Z} and
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	578	@{text S} followed by some bitsequence. Similarly, we use @{text Z} to indicate
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	579	if there is still a value coming in the list of @{text Stars}, whereas @{text S}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	580	indicates the end of the list. The lossiness makes the process of
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	581	decoding a bit more involved, but the point is that if we have a
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	582	regular expression \emph{and} a bitsequence of a corresponding value,
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	583	then we can always decode the value accurately. The decoding can be
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	584	defined by using two functions called $\textit{decode}'$ and
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	585	\textit{decode}:
402 1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	586
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	587	\begin{center}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	588	\begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	589	$\textit{decode}'\,bs\,(\ONE)$ & $\dn$ & $(\Empty, bs)$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	590	$\textit{decode}'\,bs\,(c)$ & $\dn$ & $(\Char\,c, bs)$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	591	$\textit{decode}'\,(\Z\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	592	$\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}\;
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	593	(\Left\,v, bs_1)$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	594	$\textit{decode}'\,(\S\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	595	$\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_2\;\textit{in}\;
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	596	(\Right\,v, bs_1)$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	597	$\textit{decode}'\,bs\;(r_1\cdot r_2)$ & $\dn$ &
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	598	$\textit{let}\,(v_1, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	599	& & $\textit{let}\,(v_2, bs_2) = \textit{decode}'\,bs_1\,r_2$
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	600	\hspace{2mm}$\textit{in}\;(\Seq\,v_1\,v_2, bs_2)$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	601	$\textit{decode}'\,(\S\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	602	$\textit{decode}'\,(\Z\!::\!bs)\,(r^*)$ & $\dn$ &
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	603	$\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r\;\textit{in}$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	604	& & $\textit{let}\,(\Stars\,vs, bs_2) = \textit{decode}'\,bs_1\,r^*$
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	605	\hspace{2mm}$\textit{in}\;(\Stars\,v\!::\!vs, bs_2)$\bigskip\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	606	$\textit{decode}\,bs\,r$ & $\dn$ &
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	607	$\textit{let}\,(v, bs') = \textit{decode}'\,bs\,r\;\textit{in}$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	608	& & \hspace{7mm}$\textit{if}\;bs' = []\;\textit{then}\;\textit{Some}\,v\;
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	609	\textit{else}\;\textit{None}$
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	610	\end{tabular}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	611	\end{center}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	612
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	613	\noindent
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	614	The function \textit{decode} checks whether all of the bitsequence is
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	615	consumed and returns the corresponding value as @{term "Some v"}; otherwise
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	616	it fails with @{text "None"}. We can establish that for a value $v$
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	617	inhabiting a regular expression $r$, the decoding of its
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	618	bitsequence never fails.
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	619
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	620	\begin{lemma}\label{codedecode}\it
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	621	If $\;\vdash v : r$ then
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	622	$\;\textit{decode}\,(\textit{code}\, v)\,r = \textit{Some}\, v$.
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	623	\end{lemma}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	624
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	625	\begin{proof}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	626	This follows from the property that
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	627	$\textit{decode}'\,((\textit{code}\,v) \,@\, bs)\,r = (v, bs)$ holds
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	628	for any bitsequence $bs$ and $\vdash v : r$. This property can be
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	629	easily proved by induction on $\vdash v : r$.
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	630	\end{proof}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	631
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	632	Sulzmann and Lu define the function \emph{internalise}
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	633	in order to transform (standard) regular expressions into annotated
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	634	regular expressions. We write this operation as $r^\uparrow$.
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	635	This internalisation uses the following
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	636	\emph{fuse} function.
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	637
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	638	\begin{center}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	639	\begin{tabular}{lcl}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	640	$\textit{fuse}\,bs\,(\textit{ZERO})$ & $\dn$ & $\textit{ZERO}$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	641	$\textit{fuse}\,bs\,(\textit{ONE}\,bs')$ & $\dn$ &
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	642	$\textit{ONE}\,(bs\,@\,bs')$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	643	$\textit{fuse}\,bs\,(\textit{CHAR}\,bs'\,c)$ & $\dn$ &
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	644	$\textit{CHAR}\,(bs\,@\,bs')\,c$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	645	$\textit{fuse}\,bs\,(\textit{ALTs}\,bs'\,\rs)$ & $\dn$ &
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	646	$\textit{ALTs}\,(bs\,@\,bs')\,\rs$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	647	$\textit{fuse}\,bs\,(\textit{SEQ}\,bs'\,r_1\,r_2)$ & $\dn$ &
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	648	$\textit{SEQ}\,(bs\,@\,bs')\,r_1\,r_2$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	649	$\textit{fuse}\,bs\,(\textit{STAR}\,bs'\,r)$ & $\dn$ &
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	650	$\textit{STAR}\,(bs\,@\,bs')\,r$
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	651	\end{tabular}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	652	\end{center}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	653
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	654	\noindent
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	655	A regular expression can then be \emph{internalised} into a bitcoded
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	656	regular expression as follows:
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	657
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	658	\begin{center}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	659	\begin{tabular}{lcl}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	660	$(\ZERO)^\uparrow$ & $\dn$ & $\textit{ZERO}$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	661	$(\ONE)^\uparrow$ & $\dn$ & $\textit{ONE}\,[]$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	662	$(c)^\uparrow$ & $\dn$ & $\textit{CHAR}\,[]\,c$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	663	$(r_1 + r_2)^\uparrow$ & $\dn$ &
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	664	$\textit{ALT}\;[]\,(\textit{fuse}\,[\Z]\,r_1^\uparrow)\,
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	665	(\textit{fuse}\,[\S]\,r_2^\uparrow)$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	666	$(r_1\cdot r_2)^\uparrow$ & $\dn$ &
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	667	$\textit{SEQ}\;[]\,r_1^\uparrow\,r_2^\uparrow$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	668	$(r^*)^\uparrow$ & $\dn$ &
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	669	$\textit{STAR}\;[]\,r^\uparrow$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	670	\end{tabular}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	671	\end{center}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	672
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	673	\noindent
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	674	There is also an \emph{erase}-function, written $r^\downarrow$, which
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	675	transforms a bitcoded regular expression into a (standard) regular
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	676	expression by just erasing the annotated bitsequences. We omit the
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	677	straightforward definition. For defining the algorithm, we also need
436 222333d2bdc2 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 426 diff changeset	678	the functions \textit{bnullable} and \textit{bmkeps}(\textit{s}), which are the
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	679	``lifted'' versions of \textit{nullable} and \textit{mkeps} acting on
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	680	bitcoded regular expressions.
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	681	%
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	682	\begin{center}
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	683	\begin{tabular}{@ {}c@ {}c@ {}}
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	684	\begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	685	$\textit{bnullable}\,(\textit{ZERO})$ & $\dn$ & $\textit{false}$\\
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	686	$\textit{bnullable}\,(\textit{ONE}\,bs)$ & $\dn$ & $\textit{true}$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	687	$\textit{bnullable}\,(\textit{CHAR}\,bs\,c)$ & $\dn$ & $\textit{false}$\\
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	688	$\textit{bnullable}\,(\textit{ALTs}\,bs\,\rs)$ & $\dn$ &
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	689	$\exists\, r \in \rs. \,\textit{bnullable}\,r$\\
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	690	$\textit{bnullable}\,(\textit{SEQ}\,bs\,r_1\,r_2)$ & $\dn$ &
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	691	$\textit{bnullable}\,r_1\wedge \textit{bnullable}\,r_2$\\
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	692	$\textit{bnullable}\,(\textit{STAR}\,bs\,r)$ & $\dn$ &
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	693	$\textit{true}$
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	694	\end{tabular}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	695	&
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	696	\begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	697	$\textit{bmkeps}\,(\textit{ONE}\,bs)$ & $\dn$ & $bs$\\
436 222333d2bdc2 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 426 diff changeset	698	$\textit{bmkeps}\,(\textit{ALTs}\,bs\,\rs)$ & $\dn$ &
222333d2bdc2 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 426 diff changeset	699	$bs\,@\,\textit{bmkepss}\,\rs$\\
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	700	$\textit{bmkeps}\,(\textit{SEQ}\,bs\,r_1\,r_2)$ & $\dn$ &\\
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	701	\multicolumn{3}{r}{$bs \,@\,\textit{bmkeps}\,r_1\,@\, \textit{bmkeps}\,r_2$}\\
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	702	$\textit{bmkeps}\,(\textit{STAR}\,bs\,r)$ & $\dn$ &
436 222333d2bdc2 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 426 diff changeset	703	$bs \,@\, [\S]$\\
222333d2bdc2 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 426 diff changeset	704	$\textit{bmkepss}\,(r\!::\!\rs)$ & $\dn$ &
222333d2bdc2 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 426 diff changeset	705	$\textit{if}\;\textit{bnullable}\,r$\\
222333d2bdc2 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 426 diff changeset	706	& &$\textit{then}\;\textit{bmkeps}\,r$\\
222333d2bdc2 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 426 diff changeset	707	& &$\textit{else}\;\textit{bmkepss}\,\rs$
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	708	\end{tabular}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	709	\end{tabular}
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	710	\end{center}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	711
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	712
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	713	\noindent
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	714	The key function in the bitcoded algorithm is the derivative of a
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	715	bitcoded regular expression. This function calculates the
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	716	derivative but at the same time also the incremental part of the bitsequences
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	717	that contribute to constructing a POSIX value.
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	718
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	719	\begin{center}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	720	\begin{tabular}{@ {}lcl@ {}}
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	721	$(\textit{ZERO})\backslash c$ & $\dn$ & $\textit{ZERO}$ \\
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	722	$(\textit{ONE}\;bs)\backslash c$ & $\dn$ & $\textit{ZERO}$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	723	$(\textit{CHAR}\;bs\,d)\backslash c$ & $\dn$ &
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	724	$\textit{if}\;c=d\; \;\textit{then}\;
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	725	\textit{ONE}\;bs\;\textit{else}\;\textit{ZERO}$\\
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	726	$(\textit{ALTs}\;bs\,\rs)\backslash c$ & $\dn$ &
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	727	$\textit{ALTs}\,bs\,(\mathit{map}\,(\_\backslash c)\,\rs)$\\
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	728	$(\textit{SEQ}\;bs\,r_1\,r_2)\backslash c$ & $\dn$ &
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	729	$\textit{if}\;\textit{bnullable}\,r_1$\\
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	730	& &$\textit{then}\;\textit{ALT}\,bs\,(\textit{SEQ}\,[]\,(r_1\backslash c)\,r_2)$\\
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	731	& &$\phantom{\textit{then}\;\textit{ALT}\,bs\,}(\textit{fuse}\,(\textit{bmkeps}\,r_1)\,(r_2\backslash c))$\\
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	732	& &$\textit{else}\;\textit{SEQ}\,bs\,(r_1\backslash c)\,r_2$\\
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	733	$(\textit{STAR}\,bs\,r)\backslash c$ & $\dn$ &
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	734	$\textit{SEQ}\;bs\,(\textit{fuse}\,[\Z]\,(r\backslash c))\,
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	735	(\textit{STAR}\,[]\,r)$
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	736	\end{tabular}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	737	\end{center}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	738
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	739
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	740	\noindent
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	741	This function can also be extended to strings, written $r\backslash s$,
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	742	just like the standard derivative. We omit the details. Finally we
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	743	can define Sulzmann and Lu's bitcoded lexer, which we call \textit{blexer}:
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	744
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	745	\begin{center}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	746	\begin{tabular}{lcl}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	747	$\textit{blexer}\;r\,s$ & $\dn$ &
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	748	$\textit{let}\;r_{der} = (r^\uparrow)\backslash s\;\textit{in}$\\
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	749	& & $\;\;\;\;\textit{if}\; \textit{bnullable}(r_{der}) \;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,r_{der})\,r
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	750	\;\;\textit{else}\;\textit{None}$
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	751	\end{tabular}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	752	\end{center}
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	753
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	754	\noindent
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	755	This bitcoded lexer first internalises the regular expression $r$ and then
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	756	builds the bitcoded derivative according to $s$. If the derivative is
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	757	(b)nullable, the string is in the language of $r$ and the lexer extracts the bitsequence using the
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	758	$\textit{bmkeps}$ function. Finally it decodes the bitsequence into a value. If
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	759	the derivative is \emph{not} nullable, then $\textit{None}$ is
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	760	returned. We can show that this way of calculating a value
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	761	generates the same result as \textit{lexer}.
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	762
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	763	Before we can proceed we need to define a helper function, called
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	764	\textit{retrieve}, which Sulzmann and Lu introduced for the correctness proof.
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	765
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	766	\begin{center}
402 1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	767	\begin{tabular}{lcl}
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	768	@{thm (lhs) retrieve.simps(1)} & $\dn$ & @{thm (rhs) retrieve.simps(1)}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	769	@{thm (lhs) retrieve.simps(2)} & $\dn$ & @{thm (rhs) retrieve.simps(2)}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	770	@{thm (lhs) retrieve.simps(3)} & $\dn$ & @{thm (rhs) retrieve.simps(3)}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	771	@{thm (lhs) better_retrieve(1)} & $\dn$ & @{thm (rhs) better_retrieve(1)}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	772	@{thm (lhs) better_retrieve(2)} & $\dn$ & @{thm (rhs) better_retrieve(2)}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	773	@{thm (lhs) retrieve.simps(6)[of _ "r\<^sub>1" "r\<^sub>2" "v\<^sub>1" "v\<^sub>2"]}
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	774	& $\dn$ & @{thm (rhs) retrieve.simps(6)[of _ "r\<^sub>1" "r\<^sub>2" "v\<^sub>1" "v\<^sub>2"]}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	775	@{thm (lhs) retrieve.simps(7)} & $\dn$ & @{thm (rhs) retrieve.simps(7)}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	776	@{thm (lhs) retrieve.simps(8)} & $\dn$ & @{thm (rhs) retrieve.simps(8)}
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	777	\end{tabular}
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	778	\end{center}
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	779
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	780	\noindent
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	781	The idea behind this function is to retrieve a possibly partial
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	782	bitsequence from a bitcoded regular expression, where the retrieval is
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	783	guided by a value. For example if the value is $\Left$ then we
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	784	descend into the left-hand side of an alternative in order to
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	785	assemble the bitcode. Similarly for
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	786	$\Right$. The property we can show is that for a given $v$ and $r$
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	787	with $\vdash v : r$, the retrieved bitsequence from the internalised
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	788	regular expression is equal to the bitcoded version of $v$.
402 1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	789
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	790	\begin{lemma}\label{retrievecode}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	791	If $\vdash v : r$ then $\textit{code}\, v = \textit{retrieve}\,(r^\uparrow)\,v$.
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	792	\end{lemma}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	793
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	794	\noindent
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	795	We also need some auxiliary facts about how the bitcoded operations
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	796	relate to the ``standard'' operations on regular expressions. For
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	797	example if we build a bitcoded derivative and erase the result, this
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	798	is the same as if we first erase the bitcoded regular expression and
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	799	then perform the ``standard'' derivative operation.
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	800
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	801	\begin{lemma}\label{bnullable}\mbox{}\smallskip\\
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	802	\begin{tabular}{ll}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	803	\textit{(1)} & $(a\backslash s)^\downarrow = (a^\downarrow)\backslash s$\\
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	804	\textit{(2)} & $\textit{bnullable}(a)$ iff $\textit{nullable}(a^\downarrow)$\\
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	805	\textit{(3)} & $\textit{bmkeps}(a) = \textit{retrieve}\,a\,(\textit{mkeps}\,(a^\downarrow))$ provided $\textit{nullable}(a^\downarrow)$.
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	806	\end{tabular}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	807	\end{lemma}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	808
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	809	\begin{proof}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	810	All properties are by induction on annotated regular expressions. There are no
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	811	interesting cases.
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	812	\end{proof}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	813
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	814	\noindent
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	815	The only difficulty left for the correctness proof is that the bitcoded algorithm
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	816	has only a ``forward phase'' where POSIX values are generated incrementally.
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	817	We can achieve the same effect with @{text lexer} (which has two phases) by stacking up injection
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	818	functions during the forward phase. An auxiliary function, called $\textit{flex}$,
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	819	allows us to recast the rules of $\lexer$ in terms of a single
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	820	phase and stacked up injection functions.
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	821
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	822	\begin{center}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	823	\begin{tabular}{lcl}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	824	$\textit{flex}\;r\,f\,[]$ & $\dn$ & $f$\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	825	$\textit{flex}\;r\,f\,(c\!::\!s)$ & $\dn$ &
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	826	$\textit{flex}\,(r\backslash c)\,(\lambda v.\,f\,(\inj\,r\,c\,v))\,s$\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	827	\end{tabular}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	828	\end{center}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	829
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	830	\noindent
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	831	The point of this function is that when
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	832	reaching the end of the string, we just need to apply the stacked up
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	833	injection functions to the value generated by @{text mkeps}.
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	834	Using this function we can recast the success case in @{text lexer}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	835	as follows:
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	836
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	837	\begin{proposition}\label{flex}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	838	If @{text "lexer r s = Some v"} \;then\; @{text "v = "}$\,\textit{flex}\,r\,id\,s\,
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	839	(\mkeps (r\backslash s))$.
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	840	\end{proposition}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	841
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	842	\noindent
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	843	Note that we did not redefine \textit{lexer}; we just established that the
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	844	value generated by \textit{lexer} can also be obtained by a different
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	845	method. While this different method is not efficient (we essentially
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	846	need to traverse the string $s$ twice, once for building the
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	847	derivative $r\backslash s$ and another time for stacking up injection
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	848	functions using \textit{flex}), it helps us with proving
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	849	that incrementally building up values in @{text blexer} generates the same result.
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	850
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	851	This brings us to our main lemma in this section: if we calculate a
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	852	derivative, say $r\backslash s$, and have a value, say $v$, inhabited
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	853	by this derivative, then we can produce the result @{text lexer} generates
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	854	by applying the stacked-up injection functions
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	855	that $\textit{flex}$ assembles to this value. The lemma establishes that this is the same
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	856	value as if we build the annotated derivative $r^\uparrow\backslash s$
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	857	and then retrieve the corresponding bitcoded version, followed by a
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	858	decoding step.
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	859
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	860	\begin{lemma}[Main Lemma]\label{mainlemma}\it
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	861	If $\vdash v : r\backslash s$ then
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	862	\[\textit{Some}\,(\textit{flex}\,r\,\textit{id}\,s\,v) =
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	863	\textit{decode}(\textit{retrieve}\,(r^\uparrow \backslash s)\,v)\,r\]
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	864	\end{lemma}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	865
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	866	\begin{proof}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	867	This can be proved by induction on $s$ and generalising over
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	868	$v$. The interesting point is that we need to prove this in the
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	869	reverse direction for $s$. This means instead of cases $[]$ and
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	870	$c\!::\!s$, we have cases $[]$ and $s\,@\,[c]$ where we unravel the
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	871	string from the back.\footnote{Isabelle/HOL provides an induction principle
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	872	for this way of performing the induction.}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	873
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	874	The case for $[]$ is routine using Lemmas~\ref{codedecode}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	875	and~\ref{retrievecode}. In the case $s\,@\,[c]$, we can infer from
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	876	the assumption that $\vdash v : (r\backslash s)\backslash c$
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	877	holds. Hence by Prop.~\ref{Posix2} we know that
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	878	(*) $\vdash \inj\,(r\backslash s)\,c\,v : r\backslash s$ holds too.
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	879	By definition of $\textit{flex}$ we can unfold the left-hand side
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	880	to be
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	881	\[
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	882	\textit{Some}\,(\textit{flex}\;r\,\textit{id}\,(s\,@\,[c])\,v) =
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	883	\textit{Some}\,(\textit{flex}\;r\,\textit{id}\,s\,(\inj\,(r\backslash s)\,c\,v))
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	884	\]
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	885
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	886	\noindent
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	887	By induction hypothesis and (*) we can rewrite the right-hand side to
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	888	%
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	889	\[
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	890	\textit{decode}\,(\textit{retrieve}\,(r^\uparrow\backslash s)\;
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	891	(\inj\,(r\backslash s)\,c\,\,v))\,r
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	892	\]
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	893
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	894	\noindent
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	895	which is equal to
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	896	$\textit{decode}\,(\textit{retrieve}\, (r^\uparrow\backslash
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	897	(s\,@\,[c]))\,v)\,r$ as required. The last rewrite step is possible
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	898	because we generalised over $v$ in our induction.
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	899	\end{proof}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	900
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	901	\noindent
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	902	With this lemma in place, we can prove the correctness of \textit{blexer}---it indeed
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	903	produces the same result as \textit{lexer}.
405 3cfea5bb5e23 updated some of the text and cardinality proof Christian Urban <christian.urban@kcl.ac.uk> parents: 402 diff changeset	904
3cfea5bb5e23 updated some of the text and cardinality proof Christian Urban <christian.urban@kcl.ac.uk> parents: 402 diff changeset	905
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	906	\begin{theorem}\label{thmone}
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	907	$\textit{lexer}\,r\,s = \textit{blexer}\,r\,s$
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	908	\end{theorem}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	909
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	910	\begin{proof}
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	911	We can first expand both sides using Prop.~\ref{flex} and the
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	912	definition of \textit{blexer}. This gives us two
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	913	\textit{if}-statements, which we need to show to be equal. By
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	914	Lemma~\ref{bnullable}\textit{(2)} we know the \textit{if}-tests coincide:
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	915	\[
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	916	\textit{bnullable}(r^\uparrow\backslash s) \;\textit{iff}\;
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	917	\nullable(r\backslash s)
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	918	\]
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	919
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	920	\noindent
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	921	For the \textit{if}-branch suppose $r_d \dn r^\uparrow\backslash s$ and
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	922	$d \dn r\backslash s$. We have (*) @{text "nullable d"}. We can then show
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	923	by Lemma~\ref{bnullable}\textit{(3)} that
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	924	%
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	925	\[
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	926	\textit{decode}(\textit{bmkeps}\,r_d)\,r =
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	927	\textit{decode}(\textit{retrieve}\,r_d\,(\textit{mkeps}\,d))\,r
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	928	\]
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	929
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	930	\noindent
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	931	where the right-hand side is equal to
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	932	$\textit{Some}\,(\textit{flex}\,r\,\textit{id}\,s\,(\textit{mkeps}\,
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	933	d))$ by Lemma~\ref{mainlemma} (we know
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	934	$\vdash \textit{mkeps}\,d : d$ by (*)). This shows the
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	935	\textit{if}-branches return the same value. In the
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	936	\textit{else}-branches both \textit{lexer} and \textit{blexer} return
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	937	\textit{None}. Therefore we can conclude the proof.
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	938	\end{proof}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	939
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	940	\noindent This establishes that the bitcoded algorithm by Sulzmann and
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	941	Lu \emph{without} simplification produces correct results. This was
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	942	only conjectured by Sulzmann and Lu in their paper
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	943	\cite{Sulzmann2014}. The next step is to add simplifications.
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	944
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	945	*}
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	946
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	947
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	948	section {* Simplification *}
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	949
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	950	text {*
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	951
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	952	Derivatives as calculated by Brzozowski’s method are usually more
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	953	complex regular expressions than the initial one; the result is
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	954	that derivative-based matching and lexing algorithms are
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	955	often abysmally slow if the ``growth problem'' is not addressed. As Sulzmann and Lu wrote, various
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	956	optimisations are possible, such as the simplifications
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	957	$\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r \Rightarrow r$,
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	958	$\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow r$. While these
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	959	simplifications can considerably speed up the two algorithms in many
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	960	cases, they do not fundamentally solve the growth problem with
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	961	derivatives. To see this let us return to the example from the
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	962	Introduction that shows the derivatives for \mbox{@{text "(a + aa)\<^sup>*"}}.
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	963	If we delete in the 3rd step all $\ZERO$s and $\ONE$s according to
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	964	the simplification rules shown above we obtain
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	965	%
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	966	\def\xll{\xrightarrow{\_\backslash{} [a, a, a]}}%%
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	967	%
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	968	\begin{equation}\label{derivex}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	969	(a + aa)^* \quad\xll\quad
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	970	\underbrace{\mbox{$(\ONE + \ONE{}a) \cdot (a + aa)^*$}}_{r} \;+\;
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	971	((a + aa)^* + \underbrace{\mbox{$(\ONE + \ONE{}a) \cdot (a + aa)^*$}}_{r})
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	972	\end{equation}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	973
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	974	\noindent This is a simpler derivative, but unfortunately we
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	975	cannot make any further simplifications. This is a problem because
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	976	the outermost alternative contains two copies of the same
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	977	regular expression (marked with $r$). These copies will
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	978	spawn new copies in later derivative steps and they in turn even more copies. This
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	979	destroys any hope of taming the size of the derivatives. But the
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	980	second copy of $r$ in \eqref{derivex} will never contribute to a
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	981	value, because POSIX lexing will always prefer matching a string
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	982	with the first copy. So it could be safely removed without affecting the correctness of the algorithm.
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	983	The dilemma with the simple-minded
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	984	simplification rules above is that the rule $r + r \Rightarrow r$
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	985	will never be applicable because as can be seen in this example the
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	986	regular expressions are not next to each other but separated by another regular expression.
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	987
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	988	But here is where Sulzmann and Lu's representation of generalised
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	989	alternatives in the bitcoded algorithm shines: in @{term
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	990	"ALTs bs rs"} we can define a more aggressive simplification by
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	991	recursively simplifying all regular expressions in @{text rs} and
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	992	then analysing the resulting list and removing any duplicates.
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	993	Another advantage with the bitsequences in bitcoded regular
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	994	expressions is that they can be easily modified such that simplification does not
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	995	interfere with the value constructions. For example we can ``flatten'', or
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	996	de-nest, @{text ALTs} as follows
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	997	%
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	998	\[
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	999	@{term "ALTs bs\<^sub>1 ((ALTs bs\<^sub>2 rs\<^sub>2) # rs\<^sub>1)"}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1000	\quad\xrightarrow{bsimp}\quad
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1001	@{term "ALTs bs\<^sub>1 ((map (fuse bs\<^sub>2) rs\<^sub>2) @ rs\<^sub>1)"}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1002	\]
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1003
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1004	\noindent
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1005	where we just need to fuse the bitsequence that has accumulated in @{text "bs\<^sub>2"}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1006	to the alternatives in @{text "rs\<^sub>2"}. As we shall show below this will
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1007	ensure that the correct value corresponding to the original (unsimplified)
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1008	regular expression can still be extracted. %In this way the value construction
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1009	%is not affected by simplification.
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1010
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1011	However there is one problem with the definition for the more
426 5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1012	aggressive simplification rules described by Sulzmann and Lu. Recasting
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1013	their definition with our syntax they define the step of removing
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1014	duplicates as
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1015	%
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1016	\[ @{text "bsimp (ALTs bs rs)"} \dn @{text "ALTs
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1017	bs (nub (map bsimp rs))"}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1018	\]
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1019
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1020	\noindent where they first recursively simplify the regular
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1021	expressions in @{text rs} (using @{text map}) and then use
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1022	Haskell's @{text nub}-function to remove potential
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1023	duplicates. While this makes sense when considering the example
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1024	shown in \eqref{derivex}, @{text nub} is the inappropriate
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1025	function in the case of bitcoded regular expressions. The reason
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1026	is that in general the elements in @{text rs} will have a
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1027	different annotated bitsequence and in this way @{text nub}
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1028	will never find a duplicate to be removed. The correct way to
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1029	handle this situation is to first \emph{erase} the regular
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1030	expressions when comparing potential duplicates. This is inspired
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1031	by Scala's list functions of the form \mbox{@{text "distinctBy rs f
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1032	acc"}} where a function is applied first before two elements
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1033	are compared. We define this function in Isabelle/HOL as
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1034
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1035	\begin{center}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1036	\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1037	@{thm (lhs) distinctBy.simps(1)} & $\dn$ & @{thm (rhs) distinctBy.simps(1)}\\
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1038	@{thm (lhs) distinctBy.simps(2)} & $\dn$ & @{thm (rhs) distinctBy.simps(2)}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1039	\end{tabular}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1040	\end{center}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1041
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1042	\noindent where we scan the list from left to right (because we
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1043	have to remove later copies). In @{text distinctBy}, @{text f} is a
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1044	function and @{text acc} is an accumulator for regular
436 222333d2bdc2 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 426 diff changeset	1045	expressions---essentially a set of regular expressions that we have already seen
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1046	while scanning the list. Therefore we delete an element, say @{text x},
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1047	from the list provided @{text "f x"} is already in the accumulator;
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1048	otherwise we keep @{text x} and scan the rest of the list but
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1049	add @{text "f x"} as another ``seen'' element to @{text acc}. We will use
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1050	@{term distinctBy} where @{text f} is the erase function, @{term "erase (DUMMY)"},
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1051	that deletes bitsequences from bitcoded regular expressions.
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1052	This is clearly a computationally more expensive operation than @{text nub},
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1053	but is needed in order to make the removal of unnecessary copies
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1054	work properly.
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1055
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1056	Our simplification function depends on three helper functions, one is called
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1057	@{text flts} and analyses lists of regular expressions coming from alternatives.
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1058	It is defined as follows:
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1059
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1060	\begin{center}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1061	\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1062	@{thm (lhs) flts.simps(1)} & $\dn$ & @{thm (rhs) flts.simps(1)}\\
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1063	@{thm (lhs) flts.simps(2)} & $\dn$ & @{thm (rhs) flts.simps(2)}\\
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1064	@{thm (lhs) flts.simps(3)[of "bs'" "rs'"]} & $\dn$ & @{thm (rhs) flts.simps(3)[of "bs'" "rs'"]}\\
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1065	\end{tabular}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1066	\end{center}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1067
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1068	\noindent
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1069	The second clause of @{text flts} removes all instances of @{text ZERO} in alternatives and
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1070	the third ``spills'' out nested alternatives (but retaining the
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1071	bitsequence @{text "bs'"} accumulated in the inner alternative). There are
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1072	some corner cases to be considered when the resulting list inside an alternative is
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1073	empty or a singleton list. We take care of those cases in the
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1074	@{text "bsimpALTs"} function; similarly we define a helper function that simplifies
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1075	sequences according to the usual rules about @{text ZERO}s and @{text ONE}s:
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1076
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1077	\begin{center}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1078	\begin{tabular}{c@ {\hspace{5mm}}c}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1079	\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1080	@{text "bsimpALTs bs []"} & $\dn$ & @{text "ZERO"}\\
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1081	@{text "bsimpALTs bs [r]"} & $\dn$ & @{text "fuse bs r"}\\
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1082	@{text "bsimpALTs bs rs"} & $\dn$ & @{text "ALTs bs rs"}\\
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1083	\mbox{}\\
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1084	\end{tabular}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1085	&
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1086	\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1087	@{text "bsimpSEQ bs _ ZERO"} & $\dn$ & @{text "ZERO"}\\
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1088	@{text "bsimpSEQ bs ZERO _"} & $\dn$ & @{text "ZERO"}\\
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1089	@{text "bsimpSEQ bs\<^sub>1 (ONE bs\<^sub>2) r\<^sub>2"}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1090	& $\dn$ & @{text "fuse (bs\<^sub>1 @ bs\<^sub>2) r\<^sub>2"}\\
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1091	@{text "bsimpSEQ bs r\<^sub>1 r\<^sub>2"} & $\dn$ & @{text "SEQ bs r\<^sub>1 r\<^sub>2"}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1092	\end{tabular}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1093	\end{tabular}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1094	\end{center}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1095
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1096	\noindent
426 5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1097	With this in place we can define our simplification function as
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1098
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1099	\begin{center}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1100	\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1101	@{thm (lhs) bsimp.simps(1)[of "bs" "r\<^sub>1" "r\<^sub>2"]} & $\dn$ &
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1102	@{thm (rhs) bsimp.simps(1)[of "bs" "r\<^sub>1" "r\<^sub>2"]}\\
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1103	@{thm (lhs) bsimp.simps(2)[of "bs" _]} & $\dn$ & @{thm (rhs) bsimp.simps(2)[of "bs" _]}\\
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1104	@{text "bsimp r"} & $\dn$ & @{text r}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1105	\end{tabular}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1106	\end{center}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1107
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1108	\noindent
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1109	As far as we can see, our recursive function @{term bsimp} simplifies regular
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1110	expressions as intended by Sulzmann and Lu. There is no point in applying the
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1111	@{text bsimp} function repeatedly (like the simplification in their paper which needs to be
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1112	applied until a fixpoint is reached) because we can show that @{term bsimp} is idempotent,
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1113	that is
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1114
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1115	\begin{proposition}
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1116	@{term "bsimp (bsimp r) = bsimp r"}
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1117	\end{proposition}
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	1118
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1119	\noindent
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1120	This can be proved by induction on @{text r} but requires a detailed analysis
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1121	that the de-nesting of alternatives always results in a flat list of regular
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1122	expressions. We omit the details since it does not concern the correctness proof.
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1123
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1124	Next we can include simplification after each derivative step leading to the
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1125	following notion of bitcoded derivatives:
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1126
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1127	\begin{center}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1128	\begin{tabular}{cc}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1129	\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1130	@{thm (lhs) bders_simp.simps(1)} & $\dn$ & @{thm (rhs) bders_simp.simps(1)}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1131	\end{tabular}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1132	&
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1133	\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1134	@{thm (lhs) bders_simp.simps(2)} & $\dn$ & @{thm (rhs) bders_simp.simps(2)}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1135	\end{tabular}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1136	\end{tabular}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1137	\end{center}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1138
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1139	\noindent
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1140	and use it in the improved lexing algorithm defined as
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1141
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1142	\begin{center}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1143	\begin{tabular}{lcl}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1144	$\textit{blexer}^+\;r\,s$ & $\dn$ &
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1145	$\textit{let}\;r_{der} = (r^\uparrow)\backslash_{bsimp}\, s\;\textit{in}$\\
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1146	& & $\;\;\;\;\textit{if}\; \textit{bnullable}(r_{der}) \;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,r_{der})\,r
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1147	\;\;\textit{else}\;\textit{None}$
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1148	\end{tabular}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1149	\end{center}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1150
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1151	\noindent The remaining task is to show that @{term blexer} and
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1152	@{term "blexer_simp"} generate the same answers.
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1153
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1154	When we first
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1155	attempted this proof we encountered a problem with the idea
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1156	in Sulzmann and Lu's paper where the argument seems to be to appeal
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1157	again to the @{text retrieve}-function defined for the unsimplified version
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1158	of the algorithm. But
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1159	this does not work, because desirable properties such as
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1160	%
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1161	\[
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1162	@{text "retrieve r v = retrieve (bsimp r) v"}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1163	\]
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1164
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1165	\noindent do not hold under simplification---this property
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1166	essentially purports that we can retrieve the same value from a
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1167	simplified version of the regular expression. To start with @{text retrieve}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1168	depends on the fact that the value @{text v} corresponds to the
426 5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1169	structure of the regular expressions---but the whole point of simplification
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1170	is to ``destroy'' this structure by making the regular expression simpler.
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1171	To see this consider the regular expression @{text "r = r' + 0"} and a corresponding
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1172	value @{text "v = Left v'"}. If we annotate bitcodes to @{text "r"}, then
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1173	we can use @{text retrieve} and @{text v} in order to extract a corresponding
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1174	bitsequence. The reason that this works is that @{text r} is an alternative
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1175	regular expression and @{text v} a corresponding value. However, if we simplify
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1176	@{text r}, then @{text v} does not correspond to the shape of the regular
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1177	expression anymore. So unless one can somehow
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1178	synchronise the change in the simplified regular expressions with
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1179	the original POSIX value, there is no hope of appealing to @{text retrieve} in the
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1180	correctness argument for @{term blexer_simp}.
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1181
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1182	We found it more helpful to introduce the rewriting systems shown in
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1183	Figure~\ref{SimpRewrites}. The idea is to generate
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1184	simplified regular expressions in small steps (unlike the @{text bsimp}-function which
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1185	does the same in a big step), and show that each of
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1186	the small steps preserves the bitcodes that lead to the final POSIX value.
436 222333d2bdc2 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 426 diff changeset	1187	The rewrite system is organised such that $\leadsto$ is for bitcoded regular
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1188	expressions and $\stackrel{s}{\leadsto}$ for lists of bitcoded regular
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1189	expressions. The former essentially implements the simplifications of
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1190	@{text "bsimpSEQ"} and @{text flts}; while the latter implements the
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1191	simplifications in @{text "bsimpALTs"}. We can show that any bitcoded
436 222333d2bdc2 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 426 diff changeset	1192	regular expression reduces in zero or more steps to the simplified
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1193	regular expression generated by @{text bsimp}:
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1194
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1195	\begin{lemma}\label{lemone}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1196	@{thm[mode=IfThen] rewrites_to_bsimp}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1197	\end{lemma}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1198
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1199	\begin{proof}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1200	By induction on @{text r}. For this we can use the properties
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1201	@{thm fltsfrewrites} and @{thm ss6_stronger}. The latter uses
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1202	repeated applications of the $LD$ rule which allows the removal
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1203	of duplicates that can recognise the same strings.
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1204	\end{proof}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1205
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1206	\noindent
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1207	We can show that this rewrite system preserves @{term bnullable}, that
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1208	is, simplification essentially does not affect nullability:
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	1209
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1210	\begin{lemma}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1211	@{thm[mode=IfThen] bnullable0(1)[of "r\<^sub>1" "r\<^sub>2"]}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1212	\end{lemma}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1213
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1214	\begin{proof}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1215	Straightforward mutual induction on the definition of $\leadsto$ and $\stackrel{s}{\leadsto}$.
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1216	The only interesting case is the rule $LD$ where the property holds since by the side-conditions of that rule the empty string will
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1217	be in both @{text "L (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ [r\<^sub>2] @ rs\<^sub>c)"} and
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1218	@{text "L (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ rs\<^sub>c)"}.
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1219	\end{proof}
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1220
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1221	\noindent
426 5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1222	From this, we can show that @{text bmkeps} will produce the same bitsequence
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1223	as long as one of the bitcoded regular expressions in $\leadsto$ is nullable (this lemma
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1224	establishes the missing fact we were not able to establish using @{text retrieve}, as suggested
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1225	in the paper by Sulzmann and Lu).
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1226
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1227
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1228	\begin{lemma}\label{lemthree}
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1229	@{thm[mode=IfThen] rewrite_bmkeps_aux(1)[of "r\<^sub>1" "r\<^sub>2"]}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1230	\end{lemma}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1231
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1232	\begin{proof}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1233	By straightforward mutual induction on the definition of $\leadsto$ and $\stackrel{s}{\leadsto}$.
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1234	Again the only interesting case is the rule $LD$ where we need to ensure that
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1235	\[
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1236	@{text "bmkeps (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ [r\<^sub>2] @ rs\<^sub>c) =
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1237	bmkeps (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ rs\<^sub>c)"}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1238	\]
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1239
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1240	\noindent holds. This is indeed the case because according to the POSIX rules the
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1241	generated bitsequence is determined by the first alternative that can match the
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1242	string (in this case being nullable).
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1243	\end{proof}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1244
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1245	\noindent
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1246	Crucial is also the fact that derivative steps and simplification steps can be interleaved,
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1247	which is shown by the fact that $\leadsto$ is preserved under derivatives.
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1248
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1249	\begin{lemma}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1250	@{thm[mode=IfThen] rewrite_preserves_bder(1)[of "r\<^sub>1" "r\<^sub>2"]}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1251	\end{lemma}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1252
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1253	\begin{proof}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1254	By straightforward mutual induction on the definition of $\leadsto$ and $\stackrel{s}{\leadsto}$.
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1255	The case for $LD$ holds because @{term "L (erase (bder c r\<^sub>2)) \<subseteq> L (erase (bder c r\<^sub>1))"}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1256	if @{term "L (erase (r\<^sub>2)) \<subseteq> L (erase (r\<^sub>1))"}.
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1257	\end{proof}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1258
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1259
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1260	\noindent
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1261	Using this fact together with Lemma~\ref{lemone} allows us to prove the central lemma
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1262	that the unsimplified
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1263	derivative (with a string @{term s}) reduces to the simplified derivative (with the same string).
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1264
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1265	\begin{lemma}\label{lemtwo}
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1266	@{thm[mode=IfThen] central}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1267	\end{lemma}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1268
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1269	\begin{proof}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1270	By reverse induction on @{term s} generalising over @{text r}.
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1271	\end{proof}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1272
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1273	\noindent
426 5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1274	With these lemmas in place we can finally establish that @{term "blexer_simp"} and @{term "blexer"}
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1275	generate the same value, and using Theorem~\ref{thmone} from the previous section that this value
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1276	is indeed the POSIX value.
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1277
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1278	\begin{theorem}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1279	@{thm[mode=IfThen] main_blexer_simp}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1280	\end{theorem}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1281
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1282	\begin{proof}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1283	By unfolding the definitions and using Lemmas~\ref{lemtwo} and \ref{lemthree}.
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1284	\end{proof}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1285
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1286	\noindent
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1287	This completes the correctness proof for the second POSIX lexing algorithm by Sulzmann and Lu.
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1288	The interesting point of this algorithm is that the sizes of derivatives do not grow arbitrarily, which
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1289	we shall show next.
398 dac6d27c99c6 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 397 diff changeset	1290
dac6d27c99c6 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 397 diff changeset	1291	\begin{figure}[t]
dac6d27c99c6 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 397 diff changeset	1292	\begin{center}
dac6d27c99c6 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 397 diff changeset	1293	\begin{tabular}{c}
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1294	@{thm[mode=Axiom] bs1[of _ "r\<^sub>2"]}$S\ZERO{}_l$\qquad
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1295	@{thm[mode=Axiom] bs2[of _ "r\<^sub>1"]}$S\ZERO{}_r$\\
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1296	@{thm[mode=Axiom] bs3[of "bs\<^sub>1" "bs\<^sub>2"]}$S\ONE$\\
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1297	@{thm[mode=Rule] bs4[of "r\<^sub>1" "r\<^sub>2" _ "r\<^sub>3"]}SL\qquad
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1298	@{thm[mode=Rule] bs5[of "r\<^sub>3" "r\<^sub>4" _ "r\<^sub>1"]}SR\\
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1299	@{thm[mode=Axiom] bs6}$A0$\qquad
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1300	@{thm[mode=Axiom] bs7}$A1$\\
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1301	@{thm[mode=Rule] bs8[of "rs\<^sub>1" "rs\<^sub>2"]}$AL$\\
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1302	@{thm[mode=Rule] ss2[of "rs\<^sub>1" "rs\<^sub>2"]}$LH$\qquad
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1303	@{thm[mode=Rule] ss3[of "r\<^sub>1" "r\<^sub>2"]}$LT$\\
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1304	@{thm[mode=Axiom] ss4}$L\ZERO$\qquad
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1305	@{thm[mode=Axiom] ss5[of "bs" "rs\<^sub>1" "rs\<^sub>2"]}$LS$\medskip\\
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1306	@{thm[mode=Rule] ss6[of "r\<^sub>2" "r\<^sub>1" "rs\<^sub>1" "rs\<^sub>2" "rs\<^sub>3"]}$LD$\\
398 dac6d27c99c6 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 397 diff changeset	1307	\end{tabular}
dac6d27c99c6 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 397 diff changeset	1308	\end{center}
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1309	\caption{The rewrite rules that generate simplified regular expressions
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1310	in small steps: @{term "rrewrite r\<^sub>1 r\<^sub>2"} is for bitcoded regular
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1311	expressions and @{term "rrewrites rs\<^sub>1 rs\<^sub>2"} for \emph{lists} of bitcoded
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1312	regular expressions. Interesting is the $LD$ rule that allows copies of regular
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1313	expressions to be removed provided a regular expression earlier in the list can
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1314	match the same strings.}\label{SimpRewrites}
398 dac6d27c99c6 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 397 diff changeset	1315	\end{figure}
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	1316	*}
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	1317
426 5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1318	section {* Finiteness of Derivatives *}
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1319
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1320	text {*
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1321
436 222333d2bdc2 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 426 diff changeset	1322	In this section let us sketch our argument for why the size of the simplified
426 5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1323	derivatives with the aggressive simplification function is finite. Suppose
436 222333d2bdc2 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 426 diff changeset	1324	we have a size function for bitcoded regular expressions, written
426 5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1325	@{text "\|r\|"}, which counts the number of nodes if we regard $r$ as a tree
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1326	(we omit the precise definition). For this we show that for every $r$
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1327	there exists a bound $N$
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1328	such that
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1329
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1330	\begin{center}
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1331	$\forall s. \; \|@{term "bders_simp r s"}\| < N$
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1332	\end{center}
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1333
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1334	\noindent
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1335	We prove this by induction on $r$. The base cases for @{term AZERO},
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1336	@{term "AONE bs"} and @{term "ACHAR bs c"} are straightforward. The interesting case is
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1337	for sequences of the form @{term "ASEQ bs r\<^sub>1 r\<^sub>2"}. In this case our induction
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1338	hypotheses state $\forall s. \; \|@{term "bders_simp r\<^sub>1 s"}\| < N_1$ and
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1339	$\forall s. \; \|@{term "bders_simp r\<^sub>2 s"}\| < N_2$. We can reason as follows
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1340
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1341	\begin{center}
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1342	\begin{tabular}{lcll}
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1343	& & $ \|@{term "bders_simp (ASEQ bs r\<^sub>1 r\<^sub>2) s"}\|$\\
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1344	& $ = $ & $\|bsimp(ALTs\;bs\;((@{term "bders_simp r\<^sub>1 s"}) \cdot r_2) ::
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1345	[@{term "bders_simp r\<^sub>2 s'"} \;\|\; s' \in Suf\!fix(s)])\| $ & (1) \\
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1346	& $\leq$ &
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1347	$\|distinctBy\,(flts\,((@{term "bders_simp r\<^sub>1 s "}) \cdot r_2) ::
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1348	[@{term "bders_simp r\<^sub>2 s'"} \;\|\; s' \in Suf\!fix(s)])\| + 1 $ & (2) \\
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1349	& $\leq$ & $\|(@{term "bders_simp r\<^sub>1 s"}) \cdot r_2\| +
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1350	\|distinctBy\,(flts\, [@{term "bders_simp r\<^sub>2 s'"} \;\|\; s' \in Suf\!fix(s)])\| + 1 $ & (3) \\
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1351	& $\leq$ & $N_1 + \|r_2\| + 2 + \|distinctBy\,(flts\, [@{term "bders_simp r\<^sub>2 s'"} \;\|\; s' \in Suf\!fix(s)])\|$ & (4)\\
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1352	& $\leq$ & $N_1 + \|r_2\| + 2 + l_{N_{2}} * N_{2}$ & (5)
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1353	\end{tabular}
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1354	\end{center}
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1355
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1356	% tell Chengsong about Indian paper of closed forms of derivatives
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1357
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1358	\noindent
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1359	where in (1) $Suf\!fix(s)$ is the set of suffixes $s'$ of $s$ where @{term "bders_simp r\<^sub>1 s''"} is nullable for
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1360	@{text "s = s'' @ s'"}. In (3) we know that $\|(@{term "bders_simp r\<^sub>1 s"}) \cdot r_2\|$ is
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1361	bounded by $N_1 + \|r_2\|$. In (5) we know the list comprehension contains only regular expressions of size smaller
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1362	than $N_2$. The list length after @{text distinctBy} is bounded by a number, which we call $l_{N_2}$. It stands
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1363	for the number of distinct regular expressions with a maximum size $N_2$ (there can only be finitely many of them).
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1364	We reason similarly in the @{text Star}-case.\medskip
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1365
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1366	\noindent
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1367	Clearly we give in this finiteness argument (Step (5)) a very loose bound that is
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1368	far from the actual bound we can expect. We can do better than this, but this does not improve
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1369	the finiteness property we are proving. If one is interested in a polynomial bound,
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1370	one would hope to obtain a similarly tight bound as for partial
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1371	derivatives introduced by Antimirov \cite{Antimirov95}. After all the idea with
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1372	@{text distinctBy} is to maintain a ``set'' of alternatives (like the sets in
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1373	partial derivatives). Unfortunately, to obtain the exact same bound would mean
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1374	we need to introduce simplifications such as
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1375	%
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1376	\[ (r_1 + r_2) \cdot r_3 \longrightarrow (r_1 \cdot r_3) + (r_2 \cdot r_3)
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1377	\]
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1378
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1379	\noindent
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1380	which exist for partial derivatives. However, if we introduce them in our
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1381	setting we would lose the POSIX property of our calculated values. We leave better
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1382	bounds for future work.
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1383
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1384	*}
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	1385
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	1386
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	1387	section {* Conclusion *}
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	1388
396 cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1389	text {*
cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1390
426 5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1391	We set out in this work to prove in Isabelle/HOL the correctness of
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1392	the second POSIX lexing algorithm by Sulzmann and Lu
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1393	\cite{Sulzmann2014}. This follows earlier work where we established
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1394	the correctness of the first algorithm
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1395	\cite{AusafDyckhoffUrban2016}. In the earlier work we needed to
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1396	introduce our own specification about what POSIX values are,
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1397	because the informal definition given by Sulzmann and Lu did not
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1398	stand up to a formal proof. Also for the second algorithm we needed
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1399	to introduce our own definitions and proof ideas in order to establish the
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1400	correctness. Our interest in the second algorithm
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1401	lies in the fact that by using bitcoded regular expressions and an aggressive
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1402	simplification method there is a chance that the derivatives
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1403	can be kept universally small (we established in this paper that
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1404	they can be kept finite for any string). This is important if one is after
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1405	an efficient POSIX lexing algorithm.
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1406
426 5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1407	Having proved the correctness of the POSIX lexing algorithm, which
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1408	lessons have we learned? Well, we feel this is a very good example
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1409	where formal proofs give further insight into the matter at
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1410	hand. For example it is very hard to see a problem with @{text nub}
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1411	vs @{text distinctBy} with only experimental data---one would still
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1412	see the correct result but find that simplification does not simplify in well-chosen, but not
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1413	obscure, examples. We found that from an implementation
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1414	point-of-view it is really important to have the formal proofs of
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1415	the corresponding properties at hand. We have also developed a
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1416	healthy suspicion when experimental data is used to back up
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1417	efficiency claims. For example Sulzmann and Lu write about their
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1418	equivalent of @{term blexer_simp} ``...we can incrementally compute
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1419	bitcoded parse trees in linear time in the size of the input''
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1420	\cite[Page 14]{Sulzmann2014}.
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1421	Given the growth of the
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1422	derivatives in some cases even after aggressive simplification, this
436 222333d2bdc2 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 426 diff changeset	1423	is hard to believe. A similar claim about a theoretical runtime
426 5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1424	of @{text "O(n\<^sup>2)"} is made for the Verbatim lexer, which calculates POSIX matches and is based on
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1425	derivatives \cite{verbatim}. In this case derivatives are not simplified.
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1426	Clearly our result of having finite
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1427	derivatives is rather weak in this context but we think such efficiency claims
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1428	require further scrutiny.\medskip
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1429
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1430	\noindent
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1431	Our Isabelle/HOL code is available under \url{https://github.com/urbanchr/posix}.
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1432
396 cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1433
cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1434	%%\bibliographystyle{plain}
cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1435	\bibliography{root}
cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1436	*}
cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1437
cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1438	(<)
cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1439	end
cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1440	(>)

author	Christian Urban <christian.urban@kcl.ac.uk>
	Wed, 02 Mar 2022 11:43:41 +0000
changeset 436	222333d2bdc2
parent 426	5b77220fdf01
child 458	30c91ea7095b
permissions	-rw-r--r--