lexing: thys2/Paper/Paper.thy@47618d607bbf (annotated)

396 cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1	(*<*)
cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	2	theory Paper
cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	3	imports
cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	4	"../Lexer"
cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	5	"../Simplifying"
cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	6	"../Positions"
cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	7	"../SizeBound4"
cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	8	"HOL-Library.LaTeXsugar"
cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	9	begin
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	10
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	11	declare [[show_question_marks = false]]
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	12
398 dac6d27c99c6 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 397 diff changeset	13	notation (latex output)
dac6d27c99c6 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 397 diff changeset	14	If ("(\<^latex>\<open>\\textrm{\<close>if\<^latex>\<open>}\<close> (_)/ \<^latex>\<open>\\textrm{\<close>then\<^latex>\<open>}\<close> (_)/ \<^latex>\<open>\\textrm{\<close>else\<^latex>\<open>}\<close> (_))" 10) and
dac6d27c99c6 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 397 diff changeset	15	Cons ("_\<^latex>\<open>\\mbox{$\\,$}\<close>::\<^latex>\<open>\\mbox{$\\,$}\<close>_" [75,73] 73)
dac6d27c99c6 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 397 diff changeset	16
dac6d27c99c6 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 397 diff changeset	17
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	18	abbreviation
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	19	"der_syn r c \<equiv> der c r"
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	20	abbreviation
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	21	"ders_syn r s \<equiv> ders s r"
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	22	abbreviation
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	23	"bder_syn r c \<equiv> bder c r"
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	24
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	25	notation (latex output)
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	26	der_syn ("_\\_" [79, 1000] 76) and
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	27	ders_syn ("_\\_" [79, 1000] 76) and
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	28	bder_syn ("_\\_" [79, 1000] 76) and
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	29	bders ("_\\_" [79, 1000] 76) and
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	30	bders_simp ("_\\\<^sub>b\<^sub>s\<^sub>i\<^sub>m\<^sub>p _" [79, 1000] 76) and
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	31
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	32	ZERO ("\<^bold>0" 81) and
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	33	ONE ("\<^bold>1" 81) and
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	34	CH ("_" [1000] 80) and
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	35	ALT ("_ + _" [77,77] 78) and
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	36	SEQ ("_ \<cdot> _" [77,77] 78) and
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	37	STAR ("_\<^sup>*" [79] 78) and
402 1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	38
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	39	val.Void ("Empty" 78) and
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	40	val.Char ("Char _" [1000] 78) and
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	41	val.Left ("Left _" [79] 78) and
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	42	val.Right ("Right _" [1000] 78) and
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	43	val.Seq ("Seq _ _" [79,79] 78) and
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	44	val.Stars ("Stars _" [79] 78) and
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	45
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	46	Prf ("\<turnstile> _ : _" [75,75] 75) and
402 1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	47	Posix ("'(_, _') \<rightarrow> _" [63,75,75] 75) and
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	48
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	49	flat ("\|_\|" [75] 74) and
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	50	flats ("\|_\|" [72] 74) and
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	51	injval ("inj _ _ _" [79,77,79] 76) and
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	52	mkeps ("mkeps _" [79] 76) and
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	53	length ("len _" [73] 73) and
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	54	set ("_" [73] 73) and
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	55
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	56	AZERO ("ZERO" 81) and
405 3cfea5bb5e23 updated some of the text and cardinality proof Christian Urban <christian.urban@kcl.ac.uk> parents: 402 diff changeset	57	AONE ("ONE _" [79] 78) and
402 1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	58	ACHAR ("CHAR _ _" [79, 79] 80) and
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	59	AALTs ("ALTs _ _" [77,77] 78) and
405 3cfea5bb5e23 updated some of the text and cardinality proof Christian Urban <christian.urban@kcl.ac.uk> parents: 402 diff changeset	60	ASEQ ("SEQ _ _ _" [79, 79,79] 78) and
402 1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	61	ASTAR ("STAR _ _" [79, 79] 78) and
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	62
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	63	code ("code _" [79] 74) and
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	64	intern ("_\<^latex>\<open>\\mbox{$^\\uparrow$}\<close>" [900] 80) and
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	65	erase ("_\<^latex>\<open>\\mbox{$^\\downarrow$}\<close>" [1000] 74) and
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	66	bnullable ("bnullable _" [1000] 80) and
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	67	bsimp_AALTs ("bsimpALT _ _" [10,1000] 80) and
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	68	bsimp_ASEQ ("bsimpSEQ _ _ _" [10,1000,1000] 80) and
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	69	bmkeps ("bmkeps _" [1000] 80) and
405 3cfea5bb5e23 updated some of the text and cardinality proof Christian Urban <christian.urban@kcl.ac.uk> parents: 402 diff changeset	70
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	71	srewrite ("_\<^latex>\<open>\\mbox{$\\,\\stackrel{s}{\\leadsto}$}\<close> _" [71, 71] 80) and
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	72	rrewrites ("_ \<^latex>\<open>\\mbox{$\\,\\leadsto^*$}\<close> _" [71, 71] 80) and
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	73	srewrites ("_ \<^latex>\<open>\\mbox{$\\,\\stackrel{s}{\\leadsto}^*$}\<close> _" [71, 71] 80) and
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	74	blexer_simp ("blexer\<^sup>+" 1000)
402 1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	75
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	76
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	77	lemma better_retrieve:
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	78	shows "rs \<noteq> Nil ==> retrieve (AALTs bs (r#rs)) (Left v) = bs @ retrieve r v"
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	79	and "rs \<noteq> Nil ==> retrieve (AALTs bs (r#rs)) (Right v) = bs @ retrieve (AALTs [] rs) v"
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	80	apply (metis list.exhaust retrieve.simps(4))
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	81	by (metis list.exhaust retrieve.simps(5))
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	82
396 cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	83	(*>*)
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	84
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	85	section {* Introduction *}
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	86
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	87	text {*
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	88
400 46e5566ad4ba updated Christian Urban <christian.urban@kcl.ac.uk> parents: 398 diff changeset	89	In the last fifteen or so years, Brzozowski's derivatives of regular
46e5566ad4ba updated Christian Urban <christian.urban@kcl.ac.uk> parents: 398 diff changeset	90	expressions have sparked quite a bit of interest in the functional
46e5566ad4ba updated Christian Urban <christian.urban@kcl.ac.uk> parents: 398 diff changeset	91	programming and theorem prover communities. The beauty of
46e5566ad4ba updated Christian Urban <christian.urban@kcl.ac.uk> parents: 398 diff changeset	92	Brzozowski's derivatives \cite{Brzozowski1964} is that they are neatly
46e5566ad4ba updated Christian Urban <christian.urban@kcl.ac.uk> parents: 398 diff changeset	93	expressible in any functional language, and easily definable and
46e5566ad4ba updated Christian Urban <christian.urban@kcl.ac.uk> parents: 398 diff changeset	94	reasoned about in theorem provers---the definitions just consist of
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	95	inductive datatypes and simple recursive functions. Derivatives of a
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	96	regular expression, written @{term "der c r"}, give a simple solution
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	97	to the problem of matching a string @{term s} with a regular
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	98	expression @{term r}: if the derivative of @{term r} w.r.t.\ (in
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	99	succession) all the characters of the string matches the empty string,
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	100	then @{term r} matches @{term s} (and {\em vice versa}). We are aware
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	101	of a mechanised correctness proof of Brzozowski's derivative-based matcher in HOL4 by
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	102	Owens and Slind~\cite{Owens2008}. Another one in Isabelle/HOL is part
474 726f4e65c0fe made paper changes after ITP comments Christian Urban <christian.urban@kcl.ac.uk> parents: 464 diff changeset	103	of the work by Krauss and Nipkow~\cite{Krauss2011}. And another one
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	104	in Coq is given by Coquand and Siles~\cite{Coquand2012}.
464 e6248d2c20c2 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 463 diff changeset	105	Also Ribeiro and Du Bois give one in Agda~\cite{RibeiroAgda2017}.
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	106
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	107
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	108	However, there are two difficulties with derivative-based matchers:
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	109	First, Brzozowski's original matcher only generates a yes/no answer
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	110	for whether a regular expression matches a string or not. This is too
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	111	little information in the context of lexing where separate tokens must
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	112	be identified and also classified (for example as keywords
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	113	or identifiers). Sulzmann and Lu~\cite{Sulzmann2014} overcome this
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	114	difficulty by cleverly extending Brzozowski's matching
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	115	algorithm. Their extended version generates additional information on
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	116	\emph{how} a regular expression matches a string following the POSIX
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	117	rules for regular expression matching. They achieve this by adding a
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	118	second ``phase'' to Brzozowski's algorithm involving an injection
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	119	function. In our own earlier work we provided the formal
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	120	specification of what POSIX matching means and proved in Isabelle/HOL
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	121	the correctness
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	122	of Sulzmann and Lu's extended algorithm accordingly
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	123	\cite{AusafDyckhoffUrban2016}.
400 46e5566ad4ba updated Christian Urban <christian.urban@kcl.ac.uk> parents: 398 diff changeset	124
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	125	The second difficulty is that Brzozowski's derivatives can
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	126	grow to arbitrarily big sizes. For example if we start with the
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	127	regular expression \mbox{@{text "(a + aa)\<^sup>*"}} and take
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	128	successive derivatives according to the character $a$, we end up with
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	129	a sequence of ever-growing derivatives like
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	130
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	131	\def\ll{\stackrel{\_\backslash{} a}{\longrightarrow}}
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	132	\begin{center}
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	133	\begin{tabular}{rll}
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	134	$(a + aa)^*$ & $\ll$ & $(\ONE + \ONE{}a) \cdot (a + aa)^*$\\
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	135	& $\ll$ & $(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* \;+\; (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	136	& $\ll$ & $(\ZERO + \ZERO{}a + \ZERO) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^* \;+\; $\\
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	137	& & $\qquad(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	138	& $\ll$ & \ldots \hspace{15mm}(regular expressions of sizes 98, 169, 283, 468, 767, \ldots)
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	139	\end{tabular}
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	140	\end{center}
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	141
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	142	\noindent where after around 35 steps we run out of memory on a
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	143	typical computer (we shall define shortly the precise details of our
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	144	regular expressions and the derivative operation). Clearly, the
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	145	notation involving $\ZERO$s and $\ONE$s already suggests
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	146	simplification rules that can be applied to regular
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	147	expressions, for example $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	148	\Rightarrow r$, $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	149	r$. While such simple-minded simplifications have been proved in our
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	150	earlier work to preserve the correctness of Sulzmann and Lu's
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	151	algorithm \cite{AusafDyckhoffUrban2016}, they unfortunately do
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	152	\emph{not} help with limiting the growth of the derivatives shown
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	153	above: the growth is slowed, but the derivatives can still grow rather
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	154	quickly beyond any finite bound.
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	155
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	156
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	157	Sulzmann and Lu overcome this ``growth problem'' in a second algorithm
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	158	\cite{Sulzmann2014} where they introduce bitcoded
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	159	regular expressions. In this version, POSIX values are
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	160	represented as bitsequences and such sequences are incrementally generated
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	161	when derivatives are calculated. The compact representation
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	162	of bitsequences and regular expressions allows them to define a more
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	163	``aggressive'' simplification method that keeps the size of the
474 726f4e65c0fe made paper changes after ITP comments Christian Urban <christian.urban@kcl.ac.uk> parents: 464 diff changeset	164	derivatives finitely bounded no matter what the length of the string is.
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	165	They make some informal claims about the correctness and linear behaviour
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	166	of this version, but do not provide any supporting proof arguments, not
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	167	even ``pencil-and-paper'' arguments. They write about their bitcoded
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	168	\emph{incremental parsing method} (that is the algorithm to be formalised
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	169	in this paper):
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	170
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	171	\begin{quote}\it
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	172	``Correctness Claim: We further claim that the incremental parsing
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	173	method [..] in combination with the simplification steps [..]
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	174	yields POSIX parse trees. We have tested this claim
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	175	extensively [..] but yet
426 5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	176	have to work out all proof details.'' \cite[Page 14]{Sulzmann2014}
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	177	\end{quote}
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	178
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	179	\noindent{}\textbf{Contributions:} We have implemented in Isabelle/HOL
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	180	the derivative-based lexing algorithm of Sulzmann and Lu
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	181	\cite{Sulzmann2014} where regular expressions are annotated with
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	182	bitsequences. We define the crucial simplification function as a
426 5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	183	recursive function, without the need of a fix-point operation. One objective of
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	184	the simplification function is to remove duplicates of regular
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	185	expressions. For this Sulzmann and Lu use in their paper the standard
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	186	@{text nub} function from Haskell's list library, but this function
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	187	does not achieve the intended objective with bitcoded regular expressions. The
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	188	reason is that in the bitcoded setting, each copy generally has a
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	189	different bitcode annotation---so @{text nub} would never ``fire''.
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	190	Inspired by Scala's library for lists, we shall instead use a @{text
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	191	distinctBy} function that finds duplicates under an erasing function
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	192	which deletes bitcodes.
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	193	We shall also introduce our own argument and definitions for
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	194	establishing the correctness of the bitcoded algorithm when
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	195	simplifications are included.\medskip
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	196
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	197	\noindent In this paper, we shall first briefly introduce the basic notions
474 726f4e65c0fe made paper changes after ITP comments Christian Urban <christian.urban@kcl.ac.uk> parents: 464 diff changeset	198	of regular expressions and describe the definition
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	199	of POSIX lexing from our earlier work \cite{AusafDyckhoffUrban2016}. This serves
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	200	as a reference point for what correctness means in our Isabelle/HOL proofs. We shall then prove
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	201	the correctness for the bitcoded algorithm without simplification, and
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	202	after that extend the proof to include simplification.
400 46e5566ad4ba updated Christian Urban <christian.urban@kcl.ac.uk> parents: 398 diff changeset	203
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	204	*}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	205
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	206	section {* Background *}
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	207
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	208	text {*
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	209	In our Isabelle/HOL formalisation strings are lists of characters with
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	210	the empty string being represented by the empty list, written $[]$,
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	211	and list-cons being written as $\_\!::\!\_\,$; string
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	212	concatenation is $\_ \,@\, \_\,$. We often use the usual
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	213	bracket notation for lists also for strings; for example a string
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	214	consisting of just a single character $c$ is written $[c]$.
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	215	Our regular expressions are defined as usual as the elements of the following inductive
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	216	datatype:
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	217
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	218	\begin{center}
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	219	@{text "r ::="} \;
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	220	@{const "ZERO"} $\mid$
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	221	@{const "ONE"} $\mid$
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	222	@{term "CH c"} $\mid$
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	223	@{term "ALT r\<^sub>1 r\<^sub>2"} $\mid$
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	224	@{term "SEQ r\<^sub>1 r\<^sub>2"} $\mid$
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	225	@{term "STAR r"}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	226	\end{center}
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	227
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	228	\noindent where @{const ZERO} stands for the regular expression that does
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	229	not match any string, @{const ONE} for the regular expression that matches
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	230	only the empty string and @{term c} for matching a character literal.
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	231	The constructors $+$ and $\cdot$ represent alternatives and sequences, respectively.
461 c4b6906068a9 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 460 diff changeset	232	We sometimes omit the $\cdot$ in a sequence regular expression for brevity.
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	233	The
464 e6248d2c20c2 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 463 diff changeset	234	\emph{language} of a regular expression, written $L(r)$, is defined as usual
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	235	and we omit giving the definition here (see for example \cite{AusafDyckhoffUrban2016}).
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	236
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	237	Central to Brzozowski's regular expression matcher are two functions
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	238	called @{text nullable} and \emph{derivative}. The latter is written
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	239	$r\backslash c$ for the derivative of the regular expression $r$
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	240	w.r.t.~the character $c$. Both functions are defined by recursion over
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	241	regular expressions.
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	242
402 1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	243	\begin{center}
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	244	\begin{tabular}{cc}
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	245	\begin{tabular}{r@ {\hspace{2mm}}c@ {\hspace{2mm}}l}
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	246	@{thm (lhs) der.simps(1)} & $\dn$ & @{thm (rhs) der.simps(1)}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	247	@{thm (lhs) der.simps(2)} & $\dn$ & @{thm (rhs) der.simps(2)}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	248	@{thm (lhs) der.simps(3)} & $\dn$ & @{thm (rhs) der.simps(3)}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	249	@{thm (lhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	250	@{thm (lhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{text "if"} @{term "nullable(r\<^sub>1)"}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	251	& & @{text "then"} @{term "ALT (SEQ (der c r\<^sub>1) r\<^sub>2) (der c r\<^sub>2)"}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	252	& & @{text "else"} @{term "SEQ (der c r\<^sub>1) r\<^sub>2"}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	253	% & & @{thm (rhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	254	@{thm (lhs) der.simps(6)} & $\dn$ & @{thm (rhs) der.simps(6)}
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	255	\end{tabular}
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	256	&
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	257	\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	258	@{thm (lhs) nullable.simps(1)} & $\dn$ & @{thm (rhs) nullable.simps(1)}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	259	@{thm (lhs) nullable.simps(2)} & $\dn$ & @{thm (rhs) nullable.simps(2)}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	260	@{thm (lhs) nullable.simps(3)} & $\dn$ & @{thm (rhs) nullable.simps(3)}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	261	@{thm (lhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	262	@{thm (lhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	263	@{thm (lhs) nullable.simps(6)} & $\dn$ & @{thm (rhs) nullable.simps(6)}\medskip\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	264	\end{tabular}
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	265	\end{tabular}
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	266	\end{center}
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	267
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	268	\noindent
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	269	We can extend this definition to give derivatives w.r.t.~strings:
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	270
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	271	\begin{center}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	272	\begin{tabular}{cc}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	273	\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	274	@{thm (lhs) ders.simps(1)} & $\dn$ & @{thm (rhs) ders.simps(1)}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	275	\end{tabular}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	276	&
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	277	\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	278	@{thm (lhs) ders.simps(2)} & $\dn$ & @{thm (rhs) ders.simps(2)}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	279	\end{tabular}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	280	\end{tabular}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	281	\end{center}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	282
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	283	\noindent
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	284	Using @{text nullable} and the derivative operation, we can
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	285	define the following simple regular expression matcher:
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	286	%
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	287	\[
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	288	@{text "match s r"} \;\dn\; @{term nullable}(r\backslash s)
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	289	\]
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	290
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	291	\noindent This is essentially Brzozowski's algorithm from 1964. Its
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	292	main virtue is that the algorithm can be easily implemented as a
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	293	functional program (either in a functional programming language or in
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	294	a theorem prover). The correctness proof for @{text match} amounts to
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	295	establishing the property
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	296	%
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	297	\begin{proposition}\label{matchcorr}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	298	@{text "match s r"} \;\;\text{if and only if}\;\; $s \in L(r)$
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	299	\end{proposition}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	300
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	301	\noindent
426 5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	302	It is a fun exercise to formally prove this property in a theorem prover.
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	303
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	304	The novel idea of Sulzmann and Lu is to extend this algorithm for
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	305	lexing, where it is important to find out which part of the string
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	306	is matched by which part of the regular expression.
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	307	For this Sulzmann and Lu presented two lexing algorithms in their paper
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	308	\cite{Sulzmann2014}. The first algorithm consists of two phases: first a
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	309	matching phase (which is Brzozowski's algorithm) and then a value
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	310	construction phase. The values encode \emph{how} a regular expression
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	311	matches a string. \emph{Values} are defined as the inductive datatype
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	312
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	313	\begin{center}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	314	@{text "v ::="}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	315	@{const "Void"} $\mid$
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	316	@{term "val.Char c"} $\mid$
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	317	@{term "Left v"} $\mid$
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	318	@{term "Right v"} $\mid$
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	319	@{term "Seq v\<^sub>1 v\<^sub>2"} $\mid$
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	320	@{term "Stars vs"}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	321	\end{center}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	322
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	323	\noindent where we use @{term vs} to stand for a list of values. The
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	324	string underlying a value can be calculated by a @{const flat}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	325	function, written @{term "flat DUMMY"}. It traverses a value and
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	326	collects the characters contained in it. Sulzmann and Lu also define inductively an
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	327	inhabitation relation that associates values to regular expressions:
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	328
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	329	\begin{center}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	330	\begin{tabular}{c}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	331	\\[-8mm]
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	332	@{thm[mode=Axiom] Prf.intros(4)} \qquad
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	333	@{thm[mode=Axiom] Prf.intros(5)[of "c"]}\\[4mm]
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	334	@{thm[mode=Rule] Prf.intros(2)[of "v\<^sub>1" "r\<^sub>1" "r\<^sub>2"]} \qquad
458 30c91ea7095b updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 436 diff changeset	335	@{thm[mode=Rule] Prf.intros(3)[of "v\<^sub>2" "r\<^sub>2" "r\<^sub>1"]}\\[4mm]
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	336	@{thm[mode=Rule] Prf.intros(1)[of "v\<^sub>1" "r\<^sub>1" "v\<^sub>2" "r\<^sub>2"]} \qquad
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	337	@{thm[mode=Rule] Prf.intros(6)[of "vs" "r"]}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	338	\end{tabular}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	339	\end{center}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	340
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	341	\noindent Note that no values are associated with the regular expression
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	342	@{term ZERO}, since it cannot match any string.
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	343	It is routine to establish how values ``inhabiting'' a regular
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	344	expression correspond to the language of a regular expression, namely
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	345
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	346	\begin{proposition}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	347	@{thm L_flat_Prf}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	348	\end{proposition}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	349
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	350	In general there is more than one value inhabiting a regular
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	351	expression (meaning regular expressions can typically match more
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	352	than one string). But even when fixing a string from the language of the
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	353	regular expression, there is generally more than one way in which the
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	354	regular expression can match this string. POSIX lexing is about
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	355	identifying the unique value for a given regular expression and a
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	356	string that satisfies the informal POSIX rules (see
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	357	\cite{POSIX,Kuklewicz,OkuiSuzuki2010,Sulzmann2014,Vansummeren2006}).\footnote{POSIX
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	358	lexing acquired its name from the fact that the corresponding
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	359	rules were described as part of the POSIX specification for
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	360	Unix-like operating systems \cite{POSIX}.} Sometimes these
474 726f4e65c0fe made paper changes after ITP comments Christian Urban <christian.urban@kcl.ac.uk> parents: 464 diff changeset	361	informal rules are called \emph{maximal munch rule} and \emph{rule priority}.
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	362	One contribution of our earlier paper is to give a convenient
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	363	specification for what POSIX values are (the inductive rules are shown in
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	364	Figure~\ref{POSIXrules}).
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	365
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	366	\begin{figure}[t]
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	367	\begin{center}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	368	\begin{tabular}{c}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	369	@{thm[mode=Axiom] Posix.intros(1)}\<open>P\<close>@{term "ONE"} \qquad
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	370	@{thm[mode=Axiom] Posix.intros(2)}\<open>P\<close>@{term "c"}\medskip\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	371	@{thm[mode=Rule] Posix.intros(3)[of "s" "r\<^sub>1" "v" "r\<^sub>2"]}\<open>P+L\<close>\qquad
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	372	@{thm[mode=Rule] Posix.intros(4)[of "s" "r\<^sub>2" "v" "r\<^sub>1"]}\<open>P+R\<close>\medskip\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	373	$\mprset{flushleft}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	374	\inferrule
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	375	{@{thm (prem 1) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \qquad
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	376	@{thm (prem 2) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \\\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	377	@{thm (prem 3) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	378	{@{thm (concl) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}$\<open>PS\<close>\medskip\smallskip\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	379	@{thm[mode=Axiom] Posix.intros(7)}\<open>P[]\<close>\qquad
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	380	$\mprset{flushleft}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	381	\inferrule
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	382	{@{thm (prem 1) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	383	@{thm (prem 2) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	384	@{thm (prem 3) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \\\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	385	@{thm (prem 4) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	386	{@{thm (concl) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}$\<open>P\<star>\<close>\\[-4mm]
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	387	\end{tabular}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	388	\end{center}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	389	\caption{The inductive definition of POSIX values taken from our earlier paper \cite{AusafDyckhoffUrban2016}. The ternary relation, written $(s, r) \rightarrow v$, formalises the notion
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	390	of, given a string $s$ and a regular
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	391	expression $r$, what the unique value $v$ is that satisfies the informal POSIX constraints for
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	392	regular expression matching.}\label{POSIXrules}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	393	\end{figure}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	394
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	395	The clever idea by Sulzmann and Lu \cite{Sulzmann2014} in their first algorithm is to define
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	396	an injection function on values that mirrors (but inverts) the
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	397	construction of the derivative on regular expressions. Essentially it
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	398	injects back a character into a value.
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	399	For this they define two functions called @{text mkeps} and @{text inj}:
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	400
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	401	\begin{center}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	402	\begin{tabular}{l}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	403	\begin{tabular}{lcl}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	404	@{thm (lhs) mkeps.simps(1)} & $\dn$ & @{thm (rhs) mkeps.simps(1)}\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	405	@{thm (lhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]}\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	406	@{thm (lhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]}\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	407	@{thm (lhs) mkeps.simps(4)} & $\dn$ & @{thm (rhs) mkeps.simps(4)}\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	408	\end{tabular}\smallskip\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	409
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	410	\begin{tabular}{lcl}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	411	@{thm (lhs) injval.simps(1)} & $\dn$ & @{thm (rhs) injval.simps(1)}\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	412	@{thm (lhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]} & $\dn$ &
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	413	@{thm (rhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]}\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	414	@{thm (lhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$ &
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	415	@{thm (rhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	416	@{thm (lhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	417	& @{thm (rhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	418	@{thm (lhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	419	& @{thm (rhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	420	@{thm (lhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	421	& @{thm (rhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	422	@{thm (lhs) injval.simps(7)[of "r" "c" "v" "vs"]} & $\dn$
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	423	& @{thm (rhs) injval.simps(7)[of "r" "c" "v" "vs"]}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	424	\end{tabular}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	425	\end{tabular}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	426	\end{center}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	427
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	428	\noindent
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	429	The function @{text mkeps} is run when the last derivative is nullable, that is,
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	430	the string to be matched is in the language of the regular expression. It generates
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	431	a value for how the last derivative can match the empty string. The injection function
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	432	then calculates the corresponding value for each intermediate derivative until
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	433	a value for the original regular expression is generated.
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	434	Graphically the algorithm by
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	435	Sulzmann and Lu can be illustrated by the picture in Figure~\ref{Sulz}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	436	where the path from the left to the right involving @{term derivatives}/@{const
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	437	nullable} is the first phase of the algorithm (calculating successive
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	438	\Brz's derivatives) and @{const mkeps}/@{text inj}, the path from right to
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	439	left, the second phase. The picture shows the steps required when a
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	440	regular expression, say @{text "r\<^sub>1"}, matches the string @{term
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	441	"[a,b,c]"}. The first lexing algorithm by Sulzmann and Lu can be defined as:
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	442
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	443	\begin{figure}[t]
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	444	\begin{center}
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	445	\begin{tikzpicture}[scale=2,node distance=1.3cm,
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	446	every node/.style={minimum size=6mm}]
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	447	\node (r1) {@{term "r\<^sub>1"}};
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	448	\node (r2) [right=of r1]{@{term "r\<^sub>2"}};
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	449	\draw[->,line width=1mm](r1)--(r2) node[above,midway] {@{term "der a DUMMY"}};
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	450	\node (r3) [right=of r2]{@{term "r\<^sub>3"}};
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	451	\draw[->,line width=1mm](r2)--(r3) node[above,midway] {@{term "der b DUMMY"}};
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	452	\node (r4) [right=of r3]{@{term "r\<^sub>4"}};
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	453	\draw[->,line width=1mm](r3)--(r4) node[above,midway] {@{term "der c DUMMY"}};
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	454	\draw (r4) node[anchor=west] {\;\raisebox{3mm}{@{term nullable}}};
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	455	\node (v4) [below=of r4]{@{term "v\<^sub>4"}};
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	456	\draw[->,line width=1mm](r4) -- (v4);
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	457	\node (v3) [left=of v4] {@{term "v\<^sub>3"}};
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	458	\draw[->,line width=1mm](v4)--(v3) node[below,midway] {\<open>inj r\<^sub>3 c\<close>};
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	459	\node (v2) [left=of v3]{@{term "v\<^sub>2"}};
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	460	\draw[->,line width=1mm](v3)--(v2) node[below,midway] {\<open>inj r\<^sub>2 b\<close>};
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	461	\node (v1) [left=of v2] {@{term "v\<^sub>1"}};
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	462	\draw[->,line width=1mm](v2)--(v1) node[below,midway] {\<open>inj r\<^sub>1 a\<close>};
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	463	\draw (r4) node[anchor=north west] {\;\raisebox{-8mm}{@{term "mkeps"}}};
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	464	\end{tikzpicture}
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	465	\end{center}
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	466	\mbox{}\\[-13mm]
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	467
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	468	\caption{The two phases of the first algorithm by Sulzmann \& Lu \cite{Sulzmann2014},
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	469	matching the string @{term "[a,b,c]"}. The first phase (the arrows from
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	470	left to right) is \Brz's matcher building successive derivatives. If the
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	471	last regular expression is @{term nullable}, then the functions of the
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	472	second phase are called (the top-down and right-to-left arrows): first
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	473	@{term mkeps} calculates a value @{term "v\<^sub>4"} witnessing
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	474	how the empty string has been recognised by @{term "r\<^sub>4"}. After
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	475	that the function @{term inj} ``injects back'' the characters of the string into
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	476	the values. The value @{term "v\<^sub>1"} is the result of the algorithm representing
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	477	the POSIX value for this string and
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	478	regular expression.
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	479	\label{Sulz}}
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	480	\end{figure}
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	481
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	482
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	483
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	484	\begin{center}
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	485	\begin{tabular}{lcl}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	486	@{thm (lhs) lexer.simps(1)} & $\dn$ & @{thm (rhs) lexer.simps(1)}\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	487	@{thm (lhs) lexer.simps(2)} & $\dn$ & @{text "case"} @{term "lexer (der c r) s"} @{text of}\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	488	& & \phantom{$\|$} @{term "None"} @{text "\<Rightarrow>"} @{term None}\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	489	& & $\|$ @{term "Some v"} @{text "\<Rightarrow>"} @{term "Some (injval r c v)"}
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	490	\end{tabular}
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	491	\end{center}
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	492
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	493
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	494	We have shown in our earlier paper \cite{AusafDyckhoffUrban2016} that
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	495	this algorithm is correct, that is, it generates POSIX values. The
426 5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	496	central property we established relates the derivative operation to the
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	497	injection function.
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	498
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	499	\begin{proposition}\label{Posix2}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	500	\textit{If} $(s,\; r\backslash c) \rightarrow v$ \textit{then} $(c :: s,\; r) \rightarrow$ \textit{inj} $r\; c\; v$.
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	501	\end{proposition}
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	502
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	503	\noindent
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	504	With this in place we were able to prove:
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	505
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	506
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	507	\begin{proposition}\mbox{}\smallskip\\\label{lexercorrect}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	508	\begin{tabular}{ll}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	509	(1) & @{thm (lhs) lexer_correct_None} if and only if @{thm (rhs) lexer_correct_None}\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	510	(2) & @{thm (lhs) lexer_correct_Some} if and only if @{thm (rhs) lexer_correct_Some}\\
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	511	\end{tabular}
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	512	\end{proposition}
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	513
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	514	\noindent
474 726f4e65c0fe made paper changes after ITP comments Christian Urban <christian.urban@kcl.ac.uk> parents: 464 diff changeset	515	In fact we have shown that, in the success case, the generated POSIX value $v$ is
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	516	unique and in the failure case that there is no POSIX value $v$ that satisfies
426 5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	517	$(s, r) \rightarrow v$. While the algorithm is correct, it is excruciatingly
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	518	slow in cases where the derivatives grow arbitrarily (recall the example from the
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	519	Introduction). However it can be used as a convenient reference point for the correctness
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	520	proof of the second algorithm by Sulzmann and Lu, which we shall describe next.
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	521
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	522	*}
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	523
402 1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	524	section {* Bitcoded Regular Expressions and Derivatives *}
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	525
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	526	text {*
405 3cfea5bb5e23 updated some of the text and cardinality proof Christian Urban <christian.urban@kcl.ac.uk> parents: 402 diff changeset	527
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	528	In the second part of their paper \cite{Sulzmann2014},
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	529	Sulzmann and Lu describe another algorithm that also generates POSIX
426 5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	530	values but dispenses with the second phase where characters are
405 3cfea5bb5e23 updated some of the text and cardinality proof Christian Urban <christian.urban@kcl.ac.uk> parents: 402 diff changeset	531	injected ``back'' into values. For this they annotate bitcodes to
3cfea5bb5e23 updated some of the text and cardinality proof Christian Urban <christian.urban@kcl.ac.uk> parents: 402 diff changeset	532	regular expressions, which we define in Isabelle/HOL as the datatype
402 1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	533
405 3cfea5bb5e23 updated some of the text and cardinality proof Christian Urban <christian.urban@kcl.ac.uk> parents: 402 diff changeset	534	\begin{center}
3cfea5bb5e23 updated some of the text and cardinality proof Christian Urban <christian.urban@kcl.ac.uk> parents: 402 diff changeset	535	\begin{tabular}{lcl}
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	536	@{term breg} & $::=$ & @{term "AZERO"} $\quad\mid\quad$ @{term "AONE bs"}\\
405 3cfea5bb5e23 updated some of the text and cardinality proof Christian Urban <christian.urban@kcl.ac.uk> parents: 402 diff changeset	537	& $\mid$ & @{term "ACHAR bs c"}\\
3cfea5bb5e23 updated some of the text and cardinality proof Christian Urban <christian.urban@kcl.ac.uk> parents: 402 diff changeset	538	& $\mid$ & @{term "AALTs bs rs"}\\
3cfea5bb5e23 updated some of the text and cardinality proof Christian Urban <christian.urban@kcl.ac.uk> parents: 402 diff changeset	539	& $\mid$ & @{term "ASEQ bs r\<^sub>1 r\<^sub>2"}\\
3cfea5bb5e23 updated some of the text and cardinality proof Christian Urban <christian.urban@kcl.ac.uk> parents: 402 diff changeset	540	& $\mid$ & @{term "ASTAR bs r"}
3cfea5bb5e23 updated some of the text and cardinality proof Christian Urban <christian.urban@kcl.ac.uk> parents: 402 diff changeset	541	\end{tabular}
3cfea5bb5e23 updated some of the text and cardinality proof Christian Urban <christian.urban@kcl.ac.uk> parents: 402 diff changeset	542	\end{center}
3cfea5bb5e23 updated some of the text and cardinality proof Christian Urban <christian.urban@kcl.ac.uk> parents: 402 diff changeset	543
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	544	\noindent where @{text bs} stands for bitsequences; @{text r},
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	545	@{text "r\<^sub>1"} and @{text "r\<^sub>2"} for bitcoded regular
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	546	expressions; and @{text rs} for lists of bitcoded regular
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	547	expressions. The binary alternative @{text "ALT bs r\<^sub>1 r\<^sub>2"}
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	548	is just an abbreviation for \mbox{@{text "ALTs bs [r\<^sub>1, r\<^sub>2]"}}.
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	549	For bitsequences we use lists made up of the
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	550	constants @{text Z} and @{text S}. The idea with bitcoded regular
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	551	expressions is to incrementally generate the value information (for
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	552	example @{text Left} and @{text Right}) as bitsequences. For this
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	553	Sulzmann and Lu define a coding
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	554	function for how values can be coded into bitsequences.
402 1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	555
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	556	\begin{center}
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	557	\begin{tabular}{cc}
402 1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	558	\begin{tabular}{lcl}
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	559	@{thm (lhs) code.simps(1)} & $\dn$ & @{thm (rhs) code.simps(1)}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	560	@{thm (lhs) code.simps(2)} & $\dn$ & @{thm (rhs) code.simps(2)}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	561	@{thm (lhs) code.simps(3)} & $\dn$ & @{thm (rhs) code.simps(3)}\\
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	562	@{thm (lhs) code.simps(4)} & $\dn$ & @{thm (rhs) code.simps(4)}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	563	\end{tabular}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	564	&
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	565	\begin{tabular}{lcl}
402 1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	566	@{thm (lhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]} & $\dn$ & @{thm (rhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	567	@{thm (lhs) code.simps(6)} & $\dn$ & @{thm (rhs) code.simps(6)}\\
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	568	@{thm (lhs) code.simps(7)} & $\dn$ & @{thm (rhs) code.simps(7)}\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	569	\mbox{\phantom{XX}}\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	570	\end{tabular}
402 1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	571	\end{tabular}
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	572	\end{center}
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	573
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	574	\noindent
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	575	As can be seen, this coding is ``lossy'' in the sense that we record
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	576	neither character values nor sequence values explicitly (for
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	577	them we just append two bitsequences). However, the
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	578	different alternatives for @{text Left}, respectively @{text Right}, are recorded as @{text Z} and
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	579	@{text S} followed by some bitsequence. Similarly, we use @{text Z} to indicate
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	580	if there is still a value coming in the list of @{text Stars}, whereas @{text S}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	581	indicates the end of the list. The lossiness makes the process of
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	582	decoding a bit more involved, but the point is that if we have a
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	583	regular expression \emph{and} a bitsequence of a corresponding value,
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	584	then we can always decode the value accurately. The decoding can be
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	585	defined by using two functions called $\textit{decode}'$ and
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	586	\textit{decode}:
402 1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	587
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	588	\begin{center}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	589	\begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	590	$\textit{decode}'\,bs\,(\ONE)$ & $\dn$ & $(\Empty, bs)$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	591	$\textit{decode}'\,bs\,(c)$ & $\dn$ & $(\Char\,c, bs)$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	592	$\textit{decode}'\,(\Z\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	593	$\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}\;
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	594	(\Left\,v, bs_1)$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	595	$\textit{decode}'\,(\S\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	596	$\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_2\;\textit{in}\;
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	597	(\Right\,v, bs_1)$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	598	$\textit{decode}'\,bs\;(r_1\cdot r_2)$ & $\dn$ &
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	599	$\textit{let}\,(v_1, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	600	& & $\textit{let}\,(v_2, bs_2) = \textit{decode}'\,bs_1\,r_2$
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	601	\hspace{2mm}$\textit{in}\;(\Seq\,v_1\,v_2, bs_2)$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	602	$\textit{decode}'\,(\S\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	603	$\textit{decode}'\,(\Z\!::\!bs)\,(r^*)$ & $\dn$ &
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	604	$\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r\;\textit{in}$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	605	& & $\textit{let}\,(\Stars\,vs, bs_2) = \textit{decode}'\,bs_1\,r^*$
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	606	\hspace{2mm}$\textit{in}\;(\Stars\,v\!::\!vs, bs_2)$\bigskip\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	607	$\textit{decode}\,bs\,r$ & $\dn$ &
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	608	$\textit{let}\,(v, bs') = \textit{decode}'\,bs\,r\;\textit{in}$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	609	& & \hspace{7mm}$\textit{if}\;bs' = []\;\textit{then}\;\textit{Some}\,v\;
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	610	\textit{else}\;\textit{None}$
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	611	\end{tabular}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	612	\end{center}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	613
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	614	\noindent
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	615	The function \textit{decode} checks whether all of the bitsequence is
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	616	consumed and returns the corresponding value as @{term "Some v"}; otherwise
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	617	it fails with @{text "None"}. We can establish that for a value $v$
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	618	inhabiting a regular expression $r$, the decoding of its
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	619	bitsequence never fails.
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	620
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	621	\begin{lemma}\label{codedecode}\it
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	622	If $\;\vdash v : r$ then
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	623	$\;\textit{decode}\,(\textit{code}\, v)\,r = \textit{Some}\, v$.
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	624	\end{lemma}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	625
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	626	\begin{proof}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	627	This follows from the property that
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	628	$\textit{decode}'\,((\textit{code}\,v) \,@\, bs)\,r = (v, bs)$ holds
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	629	for any bitsequence $bs$, provided that $\vdash v : r$. This property can be
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	630	easily proved by induction on $\vdash v : r$.
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	631	\end{proof}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	632
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	633	Sulzmann and Lu define the function \emph{internalise}
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	634	in order to transform (standard) regular expressions into annotated
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	635	regular expressions. We write this operation as $r^\uparrow$.
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	636	This internalisation uses the following
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	637	\emph{fuse} function.
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	638
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	639	\begin{center}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	640	\begin{tabular}{lcl}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	641	$\textit{fuse}\,bs\,(\textit{ZERO})$ & $\dn$ & $\textit{ZERO}$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	642	$\textit{fuse}\,bs\,(\textit{ONE}\,bs')$ & $\dn$ &
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	643	$\textit{ONE}\,(bs\,@\,bs')$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	644	$\textit{fuse}\,bs\,(\textit{CHAR}\,bs'\,c)$ & $\dn$ &
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	645	$\textit{CHAR}\,(bs\,@\,bs')\,c$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	646	$\textit{fuse}\,bs\,(\textit{ALTs}\,bs'\,rs)$ & $\dn$ &
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	647	$\textit{ALTs}\,(bs\,@\,bs')\,rs$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	648	$\textit{fuse}\,bs\,(\textit{SEQ}\,bs'\,r_1\,r_2)$ & $\dn$ &
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	649	$\textit{SEQ}\,(bs\,@\,bs')\,r_1\,r_2$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	650	$\textit{fuse}\,bs\,(\textit{STAR}\,bs'\,r)$ & $\dn$ &
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	651	$\textit{STAR}\,(bs\,@\,bs')\,r$
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	652	\end{tabular}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	653	\end{center}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	654
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	655	\noindent
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	656	A regular expression can then be \emph{internalised} into a bitcoded
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	657	regular expression as follows:
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	658
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	659	\begin{center}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	660	\begin{tabular}{lcl}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	661	$(\ZERO)^\uparrow$ & $\dn$ & $\textit{ZERO}$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	662	$(\ONE)^\uparrow$ & $\dn$ & $\textit{ONE}\,[]$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	663	$(c)^\uparrow$ & $\dn$ & $\textit{CHAR}\,[]\,c$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	664	$(r_1 + r_2)^\uparrow$ & $\dn$ &
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	665	$\textit{ALT}\;[]\,(\textit{fuse}\,[\Z]\,r_1^\uparrow)\,
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	666	(\textit{fuse}\,[\S]\,r_2^\uparrow)$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	667	$(r_1\cdot r_2)^\uparrow$ & $\dn$ &
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	668	$\textit{SEQ}\;[]\,r_1^\uparrow\,r_2^\uparrow$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	669	$(r^*)^\uparrow$ & $\dn$ &
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	670	$\textit{STAR}\;[]\,r^\uparrow$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	671	\end{tabular}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	672	\end{center}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	673
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	674	\noindent
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	675	There is also an \emph{erase}-function, written $r^\downarrow$, which
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	676	transforms a bitcoded regular expression into a (standard) regular
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	677	expression by just erasing the annotated bitsequences. We omit the
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	678	straightforward definition. For defining the algorithm, we also need
436 222333d2bdc2 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 426 diff changeset	679	the functions \textit{bnullable} and \textit{bmkeps} (with its list version \textit{bmkepss}), which are the
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	680	``lifted'' versions of \textit{nullable} and \textit{mkeps} acting on
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	681	bitcoded regular expressions.
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	682	%
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	683	\begin{center}
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	684	\begin{tabular}{@ {}c@ {}c@ {}}
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	685	\begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
458 30c91ea7095b updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 436 diff changeset	686	$\textit{bnullable}\,(\textit{ZERO})$ & $\dn$ & $\textit{False}$\\
30c91ea7095b updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 436 diff changeset	687	$\textit{bnullable}\,(\textit{ONE}\,bs)$ & $\dn$ & $\textit{True}$\\
30c91ea7095b updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 436 diff changeset	688	$\textit{bnullable}\,(\textit{CHAR}\,bs\,c)$ & $\dn$ & $\textit{False}$\\
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	689	$\textit{bnullable}\,(\textit{ALTs}\,bs\,\rs)$ & $\dn$ &
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	690	$\exists\, r \in \rs. \,\textit{bnullable}\,r$\\
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	691	$\textit{bnullable}\,(\textit{SEQ}\,bs\,r_1\,r_2)$ & $\dn$ &
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	692	$\textit{bnullable}\,r_1\wedge \textit{bnullable}\,r_2$\\
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	693	$\textit{bnullable}\,(\textit{STAR}\,bs\,r)$ & $\dn$ &
458 30c91ea7095b updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 436 diff changeset	694	$\textit{True}$
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	695	\end{tabular}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	696	&
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	697	\begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	698	$\textit{bmkeps}\,(\textit{ONE}\,bs)$ & $\dn$ & $bs$\\
436 222333d2bdc2 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 426 diff changeset	699	$\textit{bmkeps}\,(\textit{ALTs}\,bs\,\rs)$ & $\dn$ &
222333d2bdc2 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 426 diff changeset	700	$bs\,@\,\textit{bmkepss}\,\rs$\\
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	701	$\textit{bmkeps}\,(\textit{SEQ}\,bs\,r_1\,r_2)$ & $\dn$ &\\
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	702	\multicolumn{3}{r}{$bs \,@\,\textit{bmkeps}\,r_1\,@\, \textit{bmkeps}\,r_2$}\\
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	703	$\textit{bmkeps}\,(\textit{STAR}\,bs\,r)$ & $\dn$ &
436 222333d2bdc2 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 426 diff changeset	704	$bs \,@\, [\S]$\\
222333d2bdc2 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 426 diff changeset	705	$\textit{bmkepss}\,(r\!::\!\rs)$ & $\dn$ &
222333d2bdc2 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 426 diff changeset	706	$\textit{if}\;\textit{bnullable}\,r$\\
222333d2bdc2 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 426 diff changeset	707	& &$\textit{then}\;\textit{bmkeps}\,r$\\
222333d2bdc2 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 426 diff changeset	708	& &$\textit{else}\;\textit{bmkepss}\,\rs$
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	709	\end{tabular}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	710	\end{tabular}
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	711	\end{center}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	712
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	713
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	714	\noindent
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	715	The key function in the bitcoded algorithm is the derivative of a
458 30c91ea7095b updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 436 diff changeset	716	bitcoded regular expression. This derivative function calculates the
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	717	derivative and, at the same time, also the incremental parts of the bitsequences
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	718	that contribute to constructing a POSIX value.
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	719
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	720	\begin{center}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	721	\begin{tabular}{@ {}lcl@ {}}
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	722	$(\textit{ZERO})\backslash c$ & $\dn$ & $\textit{ZERO}$ \\
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	723	$(\textit{ONE}\;bs)\backslash c$ & $\dn$ & $\textit{ZERO}$\\
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	724	$(\textit{CHAR}\;bs\,d)\backslash c$ & $\dn$ &
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	725	$\textit{if}\;c=d\; \;\textit{then}\;
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	726	\textit{ONE}\;bs\;\textit{else}\;\textit{ZERO}$\\
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	727	$(\textit{ALTs}\;bs\,\rs)\backslash c$ & $\dn$ &
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	728	$\textit{ALTs}\,bs\,(\mathit{map}\,(\_\backslash c)\,\rs)$\\
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	729	$(\textit{SEQ}\;bs\,r_1\,r_2)\backslash c$ & $\dn$ &
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	730	$\textit{if}\;\textit{bnullable}\,r_1$\\
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	731	& &$\textit{then}\;\textit{ALT}\,bs\,(\textit{SEQ}\,[]\,(r_1\backslash c)\,r_2)$\\
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	732	& &$\phantom{\textit{then}\;\textit{ALT}\,bs\,}(\textit{fuse}\,(\textit{bmkeps}\,r_1)\,(r_2\backslash c))$\\
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	733	& &$\textit{else}\;\textit{SEQ}\,bs\,(r_1\backslash c)\,r_2$\\
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	734	$(\textit{STAR}\,bs\,r)\backslash c$ & $\dn$ &
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	735	$\textit{SEQ}\;bs\,(\textit{fuse}\, [\Z] (r\backslash c))\,
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	736	(\textit{STAR}\,[]\,r)$
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	737	\end{tabular}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	738	\end{center}
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	739
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	740
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	741	\noindent
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	742	This function can also be extended to strings, written $r\backslash s$,
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	743	just like the standard derivative. We omit the details. Finally we
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	744	can define Sulzmann and Lu's bitcoded lexer, which we call \textit{blexer}:
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	745
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	746	\begin{center}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	747	\begin{tabular}{lcl}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	748	$\textit{blexer}\;r\,s$ & $\dn$ &
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	749	$\textit{let}\;r_{der} = (r^\uparrow)\backslash s\;\textit{in}$\\
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	750	& & $\;\;\;\;\textit{if}\; \textit{bnullable}(r_{der}) \;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,r_{der})\,r
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	751	\;\;\textit{else}\;\textit{None}$
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	752	\end{tabular}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	753	\end{center}
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	754
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	755	\noindent
57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	756	This bitcoded lexer first internalises the regular expression $r$ and then
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	757	builds the bitcoded derivative according to $s$. If the derivative is
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	758	(b)nullable, the string is in the language of $r$ and the lexer extracts the bitsequence using the
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	759	$\textit{bmkeps}$ function. Finally it decodes the bitsequence into a value. If
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	760	the derivative is \emph{not} nullable, then $\textit{None}$ is
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	761	returned. We can show that this way of calculating a value
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	762	generates the same result as \textit{lexer}.
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	763
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	764	Before we can proceed we need to define a helper function, called
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	765	\textit{retrieve}, which Sulzmann and Lu introduced for the correctness proof.
416 57182b36ec01 more with the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 410 diff changeset	766
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	767	\begin{center}
402 1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	768	\begin{tabular}{lcl}
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	769	@{thm (lhs) retrieve.simps(1)} & $\dn$ & @{thm (rhs) retrieve.simps(1)}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	770	@{thm (lhs) retrieve.simps(2)} & $\dn$ & @{thm (rhs) retrieve.simps(2)}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	771	@{thm (lhs) retrieve.simps(3)} & $\dn$ & @{thm (rhs) retrieve.simps(3)}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	772	@{thm (lhs) better_retrieve(1)} & $\dn$ & @{thm (rhs) better_retrieve(1)}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	773	@{thm (lhs) better_retrieve(2)} & $\dn$ & @{thm (rhs) better_retrieve(2)}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	774	@{thm (lhs) retrieve.simps(6)[of _ "r\<^sub>1" "r\<^sub>2" "v\<^sub>1" "v\<^sub>2"]}
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	775	& $\dn$ & @{thm (rhs) retrieve.simps(6)[of _ "r\<^sub>1" "r\<^sub>2" "v\<^sub>1" "v\<^sub>2"]}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	776	@{thm (lhs) retrieve.simps(7)} & $\dn$ & @{thm (rhs) retrieve.simps(7)}\\
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	777	@{thm (lhs) retrieve.simps(8)} & $\dn$ & @{thm (rhs) retrieve.simps(8)}
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	778	\end{tabular}
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	779	\end{center}
1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	780
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	781	\noindent
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	782	The idea behind this function is to retrieve a possibly partial
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	783	bitsequence from a bitcoded regular expression, where the retrieval is
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	784	guided by a value. For example if the value is $\Left$ then we
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	785	descend into the left-hand side of an alternative in order to
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	786	assemble the bitcode. Similarly for
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	787	$\Right$. The property we can show is that for a given $v$ and $r$
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	788	with $\vdash v : r$, the retrieved bitsequence from the internalised
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	789	regular expression is equal to the bitcoded version of $v$.
402 1612f2a77bf6 more definitions in the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 400 diff changeset	790
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	791	\begin{lemma}\label{retrievecode}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	792	If $\vdash v : r$ then $\textit{code}\, v = \textit{retrieve}\,(r^\uparrow)\,v$.
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	793	\end{lemma}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	794
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	795	\noindent
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	796	We also need some auxiliary facts about how the bitcoded operations
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	797	relate to the ``standard'' operations on regular expressions. For
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	798	example if we build a bitcoded derivative and erase the result, this
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	799	is the same as if we first erase the bitcoded regular expression and
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	800	then perform the ``standard'' derivative operation.
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	801
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	802	\begin{lemma}\label{bnullable}\mbox{}\smallskip\\
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	803	\begin{tabular}{ll}
458 30c91ea7095b updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 436 diff changeset	804	\textit{(1)} & $(r\backslash s)^\downarrow = (r^\downarrow)\backslash s$\\
30c91ea7095b updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 436 diff changeset	805	\textit{(2)} & $\textit{bnullable}(r)$ iff $\textit{nullable}(r^\downarrow)$\\
30c91ea7095b updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 436 diff changeset	806	\textit{(3)} & $\textit{bmkeps}(r) = \textit{retrieve}\,r\,(\textit{mkeps}\,(r^\downarrow))$ provided $\textit{nullable}(r^\downarrow)$.
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	807	\end{tabular}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	808	\end{lemma}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	809
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	810	\begin{proof}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	811	All properties are by induction on annotated regular expressions. There are no
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	812	interesting cases.
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	813	\end{proof}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	814
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	815	\noindent
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	816	The only difficulty left for the correctness proof is that the bitcoded algorithm
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	817	has only a ``forward phase'' where POSIX values are generated incrementally.
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	818	We can achieve the same effect with @{text lexer} (which has two phases) by stacking up injection
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	819	functions during the forward phase. An auxiliary function, called $\textit{flex}$,
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	820	allows us to recast the rules of $\lexer$ in terms of a single
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	821	phase and stacked up injection functions.
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	822
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	823	\begin{center}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	824	\begin{tabular}{lcl}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	825	$\textit{flex}\;r\,f\,[]$ & $\dn$ & $f$\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	826	$\textit{flex}\;r\,f\,(c\!::\!s)$ & $\dn$ &
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	827	$\textit{flex}\,(r\backslash c)\,(\lambda v.\,f\,(\inj\,r\,c\,v))\,s$\\
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	828	\end{tabular}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	829	\end{center}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	830
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	831	\noindent
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	832	The point of this function is that when
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	833	reaching the end of the string, we just need to apply the stacked up
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	834	injection functions to the value generated by @{text mkeps}.
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	835	Using this function we can recast the success case in @{text lexer}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	836	as follows:
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	837
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	838	\begin{proposition}\label{flex}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	839	If @{text "lexer r s = Some v"} \;then\; @{text "v = "}$\,\textit{flex}\,r\,id\,s\,
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	840	(\mkeps (r\backslash s))$.
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	841	\end{proposition}
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	842
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	843	\noindent
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	844	Note we did not redefine \textit{lexer}, we just established that the
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	845	value generated by \textit{lexer} can also be obtained by a different
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	846	method. While this different method is not efficient (we essentially
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	847	need to traverse the string $s$ twice, once for building the
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	848	derivative $r\backslash s$ and another time for stacking up injection
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	849	functions using \textit{flex}), it helps us with proving
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	850	that incrementally building up values in @{text blexer} generates the same result.
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	851
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	852	This brings us to our main lemma in this section: if we calculate a
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	853	derivative, say $r\backslash s$, and have a value, say $v$, inhabiting
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	854	this derivative, then we can produce the result @{text lexer} generates
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	855	by applying the stacked-up injection functions
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	856	that $\textit{flex}$ assembles to this value. The lemma establishes that this is the same
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	857	value as if we build the annotated derivative $r^\uparrow\backslash s$
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	858	and then retrieve the corresponding bitcoded version, followed by a
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	859	decoding step.
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	860
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	861	\begin{lemma}[Main Lemma]\label{mainlemma}\it
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	862	If $\vdash v : r\backslash s$ then
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	863	\[\textit{Some}\,(\textit{flex}\,r\,\textit{id}\,s\,v) =
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	864	\textit{decode}(\textit{retrieve}\,(r^\uparrow \backslash s)\,v)\,r\]
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	865	\end{lemma}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	866
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	867	\begin{proof}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	868	This can be proved by induction on $s$ and generalising over
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	869	$v$. The interesting point is that we need to prove this in the
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	870	reverse direction for $s$. This means instead of cases $[]$ and
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	871	$c\!::\!s$, we have cases $[]$ and $s\,@\,[c]$ where we unravel the
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	872	string from the back.\footnote{Isabelle/HOL provides an induction principle
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	873	for this way of performing the induction.}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	874
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	875	The case for $[]$ is routine using Lemmas~\ref{codedecode}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	876	and~\ref{retrievecode}. In the case $s\,@\,[c]$, we can infer from
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	877	the assumption that $\vdash v : (r\backslash s)\backslash c$
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	878	holds. Hence by Prop.~\ref{Posix2} we know that
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	879	(*) $\vdash \inj\,(r\backslash s)\,c\,v : r\backslash s$ holds too.
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	880	By definition of $\textit{flex}$ we can unfold the left-hand side
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	881	to be
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	882	\[
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	883	\textit{Some}\,(\textit{flex}\;r\,\textit{id}\,(s\,@\,[c])\,v) =
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	884	\textit{Some}\,(\textit{flex}\;r\,\textit{id}\,s\,(\inj\,(r\backslash s)\,c\,v))
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	885	\]
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	886
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	887	\noindent
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	888	By induction hypothesis and (*) we can rewrite the right-hand side to
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	889	%
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	890	\[
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	891	\textit{decode}\,(\textit{retrieve}\,(r^\uparrow\backslash s)\;
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	892	(\inj\,(r\backslash s)\,c\,\,v))\,r
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	893	\]
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	894
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	895	\noindent
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	896	which is equal to
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	897	$\textit{decode}\,(\textit{retrieve}\, (r^\uparrow\backslash
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	898	(s\,@\,[c]))\,v)\,r$ as required. The last rewrite step is possible
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	899	because we generalised over $v$ in our induction.
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	900	\end{proof}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	901
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	902	\noindent
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	903	With this lemma in place, we can prove the correctness of \textit{blexer}---it indeed
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	904	produces the same result as \textit{lexer}.
405 3cfea5bb5e23 updated some of the text and cardinality proof Christian Urban <christian.urban@kcl.ac.uk> parents: 402 diff changeset	905
3cfea5bb5e23 updated some of the text and cardinality proof Christian Urban <christian.urban@kcl.ac.uk> parents: 402 diff changeset	906
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	907	\begin{theorem}\label{thmone}
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	908	$\textit{lexer}\,r\,s = \textit{blexer}\,r\,s$
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	909	\end{theorem}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	910
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	911	\begin{proof}
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	912	We can first expand both sides using Prop.~\ref{flex} and the
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	913	definition of \textit{blexer}. This gives us two
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	914	\textit{if}-statements, which we need to show to be equal. By
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	915	Lemma~\ref{bnullable}\textit{(2)} we know the \textit{if}-tests coincide:
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	916	\[
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	917	\textit{bnullable}(r^\uparrow\backslash s) \;\textit{iff}\;
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	918	\nullable(r\backslash s)
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	919	\]
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	920
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	921	\noindent
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	922	For the \textit{if}-branch suppose $r_d \dn r^\uparrow\backslash s$ and
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	923	$d \dn r\backslash s$. We have (*) @{text "nullable d"}. We can then show
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	924	by Lemma~\ref{bnullable}\textit{(3)} that
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	925	%
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	926	\[
458 30c91ea7095b updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 436 diff changeset	927	\textit{decode}(\textit{bmkeps}\:r_d)\,r =
30c91ea7095b updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 436 diff changeset	928	\textit{decode}(\textit{retrieve}\,r_d\,(\textit{mkeps}\,d))\,r
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	929	\]
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	930
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	931	\noindent
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	932	where the right-hand side is equal to
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	933	$\textit{Some}\,(\textit{flex}\,r\,\textit{id}\,s\,(\textit{mkeps}\,
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	934	d))$ by Lemma~\ref{mainlemma} (we know
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	935	$\vdash \textit{mkeps}\,d : d$ by (*)). This shows the
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	936	\textit{if}-branches return the same value. In the
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	937	\textit{else}-branches both \textit{lexer} and \textit{blexer} return
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	938	\textit{None}. Therefore we can conclude the proof.
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	939	\end{proof}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	940
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	941	\noindent This establishes that the bitcoded algorithm by Sulzmann and
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	942	Lu \emph{without} simplification produces correct results. This was
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	943	only conjectured by Sulzmann and Lu in their paper
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	944	\cite{Sulzmann2014}. The next step is to add simplifications.
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	945
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	946	*}
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	947
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	948
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	949	section {* Simplification *}
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	950
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	951	text {*
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	952
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	953	Derivatives as calculated by Brzozowski’s method are usually more
b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	954	complex regular expressions than the initial one; the result is
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	955	that derivative-based matching and lexing algorithms are
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	956	often abysmally slow if the ``growth problem'' is not addressed. As Sulzmann and Lu wrote, various
423 b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	957	optimisations are possible, such as the simplifications
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	958	$\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r \Rightarrow r$,
b7199d6c672d updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 420 diff changeset	959	$\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow r$. While these
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	960	simplifications can considerably speed up the two algorithms in many
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	961	cases, they do not fundamentally solve the growth problem with
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	962	derivatives. To see this let us return to the example from the
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	963	Introduction that shows the derivatives for \mbox{@{text "(a + aa)\<^sup>*"}}.
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	964	If we delete in the 3rd step all $\ZERO{}$s and $\ONE$s according to
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	965	the simplification rules shown above we obtain
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	966	%
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	967	\def\xll{\xrightarrow{\_\backslash{} [a, a, a]}}%%
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	968	%
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	969	\begin{equation}\label{derivex}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	970	(a + aa)^* \quad\xll\quad
463 421397f267b9 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 462 diff changeset	971	\underbrace{\mbox{$(\ONE + a) \cdot (a + aa)^*$}}_{r} \;+\;
421397f267b9 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 462 diff changeset	972	((a + aa)^* + \underbrace{\mbox{$(\ONE + a) \cdot (a + aa)^*$}}_{r})
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	973	\end{equation}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	974
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	975	\noindent This is a simpler derivative, but unfortunately we
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	976	cannot make any further simplifications. This is a problem because
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	977	the outermost alternative contains two copies of the same
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	978	regular expression (underlined with $r$). These copies will
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	979	spawn new copies in later derivative steps and they in turn even more copies. This
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	980	destroys any hope of taming the size of the derivatives. But the
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	981	second copy of $r$ in \eqref{derivex} will never contribute to a
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	982	value, because POSIX lexing will always prefer matching a string
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	983	with the first copy. So it could be safely removed without affecting the correctness of the algorithm.
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	984	The dilemma with the simple-minded
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	985	simplification rules above is that the rule $r + r \Rightarrow r$
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	986	will never be applicable because as can be seen in this example the
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	987	regular expressions are not next to each other but separated by another regular expression.
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	988
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	989	But here is where Sulzmann and Lu's representation of generalised
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	990	alternatives in the bitcoded algorithm shines: in @{term
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	991	"ALTs bs rs"} we can define a more aggressive simplification by
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	992	recursively simplifying all regular expressions in @{text rs} and
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	993	then analysing the resulting list and removing any duplicates.
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	994	Another advantage with the bitsequences in bitcoded regular
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	995	expressions is that they can be easily modified such that simplification does not
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	996	interfere with the value constructions. For example we can ``flatten'', or
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	997	de-nest, @{text ALTs} as follows
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	998	%
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	999	\[
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1000	@{term "ALTs bs\<^sub>1 ((ALTs bs\<^sub>2 rs\<^sub>2) # rs\<^sub>1)"}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1001	\quad\xrightarrow{bsimp}\quad
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1002	@{term "ALTs bs\<^sub>1 ((map (fuse bs\<^sub>2) rs\<^sub>2) @ rs\<^sub>1)"}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1003	\]
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1004
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1005	\noindent
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1006	where we just need to fuse the bitsequence that has accumulated in @{text "bs\<^sub>2"}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1007	to the alternatives in @{text "rs\<^sub>2"}. As we shall show below this will
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1008	ensure that the correct value corresponding to the original (unsimplified)
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1009	regular expression can still be extracted. %In this way the value construction
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1010	%is not affected by simplification.
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1011
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1012	However there is one problem with the definition for the more
426 5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1013	aggressive simplification rules described by Sulzmann and Lu. Recasting
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1014	their definition with our syntax they define the step of removing
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1015	duplicates as
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1016	%
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1017	\[ @{text "bsimp (ALTs bs rs)"} \dn @{text "ALTs
458 30c91ea7095b updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 436 diff changeset	1018	bs (nub (map bsimp rs))"}
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1019	\]
458 30c91ea7095b updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 436 diff changeset	1020
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1021	\noindent where they first recursively simplify the regular
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1022	expressions in @{text rs} (using @{text map}) and then use
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1023	Haskell's @{text nub}-function to remove potential
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1024	duplicates. While this makes sense when considering the example
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1025	shown in \eqref{derivex}, @{text nub} is an inappropriate
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1026	function in the case of bitcoded regular expressions. The reason
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1027	is that in general the elements in @{text rs} will have a
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1028	different annotated bitsequence and in this way @{text nub}
458 30c91ea7095b updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 436 diff changeset	1029	will never find a duplicate to be removed. One correct way to
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1030	handle this situation is to first \emph{erase} the regular
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1031	expressions when comparing potential duplicates. This is inspired
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1032	by Scala's list functions of the form \mbox{@{text "distinctBy rs f
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1033	acc"}} where a function is applied first before two elements
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1034	are compared. We define this function in Isabelle/HOL as
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1035
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1036	\begin{center}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1037	\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1038	@{thm (lhs) distinctBy.simps(1)} & $\dn$ & @{thm (rhs) distinctBy.simps(1)}\\
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1039	@{thm (lhs) distinctBy.simps(2)} & $\dn$ & @{thm (rhs) distinctBy.simps(2)}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1040	\end{tabular}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1041	\end{center}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1042
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1043	\noindent where we scan the list from left to right (because we
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1044	have to remove later copies). In @{text distinctBy}, @{text f} is a
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1045	function and @{text acc} is an accumulator for regular
436 222333d2bdc2 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 426 diff changeset	1046	expressions---essentially a set of regular expressions that we have already seen
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1047	while scanning the list. Therefore we delete an element, say @{text x},
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1048	from the list provided @{text "f x"} is already in the accumulator;
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1049	otherwise we keep @{text x} and scan the rest of the list but
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1050	add @{text "f x"} as another ``seen'' element to @{text acc}. We will use
458 30c91ea7095b updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 436 diff changeset	1051	@{term distinctBy} where @{text f} is the erase function, @{term "erase (DUMMY)"},
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1052	that deletes bitsequences from bitcoded regular expressions.
461 c4b6906068a9 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 460 diff changeset	1053	This is clearly a computationally more expensive operation than @{text nub},
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1054	but is needed in order to make the removal of unnecessary copies
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1055	work properly.
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1056
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1057	Our simplification function depends on three helper functions, one of which is called
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1058	@{text flts} and analyses lists of regular expressions coming from alternatives.
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1059	It is defined as follows:
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1060
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1061	\begin{center}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1062	\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1063	@{thm (lhs) flts.simps(1)} & $\dn$ & @{thm (rhs) flts.simps(1)}\\
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1064	@{thm (lhs) flts.simps(2)} & $\dn$ & @{thm (rhs) flts.simps(2)}\\
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1065	@{thm (lhs) flts.simps(3)[of "bs'" "rs'"]} & $\dn$ & @{thm (rhs) flts.simps(3)[of "bs'" "rs'"]}\\
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1066	\end{tabular}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1067	\end{center}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1068
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1069	\noindent
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1070	The second clause of @{text flts} removes all instances of @{text ZERO} in alternatives and
458 30c91ea7095b updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 436 diff changeset	1071	the third ``spills'' out nested alternatives (while retaining the
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1072	bitsequence @{text "bs'"} accumulated in the inner alternative). There are
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1073	some corner cases to be considered when the resulting list inside an alternative is
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1074	empty or a singleton list. We take care of those cases in the
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1075	@{text "bsimpALTs"} function; similarly we define a helper function that simplifies
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1076	sequences according to the usual rules about @{text ZERO}s and @{text ONE}s:
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1077
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1078	\begin{center}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1079	\begin{tabular}{c@ {\hspace{5mm}}c}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1080	\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1081	@{text "bsimpALTs bs []"} & $\dn$ & @{text "ZERO"}\\
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1082	@{text "bsimpALTs bs [r]"} & $\dn$ & @{text "fuse bs r"}\\
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1083	@{text "bsimpALTs bs rs"} & $\dn$ & @{text "ALTs bs rs"}\\
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1084	\mbox{}\\
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1085	\end{tabular}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1086	&
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1087	\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1088	@{text "bsimpSEQ bs _ ZERO"} & $\dn$ & @{text "ZERO"}\\
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1089	@{text "bsimpSEQ bs ZERO _"} & $\dn$ & @{text "ZERO"}\\
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1090	@{text "bsimpSEQ bs\<^sub>1 (ONE bs\<^sub>2) r\<^sub>2"}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1091	& $\dn$ & @{text "fuse (bs\<^sub>1 @ bs\<^sub>2) r\<^sub>2"}\\
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1092	@{text "bsimpSEQ bs r\<^sub>1 r\<^sub>2"} & $\dn$ & @{text "SEQ bs r\<^sub>1 r\<^sub>2"}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1093	\end{tabular}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1094	\end{tabular}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1095	\end{center}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1096
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1097	\noindent
426 5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1098	With this in place we can define our simplification function as
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1099
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1100	\begin{center}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1101	\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1102	@{thm (lhs) bsimp.simps(1)[of "bs" "r\<^sub>1" "r\<^sub>2"]} & $\dn$ &
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1103	@{thm (rhs) bsimp.simps(1)[of "bs" "r\<^sub>1" "r\<^sub>2"]}\\
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1104	@{thm (lhs) bsimp.simps(2)[of "bs" _]} & $\dn$ & @{thm (rhs) bsimp.simps(2)[of "bs" _]}\\
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1105	@{text "bsimp r"} & $\dn$ & @{text r}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1106	\end{tabular}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1107	\end{center}
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1108
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1109	\noindent
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1110	As far as we can see, our recursive function @{term bsimp} simplifies regular
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1111	expressions as intended by Sulzmann and Lu. There is no point in applying the
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1112	@{text bsimp} function repeatedly (like the simplification in their paper which needs to be
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1113	applied until a fixpoint is reached) because we can show that @{term bsimp} is idempotent,
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1114	that is
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1115
2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1116	\begin{proposition}
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1117	@{term "bsimp (bsimp r) = bsimp r"}
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1118	\end{proposition}
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	1119
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1120	\noindent
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1121	This can be proved by induction on @{text r} but requires a detailed analysis
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1122	that the de-nesting of alternatives always results in a flat list of regular
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1123	expressions. We omit the details since they do not concern the correctness proof.
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1124
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1125	Next we can include simplification after each derivative step leading to the
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1126	following notion of bitcoded derivatives:
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1127
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1128	\begin{center}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1129	\begin{tabular}{cc}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1130	\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1131	@{thm (lhs) bders_simp.simps(1)} & $\dn$ & @{thm (rhs) bders_simp.simps(1)}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1132	\end{tabular}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1133	&
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1134	\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1135	@{thm (lhs) bders_simp.simps(2)} & $\dn$ & @{thm (rhs) bders_simp.simps(2)}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1136	\end{tabular}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1137	\end{tabular}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1138	\end{center}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1139
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1140	\noindent
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1141	and use it in the improved lexing algorithm defined as
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1142
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1143	\begin{center}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1144	\begin{tabular}{lcl}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1145	$\textit{blexer}^+\;r\,s$ & $\dn$ &
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1146	$\textit{let}\;r_{der} = (r^\uparrow)\backslash_{bsimp}\, s\;\textit{in}$\\
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1147	& & $\;\;\;\;\textit{if}\; \textit{bnullable}(r_{der}) \;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,r_{der})\,r
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1148	\;\;\textit{else}\;\textit{None}$
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1149	\end{tabular}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1150	\end{center}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1151
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1152	\noindent The remaining task is to show that @{term blexer} and
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1153	@{term "blexer_simp"} generate the same answers.
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1154
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1155	When we first
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1156	attempted this proof we encountered a problem with the idea
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1157	in Sulzmann and Lu's paper where the argument seems to be to appeal
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1158	again to the @{text retrieve}-function defined for the unsimplified version
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1159	of the algorithm. But
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1160	this does not work, because desirable properties such as
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1161	%
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1162	\[
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1163	@{text "retrieve r v = retrieve (bsimp r) v"}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1164	\]
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1165
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1166	\noindent do not hold under simplification---this property
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1167	essentially purports that we can retrieve the same value from a
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1168	simplified version of the regular expression. To start with, @{text retrieve}
474 726f4e65c0fe made paper changes after ITP comments Christian Urban <christian.urban@kcl.ac.uk> parents: 464 diff changeset	1169	depends on the fact that the value @{text v} corresponds to the
464 e6248d2c20c2 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 463 diff changeset	1170	structure of the regular expression @{text r}---but the whole point of simplification
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1171	is to ``destroy'' this structure by making the regular expression simpler.
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1172	To see this consider the regular expression @{text "r = r' + 0"} and a corresponding
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1173	value @{text "v = Left v'"}. If we annotate bitcodes to @{text "r"}, then
464 e6248d2c20c2 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 463 diff changeset	1174	we can use @{text retrieve} with @{text r} and @{text v} in order to extract a corresponding
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1175	bitsequence. The reason that this works is that @{text r} is an alternative
464 e6248d2c20c2 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 463 diff changeset	1176	regular expression and @{text v} a corresponding @{text "Left"}-value. However, if we simplify
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1177	@{text r}, then @{text v} does not correspond to the shape of the regular
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1178	expression anymore. So unless one can somehow
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1179	synchronise the change in the simplified regular expressions with
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1180	the original POSIX value, there is no hope of appealing to @{text retrieve} in the
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1181	correctness argument for @{term blexer_simp}.
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1182
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1183	We found it more helpful to introduce the rewriting systems shown in
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1184	Figure~\ref{SimpRewrites}. The idea is to generate
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1185	simplified regular expressions in small steps (unlike the @{text bsimp}-function which
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1186	does the same in a big step), and show that each of
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1187	the small steps preserves the bitcodes that lead to the final POSIX value.
436 222333d2bdc2 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 426 diff changeset	1188	The rewrite system is organised such that $\leadsto$ is for bitcoded regular
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1189	expressions and $\stackrel{s}{\leadsto}$ for lists of bitcoded regular
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1190	expressions. The former essentially implements the simplifications of
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1191	@{text "bsimpSEQ"} and @{text flts}, while the latter implements the
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1192	simplifications in @{text "bsimpALTs"}. We can show that any bitcoded
436 222333d2bdc2 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 426 diff changeset	1193	regular expression reduces in zero or more steps to the simplified
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1194	regular expression generated by @{text bsimp}:
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1195
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1196	\begin{lemma}\label{lemone}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1197	@{thm[mode=IfThen] rewrites_to_bsimp}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1198	\end{lemma}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1199
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1200	\begin{proof}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1201	By induction on @{text r}. For this we can use the properties
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1202	@{thm fltsfrewrites} and @{thm ss6_stronger}. The latter uses
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1203	repeated applications of the $LD$ rule which allows the removal
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1204	of duplicates that can recognise the same strings.
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1205	\end{proof}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1206
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1207	\noindent
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1208	We can show that this rewrite system preserves @{term bnullable}, that
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1209	is, simplification essentially does not affect nullability:
420 b66a4305749c updated Christian Urban <christian.urban@kcl.ac.uk> parents: 418 diff changeset	1210
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1211	\begin{lemma}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1212	@{thm[mode=IfThen] bnullable0(1)[of "r\<^sub>1" "r\<^sub>2"]}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1213	\end{lemma}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1214
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1215	\begin{proof}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1216	Straightforward mutual induction on the definition of $\leadsto$ and $\stackrel{s}{\leadsto}$.
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1217	The only interesting case is the rule $LD$ where the property holds since by the side-conditions of that rule the empty string will
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1218	be in both @{text "L (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ [r\<^sub>2] @ rs\<^sub>c)"} and
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1219	@{text "L (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ rs\<^sub>c)"}.
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1220	\end{proof}
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1221
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1222	\noindent
426 5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1223	From this, we can show that @{text bmkeps} will produce the same bitsequence
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1224	as long as one of the bitcoded regular expressions in $\leadsto$ is nullable (this lemma
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1225	establishes the missing fact we were not able to establish using @{text retrieve}, as suggested
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1226	in the paper by Sulzmann and Lu).
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1227
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1228
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1229	\begin{lemma}\label{lemthree}
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1230	@{thm[mode=IfThen] rewrite_bmkeps_aux(1)[of "r\<^sub>1" "r\<^sub>2"]}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1231	\end{lemma}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1232
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1233	\begin{proof}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1234	By straightforward mutual induction on the definition of $\leadsto$ and $\stackrel{s}{\leadsto}$.
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1235	Again the only interesting case is the rule $LD$ where we need to ensure that
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1236	\[
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1237	@{text "bmkeps (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ [r\<^sub>2] @ rs\<^sub>c) =
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1238	bmkeps (rs\<^sub>a @ [r\<^sub>1] @ rs\<^sub>b @ rs\<^sub>c)"}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1239	\]
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1240
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1241	\noindent holds. This is indeed the case because according to the POSIX rules the
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1242	generated bitsequence is determined by the first alternative that can match the
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1243	string (in this case being nullable).
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1244	\end{proof}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1245
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1246	\noindent
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1247	Crucial is also the fact that derivative steps and simplification steps can be interleaved,
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1248	which is shown by the fact that $\leadsto$ is preserved under derivatives.
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1249
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1250	\begin{lemma}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1251	@{thm[mode=IfThen] rewrite_preserves_bder(1)[of "r\<^sub>1" "r\<^sub>2"]}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1252	\end{lemma}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1253
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1254	\begin{proof}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1255	By straightforward mutual induction on the definition of $\leadsto$ and $\stackrel{s}{\leadsto}$.
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1256	The case for $LD$ holds because @{term "L (erase (bder c r\<^sub>2)) \<subseteq> L (erase (bder c r\<^sub>1))"}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1257	whenever @{term "L (erase (r\<^sub>2)) \<subseteq> L (erase (r\<^sub>1))"}.
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1258	\end{proof}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1259
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1260
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1261	\noindent
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1262	Using this fact together with Lemma~\ref{lemone} allows us to prove the central lemma
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1263	that the unsimplified
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1264	derivative (with a string @{term s}) reduces to the simplified derivative (with the same string).
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1265
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1266	\begin{lemma}\label{lemtwo}
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1267	@{thm[mode=IfThen] central}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1268	\end{lemma}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1269
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1270	\begin{proof}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1271	By reverse induction on @{term s} generalising over @{text r}.
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1272	\end{proof}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1273
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1274	\noindent
426 5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1275	With these lemmas in place we can finally establish that @{term "blexer_simp"} and @{term "blexer"}
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1276	generate the same value and, using Theorem~\ref{thmone} from the previous section, that this value
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1277	is indeed the POSIX value.
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1278
418 41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1279	\begin{theorem}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1280	@{thm[mode=IfThen] main_blexer_simp}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1281	\end{theorem}
41a2a3b63853 more of the paper Christian Urban <christian.urban@kcl.ac.uk> parents: 416 diff changeset	1282
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1283	\begin{proof}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1284	By unfolding the definitions and using Lemmas~\ref{lemtwo} and \ref{lemthree}.
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1285	\end{proof}
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1286
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1287	\noindent
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1288	This completes the correctness proof for the second POSIX lexing algorithm by Sulzmann and Lu.
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1289	The interesting point of this algorithm is that the sizes of derivatives do not grow arbitrarily, which
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1290	we shall show next.
398 dac6d27c99c6 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 397 diff changeset	1291
dac6d27c99c6 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 397 diff changeset	1292	\begin{figure}[t]
dac6d27c99c6 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 397 diff changeset	1293	\begin{center}
dac6d27c99c6 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 397 diff changeset	1294	\begin{tabular}{c}
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1295	@{thm[mode=Axiom] bs1[of _ "r\<^sub>2"]}$S\ZERO{}_l$\qquad
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1296	@{thm[mode=Axiom] bs2[of _ "r\<^sub>1"]}$S\ZERO{}_r$\\
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1297	@{thm[mode=Axiom] bs3[of "bs\<^sub>1" "bs\<^sub>2"]}$S\ONE$\\
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1298	@{thm[mode=Rule] bs4[of "r\<^sub>1" "r\<^sub>2" _ "r\<^sub>3"]}SL\qquad
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1299	@{thm[mode=Rule] bs5[of "r\<^sub>3" "r\<^sub>4" _ "r\<^sub>1"]}SR\\
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1300	@{thm[mode=Axiom] bs6}$A0$\qquad
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1301	@{thm[mode=Axiom] bs7}$A1$\\
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1302	@{thm[mode=Rule] bs8[of "rs\<^sub>1" "rs\<^sub>2"]}$AL$\\
474 726f4e65c0fe made paper changes after ITP comments Christian Urban <christian.urban@kcl.ac.uk> parents: 464 diff changeset	1303	@{thm[mode=Rule] ss2[of "rs\<^sub>1" "rs\<^sub>2"]}$LT$\qquad
726f4e65c0fe made paper changes after ITP comments Christian Urban <christian.urban@kcl.ac.uk> parents: 464 diff changeset	1304	@{thm[mode=Rule] ss3[of "r\<^sub>1" "r\<^sub>2"]}$LH$\\
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1305	@{thm[mode=Axiom] ss4}$L\ZERO$\qquad
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1306	@{thm[mode=Axiom] ss5[of "bs" "rs\<^sub>1" "rs\<^sub>2"]}$LS$\medskip\\
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1307	@{thm[mode=Rule] ss6[of "r\<^sub>2" "r\<^sub>1" "rs\<^sub>1" "rs\<^sub>2" "rs\<^sub>3"]}$LD$\\
398 dac6d27c99c6 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 397 diff changeset	1308	\end{tabular}
dac6d27c99c6 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 397 diff changeset	1309	\end{center}
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1310	\caption{The rewrite rules that generate simplified regular expressions
14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1311	in small steps: @{term "rrewrite r\<^sub>1 r\<^sub>2"} is for bitcoded regular
461 c4b6906068a9 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 460 diff changeset	1312	expressions and @{term "srewrite rs\<^sub>1 rs\<^sub>2"} for \emph{lists} of bitcoded
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1313	regular expressions. Interesting is the $LD$ rule that allows copies of regular
461 c4b6906068a9 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 460 diff changeset	1314	expressions to be removed provided a regular expression earlier in the list can
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1315	match the same strings.}\label{SimpRewrites}
398 dac6d27c99c6 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 397 diff changeset	1316	\end{figure}
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	1317	*}
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	1318
426 5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1319	section {* Finiteness of Derivatives *}
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1320
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1321	text {*
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1322
436 222333d2bdc2 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 426 diff changeset	1323	In this section let us sketch our argument for why the size of the simplified
426 5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1324	derivatives with the aggressive simplification function is finite. Suppose
436 222333d2bdc2 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 426 diff changeset	1325	we have a size function for bitcoded regular expressions, written
426 5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1326	@{text "\|r\|"}, which counts the number of nodes if we regard $r$ as a tree
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1327	(we omit the precise definition). For this we show that for every $r$
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1328	there exists a bound $N$
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1329	such that
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1330
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1331	\begin{center}
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1332	$\forall s. \; \|@{term "bders_simp r s"}\| < N$
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1333	\end{center}
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1334
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1335	\noindent
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1336	We prove this by induction on $r$. The base cases for @{term AZERO},
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1337	@{term "AONE bs"} and @{term "ACHAR bs c"} are straightforward. The interesting case is
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1338	for sequences of the form @{term "ASEQ bs r\<^sub>1 r\<^sub>2"}. In this case our induction
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1339	hypotheses state $\forall s. \; \|@{term "bders_simp r\<^sub>1 s"}\| < N_1$ and
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1340	$\forall s. \; \|@{term "bders_simp r\<^sub>2 s"}\| < N_2$. We can reason as follows
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1341
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1342	\begin{center}
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1343	\begin{tabular}{lcll}
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1344	& & $ \|@{term "bders_simp (ASEQ bs r\<^sub>1 r\<^sub>2) s"}\|$\\
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1345	& $ = $ & $\|bsimp(ALTs\;bs\;((@{term "bders_simp r\<^sub>1 s"}) \cdot r_2) ::
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1346	[@{term "bders_simp r\<^sub>2 s'"} \;\|\; s' \in Suf\!fix(s)])\| $ & (1) \\
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1347	& $\leq$ &
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1348	$\|distinctBy\,(flts\,((@{term "bders_simp r\<^sub>1 s "}) \cdot r_2) ::
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1349	[@{term "bders_simp r\<^sub>2 s'"} \;\|\; s' \in Suf\!fix(s)])\| + 1 $ & (2) \\
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1350	& $\leq$ & $\|(@{term "bders_simp r\<^sub>1 s"}) \cdot r_2\| +
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1351	\|distinctBy\,(flts\, [@{term "bders_simp r\<^sub>2 s'"} \;\|\; s' \in Suf\!fix(s)])\| + 1 $ & (3) \\
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1352	& $\leq$ & $N_1 + \|r_2\| + 2 + \|distinctBy\,(flts\, [@{term "bders_simp r\<^sub>2 s'"} \;\|\; s' \in Suf\!fix(s)])\|$ & (4)\\
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1353	& $\leq$ & $N_1 + \|r_2\| + 2 + l_{N_{2}} * N_{2}$ & (5)
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1354	\end{tabular}
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1355	\end{center}
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1356
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1357	% tell Chengsong about Indian paper of closed forms of derivatives
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1358
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1359	\noindent
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1360	where in (1) the $Suf\!fix(s)$ are the suffixes $s'$ of $s$ where @{term "bders_simp r\<^sub>1 s''"} is nullable for
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1361	@{text "s = s'' @ s'"}. In (3) we know that $\|(@{term "bders_simp r\<^sub>1 s"}) \cdot r_2\|$ is
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1362	bounded by $N_1 + \|r_2\|$. In (5) we know the list comprehension contains only regular expressions of size smaller
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1363	than $N_2$. The list length after @{text distinctBy} is bounded by a number, which we call $l_{N_2}$. It stands
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1364	for the number of distinct regular expressions with a maximum size $N_2$ (there can only be finitely many of them).
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1365	We reason similarly in the @{text Star}-case.\medskip
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1366
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1367	\noindent
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1368	Clearly we give in this finiteness argument (Step (5)) a very loose bound that is
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1369	far from the actual bound we can expect. We can do better than this, but this does not improve
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1370	the finiteness property we are proving. If we are interested in a polynomial bound,
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1371	one would hope to obtain a similarly tight bound as for partial
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1372	derivatives introduced by Antimirov \cite{Antimirov95}. After all the idea with
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1373	@{text distinctBy} is to maintain a ``set'' of alternatives (like the sets in
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1374	partial derivatives). Unfortunately to obtain the exact same bound would mean
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1375	we need to introduce simplifications such as
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1376	%
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1377	\[ (r_1 + r_2) \cdot r_3 \longrightarrow (r_1 \cdot r_3) + (r_2 \cdot r_3)
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1378	\]
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1379
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1380	\noindent
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1381	which exist for partial derivatives. However, if we introduce them in our
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1382	setting we would lose the POSIX property of our calculated values. We leave better
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1383	bounds for future work.
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1384
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1385	*}
397 e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	1386
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	1387
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	1388	section {* Conclusion *}
e1b74d618f1b updated Sizebound4 Christian Urban <christian.urban@kcl.ac.uk> parents: 396 diff changeset	1389
396 cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1390	text {*
cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1391
426 5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1392	We set out in this work to prove in Isabelle/HOL the correctness of
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1393	the second POSIX lexing algorithm by Sulzmann and Lu
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1394	\cite{Sulzmann2014}. This follows earlier work where we established
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1395	the correctness of the first algorithm
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1396	\cite{AusafDyckhoffUrban2016}. In the earlier work we needed to
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1397	introduce our own specification about what POSIX values are,
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1398	because the informal definition given by Sulzmann and Lu did not
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1399	stand up to a formal proof. Also for the second algorithm we needed
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1400	to introduce our own definitions and proof ideas in order to establish the
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1401	correctness. Our interest in the second algorithm
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1402	lies in the fact that by using bitcoded regular expressions and an aggressive
474 726f4e65c0fe made paper changes after ITP comments Christian Urban <christian.urban@kcl.ac.uk> parents: 464 diff changeset	1403	simplification method there is a chance that the derivatives
426 5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1404	can be kept universally small (we established in this paper that
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1405	they can be kept finite for any string). This is important if one is after
464 e6248d2c20c2 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 463 diff changeset	1406	an efficient POSIX lexing algorithm based on derivatives.
425 14c558ae0b09 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 424 diff changeset	1407
426 5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1408	Having proved the correctness of the POSIX lexing algorithm, which
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1409	lessons have we learned? Well, we feel this is a very good example
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1410	where formal proofs give further insight into the matter at
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1411	hand. For example it is very hard to see a problem with @{text nub}
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1412	vs @{text distinctBy} with only experimental data---one would still
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1413	see the correct result but find that simplification does not simplify in well-chosen, but not
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1414	obscure, examples. We found that from an implementation
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1415	point-of-view it is really important to have the formal proofs of
462 d9b672c4c0ac updated Christian Urban <christian.urban@kcl.ac.uk> parents: 461 diff changeset	1416	the corresponding properties at hand.
d9b672c4c0ac updated Christian Urban <christian.urban@kcl.ac.uk> parents: 461 diff changeset	1417
d9b672c4c0ac updated Christian Urban <christian.urban@kcl.ac.uk> parents: 461 diff changeset	1418	We have also developed a
426 5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1419	healthy suspicion when experimental data is used to back up
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1420	efficiency claims. For example Sulzmann and Lu write about their
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1421	equivalent of @{term blexer_simp} ``...we can incrementally compute
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1422	bitcoded parse trees in linear time in the size of the input''
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1423	\cite[Page 14]{Sulzmann2014}.
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1424	Given the growth of the
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1425	derivatives in some cases even after aggressive simplification, this
464 e6248d2c20c2 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 463 diff changeset	1426	is a hard-to-believe claim. A similar claim about a theoretical runtime
459 484403cf0c9d updated Christian Urban <christian.urban@kcl.ac.uk> parents: 458 diff changeset	1427	of @{text "O(n\<^sup>2)"} is made for the Verbatim lexer, which calculates
474 726f4e65c0fe made paper changes after ITP comments Christian Urban <christian.urban@kcl.ac.uk> parents: 464 diff changeset	1428	tokens according to POSIX rules~\cite{verbatim}. For this Verbatim uses Brzozowski's
462 d9b672c4c0ac updated Christian Urban <christian.urban@kcl.ac.uk> parents: 461 diff changeset	1429	derivatives like in our work.
474 726f4e65c0fe made paper changes after ITP comments Christian Urban <christian.urban@kcl.ac.uk> parents: 464 diff changeset	1430	The authors write: ``The results of our empirical tests [..] confirm that Verbatim has
461 c4b6906068a9 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 460 diff changeset	1431	@{text "O(n\<^sup>2)"} time complexity.'' \cite[Section~VII]{verbatim}.
459 484403cf0c9d updated Christian Urban <christian.urban@kcl.ac.uk> parents: 458 diff changeset	1432	While their correctness proof for Verbatim is formalised in Coq, the claim about
461 c4b6906068a9 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 460 diff changeset	1433	the runtime complexity is only supported by some empirical evidence obtained
c4b6906068a9 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 460 diff changeset	1434	by using the code extraction facilities of Coq.
464 e6248d2c20c2 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 463 diff changeset	1435	Given our observation with the ``growth problem'' of derivatives,
460 6e269f557fc5 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 459 diff changeset	1436	we
6e269f557fc5 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 459 diff changeset	1437	tried out their extracted OCaml code with the example
6e269f557fc5 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 459 diff changeset	1438	\mbox{@{text "(a + aa)\<^sup>*"}} as a single lexing rule, and it took us around 5 minutes to tokenise a
459 484403cf0c9d updated Christian Urban <christian.urban@kcl.ac.uk> parents: 458 diff changeset	1439	string of 40 $a$'s and that increased to approximately 19 minutes when the
464 e6248d2c20c2 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 463 diff changeset	1440	string is 50 $a$'s long. Taking into account that derivatives are not simplified in the Verbatim
460 6e269f557fc5 updated paper Christian Urban <christian.urban@kcl.ac.uk> parents: 459 diff changeset	1441	lexer, such numbers are not surprising.
426 5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1442	Clearly our result of having finite
459 484403cf0c9d updated Christian Urban <christian.urban@kcl.ac.uk> parents: 458 diff changeset	1443	derivatives might sound rather weak in this context but we think such efficiency claims
484403cf0c9d updated Christian Urban <christian.urban@kcl.ac.uk> parents: 458 diff changeset	1444	really require further scrutiny.\medskip
426 5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1445
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1446	\noindent
5b77220fdf01 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 425 diff changeset	1447	Our Isabelle/HOL code is available under \url{https://github.com/urbanchr/posix}.
424 2416fdec6396 updated Christian Urban <christian.urban@kcl.ac.uk> parents: 423 diff changeset	1448
396 cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1449
cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1450	%%\bibliographystyle{plain}
cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1451	\bibliography{root}
cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1452	*}
cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1453
cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1454	(<)
cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1455	end
cc8e231529fb added ITP paper Christian Urban <christian.urban@kcl.ac.uk> parents: diff changeset	1456	(>)

author	Chengsong
	Mon, 09 May 2022 17:17:52 +0100
changeset 511	47618d607bbf
parent 474	726f4e65c0fe
permissions	-rw-r--r--