# HG changeset patch # User Christian Urban # Date 1534857259 -3600 # Node ID 25b7d6bfd294fd9284d8a7f9336b997e6c40974f # Parent ed3169a567eab3d765f86ab1698f6417d97b233f updated diff -r ed3169a567ea -r 25b7d6bfd294 thys/Journal/PaperExt.thy --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/thys/Journal/PaperExt.thy Tue Aug 21 14:14:19 2018 +0100 @@ -0,0 +1,323 @@ +(*<*) +theory PaperExt +imports + (*"../LexerExt"*) + (*"../PositionsExt"*) + "~~/src/HOL/Library/LaTeXsugar" +begin +(*>*) + +(* +declare [[show_question_marks = false]] +declare [[eta_contract = false]] + +abbreviation + "der_syn r c \ der c r" + +abbreviation + "ders_syn r s \ ders s r" + + +notation (latex output) + If ("(\<^latex>\\\textrm{\if\<^latex>\}\ (_)/ \<^latex>\\\textrm{\then\<^latex>\}\ (_)/ \<^latex>\\\textrm{\else\<^latex>\}\ (_))" 10) and + Cons ("_\<^latex>\\\mbox{$\\,$}\::\<^latex>\\\mbox{$\\,$}\_" [75,73] 73) and + + ZERO ("\<^bold>0" 81) and + ONE ("\<^bold>1" 81) and + CHAR ("_" [1000] 80) and + ALT ("_ + _" [77,77] 78) and + SEQ ("_ \ _" [77,77] 78) and + STAR ("_\<^sup>\" [78] 78) and + NTIMES ("_\<^bsup>'{_'}\<^esup>" [78, 50] 80) and + FROMNTIMES ("_\<^bsup>'{_..'}\<^esup>" [78, 50] 80) and + UPNTIMES ("_\<^bsup>'{.._'}\<^esup>" [78, 50] 80) and + NMTIMES ("_\<^bsup>'{_.._'}\<^esup>" [78, 50,50] 80) and + + val.Void ("Empty" 78) and + val.Char ("Char _" [1000] 78) and + val.Left ("Left _" [79] 78) and + val.Right ("Right _" [1000] 78) and + val.Seq ("Seq _ _" [79,79] 78) and + val.Stars ("Stars _" [1000] 78) and + + L ("L'(_')" [10] 78) and + LV ("LV _ _" [80,73] 78) and + der_syn ("_\\_" [79, 1000] 76) and + ders_syn ("_\\_" [79, 1000] 76) and + flat ("|_|" [75] 74) and + flats ("|_|" [72] 74) and + Sequ ("_ @ _" [78,77] 63) and + injval ("inj _ _ _" [81,77,79] 76) and + mkeps ("mkeps _" [79] 76) and + length ("len _" [73] 73) and + intlen ("len _" [73] 73) and + set ("_" [73] 73) and + + Prf ("_ : _" [75,75] 75) and + Posix ("'(_, _') \ _" [63,75,75] 75) and + + lexer ("lexer _ _" [78,78] 77) and + DUMMY ("\<^latex>\\\underline{\\hspace{2mm}}\") +*) +(*>*) + +(* +section {* Extensions*} + +text {* + + A strong point in favour of + Sulzmann and Lu's algorithm is that it can be extended in various + ways. If we are interested in tokenising a string, then we need to not just + split up the string into tokens, but also ``classify'' the tokens (for + example whether it is a keyword or an identifier). This can be done with + only minor modifications to the algorithm by introducing \emph{record + regular expressions} and \emph{record values} (for example + \cite{Sulzmann2014b}): + + \begin{center} + @{text "r :="} + @{text "..."} $\mid$ + @{text "(l : r)"} \qquad\qquad + @{text "v :="} + @{text "..."} $\mid$ + @{text "(l : v)"} + \end{center} + + \noindent where @{text l} is a label, say a string, @{text r} a regular + expression and @{text v} a value. All functions can be smoothly extended + to these regular expressions and values. For example \mbox{@{text "(l : + r)"}} is nullable iff @{term r} is, and so on. The purpose of the record + regular expression is to mark certain parts of a regular expression and + then record in the calculated value which parts of the string were matched + by this part. The label can then serve as classification for the tokens. + For this recall the regular expression @{text "(r\<^bsub>key\<^esub> + r\<^bsub>id\<^esub>)\<^sup>\"} for + keywords and identifiers from the Introduction. With the record regular + expression we can form \mbox{@{text "((key : r\<^bsub>key\<^esub>) + (id : r\<^bsub>id\<^esub>))\<^sup>\"}} + and then traverse the calculated value and only collect the underlying + strings in record values. With this we obtain finite sequences of pairs of + labels and strings, for example + + \[@{text "(l\<^sub>1 : s\<^sub>1), ..., (l\<^sub>n : s\<^sub>n)"}\] + + \noindent from which tokens with classifications (keyword-token, + identifier-token and so on) can be extracted. + + In the context of POSIX matching, it is also interesting to study additional + constructors about bounded-repetitions of regular expressions. For this let + us extend the results from the previous section to the following four + additional regular expression constructors: + + \begin{center} + \begin{tabular}{lcrl@ {\hspace{12mm}}l} + @{text r} & @{text ":="} & $\ldots\mid$ & @{term "NTIMES r n"} & exactly-@{text n}-times\\ + & & $\mid$ & @{term "UPNTIMES r n"} & upto-@{text n}-times\\ + & & $\mid$ & @{term "FROMNTIMES r n"} & from-@{text n}-times\\ + & & $\mid$ & @{term "NMTIMES r n m"} & between-@{text nm}-times\\ + \end{tabular} + \end{center} + + \noindent + We will call them \emph{bounded regular expressions}. + With the help of the power operator (definition ommited) for sets of strings, the languages + recognised by these regular expression can be defined in Isabelle as follows: + + \begin{center} + \begin{tabular}{lcl} + @{thm (lhs) L.simps(8)} & $\dn$ & @{thm (rhs) L.simps(8)}\\ + @{thm (lhs) L.simps(7)} & $\dn$ & @{thm (rhs) L.simps(7)}\\ + @{thm (lhs) L.simps(9)} & $\dn$ & @{thm (rhs) L.simps(9)}\\ + @{thm (lhs) L.simps(10)} & $\dn$ & @{thm (rhs) L.simps(10)}\\ + \end{tabular} + \end{center} + + \noindent This definition implies that in the last clause @{term + "NMTIMES r n m"} matches no string in case @{term "m < n"}, because + then the interval @{term "{n..m}"} is empty. While the language + recognised by these regular expressions is straightforward, some + care is needed for how to define the corresponding lexical + values. First, with a slight abuse of language, we will (re)use + values of the form @{term "Stars vs"} for values inhabited in + bounded regular expressions. Second, we need to introduce inductive + rules for extending our inhabitation relation shown on + Page~\ref{prfintros}, from which we then derived our notion of + lexical values. Given the rule for @{term "STAR r"}, the rule for + @{term "UPNTIMES r n"} just requires additionally that the length of + the list of values must be smaller or equal to @{term n}: + + \begin{center} + @{thm[mode=Rule] Prf.intros(7)[of "vs"]} + \end{center} + + \noindent Like in the @{term "STAR r"}-rule, we require with the + left-premise that some non-empty part of the string is `chipped' + away by \emph{every} value in @{text vs}, that means the corresponding + values do not flatten to the empty string. In the rule for @{term + "NTIMES r n"} (that is exactly-@{term n}-times @{text r}) we clearly need to require + that the length of the list of values equals to @{text n}. But enforcing + that every of these @{term n} values `chipps' away some part of a string + would be too strong. + + + \begin{center} + @{thm[mode=Rule] Prf.intros(8)[of "vs\<^sub>1" r "vs\<^sub>2"]} + \end{center} + + \begin{center} + \begin{tabular}{lcl} + @{thm (lhs) der.simps(8)} & $\dn$ & @{thm (rhs) der.simps(8)}\\ + @{thm (lhs) der.simps(7)} & $\dn$ & @{thm (rhs) der.simps(7)}\\ + @{thm (lhs) der.simps(9)} & $\dn$ & @{thm (rhs) der.simps(9)}\\ + @{thm (lhs) der.simps(10)} & $\dn$ & @{thm (rhs) der.simps(10)}\\ + \end{tabular} + \end{center} + + + \begin{center} + \begin{tabular}{lcl} + @{thm (lhs) mkeps.simps(5)} & $\dn$ & @{thm (rhs) mkeps.simps(5)}\\ + @{thm (lhs) mkeps.simps(6)} & $\dn$ & @{thm (rhs) mkeps.simps(6)}\\ + @{thm (lhs) mkeps.simps(7)} & $\dn$ & @{thm (rhs) mkeps.simps(7)}\\ + @{thm (lhs) mkeps.simps(8)} & $\dn$ & @{thm (rhs) mkeps.simps(8)}\\ + \end{tabular} + \end{center} + + \begin{center} + \begin{tabular}{lcl} + @{thm (lhs) injval.simps(8)} & $\dn$ & @{thm (rhs) injval.simps(8)}\\ + @{thm (lhs) injval.simps(9)} & $\dn$ & @{thm (rhs) injval.simps(9)}\\ + @{thm (lhs) injval.simps(10)} & $\dn$ & @{thm (rhs) injval.simps(10)}\\ + @{thm (lhs) injval.simps(11)} & $\dn$ & @{thm (rhs) injval.simps(11)}\\ + \end{tabular} + \end{center} + + + @{thm [mode=Rule] Posix_NTIMES1[of "s\<^sub>1" r v "s\<^sub>2"]} + + @{thm [mode=Rule] Posix_NTIMES2} + + @{thm [mode=Rule] Posix_UPNTIMES1[of "s\<^sub>1" r v "s\<^sub>2"]} + + @{thm [mode=Rule] Posix_UPNTIMES2} + + @{thm [mode=Rule] Posix_FROMNTIMES1[of "s\<^sub>1" r v "s\<^sub>2"]} + + @{thm [mode=Rule] Posix_FROMNTIMES3[of "s\<^sub>1" r v "s\<^sub>2"]} + + @{thm [mode=Rule] Posix_FROMNTIMES2} + + @{thm [mode=Rule] Posix_NMTIMES1[of "s\<^sub>1" r v "s\<^sub>2"]} + + @{thm [mode=Rule] Posix_NMTIMES3[of "s\<^sub>1" r v "s\<^sub>2"]} + + @{thm [mode=Rule] Posix_NMTIMES2} + + + + @{term "\ i \ {m..n} . P i"} @{term "\ i \ {..n} . P i"} + @{term "\ i \ {m..n} . P i"} @{term "\ i \ {..n} . P i"} + @{term "\ i \ {0::nat..n} . P i"} + + + +*} + + + +section {* The Correctness Argument by Sulzmann and Lu\label{argu} *} + +text {* +% \newcommand{\greedy}{\succcurlyeq_{gr}} + \newcommand{\posix}{>} + + An extended version of \cite{Sulzmann2014} is available at the website of + its first author; this includes some ``proofs'', claimed in + \cite{Sulzmann2014} to be ``rigorous''. Since these are evidently not in + final form, we make no comment thereon, preferring to give general reasons + for our belief that the approach of \cite{Sulzmann2014} is problematic. + Their central definition is an ``ordering relation'' defined by the + rules (slightly adapted to fit our notation): + + ?? + + \noindent The idea behind the rules (A1) and (A2), for example, is that a + @{text Left}-value is bigger than a @{text Right}-value, if the underlying + string of the @{text Left}-value is longer or of equal length to the + underlying string of the @{text Right}-value. The order is reversed, + however, if the @{text Right}-value can match a longer string than a + @{text Left}-value. In this way the POSIX value is supposed to be the + biggest value for a given string and regular expression. + + Sulzmann and Lu explicitly refer to the paper \cite{Frisch2004} by Frisch + and Cardelli from where they have taken the idea for their correctness + proof. Frisch and Cardelli introduced a similar ordering for GREEDY + matching and they showed that their GREEDY matching algorithm always + produces a maximal element according to this ordering (from all possible + solutions). The only difference between their GREEDY ordering and the + ``ordering'' by Sulzmann and Lu is that GREEDY always prefers a @{text + Left}-value over a @{text Right}-value, no matter what the underlying + string is. This seems to be only a very minor difference, but it has + drastic consequences in terms of what properties both orderings enjoy. + What is interesting for our purposes is that the properties reflexivity, + totality and transitivity for this GREEDY ordering can be proved + relatively easily by induction. +*} +*) + +section {* Conclusion *} + +text {* + + We have implemented the POSIX value calculation algorithm introduced by + Sulzmann and Lu + \cite{Sulzmann2014}. Our implementation is nearly identical to the + original and all modifications we introduced are harmless (like our char-clause for + @{text inj}). We have proved this algorithm to be correct, but correct + according to our own specification of what POSIX values are. Our + specification (inspired from work by Vansummeren \cite{Vansummeren2006}) appears to be + much simpler than in \cite{Sulzmann2014} and our proofs are nearly always + straightforward. We have attempted to formalise the original proof + by Sulzmann and Lu \cite{Sulzmann2014}, but we believe it contains + unfillable gaps. In the online version of \cite{Sulzmann2014}, the authors + already acknowledge some small problems, but our experience suggests + that there are more serious problems. + + Having proved the correctness of the POSIX lexing algorithm in + \cite{Sulzmann2014}, which lessons have we learned? Well, this is a + perfect example for the importance of the \emph{right} definitions. We + have (on and off) explored mechanisations as soon as first versions + of \cite{Sulzmann2014} appeared, but have made little progress with + turning the relatively detailed proof sketch in \cite{Sulzmann2014} into a + formalisable proof. Having seen \cite{Vansummeren2006} and adapted the + POSIX definition given there for the algorithm by Sulzmann and Lu made all + the difference: the proofs, as said, are nearly straightforward. The + question remains whether the original proof idea of \cite{Sulzmann2014}, + potentially using our result as a stepping stone, can be made to work? + Alas, we really do not know despite considerable effort. + + + Closely related to our work is an automata-based lexer formalised by + Nipkow \cite{Nipkow98}. This lexer also splits up strings into longest + initial substrings, but Nipkow's algorithm is not completely + computational. The algorithm by Sulzmann and Lu, in contrast, can be + implemented with ease in any functional language. A bespoke lexer for the + Imp-language is formalised in Coq as part of the Software Foundations book + by Pierce et al \cite{Pierce2015}. The disadvantage of such bespoke lexers is that they + do not generalise easily to more advanced features. + Our formalisation is available from the Archive of Formal Proofs \cite{aduAFP16} + under \url{http://www.isa-afp.org/entries/Posix-Lexing.shtml}.\medskip + + \noindent + {\bf Acknowledgements:} + We are very grateful to Martin Sulzmann for his comments on our work and + moreover for patiently explaining to us the details in \cite{Sulzmann2014}. We + also received very helpful comments from James Cheney and anonymous referees. + % \small + \bibliographystyle{plain} + \bibliography{root} + +*} + +(*<*) +end +(*>*) \ No newline at end of file diff -r ed3169a567ea -r 25b7d6bfd294 thys/SpecAlts.thy --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/thys/SpecAlts.thy Tue Aug 21 14:14:19 2018 +0100 @@ -0,0 +1,762 @@ + +theory SpecAlts + imports Main "~~/src/HOL/Library/Sublist" +begin + +section {* Sequential Composition of Languages *} + +definition + Sequ :: "string set \ string set \ string set" ("_ ;; _" [100,100] 100) +where + "A ;; B = {s1 @ s2 | s1 s2. s1 \ A \ s2 \ B}" + +text {* Two Simple Properties about Sequential Composition *} + +lemma Sequ_empty_string [simp]: + shows "A ;; {[]} = A" + and "{[]} ;; A = A" +by (simp_all add: Sequ_def) + +lemma Sequ_empty [simp]: + shows "A ;; {} = {}" + and "{} ;; A = {}" +by (simp_all add: Sequ_def) + + +section {* Semantic Derivative (Left Quotient) of Languages *} + +definition + Der :: "char \ string set \ string set" +where + "Der c A \ {s. c # s \ A}" + +definition + Ders :: "string \ string set \ string set" +where + "Ders s A \ {s'. s @ s' \ A}" + +lemma Der_null [simp]: + shows "Der c {} = {}" +unfolding Der_def +by auto + +lemma Der_empty [simp]: + shows "Der c {[]} = {}" +unfolding Der_def +by auto + +lemma Der_char [simp]: + shows "Der c {[d]} = (if c = d then {[]} else {})" +unfolding Der_def +by auto + +lemma Der_union [simp]: + shows "Der c (A \ B) = Der c A \ Der c B" +unfolding Der_def + by auto + +lemma Der_Union [simp]: + shows "Der c (\B. A) = (\B. Der c A)" +unfolding Der_def +by auto + +lemma Der_Sequ [simp]: + shows "Der c (A ;; B) = (Der c A) ;; B \ (if [] \ A then Der c B else {})" +unfolding Der_def Sequ_def +by (auto simp add: Cons_eq_append_conv) + + +section {* Kleene Star for Languages *} + +inductive_set + Star :: "string set \ string set" ("_\" [101] 102) + for A :: "string set" +where + start[intro]: "[] \ A\" +| step[intro]: "\s1 \ A; s2 \ A\\ \ s1 @ s2 \ A\" + +(* Arden's lemma *) + +lemma Star_cases: + shows "A\ = {[]} \ A ;; A\" +unfolding Sequ_def +by (auto) (metis Star.simps) + +lemma Star_decomp: + assumes "c # x \ A\" + shows "\s1 s2. x = s1 @ s2 \ c # s1 \ A \ s2 \ A\" +using assms +by (induct x\"c # x" rule: Star.induct) + (auto simp add: append_eq_Cons_conv) + +lemma Star_Der_Sequ: + shows "Der c (A\) \ (Der c A) ;; A\" +unfolding Der_def Sequ_def +by(auto simp add: Star_decomp) + + +lemma Der_star [simp]: + shows "Der c (A\) = (Der c A) ;; A\" +proof - + have "Der c (A\) = Der c ({[]} \ A ;; A\)" + by (simp only: Star_cases[symmetric]) + also have "... = Der c (A ;; A\)" + by (simp only: Der_union Der_empty) (simp) + also have "... = (Der c A) ;; A\ \ (if [] \ A then Der c (A\) else {})" + by simp + also have "... = (Der c A) ;; A\" + using Star_Der_Sequ by auto + finally show "Der c (A\) = (Der c A) ;; A\" . +qed + + +section {* Regular Expressions *} + +datatype rexp = + ZERO +| ONE +| CHAR char +| SEQ rexp rexp +| ALTS "rexp list" +| STAR rexp + +section {* Semantics of Regular Expressions *} + +fun + L :: "rexp \ string set" +where + "L (ZERO) = {}" +| "L (ONE) = {[]}" +| "L (CHAR c) = {[c]}" +| "L (SEQ r1 r2) = (L r1) ;; (L r2)" +| "L (ALTS rs) = (\r \ set rs. L r)" +| "L (STAR r) = (L r)\" + + +section {* Nullable, Derivatives *} + +fun + nullable :: "rexp \ bool" +where + "nullable (ZERO) = False" +| "nullable (ONE) = True" +| "nullable (CHAR c) = False" +| "nullable (ALTS rs) = (\r \ set rs. nullable r)" +| "nullable (SEQ r1 r2) = (nullable r1 \ nullable r2)" +| "nullable (STAR r) = True" + + +fun + der :: "char \ rexp \ rexp" +where + "der c (ZERO) = ZERO" +| "der c (ONE) = ZERO" +| "der c (CHAR d) = (if c = d then ONE else ZERO)" +| "der c (ALTS rs) = ALTS (map (der c) rs)" +| "der c (SEQ r1 r2) = + (if nullable r1 + then ALTS [SEQ (der c r1) r2, der c r2] + else SEQ (der c r1) r2)" +| "der c (STAR r) = SEQ (der c r) (STAR r)" + +fun + ders :: "string \ rexp \ rexp" +where + "ders [] r = r" +| "ders (c # s) r = ders s (der c r)" + + +lemma nullable_correctness: + shows "nullable r \ [] \ (L r)" +by (induct r) (auto simp add: Sequ_def) + +lemma der_correctness: + shows "L (der c r) = Der c (L r)" + apply(induct r) + apply(simp_all add: nullable_correctness) + apply(auto simp add: Der_def) + done + +lemma ders_correctness: + shows "L (ders s r) = Ders s (L r)" +by (induct s arbitrary: r) + (simp_all add: Ders_def der_correctness Der_def) + +fun flats :: "rexp list \ rexp list" + where + "flats [] = []" +| "flats (ZERO # rs1) = flats(rs1)" +| "flats ((ALTS rs1) #rs2) = rs1 @ (flats rs2)" +| "flats (r1 # rs2) = r1 # flats rs2" + +fun simp_SEQ where + "simp_SEQ ONE r\<^sub>2 = r\<^sub>2" +| "simp_SEQ r\<^sub>1 ONE = r\<^sub>1" +| "simp_SEQ r\<^sub>1 r\<^sub>2 = SEQ r\<^sub>1 r\<^sub>2" + +fun + simp :: "rexp \ rexp" +where + "simp (ALTS rs) = ALTS (remdups (flats (map simp rs)))" +| "simp (SEQ r1 r2) = simp_SEQ (simp r1) (simp r2)" +| "simp r = r" + +lemma simp_SEQ_correctness: + shows "L (simp_SEQ r1 r2) = L (SEQ r1 r2)" + apply(induct r1 r2 rule: simp_SEQ.induct) + apply(simp_all) + done + +lemma flats_correctness: + shows "(\r \ set (flats rs). L r) = L (ALTS rs)" + apply(induct rs rule: flats.induct) + apply(simp_all) + done + + +lemma simp_correctness: + shows "L (simp r) = L r" + apply(induct r) + apply(simp_all) + apply(simp add: simp_SEQ_correctness) + apply(simp add: flats_correctness) + done + +fun + ders2 :: "string \ rexp \ rexp" +where + "ders2 [] r = r" +| "ders2 (c # s) r = ders2 s (simp (der c r))" + +lemma ders2_ZERO: + shows "ders2 s ZERO = ZERO" + apply(induct s) + apply(simp_all) + done + +lemma ders2_ONE: + shows "ders2 s ONE \ {ZERO, ONE}" + apply(induct s) + apply(simp_all) + apply(auto) + apply(case_tac s) + apply(auto) + apply(case_tac s) + apply(auto) + done + +lemma ders2_CHAR: + shows "ders2 s (CHAR c) \ {ZERO, ONE, CHAR c}" + apply(induct s) + apply(simp_all) + apply(auto simp add: ders2_ZERO) + apply(case_tac s) + apply(auto simp add: ders2_ZERO) + using ders2_ONE + apply(auto)[1] + using ders2_ONE + apply(auto)[1] + done + +lemma remdup_size: + shows "size_list f (remdups rs) \ size_list f rs" + apply(induct rs) + apply(simp_all) + done + +lemma flats_append: + shows "flats (rs1 @ rs2) = (flats rs1) @ (flats rs2)" + apply(induct rs1 arbitrary: rs2) + apply(auto) + apply(case_tac a) + apply(auto) + done + +lemma flats_Cons: + shows "flats (r # rs) = (flats [r]) @ (flats rs)" + apply(subst flats_append[symmetric]) + apply(simp) + done + +lemma flats_size: + shows "size_list (\x. size (ders2 s x)) (flats rs) \ size_list (\x. size (ders2 s x)) rs" + apply(induct rs arbitrary: s rule: flats.induct) + apply(simp_all) + apply(simp add: ders2_ZERO) + apply (simp add: le_SucI) + + apply(subst flats_Cons) + apply(simp) + apply(case_tac a) + apply(auto) + apply(simp add: ders2_ZERO) + apply (simp add: le_SucI) + sorry + +lemma ders2_ALTS: + shows "size (ders2 s (ALTS rs)) \ size (ALTS (map (ders2 s) rs))" + apply(induct s arbitrary: rs) + apply(simp_all) + thm size_list_pointwise + apply (simp add: size_list_pointwise) + apply(drule_tac x="remdups (flats (map (simp \ der a) rs))" in meta_spec) + apply(rule le_trans) + apply(assumption) + apply(simp) + apply(rule le_trans) + apply(rule remdup_size) + apply(simp add: comp_def) + apply(rule le_trans) + apply(rule flats_size) + by (simp add: size_list_pointwise) + +definition + "derss2 A r = {ders2 s r | s. s \ A}" + +lemma + "\rd \ derss2 (UNIV) r. size rd \ Suc (size r)" + apply(induct r) + apply(auto simp add: derss2_def ders2_ZERO)[1] + apply(auto simp add: derss2_def ders2_ZERO)[1] + using ders2_ONE + apply(auto)[1] + apply (metis rexp.size(7) rexp.size(8) zero_le) + using ders2_CHAR + apply(auto)[1] + apply (smt derss2_def le_SucI le_zero_eq mem_Collect_eq rexp.size(7) rexp.size(8) rexp.size(9)) + defer + apply(auto simp add: derss2_def) + apply(rule le_trans) + apply(rule ders2_ALTS) + apply(simp) + apply(simp add: comp_def) + + apply(simp add: size_list_pointwise) + apply(case_tac s) + apply(simp) + apply(simp only:) + apply(auto)[1] + + apply(case_tac s) + apply(simp) + apply(simp) + +section {* Values *} + +datatype val = + Void +| Char char +| Seq val val +| Nth nat val +| Stars "val list" + + +section {* The string behind a value *} + +fun + flat :: "val \ string" +where + "flat (Void) = []" +| "flat (Char c) = [c]" +| "flat (Nth n v) = flat v" +| "flat (Seq v1 v2) = (flat v1) @ (flat v2)" +| "flat (Stars []) = []" +| "flat (Stars (v#vs)) = (flat v) @ (flat (Stars vs))" + +abbreviation + "flats vs \ concat (map flat vs)" + +lemma flat_Stars [simp]: + "flat (Stars vs) = flats vs" +by (induct vs) (auto) + +lemma Star_concat: + assumes "\s \ set ss. s \ A" + shows "concat ss \ A\" +using assms by (induct ss) (auto) + +lemma Star_cstring: + assumes "s \ A\" + shows "\ss. concat ss = s \ (\s \ set ss. s \ A \ s \ [])" +using assms +apply(induct rule: Star.induct) +apply(auto)[1] +apply(rule_tac x="[]" in exI) +apply(simp) +apply(erule exE) +apply(clarify) +apply(case_tac "s1 = []") +apply(rule_tac x="ss" in exI) +apply(simp) +apply(rule_tac x="s1#ss" in exI) +apply(simp) +done + + +section {* Lexical Values *} + +inductive + Prf :: "val \ rexp \ bool" ("\ _ : _" [100, 100] 100) +where + "\\ v1 : r1; \ v2 : r2\ \ \ Seq v1 v2 : SEQ r1 r2" +| "\\ v1 : (nth rs n); n < length rs\ \ \ (Nth n v1) : ALTS rs" +| "\ Void : ONE" +| "\ Char c : CHAR c" +| "\v \ set vs. \ v : r \ flat v \ [] \ \ Stars vs : STAR r" + +inductive_cases Prf_elims: + "\ v : ZERO" + "\ v : SEQ r1 r2" + "\ v : ALTS rs" + "\ v : ONE" + "\ v : CHAR c" + "\ vs : STAR r" + +lemma Prf_Stars_appendE: + assumes "\ Stars (vs1 @ vs2) : STAR r" + shows "\ Stars vs1 : STAR r \ \ Stars vs2 : STAR r" +using assms +by (auto intro: Prf.intros elim!: Prf_elims) + + +lemma Star_cval: + assumes "\s\set ss. \v. s = flat v \ \ v : r" + shows "\vs. flats vs = concat ss \ (\v\set vs. \ v : r \ flat v \ [])" +using assms +apply(induct ss) +apply(auto) +apply(rule_tac x="[]" in exI) +apply(simp) +apply(case_tac "flat v = []") +apply(rule_tac x="vs" in exI) +apply(simp) +apply(rule_tac x="v#vs" in exI) +apply(simp) +done + + +lemma L_flat_Prf1: + assumes "\ v : r" + shows "flat v \ L r" +using assms + apply(induct) + apply(auto simp add: Sequ_def Star_concat) + done + +lemma L_flat_Prf2: + assumes "s \ L r" + shows "\v. \ v : r \ flat v = s" +using assms +proof(induct r arbitrary: s) + case (STAR r s) + have IH: "\s. s \ L r \ \v. \ v : r \ flat v = s" by fact + have "s \ L (STAR r)" by fact + then obtain ss where "concat ss = s" "\s \ set ss. s \ L r \ s \ []" + using Star_cstring by auto + then obtain vs where "flats vs = s" "\v\set vs. \ v : r \ flat v \ []" + using IH Star_cval by metis + then show "\v. \ v : STAR r \ flat v = s" + using Prf.intros(5) flat_Stars by blast +next + case (SEQ r1 r2 s) + then show "\v. \ v : SEQ r1 r2 \ flat v = s" + unfolding Sequ_def L.simps by (fastforce intro: Prf.intros) +next + case (ALTS rs s) + then show "\v. \ v : ALTS rs \ flat v = s" + unfolding L.simps + apply(auto) + apply(case_tac rs) + apply(simp) + apply(simp) + apply(auto) + apply(drule_tac x="a" in meta_spec) + apply(simp) + apply(drule_tac x="s" in meta_spec) + apply(simp) + apply(erule exE) + apply(rule_tac x="Nth 0 v" in exI) + apply(simp) + apply(rule Prf.intros) + apply(simp) + apply(simp) + apply(drule_tac x="x" in meta_spec) + apply(simp) + apply(drule_tac x="s" in meta_spec) + apply(simp) + apply(erule exE) + apply(subgoal_tac "\n. nth list n = x \ n < length list") + apply(erule exE) + apply(rule_tac x="Nth (Suc n) v" in exI) + apply(simp) + apply(rule Prf.intros) + apply(simp) + apply(simp) + by (meson in_set_conv_nth) +qed (auto intro: Prf.intros) + + +lemma L_flat_Prf: + shows "L(r) = {flat v | v. \ v : r}" +using L_flat_Prf1 L_flat_Prf2 by blast + + + +section {* Sets of Lexical Values *} + +text {* + Shows that lexical values are finite for a given regex and string. +*} + +definition + LV :: "rexp \ string \ val set" +where "LV r s \ {v. \ v : r \ flat v = s}" + +lemma LV_simps: + shows "LV ZERO s = {}" + and "LV ONE s = (if s = [] then {Void} else {})" + and "LV (CHAR c) s = (if s = [c] then {Char c} else {})" +unfolding LV_def +by (auto intro: Prf.intros elim: Prf.cases) + + +abbreviation + "Prefixes s \ {s'. prefix s' s}" + +abbreviation + "Suffixes s \ {s'. suffix s' s}" + +abbreviation + "SSuffixes s \ {s'. strict_suffix s' s}" + +lemma Suffixes_cons [simp]: + shows "Suffixes (c # s) = Suffixes s \ {c # s}" +by (auto simp add: suffix_def Cons_eq_append_conv) + + +lemma finite_Suffixes: + shows "finite (Suffixes s)" +by (induct s) (simp_all) + +lemma finite_SSuffixes: + shows "finite (SSuffixes s)" +proof - + have "SSuffixes s \ Suffixes s" + unfolding strict_suffix_def suffix_def by auto + then show "finite (SSuffixes s)" + using finite_Suffixes finite_subset by blast +qed + +lemma finite_Prefixes: + shows "finite (Prefixes s)" +proof - + have "finite (Suffixes (rev s))" + by (rule finite_Suffixes) + then have "finite (rev ` Suffixes (rev s))" by simp + moreover + have "rev ` (Suffixes (rev s)) = Prefixes s" + unfolding suffix_def prefix_def image_def + by (auto)(metis rev_append rev_rev_ident)+ + ultimately show "finite (Prefixes s)" by simp +qed + +lemma LV_STAR_finite: + assumes "\s. finite (LV r s)" + shows "finite (LV (STAR r) s)" +proof(induct s rule: length_induct) + fix s::"char list" + assume "\s'. length s' < length s \ finite (LV (STAR r) s')" + then have IH: "\s' \ SSuffixes s. finite (LV (STAR r) s')" + by (force simp add: strict_suffix_def suffix_def) + define f where "f \ \(v, vs). Stars (v # vs)" + define S1 where "S1 \ \s' \ Prefixes s. LV r s'" + define S2 where "S2 \ \s2 \ SSuffixes s. Stars -` (LV (STAR r) s2)" + have "finite S1" using assms + unfolding S1_def by (simp_all add: finite_Prefixes) + moreover + with IH have "finite S2" unfolding S2_def + by (auto simp add: finite_SSuffixes inj_on_def finite_vimageI) + ultimately + have "finite ({Stars []} \ f ` (S1 \ S2))" by simp + moreover + have "LV (STAR r) s \ {Stars []} \ f ` (S1 \ S2)" + unfolding S1_def S2_def f_def + unfolding LV_def image_def prefix_def strict_suffix_def + apply(auto) + apply(case_tac x) + apply(auto elim: Prf_elims) + apply(erule Prf_elims) + apply(auto) + apply(case_tac vs) + apply(auto intro: Prf.intros) + apply(rule exI) + apply(rule conjI) + apply(rule_tac x="flat a" in exI) + apply(rule conjI) + apply(rule_tac x="flats list" in exI) + apply(simp) + apply(blast) + apply(simp add: suffix_def) + using Prf.intros(5) by blast + ultimately + show "finite (LV (STAR r) s)" by (simp add: finite_subset) +qed + + +lemma LV_finite: + shows "finite (LV r s)" +proof(induct r arbitrary: s) + case (ZERO s) + show "finite (LV ZERO s)" by (simp add: LV_simps) +next + case (ONE s) + show "finite (LV ONE s)" by (simp add: LV_simps) +next + case (CHAR c s) + show "finite (LV (CHAR c) s)" by (simp add: LV_simps) +next + case (ALTS rs s) + then show "finite (LV (ALTS rs) s)" + sorry +next + case (SEQ r1 r2 s) + define f where "f \ \(v1, v2). Seq v1 v2" + define S1 where "S1 \ \s' \ Prefixes s. LV r1 s'" + define S2 where "S2 \ \s' \ Suffixes s. LV r2 s'" + have IHs: "\s. finite (LV r1 s)" "\s. finite (LV r2 s)" by fact+ + then have "finite S1" "finite S2" unfolding S1_def S2_def + by (simp_all add: finite_Prefixes finite_Suffixes) + moreover + have "LV (SEQ r1 r2) s \ f ` (S1 \ S2)" + unfolding f_def S1_def S2_def + unfolding LV_def image_def prefix_def suffix_def + apply (auto elim!: Prf_elims) + by (metis (mono_tags, lifting) mem_Collect_eq) + ultimately + show "finite (LV (SEQ r1 r2) s)" + by (simp add: finite_subset) +next + case (STAR r s) + then show "finite (LV (STAR r) s)" by (simp add: LV_STAR_finite) +qed + + +(* +section {* Our POSIX Definition *} + +inductive + Posix :: "string \ rexp \ val \ bool" ("_ \ _ \ _" [100, 100, 100] 100) +where + Posix_ONE: "[] \ ONE \ Void" +| Posix_CHAR: "[c] \ (CHAR c) \ (Char c)" +| Posix_ALT1: "s \ r1 \ v \ s \ (ALT r1 r2) \ (Left v)" +| Posix_ALT2: "\s \ r2 \ v; s \ L(r1)\ \ s \ (ALT r1 r2) \ (Right v)" +| Posix_SEQ: "\s1 \ r1 \ v1; s2 \ r2 \ v2; + \(\s\<^sub>3 s\<^sub>4. s\<^sub>3 \ [] \ s\<^sub>3 @ s\<^sub>4 = s2 \ (s1 @ s\<^sub>3) \ L r1 \ s\<^sub>4 \ L r2)\ \ + (s1 @ s2) \ (SEQ r1 r2) \ (Seq v1 v2)" +| Posix_STAR1: "\s1 \ r \ v; s2 \ STAR r \ Stars vs; flat v \ []; + \(\s\<^sub>3 s\<^sub>4. s\<^sub>3 \ [] \ s\<^sub>3 @ s\<^sub>4 = s2 \ (s1 @ s\<^sub>3) \ L r \ s\<^sub>4 \ L (STAR r))\ + \ (s1 @ s2) \ STAR r \ Stars (v # vs)" +| Posix_STAR2: "[] \ STAR r \ Stars []" + +inductive_cases Posix_elims: + "s \ ZERO \ v" + "s \ ONE \ v" + "s \ CHAR c \ v" + "s \ ALT r1 r2 \ v" + "s \ SEQ r1 r2 \ v" + "s \ STAR r \ v" + +lemma Posix1: + assumes "s \ r \ v" + shows "s \ L r" "flat v = s" +using assms +by (induct s r v rule: Posix.induct) + (auto simp add: Sequ_def) + +text {* + Our Posix definition determines a unique value. +*} + +lemma Posix_determ: + assumes "s \ r \ v1" "s \ r \ v2" + shows "v1 = v2" +using assms +proof (induct s r v1 arbitrary: v2 rule: Posix.induct) + case (Posix_ONE v2) + have "[] \ ONE \ v2" by fact + then show "Void = v2" by cases auto +next + case (Posix_CHAR c v2) + have "[c] \ CHAR c \ v2" by fact + then show "Char c = v2" by cases auto +next + case (Posix_ALT1 s r1 v r2 v2) + have "s \ ALT r1 r2 \ v2" by fact + moreover + have "s \ r1 \ v" by fact + then have "s \ L r1" by (simp add: Posix1) + ultimately obtain v' where eq: "v2 = Left v'" "s \ r1 \ v'" by cases auto + moreover + have IH: "\v2. s \ r1 \ v2 \ v = v2" by fact + ultimately have "v = v'" by simp + then show "Left v = v2" using eq by simp +next + case (Posix_ALT2 s r2 v r1 v2) + have "s \ ALT r1 r2 \ v2" by fact + moreover + have "s \ L r1" by fact + ultimately obtain v' where eq: "v2 = Right v'" "s \ r2 \ v'" + by cases (auto simp add: Posix1) + moreover + have IH: "\v2. s \ r2 \ v2 \ v = v2" by fact + ultimately have "v = v'" by simp + then show "Right v = v2" using eq by simp +next + case (Posix_SEQ s1 r1 v1 s2 r2 v2 v') + have "(s1 @ s2) \ SEQ r1 r2 \ v'" + "s1 \ r1 \ v1" "s2 \ r2 \ v2" + "\ (\s\<^sub>3 s\<^sub>4. s\<^sub>3 \ [] \ s\<^sub>3 @ s\<^sub>4 = s2 \ s1 @ s\<^sub>3 \ L r1 \ s\<^sub>4 \ L r2)" by fact+ + then obtain v1' v2' where "v' = Seq v1' v2'" "s1 \ r1 \ v1'" "s2 \ r2 \ v2'" + apply(cases) apply (auto simp add: append_eq_append_conv2) + using Posix1(1) by fastforce+ + moreover + have IHs: "\v1'. s1 \ r1 \ v1' \ v1 = v1'" + "\v2'. s2 \ r2 \ v2' \ v2 = v2'" by fact+ + ultimately show "Seq v1 v2 = v'" by simp +next + case (Posix_STAR1 s1 r v s2 vs v2) + have "(s1 @ s2) \ STAR r \ v2" + "s1 \ r \ v" "s2 \ STAR r \ Stars vs" "flat v \ []" + "\ (\s\<^sub>3 s\<^sub>4. s\<^sub>3 \