# HG changeset patch # User urbanc # Date 1297317476 0 # Node ID a9ebc410a5c876b5ce0130f9a3efd90b03352c2d # Parent 37ab56205097b8436d89a1ec0f6b0980d7ca09cc more on paper diff -r 37ab56205097 -r a9ebc410a5c8 Myhill_1.thy --- a/Myhill_1.thy Wed Feb 09 12:34:30 2011 +0000 +++ b/Myhill_1.thy Thu Feb 10 05:57:56 2011 +0000 @@ -382,18 +382,18 @@ text {* Transitions between equivalence classes *} definition - transition :: "lang \ rexp \ lang \ bool" ("_ \_\_" [100,100,100] 100) + transition :: "lang \ char \ lang \ bool" ("_ \_\_" [100,100,100] 100) where - "Y \r\ X \ Y ;; (L r) \ X" + "Y \c\ X \ Y ;; {[c]} \ X" text {* Initial equational system *} definition "init_rhs CS X \ if ([] \ X) then - {Lam EMPTY} \ {Trn Y (CHAR c) | Y c. Y \ CS \ Y \(CHAR c)\ X} + {Lam EMPTY} \ {Trn Y (CHAR c) | Y c. Y \ CS \ Y \c\ X} else - {Trn Y (CHAR c)| Y c. Y \ CS \ Y \(CHAR c)\ X}" + {Trn Y (CHAR c)| Y c. Y \ CS \ Y \c\ X}" definition "eqs CS \ {(X, init_rhs CS X) | X. X \ CS}" @@ -408,14 +408,14 @@ *} fun - attach_rexp :: "rexp \ rhs_item \ rhs_item" + append_rexp :: "rexp \ rhs_item \ rhs_item" where - "attach_rexp r (Lam rexp) = Lam (SEQ rexp r)" -| "attach_rexp r (Trn X rexp) = Trn X (SEQ rexp r)" + "append_rexp r (Lam rexp) = Lam (SEQ rexp r)" +| "append_rexp r (Trn X rexp) = Trn X (SEQ rexp r)" definition - "append_rhs_rexp rhs rexp \ (attach_rexp rexp) ` rhs" + "append_rhs_rexp rhs rexp \ (append_rexp rexp) ` rhs" definition "arden_op X rhs \ @@ -624,7 +624,7 @@ qed lemma [simp]: - "L (attach_rexp r xb) = L xb ;; L r" + "L (append_rexp r xb) = L xb ;; L r" apply (cases xb, auto simp: Seq_def) apply(rule_tac x = "s\<^isub>1 @ s\<^isub>1'" in exI, rule_tac x = "s\<^isub>2'" in exI) apply(auto simp: Seq_def) @@ -635,7 +635,7 @@ apply (auto simp:append_rhs_rexp_def image_def) apply (auto simp:Seq_def) apply (rule_tac x = "L xb ;; L r" in exI, auto simp add:Seq_def) -by (rule_tac x = "attach_rexp r xb" in exI, auto simp:Seq_def) +by (rule_tac x = "append_rexp r xb" in exI, auto simp:Seq_def) lemma classes_of_union_distrib: "classes_of A \ classes_of B = classes_of (A \ B)" @@ -702,7 +702,7 @@ and "clist \ Y" using decom "(1)" every_eqclass_has_transition by blast hence - "x \ L {Trn Y (CHAR c)| Y c. Y \ UNIV // (\Lang) \ Y \(CHAR c)\ X}" + "x \ L {Trn Y (CHAR c)| Y c. Y \ UNIV // (\Lang) \ Y \c\ X}" unfolding transition_def using "(1)" decom by (simp, rule_tac x = "Trn Y (CHAR c)" in exI, simp add:Seq_def) diff -r 37ab56205097 -r a9ebc410a5c8 Paper/Paper.thy --- a/Paper/Paper.thy Wed Feb 09 12:34:30 2011 +0000 +++ b/Paper/Paper.thy Thu Feb 10 05:57:56 2011 +0000 @@ -12,6 +12,10 @@ abbreviation "EClass x R \ R `` {x}" +abbreviation + "append_rexp2 r_itm r \ append_rexp r r_itm" + + notation (latex output) str_eq_rel ("\\<^bsub>_\<^esub>") and str_eq ("_ \\<^bsub>_\<^esub> _") and @@ -27,7 +31,10 @@ Trn ("'(_, _')" [100, 100] 100) and EClass ("\_\\<^bsub>_\<^esub>" [100, 100] 100) and transition ("_ \<^raw:\ensuremath{\stackrel{\text{>_\<^raw:}}{\Longmapsto}}> _" [100, 100, 100] 100) and - Setalt ("\<^raw:\ensuremath{\bigplus}>_" [1000] 999) + Setalt ("\<^raw:\ensuremath{\bigplus}>_" [1000] 999) and + append_rexp2 ("_ \<^raw:\ensuremath{\triangleleft}> _" [100, 100] 100) and + append_rhs_rexp ("_ \<^raw:\ensuremath{\triangleleft}> _" [100, 100] 100) + (*>*) @@ -127,7 +134,7 @@ \noindent changes the type---the disjoint union is not a set, but a set of pairs. Using this definition for disjoint unions means we do not have a single type for automata - and hence will not be able to state properties about \emph{all} + and hence will not be able to state certain properties about \emph{all} automata, since there is no type quantification available in HOL. An alternative, which provides us with a single type for automata, is to give every state node an identity, for example a natural @@ -141,23 +148,25 @@ problems as with graphs. Composing, for example, two non-deterministic automata in parallel poses again the problem of how to implement disjoint unions. Nipkow \cite{Nipkow98} dismisses the option of using identities, because it leads to ``messy proofs''. He - opts for a variant of \eqref{disjointunion}, but writes + opts for a variant of \eqref{disjointunion} using bitlists, but writes \begin{quote} \it ``If the reader finds the above treatment in terms of bit lists revoltingly - concrete, I cannot disagree.'' + concrete, \phantom{``}I cannot disagree.'' \end{quote} \noindent Moreover, it is not so clear how to conveniently impose a finiteness condition upon functions in order to represent \emph{finite} automata. The best is - probably to resort to more advanced reasoning frameworks, such as \emph{locales}. + probably to resort to more advanced reasoning frameworks, such as \emph{locales} + or \emph{type classes}, + which are not avaiable in all HOL-based theorem provers. Because of these problems to do with representing automata, there seems to be no substantial formalisation of automata theory and regular languages carried out in a HOL-based theorem prover. Nipkow establishes in \cite{Nipkow98} the link between regular expressions and automata in - the context of lexing. The only larger formalisations of automata theory + the restricted context of lexing. The only larger formalisations of automata theory are carried out in Nuprl \cite{Constable00} and in Coq (for example \cite{Filliatre97}). @@ -197,7 +206,7 @@ text {* Strings in Isabelle/HOL are lists of characters with the \emph{empty string} - being represented by the empty list, written @{term "[]"}. \emph{Languages} + being represented by the empty list, written @{term "[]"}. \emph{Languages} are sets of strings. The language containing all strings is written in Isabelle/HOL as @{term "UNIV::string set"}. The concatenation of two languages is written @{term "A ;; B"} and a language raised to the power @{text n} is written @@ -217,14 +226,16 @@ we will make use of the following properties of these constructions. \begin{proposition}\label{langprops}\mbox{}\\ - \begin{tabular}{@ {}ll@ {\hspace{10mm}}ll} - (i) & @{thm star_cases} & (ii) & @{thm[mode=IfThen] pow_length}\\ - (iii) & @{thm seq_Union_left} & + \begin{tabular}{@ {}ll} + (i) & @{thm star_cases} \\ + (ii) & @{thm[mode=IfThen] pow_length}\\ + (iii) & @{thm seq_Union_left} \\ \end{tabular} \end{proposition} \noindent - We omit the proofs, but invite the reader to consult + In @{text "(ii)"} we use the notation @{term "length s"} for the length of a string. + We omit the proofs for these properties, but invite the reader to consult our formalisation.\footnote{Available at ???} @@ -311,9 +322,9 @@ \end{tabular} \end{center} - Given a set of regular expressions @{text rs}, we will need the operation of generating - a corresponding regular expressions that matches all languages of @{text rs}. We only need the existence - of such a regular expressions and therefore we use Isabelle's @{const "fold_graph"} and Hilbert's + Given a set of regular expressions @{text rs}, we will make use of the operation of generating + a regular expression that matches all languages of @{text rs}. We only need to know the existence + of such a regular expression and therefore we use Isabelle/HOL's @{const "fold_graph"} and Hilbert's @{text "\"} to define @{term "\rs"}. This function, roughly speaking, folds @{const ALT} over the set @{text rs} with @{const NULL} for the empty set. We can prove that for finite sets @{text rs} @@ -344,7 +355,7 @@ It is easy to see that @{term "\A"} is an equivalence relation, which partitions the set of all strings, @{text "UNIV"}, into a set of disjoint equivalence classes. An example is the regular language containing just - the string @{text "[c]"}, then @{term "\({[c]})"} partitions @{text UNIV} + the string @{text "[c]"}. The relation @{term "\({[c]})"} partitions @{text UNIV} into the three equivalence classes @{text "X\<^isub>1"}, @{text "X\<^isub>2"} and @{text "X\<^isub>3"} as follows @@ -371,31 +382,31 @@ \end{equation} \noindent - In our running example, @{text "X\<^isub>1"} is the only equivalence class in @{term "finals {[c]}"}. + In our running example, @{text "X\<^isub>2"} is the only equivalence class in @{term "finals {[c]}"}. It is straightforward to show that in general @{thm lang_is_union_of_finals} and @{thm finals_in_partitions} hold. Therefore if we know that there exists a regular expression for every equivalence class in @{term "finals A"} (which by assumption must be - a finite set), then we can obtain a regular expression using @{text "\"} - that matches every string in @{text A}. + a finite set), then we can using @{text "\"} obtain a regular expression + using that matches every string in @{text A}. Our proof of Thm.~\ref{myhillnerodeone} relies on a method that can calculate a regular expression for \emph{every} equivalence class, not just the ones in @{term "finals A"}. We - first define the notion of \emph{transition} between equivalence classes + first define the notion of \emph{one-character-transition} between equivalence classes % \begin{equation} @{thm transition_def} \end{equation} \noindent - which means that if we concatenate all strings matching the regular expression @{text r} - to the end of all strings in the equivalence class @{text Y}, we obtain a subset of + which means that if we concatenate the character @{text c} to the end of all + strings in the equivalence class @{text Y}, we obtain a subset of @{text X}. Note that we do not define an automaton here, we merely relate two sets (with the help of a regular expression). In our concrete example we have - @{term "X\<^isub>1 \(CHAR c)\ X\<^isub>2"} and @{term "X\<^isub>1 \r\ X\<^isub>3"} with @{text r} being any - other regular expression than @{const EMPTY} and @{term "CHAR c"}. + @{term "X\<^isub>1 \c\ X\<^isub>2"}, @{term "X\<^isub>1 \d\ X\<^isub>3"} with @{text d} being any + other character than @{text c}, and @{term "X\<^isub>3 \d\ X\<^isub>3"} for any @{text c}. Next we build an equational system that contains an equation for each equivalence class. Suppose we have @@ -414,20 +425,24 @@ \noindent where the pairs @{text "(Y\<^isub>i\<^isub>j, CHAR c\<^isub>i\<^isub>j)"} stand for all transitions - @{term "Y\<^isub>i\<^isub>j \(CHAR c\<^isub>i\<^isub>j)\ X\<^isub>i"}. The term @{text "\(EMPTY)"} acts as a marker for the equivalence - class containing @{text "[]"}. (Note that we mark, roughly speaking, the + @{term "Y\<^isub>i\<^isub>j \c\<^isub>i\<^isub>j\ X\<^isub>i"}. There can only be finitely many such + transitions since there are only finitely many equivalence classes + and finitely many characters. + The term @{text "\(EMPTY)"} in the first equation acts as a marker for the equivalence + class containing @{text "[]"}. (As an aside note that we mark, roughly speaking, the single ``initial'' state in the equational system, which is different from - the method by Brzozowski \cite{Brzozowski64}, since he marks the ``terminal'' + the method by Brzozowski \cite{Brzozowski64}: he marks the ``terminal'' states. We are forced to set up the equational system in this way, because the Myhill-Nerode relation determines the ``direction'' of the transitions. The successor ``state'' of an equivalence class @{text Y} can be reached by adding characters to the end of @{text Y}. This is also the reason why we have to use our reverse version of Arden's lemma.) Overloading the function @{text L} for the two kinds of terms in the - equational system as follows + equational system, we have \begin{center} - @{thm L_rhs_item.simps(2)[where X="Y" and r="r", THEN eq_reflection]}\hspace{10mm} + @{text "\(Y, r) \"} % + @{thm (rhs) L_rhs_item.simps(2)[where X="Y" and r="r", THEN eq_reflection]}\hspace{10mm} @{thm L_rhs_item.simps(1)[where r="r", THEN eq_reflection]} \end{center} @@ -447,11 +462,11 @@ \noindent The reason for adding the @{text \}-marker to our equational system is - to obtain this equation, which only holds in this form since none of + to obtain this equation: it only holds in this form since none of the other terms contain the empty string. - Our proof of Thm.~\ref{myhillnerodeone} will be by transforming the + Our proof of Thm.~\ref{myhillnerodeone} will proceed by transforming the equational system into a \emph{solved form} maintaining the invariants \eqref{inv1} and \eqref{inv2}. From the solved form we will be able to read off the regular expressions. @@ -459,17 +474,24 @@ In order to transform an equational system into solved form, we have two main operations: one that takes an equation of the form @{text "X = rhs"} and removes the recursive occurences of @{text X} in @{text rhs} using our variant of Arden's - Lemma (Lem.~\ref{arden}). The other operation takes an equation @{text "X = rhs"} + Lemma. The other operation takes an equation @{text "X = rhs"} and substitutes @{text X} throughout the rest of the equational system - adjusting the regular expressions approriately. To define them we need the - operation + adjusting the remaining regular expressions approriately. To define this adjustment + we define the \emph{append-operation} \begin{center} - @{thm attach_rexp.simps(2)[where X="Y" and r="r\<^isub>1" and rexp="r\<^isub>2", THEN eq_reflection]}\hspace{10mm} - @{thm attach_rexp.simps(1)[where r="r\<^isub>1" and rexp="r\<^isub>2", THEN eq_reflection]} + @{thm append_rexp.simps(2)[where X="Y" and r="r\<^isub>1" and rexp="r\<^isub>2", THEN eq_reflection]}\hspace{10mm} + @{thm append_rexp.simps(1)[where r="r\<^isub>1" and rexp="r\<^isub>2", THEN eq_reflection]} \end{center} + \noindent + which we also lift to entire right-hand sides of equations, written as + @{thm (lhs) append_rhs_rexp_def[where rexp="r"]}. + + \begin{center} + @{thm arden_op_def} + \end{center} *} section {* Regular Expressions Generate Finitely Many Partitions *} @@ -498,6 +520,17 @@ section {* Conclusion and Related Work *} +text {* + In this paper we took the view that a regular language as one where there exists + a regular expression that matches all its strings. For us it was important to find + out how far we can push this point of view. Having formalised the Myhill-Nerode + theorem means pushed very far. Having the Myhill-Nerode theorem means we can + formalise much of the textbook results in this subject. + + +*} + + (*<*) end (*>*) \ No newline at end of file diff -r 37ab56205097 -r a9ebc410a5c8 Paper/document/root.tex --- a/Paper/document/root.tex Wed Feb 09 12:34:30 2011 +0000 +++ b/Paper/document/root.tex Thu Feb 10 05:57:56 2011 +0000 @@ -32,7 +32,7 @@ \title{A Formalisation of the Myhill-Nerode Theorem\\ based on Regular Expressions (Proof Pearl)} \author{Chunhan Wu\inst{1} \and Xingjuan Zhang\inst{1} \and Christian Urban\inst{2}} -\institute{PLA University, China \and TU Munich, Germany} +\institute{PLA University of Science and Technology, China \and TU Munich, Germany} \maketitle \begin{abstract} diff -r 37ab56205097 -r a9ebc410a5c8 tphols-2011/myhill.pdf Binary file tphols-2011/myhill.pdf has changed