regexp: comparison Journal/Paper.thy

equal deleted inserted replaced

-:feb7b31d6bf1
+:21ee3a852a02
 (*<*)
 theory Paper
-imports "../Closure"
+imports "../Closures"
 begin
 declare [[show_question_marks = false]]
 consts
 abbreviation
 "Append_rexp2 r_itm r \<equiv> Append_rexp r r_itm"
+abbreviation
+"pow" (infixl "\<up>" 100)
+where
+"A \<up> n \<equiv> A ^^ n"
+syntax (latex output)
+"_Collect" :: "pttrn => bool => 'a set"              ("(1{_ | _})")
+"_CollectIn" :: "pttrn => 'a set => bool => 'a set"   ("(1{_ \<in> _ | _})")
+translations
+"_Collect p P"      <= "{p. P}"
+"_Collect p P"      <= "{p|xs. P}"
+"_CollectIn p A P"  <= "{p : A. P}"
+abbreviation "NULL \<equiv> Zero"
+abbreviation "EMPTY \<equiv> One"
+abbreviation "CHAR \<equiv> Atom"
+abbreviation "ALT \<equiv> Plus"
+abbreviation "SEQ \<equiv> Times"
+abbreviation "STAR \<equiv> Star"
+ML {* @{term "op ^^"} *}
 notation (latex output)
 str_eq_rel ("\<approx>\<^bsub>_\<^esub>") and
 str_eq ("_ \<approx>\<^bsub>_\<^esub> _") and
-Seq (infixr "\<cdot>" 100) and
+conc (infixr "\<cdot>" 100) and
-Star ("_\<^bsup>\<star>\<^esup>") and
+star ("_\<^bsup>\<star>\<^esup>") and
 pow ("_\<^bsup>_\<^esup>" [100, 100] 100) and
 Suc ("_+1" [100] 100) and
 quotient ("_ \<^raw:\ensuremath{\!\sslash\!}> _" [90, 90] 90) and
 REL ("\<approx>") and
 UPLUS ("_ \<^raw:\ensuremath{\uplus}> _" [90, 90] 90) and
-L_rexp ("\<^raw:\ensuremath{\cal{L}}>'(_')" [0] 101) and
+lang ("\<^raw:\ensuremath{\cal{L}}>'(_')" [0] 101) and
 Lam ("\<lambda>'(_')" [100] 100) and
 Trn ("'(_, _')" [100, 100] 100) and
 EClass ("\<lbrakk>_\<rbrakk>\<^bsub>_\<^esub>" [100, 100] 100) and
 transition ("_ \<^raw:\ensuremath{\stackrel{\text{>_\<^raw:}}{\Longmapsto}}> _" [100, 100, 100] 100) and
 Setalt ("\<^raw:\ensuremath{\bigplus}>_" [1000] 999) and
 Append_rexp2 ("_ \<^raw:\ensuremath{\triangleleft}> _" [100, 100] 100) and
 Append_rexp_rhs ("_ \<^raw:\ensuremath{\triangleleft}> _" [100, 100] 50) and
 uminus ("\<^raw:\ensuremath{\overline{>_\<^raw:}}>" [100] 100) and
-tag_str_ALT ("tag\<^isub>A\<^isub>L\<^isub>T _ _" [100, 100] 100) and
+tag_str_Plus ("tag\<^isub>A\<^isub>L\<^isub>T _ _" [100, 100] 100) and
-tag_str_ALT ("tag\<^isub>A\<^isub>L\<^isub>T _ _ _" [100, 100, 100] 100) and
+tag_str_Plus ("tag\<^isub>A\<^isub>L\<^isub>T _ _ _" [100, 100, 100] 100) and
-tag_str_SEQ ("tag\<^isub>S\<^isub>E\<^isub>Q _ _" [100, 100] 100) and
+tag_str_Times ("tag\<^isub>S\<^isub>E\<^isub>Q _ _" [100, 100] 100) and
-tag_str_SEQ ("tag\<^isub>S\<^isub>E\<^isub>Q _ _ _" [100, 100, 100] 100) and
+tag_str_Times ("tag\<^isub>S\<^isub>E\<^isub>Q _ _ _" [100, 100, 100] 100) and
-tag_str_STAR ("tag\<^isub>S\<^isub>T\<^isub>A\<^isub>R _" [100] 100) and
+tag_str_Star ("tag\<^isub>S\<^isub>T\<^isub>A\<^isub>R _" [100] 100) and
-tag_str_STAR ("tag\<^isub>S\<^isub>T\<^isub>A\<^isub>R _ _" [100, 100] 100)
+tag_str_Star ("tag\<^isub>S\<^isub>T\<^isub>A\<^isub>R _ _" [100, 100] 100)
 lemma meta_eq_app:
 shows "f \<equiv> \<lambda>x. g x \<Longrightarrow> f x \<equiv> g x"
 by auto
+lemma conc_def':
+"A \<cdot> B = {s\<^isub>1 @ s\<^isub>2 | s\<^isub>1 s\<^isub>2. s\<^isub>1 \<in> A \<and> s\<^isub>2 \<in> B}"
+unfolding conc_def by simp
+lemma conc_Union_left:
+shows "B \<cdot> (\<Union>n. A \<up> n) = (\<Union>n. B \<cdot> (A \<up> n))"
+unfolding conc_def by auto
+lemma test:
+assumes X_in_eqs: "(X, rhs) \<in> Init (UNIV // \<approx>A)"
+shows "X = \<Union> (lang_trm `  rhs)"
+using assms l_eq_r_in_eqs by (simp)
 (* THEOREMS *)
 notation (Rule output)
 "==>"  ("\<^raw:\mbox{}\inferrule{\mbox{>_\<^raw:}}>\<^raw:{\mbox{>_\<^raw:}}>")
 \noindent
 Regular languages are an important and well-understood subject in Computer
 Science, with many beautiful theorems and many useful algorithms. There is a
 wide range of textbooks on this subject, many of which are aimed at students
 and contain very detailed `pencil-and-paper' proofs
-(e.g.~\cite{Kozen97}). It seems natural to exercise theorem provers by
+(e.g.~\cite{Kozen97, HopcroftUllman69}). It seems natural to exercise theorem provers by
-formalising the theorems and by verifying formally the algorithms.
+formalising the theorems and by verifying formally the algorithms.  A
-Some of the popular theorem provers are based on Higher-Order Logic (HOL).
+popular choice for a theorem prover would be one based on Higher-Order Logic
+(HOL), such as HOL4, HOLlight and Isabelle/HOL. For our development we will
-There is however a problem: the typical approach to regular languages is to
+use the latter theorem prover. One distinguishing feature of HOL is it's
+type system based on Church's Simple Theory of Types \cite{Church40}.  The
+limitations of this type system are one of the main motivations
+behind the work presented in this paper.
+The typical approach to regular languages is to
 introduce finite automata and then define everything in terms of them.  For
 example, a regular language is normally defined as one whose strings are
 recognised by a finite deterministic automaton. This approach has many
 benefits. Among them is the fact that it is easy to convince oneself that
 regular languages are closed under complementation: one just has to exchange
 the accepting and non-accepting states in the corresponding automaton to
-obtain an automaton for the complement language.  The problem, however, lies with
+obtain an automaton for the complement language.  The problem, however, lies
-formalising such reasoning in a HOL-based theorem prover, in our case
+with formalising such reasoning in a HOL-based theorem prover. Automata are
-Isabelle/HOL. Automata are built up from states and transitions that
+built up from states and transitions that need to be represented as graphs,
-need to be represented as graphs, matrices or functions, none
+matrices or functions, none of which can be defined as an inductive
-of which can be defined as an inductive datatype.
+datatype.
 In case of graphs and matrices, this means we have to build our own
 reasoning infrastructure for them, as neither Isabelle/HOL nor HOL4 nor
 HOLlight support them with libraries. Even worse, reasoning about graphs and
-matrices can be a real hassle in HOL-based theorem provers.  Consider for
+matrices can be a real hassle in HOL-based theorem provers, because
+we have to be able to combine automata.  Consider for
 example the operation of sequencing two automata, say $A_1$ and $A_2$, by
 connecting the accepting states of $A_1$ to the initial state of $A_2$:
 %
 \begin{center}
 \begin{tabular}{ccc}
-\begin{tikzpicture}[scale=0.8]
+\begin{tikzpicture}[scale=0.9]
 %\draw[step=2mm] (-1,-1) grid (1,1);
 \draw[rounded corners=1mm, very thick] (-1.0,-0.3) rectangle (-0.2,0.3);
 \draw[rounded corners=1mm, very thick] ( 0.2,-0.3) rectangle ( 1.0,0.3);
 \raisebox{1.1mm}{\bf\Large$\;\;\;\Rightarrow\,\;\;$}
 &
-\begin{tikzpicture}[scale=0.8]
+\begin{tikzpicture}[scale=0.9]
 %\draw[step=2mm] (-1,-1) grid (1,1);
 \draw[rounded corners=1mm, very thick] (-1.0,-0.3) rectangle (-0.2,0.3);
 \draw[rounded corners=1mm, very thick] ( 0.2,-0.3) rectangle ( 1.0,0.3);
 \end{tabular}
 \end{center}
 \noindent
-On `paper' we can define the corresponding graph in terms of the disjoint
+On `paper' or a theorem prover based on set-theory, we can define the corresponding
+graph in terms of the disjoint
 union of the state nodes. Unfortunately in HOL, the standard definition for disjoint
 union, namely
 %
 \begin{equation}\label{disjointunion}
-@{term "UPLUS A\<^isub>1 A\<^isub>2 \<equiv> {(1, x) | x. x \<in> A\<^isub>1} \<union> {(2, y) | y. y \<in> A\<^isub>2}"}
+@{text "A\<^isub>1 \<uplus> A\<^isub>2 \<equiv> {(1, x) | x \<in> A\<^isub>1} \<union> {(2, y) | y \<in> A\<^isub>2}"}
 \end{equation}
 \noindent
 changes the type---the disjoint union is not a set, but a set of pairs.
 Using this definition for disjoint union means we do not have a single type for automata
 and hence will not be able to state certain properties about \emph{all}
-automata, since there is no type quantification available in HOL (unlike in Coq, for example). An
+automata, since there is no type quantification available in HOL (unlike in Coq, for example).
-alternative, which provides us with a single type for automata, is to give every
+As a result, we would not be able to define a language being regular
+in terms of the existence of an automata.
+An alternative, which provides us with a single type for automata, is to give every
 state node an identity, for example a natural
 number, and then be careful to rename these identities apart whenever
 connecting two automata. This results in clunky proofs
 establishing that properties are invariant under renaming. Similarly,
 connecting two automata represented as matrices results in very adhoc
 \end{tabular}
 \end{quote}
 \noindent
-Moreover, it is not so clear how to conveniently impose a finiteness condition
+Moreover, it is not so clear how to conveniently impose a finiteness
-upon functions in order to represent \emph{finite} automata. The best is
+condition upon functions in order to represent \emph{finite} automata. The
-probably to resort to more advanced reasoning frameworks, such as \emph{locales}
+best is probably to resort to more advanced reasoning frameworks, such as
-or \emph{type classes},
+\emph{locales} or \emph{type classes}, which are \emph{not} available in all
-which are \emph{not} available in all HOL-based theorem provers.
+HOL-based theorem provers.
-{\bf add commnets from Brozowski}
+Because of these problems to do with representing automata, there seems to
+be no substantial formalisation of automata theory and regular languages
-Because of these problems to do with representing automata, there seems
+carried out in HOL-based theorem provers. Nipkow \cite{Nipkow98} establishes
-to be no substantial formalisation of automata theory and regular languages
+the link between regular expressions and automata in the context of
-carried out in HOL-based theorem provers. Nipkow  \cite{Nipkow98} establishes
+lexing. Berghofer and Reiter \cite{BerghoferReiter09} formalise automata
-the link between regular expressions and automata in
+working over bit strings in the context of Presburger arithmetic.  The only
-the context of lexing. Berghofer and Reiter \cite{BerghoferReiter09}
+larger formalisations of automata theory are carried out in Nuprl
-formalise automata working over
+\cite{Constable00} and in Coq \cite{Filliatre97}.
-bit strings in the context of Presburger arithmetic.
-The only larger formalisations of automata theory
+Also one might consider the Myhill-Nerode theorem as well-worn stock
-are carried out in Nuprl \cite{Constable00} and in Coq \cite{Filliatre97}.
+material for a computer scientists, but paper proofs of this theorem contain some
+subtle side-conditions which are easily overlooked and which make formal reasoning
+painful. For example in Kozen's proof \cite{Kozen97} it is not sufficient to
+have just any automata recognising a regular language, but one which does
+not have inaccessible states. This means if we define a regular language
+for which \emph{any} finite automaton exists, then one has to ensure that
+another equivalent can be found satisfying the side-conditions. Similarly
+Brozowski mentiones in \cite{Brozowski10} side-conditions of finite automata
+in connection of state-complexity: there it is required that automata
+must be complete in the sense of having a total transition function.
+Furthermore, if a `sink' state is present which accepts no word, then there
+must be only one such state. . . .
+Such 'little' lemmas make formalisation of these results in a theroem
+prover hair-pulling experiences.
 In this paper, we will not attempt to formalise automata theory in
-Isabelle/HOL, but take a different approach to regular
+Isabelle/HOL nor attempt to formalise any automata proof from the
-languages. Instead of defining a regular language as one where there exists
+literature, but take a different approach to regular languages than is
-an automaton that recognises all strings of the language, we define a
+usually taken. Instead of defining a regular language as one where there
+exists an automaton that recognises all strings of the language, we define a
 regular language as:
 \begin{dfntn}
 A language @{text A} is \emph{regular}, provided there is a regular expression that matches all
 strings of @{text "A"}.
 \end{dfntn}
 \noindent
-The reason is that regular expressions, unlike graphs, matrices and functions, can
+The reason is that regular expressions, unlike graphs, matrices and
-be easily defined as inductive datatype. Consequently a corresponding reasoning
+functions, can be easily defined as inductive datatype. Consequently a
-infrastructure comes for free. This has recently been exploited in HOL4 with a formalisation
+corresponding reasoning infrastructure (like induction or recursion) comes
-of regular expression matching based on derivatives \cite{OwensSlind08} and
+for free. This has recently been exploited in HOL4 with a formalisation of
-with an equivalence checker for regular expressions in Isabelle/HOL \cite{KraussNipkow11}.
+regular expression matching based on derivatives \cite{OwensSlind08} and
-The purpose of this paper is to
+with an equivalence checker for regular expressions in Isabelle/HOL
-show that a central result about regular languages---the Myhill-Nerode theorem---can
+\cite{KraussNipkow11}.  The purpose of this paper is to show that a central
-be recreated by only using regular expressions. This theorem gives necessary
+result about regular languages---the Myhill-Nerode theorem---can be
-and sufficient conditions for when a language is regular. As a corollary of this
+recreated by only using regular expressions. This theorem gives necessary
-theorem we can easily establish the usual closure properties, including
+and sufficient conditions for when a language is regular. As a corollary of
-complementation, for regular languages.\smallskip
+this theorem we can easily establish the usual closure properties, including
+complementation, for regular languages.\medskip
-\noindent
-{\bf Contributions:}
+\noindent
-There is an extensive literature on regular languages.
+{\bf Contributions:} There is an extensive literature on regular languages.
-To our best knowledge, our proof of the Myhill-Nerode theorem is the
+To our best knowledge, our proof of the Myhill-Nerode theorem is the first
-first that is based on regular expressions, only. We prove the part of this theorem
+that is based on regular expressions, only. The part of this theorem stating
-stating that a regular expression has only finitely many partitions using certain
+that finitely many partitions of a language w.r.t.~to the Myhill-Nerode
-tagging-functions. Again to our best knowledge, these tagging-functions have
+relation imply that the language is regular is proved by a folklore argument
-not been used before to establish the Myhill-Nerode theorem.
+of solving equational sytems.  For the other part we give two proofs: a
+direct proof using certain tagging-functions, and an indirect proof using
+Antimirov's partial derivatives \cite{Antimirov95} (also earlier russion work).
+Again to our best knowledge, the tagging-functions have not been used before to
+establish the Myhill-Nerode theorem.
 *}
 section {* Preliminaries *}
 text {*
+\noindent
 Strings in Isabelle/HOL are lists of characters with the \emph{empty string}
 being represented by the empty list, written @{term "[]"}.  \emph{Languages}
 are sets of strings. The language containing all strings is written in
 Isabelle/HOL as @{term "UNIV::string set"}. The concatenation of two languages
 is written @{term "A \<cdot> B"} and a language raised to the power @{text n} is written
 @{term "A \<up> n"}. They are defined as usual
 \begin{center}
-@{thm Seq_def[THEN eq_reflection, where A1="A" and B1="B"]}
+@{thm conc_def'[THEN eq_reflection, where A1="A" and B1="B"]}
 \hspace{7mm}
-@{thm pow.simps(1)[THEN eq_reflection, where A1="A"]}
+@{thm lang_pow.simps(1)[THEN eq_reflection, where A1="A"]}
 \hspace{7mm}
-@{thm pow.simps(2)[THEN eq_reflection, where A1="A" and n1="n"]}
+@{thm lang_pow.simps(2)[THEN eq_reflection, where A1="A" and n1="n"]}
 \end{center}
 \noindent
 where @{text "@"} is the list-append operation. The Kleene-star of a language @{text A}
-is defined as the union over all powers, namely @{thm Star_def}. In the paper
+is defined as the union over all powers, namely @{thm star_def}. In the paper
 we will make use of the following properties of these constructions.
 \begin{prpstn}\label{langprops}\mbox{}\\
 \begin{tabular}{@ {}ll}
 (i)   & @{thm star_cases}     \\
 (ii)  & @{thm[mode=IfThen] pow_length}\\
-(iii) & @{thm seq_Union_left} \\
+(iii) & @{thm conc_Union_left} \\
 \end{tabular}
 \end{prpstn}
 \noindent
 In @{text "(ii)"} we use the notation @{term "length s"} for the length of a
 and the language matched by a regular expression is defined as
 \begin{center}
 \begin{tabular}{c@ {\hspace{10mm}}c}
 \begin{tabular}{rcl}
-@{thm (lhs) L_rexp.simps(1)} & @{text "\<equiv>"} & @{thm (rhs) L_rexp.simps(1)}\\
+@{thm (lhs) lang.simps(1)} & @{text "\<equiv>"} & @{thm (rhs) lang.simps(1)}\\
-@{thm (lhs) L_rexp.simps(2)} & @{text "\<equiv>"} & @{thm (rhs) L_rexp.simps(2)}\\
+@{thm (lhs) lang.simps(2)} & @{text "\<equiv>"} & @{thm (rhs) lang.simps(2)}\\
-@{thm (lhs) L_rexp.simps(3)[where c="c"]} & @{text "\<equiv>"} & @{thm (rhs) L_rexp.simps(3)[where c="c"]}\\
+@{thm (lhs) lang.simps(3)[where a="c"]} & @{text "\<equiv>"} & @{thm (rhs) lang.simps(3)[where a="c"]}\\
 \end{tabular}
 &
 \begin{tabular}{rcl}
-@{thm (lhs) L_rexp.simps(4)[where ?r1.0="r\<^isub>1" and ?r2.0="r\<^isub>2"]} & @{text "\<equiv>"} &
+@{thm (lhs) lang.simps(4)[where ?r="r\<^isub>1" and ?s="r\<^isub>2"]} & @{text "\<equiv>"} &
-@{thm (rhs) L_rexp.simps(4)[where ?r1.0="r\<^isub>1" and ?r2.0="r\<^isub>2"]}\\
+@{thm (rhs) lang.simps(4)[where ?r="r\<^isub>1" and ?s="r\<^isub>2"]}\\
-@{thm (lhs) L_rexp.simps(5)[where ?r1.0="r\<^isub>1" and ?r2.0="r\<^isub>2"]} & @{text "\<equiv>"} &
+@{thm (lhs) lang.simps(5)[where ?r="r\<^isub>1" and ?s="r\<^isub>2"]} & @{text "\<equiv>"} &
-@{thm (rhs) L_rexp.simps(5)[where ?r1.0="r\<^isub>1" and ?r2.0="r\<^isub>2"]}\\
+@{thm (rhs) lang.simps(5)[where ?r="r\<^isub>1" and ?s="r\<^isub>2"]}\\
-@{thm (lhs) L_rexp.simps(6)[where r="r"]} & @{text "\<equiv>"} &
+@{thm (lhs) lang.simps(6)[where r="r"]} & @{text "\<equiv>"} &
-@{thm (rhs) L_rexp.simps(6)[where r="r"]}\\
+@{thm (rhs) lang.simps(6)[where r="r"]}\\
 \end{tabular}
 \end{tabular}
 \end{center}
 Given a finite set of regular expressions @{text rs}, we will make use of the operation of generating
 section {* The Myhill-Nerode Theorem, First Part *}
 text {*
+Folklore: Henzinger (arden-DFA-regexp.pdf)
+\noindent
 The key definition in the Myhill-Nerode theorem is the
 \emph{Myhill-Nerode relation}, which states that w.r.t.~a language two
 strings are related, provided there is no distinguishing extension in this
 language. This can be defined as a tertiary relation.
 Overloading the function @{text \<calL>} for the two kinds of terms in the
 equational system, we have
 \begin{center}
 @{text "\<calL>(Y, r) \<equiv>"} %
-@{thm (rhs) L_trm.simps(2)[where X="Y" and r="r", THEN eq_reflection]}\hspace{10mm}
+@{thm (rhs) lang_trm.simps(2)[where X="Y" and r="r", THEN eq_reflection]}\hspace{10mm}
-@{thm L_trm.simps(1)[where r="r", THEN eq_reflection]}
+@{thm lang_trm.simps(1)[where r="r", THEN eq_reflection]}
 \end{center}
 \noindent
 and we can prove for @{text "X\<^isub>2\<^isub>.\<^isub>.\<^isub>n"} that the following equations
 %
 The cases for @{const NULL}, @{const EMPTY} and @{const CHAR} are routine, because
 we can easily establish that
 \begin{center}
 \begin{tabular}{l}
-@{thm quot_null_eq}\\
+@{thm quot_zero_eq}\\
-@{thm quot_empty_subset}\\
+@{thm quot_one_subset}\\
-@{thm quot_char_subset}
+@{thm quot_atom_subset}
 \end{tabular}
 \end{center}
 \noindent
 hold, which shows that @{term "UNIV // \<approx>(L r)"} must be finite.\qed
 \begin{proof}[@{const "ALT"}-Case]
 We take as tagging-function
 %
 \begin{center}
-@{thm tag_str_ALT_def[where A="A" and B="B", THEN meta_eq_app]}
+@{thm tag_str_Plus_def[where A="A" and B="B", THEN meta_eq_app]}
 \end{center}
 \noindent
 where @{text "A"} and @{text "B"} are some arbitrary languages.
 We can show in general, if @{term "finite (UNIV // \<approx>A)"} and @{term "finite (UNIV // \<approx>B)"}
 or @{text x} and a prefix of @{text "z"} is in @{text A} and the rest in @{text B} (second picture).
 In both cases we have to show that @{term "y @ z \<in> A \<cdot> B"}. For this we use the
 following tagging-function
 %
 \begin{center}
-@{thm tag_str_SEQ_def[where ?L1.0="A" and ?L2.0="B", THEN meta_eq_app]}
+@{thm tag_str_Times_def[where ?L1.0="A" and ?L2.0="B", THEN meta_eq_app]}
 \end{center}
 \noindent
 with the idea that in the first split we have to make sure that @{text "(x - x') @ z"}
 is in the language @{text B}.
 In order to show that @{term "x @ z \<in> A\<star>"} implies @{term "y @ z \<in> A\<star>"}, we use
 the following tagging-function:
 %
 \begin{center}
-@{thm tag_str_STAR_def[where ?L1.0="A", THEN meta_eq_app]}\smallskip
+@{thm tag_str_Star_def[where ?L1.0="A", THEN meta_eq_app]}\smallskip
 \end{center}
 \begin{proof}[@{const STAR}-Case]
 If @{term "finite (UNIV // \<approx>A)"}
 then @{term "finite (Pow (UNIV // \<approx>A))"} holds. The range of
 to formalise in a HOL-based theorem prover. However, it is also the
 direction where we had to spend most of the `conceptual' time, as
 our proof-argument based on tagging-functions is new for
 establishing the Myhill-Nerode theorem. All standard proofs of this
 direction proceed by arguments over automata.\medskip
+We expect that the development of Krauss \& Nipkoe gets easier by
+using partial derivatives.
 \noindent
 {\bf Acknowledgements:} We are grateful for the comments we received from Larry
 Paulson.

changeset 172	21ee3a852a02
parent 170	b1258b7d2789
child 173	d371536861bc