diff -r 114064363ef0 -r e2dc11e12e0b Journal/Paper.thy
--- a/Journal/Paper.thy	Sun Sep 04 07:28:48 2011 +0000
+++ b/Journal/Paper.thy	Mon Sep 05 12:07:16 2011 +0000
@@ -1,6 +1,6 @@
 (*<*)
 theory Paper
-imports "../Closures" "../Attic/Prefix_subtract" 
+imports "../Closures2" "../Attic/Prefix_subtract" 
 begin
 
 declare [[show_question_marks = false]]
@@ -36,6 +36,7 @@
 abbreviation "TIMES \<equiv> Times"
 abbreviation "TIMESS \<equiv> Timess"
 abbreviation "STAR \<equiv> Star"
+abbreviation "ALLREG \<equiv> Allreg"
 
 definition
   Delta :: "'a lang \<Rightarrow> 'a lang"
@@ -63,7 +64,7 @@
   Append_rexp2 ("_ \<^raw:\ensuremath{\triangleleft}> _" [100, 100] 100) and
   Append_rexp_rhs ("_ \<^raw:\ensuremath{\triangleleft}> _" [100, 100] 50) and
   
-  uminus ("\<^raw:\ensuremath{\overline{>_\<^raw:}}>" [100] 100) and
+  uminus ("\<^raw:\ensuremath{\overline{\isa{>_\<^raw:}}}>" [100] 100) and
   tag_Plus ("+tag _ _" [100, 100] 100) and
   tag_Plus ("+tag _ _ _" [100, 100, 100] 100) and
   tag_Times ("\<times>tag _ _" [100, 100] 100) and
@@ -87,7 +88,10 @@
   derivs ("ders") and
   pderiv ("pder") and
   pderivs ("pders") and
-  pderivs_set ("pderss")
+  pderivs_set ("pderss") and
+  SUBSEQ ("Subseq") and
+  SUPSEQ ("Supseq") and
+  UP ("'(_')\<up>")
   
   
 lemmas Deriv_simps = Deriv_empty Deriv_epsilon Deriv_char Deriv_union
@@ -124,6 +128,16 @@
   shows "X = \<Union> (lang_trm `  rhs)"
 using assms l_eq_r_in_eqs by (simp)
 
+abbreviation
+  notprec ("_ \<^raw:\mbox{$\not\preceq$}>_")
+where
+ "notprec x y \<equiv> \<not>(x \<preceq> y)"
+
+abbreviation
+  minimal_syn ("min\<^bsub>_\<^esub> _")
+where
+  "minimal_syn A x \<equiv> minimal x A"   
+
 
 (* THEOREMS *)
 
@@ -180,6 +194,9 @@
     by (simp only: length_append)
 qed
 
+
+
+
 (*>*)
 
 
@@ -200,7 +217,7 @@
   that allows quantification over predicate variables. Its type system is
   based on Church's Simple Theory of Types \cite{Church40}.  Although many
   mathematical concepts can be conveniently expressed in HOL, there are some
-  limitations that hurt badly, if one attempts a simple-minded formalisation
+  limitations that hurt badly when attempting a simple-minded formalisation
   of regular languages in it. 
 
   The typical approach (for example \cite{HopcroftUllman69,Kozen97}) to
@@ -361,23 +378,23 @@
   larger formalisations of automata theory are carried out in Nuprl
   \cite{Constable00} and in Coq, e.g.~\cite{Filliatre97,Almeidaetal10}.
 
-  Also one might consider automata theory and regular languages as a meticulously
-  researched subject where everything is crystal clear. However, paper proofs about
-  automata often involve subtle side-conditions which are easily overlooked,
-  but which make formal reasoning rather painful. For example Kozen's proof of
-  the Myhill-Nerode theorem requires that automata do not have inaccessible
-  states \cite{Kozen97}. Another subtle side-condition is completeness of
-  automata, that is automata need to have total transition functions and at
-  most one `sink' state from which there is no connection to a final state
-  (Brzozowski mentions this side-condition in the context of state complexity
-  of automata \cite{Brzozowski10}). Such side-conditions mean that if we
-  define a regular language as one for which there exists \emph{a} finite
-  automaton that recognises all its strings (see Definition~\ref{baddef}), then we
-  need a lemma which ensures that another equivalent one can be found
-  satisfying the side-condition, and also need to make sure our operations on
-  automata preserve them. Unfortunately, such `little' and `obvious'
-  lemmas make a formalisation of automata theory a hair-pulling experience.
-
+  Also one might consider that automata are convenient `vehicles' for
+  establishing properties about regular languages.  However, paper proofs
+  about automata often involve subtle side-conditions which are easily
+  overlooked, but which make formal reasoning rather painful. For example
+  Kozen's proof of the Myhill-Nerode theorem requires that automata do not
+  have inaccessible states \cite{Kozen97}. Another subtle side-condition is
+  completeness of automata, that is automata need to have total transition
+  functions and at most one `sink' state from which there is no connection to
+  a final state (Brzozowski mentions this side-condition in the context of
+  state complexity of automata \cite{Brzozowski10}). Such side-conditions mean
+  that if we define a regular language as one for which there exists \emph{a}
+  finite automaton that recognises all its strings (see
+  Definition~\ref{baddef}), then we need a lemma which ensures that another
+  equivalent one can be found satisfying the side-condition, and also need to
+  make sure our operations on automata preserve them. Unfortunately, such
+  `little' and `obvious' lemmas make a formalisation of automata theory a
+  hair-pulling experience.
 
   In this paper, we will not attempt to formalise automata theory in
   Isabelle/HOL nor will we attempt to formalise automata proofs from the
@@ -405,7 +422,9 @@
   regular expressions. This theorem gives necessary and sufficient conditions
   for when a language is regular. As a corollary of this theorem we can easily
   establish the usual closure properties, including complementation, for
-  regular languages.\medskip
+  regular languages. We also use in one example the continuation lemma, which
+  is based on Myhill-Nerode, for establishing non-regularity of languages 
+  \cite{rosenberg06}.\medskip
   
   \noindent 
   {\bf Contributions:} There is an extensive literature on regular languages.
@@ -704,7 +723,7 @@
   equivalence classes and only finitely many characters.
   The term @{text "\<lambda>(ONE)"} in the first equation acts as a marker for the initial state, that
   is the equivalence class
-  containing @{text "[]"}.\footnote{Note that we mark, roughly speaking, the
+  containing the empty string @{text "[]"}.\footnote{Note that we mark, roughly speaking, the
   single `initial' state in the equational system, which is different from
   the method by Brzozowski \cite{Brzozowski64}, where he marks the
   `terminal' states. We are forced to set up the equational system in our
@@ -1141,6 +1160,13 @@
   Since the left-hand side is equal to @{text A}, we can use @{term "\<Uplus>rs"} 
   as the regular expression that is needed in the theorem.
   \end{proof}
+
+  \noindent
+  Note that solving our equational system also gives us a method for
+  calculating the regular expression for a complement of a regular language:
+  similar to the construction on automata, if we combine all regular
+  expressions corresponding to equivalence classes not in @{term "finals A"},
+  we obtain a regular expression for the complement @{term "- A"}.
 *}
 
 
@@ -2001,7 +2027,9 @@
   holds for any strings @{text "s\<^isub>1"} and @{text
   "s\<^isub>2"}. Therefore @{text A} and the complement language @{term "-A"}
   give rise to the same partitions. So if one is finite, the other is too, and
-  vice versa. Proving the existence of such a regular expression via
+  vice versa. As noted earlier, our algorithm for solving equational systems
+  actually calculates the regular expression. Calculating this regular expression 
+  via
   automata using the standard method would be quite involved. It includes the
   steps: regular expression @{text "\<Rightarrow>"} non-deterministic automaton @{text
   "\<Rightarrow>"} deterministic automaton @{text "\<Rightarrow>"} complement automaton @{text "\<Rightarrow>"}
@@ -2080,6 +2108,152 @@
   the right-hand side of \eqref{eqq} is equal to the language \mbox{@{term "lang (\<Uplus>(pderivs_lang B
   r))"}}. Thus the regular expression @{term "\<Uplus>(pderivs_lang B r)"} verifies that
   @{term "Deriv_lang B A"} is regular.
+
+  Even more surprising is the fact that for every language @{text A}, the language
+  consisting of all substrings of @{text A} is regular \cite{Shallit08, Gasarch09}. 
+  A substring can be obtained
+  by striking out zero or more characters from a string. This can be defined 
+  inductively by the following three rules:
+
+  \begin{center}
+  @{thm[mode=Axiom] emb0[where bs="x"]}\hspace{10mm} 
+  @{thm[mode=Rule] emb1[where as="x" and b="c" and bs="y"]}\hspace{10mm} 
+  @{thm[mode=Rule] emb2[where as="x" and a="c" and bs="y"]}
+  \end{center}
+
+  \noindent
+  It can be easily proved that @{text "\<preceq>"} is a partial order. Now define the 
+  language of substrings and superstrings of a language @{text A}:
+
+  \begin{center}
+  \begin{tabular}{l}
+  @{thm SUBSEQ_def}\\
+  @{thm SUPSEQ_def}
+  \end{tabular}
+  \end{center}
+  
+  \noindent
+  We like to establish
+
+  \begin{lmm}\label{subseqreg}
+  For any language @{text A}, the languages @{term "SUBSEQ A"} and @{term "SUPSEQ A"}
+  are regular.
+  \end{lmm}
+
+  \noindent
+  Our proof follows the one given in \cite[92--95]{Shallit08}, except that we use
+  Higman's lemma, which is already proved in the Isabelle/HOL library. Higman's
+  lemma allows us to prove that every set @{text A} of antichains, namely
+
+  \begin{equation}\label{higman}
+  @{text "\<forall>x, y \<in> A."}~@{term "x \<noteq> y \<longrightarrow> \<not>(x \<preceq> y) \<and> \<not>(y \<preceq> x)"}
+  \end{equation} 
+
+  \noindent
+  is finite.
+
+  It is straightforward to prove the following properties of @{term SUPSEQ}
+
+  \begin{equation}\label{supseqprops}
+  \mbox{\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
+  @{thm (lhs) SUPSEQ_simps(1)} & @{text "\<equiv>"} & @{thm (rhs) SUPSEQ_simps(1)}\\  
+  @{thm (lhs) SUPSEQ_simps(2)} & @{text "\<equiv>"} & @{thm (rhs) SUPSEQ_simps(2)}\\ 
+  @{thm (lhs) SUPSEQ_atom} & @{text "\<equiv>"} & @{thm (rhs) SUPSEQ_atom}\\ 
+  @{thm (lhs) SUPSEQ_union} & @{text "\<equiv>"} & @{thm (rhs) SUPSEQ_union}\\
+  @{thm (lhs) SUPSEQ_conc} & @{text "\<equiv>"} & @{thm (rhs) SUPSEQ_conc}\\
+  @{thm (lhs) SUPSEQ_star} & @{text "\<equiv>"} & @{thm (rhs) SUPSEQ_star}
+  \end{tabular}}
+  \end{equation}
+
+  \noindent
+  whereby the last equation follows from the fact that @{term "A\<star>"} contains the
+  empty string. With these properties at our disposal we can establish the lemma
+
+  \begin{lmm}
+  If @{text A} is regular, then also @{term "SUBSEQ A"}.
+  \end{lmm}
+
+  \begin{proof}
+  Since our alphabet is finite, we can find a regular expression, written @{const Allreg}, that
+  matches every single-character string. With this regular expression we can inductively define
+  the operation @{text "r\<up>"} over regular expressions
+
+  \begin{center}
+  \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
+  @{thm (lhs) UP.simps(1)} & @{text "\<equiv>"} & @{thm (rhs) UP.simps(1)}\\  
+  @{thm (lhs) UP.simps(2)} & @{text "\<equiv>"} & @{thm (rhs) UP.simps(2)}\\ 
+  @{thm (lhs) UP.simps(3)} & @{text "\<equiv>"} & @{thm (rhs) UP.simps(3)}\\ 
+  @{thm (lhs) UP.simps(4)[where ?r1.0="r\<^isub>1" and ?r2.0="r\<^isub>2"]} & 
+     @{text "\<equiv>"} & @{thm (rhs) UP.simps(4)[where ?r1.0="r\<^isub>1" and ?r2.0="r\<^isub>2"]}\\
+  @{thm (lhs) UP.simps(5)[where ?r1.0="r\<^isub>1" and ?r2.0="r\<^isub>2"]} & 
+     @{text "\<equiv>"} & @{thm (rhs) UP.simps(5)[where ?r1.0="r\<^isub>1" and ?r2.0="r\<^isub>2"]}\\
+  @{thm (lhs) UP.simps(6)} & @{text "\<equiv>"} & @{thm (rhs) UP.simps(6)}
+  \end{tabular}
+  \end{center}
+
+  \noindent
+  and using \eqref{supseqprops} establish that @{thm lang_UP}. This shows
+  that @{term "SUBSEQ A"} is regular provided @{text A} is.
+  \end{proof}
+
+  \noindent
+  Now we can prove the main lemma, namely
+
+  \begin{lmm}\label{mset}
+  For every language @{text A}, there exists a finite language @{text M} such that
+  \begin{center}
+  \mbox{@{term "SUPSEQ M = SUPSEQ A"}}\;.
+  \end{center}
+  \end{lmm}
+
+  \begin{proof}
+  For @{text M} we take the set of all minimal elements of @{text A}. An element @{text x} 
+  is minimal in @{text A} provided
+
+  \begin{center}
+  @{thm minimal_def}
+  \end{center}
+
+  \noindent
+  By Higman's lemma \eqref{higman} we know
+  that @{text "M"} is finite, since every minimal element is incomparable, 
+  except with itself.
+  It is also straightforward to show that @{term "SUPSEQ M \<subseteq> SUPSEQ A"}. For
+  the other direction we have  @{term "x \<in> SUPSEQ A"}. From this we can obtain 
+  a @{text y} such that @{term "y \<in> A"} and @{term "y \<preceq> x"}. Since we know that
+  the relation \mbox{@{term "{(y, x). y \<preceq> x \<and> x \<noteq> y}"}} is well-founded, there must
+  be a minimal element @{text "z"} such that @{term "z \<in> A"} and @{term "z \<preceq> y"}, 
+  and hence by transitivity also \mbox{@{term "z \<preceq> x"}} (here we deviate from the argument 
+  given in \cite{Shallit08}, because Isabelle/HOL provides already an extensive infrastructure
+  for reasoning about well-foundedness). Since @{term "z"} is
+  minimal and in @{term A}, we also know that @{term z} is in @{term M}.
+  From this together with @{term "z \<preceq> x"}, we can infer that @{term x} is in 
+  @{term "SUPSEQ M"} as required.
+  \end{proof}
+
+  \noindent
+  This lemma allows us to establish the second part of Lemma~\ref{subseqreg}.
+
+  \begin{proof}[The Second Part of Lemma~\ref{subseqreg}]
+  Given any language @{text A}, by Lemma~\ref{mset} we know there exists
+  a finite, and thus regular, language @{text M}. We further have @{term "SUPSEQ M = SUPSEQ A"},
+  which establishes the second part.    
+  \end{proof}
+
+  \noindent
+  In order to establish the first part of Lemma~\ref{subseqreg} we use the
+  property proved in \cite{Shallit08}
+
+  \begin{equation}\label{compl}
+  @{thm SUBSEQ_complement}
+  \end{equation}
+
+  \begin{proof}[The First Part of Lemma~\ref{subseqreg}]
+  By the second part of Lemma~\ref{subseqreg}, we know the right-hand side of \eqref{compl}
+  is regular, which means the complement of @{term "SUBSEQ A"} is regular. But since
+  we established already that regularity is preserved under complement, also @{term "SUBSEQ A"}
+  must be regular. 
+  \end{proof}
 *}