(* Journal/Paper.thy
   Author: Christian Urban <christian.urban@kcl.ac.uk> *)

(*<*)
theory Paper
imports "../Closures2" "../Attic/Prefix_subtract" 
begin
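
(* Facts relating the syntactic operations on regular expressions to their
   semantic counterparts on languages: nullable tests whether the empty
   string is matched; deriv and derivs compute derivatives of the language
   of a regular expression. *)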

lemma nullable_iff:
  shows "nullable r \<longleftrightarrow> [] \<in> lang r"
by (induct r) (auto simp add: conc_def split: if_splits)

lemma Deriv_deriv:
  shows "Deriv c (lang r) = lang (deriv c r)"
by (induct r) (simp_all add: nullable_iff)

lemma Derivs_derivs:
  shows "Derivs s (lang r) = lang (derivs s r)"
by (induct s arbitrary: r) (simp_all add: Deriv_deriv)

declare [[show_question_marks = false]]
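
(* Dummy constants: REL and UPLUS are never given a definition; they serve
   only as anchors for the LaTeX output notation declared below. *)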

consts
 REL :: "(string \<times> string) set"
 UPLUS :: "'a set \<Rightarrow> 'a set \<Rightarrow> (nat \<times> 'a) set"

abbreviation
  "EClass x R \<equiv> R `` {x}"

abbreviation 
  "Append_rexp2 r_itm r \<equiv> Append_rexp r r_itm"


abbreviation
  "pow" (infixl "\<up>" 100)
where
  "A \<up> n \<equiv> A ^^ n"

syntax (latex output)
  "_Collect" :: "pttrn => bool => 'a set"              ("(1{_ | _})")
  "_CollectIn" :: "pttrn => 'a set => bool => 'a set"   ("(1{_ \<in> _ | _})")
translations
  "_Collect p P"      <= "{p. P}"
  "_Collect p P"      <= "{p|xs. P}"
  "_CollectIn p A P"  <= "{p : A. P}"

syntax (latex output)
  "_UNION_le"   :: "'a \<Rightarrow> 'a => 'b set => 'b set"       ("(3\<Union>(00\<^bsub>_ \<le> _\<^esub>)/ _)" [0, 0, 10] 10)

abbreviation "ZERO \<equiv> Zero"
abbreviation "ONE \<equiv> One"
abbreviation "ATOM \<equiv> Atom"
abbreviation "PLUS \<equiv> Plus"
abbreviation "TIMES \<equiv> Times"
abbreviation "TIMESS \<equiv> Timess"
abbreviation "STAR \<equiv> Star"
abbreviation "ALLS \<equiv> Star Allreg"

definition
  Delta :: "'a lang \<Rightarrow> 'a lang"
where
  "Delta A = (if [] \<in> A then {[]} else {})"

notation (latex output)
  str_eq ("\<approx>\<^bsub>_\<^esub>") and
  str_eq_applied ("_ \<approx>\<^bsub>_\<^esub> _") and
  conc (infixr "\<cdot>" 100) and
  star ("_\<^bsup>\<star>\<^esup>" [101] 200) and
  pow ("_\<^bsup>_\<^esup>" [100, 100] 100) and
  Suc ("_+1" [100] 100) and
  quotient ("_ \<^raw:\ensuremath{\!\sslash\!}> _" [90, 90] 90) and
  REL ("\<approx>") and
  UPLUS ("_ \<^raw:\ensuremath{\uplus}> _" [90, 90] 90) and
  lang ("\<^raw:\ensuremath{\cal{L}}>" 101) and
  lang ("\<^raw:\ensuremath{\cal{L}}>'(_')" [0] 101) and
  lang_trm ("\<^raw:\ensuremath{\cal{L}}>'(_')" [0] 101) and
  Lam ("\<lambda>'(_')" [100] 100) and 
  Trn ("'(_, _')" [100, 100] 100) and 
  EClass ("\<lbrakk>_\<rbrakk>\<^bsub>_\<^esub>" [100, 100] 100) and
  transition ("_ \<^raw:\ensuremath{\stackrel{\text{>_\<^raw:}}{\Longmapsto}}> _" [100, 100, 100] 100) and
  Setalt ("\<^raw:\ensuremath{\bigplus}>_" [1000] 999) and
  Append_rexp2 ("_ \<^raw:\ensuremath{\triangleleft}> _" [100, 100] 100) and
  Append_rexp_rhs ("_ \<^raw:\ensuremath{\triangleleft}> _" [100, 100] 50) and
  
  uminus ("\<^raw:\ensuremath{\overline{\isa{>_\<^raw:}}}>" [100] 100) and
  tag_Plus ("+tag _ _" [100, 100] 100) and
  tag_Plus ("+tag _ _ _" [100, 100, 100] 100) and
  tag_Times ("\<times>tag _ _" [100, 100] 100) and
  tag_Times ("\<times>tag _ _ _" [100, 100, 100] 100) and
  tag_Star ("\<star>tag _" [100] 100) and
  tag_Star ("\<star>tag _ _" [100, 100] 100) and
  tag_eq ("\<^raw:$\threesim$>\<^bsub>_\<^esub>" [100] 100) and
  Delta ("\<Delta>'(_')") and
  nullable ("\<delta>'(_')") and
  Cons ("_ :: _" [100, 100] 100) and
  Rev ("Rev _" [1000] 100) and
  Deriv ("Der _ _" [1000, 1000] 100) and
  Derivs ("Ders") and
  ONE ("ONE" 999) and
  ZERO ("ZERO" 999) and
  pderivs_lang ("pdersl") and
  UNIV1 ("UNIV\<^sup>+") and
  Deriv_lang ("Dersl") and
  Derivss ("Derss") and
  deriv ("der") and
  derivs ("ders") and
  pderiv ("pder") and
  pderivs ("pders") and
  pderivs_set ("pderss") and
  SUBSEQ ("Sub") and
  SUPSEQ ("Sup") and
  UP ("'(_')\<up>") and
  ALLS ("ALL")
  
  
lemmas Deriv_simps = Deriv_empty Deriv_epsilon Deriv_char Deriv_union
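
(* Derivatives defined directly on languages (sets of strings): Der strips
   one leading character, Ders strips a leading string. *)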

definition
  Der :: "'a \<Rightarrow> 'a lang \<Rightarrow> 'a lang"
where
  "Der c A \<equiv> {s. [c] @ s \<in> A}"

definition
  Ders :: "'a list \<Rightarrow> 'a lang \<Rightarrow> 'a lang"
where
  "Ders s A \<equiv> {s'. s @ s' \<in> A}"


lemma meta_eq_app:
  shows "f \<equiv> \<lambda>x. g x \<Longrightarrow> f x \<equiv> g x"
  by auto

lemma str_eq_def':
  shows "x \<approx>A y \<equiv> (\<forall>z. x @ z \<in> A \<longleftrightarrow> y @ z \<in> A)"
unfolding str_eq_def by simp

lemma conc_def':
  "A \<cdot> B = {s\<^sub>1 @ s\<^sub>2 | s\<^sub>1 s\<^sub>2. s\<^sub>1 \<in> A \<and> s\<^sub>2 \<in> B}"
unfolding conc_def by simp

lemma conc_Union_left: 
  shows "B \<cdot> (\<Union>n. A \<up> n) = (\<Union>n. B \<cdot> (A \<up> n))"
unfolding conc_def by auto

lemma test:
  assumes X_in_eqs: "(X, rhs) \<in> Init (UNIV // \<approx>A)"
  shows "X = \<Union> (lang_trm `  rhs)"
using assms l_eq_r_in_eqs by (simp)

abbreviation
  notprec ("_ \<^raw:\mbox{$\not\preceq$}>_")
where
 "notprec x y \<equiv> \<not>(x \<preceq> y)"

abbreviation
  minimal_syn ("min\<^bsub>_\<^esub> _")
where
  "minimal_syn A x \<equiv> minimal x A"   


(* THEOREMS *)

notation (Rule output)
  "==>"  ("\<^raw:\mbox{}\inferrule{\mbox{>_\<^raw:}}>\<^raw:{\mbox{>_\<^raw:}}>")

syntax (Rule output)
  "_bigimpl" :: "asms \<Rightarrow> prop \<Rightarrow> prop"
  ("\<^raw:\mbox{}\inferrule{>_\<^raw:}>\<^raw:{\mbox{>_\<^raw:}}>")

  "_asms" :: "prop \<Rightarrow> asms \<Rightarrow> asms" 
  ("\<^raw:\mbox{>_\<^raw:}\\>/ _")

  "_asm" :: "prop \<Rightarrow> asms" ("\<^raw:\mbox{>_\<^raw:}>")

notation (Axiom output)
  "Trueprop"  ("\<^raw:\mbox{}\inferrule{\mbox{}}{\mbox{>_\<^raw:}}>")

notation (IfThen output)
  "==>"  ("\<^raw:{\normalsize{}>If\<^raw:\,}> _/ \<^raw:{\normalsize \,>then\<^raw:\,}>/ _.")
syntax (IfThen output)
  "_bigimpl" :: "asms \<Rightarrow> prop \<Rightarrow> prop"
  ("\<^raw:{\normalsize{}>If\<^raw:\,}> _ /\<^raw:{\normalsize \,>then\<^raw:\,}>/ _.")
  "_asms" :: "prop \<Rightarrow> asms \<Rightarrow> asms" ("\<^raw:\mbox{>_\<^raw:}> /\<^raw:{\normalsize \,>and\<^raw:\,}>/ _")
  "_asm" :: "prop \<Rightarrow> asms" ("\<^raw:\mbox{>_\<^raw:}>")

notation (IfThenNoBox output)
  "==>"  ("\<^raw:{\normalsize{}>If\<^raw:\,}> _/ \<^raw:{\normalsize \,>then\<^raw:\,}>/ _.")
syntax (IfThenNoBox output)
  "_bigimpl" :: "asms \<Rightarrow> prop \<Rightarrow> prop"
  ("\<^raw:{\normalsize{}>If\<^raw:\,}> _ /\<^raw:{\normalsize \,>then\<^raw:\,}>/ _.")
  "_asms" :: "prop \<Rightarrow> asms \<Rightarrow> asms" ("_ /\<^raw:{\normalsize \,>and\<^raw:\,}>/ _")
  "_asm" :: "prop \<Rightarrow> asms" ("_")

lemma pow_length:
  assumes a: "[] \<notin> A"
  and     b: "s \<in> A \<up> Suc n"
  shows "n < length s"
using b
proof (induct n arbitrary: s)
  case 0
  have "s \<in> A \<up> Suc 0" by fact
  with a have "s \<noteq> []" by auto
  then show "0 < length s" by auto
next
  case (Suc n)
  have ih: "\<And>s. s \<in> A \<up> Suc n \<Longrightarrow> n < length s" by fact
  have "s \<in> A \<up> Suc (Suc n)" by fact
  then obtain s1 s2 where eq: "s = s1 @ s2" and *: "s1 \<in> A" and **: "s2 \<in> A \<up> Suc n"
    by (auto simp add: Seq_def)
  from ih ** have "n < length s2" by simp
  moreover have "0 < length s1" using * a by auto
  ultimately show "Suc n < length s" unfolding eq 
    by (simp only: length_append)
qed




(*>*)


section {* Introduction *}

text {*
  \noindent
  Regular languages are an important and well-understood subject in Computer
  Science, with many beautiful theorems and many useful algorithms. There is a
  wide range of textbooks on this subject, many of which are aimed at students
  and contain very detailed `pencil-and-paper' proofs 
  (e.g.~the textbooks by \citeN{HopcroftUllman69} and by \citeN{Kozen97}). 
  It seems natural to exercise theorem provers by
  formalising these theorems and by formally verifying the algorithms.

  A popular choice for a theorem prover would be one based on Higher-Order
  Logic (HOL), for example HOL4, HOLlight or Isabelle/HOL. For the development
  presented in this paper we will use the Isabelle/HOL system. HOL is a predicate calculus
  that allows quantification over predicate variables. Its type system is
  based on the Simple Theory of Types by \citeN{Church40}.  Although many
  mathematical concepts can be conveniently expressed in HOL, there are some
  limitations that hurt when attempting a simple-minded formalisation
  of regular languages in it. 

  The typical approach to
  regular languages, taken for example by \citeN{HopcroftUllman69}
  and by \citeN{Kozen97},  is to introduce finite deterministic automata and then
  define most notions in terms of them.  For example, a regular language is
  normally defined as:

  \begin{definition}\label{baddef}
  A language @{text A} is \emph{regular}, provided there is a 
  finite deterministic automaton that recognises all strings of @{text "A"}.
  \end{definition}

  \noindent  
  This approach has many benefits. Among them is the fact that it is easy to
  convince oneself that regular languages are closed under complementation:
  one just has to exchange the accepting and non-accepting states in the
  corresponding automaton to obtain an automaton for the complement language.
  The problem, however, lies with formalising such reasoning in a 
  theorem prover. Automata are built up from states and transitions that are 
  commonly represented as graphs, matrices or functions, none of which, unfortunately, 
  can be defined as an inductive datatype.

  In the case of graphs and matrices, this means we have to build our own
  reasoning infrastructure for them, as neither Isabelle/HOL nor HOL4 nor
  HOLlight supports them with libraries. Moreover, reasoning about graphs and
  matrices can be a hassle in theorem provers, especially when
  automata need to be combined.  Consider for
  example the operation of sequencing two automata, say $A_1$ and $A_2$, by
  connecting the accepting states of $A_1$ to the initial state of $A_2$:

  \begin{center}
  \begin{tabular}{ccc}
  \begin{tikzpicture}[scale=1]
  %\draw[step=2mm] (-1,-1) grid (1,1);
  
  \draw[rounded corners=1mm, very thick] (-1.0,-0.3) rectangle (-0.2,0.3);
  \draw[rounded corners=1mm, very thick] ( 0.2,-0.3) rectangle ( 1.0,0.3);

  \node (A) at (-1.0,0.0) [circle, very thick, draw, fill=white, inner sep=0.4mm] {};
  \node (B) at ( 0.2,0.0) [circle, very thick, draw, fill=white, inner sep=0.4mm] {};
  
  \node (C) at (-0.2, 0.13) [circle, very thick, draw, fill=white, inner sep=0.4mm] {};
  \node (D) at (-0.2,-0.13) [circle, very thick, draw, fill=white, inner sep=0.4mm] {};

  \node (E) at (1.0, 0.2) [circle, very thick, draw, fill=white, inner sep=0.4mm] {};
  \node (F) at (1.0,-0.0) [circle, very thick, draw, fill=white, inner sep=0.4mm] {};
  \node (G) at (1.0,-0.2) [circle, very thick, draw, fill=white, inner sep=0.4mm] {};

  \draw (-0.6,0.0) node {\small$A_1$};
  \draw ( 0.6,0.0) node {\small$A_2$};
  \end{tikzpicture}

  & 

  \raisebox{2.1mm}{\bf\Large$\;\;\;\Rightarrow\,\;\;$}

  &

  \begin{tikzpicture}[scale=1]
  %\draw[step=2mm] (-1,-1) grid (1,1);
  
  \draw[rounded corners=1mm, very thick] (-1.0,-0.3) rectangle (-0.2,0.3);
  \draw[rounded corners=1mm, very thick] ( 0.2,-0.3) rectangle ( 1.0,0.3);

  \node (A) at (-1.0,0.0) [circle, very thick, draw, fill=white, inner sep=0.4mm] {};
  \node (B) at ( 0.2,0.0) [circle, very thick, draw, fill=white, inner sep=0.4mm] {};
  
  \node (C) at (-0.2, 0.13) [circle, very thick, draw, fill=white, inner sep=0.4mm] {};
  \node (D) at (-0.2,-0.13) [circle, very thick, draw, fill=white, inner sep=0.4mm] {};

  \node (E) at (1.0, 0.2) [circle, very thick, draw, fill=white, inner sep=0.4mm] {};
  \node (F) at (1.0,-0.0) [circle, very thick, draw, fill=white, inner sep=0.4mm] {};
  \node (G) at (1.0,-0.2) [circle, very thick, draw, fill=white, inner sep=0.4mm] {};
  
  \draw (C) to [very thick, bend left=45] (B);
  \draw (D) to [very thick, bend right=45] (B);

  \draw (-0.6,0.0) node {\small$A_1$};
  \draw ( 0.6,0.0) node {\small$A_2$};
  \end{tikzpicture}

  \end{tabular}
  \end{center}

  \noindent
  On `paper' we can define the corresponding 
  graph in terms of the disjoint 
  union of the state nodes. Unfortunately in HOL, the standard definition for disjoint 
  union, namely 
  %
  \begin{equation}\label{disjointunion}
  @{text "A\<^sub>1 \<uplus> A\<^sub>2 \<equiv> {(1, x) | x \<in> A\<^sub>1} \<union> {(2, y) | y \<in> A\<^sub>2}"}
  \end{equation}

  \noindent
  changes the type---the disjoint union is not a set, but a set of
  pairs. Using this definition for disjoint union means we do not have
  a single type for the states of automata. As a result we will not be
  able to define a regular language as one for which there exists an
  automaton that recognises all its strings
  (Definition~\ref{baddef}). This is because we cannot make a
  definition in HOL that is only polymorphic in the state type, but
  not in the predicate for regularity; and there is no type
  quantification available in HOL.\footnote{Slind already pointed out
  this problem in an email to the HOL4 mailing list on 21st April
  2005.} Systems such as Coq permit quantification over types and can thus
  state such a definition.  This has recently been exploited in an
  elegant and short formalisation of the Myhill-Nerode theorem in Coq by
  \citeN{Smolka13}, but, as noted, this possibility is not available to us in Isabelle/HOL.

  An alternative, which provides us with a single type for states of automata,
  is to give every state node an identity, for example a natural number, and
  then be careful to rename these identities apart whenever connecting two
  automata. This results in clunky proofs establishing that properties are
  invariant under renaming. Similarly, connecting two automata represented as
  matrices results in messy constructions, which are not pleasant to
  formally reason about. \citeN[Page 67]{Braibant12}, for example, writes that there are no 
  problems with reasoning about matrices, but that there is an
  ``intrinsic difficulty of working with rectangular matrices'' in some 
  parts of his formalisation of formal languages in Coq.

  Functions are much better supported in Isabelle/HOL, but they still lead to
  problems similar to those with graphs.  Composing, for example, two
  non-deterministic automata in parallel also requires the formalisation of
  disjoint unions. \citeN{Nipkow98} dismisses the option of
  using identities for this purpose, because according to him it leads to ``messy
  proofs''. Since he does not need to define what regular languages are,
  Nipkow opts for a variant of \eqref{disjointunion} using bit lists, but
  writes\smallskip


  \begin{quote}
  \it%
  \begin{tabular}{@ {}l@ {}p{0.88\textwidth}@ {}}
  `` & All lemmas appear obvious given a picture of the composition of automata\ldots
       Yet their proofs require a painful amount of detail.''
  \end{tabular}
  \end{quote}\smallskip

  \noindent
  and\smallskip
  
  \begin{quote}
  \it%
  \begin{tabular}{@ {}l@ {}p{0.88\textwidth}@ {}}
  `` & If the reader finds the above treatment in terms of bit lists revoltingly
       concrete, I cannot disagree. A more abstract approach is clearly desirable.''
  \end{tabular}
  \end{quote}\smallskip

  %\noindent
  %Moreover, it is not so clear how to conveniently impose a finiteness
  %condition upon functions in order to represent \emph{finite} automata. The
  %best is probably to resort to more advanced reasoning frameworks, such as
  %\emph{locales} or \emph{type classes}, which are \emph{not} available in all
  %HOL-based theorem provers.

  Because of these problems with representing automata, formalising 
  automata theory is not as easy as one might think, despite the 
  sometimes very detailed, but informal, textbook proofs. \citeN{LammichTuerk12}
  formalised Hopcroft's algorithm using an automata library of 14 kloc in
  Isabelle/HOL. There they use matrices for representing automata. 
  Functions are used by \citeN{Nipkow98}, who establishes
  the link between regular expressions and automata in the context of
  lexing. \citeN{BerghoferReiter09} use functions as well for formalising automata
  working over bit strings in the context of Presburger arithmetic.  
  A larger formalisation of automata theory, including the Myhill-Nerode theorem, 
  was carried out in Nuprl by \citeN{Constable00} which also uses functions.
  Other large formalisations of automata theory in Coq are by 
  \citeN{Filliatre97} who essentially uses graphs and by \citeN{Almeidaetal10}
  and \citeN{Braibant12}, who both use matrices. Many of these works,
  like \citeN{Nipkow98} or \citeN{Braibant12}, mention intrinsic problems with 
  their representation of
  automata which made them `fight' their respective  theorem prover.

  %Also, one might consider automata as just convenient `vehicles' for
  %establishing properties about regular languages.  However, paper proofs
  %about automata often involve subtle side-conditions which are easily
  %overlooked, but which make formal reasoning rather painful. For example
  %Kozen's proof of the Myhill-Nerode Theorem requires that automata do not
  %have inaccessible states \cite{Kozen97}. Another subtle side-condition is
  %completeness of automata, that is automata need to have total transition
  %functions and at most one `sink' state from which there is no connection to
  %a final state (Brzozowski mentions this side-condition in the context of
  %state complexity of automata \cite{Brzozowski10}, but it is also needed
  %in the usual construction of the complement automaton). Such side-conditions mean
  %that if we define a regular language as one for which there exists \emph{a}
  %finite automaton that recognises all its strings (see
  %Definition~\ref{baddef}), then we need a lemma which ensures that another
  %equivalent one can be found satisfying the side-condition, and also need to
  %make sure our operations on automata preserve them. Unfortunately, such
  %`little' and `obvious' lemmas make formalisations of automata theory 
  %hair-pulling experiences.

  In this paper, we will not attempt to formalise automata theory in
  Isabelle/HOL nor will we attempt to formalise automata proofs from the
  literature, but take a different approach to regular languages than is
  usually taken. Instead of defining a regular language as one where there
  exists an automaton that recognises all its strings, we define a
  regular language as:

  \begin{definition}\label{regular}
  A language @{text A} is \emph{regular}, provided there is a regular expression 
  that matches all strings of @{text "A"}.
  \end{definition}
  
  \noindent
  We then want to `forget' automata completely and see how far we can get
  with formalising results from regular language theory by using only regular 
  expressions.  The reason 
  is that regular expressions, unlike graphs, matrices and
  functions, can be defined as an inductive datatype and a reasoning
  infrastructure for them (like induction and recursion) comes for
  free in HOL. 

  While our choice of using regular expressions is motivated by
  shortcomings in theorem provers, the question of whether formal
  language theory can be developed without automata also crops up
  outside of theorem proving. For example, Gasarch asked in the
  Computational Complexity blog by \citeN{GasarchBlog} whether the
  closure of regular-expression languages under complementation can be proved
  without using automata. He concluded

  \begin{quote}
  ``{\it \ldots it can't be done}''
  \end{quote} 

  \noindent
  and even pondered

  \begin{quote}
  ``{\it \ldots [b]ut is there a rigorous way to even state that?}'' 
  \end{quote} 

  \noindent
  We shall give an answer to these questions with our paper.

  %Moreover, no side-conditions will be needed for regular expressions,
  %like we need for automata. 
 

  The convenience of regular expressions has
  recently been exploited in HOL4 with a formalisation of regular expression
  matching based on derivatives by \citeN{OwensSlind08}, and with equivalence
  checkers for regular expressions in Isabelle/HOL by \citeN{KraussNipkow11},
  in Matita by \citeN{Asperti12} and in Coq by \citeN{CoquandSiles12}.  The
  main purpose of this paper is to show that a central result about regular
  languages---the Myhill-Nerode Theorem---can be recreated by only using
  regular expressions. This theorem gives necessary and sufficient conditions
  for when a language is regular. As a corollary of this theorem we can easily
  establish the usual closure properties, including complementation, for
  regular languages. We use the Continuation Lemma, which is also a corollary 
  of the Myhill-Nerode Theorem, for establishing 
  the non-regularity of the language @{text "a\<^sup>nb\<^sup>n"}.\medskip 
  
  \noindent 
  {\bf Contributions:} There is an extensive literature on regular languages.
  To the best of our knowledge, our proof of the Myhill-Nerode Theorem is the first
  that is based solely on regular expressions. The part of this theorem stating
  that finitely many equivalence classes imply regularity of the language is proved by
  an argument about solving equational systems.  This argument requires a
  `reversed' version of Arden's Lemma. For the other part, we give two proofs: one direct proof using
  certain tagging-functions, and another indirect proof using Antimirov's
  partial derivatives (\citeyear{Antimirov95}). Again to the best of our knowledge, the
  tagging-functions have not been used before for establishing the Myhill-Nerode
  Theorem. Derivatives of regular expressions have recently been used quite
  widely in the literature; partial derivatives, in contrast, have attracted much
  less attention. However, partial derivatives are more suitable in the
  context of the Myhill-Nerode Theorem, since their finiteness result is easier to
  establish formally. We are not aware of any previous proof that uses
  either notion for proving the Myhill-Nerode Theorem.
*}

section {* Preliminaries *}

text {*
  \noindent
  Strings in Isabelle/HOL are lists of characters with the \emph{empty string}
  being represented by the empty list, written @{term "[]"}.  We assume there
  are only finitely many different characters.  \emph{Languages} are sets of
  strings. The language containing all strings is written in Isabelle/HOL as
  @{term "UNIV::string set"}. The concatenation of two languages is written
  @{term "A \<cdot> B"} and a language raised to the power @{text n} is written
  @{term "A \<up> n"}. They are defined as usual

  \begin{center}
  \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
  @{thm (lhs) conc_def'[THEN eq_reflection, where A1="A" and B1="B"]}
  & @{text "\<equiv>"} & @{thm (rhs) conc_def'[THEN eq_reflection, where A1="A" and B1="B"]}\\

  @{thm (lhs) lang_pow.simps(1)[THEN eq_reflection, where A1="A"]}
  & @{text "\<equiv>"} & @{thm (rhs) lang_pow.simps(1)[THEN eq_reflection, where A1="A"]}\\

  @{thm (lhs) lang_pow.simps(2)[THEN eq_reflection, where A1="A" and n1="n"]}
  & @{text "\<equiv>"} & @{thm (rhs) lang_pow.simps(2)[THEN eq_reflection, where A1="A" and n1="n"]}
  \end{tabular}
  \end{center}

  \noindent
  where @{text "@"} is the list-append operation. For example, with
  @{text "A = {[a]}"} and @{text "B = {[], [b]}"} these definitions give
  @{text "A \<cdot> B = {[a], [a, b]}"} and @{text "A \<up> 2 = {[a, a]}"}.
  The Kleene-star of a language @{text A}
  is defined as the union over all powers, namely @{thm star_def[where A="A", THEN eq_reflection]}. 
  In the paper
  we will make use of the following properties of these constructions.
  
  \begin{proposition}\label{langprops}\mbox{}\\
  \begin{tabular}{@ {}lp{10cm}}
  (i)   & @{thm star_unfold_left}     \\ 
  (ii)  & @{thm[mode=IfThen] pow_length}\\
  (iii) & @{thm conc_Union_left} \\ 
  (iv)  & If @{thm (prem 1) star_decom} and @{thm (prem 2) star_decom} then
          there exists an @{text "x\<^sub>p"} and @{text "x\<^sub>s"} with @{text "x = x\<^sub>p @ x\<^sub>s"} 
          and \mbox{@{term "x\<^sub>p \<noteq> []"}} such that @{term "x\<^sub>p \<in> A"} and @{term "x\<^sub>s \<in> A\<star>"}.
  \end{tabular}
  \end{proposition}

  \noindent
  In @{text "(ii)"} we use the notation @{term "length s"} for the length of a
  string; this property states that if \mbox{@{term "[] \<notin> A"}} then the lengths of
  the strings in @{term "A \<up> (Suc n)"} must be longer than @{text n}.  
  Property @{text "(iv)"} states that a non-empty string in @{term "A\<star>"} can
  always be split up into a non-empty prefix belonging to @{text "A"} and the
  rest being in @{term "A\<star>"}. We omit
  the proofs for these properties, but invite the reader to consult our
  formalisation.\footnote{Available under \citeN{myhillnerodeafp11} in the Archive of Formal Proofs at\\ 
  \url{http://afp.sourceforge.net/entries/Myhill-Nerode.shtml}.}

  The notation in Isabelle/HOL for the quotient of a language @{text A}
  according to an equivalence relation @{term REL} is @{term "A // REL"}. We
  will write @{text "\<lbrakk>x\<rbrakk>\<^sub>\<approx>"} for the equivalence class defined as
  \mbox{@{text "{y | y \<approx> x}"}}, and have @{text "x \<approx> y"} if and only if @{text
  "\<lbrakk>x\<rbrakk>\<^sub>\<approx> = \<lbrakk>y\<rbrakk>\<^sub>\<approx>"}.


  Central to our proof will be the solution of equational systems
  involving equivalence classes of languages. For this we will use Arden's Lemma 
  (see for example \citet[Page 100]{Sakarovitch09}),
  which solves equations of the form @{term "X = A \<cdot> X \<union> B"} provided
  @{term "[] \<notin> A"}. However we will need the following `reversed' 
  version of Arden's Lemma (`reversed' in the sense of changing the order of @{term "A \<cdot> X"} to
  \mbox{@{term "X \<cdot> A"}}).
  %\footnote{The details of the proof for the reversed Arden's Lemma
  %are given in the Appendix.}

  \begin{lemma}[Reversed Arden's Lemma]\label{arden}\mbox{}\\
  If @{thm (prem 1) reversed_Arden} then
  @{thm (lhs) reversed_Arden} if and only if
  @{thm (rhs) reversed_Arden}.
  \end{lemma}
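
  \noindent
  To see the lemma in action, consider the languages @{text "A = {[a]}"} and
  @{text "B = {[b]}"}: the unique solution of \mbox{@{text "X = X \<cdot> A \<union> B"}} is
  @{text "X = B \<cdot> A\<^sup>\<star>"}, that is, the language of all strings consisting of a
  @{text b} followed by any number of @{text a}'s.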

  \noindent
  The proof of this reversed version of Arden's Lemma follows the proof of the
  original version: the interesting direction is the inclusion of @{text X} in
  \mbox{@{text "B \<cdot> A\<^sup>\<star>"}}, which can be proved by induction on the length of
  the strings in @{text X}, where the assumption \mbox{@{term "[] \<notin> A"}} ensures
  that the length strictly decreases in each step.

  Regular expressions are defined as the inductive datatype

  \begin{center}
  \begin{tabular}{rcl}
  @{text r} & @{text "::="} & @{term ZERO}\\
   & @{text"|"} & @{term One}\\ 
   & @{text"|"} & @{term "Atom c"}\\
   & @{text"|"} & @{term "Times r r"}\\
   & @{text"|"} & @{term "Plus r r"}\\
   & @{text"|"} & @{term "Star r"}
  \end{tabular}
  \end{center}

  \noindent
  and the language matched by a regular expression is defined by recursion as

  \begin{center}
  \begin{tabular}{r@ {\hspace{2mm}}c@ {\hspace{2mm}}l}
  @{thm (lhs) lang.simps(1)} & @{text "\<equiv>"} & @{thm (rhs) lang.simps(1)}\\
  @{thm (lhs) lang.simps(2)} & @{text "\<equiv>"} & @{thm (rhs) lang.simps(2)}\\
  @{thm (lhs) lang.simps(3)[where a="c"]} & @{text "\<equiv>"} & @{thm (rhs) lang.simps(3)[where a="c"]}\\
  @{thm (lhs) lang.simps(4)[where ?r="r\<^sub>1" and ?s="r\<^sub>2"]} & @{text "\<equiv>"} &
       @{thm (rhs) lang.simps(4)[where ?r="r\<^sub>1" and ?s="r\<^sub>2"]}\\
  @{thm (lhs) lang.simps(5)[where ?r="r\<^sub>1" and ?s="r\<^sub>2"]} & @{text "\<equiv>"} &
       @{thm (rhs) lang.simps(5)[where ?r="r\<^sub>1" and ?s="r\<^sub>2"]}\\
  @{thm (lhs) lang.simps(6)[where r="r"]} & @{text "\<equiv>"} &
      @{thm (rhs) lang.simps(6)[where r="r"]}\\
  \end{tabular}
  \end{center}
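
  \noindent
  For example, the regular expression @{text "TIMES (ATOM a) (STAR (ATOM b))"}
  matches the language @{text "{[a]} \<cdot> {[b]}\<^sup>\<star>"}, that is, all strings
  consisting of one @{text a} followed by any number of @{text b}'s.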

  Given a finite set of regular expressions @{text rs}, we will make use of the operation of generating 
  a regular expression that matches the union of all languages of @{text rs}. 
  This definition is not trivial in a theorem prover, because @{text rs} (being a set) is unordered, 
  but the regular expression needs an order. Since 
  we only need to know the 
  existence
  of such a regular expression, we can use Isabelle/HOL's @{const "fold_graph"} and Hilbert's
  choice operator, written @{text "SOME"}, for defining @{term "\<Uplus>rs"}. 
  This operation, roughly speaking, folds @{const PLUS} over the 
  set @{text rs} with @{const ZERO} for the empty set. We can prove that for a finite set @{text rs}
  %
  \begin{equation}\label{uplus}
  \mbox{@{thm (lhs) folds_plus_simp} @{text "= \<Union> (\<calL> ` rs)"}}
  \end{equation}

  \noindent
  holds, whereby @{text "\<calL> ` rs"} stands for the 
  image of the set @{text rs} under function @{text "\<calL>"} defined as

  \begin{center}
  @{term "lang ` rs \<equiv> {lang r | r. r \<in> rs}"}
  \end{center}

  \noindent
  In what follows we shall use this convenient short-hand notation for images of sets 
  also with other functions.
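
  For illustration, here is a minimal sketch of how the constant behind
  @{term "\<Uplus>rs"} can be defined (the name @{text Setalt_sketch} is ours, and
  the actual definition in our formalisation differs in details):

  \begin{center}
  \begin{tabular}{l}
  \texttt{definition Setalt\_sketch ::\,"'a rexp set $\Rightarrow$ 'a rexp"}\\
  \texttt{where "Setalt\_sketch rs $\equiv$ SOME r. fold\_graph Plus Zero rs r"}
  \end{tabular}
  \end{center}

  \noindent
  For a finite set @{text rs}, @{const "fold_graph"} guarantees that such an
  @{text r} exists, and \eqref{uplus} then holds for any choice made by
  @{text SOME}.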
*}

section {* The Myhill-Nerode Theorem, First Part *}

text {*
  \noindent
  The key definition in the Myhill-Nerode Theorem is the
  \emph{Myhill-Nerode Relation}, which states that  two 
  strings are related w.r.t.~a language, provided there is no distinguishing extension in this
  language. This can be defined as a ternary relation.

  \begin{definition}[Myhill-Nerode Relation]\label{myhillneroderel} 
  Given a language @{text A}, two strings @{text x} and
  @{text y} are Myhill-Nerode related provided
  \begin{center}
  @{thm str_eq_def'}
  \end{center}
  \end{definition}

  \noindent
  It is easy to see that @{term "\<approx>A"} is an equivalence relation, which
  partitions the set of all strings, @{text "UNIV"}, into a set of disjoint
  equivalence classes. To illustrate this quotient construction, let us give a simple 
  example: consider the regular language built up over the alphabet @{term "{a, b}"} and 
  containing just the two strings @{text "[a]"} and @{text "[a, b]"}. The 
  relation @{term "\<approx>({[a], [a, b]})"} partitions @{text UNIV}
  into four equivalence classes @{text "X\<^sub>1"}, @{text "X\<^sub>2"}, @{text "X\<^sub>3"} and @{text "X\<^sub>4"}
  as follows
  
  \begin{center}
  \begin{tabular}{l}
  @{text "X\<^sub>1 = {[]}"}\\
  @{text "X\<^sub>2 = {[a]}"}\\
  @{text "X\<^sub>3 = {[a, b]}"}\\
  @{text "X\<^sub>4 = UNIV - {[], [a], [a, b]}"}
  \end{tabular}
  \end{center}

  One direction of the Myhill-Nerode Theorem establishes 
  that if there are finitely many equivalence classes, as in the example above, then 
  the language is regular. In our setting we therefore have to show:
  
  \begin{theorem}\label{myhillnerodeone}
  @{thm[mode=IfThen] Myhill_Nerode1}
  \end{theorem}

  \noindent
  To prove this theorem, we first define the set @{term "finals A"} as those equivalence
  classes from \mbox{@{term "UNIV // \<approx>A"}} that contain strings of @{text A}, namely
  %
  \begin{equation} 
  @{thm finals_def}
  \end{equation}

  \noindent
  In our running example, @{text "X\<^sub>2"} and @{text "X\<^sub>3"} are the only 
  equivalence classes in @{term "finals {[a], [a, b]}"}.
  It is straightforward to show that in general 
  %
  \begin{equation}\label{finalprops}
  @{thm lang_is_union_of_finals}\hspace{15mm} 
  @{thm finals_in_partitions} 
  \end{equation}

  \noindent
  hold. 
  Therefore if we know that there exists a regular expression for every
  equivalence class in \mbox{@{term "finals A"}} (which by assumption must be
  a finite set), then we can use @{text "\<bigplus>"} to obtain a regular expression 
  that matches every string in @{text A}.


  Our proof of Theorem~\ref{myhillnerodeone} relies on a method that can calculate a
  regular expression for \emph{every} equivalence class, not just the ones 
  in @{term "finals A"}. We
  first define the notion of \emph{one-character-transition} between 
  two equivalence classes
  %
  \begin{equation} 
  @{thm transition_def}
  \end{equation}

  \noindent
  which means that if we append the character @{text c} to the end of all 
  strings in the equivalence class @{text Y}, we obtain a subset of 
  @{text X}. 
  %Note that we do not define formally an automaton here, 
  %we merely relate two sets
  %(with the help of a character). 
  In our concrete example we have

  \begin{center} 
  \begin{tabular}{l}
  @{term "X\<^sub>1 \<Turnstile>a\<Rightarrow> X\<^sub>2"},\; @{term "X\<^sub>1 \<Turnstile>b\<Rightarrow> X\<^sub>4"};\\ 
  @{term "X\<^sub>2 \<Turnstile>b\<Rightarrow> X\<^sub>3"},\; @{term "X\<^sub>2 \<Turnstile>a\<Rightarrow> X\<^sub>4"};\\
  @{term "X\<^sub>3 \<Turnstile>a\<Rightarrow> X\<^sub>4"},\; @{term "X\<^sub>3 \<Turnstile>b\<Rightarrow> X\<^sub>4"} and\\ 
  @{term "X\<^sub>4 \<Turnstile>a\<Rightarrow> X\<^sub>4"},\; @{term "X\<^sub>4 \<Turnstile>b\<Rightarrow> X\<^sub>4"}.
  \end{tabular}
  \end{center}
  
  Next we construct an \emph{initial equational system} that
  contains an equation for each equivalence class. We first give
  an informal description of this construction. Suppose we have 
  the equivalence classes @{text "X\<^sub>1,\<dots>,X\<^sub>n"}; exactly one of them must 
  contain the empty string @{text "[]"} (since the equivalence classes partition the set of all strings).
  Let us assume @{text "[] \<in> X\<^sub>1"}. We build the following initial equational system
  
  \begin{center}
  \begin{tabular}{rcl}
  @{text "X\<^sub>1"} & @{text "="} & @{text "(Y\<^sub>1\<^sub>1, ATOM c\<^sub>1\<^sub>1) + \<dots> + (Y\<^sub>1\<^sub>p, ATOM c\<^sub>1\<^sub>p) + \<lambda>(ONE)"} \\
  @{text "X\<^sub>2"} & @{text "="} & @{text "(Y\<^sub>2\<^sub>1, ATOM c\<^sub>2\<^sub>1) + \<dots> + (Y\<^sub>2\<^sub>o, ATOM c\<^sub>2\<^sub>o)"} \\
  & $\vdots$ \\
  @{text "X\<^sub>n"} & @{text "="} & @{text "(Y\<^sub>n\<^sub>1, ATOM c\<^sub>n\<^sub>1) + \<dots> + (Y\<^sub>n\<^sub>q, ATOM c\<^sub>n\<^sub>q)"}\\
  \end{tabular}
  \end{center}

  \noindent
  where the terms @{text "(Y\<^sub>i\<^sub>j, ATOM c\<^sub>i\<^sub>j)"} are pairs consisting of an equivalence class and
  a regular expression. In the initial equational system, they
  stand for all transitions @{term "Y\<^sub>i\<^sub>j \<Turnstile>c\<^sub>i\<^sub>j\<Rightarrow>
  X\<^sub>i"}. 
  %The intuition behind the equational system is that every 
  %equation @{text "X\<^sub>i = rhs\<^sub>i"} in this system
  %corresponds roughly to a state of an automaton whose name is @{text X\<^sub>i} and its predecessor states 
  %are the @{text "Y\<^sub>i\<^sub>j"}; the @{text "c\<^sub>i\<^sub>j"} are the labels of the transitions from these 
  %predecessor states to @{text X\<^sub>i}. 
  There can only be
  finitely many terms of the form @{text "(Y\<^sub>i\<^sub>j, ATOM c\<^sub>i\<^sub>j)"} in a right-hand side 
  since by assumption there are only finitely many
  equivalence classes and only finitely many characters.
  The term @{text "\<lambda>(ONE)"} in the first equation acts as a marker for the initial state, that
  is the equivalence class
  containing the empty string @{text "[]"}.
  In our running example we have the initial equational system
  %
  \begin{equation}\label{exmpcs}
  \mbox{\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
  @{term "X\<^sub>1"} & @{text "="} & @{text "\<lambda>(ONE)"}\\
  @{term "X\<^sub>2"} & @{text "="} & @{text "(X\<^sub>1, ATOM a)"}\\
  @{term "X\<^sub>3"} & @{text "="} & @{text "(X\<^sub>2, ATOM b)"}\\
  @{term "X\<^sub>4"} & @{text "="} & @{text "(X\<^sub>1, ATOM b) + (X\<^sub>2, ATOM a) + (X\<^sub>3, ATOM a)"}\\
          & & \mbox{}\hspace{0mm}@{text "+ (X\<^sub>3, ATOM b) + (X\<^sub>4, ATOM a) + (X\<^sub>4, ATOM b)"}\\
  \end{tabular}}
  \end{equation}

  Note that we mark, roughly speaking, the
  single `initial' state in the equational system, which is different from
  the method by \citeN{Brzozowski64}, where he marks the
  `terminal' states. We are forced to set up the equational system in our
  way, because the Myhill-Nerode Relation determines the `direction' of the
  transitions---the successor `state' of an equivalence class @{text Y} can
  be reached by adding a character to the end of @{text Y}. If we had defined
  our equations the `Brzozowski-way', then our variables would not correspond to 
  the equivalence classes generated by the Myhill-Nerode Relation. This reversed marking is also the
  reason why we have to use our reversed version of Arden's Lemma.

  Overloading the function @{text \<calL>} for the two kinds of terms in the
  equational system, we have
  
  \begin{center}
  @{text "\<calL>(Y, r) \<equiv>"} %
  @{thm (rhs) lang_trm.simps(2)[where X="Y" and r="r", THEN eq_reflection]}\hspace{10mm}
  @{thm lang_trm.simps(1)[where r="r", THEN eq_reflection]}
  \end{center}

  \noindent
  and we can prove for @{text "X\<^sub>2\<^sub>.\<^sub>.\<^sub>n"} that the following equations
  %
  \begin{equation}\label{inv1}
  @{text "X\<^sub>i = \<calL>(Y\<^sub>i\<^sub>1, ATOM c\<^sub>i\<^sub>1) \<union> \<dots> \<union> \<calL>(Y\<^sub>i\<^sub>q, ATOM c\<^sub>i\<^sub>q)"}.
  \end{equation}

  \noindent
  hold. Similarly for @{text "X\<^sub>1"} we can show the following equation
  %
  \begin{equation}\label{inv2}
  @{text "X\<^sub>1 = \<calL>(Y\<^sub>1\<^sub>1, ATOM c\<^sub>1\<^sub>1) \<union> \<dots> \<union> \<calL>(Y\<^sub>1\<^sub>p, ATOM c\<^sub>1\<^sub>p) \<union> \<calL>(\<lambda>(ONE))"}.
  \end{equation}

  \noindent
  holds. Again note that the reason for adding the @{text \<lambda>}-marker to our initial equational system is 
  to obtain this equation: it only holds with the marker, since none of 
  the other terms contain the empty string. 
  The point of the initial equational system is
  that solving it means we will be able to extract a regular expression for every equivalence class
  generated by the Myhill-Nerode relation. 

  In Isabelle/HOL we represent the equations as pairs,
  where the first component is an equivalence class (a set of strings)
  and the second component
  is a set of terms. Given a set of equivalence
  classes @{text CS}, our initial equational system @{term "Init CS"} is thus 
  formally defined as
  %
  \begin{equation}\label{initcs}
  \mbox{\begin{tabular}{rcl}     
  @{thm (lhs) Init_rhs_def} & @{text "\<equiv>"} & 
  @{text "if"}~@{term "[] \<in> X"}\\
  & & @{text "then"}~@{term "{Trn Y (ATOM c) | Y c. Y \<in> CS \<and> Y \<Turnstile>c\<Rightarrow> X} \<union> {Lam ONE}"}\\
  & & @{text "else"}~@{term "{Trn Y (ATOM c)| Y c. Y \<in> CS \<and> Y \<Turnstile>c\<Rightarrow> X}"}\\
  @{thm (lhs) Init_def}     & @{text "\<equiv>"} & @{thm (rhs) Init_def}
  \end{tabular}}
  \end{equation}

  \noindent
  Because we use sets of terms 
  for representing the right-hand sides of equations, we can 
  prove \eqref{inv1} and \eqref{inv2} more concisely as
  %
  \begin{lemma}\label{inv}
  If @{thm (prem 1) test} then @{text "X = \<Union> \<calL> ` rhs"}.
  \end{lemma}

  \noindent
  Our proof of Theorem~\ref{myhillnerodeone} will proceed by transforming the
  initial equational system into one in \emph{solved form} maintaining the invariant
  in Lemma~\ref{inv}. From the solved form we will be able to read
  off the regular expressions. 

  In order to transform an equational system into solved form, we have two 
  operations: one that takes an equation of the form @{text "X = rhs"} and removes
  any recursive occurrences of @{text X} in the @{text rhs} using our variant of Arden's 
  Lemma. The other operation takes an equation @{text "X = rhs"}
  and substitutes its right-hand side for @{text X} throughout the rest of the equational system,
  adjusting the remaining regular expressions appropriately. To define this adjustment 
  we define the \emph{append-operation} taking a term and a regular expression as argument

  \begin{center}
  \begin{tabular}{r@ {\hspace{2mm}}c@ {\hspace{2mm}}l}
  @{thm (lhs) Append_rexp.simps(2)[where X="Y" and r="r\<^sub>1" and rexp="r\<^sub>2", THEN eq_reflection]}
  & @{text "\<equiv>"} & 
  @{thm (rhs) Append_rexp.simps(2)[where X="Y" and r="r\<^sub>1" and rexp="r\<^sub>2", THEN eq_reflection]}\\
  @{thm (lhs) Append_rexp.simps(1)[where r="r\<^sub>1" and rexp="r\<^sub>2", THEN eq_reflection]}
  & @{text "\<equiv>"} & 
  @{thm (rhs) Append_rexp.simps(1)[where r="r\<^sub>1" and rexp="r\<^sub>2", THEN eq_reflection]}
  \end{tabular}
  \end{center}

  \noindent
  We lift this operation to entire right-hand sides of equations, written as
  @{thm (lhs) Append_rexp_rhs_def[where rexp="r"]}. With this we can define
  the \emph{arden-operation} for an equation of the form @{text "X = rhs"} as:
  %
  \begin{equation}\label{arden_def}
  \mbox{\begin{tabular}{rc@ {\hspace{2mm}}r@ {\hspace{1mm}}l}
  @{thm (lhs) Arden_def} & @{text "\<equiv>"}~~\mbox{} & \multicolumn{2}{@ {\hspace{-2mm}}l}{@{text "let"}}\\ 
   & & @{text "rhs' ="} & @{term "rhs - {Trn X r | r. Trn X r \<in> rhs}"} \\
   & & @{text "r' ="}   & @{term "Star (\<Uplus> {r. Trn X r \<in> rhs})"}\\
   & &  \multicolumn{2}{@ {\hspace{-2mm}}l}{@{text "in"}~~@{term "Append_rexp_rhs rhs' r'"}}\\ 
  \end{tabular}}
  \end{equation}

  \noindent
  In this definition, we first delete all terms of the form @{text "(X, r)"} from @{text rhs};
  then we combine all regular expressions @{text r} coming 
  from the deleted terms @{text "(X, r)"} into a single regular expression and take the @{const Star} of it;
  finally we append this regular expression to @{text rhs'}. If we apply this
  operation to the right-hand side of @{text "X\<^sub>4"} in \eqref{exmpcs}, we obtain
  the equation:

  \begin{center}
  \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
  @{term "X\<^sub>4"} & @{text "="} & 
    @{text "(X\<^sub>1, TIMES (ATOM b) (STAR \<^raw:\ensuremath{\bigplus}>{ATOM a, ATOM b})) +"}\\
  & & @{text "(X\<^sub>2, TIMES (ATOM a) (STAR \<^raw:\ensuremath{\bigplus}>{ATOM a, ATOM b})) +"}\\
  & & @{text "(X\<^sub>3, TIMES (ATOM a) (STAR \<^raw:\ensuremath{\bigplus}>{ATOM a, ATOM b})) +"}\\
  & & @{text "(X\<^sub>3, TIMES (ATOM b) (STAR \<^raw:\ensuremath{\bigplus}>{ATOM a, ATOM b}))"}
  \end{tabular}
  \end{center}


  \noindent
  That means we eliminated the recursive occurrence of @{text "X\<^sub>4"} on the
  right-hand side. 

  It can be easily seen that the @{text "Arden"}-operation mimics Arden's
  Lemma on the level of equations. To ensure the non-emptiness condition of
  Arden's Lemma we say that a right-hand side is @{text ardenable} provided

  \begin{center}
  @{thm ardenable_def}
  \end{center}

  \noindent
  This allows us to prove a version of Arden's Lemma on the level of equations.

  \begin{lemma}\label{ardenable}
  Given an equation @{text "X = rhs"}.
  If @{text "X = \<Union>\<calL> ` rhs"},
  @{thm (prem 2) Arden_preserves_soundness}, and
  @{thm (prem 3) Arden_preserves_soundness}, then
  @{text "X = \<Union>\<calL> ` (Arden X rhs)"}.
  \end{lemma}
  
  \noindent
  Our @{text ardenable} condition is slightly stronger than needed for applying Arden's Lemma,
  but we can still ensure that it holds throughout our algorithm for transforming equations
  into solved form. 

  The \emph{substitution-operation} takes an equation
  of the form @{text "X = xrhs"} and substitutes it into the right-hand side @{text rhs}.

  \begin{center}
  \begin{tabular}{rc@ {\hspace{2mm}}r@ {\hspace{1mm}}l}
  @{thm (lhs) Subst_def} & @{text "\<equiv>"}~~\mbox{} & \multicolumn{2}{@ {\hspace{-2mm}}l}{@{text "let"}}\\ 
   & & @{text "rhs' ="} & @{term "rhs - {Trn X r | r. Trn X r \<in> rhs}"} \\
   & & @{text "r' ="}   & @{term "\<Uplus> {r. Trn X r \<in> rhs}"}\\
   & &  \multicolumn{2}{@ {\hspace{-2mm}}l}{@{text "in"}~~@{term "rhs' \<union> Append_rexp_rhs xrhs r'"}}\\ 
  \end{tabular}
  \end{center}

  \noindent
  Again, we first delete all occurrences of @{text "(X, r)"} in @{text rhs}; we then calculate
  the regular expression corresponding to the deleted terms; finally we append this
  regular expression to @{text "xrhs"} and take the union with @{text rhs'}. When we use
  the substitution operation we will arrange it so that @{text "xrhs"} does not contain
  any occurrence of @{text X}. For example substituting the first equation in
  \eqref{exmpcs} into the right-hand side of the second, thus eliminating the equivalence 
  class @{text "X\<^sub>1"}, gives us the equation
  %
  \begin{equation}\label{exmpresult}
  \mbox{\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
  @{term "X\<^sub>2"} & @{text "="} & @{text "\<lambda>(TIMES ONE (ATOM a))"}
  \end{tabular}}
  \end{equation}

  With these two operations in place, we can define the operation that removes one equation
  from an equational system @{text ES}. The operation @{const Subst_all}
  substitutes an equation @{text "X = xrhs"} throughout an equational system @{text ES}; 
  @{const Remove} then completely removes such an equation from @{text ES} by substituting 
  it into the rest of the equational system, but first eliminating all recursive occurrences
  of @{text X} by applying @{const Arden} to @{text "xrhs"}.

  \begin{center}
  \begin{tabular}{rcl}
  @{thm (lhs) Subst_all_def} & @{text "\<equiv>"} & @{thm (rhs) Subst_all_def}\\
  @{thm (lhs) Remove_def}    & @{text "\<equiv>"} & @{thm (rhs) Remove_def}
  \end{tabular}
  \end{center}

  \noindent
  Finally, we can define how an equational system should be solved. For this 
  we will need to iterate the process of eliminating equations until only one equation
  is left in the system. However, we do not want just any equation
  to be the last one, but the one involving the equivalence class for 
  which we want to calculate the regular 
  expression. Let us suppose this equivalence class is @{text X}. 
  Since @{text X} is the one to be solved, in every iteration step we have to pick an
  equation to be eliminated whose left-hand side is different from @{text X}. In this way 
  @{text X} is kept until the final step. The choice is implemented using Hilbert's choice 
  operator, written @{text SOME} in the definition below.
  
  \begin{center}
  \begin{tabular}{rc@ {\hspace{4mm}}r@ {\hspace{1mm}}l}
  @{thm (lhs) Iter_def} & @{text "\<equiv>"}~~\mbox{} & \multicolumn{2}{@ {\hspace{-4mm}}l}{@{text "let"}}\\ 
   & & @{text "(Y, yrhs) ="} & @{term "SOME (Y, yrhs). (Y, yrhs) \<in> ES \<and> X \<noteq> Y"} \\
   & &  \multicolumn{2}{@ {\hspace{-4mm}}l}{@{text "in"}~~@{term "Remove ES Y yrhs"}}\\ 
  \end{tabular}
  \end{center}

  \noindent
  The last definition we need applies @{term Iter} over and over until the condition 
  @{text Cond} is \emph{not} satisfied anymore. This condition states that there
  is more than one equation left in the equational system @{text ES}. To solve
  an equational system we use Isabelle/HOL's @{text while}-operator as follows:
  
  \begin{center}
  @{thm Solve_def}
  \end{center}

  \noindent
  We are not concerned here with the definition of this operator (see
  \cite{BerghoferNipkow00} for example), but note that we
  eliminate in each @{const Iter}-step a single equation, and therefore have a
  well-founded termination order by taking the cardinality of the equational
  system @{text ES}. This enables us to prove properties about our definition
  of @{const Solve} when we `call' it with the equivalence class @{text X} and
  the initial equational system @{term "Init (UNIV // \<approx>A)"} from
  \eqref{initcs} using the principle:
  %
  \begin{equation}\label{whileprinciple}
  \mbox{\begin{tabular}{l}
  @{term "invariant (Init (UNIV // \<approx>A))"} \\
  @{term "\<forall>ES. invariant ES \<and> Cond ES \<longrightarrow> invariant (Iter X ES)"}\\
  @{term "\<forall>ES. invariant ES \<and> Cond ES \<longrightarrow> card (Iter X ES) < card ES"}\\
  @{term "\<forall>ES. invariant ES \<and> \<not> Cond ES \<longrightarrow> P ES"}\\
  \hline
  \multicolumn{1}{c}{@{term "P (Solve X (Init (UNIV // \<approx>A)))"}}
  \end{tabular}}
  \end{equation}

  \noindent
  This principle states that given an invariant (which we will specify below) 
  we can prove a property
  @{text "P"} involving @{const Solve}. For this we have to discharge the following
  proof obligations: first the
  initial equational system satisfies the invariant; second the iteration
  step @{text "Iter"} preserves the invariant as long as the condition @{term Cond} holds;
  third @{text "Iter"} decreases the termination order, and fourth that
  once the condition does not hold anymore then the property @{text P} must hold.
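
  This principle is essentially an instance of the rule @{text while_rule}
  provided by Isabelle/HOL's @{text While_Combinator} library theory, which has
  roughly the following form (a sketch, with @{text b}, @{text c} and the
  well-founded relation @{text r} playing the roles of our @{text Cond},
  @{text "Iter X"} and the cardinality measure):
  %
  \begin{center}
  \mbox{\begin{tabular}{l}
  @{text "P s"}\\
  @{text "\<forall>s. P s \<and> b s \<longrightarrow> P (c s)"}\\
  @{text "\<forall>s. P s \<and> \<not> b s \<longrightarrow> Q s"}\\
  @{text "wf r"}\\
  @{text "\<forall>s. P s \<and> b s \<longrightarrow> (c s, s) \<in> r"}\\
  \hline
  \multicolumn{1}{c}{@{text "Q (while b c s)"}}
  \end{tabular}}
  \end{center}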

  The property @{term P} in our proof will state that @{term "Solve X (Init (UNIV // \<approx>A))"}
  returns with a single equation @{text "X = xrhs"} for some @{text "xrhs"}, and
  that this equational system still satisfies the invariant. In order to get
  the proof through, the invariant is composed of the following six properties:

  \begin{center}
  \begin{tabular}{@ {}rcl@ {\hspace{-13mm}}l @ {}}
  @{text "invariant ES"} & @{text "\<equiv>"} &
      @{term "finite ES"} & @{text "(finiteness)"}\\
  & @{text "\<and>"} & @{thm (rhs) finite_rhs_def} & @{text "(finiteness rhs)"}\\
  & @{text "\<and>"} & @{text "\<forall>(X, rhs)\<in>ES. X = \<Union>\<calL> ` rhs"} & @{text "(soundness)"}\\
  & @{text "\<and>"} & @{thm (rhs) distinctness_def}\\
  &             &  & @{text "(distinctness)"}\\
  & @{text "\<and>"} & @{thm (rhs) ardenable_all_def} & @{text "(ardenable)"}\\   
  & @{text "\<and>"} & @{thm (rhs) validity_def} & @{text "(validity)"}\\
  \end{tabular}
  \end{center}
 
  \noindent
  The first two ensure that the equational system is always finite (number of equations
  and number of terms in each equation); the third makes sure the `meaning' of the 
  equations is preserved under our transformations. The other properties are a bit more
  technical, but are needed to get our proof through. Distinctness states that no
  two equations in the system have the same left-hand side. @{text Ardenable} ensures that we can always
  apply the @{text Arden} operation. 
  The last property states that every @{text rhs} can only contain equivalence classes
  for which there is an equation. Therefore @{text lhss} is just the set containing 
  the first components of an equational system,
  while @{text "rhss"} collects all equivalence classes @{text X} in the terms of the 
  form @{term "Trn X r"}. That means formally @{thm (lhs) lhss_def}~@{text "\<equiv> {X | (X, rhs) \<in> ES}"} 
  and @{thm (lhs) rhss_def}~@{text "\<equiv> {X | (X, r) \<in> rhs}"}.
  

  It is straightforward to prove that the initial equational system satisfies the
  invariant.

  \begin{lemma}\label{invzero}
  @{thm[mode=IfThen] Init_ES_satisfies_invariant}
  \end{lemma}

  \begin{proof}
  Finiteness is given by the assumption and the way we set up the 
  initial equational system. Soundness is proved in Lemma~\ref{inv}. Distinctness
  follows from the fact that the equivalence classes are disjoint. The @{text ardenable}
  property also follows from the setup of the initial equational system, as does 
  validity.\qed
  \end{proof}

  \noindent
  Next we show that @{text Iter} preserves the invariant.

  \begin{lemma}\label{iterone} If
  @{thm (prem 1) iteration_step_invariant[where xrhs="rhs"]},
  @{thm (prem 2) iteration_step_invariant[where xrhs="rhs"]} and
  @{thm (prem 3) iteration_step_invariant[where xrhs="rhs"]}, then
  @{thm (concl) iteration_step_invariant[where xrhs="rhs"]}.
  \end{lemma}

  \begin{proof} 
  The argument boils down to choosing an equation @{text "Y = yrhs"} to be eliminated
  and to show that @{term "Subst_all (ES - {(Y, yrhs)}) Y (Arden Y yrhs)"} 
  preserves the invariant.
  We prove this as follows:

  \begin{center}
  \begin{tabular}{@ {}l@ {}}
  @{text "\<forall>ES."}~@{thm (prem 1) Subst_all_satisfies_invariant} implies
  @{thm (concl) Subst_all_satisfies_invariant}
  \end{tabular}
  \end{center}

  \noindent
  Finiteness is straightforward, as the @{const Subst} and @{const Arden} operations 
  keep the equational system finite. These operations also preserve soundness 
  and distinctness (we proved soundness for @{const Arden} in Lemma~\ref{ardenable}).
  The property @{text ardenable} is clearly preserved because the append-operation
  cannot make a regular expression match the empty string. Validity is
  given because @{const Arden} removes an equivalence class from @{text yrhs}
  and then @{const Subst_all} removes @{text Y} from the equational system.
  Having proved the implication above, we can instantiate @{text "ES"} with @{text "ES - {(Y, yrhs)}"},
  which matches our proof obligation for @{const "Subst_all"}. Since
  \mbox{@{term "ES = ES - {(Y, yrhs)} \<union> {(Y, yrhs)}"}}, we can use the assumption 
  to complete the proof.\qed
  \end{proof}

  \noindent
  We also need the fact that @{text Iter} decreases the termination measure.

  \begin{lemma}\label{itertwo} If
  @{thm (prem 1) iteration_step_measure[simplified (no_asm), where xrhs="rhs"]},
  @{thm (prem 2) iteration_step_measure[simplified (no_asm), where xrhs="rhs"]} and
  @{thm (prem 3) iteration_step_measure[simplified (no_asm), where xrhs="rhs"]}, then\\
  \mbox{@{thm (concl) iteration_step_measure[simplified (no_asm), where xrhs="rhs"]}}.
  \end{lemma}

  \begin{proof}
  By assumption we know that @{text "ES"} is finite and has more than one element.
  Therefore there must be an element @{term "(Y, yrhs) \<in> ES"} with 
  @{term "(Y, yrhs) \<noteq> (X, rhs)"}. Using the distinctness property we can infer
  that @{term "Y \<noteq> X"}. We further know that @{text "Remove ES Y yrhs"}
  removes the equation @{text "Y = yrhs"} from the system, and therefore 
  the cardinality of @{const Iter} strictly decreases.\qed
  \end{proof}

  \noindent
  This brings us to the property we want to establish for @{text Solve}.


  \begin{lemma}
  If @{thm (prem 1) Solve} and @{thm (prem 2) Solve} then there exists
  a @{text rhs} such that  @{term "Solve X (Init (UNIV // \<approx>A)) = {(X, rhs)}"}
  and @{term "invariant {(X, rhs)}"}.
  \end{lemma}

  \begin{proof} 
  In order to prove this lemma using \eqref{whileprinciple}, we have to use a slightly
  stronger invariant, since Lemmas~\ref{iterone} and~\ref{itertwo} have the precondition 
  that @{term "(X, rhs) \<in> ES"} for some @{text rhs}. This precondition is needed
  in order to choose in the @{const Iter}-step an equation that is not \mbox{@{term "X = rhs"}}.
  Therefore our invariant cannot be just @{term "invariant ES"}, but must be 
  @{term "invariant ES \<and> (\<exists>rhs. (X, rhs) \<in> ES)"}. By assumption 
  @{thm (prem 2) Solve} and Lemma~\ref{invzero}, the more general invariant holds for
  the initial equational system. This is premise 1 of~\eqref{whileprinciple}.
  Premise 2 is given by Lemma~\ref{iterone} and the fact that @{const Iter} might
  modify the @{text rhs} in the equation @{term "X = rhs"}, but does not remove it.
  Premise 3 of~\eqref{whileprinciple} is given by Lemma~\ref{itertwo}. Now for premise 4
  we would like to show that there exists a @{text rhs} such that @{term "ES = {(X, rhs)}"}
  and that @{text "invariant {(X, rhs)}"} holds, provided the condition @{text "Cond"}
  does not hold. By the stronger invariant we know there exists such a @{text "rhs"}
  with @{term "(X, rhs) \<in> ES"}. Because @{text Cond} is not true, we know the cardinality
  of @{text ES} is @{text 1}. This means @{text "ES"} must actually be the set @{text "{(X, rhs)}"},
  for which the invariant holds. This allows us to conclude that 
  @{term "Solve X (Init (UNIV // \<approx>A)) = {(X, rhs)}"} and @{term "invariant {(X, rhs)}"} hold,
  as needed.\qed
  \end{proof}

  \noindent
  With this lemma in place we can show that for every equivalence class in @{term "UNIV // \<approx>A"}
  there exists a regular expression.

  \begin{lemma}\label{every_eqcl_has_reg}
  @{thm[mode=IfThen] every_eqcl_has_reg}
  \end{lemma}

  \begin{proof}
  By the preceding lemma, we know that there exists a @{text "rhs"} such
  that @{term "Solve X (Init (UNIV // \<approx>A))"} returns the equation @{text "X = rhs"},
  and that the invariant holds for this equation. That means we 
  know @{text "X = \<Union>\<calL> ` rhs"}. We further know that
  this is equal to \mbox{@{text "\<Union>\<calL> ` (Arden X rhs)"}} using the properties of the 
  invariant and Lemma~\ref{ardenable}. Using the validity property for the equation @{text "X = rhs"},
  we can infer that @{term "rhss rhs \<subseteq> {X}"} and, because the @{text Arden} operation
  removes @{text X} from @{text rhs}, that @{term "rhss (Arden X rhs) = {}"}.
  This means the right-hand side @{term "Arden X rhs"} can only consist of terms of the form @{term "Lam r"}.
  So we can collect those (finitely many) regular expressions @{text rs} and have @{term "X = lang (\<Uplus>rs)"}.
  With this we can conclude the proof.\qed
  \end{proof}

  \noindent
  Lemma~\ref{every_eqcl_has_reg} allows us to finally give a proof for the first direction
  of the Myhill-Nerode Theorem.

  \begin{proof}[of Theorem~\ref{myhillnerodeone}]
  By Lemma~\ref{every_eqcl_has_reg} we know that there exists a regular expression for
  every equivalence class in @{term "UNIV // \<approx>A"}:

  \[
  \mbox{if}\;@{term "X \<in> (UNIV // \<approx>A)"}\;\mbox{then there exists a regular expression}\; 
  @{text "r"}\;\mbox{such that}\;@{term "X = lang r"}
  \]\smallskip

  \noindent
  Since @{text "finals A"} is
  a subset of  @{term "UNIV // \<approx>A"}, we also know that for every equivalence class
  in @{term "finals A"} there exists a regular expression. Moreover by assumption 
  we know that @{term "finals A"} must be finite, and therefore there must be a finite
  set of regular expressions @{text "rs"} such that
  @{term "\<Union>(finals A) = lang (\<Uplus>rs)"}.
  Since the left-hand side is equal to @{text A}, we can use @{term "\<Uplus>rs"} 
  as the regular expression that is needed in the theorem.\qed
  \end{proof}

  \noindent
  Solving the equational system of our running example gives the following solutions 
  for the two final equivalence classes:

  \begin{center}
  \begin{tabular}{r@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
  @{text "X\<^sub>2"} & @{text "="} & @{text "TIMES ONE (ATOM a)"}\\
  @{text "X\<^sub>3"} & @{text "="} & @{text "TIMES (TIMES ONE (ATOM a)) (ATOM b)"}
  \end{tabular}
  \end{center}

  \noindent
  Combining them with $\bigplus$ gives us a regular expression for the language @{text "{[a], [a, b]}"}.

  Note that our algorithm for solving equational systems also provides a method for
  calculating a regular expression for the complement of a regular language:
  if we combine all regular
  expressions corresponding to equivalence classes not in @{term "finals A"} 
  (in the running example @{text "X\<^sub>1"} and @{text "X\<^sub>4"}),
  then we obtain a regular expression for the complement language @{term "- A"}.
  This is similar to the usual construction of a `complement automaton'.

*}

section {* Myhill-Nerode, Second Part *}

text {*
  \noindent
  In this section we give a proof of the second 
  part of the Myhill-Nerode Theorem. It can be formulated in our setting as follows:

  \begin{theorem}\label{myhillnerodetwo}
  If @{text "r"} is a regular expression, then @{thm Myhill_Nerode2}.
  \end{theorem}  

  \noindent
  The proof will be by induction on the structure of @{text r}. It turns out
  the base cases are straightforward.


  \begin{proof}[Base Cases]
  The cases for @{const ZERO}, @{const ONE} and @{const ATOM} are routine, because 
  we can easily establish that

  \begin{center}
  \begin{tabular}{l}
  @{thm quot_zero_eq}\\
  @{thm quot_one_subset}\\
  @{thm quot_atom_subset}
  \end{tabular}
  \end{center}

  \noindent
  hold, which shows that @{term "UNIV // \<approx>(lang r)"} must be finite.\qed
  \end{proof}

  \noindent
  Much more interesting, however, are the inductive cases. They seem hard to solve 
  directly. The reader is invited to try.\footnote{The induction hypothesis is not strong enough 
  to make any progress with the @{text TIMES} and @{text STAR} cases.} 

  In order to see how our proof proceeds, consider the following suggestive picture 
  given by \citeN{Constable00}:
  %
  \begin{equation}\label{pics}
  \mbox{\begin{tabular}{c@ {\hspace{10mm}}c@ {\hspace{10mm}}c}
  \begin{tikzpicture}[scale=0.95]
  %Circle
  \draw[thick] (0,0) circle (1.1);    
  \end{tikzpicture}
  &
  \begin{tikzpicture}[scale=0.95]
  %Circle
  \draw[thick] (0,0) circle (1.1);    
  %Main rays
  \foreach \a in {0, 90,...,359}
    \draw[very thick] (0, 0) -- (\a:1.1);
  \foreach \a / \l in {45/1, 135/2, 225/3, 315/4}
      \draw (\a: 0.65) node {$a_\l$};
  \end{tikzpicture}
  &
  \begin{tikzpicture}[scale=0.95]
  %Circle
  \draw[thick] (0,0) circle (1.1);    
  %Main rays
  \foreach \a in {0, 45,...,359}
     \draw[very thick] (0, 0) -- (\a:1.1);
  \foreach \a / \l in {22.5/1.1, 67.5/1.2, 112.5/2.1, 157.5/2.2, 202.4/3.1, 247.5/3.2, 292.5/4.1, 337.5/4.2}
      \draw (\a: 0.77) node {$a_{\l}$};
  \end{tikzpicture}\\
  @{term UNIV} & @{term "UNIV // (\<approx>(lang r))"} & @{term "UNIV // R"}
  \end{tabular}}
  \end{equation}

  \noindent
  The relation @{term "\<approx>(lang r)"} partitions the set of all strings, @{term UNIV}, into some
  equivalence classes. To show that there are only finitely many of them, it
  suffices to show in each induction step that another relation, say @{text
  R}, has finitely many equivalence classes and refines @{term "\<approx>(lang r)"}. 

  \begin{definition}
  A relation @{text "R\<^sub>1"} \emph{refines} @{text "R\<^sub>2"}
  provided @{text "R\<^sub>1 \<subseteq> R\<^sub>2"}. 
  \end{definition}

  \noindent
  For constructing @{text R}, we will rely on some \emph{tagging-functions}
  defined over strings, see Fig.~\ref{tagfig}. These functions are parameterised by sets
  of strings @{text A} and @{text B} standing for arbitrary languages and will
  be instantiated with the induction hypotheses.
  Given the inductive hypotheses, it will be easy to
  prove that the \emph{range} of these tagging-functions is finite. The range
  of a function @{text f} is defined as

 
  \begin{center}
  @{text "range f \<equiv> f ` UNIV"}
  \end{center}

  \noindent
  that is, we take the image of @{text f} over all elements in the
  domain. With this we will be able to infer that the tagging-functions, seen
  as relations, give rise to finitely many equivalence classes. 
  Finally we will show that the tagging-relations are more refined than
  @{term "\<approx>(lang r)"}, which implies that @{term "UNIV // \<approx>(lang r)"} must
  also be finite.  We formally define the notion of a \emph{tagging-relation}
  as follows.


  \begin{definition}[Tagging-Relation] Given a tagging-function @{text tag}, then two strings @{text x}
  and @{text y} are \emph{tag-related} provided
  \[
  @{text "x \<^raw:$\threesim$>\<^bsub>tag\<^esub> y \<equiv> tag x = tag y"}\;.
  \]
  \end{definition}
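
  \noindent
  Executably, this is nothing more than comparing tags; as a minimal Haskell
  sketch (the function name is ours):

  \begin{verbatim}
  -- A tagging-function induces a relation by comparing tags
  -- (illustrative sketch only).
  tagRelated :: Eq b => (a -> b) -> a -> a -> Bool
  tagRelated tag x y = tag x == tag y
  \end{verbatim}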


  In order to establish finiteness of a set @{text A}, we shall use the following powerful
  principle from Isabelle/HOL's library.
  %
  \begin{equation}\label{finiteimageD}
  @{thm[mode=IfThen] finite_imageD}
  \end{equation}

  \noindent
  It states that if the image of a set @{text A} under a function @{text f} that is injective 
  on @{text A} is finite, then @{text A} itself must be finite. We can use it to establish the following 
  two lemmas.

  \begin{figure}[t]
  \normalsize
  \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
  @{thm (lhs) tag_Plus_def[where A="A" and B="B", THEN meta_eq_app]} &
  @{text "\<equiv>"} &
  @{thm (rhs) tag_Plus_def[where A="A" and B="B", THEN meta_eq_app]} \\
  @{thm (lhs) tag_Times_def[where ?A="A" and ?B="B"]}\;@{text "x"} & @{text "\<equiv>"} &
  @{text "(\<lbrakk>x\<rbrakk>\<^bsub>\<approx>A\<^esub>, {\<lbrakk>x\<^sub>s\<rbrakk>\<^bsub>\<approx>B\<^esub> | x\<^sub>p \<in> A \<and> (x\<^sub>p, x\<^sub>s) \<in> Partitions x})"}\\
  @{thm (lhs) tag_Star_def[where ?A="A", THEN meta_eq_app]} & @{text "\<equiv>"} &
  @{text "{\<lbrakk>x\<^sub>s\<rbrakk>\<^bsub>\<approx>A\<^esub> | x\<^sub>p < x \<and> x\<^sub>p \<in> A\<^sup>\<star> \<and> (x\<^sub>p, x\<^sub>s) \<in> Partitions x}"}
  \end{tabular}
  \caption{Three tagging functions used in the cases for @{term "PLUS"}, @{term "TIMES"} and @{term "STAR"} 
  regular expressions. The sets @{text A} and @{text B} are arbitrary languages instantiated
  in the proof with the induction hypotheses. \emph{Partitions} is a function
  that generates all possible partitions of a string.\label{tagfig}}
  \end{figure}


  \begin{lemma}\label{finone}
  @{thm[mode=IfThen] finite_eq_tag_rel}
  \end{lemma}

  \begin{proof}
  In \eqref{finiteimageD} we set @{text f} to be @{text "X \<mapsto> tag ` X"}. Then
  @{text "range f"} is a subset of @{term "Pow (range tag)"}, which we know must be
  finite by assumption. Now @{term "f ` (UNIV // =tag=)"} is a subset of @{text "range f"},
  and so also finite. Injectivity amounts to showing that @{text "X = Y"} under the
  assumptions that @{text "X, Y \<in> "}~@{term "UNIV // =tag="} and @{text "f X = f Y"}.
  From the assumptions we obtain \mbox{@{text "x \<in> X"}} and @{text "y \<in> Y"} with 
  @{text "tag x = tag y"}. Since @{text x} and @{text y} are tag-related, this in 
  turn means that the equivalence classes @{text X}
  and @{text Y} must be equal. Therefore \eqref{finiteimageD} allows us to conclude
  with @{thm (concl) finite_eq_tag_rel}.\qed
  \end{proof}

  \begin{lemma}\label{fintwo} 
  Given two equivalence relations @{text "R\<^sub>1"} and @{text "R\<^sub>2"} where
  @{text "R\<^sub>1"} refines @{text "R\<^sub>2"}.
  If @{thm (prem 1) refined_partition_finite[where ?R1.0="R\<^sub>1" and ?R2.0="R\<^sub>2"]}
  then @{thm (concl) refined_partition_finite[where ?R1.0="R\<^sub>1" and ?R2.0="R\<^sub>2"]}.
  \end{lemma}

  \begin{proof}
  We prove this lemma again using \eqref{finiteimageD}. This time we set @{text f} to
  be @{text "X \<mapsto>"}~@{term "{R\<^sub>1 `` {x} | x. x \<in> X}"}. It is easy to see that 
  @{term "finite (f ` (UNIV // R\<^sub>2))"} because it is a subset of @{term "Pow (UNIV // R\<^sub>1)"},
  which must be finite by assumption. What remains to be shown is that @{text f} is injective
  on @{term "UNIV // R\<^sub>2"}. This is equivalent to showing that two equivalence 
  classes, say @{text "X"} and @{text Y}, in @{term "UNIV // R\<^sub>2"} are equal, provided
  @{text "f X = f Y"}. For @{text "X"} and @{text "Y"} to be equal, we have to find two elements
  @{text "x \<in> X"} and @{text "y \<in> Y"} that are @{text R\<^sub>2}-related.
  We know there exists an @{text "x \<in> X"} with \mbox{@{term "X = R\<^sub>2 `` {x}"}}. 
  From the latter fact we can infer that @{term "R\<^sub>1 ``{x} \<in> f X"}
  and further @{term "R\<^sub>1 ``{x} \<in> f Y"}. This means we can obtain a @{text y}
  such that @{term "R\<^sub>1 `` {x} = R\<^sub>1 `` {y}"} holds. Consequently @{text x} and @{text y}
  are @{text "R\<^sub>1"}-related. Since by assumption @{text "R\<^sub>1"} refines @{text "R\<^sub>2"},
  they must also be @{text "R\<^sub>2"}-related, as we need to show.\qed
  \end{proof}

  \noindent
  Chaining Lemmas~\ref{finone} and \ref{fintwo} together means that in order to show
  that @{term "UNIV // \<approx>(lang r)"} is finite, we have to construct a tagging-function whose
  range can be shown to be finite and whose tagging-relation refines @{term "\<approx>(lang r)"}.
  Let us attempt the @{const PLUS}-case first. We take from Fig.~\ref{tagfig}
 
  \begin{center}
  @{thm tag_Plus_def[where A="A" and B="B", THEN meta_eq_app]}
  \end{center}

  \noindent
  where @{text "A"} and @{text "B"} are some arbitrary languages. The reason for this choice 
  is that we need to establish that @{term "=(tag_Plus A B)="} refines @{term "\<approx>(A \<union> B)"}. 
  This amounts to showing @{term "x \<approx>A y"} or @{term "x \<approx>B y"} under the assumption
  @{term "x"}~@{term "=(tag_Plus A B)="}~@{term y}. As we shall see, this definition will 
  provide us with just the right assumptions in order to get the proof through.

 \begin{proof}[@{const "PLUS"}-Case]
  We can show in general, if @{term "finite (UNIV // \<approx>A)"} and @{term "finite
  (UNIV // \<approx>B)"} then @{term "finite ((UNIV // \<approx>A) \<times> (UNIV // \<approx>B))"}
  holds. The range of @{term "tag_Plus A B"} is a subset of this product
  set---so finite. For the refinement proof-obligation, we know that @{term
  "(\<approx>A `` {x}, \<approx>B `` {x}) = (\<approx>A `` {y}, \<approx>B `` {y})"} holds by assumption. Then
  clearly either @{term "x \<approx>A y"} or @{term "x \<approx>B y"}, as we needed to
  show. Finally we can discharge this case by setting @{text A} to @{term
  "lang r\<^sub>1"} and @{text B} to @{term "lang r\<^sub>2"}.\qed
  \end{proof}

  \noindent
  The @{const TIMES}-case is slightly more complicated. We first prove the
  following lemma, which will aid the proof about refinement.

  \begin{lemma}\label{refinement}
  The relation @{text "\<^raw:$\threesim$>\<^bsub>tag\<^esub>"} refines @{term "\<approx>A"}, provided for
  all strings @{text x}, @{text y} and @{text z} we have that \mbox{@{text "x \<^raw:$\threesim$>\<^bsub>tag\<^esub> y"}}
  and @{term "x @ z \<in> A"} imply @{text "y @ z \<in> A"}.
  \end{lemma}


  \noindent
  We can therefore analyse how the string @{text "x @ z"} is in the language
  @{text A} and then construct an appropriate tagging-function to infer that
  @{term "y @ z"} is also in @{text A}.  For this we will use the notion of
  the set of all possible \emph{partitions} of a string:
  %
  \begin{equation}
  @{thm Partitions_def}
  \end{equation}

  \noindent
  If we know that @{text "(x\<^sub>p, x\<^sub>s) \<in> Partitions x"}, we will
  refer to @{text "x\<^sub>p"} as the \emph{prefix} of the string @{text x},
  and respectively to @{text "x\<^sub>s"} as the \emph{suffix}.
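
  For finite strings the set of partitions is easily computed; the following
  Haskell sketch (the function name is ours) enumerates all prefix/suffix
  splits:

  \begin{verbatim}
  -- All ways of splitting a string into a prefix and a suffix,
  -- mirroring the Partitions definition above (illustrative sketch).
  partitions :: [a] -> [([a], [a])]
  partitions xs = [splitAt n xs | n <- [0 .. length xs]]
  \end{verbatim}

  \noindent
  For example, @{text "partitions"} applied to the string @{text "ab"} yields the
  three pairs \texttt{("","ab")}, \texttt{("a","b")} and \texttt{("ab","")}.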

*}

text {*
  Now assuming  @{term "x @ z \<in> A \<cdot> B"}, there are only two possible ways of how to `split' 
  this string to be in @{term "A \<cdot> B"}:
  %
  \begin{center}
  \begin{tabular}{c}
  \scalebox{1}{
  \begin{tikzpicture}[scale=0.8,fill=gray!20]
    \node[draw,minimum height=3.8ex, fill] (x) { $\hspace{4.8em}@{text x}\hspace{4.8em}$ };
    \node[draw,minimum height=3.8ex, right=-0.03em of x, fill] (za) { $\hspace{0.6em}@{text "z\<^sub>p"}\hspace{0.6em}$ };
    \node[draw,minimum height=3.8ex, right=-0.03em of za, fill] (zza) { $\hspace{2.6em}@{text "z\<^sub>s"}\hspace{2.6em}$  };

    \draw[decoration={brace,transform={yscale=3}},decorate]
           (x.north west) -- ($(za.north west)+(0em,0em)$)
               node[midway, above=0.5em]{@{text x}};

    \draw[decoration={brace,transform={yscale=3}},decorate]
           ($(za.north west)+(0em,0ex)$) -- ($(zza.north east)+(0em,0ex)$)
               node[midway, above=0.5em]{@{text z}};

    \draw[decoration={brace,transform={yscale=3}},decorate]
           ($(x.north west)+(0em,3ex)$) -- ($(zza.north east)+(0em,3ex)$)
               node[midway, above=0.8em]{@{term "x @ z \<in> A \<cdot> B"}};

    \draw[decoration={brace,transform={yscale=3}},decorate]
           ($(za.south east)+(0em,0ex)$) -- ($(x.south west)+(0em,0ex)$)
               node[midway, below=0.5em]{@{text "x @ z\<^sub>p \<in> A"}};

    \draw[decoration={brace,transform={yscale=3}},decorate]
           ($(zza.south east)+(0em,0ex)$) -- ($(za.south east)+(0em,0ex)$)
               node[midway, below=0.5em]{@{text "z\<^sub>s \<in> B"}};
  \end{tikzpicture}}
  \\[2mm]
  \scalebox{1}{
  \begin{tikzpicture}[scale=0.8,fill=gray!20]
    \node[draw,minimum height=3.8ex, fill] (xa) { $\hspace{3em}@{text "x\<^sub>p"}\hspace{3em}$ };
    \node[draw,minimum height=3.8ex, right=-0.03em of xa, fill] (xxa) { $\hspace{0.2em}@{text "x\<^sub>s"}\hspace{0.2em}$ };
    \node[draw,minimum height=3.8ex, right=-0.03em of xxa, fill] (z) { $\hspace{5em}@{text z}\hspace{5em}$ };

    \draw[decoration={brace,transform={yscale=3}},decorate]
           (xa.north west) -- ($(xxa.north east)+(0em,0em)$)
               node[midway, above=0.5em]{@{text x}};

    \draw[decoration={brace,transform={yscale=3}},decorate]
           (z.north west) -- ($(z.north east)+(0em,0em)$)
               node[midway, above=0.5em]{@{text z}};

    \draw[decoration={brace,transform={yscale=3}},decorate]
           ($(xa.north west)+(0em,3ex)$) -- ($(z.north east)+(0em,3ex)$)
               node[midway, above=0.8em]{@{term "x @ z \<in> A \<cdot> B"}};

    \draw[decoration={brace,transform={yscale=3}},decorate]
           ($(z.south east)+(0em,0ex)$) -- ($(xxa.south west)+(0em,0ex)$)
               node[midway, below=0.5em]{@{term "x\<^sub>s @ z \<in> B"}};

    \draw[decoration={brace,transform={yscale=3}},decorate]
           ($(xa.south east)+(0em,0ex)$) -- ($(xa.south west)+(0em,0ex)$)
               node[midway, below=0.5em]{@{term "x\<^sub>p \<in> A"}};
  \end{tikzpicture}}
  \end{tabular}
  \end{center}
  %

  \noindent
  Either @{text x} together with a prefix of @{text "z"} is in @{text A} and the rest of @{text z} is in @{text B} 
  (first picture), or a prefix of @{text x} is in @{text A} and the rest of @{text x} together with @{text z} is in @{text B} 
  (second picture). In both cases we have to show that @{term "y @ z \<in> A \<cdot> B"}. We can only
  get through the first case if we know that @{term "x \<approx>A y"} holds @{text "("}@{text "*"}@{text ")"}.  
  Because then 
  we can infer from @{term "x @ z\<^sub>p \<in> A"} that @{term "y @ z\<^sub>p \<in> A"} holds for all @{text "z\<^sub>p"}.
  In the second case we only know that @{text "x\<^sub>p"} and @{text "x\<^sub>s"} are one possible partition
  of the string @{text x}. We have to know that both @{text "x\<^sub>p"} and the
  corresponding partition @{text "y\<^sub>p"} are in @{text "A"}, and that @{text "x\<^sub>s"} is `@{text B}-related' 
  to @{text "y\<^sub>s"} @{text "("}@{text "**"}@{text ")"}. From the latter fact we can infer that @{text "y\<^sub>s @ z \<in> B"}.
  This will solve the second case.
  Taking the two requirements, @{text "("}@{text "*"}@{text ")"} and @{text "(**)"}, together we define the
  tagging-function in the @{const Times}-case as (see Fig.~\ref{tagfig}):

  \begin{center}
  @{thm (lhs) tag_Times_def[where ?A="A" and ?B="B"]}\;@{text "x"}~@{text "\<equiv>"}~
  @{text "(\<lbrakk>x\<rbrakk>\<^bsub>\<approx>A\<^esub>, {\<lbrakk>x\<^sub>s\<rbrakk>\<^bsub>\<approx>B\<^esub> | x\<^sub>p \<in> A \<and> (x\<^sub>p, x\<^sub>s) \<in> Partitions x})"}
  \end{center}

  \noindent
  Note that we have to make the assumption for all suffixes @{text "x\<^sub>s"}, since we do 
  not know anything about how the string @{term x} is partitioned.
  With this definition in place, let us prove the @{const "Times"}-case.

  \begin{proof}[@{const TIMES}-Case]
  If @{term "finite (UNIV // \<approx>A)"} and @{term "finite (UNIV // \<approx>B)"}
  then @{term "finite ((UNIV // \<approx>A) \<times> (Pow (UNIV // \<approx>B)))"} holds. The range of
  @{term "tag_Times A B"} is a subset of this product set, and therefore finite.
  For the refinement of @{term "\<approx>(A \<cdot> B)"} and @{text "\<^raw:$\threesim$>\<^bsub>\<times>tag A B\<^esub>"}, 
  we can assume by Lemma~\ref{refinement} that 

  \begin{center}
   @{term "tag_Times A B x = tag_Times A B y"}
  \end{center}

  \noindent
  and @{term "x @ z \<in> A \<cdot> B"}, and have to establish @{term "y @ z \<in> A \<cdot>
  B"}. As shown in the pictures above, there are two cases to be
  considered. First, there exists a @{text "z\<^sub>p"} and @{text
  "z\<^sub>s"} such that @{term "x @ z\<^sub>p \<in> A"} and @{text "z\<^sub>s
  \<in> B"}.  By the assumption about @{term "tag_Times A B"} we have @{term "\<approx>A
  `` {x} = \<approx>A `` {y}"} and thus @{term "x \<approx>A y"}. Hence by the Myhill-Nerode
  Relation @{term "y @ z\<^sub>p \<in> A"} holds. Using @{text "z\<^sub>s \<in> B"},
  we can conclude in this case with @{term "y @ z \<in> A \<cdot> B"} (recall @{text
  "z\<^sub>p @ z\<^sub>s = z"}).

  Second, there exists a partition @{text "x\<^sub>p"} and @{text "x\<^sub>s"} with 
  @{text "x\<^sub>p \<in> A"} and @{text "x\<^sub>s @ z \<in> B"}. We therefore have
  
  \begin{center}
  @{text "\<lbrakk>x\<^sub>s\<rbrakk>\<^bsub>\<approx>B\<^esub> \<in> {\<lbrakk>x\<^sub>s\<rbrakk>\<^bsub>\<approx>B\<^esub> | x\<^sub>p \<in> A \<and> (x\<^sub>p, x\<^sub>s) \<in> Partitions x}"}
  \end{center}
  
  \noindent
  and by the assumption about @{term "tag_Times A B"} also 
  
  \begin{center}
  @{text "\<lbrakk>x\<^sub>s\<rbrakk>\<^bsub>\<approx>B\<^esub> \<in> {\<lbrakk>y\<^sub>s\<rbrakk>\<^bsub>\<approx>B\<^esub> | y\<^sub>p \<in> A \<and> (y\<^sub>p, y\<^sub>s) \<in> Partitions y}"}
  \end{center}
  
  \noindent
  This means there must be a partition @{text "y\<^sub>p"} and @{text "y\<^sub>s"}
  such that @{term "y\<^sub>p \<in> A"} and @{term "\<approx>B `` {x\<^sub>s} = \<approx>B ``
  {y\<^sub>s}"}. Unfolding the Myhill-Nerode Relation and together with the
  facts that @{text "x\<^sub>p \<in> A"} and \mbox{@{text "x\<^sub>s @ z \<in> B"}}, we
  obtain @{term "y\<^sub>p \<in> A"} and @{text "y\<^sub>s @ z \<in> B"}, as needed in
  this case.  We again can complete the @{const TIMES}-case by setting @{text
  A} to @{term "lang r\<^sub>1"} and @{text B} to @{term "lang r\<^sub>2"}.\qed
  \end{proof}

  \noindent
  The case for @{const Star} is similar to @{const TIMES}, but poses a few
  extra challenges.  To deal with them, we define first the notion of a \emph{string
  prefix} and a \emph{strict string prefix}:

  \begin{center}
  \begin{tabular}{l}
  @{text "x \<le> y \<equiv> \<exists>z. y = x @ z"}\\
  @{text "x < y \<equiv> x \<le> y \<and> x \<noteq> y"}
  \end{tabular}
  \end{center}
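
  \noindent
  Both notions have obvious executable counterparts; as a small Haskell sketch
  (function names ours):

  \begin{verbatim}
  -- String prefix and strict string prefix, matching the
  -- definitions above (illustrative sketch).
  isPrefix, isStrictPrefix :: Eq a => [a] -> [a] -> Bool
  isPrefix xs ys       = take (length xs) ys == xs
  isStrictPrefix xs ys = isPrefix xs ys && xs /= ys
  \end{verbatim}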

  When analysing the case of @{text "x @ z"} being an element in @{term "A\<star>"}
  and @{text x} is not the empty string, we have the following picture:

  \begin{center}
  \scalebox{1}{
  \begin{tikzpicture}[scale=0.8,fill=gray!20]
    \node[draw,minimum height=3.8ex, fill] (xa) { $\hspace{4em}@{text "x\<^bsub>pmax\<^esub>"}\hspace{4em}$ };
    \node[draw,minimum height=3.8ex, right=-0.03em of xa, fill] (xxa) { $\hspace{0.5em}@{text "x\<^bsub>s\<^esub>"}\hspace{0.5em}$ };
    \node[draw,minimum height=3.8ex, right=-0.03em of xxa, fill] (za) { $\hspace{2em}@{text "z\<^sub>a"}\hspace{2em}$ };
    \node[draw,minimum height=3.8ex, right=-0.03em of za, fill] (zb) { $\hspace{7em}@{text "z\<^sub>b"}\hspace{7em}$ };

    \draw[decoration={brace,transform={yscale=3}},decorate]
           (xa.north west) -- ($(xxa.north east)+(0em,0em)$)
               node[midway, above=0.5em]{@{text x}};

    \draw[decoration={brace,transform={yscale=3}},decorate]
           (za.north west) -- ($(zb.north east)+(0em,0em)$)
               node[midway, above=0.5em]{@{text z}};

    \draw[decoration={brace,transform={yscale=3}},decorate]
           ($(xa.north west)+(0em,3ex)$) -- ($(zb.north east)+(0em,3ex)$)
               node[midway, above=0.8em]{@{term "x @ z \<in> A\<star>"}};

    \draw[decoration={brace,transform={yscale=3}},decorate]
           ($(za.south east)+(0em,0ex)$) -- ($(xxa.south west)+(0em,0ex)$)
               node[midway, below=0.5em]{@{term "x\<^sub>s @ z\<^sub>a \<in> A"}};

    \draw[decoration={brace,transform={yscale=3}},decorate]
           ($(xa.south east)+(0em,0ex)$) -- ($(xa.south west)+(0em,0ex)$)
               node[midway, below=0.5em]{@{text "x\<^bsub>pmax\<^esub> \<in> A\<^sup>\<star>"}};

    \draw[decoration={brace,transform={yscale=3}},decorate]
           ($(zb.south east)+(0em,0ex)$) -- ($(zb.south west)+(0em,0ex)$)
               node[midway, below=0.5em]{@{term "z\<^sub>b \<in> A\<star>"}};

    \draw[decoration={brace,transform={yscale=3}},decorate]
           ($(zb.south east)+(0em,-4ex)$) -- ($(xxa.south west)+(0em,-4ex)$)
               node[midway, below=0.5em]{@{term "x\<^sub>s @ z \<in> A\<star>"}};
  \end{tikzpicture}}
  \end{center}
  %
  \noindent
  We can find a strict prefix @{text "x\<^sub>p"} of @{text x} such that @{term "x\<^sub>p \<in> A\<star>"},
  @{text "x\<^sub>p < x"} and the rest @{term "x\<^sub>s @ z \<in> A\<star>"}. For example the empty string 
  @{text "[]"} would do (recall @{term "x \<noteq> []"}).
  There are potentially many such prefixes, but there can only be finitely many of them (the 
  string @{text x} is finite). Let us therefore choose the longest one and call it 
  @{text "x\<^bsub>pmax\<^esub>"}. Now for the rest of the string @{text "x\<^sub>s @ z"} we
  know it is in @{term "A\<star>"} and cannot be the empty string. By Property~\ref{langprops}@{text "(iv)"}, 
  we can separate
  this string into two parts, say @{text "a"} and @{text "b"}, such that @{text "a \<noteq> []"}, @{text "a \<in> A"}
  and @{term "b \<in> A\<star>"}. Now @{text a} must be strictly longer than @{text "x\<^sub>s"},
  otherwise @{text "x\<^bsub>pmax\<^esub>"} would not be the longest prefix. That means @{text a}
  `overlaps' with @{text z}, splitting it into two components @{text "z\<^sub>a"} and
   @{text "z\<^sub>b"}. For this we know that @{text "x\<^sub>s @ z\<^sub>a \<in> A"} and
  @{term "z\<^sub>b \<in> A\<star>"}. To cut a long story short, we have divided @{term "x @ z \<in> A\<star>"}
  such that we have a string @{text a} with @{text "a \<in> A"} that lies just on the
  `border' of @{text x} and @{text z}. This string is @{text "x\<^sub>s @ z\<^sub>a"}.

  In order to show that @{term "x @ z \<in> A\<star>"} implies @{term "y @ z \<in> A\<star>"}, we use
  the following tagging-function:
  %
  \begin{center}
  @{thm (lhs) tag_Star_def[where ?A="A", THEN meta_eq_app]}~@{text "\<equiv>"}~
  @{text "{\<lbrakk>x\<^sub>s\<rbrakk>\<^bsub>\<approx>A\<^esub> | x\<^sub>p < x \<and> x\<^sub>p \<in> A\<^sup>\<star> \<and> (x\<^sub>p, x\<^sub>s) \<in> Partitions x}"}
  \end{center}

  \begin{proof}[@{const Star}-Case]
  If @{term "finite (UNIV // \<approx>A)"} 
  then @{term "finite (Pow (UNIV // \<approx>A))"} holds. The range of
  @{term "tag_Star A"} is a subset of this set, and therefore finite.
  Again we have to show under the assumption @{term "x"}~@{term "=(tag_Star A)="}~@{term y}
  that @{term "x @ z \<in> A\<star>"} implies @{term "y @ z \<in> A\<star>"}.

  We first need to consider the case that @{text x} is the empty string.
  From the assumption about strict prefixes in @{text "\<^raw:$\threesim$>\<^bsub>\<star>tag A\<^esub>"}, we 
  can infer @{text y} is the empty string and
  then clearly have @{term "y @ z \<in> A\<star>"}. In case @{text x} is not the empty
  string, we can divide the string @{text "x @ z"} as shown in the picture 
  above. By the tagging-function and the facts @{text "x\<^bsub>pmax\<^esub> \<in> A\<^sup>\<star>"} and @{text "x\<^bsub>pmax\<^esub> < x"}, 
  we have

  \begin{center}
  @{text "\<lbrakk>x\<^sub>s\<rbrakk>\<^bsub>\<approx>A\<^esub> \<in> {\<lbrakk>x\<^sub>s\<rbrakk>\<^bsub>\<approx>A\<^esub> | x\<^bsub>pmax\<^esub> < x \<and> x\<^bsub>pmax\<^esub> \<in> A\<^sup>\<star> \<and> (x\<^bsub>pmax\<^esub>, x\<^sub>s) \<in> Partitions x}"}
  \end{center}
  
  \noindent
  which by assumption is equal to
  
  \begin{center}
  @{text "\<lbrakk>x\<^sub>s\<rbrakk>\<^bsub>\<approx>A\<^esub> \<in> {\<lbrakk>y\<^sub>s\<rbrakk>\<^bsub>\<approx>A\<^esub> | y\<^bsub>p\<^esub> < y \<and> y\<^bsub>p\<^esub> \<in> A\<^sup>\<star> \<and> (y\<^bsub>p\<^esub>, y\<^sub>s) \<in> Partitions y}"}
  \end{center}
  
  \noindent 
  From this we know there exists a partition @{text "y\<^sub>p"} and @{text
  "y\<^sub>s"} with @{term "y\<^sub>p \<in> A\<star>"} and also @{term "x\<^sub>s \<approx>A
  y\<^sub>s"}. Unfolding the Myhill-Nerode Relation we know @{term
  "y\<^sub>s @ z\<^sub>a \<in> A"}. We also know that @{term "z\<^sub>b \<in> A\<star>"}.
  Therefore @{term "y\<^sub>p @ (y\<^sub>s @ z\<^sub>a) @ z\<^sub>b \<in>
  A\<star>"}, which means @{term "y @ z \<in> A\<star>"}. The last step is to set
  @{text "A"} to @{term "lang r"} and thus complete the proof.\qed
  \end{proof}

  \begin{rmk}
  While our proof using tagging functions might seem like a `rabbit pulled 
  out of a hat', the intuition behind them can be somewhat rationalised by taking the 
  view that the equivalence classes @{term "UNIV // \<approx>(lang r)"} stand for the 
  states of the minimal automaton for the language @{term "lang r"}. The theorem 
  amounts to showing that this minimal automaton has finitely many states. 
  However, by showing that our @{text "\<^raw:$\threesim$>\<^bsub>tag\<^esub>"} relation 
  refines @{term "\<approx>A"} we do not actually have to show that the minimal automaton
  has finitely many states; we only have to establish finiteness for an 
  automaton that has at least as many states and recognises the same 
  language. By performing the induction over the regular expression @{text r},
  this means we have to construct inductively an automaton in
  the cases for @{term PLUS}, @{term TIMES} and @{term STAR}.

  In the @{text PLUS}-case, we can assume finitely many equivalence classes of the form
  @{text "\<lbrakk>_\<rbrakk>\<^bsub>\<approx>A\<^esub>"} and @{text "\<lbrakk>_\<rbrakk>\<^bsub>\<approx>B\<^esub>"}, each standing for an automaton recognising the 
  languages @{text A} and @{text B} respectively. 
  The @{text "+tag"} function constructs pairs of the form @{text "(\<lbrakk>_\<rbrakk>\<^bsub>\<approx>A\<^esub>, \<lbrakk>_\<rbrakk>\<^bsub>\<approx>B\<^esub>)"} 
  which can be seen as the states of an automaton recognising the language 
  \mbox{@{term "A \<union> B"}}. This is the usual product construction of automata: 
  Given a string @{text x}, the first automaton will be in the state  @{text "\<lbrakk>x\<rbrakk>\<^bsub>\<approx>A\<^esub>"}
  after recognising @{text x} (similarly @{text "\<lbrakk>x\<rbrakk>\<^bsub>\<approx>B\<^esub>"} for the other automaton). The product
  automaton will be in the state \mbox{@{text "(\<lbrakk>x\<rbrakk>\<^bsub>\<approx>A\<^esub>, \<lbrakk>x\<rbrakk>\<^bsub>\<approx>B\<^esub>)"}}, which is accepting
  if at least one component is an accepting state.

  The @{text "TIMES"}-case is a bit more complicated---essentially we 
  need to sequentially compose the two automata with the states @{text "\<lbrakk>_\<rbrakk>\<^bsub>\<approx>A\<^esub>"}, 
  respectively @{text "\<lbrakk>_\<rbrakk>\<^bsub>\<approx>B\<^esub>"}. We achieve this sequential composition by
  taking the first automaton @{text "\<lbrakk>_\<rbrakk>\<^bsub>\<approx>A\<^esub>"} and appending to each of its accepting
  states the automaton @{text "\<lbrakk>_\<rbrakk>\<^bsub>\<approx>B\<^esub>"}. Unfortunately, this does not lead directly
  to a correct composition, since the automaton @{text "\<lbrakk>_\<rbrakk>\<^bsub>\<approx>A\<^esub>"} might have consumed
  some of the input needed for the automaton @{text "\<lbrakk>_\<rbrakk>\<^bsub>\<approx>B\<^esub>"} to succeed. The 
  textbook construction for sequentially composing two automata circumvents this
  problem by connecting the terminating states of the first automaton via
  an epsilon-transition to the initial state of the second automaton (see for
  example \citeN{Kozen97}). In the absence of any epsilon-transition analogue in our work, 
  we let the second automaton
  start in all possible states that may have been reached if the first
  automaton had not consumed the input meant for the second. We achieve this
  by having a set of states as the second component generated by our 
  @{text "\<times>tag"} function (see second clause in Fig.~\ref{tagfig}).
  A state of this sequentially composed automaton is accepting, if the first 
  component is accepting and at least one state in the set is also accepting.

  The idea behind the @{text "STAR"}-case is similar to the @{text "TIMES"}-case.
  We assume some automaton has consumed a strictly smaller part of the input in @{text "A\<^sup>\<star>"};
  we then need to check whether, from the state we ended up in, a terminal state of the 
  automaton @{text "\<lbrakk>_\<rbrakk>\<^bsub>\<approx>A\<^esub>"} can be reached. Since we do not know from which state this will 
  succeed, we need to run the automaton from all possible states we could have 
  ended up in. Therefore the @{text "\<star>tag"} function generates again a set of states.

  However, note that while the automata view sheds some light on the idea behind the 
  tagging functions, our proof only works because we can perform a structural 
  induction on the regular expression @{text r}.
  \end{rmk}
*}

section {* Second Part proved using Partial Derivatives *}

text {*
  \label{derivatives} 
  \noindent
  As we have seen in the previous section, in order to establish
  the second direction of the Myhill-Nerode Theorem, it is sufficient to find 
  a more refined relation than @{term "\<approx>(lang r)"} for which we can
  show that there are only finitely many equivalence classes. So far we 
  have shown this directly by induction on @{text "r"} using tagging-functions. 
  However, there is also  an indirect method to come up with such a refined 
  relation by using derivatives of regular expressions introduced by \citeN{Brzozowski64}. 

  Assume the following two definitions for the \emph{left-quotient} of a language,
  which we write as @{term "Deriv c A"} and @{term "Derivs s A"} where @{text c}
  is a character and @{text s} a string, respectively:

  \begin{center}
  \begin{tabular}{r@ {\hspace{1mm}}c@ {\hspace{2mm}}l}
  @{thm (lhs) Der_def}  & @{text "\<equiv>"} & @{thm (rhs) Der_def}\\
  @{thm (lhs) Ders_def} & @{text "\<equiv>"} & @{thm (rhs) Ders_def}\\
  \end{tabular}
  \end{center}

  \noindent
  In order to aid readability, we shall make use of the following abbreviation

  \begin{center}
  @{abbrev "Derivss s As"}
  \end{center}
  
  \noindent
  where we apply the left-quotient to a set of languages and then combine the results.
  Clearly we have the following equivalence between the Myhill-Nerode Relation
  (Definition~\ref{myhillneroderel}) and left-quotients
  %
  \begin{equation}\label{mhders}
  @{term "x \<approx>A y"} \hspace{4mm}\text{if and only if}\hspace{4mm} @{term "Derivs x A = Derivs y A"}
  \end{equation}

  \noindent
  It is also straightforward to establish the following properties of left-quotients
  % 
  \begin{equation}
  \mbox{\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{2mm}}l}
  @{thm (lhs) Deriv_simps(1)} & $=$ & @{thm (rhs) Deriv_simps(1)}\\
  @{thm (lhs) Deriv_simps(2)} & $=$ & @{thm (rhs) Deriv_simps(2)}\\
  @{thm (lhs) Deriv_simps(3)} & $=$ & @{thm (rhs) Deriv_simps(3)}\\
  @{thm (lhs) Deriv_simps(4)} & $=$ & @{thm (rhs) Deriv_simps(4)}\\
  @{thm (lhs) Der_conc}  & $=$ & @{thm (rhs) Der_conc}\\
  @{thm (lhs) Deriv_star}  & $=$ & @{thm (rhs) Deriv_star}\\
  @{thm (lhs) Derivs_simps(1)} & $=$ & @{thm (rhs) Derivs_simps(1)}\\
  @{thm (lhs) Derivs_simps(2)} & $=$ & @{thm (rhs) Derivs_simps(2)}\\
  %@{thm (lhs) Derivs_simps(3)[where ?s1.0="s\<^sub>1" and ?s2.0="s\<^sub>2"]}  & $=$ 
  %   & @{thm (rhs) Derivs_simps(3)[where ?s1.0="s\<^sub>1" and ?s2.0="s\<^sub>2"]}\\
  \end{tabular}}
  \end{equation}

  \noindent
  Note that in the last equation we use the list-cons operator written
  \mbox{@{text "_ :: _"}}.  The only interesting case is the case of @{term "A\<star>"}
  where we use Property~\ref{langprops}@{text "(i)"} in order to infer that
  @{term "Deriv c (A\<star>) = Deriv c (A \<cdot> A\<star>)"}. We can then complete the proof by
  using the fifth equation and noting that @{term "Deriv c (A\<star>) \<subseteq> (Deriv
  c A) \<cdot> A\<star>"} provided @{text "[] \<in> A"}. 

  \citeN{Brzozowski64} observed that the left-quotients for languages of
  regular expressions can be calculated directly using the notion of
  \emph{derivatives of a regular expression}. We define
  this notion in Isabelle/HOL as follows:
  %
  \begin{center}
  \begin{longtable}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1.5mm}}l@ {}}
  @{thm (lhs) deriv.simps(1)}  & @{text "\<equiv>"} & @{thm (rhs) deriv.simps(1)}\\
  @{thm (lhs) deriv.simps(2)}  & @{text "\<equiv>"} & @{thm (rhs) deriv.simps(2)}\\
  @{thm (lhs) deriv.simps(3)[where c'="d"]}  & @{text "\<equiv>"} & @{thm (rhs) deriv.simps(3)[where c'="d"]}\\
  @{thm (lhs) deriv.simps(4)[where ?r1.0="r\<^sub>1" and ?r2.0="r\<^sub>2"]}  
     & @{text "\<equiv>"} & @{thm (rhs) deriv.simps(4)[where ?r1.0="r\<^sub>1" and ?r2.0="r\<^sub>2"]}\\
  @{thm (lhs) deriv.simps(5)[where ?r1.0="r\<^sub>1" and ?r2.0="r\<^sub>2"]}  
     & @{text "\<equiv>"} & @{text "if"}~@{term "nullable r\<^sub>1"}~@{text "then"}~%
       @{term "Plus (Times (deriv c r\<^sub>1) r\<^sub>2) (deriv c r\<^sub>2)"}\\
     &             & \phantom{@{text "if"}~@{term "nullable r\<^sub>1"}~}@{text "else"}~%  
                    @{term "Times (deriv c r\<^sub>1) r\<^sub>2"}\\ 
  @{thm (lhs) deriv.simps(6)}  & @{text "\<equiv>"} & @{thm (rhs) deriv.simps(6)}\smallskip\\
  @{thm (lhs) derivs.simps(1)}  & @{text "\<equiv>"} & @{thm (rhs) derivs.simps(1)}\\
  @{thm (lhs) derivs.simps(2)}  & @{text "\<equiv>"} & @{thm (rhs) derivs.simps(2)}
  \end{longtable}
  \end{center}

  \noindent
  The last two clauses extend derivatives from characters to strings. The
  boolean function @{term "nullable r"} needed in the @{const Times}-case tests
  whether a regular expression can recognise the empty string. It can be defined as 
  follows.
  %
  \begin{center}
  \begin{longtable}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1.5mm}}l@ {}}
  @{thm (lhs) nullable.simps(1)}  & @{text "\<equiv>"} & @{thm (rhs) nullable.simps(1)}\\
  @{thm (lhs) nullable.simps(2)}  & @{text "\<equiv>"} & @{thm (rhs) nullable.simps(2)}\\
  @{thm (lhs) nullable.simps(3)}  & @{text "\<equiv>"} & @{thm (rhs) nullable.simps(3)}\\
  @{thm (lhs) nullable.simps(4)[where ?r1.0="r\<^sub>1" and ?r2.0="r\<^sub>2"]}  
     & @{text "\<equiv>"} & @{thm (rhs) nullable.simps(4)[where ?r1.0="r\<^sub>1" and ?r2.0="r\<^sub>2"]}\\
  @{thm (lhs) nullable.simps(5)[where ?r1.0="r\<^sub>1" and ?r2.0="r\<^sub>2"]}  
     & @{text "\<equiv>"} & @{thm (rhs) nullable.simps(5)[where ?r1.0="r\<^sub>1" and ?r2.0="r\<^sub>2"]}\\
  @{thm (lhs) nullable.simps(6)}  & @{text "\<equiv>"} & @{thm (rhs) nullable.simps(6)}
  \end{longtable}
  \end{center}
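
  To convey the algorithmic character of these definitions, the following
  Haskell sketch transcribes them directly (an illustration only; the
  constructor and function names are ours and mirror the syntax used in this
  paper). The matcher @{text matches} at the end is justified by the fact that
  @{term "nullable r"} holds exactly when the empty string is in @{term "lang r"},
  together with \eqref{Dersders} below.

  \begin{verbatim}
  -- Regular expressions, mirroring the paper's syntax.
  data Rexp = Zero | One | Atom Char
            | Plus Rexp Rexp | Times Rexp Rexp | Star Rexp
            deriving (Eq, Ord, Show)

  -- Does r accept the empty string?
  nullable :: Rexp -> Bool
  nullable Zero          = False
  nullable One           = True
  nullable (Atom _)      = False
  nullable (Plus r1 r2)  = nullable r1 || nullable r2
  nullable (Times r1 r2) = nullable r1 && nullable r2
  nullable (Star _)      = True

  -- Brzozowski derivative w.r.t. a single character.
  deriv :: Char -> Rexp -> Rexp
  deriv _ Zero         = Zero
  deriv _ One          = Zero
  deriv c (Atom d)     = if c == d then One else Zero
  deriv c (Plus r1 r2) = Plus (deriv c r1) (deriv c r2)
  deriv c (Times r1 r2)
    | nullable r1      = Plus (Times (deriv c r1) r2) (deriv c r2)
    | otherwise        = Times (deriv c r1) r2
  deriv c (Star r)     = Times (deriv c r) (Star r)

  -- Extension to strings.
  derivs :: String -> Rexp -> Rexp
  derivs []     r = r
  derivs (c:cs) r = derivs cs (deriv c r)

  -- A string matcher: s is in lang r iff the iterated
  -- derivative of r by s accepts the empty string.
  matches :: Rexp -> String -> Bool
  matches r s = nullable (derivs s r)
  \end{verbatim}

  \noindent
  For instance, \texttt{matches (Times (Atom 'a') (Atom 'b')) "ab"} evaluates
  to \texttt{True}.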

  \noindent
  By induction on the regular expression @{text r}, respectively on the string @{text s}, 
  one can easily show that left-quotients and derivatives of regular expressions 
  relate as follows (see for example~\cite{Sakarovitch09}):
  %
  \begin{equation}\label{Dersders}
  \mbox{\begin{tabular}{c}
  @{thm Deriv_deriv}\\
  @{thm Derivs_derivs}
  \end{tabular}}
  \end{equation}

  \noindent
  The importance of this fact in the context of the Myhill-Nerode Theorem is that 
  we can use \eqref{mhders} and \eqref{Dersders} in order to 
  establish that 

  \begin{center}
  @{term "x \<approx>(lang r) y"} \hspace{4mm}if and only if\hspace{4mm}
  @{term "lang (derivs x r) = lang (derivs y r)"}
  \end{center}

  \noindent
  holds and hence
  %
  \begin{equation}
  @{term "x \<approx>(lang r) y"}\hspace{4mm}\mbox{provided}\hspace{4mm}@{term "derivs x r = derivs y r"}
  \end{equation}


  \noindent
  This means the right-hand side (seen as a relation) refines the Myhill-Nerode
  Relation.  Consequently, we can use @{text
  "\<^raw:$\threesim$>\<^bsub>(\<lambda>x. ders x r)\<^esub>"} as a
  tagging-relation. However, in order to be useful for the second part of the
  Myhill-Nerode Theorem, we have to be able to establish that for the
  corresponding language there are only finitely many derivatives---thus
  ensuring that there are only finitely many equivalence
  classes. Unfortunately, this is not true in general. Sakarovitch gives an
  example of a regular expression that has infinitely many derivatives, namely
  one for the language @{text "(ab)\<^sup>\<star> \<union> (ab)\<^sup>\<star>a"}, which is formally 
  written in our notation as \mbox{@{text "{[a,b]}\<^sup>\<star> \<union> ({[a,b]}\<^sup>\<star> \<cdot> {[a]})"}}
  (see \cite[Page~141]{Sakarovitch09}).


  What \citeN{Brzozowski64} established is that for every language there
  \emph{are} only finitely many `dissimilar' derivatives of a regular
  expression. Two regular expressions are said to be \emph{similar} provided
  they can be identified using the @{text "ACI"}-identities:
  %
  \begin{equation}\label{ACI}
  \mbox{\begin{tabular}{cl}
  (@{text A}) & @{term "Plus (Plus r\<^sub>1 r\<^sub>2) r\<^sub>3"} $\equiv$ @{term "Plus r\<^sub>1 (Plus r\<^sub>2 r\<^sub>3)"}\\
  (@{text C}) & @{term "Plus r\<^sub>1 r\<^sub>2"} $\equiv$ @{term "Plus r\<^sub>2 r\<^sub>1"}\\
  (@{text I}) & @{term "Plus r r"} $\equiv$ @{term "r"}\\
  \end{tabular}}
  \end{equation}

  \noindent
  Carrying this idea through, we must consider not the set of all derivatives,
  but this set modulo @{text "ACI"}.  In principle, this can be done formally, 
  but it is very painful in a theorem prover---since there is no
  direct characterisation of the set of dissimilar derivatives. Therefore
  \citeN{CoquandSiles12}, who follow through this idea, had to spend extra effort and
  first formalise the quite intricate notion of \emph{inductively finite sets} in 
  order to establish this property.


  Fortunately, there is a much simpler approach using \emph{partial
  derivatives}. They were introduced by \citeN{Antimirov95} and can be defined
  in Isabelle/HOL as follows:
  %
  \begin{center}
  \begin{longtable}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1.5mm}}l@ {}}
  @{thm (lhs) pderiv.simps(1)}  & @{text "\<equiv>"} & @{thm (rhs) pderiv.simps(1)}\\
  @{thm (lhs) pderiv.simps(2)}  & @{text "\<equiv>"} & @{thm (rhs) pderiv.simps(2)}\\
  @{thm (lhs) pderiv.simps(3)[where c'="d"]}  & @{text "\<equiv>"} & @{thm (rhs) pderiv.simps(3)[where c'="d"]}\\
  @{thm (lhs) pderiv.simps(4)[where ?r1.0="r\<^sub>1" and ?r2.0="r\<^sub>2"]}  
     & @{text "\<equiv>"} & @{thm (rhs) pderiv.simps(4)[where ?r1.0="r\<^sub>1" and ?r2.0="r\<^sub>2"]}\\
  @{thm (lhs) pderiv.simps(5)[where ?r1.0="r\<^sub>1" and ?r2.0="r\<^sub>2"]}  
     & @{text "\<equiv>"} & @{text "if"}~@{term "nullable r\<^sub>1"}~@{text "then"}~%
       @{term "(Timess (pderiv c r\<^sub>1) r\<^sub>2) \<union> (pderiv c r\<^sub>2)"}\\
     & & \phantom{@{text "if"}~@{term "nullable r\<^sub>1"}~}@{text "else"}~%  
                    @{term "Timess (pderiv c r\<^sub>1) r\<^sub>2"}\\ 
  @{thm (lhs) pderiv.simps(6)}  & @{text "\<equiv>"} & @{thm (rhs) pderiv.simps(6)}\smallskip\\
  @{thm (lhs) pderivs.simps(1)}  & @{text "\<equiv>"} & @{thm (rhs) pderivs.simps(1)}\\
  @{thm (lhs) pderivs.simps(2)}  & @{text "\<equiv>"} & @{text "\<Union> (pders s) ` (pder c r)"}
  \end{longtable}
  \end{center}

  \noindent
  Again the last two clauses extend partial derivatives from characters to strings. 
  Unlike `simple' derivatives, the functions for partial derivatives return sets of regular
  expressions. In the @{const Times} and @{const Star} cases we therefore use the
  auxiliary definition

  \begin{center}
  @{text "TIMESS rs r \<equiv> {TIMES r' r | r' \<in> rs}"}
  \end{center}

  \noindent
  in order to `sequence' a regular expression with a set of regular
  expressions. Note that in the last clause we first build the set of partial
  derivatives w.r.t.~the character @{text c}, then build the image of this set under the
  function @{term "pderivs s"} and finally `union up' all resulting sets. It will be
  convenient to introduce for this the following abbreviation

  \begin{center}
  @{abbrev "pderivs_set s rs"}
  \end{center}

  \noindent
  which simplifies the last clause of @{const "pderivs"} to

  \begin{center}
  \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1.5mm}}l@ {}}
  @{thm (lhs) pderivs.simps(2)}  & @{text "\<equiv>"} & @{thm (rhs) pderivs.simps(2)}\\
  \end{tabular}
  \end{center}
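
  \noindent
  Extending the Haskell sketch from the previous section (again an illustration
  only, with names ours), partial derivatives can be transcribed using finite
  sets:

  \begin{verbatim}
  import qualified Data.Set as Set
  import           Data.Set (Set)

  -- 'Sequence' every expression in a set with r (TIMESS above).
  timess :: Set Rexp -> Rexp -> Set Rexp
  timess rs r = Set.map (\r' -> Times r' r) rs

  -- Antimirov's partial derivative w.r.t. a single character.
  pderiv :: Char -> Rexp -> Set Rexp
  pderiv _ Zero         = Set.empty
  pderiv _ One          = Set.empty
  pderiv c (Atom d)     = if c == d then Set.singleton One
                                    else Set.empty
  pderiv c (Plus r1 r2) = pderiv c r1 `Set.union` pderiv c r2
  pderiv c (Times r1 r2)
    | nullable r1       = timess (pderiv c r1) r2 `Set.union` pderiv c r2
    | otherwise         = timess (pderiv c r1) r2
  pderiv c (Star r)     = timess (pderiv c r) (Star r)

  -- Extension to strings: take the partial derivatives by c,
  -- then 'union up' the results of continuing with s.
  pderivs :: String -> Rexp -> Set Rexp
  pderivs []     r = Set.singleton r
  pderivs (c:cs) r =
    Set.unions [pderivs cs r' | r' <- Set.toList (pderiv c r)]
  \end{verbatim}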

  Partial derivatives can be seen as having the @{text "ACI"}-identities already built in: 
  taking the partial derivatives of the
  regular expressions in \eqref{ACI} gives us in each case
  equal sets.  \citeN{Antimirov95} showed a similar result to
  \eqref{Dersders} for partial derivatives, namely
  %
  \begin{equation}\label{Derspders}
  \mbox{\begin{tabular}{lc}
  @{text "(i)"}  & @{thm Deriv_pderiv}\\
  @{text "(ii)"} & @{thm Derivs_pderivs}
  \end{tabular}}
  \end{equation} 

  \begin{proof}
  The first fact is by a simple induction on @{text r}. For the second we slightly
  modify Antimirov's proof by performing an induction on @{text s} where we
  generalise over all @{text r}. That means in the @{text "cons"}-case the 
  induction hypothesis is

  \begin{center}
  @{text "(IH)"}\hspace{3mm}@{term "\<forall>r. Derivs s (lang r) = \<Union> (lang ` (pderivs s r))"}
  \end{center}

  \noindent
  With this we can establish

  \begin{center}
  \begin{tabular}{r@ {\hspace{1.5mm}}c@ {\hspace{1.5mm}}ll}
  @{term "Derivs (c # s) (lang r)"} 
    & @{text "="} & @{term "Derivs s (Deriv c (lang r))"} & by def.\\
    & @{text "="} & @{term "Derivs s (\<Union> (lang ` (pderiv c r)))"} & by @{text "("}\ref{Derspders}@{text ".i)"}\\
    & @{text "="} & @{term "\<Union> ((Derivs s) ` (lang ` (pderiv c r)))"} & by def.~of @{text "Ders"}\\
    & @{text "="} & @{term "\<Union> (lang ` (\<Union> (pderivs s ` (pderiv c r))))"} & by IH\\
    & @{text "="} & @{term "\<Union> (lang ` (pderivs (c # s) r))"} & by def.\\
  \end{tabular}
  \end{center}
  
  \noindent
  Note that in order to apply the induction hypothesis in the fourth equation, we
  need the generalisation over all regular expressions @{text r}. The case for
  the empty string is routine and omitted.\qed
  \end{proof}

  \noindent
  Taking \eqref{Dersders} and \eqref{Derspders} together gives the relationship 
  between languages of derivatives and partial derivatives
  %
  \begin{equation}
  \mbox{\begin{tabular}{lc}
  @{text "(i)"}  & @{thm deriv_pderiv[symmetric]}\\
  @{text "(ii)"} & @{thm derivs_pderivs[symmetric]}
  \end{tabular}}
  \end{equation} 

  \noindent
  These two properties confirm the observation made earlier 
  that by using sets, partial derivatives have the @{text "ACI"}-identities
  of derivatives already built in. 

  Antimirov also proved that for every language and every regular expression 
  there are only finitely many partial derivatives, where the set of partial
  derivatives of @{text r} w.r.t.~a language @{text A} is defined as
  %
  \begin{equation}\label{Pdersdef}
  @{thm pderivs_lang_def}
  \end{equation}
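
  \noindent
  For a \emph{finite} set of strings this definition is directly computable;
  a continuation of our Haskell sketch (the restriction to a finite list is
  ours, since @{text A} may of course be infinite):

  \begin{verbatim}
  -- Partial derivatives of r w.r.t. a finite sample of strings,
  -- following the definition of pders A r above (sketch).
  pderivsLang :: [String] -> Rexp -> Set Rexp
  pderivsLang ss r = Set.unions [pderivs s r | s <- ss]
  \end{verbatim}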

  \begin{theorem}[\cite{Antimirov95}]\label{antimirov}
  For every language @{text A} and every regular expression @{text r}, 
  \mbox{@{thm finite_pderivs_lang}}.
  \end{theorem}

  \noindent
  Antimirov's proof first establishes this theorem for the language @{term UNIV1}, 
  which is the set of all non-empty strings. For this he proves:
  %
  \begin{equation}\label{pdersunivone}
  \mbox{\begin{tabular}{l}
  @{thm pderivs_lang_Zero}\\
  @{thm pderivs_lang_One}\\
  @{thm pderivs_lang_Atom}\\
  @{thm pderivs_lang_Plus[where ?r1.0="r\<^sub>1" and ?r2.0="r\<^sub>2"]}\\
  @{thm pderivs_lang_Times[where ?r1.0="r\<^sub>1" and ?r2.0="r\<^sub>2"]}\\
  @{thm pderivs_lang_Star}\\
  \end{tabular}}
  \end{equation}

  \noindent
  from which one can deduce by induction on @{text r} that

  \begin{center}
  @{thm finite_pderivs_lang_UNIV1}
  \end{center}

  \noindent
  holds. Now Antimirov's theorem follows because

  \begin{center}
  @{thm pderivs_lang_UNIV}\\
  \end{center}

  \noindent
  and for all languages @{text "A"}, @{term "pderivs_lang A r"} is a subset of
  @{term "pderivs_lang UNIV r"}.  Since we follow Antimirov's proof quite
  closely in our formalisation (only the last two cases of
  \eqref{pdersunivone} involve some non-routine induction arguments), we omit
  the details.

  Let us now return to our proof for the second direction in the Myhill-Nerode
  Theorem. The point of the above calculations is to use 
  @{text "\<^raw:$\threesim$>\<^bsub>(\<lambda>x. pders x r)\<^esub>"} as a tagging-relation. 


  \begin{proof}[of Theorem~\ref{myhillnerodetwo} (second version)]
  Using \eqref{mhders}
  and \eqref{Derspders} we can easily infer that

  \begin{center}
  @{term "x \<approx>(lang r) y"}\hspace{4mm}\mbox{provided}\hspace{4mm}@{term "pderivs x r = pderivs y r"}
  \end{center}

  \noindent
  which means the tagging-relation @{text "\<^raw:$\threesim$>\<^bsub>(\<lambda>x. pders x r)\<^esub>"} refines @{term "\<approx>(lang r)"}.
  So we know by Lemma~\ref{fintwo}, \mbox{@{term "finite (UNIV // (\<approx>(lang r)))"}} 
  holds if @{term "finite (UNIV // (=(\<lambda>x. pderivs x r)=))"}. In order to establish 
  the latter, we can use Lemma~\ref{finone} and show that the range of the 
  tagging-function \mbox{@{term "\<lambda>x. pderivs x r"}} is finite. For this recall Definition
  \ref{Pdersdef}, which gives us that 

  \begin{center}
  @{thm pderivs_lang_def[where A="UNIV", simplified]}
  \end{center}

  \noindent
  Now the range of @{term "\<lambda>x. pderivs x r"} is a subset of @{term "Pow (pderivs_lang UNIV r)"},
  which we know is finite by Theorem~\ref{antimirov}. Consequently there 
  are only finitely many equivalence classes of @{text "\<^raw:$\threesim$>\<^bsub>(\<lambda>x. pders x r)\<^esub>"}.
  This relation refines @{term "\<approx>(lang r)"}, and therefore we can again conclude the 
  second part of the Myhill-Nerode Theorem.\qed
  \end{proof}
*}

section {* Closure Properties of Regular Languages *}

text {*
  \label{closure} 
  \noindent
  The beauty of regular languages is that they are closed under many set
  operations. Closure under union, concatenation and Kleene-star are trivial
  to establish given our definition of regularity (recall Definition~\ref{regular}).
  More interesting in our setting is the closure under complement, because it seems difficult
  to construct a regular expression for the complement language by direct
  means. However, the existence of such a regular expression can now be easily
  proved using both parts of the Myhill-Nerode Theorem, since

  \begin{center}
  @{term "s\<^sub>1 \<approx>A s\<^sub>2"} if and only if @{term "s\<^sub>1 \<approx>(-A) s\<^sub>2"}
  \end{center}
  
  \noindent
  holds for any strings @{text "s\<^sub>1"} and @{text
  "s\<^sub>2"}. Therefore @{text A} and the complement language @{term "-A"}
  give rise to the same partitions. So if one is finite, the other is too, and
  vice versa. As noted earlier, our algorithm for solving equational systems
  actually calculates a regular expression for the complement language. 
  Calculating such a regular expression via
  automata using the standard method would be quite involved. It includes the
  steps: regular expression @{text "\<Rightarrow>"} non-deterministic automaton @{text
  "\<Rightarrow>"} deterministic automaton @{text "\<Rightarrow>"} complement automaton @{text "\<Rightarrow>"}
  regular expression. Clearly not something you want to formalise in a theorem
  prover if it is cumbersome to reason about automata.

  %Once closure under complement is established, closure under intersection
  %and set difference is also easy, because
  %
  %\begin{center}
  %\begin{tabular}{c}
  %@{term "A \<inter> B = - (- A \<union> - B)"}\\
  %@{term "A - B = - (- A \<union> B)"}
  %\end{tabular}
  %\end{center}
  %
  %\noindent
  %Since all finite languages are regular, then by closure under complement also
  %all co-finite languages. 
  %
  %Closure of regular languages under reversal, that is
  %
  %\begin{center}
  %@{text "A\<^bsup>-1\<^esup> \<equiv> {s\<^bsup>-1\<^esup> | s \<in> A}"}
  %\end{center}
  %
  %\noindent 
  %can be shown with the help of the following operation defined recursively over 
  %regular expressions
  %
  %\begin{center}
  %\begin{tabular}{r@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
  %@{thm (lhs) Rev.simps(1)} & @{text "\<equiv>"} & @{thm (rhs) Rev.simps(1)}\\
  %@{thm (lhs) Rev.simps(2)} & @{text "\<equiv>"} & @{thm (rhs) Rev.simps(2)}\\
  %@{thm (lhs) Rev.simps(3)} & @{text "\<equiv>"} & @{thm (rhs) Rev.simps(3)}\\
  %@{thm (lhs) Rev.simps(4)[where ?r1.0="r\<^sub>1" and ?r2.0="r\<^sub>2"]} & @{text "\<equiv>"} & 
  %    @{thm (rhs) Rev.simps(4)[where ?r1.0="r\<^sub>1" and ?r2.0="r\<^sub>2"]}\\
  %@{thm (lhs) Rev.simps(5)[where ?r1.0="r\<^sub>1" and ?r2.0="r\<^sub>2"]} & @{text "\<equiv>"} & 
  %    @{thm (rhs) Rev.simps(5)[where ?r1.0="r\<^sub>1" and ?r2.0="r\<^sub>2"]}\\
  %@{thm (lhs) Rev.simps(6)} & @{text "\<equiv>"} & @{thm (rhs) Rev.simps(6)}\\
  %\end{tabular}
  %\end{center}
  %
  %\noindent
  %For this operation we can show
  %
  %\begin{center}
  %@{text "(\<calL>(r))\<^bsup>-1\<^esup>"}~@{text "="}~@{thm (rhs) rev_lang}
  %\end{center}
  %
  %\noindent
  %from which closure under reversal of regular languages follows.

  A perhaps surprising fact is that regular languages are closed under any
  left-quotient. Define

  \begin{center}
  @{abbrev "Deriv_lang B A"}
  \end{center}

  \noindent
  and assume @{text B} is any language and @{text A} is regular; then @{term
  "Deriv_lang B A"} is regular. To see this, consider the following argument
  using partial derivatives (which we used in Section~\ref{derivatives}): From @{text A} 
  being regular we know there exists
  a regular expression @{text r} such that @{term "A = lang r"}. We also know
  that @{term "pderivs_lang B r"} is finite for every language @{text B} and 
  regular expression @{text r} (recall Theorem~\ref{antimirov}). By definition 
  and \eqref{Derspders} we have
  %
  \begin{equation}\label{eqq}
  @{term "Deriv_lang B (lang r) = (\<Union> (lang ` (pderivs_lang B r)))"}
  \end{equation}

  \noindent
  Since there are only finitely many regular expressions in @{term "pderivs_lang
  B r"}, we know by \eqref{uplus} that there exists a regular expression so that
  the right-hand side of \eqref{eqq} is equal to the language \mbox{@{term "lang (\<Uplus>(pderivs_lang B
  r))"}}. Thus the regular expression @{term "\<Uplus>(pderivs_lang B r)"} verifies that
  @{term "Deriv_lang B A"} is regular.
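
  \noindent
  In executable terms, and continuing our earlier Haskell sketch (names ours),
  the witnessing regular expression can be computed for any finite sample of
  @{text B} by folding the alternatives together:

  \begin{verbatim}
  -- Sum of a finite set of regular expressions, playing the role
  -- of the big-plus combination used in the paper; its language is
  -- the union of the languages of the summands.
  setAlt :: Set Rexp -> Rexp
  setAlt = Set.foldr Plus Zero

  -- e.g. setAlt (pderivsLang ["a","ab"] r) yields a regular
  -- expression for the left-quotient of lang r by {"a","ab"}.
  \end{verbatim}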

  Even more surprising is the fact, first established by \citeN{Haines69}, 
  that for \emph{every} language @{text A}, the language
  consisting of all (scattered) substrings of @{text A} is regular  (see also 
  \cite{Shallit08,Gasarch09}). 
  A \emph{(scattered) substring} can be obtained
  by striking out zero or more characters from a string. This can be defined 
  inductively in Isabelle/HOL by the following three rules:

  %\begin{center}
  %@ {thm[mode=Axiom] emb0[where bs="x"]}\hspace{10mm} 
  %@ {thm[mode=Rule] emb1[where as="x" and b="c" and bs="y"]}\hspace{10mm} 
  %@ {thm[mode=Rule] emb2[where as="x" and a="c" and bs="y"]}
  %\end{center}
  \begin{center}
  @{thm[mode=Axiom] emb0}\hspace{10mm} 
  @{thm[mode=Rule] emb1}\hspace{10mm} 
  @{thm[mode=Rule] emb2}
  \end{center}
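
  \noindent
  These rules correspond to the usual executable subsequence test; as a small
  Haskell sketch (the function name is ours):

  \begin{verbatim}
  -- x is a (scattered) substring of y: the empty string embeds
  -- into anything; otherwise either match the heads (third rule)
  -- or strike out the head of y (second rule).
  emb :: Eq a => [a] -> [a] -> Bool
  emb []     _  = True
  emb _      [] = False
  emb (x:xs) (y:ys)
    | x == y    = emb xs ys
    | otherwise = emb (x:xs) ys
  \end{verbatim}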

  \noindent
  It is straightforward to prove that @{text "\<preceq>"} is a partial order. Now define the 
  \emph{language of substrings} and \emph{superstrings} of a language @{text A} 
  respectively as

  \begin{center}
  \begin{tabular}{l}
  @{thm SUBSEQ_def}\\
  @{thm SUPSEQ_def}
  \end{tabular}
  \end{center}
  
  \noindent
  We would like to establish

  \begin{theorem}[\cite{Haines69}]\label{subseqreg}
  For every language @{text A}, the languages @{text "(i)"} @{term "SUBSEQ A"} and 
  @{text "(ii)"} @{term "SUPSEQ A"}
  are regular.
  \end{theorem}

  \noindent
  Our proof follows the one given by \citeN[Pages 92--95]{Shallit08}, except that we use
  Higman's Lemma, which is already proved in the Isabelle/HOL library by
  \citeN{Sternagel13}.
  Higman's Lemma allows us to infer that every language @{text A} that is an antichain, i.e.~satisfies
  %
  \begin{equation}\label{higman}
  @{text "\<forall>x, y \<in> A."}~@{term "x \<noteq> y \<longrightarrow> \<not>(x \<preceq> y) \<and> \<not>(y \<preceq> x)"}
  \end{equation} 

  \noindent
  is finite.

  The first step in our proof of Theorem~\ref{subseqreg} is to establish the 
  following simple properties for @{term SUPSEQ}
  %
  \begin{equation}\label{supseqprops}
  \mbox{\begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
  @{thm (lhs) SUPSEQ_simps(1)} & @{text "="} & @{thm (rhs) SUPSEQ_simps(1)}\\  
  @{thm (lhs) SUPSEQ_simps(2)} & @{text "="} & @{thm (rhs) SUPSEQ_simps(2)}\\ 
  @{thm (lhs) SUPSEQ_atom} & @{text "="} & @{thm (rhs) SUPSEQ_atom}\\ 
  @{thm (lhs) SUPSEQ_union} & @{text "="} & @{thm (rhs) SUPSEQ_union}\\
  @{thm (lhs) SUPSEQ_conc} & @{text "="} & @{thm (rhs) SUPSEQ_conc}\\
  @{thm (lhs) SUPSEQ_star} & @{text "="} & @{thm (rhs) SUPSEQ_star}
  \end{tabular}}
  \end{equation}

  \noindent
  where the last equation follows from the fact that @{term "A\<star>"} contains the
  empty string. With these properties at our disposal we can establish the lemma

  \begin{lemma}
  If @{text A} is regular, then so is @{term "SUPSEQ A"}.
  \end{lemma}

  \begin{proof}
  Since our alphabet is finite, we have a regular expression, written @{text ALL}, that
  matches every string. Using this regular expression we can inductively define
  the operation @{text "r\<up>"} 

  \begin{center}
  \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
  @{thm (lhs) UP.simps(1)} & @{text "\<equiv>"} & @{thm (rhs) UP.simps(1)}\\  
  @{thm (lhs) UP.simps(2)} & @{text "\<equiv>"} & @{thm (rhs) UP.simps(2)}\\ 
  @{thm (lhs) UP.simps(3)} & @{text "\<equiv>"} & @{thm (rhs) UP.simps(3)}\\ 
  @{thm (lhs) UP.simps(4)[where ?r1.0="r\<^sub>1" and ?r2.0="r\<^sub>2"]} & 
     @{text "\<equiv>"} & @{thm (rhs) UP.simps(4)[where ?r1.0="r\<^sub>1" and ?r2.0="r\<^sub>2"]}\\
  @{thm (lhs) UP.simps(5)[where ?r1.0="r\<^sub>1" and ?r2.0="r\<^sub>2"]} & 
     @{text "\<equiv>"} & @{thm (rhs) UP.simps(5)[where ?r1.0="r\<^sub>1" and ?r2.0="r\<^sub>2"]}\\
  @{thm (lhs) UP.simps(6)} & @{text "\<equiv>"} & @{thm (rhs) UP.simps(6)}
  \end{tabular}
  \end{center}

  \noindent
  and use \eqref{supseqprops} to establish that @{thm lang_UP} holds. This shows
  that @{term "SUPSEQ A"} is regular, provided @{text A} is.\qed
  \end{proof}
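
  \noindent
  The operation @{text "r\<up>"} also admits a direct functional rendering. The
  following Haskell sketch is only a plausible reading of the clauses, guided
  by \eqref{supseqprops}: we pass the regular expression @{text ALL}, here
  called \texttt{alls}, as a parameter (for a two-letter alphabet one may take
  \texttt{Star (Plus (Atom 'a') (Atom 'b'))}):

  \begin{verbatim}
  -- A plausible executable reading of the r-up operation; 'alls'
  -- is assumed to match every string over the (finite) alphabet.
  up :: Rexp -> Rexp -> Rexp
  up alls Zero          = Zero
  up alls One           = alls
  up alls (Atom c)      = Times alls (Times (Atom c) alls)
  up alls (Plus r1 r2)  = Plus (up alls r1) (up alls r2)
  up alls (Times r1 r2) = Times (up alls r1) (up alls r2)
  up alls (Star _)      = alls
  \end{verbatim}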

  \noindent
  Now we can prove the main lemma w.r.t.~@{const "SUPSEQ"}, namely

  \begin{lemma}\label{mset}
  For every language @{text A}, there exists a finite language @{text M} such that
  \begin{center}
  \mbox{@{term "SUPSEQ M = SUPSEQ A"}}\;.
  \end{center}
  \end{lemma}

  \begin{proof}
  For @{text M} we take the set of all minimal elements of @{text A}. An element @{text x} 
  is said to be \emph{minimal} in @{text A} provided

  \begin{center}
  @{thm minimal_def}
  \end{center}

  \noindent
  By Higman's Lemma \eqref{higman} we know
  that @{term "M \<equiv> {x \<in> A. minimal x A}"} is finite, since any two distinct minimal 
  elements are incomparable.
  It is also straightforward to show that @{term "SUPSEQ M \<subseteq> SUPSEQ A"}. For
  the other direction, assume @{term "x \<in> SUPSEQ A"}. From this we obtain 
  a @{text y} such that @{term "y \<in> A"} and @{term "y \<preceq> x"}. Since
  the relation \mbox{@{term "{(y, x). y \<preceq> x \<and> x \<noteq> y}"}} is well-founded, there must
  be a minimal element @{text "z"} such that @{term "z \<in> A"} and @{term "z \<preceq> y"}, 
  and hence by transitivity also \mbox{@{term "z \<preceq> x"}} (here we deviate from the argument 
  given by \citeN{Shallit08}, because Isabelle/HOL already provides an extensive infrastructure
  for reasoning about well-foundedness). Since @{term "z"} is
  minimal and an element in @{term A}, we also know that @{term z} is in @{term M}.
  From this together with \mbox{@{term "z \<preceq> x"}}, we can infer that @{term x} is in 
  @{term "SUPSEQ M"}, as required.\qed
  \end{proof}
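
  \noindent
  The well-foundedness fact used in this proof is easy to establish formally:
  a strict subsequence is strictly shorter, and therefore the relation is
  contained in the measure given by the length of strings. As a sketch (with a
  hypothetical lemma name):

  \begin{center}
  \begin{tabular}{l}
  @{text "lemma subseq_strict_wf:"}\\
  @{text "shows wf {(y, x). y \<preceq> x \<and> x \<noteq> y}"}
  \end{tabular}
  \end{center}

  \noindent
  This follows since the relation is a subset of \mbox{@{text "measure length"}}
  and every measure is well-founded.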

  \noindent
  This lemma allows us to establish the second part of Theorem~\ref{subseqreg}.

  \begin{proof}[of the Second Part of Theorem~\ref{subseqreg}]
  Given any language @{text A}, by Lemma~\ref{mset} there exists
  a finite, and thus regular, language @{text M} such that @{term "SUPSEQ M = SUPSEQ A"}.
  Since @{text M} is regular, so is @{term "SUPSEQ M"} by the lemma above,
  which establishes the second part.\qed    
  \end{proof}

  \noindent
  In order to establish the first part of this theorem, we use the
  property proved by \citeN{Shallit08}, namely that
  %
  \begin{equation}\label{compl}
  @{thm SUBSEQ_complement}
  \end{equation}

  \noindent
  holds. Now the first part of Theorem~\ref{subseqreg} is a simple consequence of the second part.

  \begin{proof}[of the First Part of Theorem~\ref{subseqreg}]
  By the second part, we know the right-hand side of \eqref{compl}
  is regular, which means @{term "- SUBSEQ A"} is regular. But since
  we have already established that regularity is preserved under complement (using the
  Myhill-Nerode Theorem), @{term "SUBSEQ A"} must also be regular.\qed 
  \end{proof}

  Finally, we would like to show that the Myhill-Nerode Theorem is also convenient for establishing 
  the non-regularity of languages. For this we use the following version of the Continuation
  Lemma (see for example~\cite{Rosenberg06}).

  \begin{lemma}[Continuation Lemma]
  If a language @{text A} is regular and a set of strings @{text B} is infinite,
  then there exist two distinct strings @{text x} and @{text y} in @{text B} 
  such that @{term "x \<approx>A y"}.
  \end{lemma}

  \noindent
  This lemma can be easily deduced from the Myhill-Nerode Theorem and the Pigeonhole
  Principle: Since @{text A} is regular, there can be only finitely many 
  equivalence classes. Hence an infinite set must contain 
  at least two strings that are in the same equivalence class, that is,
  they are related by the Myhill-Nerode Relation.
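
  The pigeonhole step can be spelled out as follows: writing @{text f} for the
  (hypothetical) function that maps a string to its equivalence class, the
  instance we need is

  \begin{center}
  @{text "finite (f ` B) \<Longrightarrow> infinite B \<Longrightarrow> \<exists>x \<in> B. \<exists>y \<in> B. x \<noteq> y \<and> f x = f y"}
  \end{center}

  \noindent
  where the finiteness of @{text "f ` B"} is given by the Myhill-Nerode
  Theorem.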

  Using this lemma, it is straightforward to establish that the language 
  \mbox{@{text "A \<equiv> \<Union>\<^sub>n a\<^sup>n @ b\<^sup>n"}} is not regular (@{text "a\<^sup>n"} stands
  for the string consisting of @{text n} copies of the character @{text a}; similarly for
  @{text "b\<^sup>n"}). For this consider the infinite set @{text "B \<equiv> \<Union>\<^sub>n a\<^sup>n"}.

  \begin{lemma}
  No two distinct strings in set @{text "B"} are Myhill-Nerode related by language @{text A}.
  \end{lemma} 

  \begin{proof}
  After unfolding the definition of @{text "B"}, we need to establish that given @{term "i \<noteq> j"},
  the strings @{text "a\<^sup>i"} and @{text "a\<^sup>j"} are not Myhill-Nerode related by @{text "A"}.
  That means we have to show that @{text "\<forall>z. a\<^sup>i @ z \<in> A = a\<^sup>j @ z \<in> A"} leads to 
  a contradiction. Let us take @{text "b\<^sup>i"} for @{text "z"}. Then we know @{text "a\<^sup>i @ b\<^sup>i \<in> A"}.
  But since @{term "i \<noteq> j"}, the string @{text "a\<^sup>j @ b\<^sup>i"} contains an unequal number of
  @{text a}s and @{text b}s and therefore @{text "a\<^sup>j @ b\<^sup>i \<notin> A"}. Hence @{text "a\<^sup>i"} and @{text "a\<^sup>j"}
  cannot be Myhill-Nerode related by @{text "A"}, and we are done.\qed
  \end{proof}

  \noindent
  To conclude the proof of non-regularity for the language @{text A}: if
  @{text A} were regular, then by the Continuation Lemma the infinite set
  @{text B} would contain two distinct strings that are Myhill-Nerode related
  by @{text A}, contradicting the lemma above. Therefore the language @{text A}
  is not regular, as we wanted to show.
*}



section {* Conclusion and Related Work *}

text {*
  \noindent
  In this paper we took the view that a regular language is one where there
  exists a regular expression that matches all of its strings. Regular
  expressions can be conveniently defined as a datatype in theorem
  provers. For us it was therefore interesting to find out how far we can push
  this point of view. The question of whether regular language theory can
  be done without automata also crops up outside of theorem-prover contexts. Recall 
  Gasarch's comment cited in the Introduction. 
  We have established in Isabelle/HOL both directions 
  of the Myhill-Nerode Theorem.
  %
  \begin{theorem}[Myhill-Nerode Theorem]\mbox{}\\
  A language @{text A} is regular if and only if @{thm (rhs) Myhill_Nerode}.
  \end{theorem}
  
  \noindent
  Having formalised this theorem means we pushed our point of view quite
  far. Using this theorem we can obviously prove when a language is \emph{not}
  regular---by establishing that it has infinitely many equivalence classes
  generated by the Myhill-Nerode Relation (this is usually the purpose of the
  Pumping Lemma).  We can also use it to establish the standard
  textbook results about closure properties of regular languages. The case of 
  closure under complement follows easily from the Myhill-Nerode Theorem.
  So our answer to Gasarch is `{\it yes we can}'!  

  %Our insistence on regular expressions for proving the Myhill-Nerode Theorem
  %arose from the problem of defining formally the regularity of a language.
  %In order to guarantee consistency,
  %formalisations in HOL can only extend the logic with definitions that introduce a new concept in
  %terms of already existing notions. A convenient definition for automata
  %(based on graphs) uses a polymorphic type for the state nodes. This allows
  %us to use the standard operation for disjoint union whenever we need to compose two
  %automata. Unfortunately, we cannot use such a polymorphic definition
  %in HOL as part of the definition for regularity of a language (a predicate
  %over sets of strings).  Consider for example the following attempt:
  %
  %\begin{center}
  %@{text "is_regular A \<equiv> \<exists>M(\<alpha>). is_dfa (M) \<and> \<calL>(M) = A"}
  %\end{center}
  %
  %\noindent
  %In this definition, the definiens is polymorphic in the type of the automata
  %@{text "M"} (indicated by dependency on the type-variable @{text "\<alpha>"}), but the definiendum
  %@{text "is_regular"} is not. Such definitions are excluded from HOL, because
  %they can lead easily to inconsistencies (see \cite{PittsHOL4} for a simple
  %example). Also HOL does not contain type-quantifiers which would allow us to
  %get rid of the polymorphism by quantifying over the type-variable 
  %@{text "\<alpha>"}. Therefore when defining regularity in terms of automata, the
  %natural way out in HOL is to resort to state nodes with an identity, for
  %example a natural number. Unfortunately, the consequence is that we have to
  %be careful when combining two automata so that there is no clash between two
  %such states. This makes formalisations quite fiddly and rather
  %unpleasant. Regular expressions proved much more convenient for reasoning in
  %HOL since they can be defined as inductive datatype and a reasoning
  %infrastructure comes for free. The definition of regularity in terms of
  %regular expressions poses no problem at all for HOL.  We showed in this
  %paper that they can be used for establishing the central result in regular
  %language theory---the Myhill-Nerode Theorem.

  While regular expressions are convenient, they have some limitations. One is
  that for some regular expressions the smallest regular expression for the
  complement language is doubly-exponentially larger, as shown by \citeN{Gelade12}. 
  Another is that there seems to be no method for calculating a minimal regular
  expression (for example in terms of length) for a regular language, as there is for
  automata. On the other hand, efficient regular expression matching, without
  using automata, poses no problem as shown by \citeN{OwensReppyTuron09}.  For an
  implementation of a simple regular expression matcher, whose correctness has
  been formally established, we refer the reader to 
  \citeN{OwensSlind08}. In our opinion, their formalisation is considerably
  slicker than for example the approach to regular expression matching taken
  by \citeN{Harper99} and by \citeN{Yi06}. 
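
  The core of a derivative-based matcher is in fact pleasingly compact: a
  string @{text s} is matched by a regular expression @{text r} if and only if
  the derivative of @{text r} w.r.t.~@{text s} is nullable. A sketch, stated
  in terms of the @{text derivs} and @{text nullable} functions from
  Section~\ref{derivatives} (the name of the matcher is hypothetical):

  \begin{center}
  \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
  @{text "matches r s"} & @{text "\<equiv>"} & @{text "nullable (derivs s r)"}
  \end{tabular}
  \end{center}

  \noindent
  Its correctness, namely \mbox{@{text "matches r s \<longleftrightarrow> s \<in> lang r"}}, follows
  directly from the characteristic properties of @{text nullable} and
  @{text derivs}.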

 
  Our proof of the first direction is very much inspired by \emph{Brzozowski's
  algebraic method} \citeyear{Brzozowski64} used to convert a finite automaton to a regular expression. 
  The close connection can be seen by considering the
  equivalence classes as the states of the minimal automaton for the regular
  language.  However there are some subtle differences. Because our equivalence 
  classes (or correspondingly states) arise from the Myhill-Nerode Relation, the most natural
  choice is to characterise each state with the set of strings starting from
  the initial state leading up to that state. Usually, however, the states are
  characterised as the strings starting from that state leading to the
  terminal states.  The first choice has consequences for how the initial
  equational system is set up. We have the $\lambda$-term on our `initial
  state', while Brzozowski has it on the terminal states. This means we also
  need to reverse the direction of Arden's Lemma. We have not found anything
  in the literature about our way of proving the 
  first direction of the Myhill-Nerode Theorem.

  We presented two proofs for the second direction of the
  Myhill-Nerode Theorem: one direct proof using tagging-functions and
  another using partial derivatives. This part of our work is where
  our method using regular expressions shines, because we can perform
  an induction on the structure of regular expressions. However, it is
  also the direction where we had to spend most of the `conceptual'
  time, as our first proof based on tagging-functions is new for
  establishing the Myhill-Nerode Theorem. All standard proofs of this
  direction proceed by arguments over automata.
  
  The indirect proof for the second direction arose from our interest in
  Brzozowski's derivatives for regular expression matching.  While \citeN{Brzozowski64} 
  already established that there are only
  finitely many dissimilar derivatives for every regular expression, this
  result is not as straightforward to formalise in a theorem prover as one
  might wish. The reason is that the set of dissimilar derivatives is not
  defined inductively, but in terms of an ACI-equivalence relation. This
  difficulty prevented, for example, \citeN{KraussNipkow11} from proving termination of
  their equivalence checker for regular expressions. Their checker is based on Brzozowski's derivatives
  and for their argument the lack of a formal proof of termination is not
  crucial (it merely lets them ``sleep better'' \cite{KraussNipkow11}).  We
  expect that their development would be simplified by using partial derivatives
  instead of derivatives, and that the termination of the algorithm could be
  formally established (the main ingredient is
  Theorem~\ref{antimirov}). However, since partial derivatives use sets of
  regular expressions, one needs to carefully analyse whether the resulting
  algorithm is still executable. Given the infrastructure for
  executable sets introduced by \citeN{Haftmann09} in Isabelle/HOL, it should be.
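
  \noindent
  For instance, a matcher based on partial derivatives might take the
  following form (a sketch; @{text pderivs} stands for the iterated
  partial-derivative function and the name of the matcher is hypothetical):

  \begin{center}
  \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
  @{text "pmatches r s"} & @{text "\<equiv>"} & @{text "\<exists>r' \<in> pderivs s r. nullable r'"}
  \end{tabular}
  \end{center}

  \noindent
  Since @{text "pderivs s r"} is a finite set of regular expressions, the
  existential quantifier ranges over a finite, computable set.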

  We started out by claiming that in a theorem prover it is easier to
  reason about regular expressions than about automata. Here are some
  numbers: Our formalisation of the Myhill-Nerode Theorem consists of
  780 lines of Isabelle/Isar code for the first direction and 460 for
  the second (the one based on tagging-functions), plus around 300
  lines of standard material about regular languages. The
  formalisation of derivatives and partial derivatives shown in
  Section~\ref{derivatives} consists of 390 lines of code.  The
  closure properties in Section~\ref{closure} (except
  Theorem~\ref{subseqreg}) can be established in 100 lines of
  code. The Continuation Lemma and the non-regularity of @{text "a\<^sup>n
  b\<^sup>n"} require 70 lines of code.  The algorithm for solving equational
  systems, which we used in the first direction, is conceptually
  relatively simple. Still, the fact that the algorithm
  operates over sets means it is not as easy to formalise as one might
  wish. However, it seems sets cannot be avoided since the `input' of
  the algorithm consists of equivalence classes and we cannot see how
  to reformulate the theory so that we can use lists or
  matrices. Lists would be much easier to reason about, since we can
  define functions over them by recursion. For sets we have to use
  set-comprehensions, which is slightly unwieldy. Matrices would allow
  us to use the slick formalisation by \citeN{Nipkow11} of the
  Gauss-Jordan algorithm.

  While our formalisation might appear large, it should be seen in the
  context of the work done by \citeN{Constable00} who formalised the
  Myhill-Nerode Theorem in Nuprl using automata.  They chose to formalise 
  this theorem because it gives them state minimization of automata
  as a corollary. It is hard to gauge the size of a formalisation in Nuprl,
  but from what is shown in the Nuprl Math Library about this
  development it seems \emph{substantially} larger than ours. We
  attribute this to our use of regular expressions, which meant we did
  not need to `fight' the theorem prover. 
  %
  Recently,
  \citeN{LammichTuerk12} formalised Hopcroft's algorithm in
  Isabelle/HOL (in 7000 lines of code) using an automata library of
  27000 lines of code.  Also, \citeN{Filliatre97} reports that his
  formalisation in Coq of automata theory and Kleene's theorem is
  ``rather big''.  \citeN{Almeidaetal10} report on another
  formalisation of regular languages in Coq. Their main result is the
  correctness of Mirkin's construction of an automaton from a regular
  expression using partial derivatives. This took approximately 10600
  lines of code.  \citeN{Braibant12} formalised a large part of
  regular language theory and Kleene algebras in Coq. While he is
  mainly interested in implementing decision procedures for Kleene
  algebras, his library includes a proof of the Myhill-Nerode
  theorem. He reckons that our Myhill-Nerode ``development is more concise'' than
  his, which is based on matrices \cite[Page 67]{Braibant12}.  He writes
  that there are no conceptual problems with formally reasoning about
  matrices for automata, but notes ``intrinsic difficult[ies]'' when
  working with matrices in Coq, which is the sort of `fighting' one
  would also encounter in other theorem provers.
  In terms of time, our estimate is that the formalisation took
  approximately three months, including the time needed to find our proof
  arguments. Unlike \citeN{Constable00}, who were able to follow the Myhill-Nerode 
  proof by \citeN{HopcroftUllman69}, we had to find our own arguments.  
  So for us the formalisation was not the bottleneck. The conclusion we draw 
  from all these comparisons is that if one is interested in formalising
  results from regular language theory, not results from automata theory,
  then regular expressions are easier to work with formally. 
  The code of
  our formalisation \cite{myhillnerodeafp11} can be found in the Archive of Formal Proofs at 
  \mbox{\url{http://afp.sourceforge.net/entries/Myhill-Nerode.shtml}}.
  
   In this paper we leave out a discussion
  about the computational content of our proofs. While the described algorithms
  (for instance the one for solving our equational systems) are executable as is, we 
  have cut some `computational corners' in our formalisation. For example, we did 
  not specify which particular equation should be chosen in each step, but rather 
  proved that any choice will do. Similarly with our $\bigplus$-operation: we 
  did not specify the order in which the regular expressions should be combined, but rather
  proved that any combination with @{term PLUS} will do.
 

  Our future work will focus on formalising the regular expression matchers
  developed by \citeN{Sulzmann12}, which generate variable assignments for
  regular expression sub-matching. For this they use both derivatives and
  partial derivatives of regular expressions.\smallskip

  \noindent
  {\bf Acknowledgements:}
  We are grateful for the comments we received from Larry Paulson.  Tobias
  Nipkow made us aware of the properties in Theorem~\ref{subseqreg} and Tjark
  Weber helped us with proving them. Christian Sternagel provided us with a
  version of Higman's Lemma that applies to arbitrary, but finite alphabets. 

  \bibliographystyle{acmtrans}
  \bibliography{root}

  %\newpage
  %\begin{appendix}
  %\section{Appendix$^\star$}

  %\renewcommand{\thefootnote}{\mbox{$\star$}}
  %\footnotetext{If the reviewers deem more suitable, the authors are
  %prepared to drop material or move it to an electronic appendix.}
  
  %\begin{proof}[of Lemma~\ref{arden}]
  %For the right-to-left direction we assume @{thm (rhs) reversed_Arden} and show
  %that @{thm (lhs) reversed_Arden} holds. From Property~\ref{langprops}@{text "(i)"} 
  %we have @{term "A\<star> = A \<cdot> A\<star> \<union> {[]}"},
  %which is equal to @{term "A\<star> = A\<star> \<cdot> A \<union> {[]}"}. Adding @{text B} to both 
  %sides gives @{term "B \<cdot> A\<star> = B \<cdot> (A\<star> \<cdot> A \<union> {[]})"}, whose right-hand side
  %is equal to @{term "(B \<cdot> A\<star>) \<cdot> A \<union> B"}. Applying the assumed equation 
  %completes this direction. 

  %For the other direction we assume @{thm (lhs) reversed_Arden}. By a simple induction
  %on @{text n}, we can establish the property

  %\begin{center}
  %@{text "("}@{text "*"}@{text ")"}\hspace{5mm} @{thm (concl) reversed_arden_helper}
  %\end{center}
  
  %\noindent
  %Using this property we can show that @{term "B \<cdot> (A \<up> n) \<subseteq> X"} holds for
  %all @{text n}. From this we can infer @{term "B \<cdot> A\<star> \<subseteq> X"} using the definition
  %of @{text "\<star>"}.
  %For the inclusion in the other direction we assume a string @{text s}
  %with length @{text k} is an element in @{text X}. Since @{thm (prem 1) reversed_Arden}
  %we know by Property~\ref{langprops}@{text "(ii)"} that 
  %@{term "s \<notin> X \<cdot> (A \<up> Suc k)"} since its length is only @{text k}
  %(the strings in @{term "X \<cdot> (A \<up> Suc k)"} are all longer). 
  %From @{text "("}@{text "*"}@{text ")"} it follows then that
  %@{term s} must be an element in @{term "(\<Union>m\<le>k. B \<cdot> (A \<up> m))"}. This in turn
  %implies that @{term s} is in @{term "(\<Union>n. B \<cdot> (A \<up> n))"}. Using Property~\ref{langprops}@{text "(iii)"} 
  %this is equal to @{term "B \<cdot> A\<star>"}, as we needed to show.
  %\end{proof}
  % \end{appendix}
*}


(*<*)
end
(*>*)