thys/Journal/Paper.thy
changeset 267 32b222d77fa0
parent 266 fff2e1b40dfc
child 268 6746f5e1f1f8
--- a/thys/Journal/Paper.thy	Wed Jul 19 14:55:46 2017 +0100
+++ b/thys/Journal/Paper.thy	Fri Aug 11 20:29:01 2017 +0100
@@ -15,6 +15,11 @@
 
 declare [[show_question_marks = false]]
 
+syntax (latex output)
+  "_Collect" :: "pttrn => bool => 'a set"              ("(1{_ \<^raw:\mbox{\boldmath$\mid$}> _})")
+  "_CollectIn" :: "pttrn => 'a set => bool => 'a set"   ("(1{_ \<in> _ |e _})")
+
+
 abbreviation 
   "der_syn r c \<equiv> der c r"
 
@@ -26,12 +31,14 @@
   "nprec v1 v2 \<equiv> \<not>(v1 :\<sqsubset>val v2)"
 
 
+
+
 notation (latex output)
   If  ("(\<^raw:\textrm{>if\<^raw:}> (_)/ \<^raw:\textrm{>then\<^raw:}> (_)/ \<^raw:\textrm{>else\<^raw:}> (_))" 10) and
   Cons ("_\<^raw:\mbox{$\,$}>::\<^raw:\mbox{$\,$}>_" [75,73] 73) and  
 
   ZERO ("\<^bold>0" 78) and 
-  ONE ("\<^bold>1" 78) and 
+  ONE ("\<^bold>1" 1000) and 
   CHAR ("_" [1000] 80) and
   ALT ("_ + _" [77,77] 78) and
   SEQ ("_ \<cdot> _" [77,77] 78) and
@@ -53,8 +60,10 @@
   mkeps ("mkeps _" [79] 76) and 
   length ("len _" [73] 73) and
   intlen ("len _" [73] 73) and
+  set ("_" [73] 73) and
  
-  Prf ("_ : _" [75,75] 75) and  
+  Prf ("_ : _" [75,75] 75) and
+  CPrf ("_ \<^raw:\mbox{\textbf{\textlengthmark}}> _" [75,75] 75) and
   Posix ("'(_, _') \<rightarrow> _" [63,75,75] 75) and
  
   lexer ("lexer _ _" [78,78] 77) and 
@@ -84,6 +93,13 @@
 definition 
   "match r s \<equiv> nullable (ders s r)"
 
+
+lemma CV_STAR_ONE_empty: 
+  shows "CV (STAR ONE) [] = {Stars []}"
+by(auto simp add: CV_def elim: CPrf.cases intro: CPrf.intros)
+
+
+
 (*
 comments not implemented
 
@@ -94,6 +110,8 @@
 
 (*>*)
 
+
+
 section {* Introduction *}
 
 
@@ -153,7 +171,7 @@
 not match an empty string unless this is the only match for the repetition.\smallskip
 
 \item[$\bullet$] \emph{Empty String Rule:} An empty string shall be considered to 
-be longer than no match at all.
+be longer than no match at all.\marginpar{Explain its purpose}
 \end{itemize}
 
 \noindent Consider for example a regular expression @{text "r\<^bsub>key\<^esub>"} for recognising keywords
@@ -165,7 +183,9 @@
 by the Longest Match Rule a single identifier token, not a keyword
 followed by an identifier. For @{text "if"} we obtain by the Priority
 Rule a keyword token, not an identifier token---even if @{text "r\<^bsub>id\<^esub>"}
-matches also.
+matches also. By the Star Rule we know @{text "(r\<^bsub>key\<^esub> + r\<^bsub>id\<^esub>)\<^sup>\<star>"} matches @{text "iffoo"} and @{text "if"}, respectively, in exactly one
+`iteration' of the star.
+
 
 One limitation of Brzozowski's matcher is that it only generates a
 YES/NO answer for whether a string is being matched by a regular
@@ -185,7 +205,8 @@
 algorithm. This proof idea is inspired by work of Frisch and Cardelli
 \cite{Frisch2004} on a GREEDY regular expression matching
 algorithm. However, we were not able to establish transitivity and
-totality for the ``order relation'' by Sulzmann and Lu. ??In Section
+totality for the ``order relation'' by Sulzmann and Lu. \marginpar{We probably drop this section} 
+In Section
 \ref{argu} we identify some inherent problems with their approach (of
 which some of the proofs are not published in \cite{Sulzmann2014});
 perhaps more importantly, we give a simple inductive (and
@@ -230,6 +251,8 @@
 informal proof contains gaps, and possible fixes are not fully worked out.}
 Our specification of a POSIX value consists of a simple inductive definition
 that given a string and a regular expression uniquely determines this value.
+We also show that our definition is equivalent to an ordering 
+of values based on positions by Okui and Suzuki \cite{OkuiSuzuki2013}.
 Derivatives as calculated by Brzozowski's method are usually more complex
 regular expressions than the initial one; various optimisations are
 possible. We prove the correctness when simplifications of @{term "ALT ZERO
@@ -267,13 +290,10 @@
   recursive function @{term L} with the six clauses:
 
   \begin{center}
-  \begin{tabular}{l@ {\hspace{3mm}}rcl}
+  \begin{tabular}{l@ {\hspace{4mm}}rcl}
   (1) & @{thm (lhs) L.simps(1)} & $\dn$ & @{thm (rhs) L.simps(1)}\\
   (2) & @{thm (lhs) L.simps(2)} & $\dn$ & @{thm (rhs) L.simps(2)}\\
   (3) & @{thm (lhs) L.simps(3)} & $\dn$ & @{thm (rhs) L.simps(3)}\\
-  \end{tabular}
-  \hspace{14mm}
-  \begin{tabular}{l@ {\hspace{3mm}}rcl}
   (4) & @{thm (lhs) L.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) L.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
   (5) & @{thm (lhs) L.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) L.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
   (6) & @{thm (lhs) L.simps(6)} & $\dn$ & @{thm (rhs) L.simps(6)}\\
@@ -325,14 +345,12 @@
   @{thm (lhs) nullable.simps(3)} & $\dn$ & @{thm (rhs) nullable.simps(3)}\\
   @{thm (lhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
   @{thm (lhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
-  @{thm (lhs) nullable.simps(6)} & $\dn$ & @{thm (rhs) nullable.simps(6)}\medskip\\
-  
-  %\end{tabular}
-  %\end{center}
+  @{thm (lhs) nullable.simps(6)} & $\dn$ & @{thm (rhs) nullable.simps(6)}%\medskip\\
+  \end{tabular}
+  \end{center}
 
-  %\begin{center}
-  %\begin{tabular}{lcl}
-  
+  \begin{center}
+  \begin{tabular}{lcl}
   @{thm (lhs) der.simps(1)} & $\dn$ & @{thm (rhs) der.simps(1)}\\
   @{thm (lhs) der.simps(2)} & $\dn$ & @{thm (rhs) der.simps(2)}\\
   @{thm (lhs) der.simps(3)} & $\dn$ & @{thm (rhs) der.simps(3)}\\
@@ -384,7 +402,7 @@
 
 text {* 
 
-  The clever idea by Sulzmann and Lu \cite{Sulzmann2014} is to define
+  The clever idea by Sulzmann and Lu \cite{Sulzmann2014} is to use
   values for encoding \emph{how} a regular expression matches a string
   and then define a function on values that mirrors (but inverts) the
   construction of the derivative on regular expressions. \emph{Values}
@@ -422,7 +440,7 @@
   \end{center}
 
   \noindent Sulzmann and Lu also define inductively an inhabitation relation
-  that associates values to regular expressions:
+  that associates values to regular expressions
 
   \begin{center}
   \begin{tabular}{c}
@@ -436,7 +454,10 @@
   \end{tabular}
   \end{center}
 
-  \noindent Note that no values are associated with the regular expression
+  \noindent 
+  where in the clause for @{const "Stars"} we use the notation @{term "v \<in> set vs"}
+  to indicate that @{text v} is a member of the list @{text vs}.
+  Note that no values are associated with the regular expression
   @{term ZERO}, and that the only value associated with the regular
   expression @{term ONE} is @{term Void}, pronounced (if one must) as @{text
   "Void"}. It is routine to establish how values ``inhabiting'' a regular
@@ -446,10 +467,58 @@
   @{thm L_flat_Prf}
   \end{proposition}
 
-  In general there is more than one value associated with a regular
-  expression. In case of POSIX matching the problem is to calculate the
-  unique value that satisfies the (informal) POSIX rules from the
-  Introduction. Graphically the POSIX value calculation algorithm by
+  \noindent
+  Given a regular expression @{text r} and a string @{text s}, we can define the 
+  set of all \emph{Lexical Values} inhabited by @{text r} with the underlying string 
+  being @{text s} by
+  
+  \begin{center}
+  @{thm LV_def}
+  \end{center}
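+
+  \noindent
+  For example (a small instance, using the value constructors from above):
+  both @{text "Left Void"} and @{text "Right Void"} are values for the
+  regular expression @{term "ALT ONE ONE"}, and both flatten to the empty
+  string; hence @{term "LV (ALT ONE ONE) []"} consists of exactly these two
+  values. In general, therefore, more than one lexical value is associated
+  with a regular expression and a string.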
+
+  \noindent However, later on it will sometimes be necessary to
+  restrict the set of lexical values to a subset called
+  \emph{Canonical Values}. The idea of canonical values is that they
+  satisfy the Star Rule (see Introduction) where the $^\star$ does not
+  match the empty string unless this is the only match for the
+  repetition.  One way to define canonical values formally is to use a
+  stronger inhabitation relation, written @{term "\<Turnstile> DUMMY : DUMMY"}, which has the same rules as @{term
+  "\<turnstile> DUMMY : DUMMY"} shown above, except that the rule for 
+  @{term Stars} has
+  the additional side-condition that none of the flattened values is the
+  empty string, namely
+
+  \begin{center}
+  @{thm [mode=Rule] CPrf.intros(6)}
+  \end{center}
+
+  \noindent
+  With this we can define
+  
+  \begin{center}
+  @{thm CV_def}
+  \end{center}
+
+  \noindent
+  Clearly we have @{thm LV_CV_subset}.
+  The main point of canonical values is that for every regular expression @{text r} and every
+  string @{text s}, the set @{term "CV r s"} is finite.
+
+  \begin{lemma}
+  @{thm CV_finite}
+  \end{lemma}
+
+  \noindent This finiteness property does not generally hold for lexical values:
+  for example @{term "LV (STAR ONE) []"} contains infinitely many
+  values, whereas @{thm CV_STAR_ONE_empty}. However, even if a regular
+  expression @{text r} matches a string @{text s}, the
+  set @{term "CV r s"} is in general not just a
+  singleton set. In case of POSIX matching the problem is to
+  calculate the unique value that satisfies the (informal) POSIX rules
+  from the Introduction. It will turn out that this POSIX value is in fact a
+  canonical value.
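+
+  To spell out the example above (our small illustration of the
+  side-condition): the lexical values in @{term "LV (STAR ONE) []"} are
+  @{text "Stars []"}, @{text "Stars [Void]"}, @{text "Stars [Void, Void]"}
+  and so on, since every `iteration' of the star may match the empty string;
+  of these only @{text "Stars []"} satisfies the side-condition of the
+  stronger @{term Stars}-rule and is therefore canonical.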
+
+  Graphically the POSIX value calculation algorithm by
   Sulzmann and Lu can be illustrated by the picture in Figure~\ref{Sulz}
   where the path from the left to the right involving @{term derivatives}/@{const
   nullable} is the first phase of the algorithm (calculating successive
@@ -614,7 +683,7 @@
   \end{proof}
 
   Having defined the @{const mkeps} and @{text inj} function we can extend
-  \Brz's matcher so that a [lexical] value is constructed (assuming the
+  \Brz's matcher so that a value is constructed (assuming the
   regular expression matches the string). The clauses of the Sulzmann and Lu lexer are
 
   \begin{center}
@@ -633,19 +702,25 @@
   functional programming language and also in Isabelle/HOL. In the remaining
   part of this section we prove that this algorithm is correct.
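+
+  To illustrate the remark about implementation, the following is a small,
+  unverified Haskell sketch of the lexer (ours, not part of the
+  formalisation). It merely transcribes the clauses of @{text nullable},
+  @{text der}, @{const mkeps}, @{text inj} and the lexer shown in the
+  preceding sections; the constructor names \texttt{Left'}, \texttt{Right'}
+  and \texttt{Sq} are chosen to avoid clashes with the Haskell Prelude.
+
+  \begin{verbatim}
+  -- regular expressions and values, mirroring the datatypes above
+  data Rexp = Zero | One | Chr Char | Alt Rexp Rexp
+            | Seq Rexp Rexp | Star Rexp
+
+  data Val = Void | Char Char | Left' Val | Right' Val
+           | Sq Val Val | Stars [Val]
+           deriving Show
+
+  -- does r match the empty string?
+  nullable :: Rexp -> Bool
+  nullable Zero        = False
+  nullable One         = True
+  nullable (Chr _)     = False
+  nullable (Alt r1 r2) = nullable r1 || nullable r2
+  nullable (Seq r1 r2) = nullable r1 && nullable r2
+  nullable (Star _)    = True
+
+  -- Brzozowski derivative of r with respect to the character c
+  der :: Char -> Rexp -> Rexp
+  der _ Zero        = Zero
+  der _ One         = Zero
+  der c (Chr d)     = if c == d then One else Zero
+  der c (Alt r1 r2) = Alt (der c r1) (der c r2)
+  der c (Seq r1 r2)
+    | nullable r1   = Alt (Seq (der c r1) r2) (der c r2)
+    | otherwise     = Seq (der c r1) r2
+  der c (Star r)    = Seq (der c r) (Star r)
+
+  -- value witnessing how a nullable r matches the empty string
+  -- (partial: only defined for nullable r, as in the text)
+  mkeps :: Rexp -> Val
+  mkeps One         = Void
+  mkeps (Alt r1 r2) = if nullable r1 then Left' (mkeps r1)
+                                     else Right' (mkeps r2)
+  mkeps (Seq r1 r2) = Sq (mkeps r1) (mkeps r2)
+  mkeps (Star _)    = Stars []
+
+  -- transform a value for (der c r) into a value for r
+  -- (partial: only defined for well-formed inputs)
+  inj :: Rexp -> Char -> Val -> Val
+  inj (Chr d)     _ Void               = Char d
+  inj (Alt r1 _)  c (Left' v)          = Left' (inj r1 c v)
+  inj (Alt _ r2)  c (Right' v)         = Right' (inj r2 c v)
+  inj (Seq r1 _)  c (Sq v1 v2)         = Sq (inj r1 c v1) v2
+  inj (Seq r1 _)  c (Left' (Sq v1 v2)) = Sq (inj r1 c v1) v2
+  inj (Seq r1 r2) c (Right' v)         = Sq (mkeps r1) (inj r2 c v)
+  inj (Star r)    c (Sq v (Stars vs))  = Stars (inj r c v : vs)
+
+  -- the lexer: Nothing if r does not match s
+  lexer :: Rexp -> String -> Maybe Val
+  lexer r []     = if nullable r then Just (mkeps r) else Nothing
+  lexer r (c:cs) = fmap (inj r c) (lexer (der c r) cs)
+
+  -- running main prints: Just (Sq (Char 'a') (Char 'b'))
+  main :: IO ()
+  main = print (lexer (Seq (Chr 'a') (Chr 'b')) "ab")
+  \end{verbatim}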
 
-  The well-known idea of POSIX matching is informally defined by the longest
-  match and priority rule (see Introduction); as correctly argued in \cite{Sulzmann2014}, this
+  The well-known idea of POSIX matching is informally defined by some
+  rules such as the longest match and priority rule (see
+  Introduction); as correctly argued in \cite{Sulzmann2014}, this
   needs formal specification. Sulzmann and Lu define an ``ordering
-  relation'' between values and argue
-  that there is a maximum value, as given by the derivative-based algorithm.
-  In contrast, we shall introduce a simple inductive definition that
-  specifies directly what a \emph{POSIX value} is, incorporating the
-  POSIX-specific choices into the side-conditions of our rules. Our
-  definition is inspired by the matching relation given by Vansummeren
-  \cite{Vansummeren2006}. The relation we define is ternary and written as
-  \mbox{@{term "s \<in> r \<rightarrow> v"}}, relating strings, regular expressions and
-  values.
+  relation'' between values and argue that there is a maximum value,
+  as given by the derivative-based algorithm.  In contrast, we shall
+  introduce a simple inductive definition that specifies directly what
+  a \emph{POSIX value} is, incorporating the POSIX-specific choices
+  into the side-conditions of our rules. Our definition is inspired by
+  the matching relation given by Vansummeren
+  \cite{Vansummeren2006}. The relation we define is ternary and
+  written as \mbox{@{term "s \<in> r \<rightarrow> v"}}, relating
+  strings, regular expressions and values; the inductive rules are given in 
+  Figure~\ref{POSIXrules}.
+  We can prove that given a string @{term s} and regular expression @{term
+   r}, the POSIX value @{term v} is uniquely determined by @{term "s \<in> r \<rightarrow> v"}.
+
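+  As a small worked instance of these rules (our example, written with the
+  value notation from above): for the string @{text "ab"} and the regular
+  expression @{text "a + a\<cdot>b"} the rules derive
+  @{text "(ab, a + a\<cdot>b) \<rightarrow> Right (Seq (Char a) (Char b))"};
+  the rule @{text "P+R"} can be applied here only because its side-condition
+  holds, namely that @{text "ab"} is not in the language of the left
+  alternative @{text "a"}.
+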
   %
+  \begin{figure}[t]
   \begin{center}
   \begin{tabular}{c}
   @{thm[mode=Axiom] Posix.intros(1)}@{text "P"}@{term "ONE"} \qquad
@@ -668,10 +743,10 @@
    {@{thm (concl) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}$@{text "P\<star>"}
   \end{tabular}
   \end{center}
+  \caption{Our inductive definition of POSIX values.}\label{POSIXrules}
+  \end{figure}
 
-   \noindent
-   We can prove that given a string @{term s} and regular expression @{term
-   r}, the POSIX value @{term v} is uniquely determined by @{term "s \<in> r \<rightarrow> v"}.
+   
 
   \begin{theorem}\mbox{}\smallskip\\\label{posixdeterm}
   \begin{tabular}{ll}
@@ -687,7 +762,7 @@
   \end{proof}
 
   \noindent
-  We claim that our @{term "s \<in> r \<rightarrow> v"} relation captures the idea behind the two
+  We claim that our @{term "s \<in> r \<rightarrow> v"} relation captures the idea behind the four
   informal POSIX rules shown in the Introduction: Consider for example the
   rules @{text "P+L"} and @{text "P+R"} where the POSIX value for a string
   and an alternative regular expression, that is @{term "(s, ALT r\<^sub>1 r\<^sub>2)"},
@@ -718,8 +793,19 @@
   @{term v} cannot be flattened to the empty string. In effect, we require
   that in each ``iteration'' of the star, some non-empty substring needs to
   be ``chipped'' away; only in case of the empty string we accept @{term
-  "Stars []"} as the POSIX value.
+  "Stars []"} as the POSIX value. Indeed we can show that our POSIX value
+  is a canonical value which excludes those @{text Stars} containing values 
+  that flatten to the empty string.
 
+  \begin{lemma}
+  @{thm [mode=IfThen] Posix_CV}
+  \end{lemma}
+
+  \begin{proof}
+  By routine induction on @{thm (prem 1) Posix_CV}.\qed 
+  \end{proof}
+
+  \noindent
   Next is the lemma that shows the function @{term "mkeps"} calculates
   the POSIX value for the empty string and a nullable regular expression.
 
@@ -1117,17 +1203,11 @@
 
 text {*
 
- Theorems:
-
- @{thm [mode=IfThen] Posix_CPT}
+ Theorem 1:
 
  @{thm [mode=IfThen] Posix_PosOrd[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]}
 
- Corrollary from the last one
-
- @{thm [mode=IfThen] Posix_PosOrd_stronger[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]}
-
- Theorem
+ Theorem 2:
 
  @{thm [mode=IfThen] PosOrd_Posix[where ?v1.0="v\<^sub>1"]}
 *}