thys/Paper/Paper.thy
changeset 122 7c6c907660d8
parent 121 4c85af262ee7
child 123 1bee7006b92b
equal deleted inserted replaced
121:4c85af262ee7 122:7c6c907660d8
   384   In general there is more than one value associated with a regular
   384   In general there is more than one value associated with a regular
   385   expression. In case of POSIX matching the problem is to calculate the
   385   expression. In case of POSIX matching the problem is to calculate the
   386   unique value that satisfies the (informal) POSIX rules from the
   386   unique value that satisfies the (informal) POSIX rules from the
   387   Introduction. Graphically the POSIX value calculation algorithm by
   387   Introduction. Graphically the POSIX value calculation algorithm by
   388   Sulzmann and Lu can be illustrated by the picture in Figure~\ref{Sulz}
   388   Sulzmann and Lu can be illustrated by the picture in Figure~\ref{Sulz}
   389   where the path from the left to the right involving @{const der}/@{const
   389   where the path from the left to the right involving @{term derivatives}/@{const
   390   nullable} is the first phase of the algorithm (calculating successive
   390   nullable} is the first phase of the algorithm (calculating successive
   391   \Brz's derivatives) and @{const mkeps}/@{text inj}, the path from right to
   391   \Brz's derivatives) and @{const mkeps}/@{text inj}, the path from right to
   392   left, the second phase. This picture shows the steps required when a
   392   left, the second phase. This picture shows the steps required when a
   393   regular expression, say @{text "r\<^sub>1"}, matches the string @{term
   393   regular expression, say @{text "r\<^sub>1"}, matches the string @{term
   394   "[a,b,c]"}. We first build the three derivatives (according to @{term a},
   394   "[a,b,c]"}. We first build the three derivatives (according to @{term a},
   492 
   492 
   493   \noindent To better understand what is going on in this definition it
   493   \noindent To better understand what is going on in this definition it
   494   might be instructive to look first at the three sequence cases (clauses
   494   might be instructive to look first at the three sequence cases (clauses
   495   (4)--(6)). In each case we need to construct an ``injected value'' for
   495   (4)--(6)). In each case we need to construct an ``injected value'' for
   496   @{term "SEQ r\<^sub>1 r\<^sub>2"}. This must be a value of the form @{term
   496   @{term "SEQ r\<^sub>1 r\<^sub>2"}. This must be a value of the form @{term
   497   "Seq DUMMY DUMMY"}. Recall the clause of the @{const der}-function
   497   "Seq DUMMY DUMMY"}. Recall the clause of the @{text derivative}-function
   498   for sequence regular expressions:
   498   for sequence regular expressions:
   499 
   499 
   500   \begin{center}
   500   \begin{center}
   501   @{thm (lhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]} $\dn$ @{thm (rhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]}
   501   @{thm (lhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]} $\dn$ @{thm (rhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]}
   502   \end{center}
   502   \end{center}
   527   but our deviation is harmless.}
   527   but our deviation is harmless.}
   528 
   528 
   529   The idea of @{term inj} to ``inject back'' a character into a value can
   529   The idea of @{term inj} to ``inject back'' a character into a value can
   530   be made precise by the first part of the following lemma; the second
   530   be made precise by the first part of the following lemma; the second
   531   part shows that the underlying string of an @{const mkeps}-value is
   531   part shows that the underlying string of an @{const mkeps}-value is
   532   the empty string.
   532   always the empty string.
   533 
   533 
   534   \begin{lemma}\mbox{}\\\label{Prf_injval_flat}
   534   \begin{lemma}\mbox{}\smallskip\\\label{Prf_injval_flat}
   535   \begin{tabular}{ll}
   535   \begin{tabular}{ll}
   536   (1) & @{thm[mode=IfThen] Prf_injval_flat}\\
   536   (1) & @{thm[mode=IfThen] Prf_injval_flat}\\
   537   (2) & @{thm[mode=IfThen] mkeps_flat}
   537   (2) & @{thm[mode=IfThen] mkeps_flat}
   538   \end{tabular}
   538   \end{tabular}
   539   \end{lemma}
   539   \end{lemma}
       
   540 
       
   541   \begin{proof}
       
   542   Both properties are by routine inductions: the first one, for example,
       
   543   by an induction over the definition of @{term derivatives}; the second by
       
   544   induction on @{term r}. There are no interesting cases.
       
   545   \end{proof}
   540 
   546 
   541   Having defined the @{const mkeps} and @{text inj} function we can extend
   547   Having defined the @{const mkeps} and @{text inj} function we can extend
   542   \Brz's matcher so that a [lexical] value is constructed (assuming the
   548   \Brz's matcher so that a [lexical] value is constructed (assuming the
   543   regular expression matches the string). The clauses of the lexer are
   549   regular expression matches the string). The clauses of the lexer are
   544 
   550 
   549                      & & \phantom{$|$} @{term "None"}  @{text "\<Rightarrow>"} @{term None}\\
   555                      & & \phantom{$|$} @{term "None"}  @{text "\<Rightarrow>"} @{term None}\\
   550                      & & $|$ @{term "Some v"} @{text "\<Rightarrow>"} @{term "Some (injval r c v)"}                          
   556                      & & $|$ @{term "Some v"} @{text "\<Rightarrow>"} @{term "Some (injval r c v)"}                          
   551   \end{tabular}
   557   \end{tabular}
   552   \end{center}
   558   \end{center}
   553 
   559 
   554   \noindent If the regular expression does not match, @{const None} is
   560   \noindent If the regular expression does not match the string, @{const None} is
   555   returned, indicating an error is raised. If the regular expression does
   561   returned, indicating an error is raised. If the regular expression does
   556   match the string, then @{const Some} value is returned. One important
   562   match the string, then @{const Some} value is returned. One important
   557   virtue of this algorithm is that it can be implemented with ease in a
   563   virtue of this algorithm is that it can be implemented with ease in a
   558   functional programming language and also in Isabelle/HOL. In the remaining
   564   functional programming language and also in Isabelle/HOL. In the remaining
   559   part of this section we prove that this algorithm is correct.
   565   part of this section we prove that this algorithm is correct.
   560 
   566 
   561   The well-known idea of POSIX matching is informally defined by the longest
   567   The well-known idea of POSIX matching is informally defined by the longest
   562   match and priority rule; as correctly argued in \cite{Sulzmann2014}, this
   568   match and priority rule; as correctly argued in \cite{Sulzmann2014}, this
   563   needs formal specification. Sulzmann and Lu define a \emph{dominance}
   569   needs formal specification. Sulzmann and Lu define a \emph{dominance}
   564   relation\footnote{Sulzmann and Lu call it an ordering relation, but
   570   relation\footnote{Sulzmann and Lu call it an ordering relation, but
   565   without giving evidence that it is transitive.} between values and argue that
   571   without giving evidence that it is transitive.} between values and argue
   566   there is a maximum value, as given by the derivative-based algorithm. In
   572   that there is a maximum value, as given by the derivative-based algorithm.
   567   contrast, we shall next introduce a simple inductive definition to specify
   573   In contrast, we shall introduce a simple inductive definition that
   568   what a \emph{POSIX value} is, incorporating the POSIX-specific choices
   574   specifies directly what a \emph{POSIX value} is, incorporating the
   569   into the side-conditions of our rules. Our definition is inspired by the
   575   POSIX-specific choices into the side-conditions of our rules. Our
   570   matching relation given in \cite{Vansummeren2006}. The relation we define
   576   definition is inspired by the matching relation given in
   571   is ternary and written as \mbox{@{term "s \<in> r \<rightarrow> v"}}, relating strings,
   577   \cite{Vansummeren2006}. The relation we define is ternary and written as
   572   regular expressions and values.
   578   \mbox{@{term "s \<in> r \<rightarrow> v"}}, relating strings, regular expressions and
       
   579   values.
   573 
   580 
   574   \begin{center}
   581   \begin{center}
   575   \begin{tabular}{c}
   582   \begin{tabular}{c}
   576   @{thm[mode=Axiom] PMatch.intros(1)} \qquad
   583   @{thm[mode=Axiom] PMatch.intros(1)}@{text "P"}@{term "ONE"} \qquad
   577   @{thm[mode=Axiom] PMatch.intros(2)}\bigskip\\
   584   @{thm[mode=Axiom] PMatch.intros(2)}@{text "P"}@{term "c"}\bigskip\\
   578   @{thm[mode=Rule] PMatch.intros(3)[of "s" "r\<^sub>1" "v" "r\<^sub>2"]}\qquad
   585   @{thm[mode=Rule] PMatch.intros(3)[of "s" "r\<^sub>1" "v" "r\<^sub>2"]}@{text "P+L"}\qquad
   579   @{thm[mode=Rule] PMatch.intros(4)[of "s" "r\<^sub>2" "v" "r\<^sub>1"]}\bigskip\\
   586   @{thm[mode=Rule] PMatch.intros(4)[of "s" "r\<^sub>2" "v" "r\<^sub>1"]}@{text "P+R"}\bigskip\\
   580   $\mprset{flushleft}
   587   $\mprset{flushleft}
   581    \inferrule
   588    \inferrule
   582    {@{thm (prem 1) PMatch.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \qquad
   589    {@{thm (prem 1) PMatch.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \qquad
   583     @{thm (prem 2) PMatch.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \\\\
   590     @{thm (prem 2) PMatch.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \\\\
   584     @{thm (prem 3) PMatch.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}
   591     @{thm (prem 3) PMatch.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}
   585    {@{thm (concl) PMatch.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}$\\
   592    {@{thm (concl) PMatch.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}$@{text "PS"}\\
   586   @{thm[mode=Axiom] PMatch.intros(7)}\bigskip\\
   593   @{thm[mode=Axiom] PMatch.intros(7)}@{text "P[]"}\bigskip\\
   587   $\mprset{flushleft}
   594   $\mprset{flushleft}
   588    \inferrule
   595    \inferrule
   589    {@{thm (prem 1) PMatch.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
   596    {@{thm (prem 1) PMatch.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
   590     @{thm (prem 2) PMatch.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
   597     @{thm (prem 2) PMatch.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
   591     @{thm (prem 3) PMatch.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \\\\
   598     @{thm (prem 3) PMatch.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \\\\
   592     @{thm (prem 4) PMatch.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}
   599     @{thm (prem 4) PMatch.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}
   593    {@{thm (concl) PMatch.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}$
   600    {@{thm (concl) PMatch.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}$@{text "P\<star>"}
   594   \end{tabular}
   601   \end{tabular}
   595   \end{center}
   602   \end{center}
   596 
   603 
   597   \noindent We claim that this relation captures the idea behind the two
   604   \noindent We claim that this relation captures the idea behind the two
   598   informal POSIX rules shown in the Introduction: Consider the second line
   605   informal POSIX rules shown in the Introduction: Consider for example the
   599   where the POSIX values for an alternative regular expression is
   606   rules @{text "P+L"} and @{text "P+R"} where the POSIX value for an
   600   specified---it is always a @{text "Left"}-value, \emph{except} when the
   607   alternative regular expression is specified---it is always a @{text
   601   string to be matched is not in the language of @{term "r\<^sub>1"}; only then it
   608   "Left"}-value, \emph{except} when the string to be matched is not in the
   602   is a @{text Right}-value (see the side-condition in the rule on the
   609   language of @{term "r\<^sub>1"}; only then it is a @{text Right}-value (see the
   603   right). Interesting is also the rule for sequence regular expressions
   610   side-condition in @{text "P+R"}). Interesting is also the rule for
   604   (third line). The first two premises state that @{term "v\<^sub>1"} and
   611   sequence regular expressions (@{text "PS"}). The first two premises state
   605   @{term "v\<^sub>2"} are the POSIX values for @{term "(s\<^sub>1,
   612   that @{term "v\<^sub>1"} and @{term "v\<^sub>2"} are the POSIX values for
   606   r\<^sub>1)"} and @{term "(s\<^sub>2, r\<^sub>2)"} respectively.
   613   @{term "(s\<^sub>1, r\<^sub>1)"} and @{term "(s\<^sub>2, r\<^sub>2)"}
   607   
   614   respectively. Consider now the third premise and note that the POSIX value
       
   615   of this rule should match the string @{term "s\<^sub>1 @ s\<^sub>2"}. According to the
       
   616   longest match rule, we want that the @{term "s\<^sub>1"} is the longest initial
       
   617   split of @{term "s\<^sub>1 @ s\<^sub>2"} such that @{term "s\<^sub>2"} is still recognised
       
   618   by @{term "r\<^sub>2"}. Let us assume, contrary to the third premise, that there
       
   619   \emph{exists} an @{term "s\<^sub>3"} and @{term "s\<^sub>4"} such that @{term "s\<^sub>2"}
       
   620   can be split up into a non-empty @{term "s\<^sub>3"} and @{term "s\<^sub>4"}. Moreover
       
   621   the longer @{term "s\<^sub>1 @ s\<^sub>3"} can be matched by @{text "r\<^sub>1"} and the
       
   622   shorter @{term "s\<^sub>4"} can still be matched by @{term "r\<^sub>2"}. In this case
       
   623   @{term "s\<^sub>1"} would not be the longest initial split of @{term "s\<^sub>1 @
       
   624   s\<^sub>2"} and therefore @{term "Seq v\<^sub>1 v\<^sub>2"} cannot be a POSIX value
       
   625   for @{term "(s\<^sub>1 @ s\<^sub>2, SEQ r\<^sub>1 r\<^sub>2)"}. A similar condition is imposed
       
   626   onto the POSIX value in the @{text "P\<star>"}-rule. Also there we want that
       
   627    @{term "s\<^sub>1"} is the longest initial split of @{term "s\<^sub>1 @ s\<^sub>2"} and
       
   628    furthermore the corresponding value @{term v} cannot be flatten to
       
   629    the empty string. In effect, we require that in each ``iteration''
       
   630    of the star, some parts of the string need to be ``nibbled'' away; only
       
   631    in case of the empty string weBy accept @{term "Stars []"} as the 
       
   632    POSIX value.
       
   633 
       
   634    We can prove that given a string @{term s} and regular expression @{term
       
   635    r}, the POSIX value @{term v} is uniquely determined by @{term "s \<in> r \<rightarrow>
       
   636    v"}.
       
   637 
   608   \begin{theorem}
   638   \begin{theorem}
   609   @{thm[mode=IfThen] PMatch_determ(1)[of _ _ "v\<^sub>1" "v\<^sub>2"]}
   639   @{thm[mode=IfThen] PMatch_determ(1)[of _ _ "v\<^sub>1" "v\<^sub>2"]}
   610   \end{theorem}
   640   \end{theorem}
       
   641 
       
   642   \begin{proof}
       
   643   By induction on the definition of @{term "s \<in> r \<rightarrow> v\<^sub>1"} and a case
       
   644   analysis of @{term "s \<in> r \<rightarrow> v\<^sub>2"}.
       
   645   \end{proof}
   611 
   646 
   612   \begin{lemma}
   647   \begin{lemma}
   613   @{thm[mode=IfThen] PMatch_mkeps}
   648   @{thm[mode=IfThen] PMatch_mkeps}
   614   \end{lemma}
   649   \end{lemma}
   615 
   650