thys/Journal/Paper.thy
changeset 267 32b222d77fa0
parent 266 fff2e1b40dfc
child 268 6746f5e1f1f8
equal deleted inserted replaced
266:fff2e1b40dfc 267:32b222d77fa0
    13 
    13 
    14 
    14 
    15 
    15 
    16 declare [[show_question_marks = false]]
    16 declare [[show_question_marks = false]]
    17 
    17 
       
    18 syntax (latex output)
       
    19   "_Collect" :: "pttrn => bool => 'a set"              ("(1{_ \<^raw:\mbox{\boldmath$\mid$}> _})")
       
    20   "_CollectIn" :: "pttrn => 'a set => bool => 'a set"   ("(1{_ \<in> _ |e _})")
       
    21 
       
    22 
    18 abbreviation 
    23 abbreviation 
    19   "der_syn r c \<equiv> der c r"
    24   "der_syn r c \<equiv> der c r"
    20 
    25 
    21 abbreviation 
    26 abbreviation 
    22   "ders_syn r s \<equiv> ders s r"
    27   "ders_syn r s \<equiv> ders s r"
    23 
    28 
    24 
    29 
    25 abbreviation
    30 abbreviation
    26   "nprec v1 v2 \<equiv> \<not>(v1 :\<sqsubset>val v2)"
    31   "nprec v1 v2 \<equiv> \<not>(v1 :\<sqsubset>val v2)"
       
    32 
       
    33 
    27 
    34 
    28 
    35 
    29 notation (latex output)
    36 notation (latex output)
    30   If  ("(\<^raw:\textrm{>if\<^raw:}> (_)/ \<^raw:\textrm{>then\<^raw:}> (_)/ \<^raw:\textrm{>else\<^raw:}> (_))" 10) and
    37   If  ("(\<^raw:\textrm{>if\<^raw:}> (_)/ \<^raw:\textrm{>then\<^raw:}> (_)/ \<^raw:\textrm{>else\<^raw:}> (_))" 10) and
    31   Cons ("_\<^raw:\mbox{$\,$}>::\<^raw:\mbox{$\,$}>_" [75,73] 73) and  
    38   Cons ("_\<^raw:\mbox{$\,$}>::\<^raw:\mbox{$\,$}>_" [75,73] 73) and  
    32 
    39 
    33   ZERO ("\<^bold>0" 78) and 
    40   ZERO ("\<^bold>0" 78) and 
    34   ONE ("\<^bold>1" 78) and 
    41   ONE ("\<^bold>1" 1000) and 
    35   CHAR ("_" [1000] 80) and
    42   CHAR ("_" [1000] 80) and
    36   ALT ("_ + _" [77,77] 78) and
    43   ALT ("_ + _" [77,77] 78) and
    37   SEQ ("_ \<cdot> _" [77,77] 78) and
    44   SEQ ("_ \<cdot> _" [77,77] 78) and
    38   STAR ("_\<^sup>\<star>" [1000] 78) and
    45   STAR ("_\<^sup>\<star>" [1000] 78) and
    39   
    46   
    51   Sequ ("_ @ _" [78,77] 63) and
    58   Sequ ("_ @ _" [78,77] 63) and
    52   injval ("inj _ _ _" [79,77,79] 76) and 
    59   injval ("inj _ _ _" [79,77,79] 76) and 
    53   mkeps ("mkeps _" [79] 76) and 
    60   mkeps ("mkeps _" [79] 76) and 
    54   length ("len _" [73] 73) and
    61   length ("len _" [73] 73) and
    55   intlen ("len _" [73] 73) and
    62   intlen ("len _" [73] 73) and
       
    63   set ("_" [73] 73) and
    56  
    64  
    57   Prf ("_ : _" [75,75] 75) and  
    65   Prf ("_ : _" [75,75] 75) and
       
    66   CPrf ("_ \<^raw:\mbox{\textbf{\textlengthmark}}> _" [75,75] 75) and
    58   Posix ("'(_, _') \<rightarrow> _" [63,75,75] 75) and
    67   Posix ("'(_, _') \<rightarrow> _" [63,75,75] 75) and
    59  
    68  
    60   lexer ("lexer _ _" [78,78] 77) and 
    69   lexer ("lexer _ _" [78,78] 77) and 
    61   F_RIGHT ("F\<^bsub>Right\<^esub> _") and
    70   F_RIGHT ("F\<^bsub>Right\<^esub> _") and
    62   F_LEFT ("F\<^bsub>Left\<^esub> _") and  
    71   F_LEFT ("F\<^bsub>Left\<^esub> _") and  
    82   *)
    91   *)
    83 
    92 
    84 definition 
    93 definition 
    85   "match r s \<equiv> nullable (ders s r)"
    94   "match r s \<equiv> nullable (ders s r)"
    86 
    95 
       
    96 
       
    97 lemma CV_STAR_ONE_empty: 
       
    98   shows "CV (STAR ONE) [] = {Stars []}"
       
    99 by(auto simp add: CV_def elim: CPrf.cases intro: CPrf.intros)
       
   100 
       
   101 
       
   102 
    87 (*
   103 (*
    88 comments not implemented
   104 comments not implemented
    89 
   105 
    90 p9. The condtion "not exists s3 s4..." appears often enough (in particular in
   106 p9. The condtion "not exists s3 s4..." appears often enough (in particular in
    91 the proof of Lemma 3) to warrant a definition.
   107 the proof of Lemma 3) to warrant a definition.
    92 
   108 
    93 *)
   109 *)
    94 
   110 
    95 (*>*)
   111 (*>*)
       
   112 
       
   113 
    96 
   114 
    97 section {* Introduction *}
   115 section {* Introduction *}
    98 
   116 
    99 
   117 
   100 text {*
   118 text {*
   151 
   169 
   152 \item[$\bullet$] \emph{Star Rule:} A subexpression repeated by ${}^\star$ shall 
   170 \item[$\bullet$] \emph{Star Rule:} A subexpression repeated by ${}^\star$ shall 
   153 not match an empty string unless this is the only match for the repetition.\smallskip
   171 not match an empty string unless this is the only match for the repetition.\smallskip
   154 
   172 
   155 \item[$\bullet$] \emph{Empty String Rule:} An empty string shall be considered to 
   173 \item[$\bullet$] \emph{Empty String Rule:} An empty string shall be considered to 
   156 be longer than no match at all.
   174 be longer than no match at all.\marginpar{Explain its purpose}
   157 \end{itemize}
   175 \end{itemize}
   158 
   176 
   159 \noindent Consider for example a regular expression @{text "r\<^bsub>key\<^esub>"} for recognising keywords
   177 \noindent Consider for example a regular expression @{text "r\<^bsub>key\<^esub>"} for recognising keywords
   160 such as @{text "if"}, @{text "then"} and so on; and @{text "r\<^bsub>id\<^esub>"}
   178 such as @{text "if"}, @{text "then"} and so on; and @{text "r\<^bsub>id\<^esub>"}
   161 recognising identifiers (say, a single character followed by
   179 recognising identifiers (say, a single character followed by
   163 @{text "(r\<^bsub>key\<^esub> + r\<^bsub>id\<^esub>)\<^sup>\<star>"} and use POSIX matching to tokenise strings,
   181 @{text "(r\<^bsub>key\<^esub> + r\<^bsub>id\<^esub>)\<^sup>\<star>"} and use POSIX matching to tokenise strings,
   164 say @{text "iffoo"} and @{text "if"}.  For @{text "iffoo"} we obtain
   182 say @{text "iffoo"} and @{text "if"}.  For @{text "iffoo"} we obtain
   165 by the Longest Match Rule a single identifier token, not a keyword
   183 by the Longest Match Rule a single identifier token, not a keyword
   166 followed by an identifier. For @{text "if"} we obtain by the Priority
   184 followed by an identifier. For @{text "if"} we obtain by the Priority
   167 Rule a keyword token, not an identifier token---even if @{text "r\<^bsub>id\<^esub>"}
   185 Rule a keyword token, not an identifier token---even if @{text "r\<^bsub>id\<^esub>"}
   168 matches also.
   186 matches also. By the Star Rule we know @{text "(r\<^bsub>key\<^esub> + r\<^bsub>id\<^esub>)\<^sup>\<star>"} matches @{text "iffoo"}, respectively @{text "if"}, in exactly one
       
   187 `iteration' of the star.
       
   188 
   169 
   189 
   170 One limitation of Brzozowski's matcher is that it only generates a
   190 One limitation of Brzozowski's matcher is that it only generates a
   171 YES/NO answer for whether a string is being matched by a regular
   191 YES/NO answer for whether a string is being matched by a regular
   172 expression.  Sulzmann and Lu~\cite{Sulzmann2014} extended this matcher
   192 expression.  Sulzmann and Lu~\cite{Sulzmann2014} extended this matcher
   173 to allow generation not just of a YES/NO answer but of an actual
   193 to allow generation not just of a YES/NO answer but of an actual
   183 r}, and to show that (once a string to be matched is chosen) there is
   203 r}, and to show that (once a string to be matched is chosen) there is
   184 a maximum element and that it is computed by their derivative-based
   204 a maximum element and that it is computed by their derivative-based
   185 algorithm. This proof idea is inspired by work of Frisch and Cardelli
   205 algorithm. This proof idea is inspired by work of Frisch and Cardelli
   186 \cite{Frisch2004} on a GREEDY regular expression matching
   206 \cite{Frisch2004} on a GREEDY regular expression matching
   187 algorithm. However, we were not able to establish transitivity and
   207 algorithm. However, we were not able to establish transitivity and
   188 totality for the ``order relation'' by Sulzmann and Lu. ??In Section
   208 totality for the ``order relation'' by Sulzmann and Lu. \marginpar{We probably drop this section} 
       
   209 ??In Section
   189 \ref{argu} we identify some inherent problems with their approach (of
   210 \ref{argu} we identify some inherent problems with their approach (of
   190 which some of the proofs are not published in \cite{Sulzmann2014});
   211 which some of the proofs are not published in \cite{Sulzmann2014});
   191 perhaps more importantly, we give a simple inductive (and
   212 perhaps more importantly, we give a simple inductive (and
   192 algorithm-independent) definition of what we call being a {\em POSIX
   213 algorithm-independent) definition of what we call being a {\em POSIX
   193 value} for a regular expression @{term r} and a string @{term s}; we
   214 value} for a regular expression @{term r} and a string @{term s}; we
   228 \cite{Sulzmann2014} is available at the website of its first author; this
   249 \cite{Sulzmann2014} is available at the website of its first author; this
   229 extended version already includes remarks in the appendix that their
   250 extended version already includes remarks in the appendix that their
   230 informal proof contains gaps, and possible fixes are not fully worked out.}
   251 informal proof contains gaps, and possible fixes are not fully worked out.}
   231 Our specification of a POSIX value consists of a simple inductive definition
   252 Our specification of a POSIX value consists of a simple inductive definition
   232 that given a string and a regular expression uniquely determines this value.
   253 that given a string and a regular expression uniquely determines this value.
       
   254 We also show that our definition is equivalent to an ordering 
       
   255 of values based on positions by Okui and Suzuki \cite{OkuiSuzuki2013}.
   233 Derivatives as calculated by Brzozowski's method are usually more complex
   256 Derivatives as calculated by Brzozowski's method are usually more complex
   234 regular expressions than the initial one; various optimisations are
   257 regular expressions than the initial one; various optimisations are
   235 possible. We prove the correctness when simplifications of @{term "ALT ZERO
   258 possible. We prove the correctness when simplifications of @{term "ALT ZERO
   236 r"}, @{term "ALT r ZERO"}, @{term "SEQ ONE r"} and @{term "SEQ r ONE"} to
   259 r"}, @{term "ALT r ZERO"}, @{term "SEQ ONE r"} and @{term "SEQ r ONE"} to
   237 @{term r} are applied.
   260 @{term r} are applied.
   265   only the empty string and @{term c} for matching a character literal. The
   288   only the empty string and @{term c} for matching a character literal. The
   266   language of a regular expression is also defined as usual by the
   289   language of a regular expression is also defined as usual by the
   267   recursive function @{term L} with the six clauses:
   290   recursive function @{term L} with the six clauses:
   268 
   291 
   269   \begin{center}
   292   \begin{center}
   270   \begin{tabular}{l@ {\hspace{3mm}}rcl}
   293   \begin{tabular}{l@ {\hspace{4mm}}rcl}
   271   (1) & @{thm (lhs) L.simps(1)} & $\dn$ & @{thm (rhs) L.simps(1)}\\
   294   (1) & @{thm (lhs) L.simps(1)} & $\dn$ & @{thm (rhs) L.simps(1)}\\
   272   (2) & @{thm (lhs) L.simps(2)} & $\dn$ & @{thm (rhs) L.simps(2)}\\
   295   (2) & @{thm (lhs) L.simps(2)} & $\dn$ & @{thm (rhs) L.simps(2)}\\
   273   (3) & @{thm (lhs) L.simps(3)} & $\dn$ & @{thm (rhs) L.simps(3)}\\
   296   (3) & @{thm (lhs) L.simps(3)} & $\dn$ & @{thm (rhs) L.simps(3)}\\
   274   \end{tabular}
       
   275   \hspace{14mm}
       
   276   \begin{tabular}{l@ {\hspace{3mm}}rcl}
       
   277   (4) & @{thm (lhs) L.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) L.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
   297   (4) & @{thm (lhs) L.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) L.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
   278   (5) & @{thm (lhs) L.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) L.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
   298   (5) & @{thm (lhs) L.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) L.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
   279   (6) & @{thm (lhs) L.simps(6)} & $\dn$ & @{thm (rhs) L.simps(6)}\\
   299   (6) & @{thm (lhs) L.simps(6)} & $\dn$ & @{thm (rhs) L.simps(6)}\\
   280   \end{tabular}
   300   \end{tabular}
   281   \end{center}
   301   \end{center}
   323   @{thm (lhs) nullable.simps(1)} & $\dn$ & @{thm (rhs) nullable.simps(1)}\\
   343   @{thm (lhs) nullable.simps(1)} & $\dn$ & @{thm (rhs) nullable.simps(1)}\\
   324   @{thm (lhs) nullable.simps(2)} & $\dn$ & @{thm (rhs) nullable.simps(2)}\\
   344   @{thm (lhs) nullable.simps(2)} & $\dn$ & @{thm (rhs) nullable.simps(2)}\\
   325   @{thm (lhs) nullable.simps(3)} & $\dn$ & @{thm (rhs) nullable.simps(3)}\\
   345   @{thm (lhs) nullable.simps(3)} & $\dn$ & @{thm (rhs) nullable.simps(3)}\\
   326   @{thm (lhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
   346   @{thm (lhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
   327   @{thm (lhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
   347   @{thm (lhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
   328   @{thm (lhs) nullable.simps(6)} & $\dn$ & @{thm (rhs) nullable.simps(6)}\medskip\\
   348   @{thm (lhs) nullable.simps(6)} & $\dn$ & @{thm (rhs) nullable.simps(6)}%\medskip\\
   329   
   349   \end{tabular}
   330   %\end{tabular}
   350   \end{center}
   331   %\end{center}
   351 
   332 
   352   \begin{center}
   333   %\begin{center}
   353   \begin{tabular}{lcl}
   334   %\begin{tabular}{lcl}
       
   335   
       
   336   @{thm (lhs) der.simps(1)} & $\dn$ & @{thm (rhs) der.simps(1)}\\
   354   @{thm (lhs) der.simps(1)} & $\dn$ & @{thm (rhs) der.simps(1)}\\
   337   @{thm (lhs) der.simps(2)} & $\dn$ & @{thm (rhs) der.simps(2)}\\
   355   @{thm (lhs) der.simps(2)} & $\dn$ & @{thm (rhs) der.simps(2)}\\
   338   @{thm (lhs) der.simps(3)} & $\dn$ & @{thm (rhs) der.simps(3)}\\
   356   @{thm (lhs) der.simps(3)} & $\dn$ & @{thm (rhs) der.simps(3)}\\
   339   @{thm (lhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]}\\
   357   @{thm (lhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]}\\
   340   @{thm (lhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]}\\
   358   @{thm (lhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]}\\
   382 
   400 
   383 section {* POSIX Regular Expression Matching\label{posixsec} *}
   401 section {* POSIX Regular Expression Matching\label{posixsec} *}
   384 
   402 
   385 text {* 
   403 text {* 
   386 
   404 
   387   The clever idea by Sulzmann and Lu \cite{Sulzmann2014} is to define
   405   The clever idea by Sulzmann and Lu \cite{Sulzmann2014} is to use
   388   values for encoding \emph{how} a regular expression matches a string
   406   values for encoding \emph{how} a regular expression matches a string
   389   and then define a function on values that mirrors (but inverts) the
   407   and then define a function on values that mirrors (but inverts) the
   390   construction of the derivative on regular expressions. \emph{Values}
   408   construction of the derivative on regular expressions. \emph{Values}
   391   are defined as the inductive datatype
   409   are defined as the inductive datatype
   392 
   410 
   420   @{thm (lhs) flat.simps(7)} & $\dn$ & @{thm (rhs) flat.simps(7)}\\
   438   @{thm (lhs) flat.simps(7)} & $\dn$ & @{thm (rhs) flat.simps(7)}\\
   421   \end{tabular}
   439   \end{tabular}
   422   \end{center}
   440   \end{center}
   423 
   441 
   424   \noindent Sulzmann and Lu also define inductively an inhabitation relation
   442   \noindent Sulzmann and Lu also define inductively an inhabitation relation
   425   that associates values to regular expressions:
   443   that associates values to regular expressions
   426 
   444 
   427   \begin{center}
   445   \begin{center}
   428   \begin{tabular}{c}
   446   \begin{tabular}{c}
   429   \\[-8mm]
   447   \\[-8mm]
   430   @{thm[mode=Axiom] Prf.intros(4)} \qquad
   448   @{thm[mode=Axiom] Prf.intros(4)} \qquad
   434   @{thm[mode=Rule] Prf.intros(1)[of "v\<^sub>1" "r\<^sub>1" "v\<^sub>2" "r\<^sub>2"]} \qquad
   452   @{thm[mode=Rule] Prf.intros(1)[of "v\<^sub>1" "r\<^sub>1" "v\<^sub>2" "r\<^sub>2"]} \qquad
   435   @{thm[mode=Rule] Prf.intros(6)[of "vs"]}
   453   @{thm[mode=Rule] Prf.intros(6)[of "vs"]}
   436   \end{tabular}
   454   \end{tabular}
   437   \end{center}
   455   \end{center}
   438 
   456 
   439   \noindent Note that no values are associated with the regular expression
   457   \noindent 
       
   458   where in the clause for @{const "Stars"} we use the notation @{term "v \<in> set vs"}
       
   459   for indicating that @{text v} is a member in the list @{text vs}.
       
   460   Note that no values are associated with the regular expression
   440   @{term ZERO}, and that the only value associated with the regular
   461   @{term ZERO}, and that the only value associated with the regular
   441   expression @{term ONE} is @{term Void}, pronounced (if one must) as @{text
   462   expression @{term ONE} is @{term Void}, pronounced (if one must) as @{text
   442   "Void"}. It is routine to establish how values ``inhabiting'' a regular
   463   "Void"}. It is routine to establish how values ``inhabiting'' a regular
   443   expression correspond to the language of a regular expression, namely
   464   expression correspond to the language of a regular expression, namely
   444 
   465 
   445   \begin{proposition}
   466   \begin{proposition}
   446   @{thm L_flat_Prf}
   467   @{thm L_flat_Prf}
   447   \end{proposition}
   468   \end{proposition}
   448 
   469 
   449   In general there is more than one value associated with a regular
   470   \noindent
   450   expression. In case of POSIX matching the problem is to calculate the
   471   Given a regular expression @{text r} and a string @{text s}, we can define the 
   451   unique value that satisfies the (informal) POSIX rules from the
   472   set of all \emph{Lexical Values} inhabited by @{text r} with the underlying string 
   452   Introduction. Graphically the POSIX value calculation algorithm by
   473   being @{text s} by
       
   474   
       
   475   \begin{center}
       
   476   @{thm LV_def}
       
   477   \end{center}
       
   478 
       
   479   \noindent However, later on it will sometimes be necessary to
       
   480   restrict the set of lexical values to a subset called
       
   481   \emph{Canonical Values}. The idea of canonical values is that they
       
   482   satisfy the Star Rule (see Introduction) where the $^\star$ does not
       
   483   match the empty string unless this is the only match for the
       
   484   repetition.  One way to define canonical values formally is to use a
       
   485   stronger inhabitation relation, written @{term "\<Turnstile> DUMMY : DUMMY"}, which has the same rules as @{term
       
   486   "\<turnstile> DUMMY : DUMMY"} shown above, except that the rule for 
       
   487   @{term Stars} has
       
   488   the additional side-condition of flattened values not being the
       
   489   empty string, namely
       
   490 
       
   491   \begin{center}
       
   492   @{thm [mode=Rule] CPrf.intros(6)}
       
   493   \end{center}
       
   494 
       
   495   \noindent
       
   496   With this we can define
       
   497   
       
   498   \begin{center}
       
   499   @{thm CV_def}
       
   500   \end{center}
       
   501 
       
   502   \noindent
       
   503   Clearly we have @{thm LV_CV_subset}.
       
   504   The main point of canonical values is that for every regular expression @{text r} and every
       
   505   string @{text s}, the set @{term "CV r s"} is finite.
       
   506 
       
   507   \begin{lemma}
       
   508   @{thm CV_finite}
       
   509   \end{lemma}
       
   510 
       
   511   \noindent This finiteness property does not generally hold for lexical values where
       
   512   for example @{term "LV (STAR ONE) []"} contains infinitely many
       
   513   values, but @{thm CV_STAR_ONE_empty}. However, if a regular
       
   514   expression @{text r} matches a string @{text s}, then in general the
       
   515   set @{term "CV r s"} is not just a
       
   516   singleton set.  In case of POSIX matching the problem is to
       
   517   calculate the unique value that satisfies the (informal) POSIX rules
       
   518   from the Introduction. It will turn out that this POSIX value is in fact a
       
   519   canonical value.
       
   520 
       
   521   Graphically the POSIX value calculation algorithm by
   453   Sulzmann and Lu can be illustrated by the picture in Figure~\ref{Sulz}
   522   Sulzmann and Lu can be illustrated by the picture in Figure~\ref{Sulz}
   454   where the path from the left to the right involving @{term derivatives}/@{const
   523   where the path from the left to the right involving @{term derivatives}/@{const
   455   nullable} is the first phase of the algorithm (calculating successive
   524   nullable} is the first phase of the algorithm (calculating successive
   456   \Brz's derivatives) and @{const mkeps}/@{text inj}, the path from right to
   525   \Brz's derivatives) and @{const mkeps}/@{text inj}, the path from right to
   457   left, the second phase. This picture shows the steps required when a
   526   left, the second phase. This picture shows the steps required when a
   612   be proved by induction over the definition of @{term derivatives}; the second by
   681   be proved by induction over the definition of @{term derivatives}; the second by
   613   an induction on @{term r}. There are no interesting cases.\qed
   682   an induction on @{term r}. There are no interesting cases.\qed
   614   \end{proof}
   683   \end{proof}
   615 
   684 
   616   Having defined the @{const mkeps} and @{text inj} function we can extend
   685   Having defined the @{const mkeps} and @{text inj} function we can extend
   617   \Brz's matcher so that a [lexical] value is constructed (assuming the
   686   \Brz's matcher so that a value is constructed (assuming the
   618   regular expression matches the string). The clauses of the Sulzmann and Lu lexer are
   687   regular expression matches the string). The clauses of the Sulzmann and Lu lexer are
   619 
   688 
   620   \begin{center}
   689   \begin{center}
   621   \begin{tabular}{lcl}
   690   \begin{tabular}{lcl}
   622   @{thm (lhs) lexer.simps(1)} & $\dn$ & @{thm (rhs) lexer.simps(1)}\\
   691   @{thm (lhs) lexer.simps(1)} & $\dn$ & @{thm (rhs) lexer.simps(1)}\\
   631   match the string, then @{const Some} value is returned. One important
   700   match the string, then @{const Some} value is returned. One important
   632   virtue of this algorithm is that it can be implemented with ease in any
   701   virtue of this algorithm is that it can be implemented with ease in any
   633   functional programming language and also in Isabelle/HOL. In the remaining
   702   functional programming language and also in Isabelle/HOL. In the remaining
   634   part of this section we prove that this algorithm is correct.
   703   part of this section we prove that this algorithm is correct.
   635 
   704 
   636   The well-known idea of POSIX matching is informally defined by the longest
   705   The well-known idea of POSIX matching is informally defined by some
   637   match and priority rule (see Introduction); as correctly argued in \cite{Sulzmann2014}, this
   706   rules such as the longest match and priority rule (see
       
   707   Introduction); as correctly argued in \cite{Sulzmann2014}, this
   638   needs formal specification. Sulzmann and Lu define an ``ordering
   708   needs formal specification. Sulzmann and Lu define an ``ordering
   639   relation'' between values and argue
   709   relation'' between values and argue that there is a maximum value,
   640   that there is a maximum value, as given by the derivative-based algorithm.
   710   as given by the derivative-based algorithm.  In contrast, we shall
   641   In contrast, we shall introduce a simple inductive definition that
   711   introduce a simple inductive definition that specifies directly what
   642   specifies directly what a \emph{POSIX value} is, incorporating the
   712   a \emph{POSIX value} is, incorporating the POSIX-specific choices
   643   POSIX-specific choices into the side-conditions of our rules. Our
   713   into the side-conditions of our rules. Our definition is inspired by
   644   definition is inspired by the matching relation given by Vansummeren
   714   the matching relation given by Vansummeren
   645   \cite{Vansummeren2006}. The relation we define is ternary and written as
   715   \cite{Vansummeren2006}. The relation we define is ternary and
   646   \mbox{@{term "s \<in> r \<rightarrow> v"}}, relating strings, regular expressions and
   716   written as \mbox{@{term "s \<in> r \<rightarrow> v"}}, relating
   647   values.
   717   strings, regular expressions and values; the inductive rules are given in 
       
   718   Figure~\ref{POSIXrules}.
       
   719   We can prove that given a string @{term s} and regular expression @{term
       
   720    r}, the POSIX value @{term v} is uniquely determined by @{term "s \<in> r \<rightarrow> v"}.
       
   721 
   648   %
   722   %
       
   723   \begin{figure}[t]
   649   \begin{center}
   724   \begin{center}
   650   \begin{tabular}{c}
   725   \begin{tabular}{c}
   651   @{thm[mode=Axiom] Posix.intros(1)}@{text "P"}@{term "ONE"} \qquad
   726   @{thm[mode=Axiom] Posix.intros(1)}@{text "P"}@{term "ONE"} \qquad
   652   @{thm[mode=Axiom] Posix.intros(2)}@{text "P"}@{term "c"}\medskip\\
   727   @{thm[mode=Axiom] Posix.intros(2)}@{text "P"}@{term "c"}\medskip\\
   653   @{thm[mode=Rule] Posix.intros(3)[of "s" "r\<^sub>1" "v" "r\<^sub>2"]}@{text "P+L"}\qquad
   728   @{thm[mode=Rule] Posix.intros(3)[of "s" "r\<^sub>1" "v" "r\<^sub>2"]}@{text "P+L"}\qquad
   666     @{thm (prem 3) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \\\\
   741     @{thm (prem 3) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \\\\
   667     @{thm (prem 4) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}
   742     @{thm (prem 4) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}
   668    {@{thm (concl) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}$@{text "P\<star>"}
   743    {@{thm (concl) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}$@{text "P\<star>"}
   669   \end{tabular}
   744   \end{tabular}
   670   \end{center}
   745   \end{center}
   671 
   746   \caption{Our inductive definition of POSIX values.}\label{POSIXrules}
   672    \noindent
   747   \end{figure}
   673    We can prove that given a string @{term s} and regular expression @{term
   748 
   674    r}, the POSIX value @{term v} is uniquely determined by @{term "s \<in> r \<rightarrow> v"}.
   749    
   675 
   750 
   676   \begin{theorem}\mbox{}\smallskip\\\label{posixdeterm}
   751   \begin{theorem}\mbox{}\smallskip\\\label{posixdeterm}
   677   \begin{tabular}{ll}
   752   \begin{tabular}{ll}
   678   @{text "(1)"} & If @{thm (prem 1) Posix1(1)} then @{thm (concl)
   753   @{text "(1)"} & If @{thm (prem 1) Posix1(1)} then @{thm (concl)
   679   Posix1(1)} and @{thm (concl) Posix1(2)}.\\
   754   Posix1(1)} and @{thm (concl) Posix1(2)}.\\
   685   The second parts follows by a case analysis of @{term "s \<in> r \<rightarrow> v'"} and
   760   The second parts follows by a case analysis of @{term "s \<in> r \<rightarrow> v'"} and
   686   the first part.\qed
   761   the first part.\qed
   687   \end{proof}
   762   \end{proof}
   688 
   763 
   689   \noindent
   764   \noindent
   690   We claim that our @{term "s \<in> r \<rightarrow> v"} relation captures the idea behind the two
   765   We claim that our @{term "s \<in> r \<rightarrow> v"} relation captures the idea behind the four
   691   informal POSIX rules shown in the Introduction: Consider for example the
   766   informal POSIX rules shown in the Introduction: Consider for example the
   692   rules @{text "P+L"} and @{text "P+R"} where the POSIX value for a string
   767   rules @{text "P+L"} and @{text "P+R"} where the POSIX value for a string
   693   and an alternative regular expression, that is @{term "(s, ALT r\<^sub>1 r\<^sub>2)"},
   768   and an alternative regular expression, that is @{term "(s, ALT r\<^sub>1 r\<^sub>2)"},
   694   is specified---it is always a @{text "Left"}-value, \emph{except} when the
   769   is specified---it is always a @{text "Left"}-value, \emph{except} when the
   695   string to be matched is not in the language of @{term "r\<^sub>1"}; only then it
   770   string to be matched is not in the language of @{term "r\<^sub>1"}; only then it
   716   "P\<star>"}-rule. Also there we want that @{term "s\<^sub>1"} is the longest initial
   791   "P\<star>"}-rule. Also there we want that @{term "s\<^sub>1"} is the longest initial
   717   split of @{term "s\<^sub>1 @ s\<^sub>2"} and furthermore the corresponding value
   792   split of @{term "s\<^sub>1 @ s\<^sub>2"} and furthermore the corresponding value
   718   @{term v} cannot be flattened to the empty string. In effect, we require
   793   @{term v} cannot be flattened to the empty string. In effect, we require
   719   that in each ``iteration'' of the star, some non-empty substring needs to
   794   that in each ``iteration'' of the star, some non-empty substring needs to
   720   be ``chipped'' away; only in case of the empty string we accept @{term
   795   be ``chipped'' away; only in case of the empty string we accept @{term
   721   "Stars []"} as the POSIX value.
   796   "Stars []"} as the POSIX value. Indeed we can show that our POSIX value
   722 
   797   is a canonical value which excludes those @{text Stars} containing values 
       
   798   that flatten to the empty string.
       
   799 
       
   800   \begin{lemma}
       
   801   @{thm [mode=IfThen] Posix_CV}
       
   802   \end{lemma}
       
   803 
       
   804   \begin{proof}
       
   805   By routine induction on @{thm (prem 1) Posix_CV}.\qed 
       
   806   \end{proof}
       
   807 
       
   808   \noindent
   723   Next is the lemma that shows the function @{term "mkeps"} calculates
   809   Next is the lemma that shows the function @{term "mkeps"} calculates
   724   the POSIX value for the empty string and a nullable regular expression.
   810   the POSIX value for the empty string and a nullable regular expression.
   725 
   811 
   726   \begin{lemma}\label{lemmkeps}
   812   \begin{lemma}\label{lemmkeps}
   727   @{thm[mode=IfThen] Posix_mkeps}
   813   @{thm[mode=IfThen] Posix_mkeps}
  1115 
  1201 
  1116 *}
  1202 *}
  1117 
  1203 
  1118 text {*
  1204 text {*
  1119 
  1205 
  1120  Theorems:
  1206  Theorem 1:
  1121 
       
  1122  @{thm [mode=IfThen] Posix_CPT}
       
  1123 
  1207 
  1124  @{thm [mode=IfThen] Posix_PosOrd[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]}
  1208  @{thm [mode=IfThen] Posix_PosOrd[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]}
  1125 
  1209 
  1126  Corrollary from the last one
  1210  Theorem 2:
  1127 
       
  1128  @{thm [mode=IfThen] Posix_PosOrd_stronger[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]}
       
  1129 
       
  1130  Theorem
       
  1131 
  1211 
  1132  @{thm [mode=IfThen] PosOrd_Posix[where ?v1.0="v\<^sub>1"]}
  1212  @{thm [mode=IfThen] PosOrd_Posix[where ?v1.0="v\<^sub>1"]}
  1133 *}
  1213 *}
  1134 
  1214 
  1135 
  1215