# HG changeset patch # User Christian Urban # Date 1457268511 0 # Node ID 022503caa18774fc154a9b6f81152158555e53c8 # Parent 15ef2af1a6f2c5405eb23385f40639d1052688d0 updated diff -r 15ef2af1a6f2 -r 022503caa187 thys/Paper/Paper.thy --- a/thys/Paper/Paper.thy Sat Mar 05 12:10:01 2016 +0000 +++ b/thys/Paper/Paper.thy Sun Mar 06 12:48:31 2016 +0000 @@ -13,10 +13,10 @@ notation (latex output) If ("(\<^raw:\textrm{>if\<^raw:}> (_)/ \<^raw:\textrm{>then\<^raw:}> (_)/ \<^raw:\textrm{>else\<^raw:}> (_))" 10) and - Cons ("_\<^raw:\mbox{$\,$}>::\<^raw:\mbox{$\,$}>_" [78,77] 73) and + Cons ("_\<^raw:\mbox{$\,$}>::\<^raw:\mbox{$\,$}>_" [75,75] 73) and - ZERO ("\<^bold>0" 80) and - ONE ("\<^bold>1" 80) and + ZERO ("\<^bold>0" 78) and + ONE ("\<^bold>1" 78) and CHAR ("_" [1000] 80) and ALT ("_ + _" [77,77] 78) and SEQ ("_ \ _" [77,77] 78) and @@ -24,20 +24,23 @@ val.Void ("'(')" 79) and val.Char ("Char _" [1000] 79) and - val.Left ("Left _" [1000] 78) and - val.Right ("Right _" [1000] 78) and - + val.Left ("Left _" [79] 78) and + val.Right ("Right _" [79] 78) and + val.Seq ("Seq _ _" [79,79] 78) and + val.Stars ("Stars _" [79] 78) and + L ("L'(_')" [10] 78) and der_syn ("_\\_" [79, 1000] 76) and ders_syn ("_\\_" [79, 1000] 76) and flat ("|_|" [75] 73) and Sequ ("_ @ _" [78,77] 63) and - injval ("inj _ _ _" [78,77,78] 77) and + injval ("inj _ _ _" [79,77,79] 76) and + mkeps ("mkeps _" [79] 76) and projval ("proj _ _ _" [1000,77,1000] 77) and length ("len _" [78] 73) and Prf ("\ _ : _" [75,75] 75) and - PMatch ("_ : _ \ _" [75,75,75] 75) + PMatch ("'(_, _') \ _" [63,75,75] 75) (* and ValOrd ("_ \\<^bsub>_\<^esub> _" [78,77,77] 73) *) definition @@ -64,7 +67,8 @@ and easily definable and reasoned about in theorem provers---the definitions just consist of inductive datatypes and simple recursive functions. A completely formalised correctness proof of this matcher in for example HOL4 -has been mentioned in~\cite{Owens2008}. +has been mentioned in~\cite{Owens2008}. Another one in Isabelle/HOL is +in \cite{Krauss2011}. One limitation of Brzozowski's matcher is that it only generates a YES/NO answer for whether a string is being matched by a regular expression. @@ -86,25 +90,25 @@ observations that, without evidence that it is transitive, it cannot be called an ``order relation'', and that the relation is called a ``total order'' despite being evidently not total\footnote{The relation @{text -"\\<^bsub>r\<^esub>"} defined in \cite{Sulzmann2014} is a relation on the values for -the regular expression @{term r}; but it only holds between @{term v} and -@{term "v'"} in cases where @{term v} and @{term "v'"} have the same -flattening (underlying string). So a counterexample to totality is given by -taking two values @{term v} and @{term "v'"} for @{term r} that have -different flattenings. A different relation @{text "\\<^bsub>r,s\<^esub>"} -on the set of values for @{term r} with flattening @{term s} is definable by -the same approach, and is indeed total; but that is not what Proposition 1 -of \cite{Sulzmann2014} does.}, we identify problems with this approach (of -which some of the proofs are not published in \cite{Sulzmann2014}); perhaps -more importantly, we give a simple inductive (and algorithm-independent) -definition of what we call being a {\em POSIX value} for a regular -expression @{term r} and a string @{term s}; we show that the algorithm -computes such a value and that such a value is unique. Proofs are both done -by hand and checked in Isabelle/HOL. 
The experience of doing our proofs has -been that this mechanical checking was absolutely essential: this subject -area has hidden snares. This was also noted by Kuklewitz \cite{Kuklewicz} -who found that nearly all POSIX matching implementations are ``buggy'' -\cite[Page 203]{Sulzmann2014}. +"\\<^bsub>r\<^esub>"} defined in \cite{Sulzmann2014} is a relation on the +values for the regular expression @{term r}; but it only holds between +@{term v} and @{term "v'"} in cases where @{term v} and @{term "v'"} have +the same flattening (underlying string). So a counterexample to totality is +given by taking two values @{term v} and @{term "v'"} for @{term r} that +have different flattenings (see Section~\ref{posixsec}). A different +relation @{text "\\<^bsub>r,s\<^esub>"} on the set of values for @{term r} +with flattening @{term s} is definable by the same approach, and is indeed +total; but that is not what Proposition 1 of \cite{Sulzmann2014} does.}, we +identify problems with this approach (of which some of the proofs are not +published in \cite{Sulzmann2014}); perhaps more importantly, we give a +simple inductive (and algorithm-independent) definition of what we call +being a {\em POSIX value} for a regular expression @{term r} and a string +@{term s}; we show that the algorithm computes such a value and that such a +value is unique. Proofs are both done by hand and checked in Isabelle/HOL. +The experience of doing our proofs has been that this mechanical checking +was absolutely essential: this subject area has hidden snares. This was also +noted by Kuklewicz \cite{Kuklewicz} who found that nearly all POSIX matching +implementations are ``buggy'' \cite[Page 203]{Sulzmann2014}. If a regular expression matches a string, then in general there is more than one way of how the string is matched. There are two commonly used @@ -142,7 +146,7 @@ \noindent Consider for example @{text "r\<^bsub>key\<^esub>"} recognising keywords such as @{text "if"}, @{text "then"} and so on; and @{text "r\<^bsub>id\<^esub>"} recognising -identifiers (a single character followed by characters or numbers). Then we +identifiers (say, a single character followed by characters or numbers). Then we can form the regular expression @{text "(r\<^bsub>key\<^esub> + r\<^bsub>id\<^esub>)\<^sup>\"} and use POSIX matching to tokenise strings, say @{text "iffoo"} and @{text "if"}. In the first case we obtain by the longest match rule a single identifier token, @@ -150,10 +154,10 @@ priority a keyword token, not an identifier token---even if @{text "r\<^bsub>id\<^esub>"} matches also.\bigskip -\noindent {\bf Contributions:} We have implemented in Isabelle/HOL the +\noindent {\bf Contributions:} (NOT DONE YET) We have implemented in Isabelle/HOL the derivative-based regular expression matching algorithm as described by Sulzmann and Lu \cite{Sulzmann2014}. We have proved the correctness of this -algorithms according to our specification what a POSIX value is. The +algorithm according to our specification of what a POSIX value is. The informal correctness proof given in \cite{Sulzmann2014} is in final form\footnote{} and to us contains unfillable gaps. Our specification of a POSIX value consists of a simple inductive definition that given a string @@ -183,7 +187,8 @@ of just a single character @{term c} is written @{term "[c]"}. By using the type @{type char} for characters we have a supply of finitely many characters roughly corresponding to the ASCII character set. 
Regular -expressions are defined as usual as the following inductive datatype: +expressions are defined as usual as the elements of the following inductive +datatype: \begin{center} @{text "r :="} @@ -198,7 +203,7 @@ \noindent where @{const ZERO} stands for the regular expression that does not match any string, @{const ONE} for the regular expression that matches only the empty string and @{term c} for matching a character literal. The - language of a regular expression is again defined as usual by the + language of a regular expression is also defined as usual by the recursive function @{term L} with the clauses: \begin{center} @@ -212,16 +217,16 @@ \end{tabular} \end{center} - \noindent In the fourth clause we use @{term "DUMMY ;; DUMMY"} for the - concatenation of two languages (it is also list-append for strings). We - use the star-notation for regular expressions and languages (in the last - clause above). The star on languages is defined inductively by two - clauses: @{text "(i)"} for the empty string being in the star of a + \noindent In the fourth clause we use the operation @{term "DUMMY ;; + DUMMY"} for the concatenation of two languages (it is also list-append for + strings). We use the star-notation for regular expressions and languages + (in the last clause above). The star on languages is defined inductively + by two clauses: @{text "(i)"} for the empty string being in the star of a language and @{text "(ii)"} if @{term "s\<^sub>1"} is in a language and @{term "s\<^sub>2"} in the star of this language, then also @{term "s\<^sub>1 @ s\<^sub>2"} is in the star of this language. It will also be convenient to use the following - notion of a \emph{semantic derivative} (or left quotient) of a language, - say @{text A}, defined as: + notion of a \emph{semantic derivative} (or \emph{left quotient}) of a + language, say @{text A}, defined as: \begin{center} \begin{tabular}{lcl} @@ -231,7 +236,7 @@ \noindent For semantic derivatives we have the following equations (for example - \cite{Krauss2011}): + mechanically proved in \cite{Krauss2011}): \begin{equation}\label{SemDer} \begin{array}{lcl} @@ -285,8 +290,8 @@ \begin{proposition}\mbox{}\\ \begin{tabular}{ll} @{text "(1)"} & @{thm (lhs) nullable_correctness} if and only if - @{thm (rhs) nullable_correctness} \\ - @{text "(2)"} & @{thm[mode=IfThen] der_correctness} + @{thm (rhs) nullable_correctness}, and \\ + @{text "(2)"} & @{thm[mode=IfThen] der_correctness}. \end{tabular} \end{proposition} @@ -297,22 +302,24 @@ @{thm match_def} \end{center} - \noindent gives a positive answer if and only if @{term "s \ L r"}. While - the matcher above calculates a provably correct a YES/NO answer for - whether a regular expression matches a string, the novel idea of Sulzmann - and Lu \cite{Sulzmann2014} is to append another phase to calculate a - value. We will explain the details next. + \noindent gives a positive answer if and only if @{term "s \ L r"}. + Consequently, this regular expression matching algorithm satisfies the + usual specification. While the matcher above calculates a provably correct + YES/NO answer for whether a regular expression matches a string, the + novel idea of Sulzmann and Lu \cite{Sulzmann2014} is to append another + phase to this algorithm in order to calculate a lexical value. We will + explain the details next. 
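[Illustration, not part of the patch or the Isabelle formalisation: since the matcher above consists only of an inductive datatype and a few recursive functions, it can be transcribed almost verbatim into a functional language. The following Scala sketch merely mirrors the notation used above (ZERO, ONE, CHAR, ALT, SEQ, STAR, nullable, der, ders); it is an assumption-laden sketch, not the authors' code.]

  // Regular expressions, mirroring the datatype r := ZERO | ONE | c | r+r | r.r | r*
  abstract class Rexp
  case object ZERO extends Rexp
  case object ONE extends Rexp
  case class CHAR(c: Char) extends Rexp
  case class ALT(r1: Rexp, r2: Rexp) extends Rexp
  case class SEQ(r1: Rexp, r2: Rexp) extends Rexp
  case class STAR(r: Rexp) extends Rexp

  // nullable(r) is true iff r can match the empty string
  def nullable(r: Rexp): Boolean = r match {
    case ZERO => false
    case ONE => true
    case CHAR(_) => false
    case ALT(r1, r2) => nullable(r1) || nullable(r2)
    case SEQ(r1, r2) => nullable(r1) && nullable(r2)
    case STAR(_) => true
  }

  // der(c, r) is the Brzozowski derivative of r with respect to the character c
  def der(c: Char, r: Rexp): Rexp = r match {
    case ZERO => ZERO
    case ONE => ZERO
    case CHAR(d) => if (c == d) ONE else ZERO
    case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))
    case SEQ(r1, r2) =>
      if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
      else SEQ(der(c, r1), r2)
    case STAR(r1) => SEQ(der(c, r1), STAR(r1))
  }

  // ders extends der to strings; matches is the YES/NO matcher described above
  def ders(s: List[Char], r: Rexp): Rexp = s match {
    case Nil => r
    case c :: cs => ders(cs, der(c, r))
  }

  def matches(r: Rexp, s: String): Boolean = nullable(ders(s.toList, r))

For example, matches(SEQ(CHAR('a'), STAR(CHAR('b'))), "abb") evaluates to true, while matches(SEQ(CHAR('a'), STAR(CHAR('b'))), "ba") evaluates to false.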
*} -section {* POSIX Regular Expression Matching *} +section {* POSIX Regular Expression Matching\label{posixsec} *} text {* The clever idea in \cite{Sulzmann2014} is to introduce values for encoding \emph{how} a regular expression matches a string and then define a function on values that mirrors (but inverts) the construction of the derivative on -regular expressions. Values are defined as the inductive datatype +regular expressions. \emph{Values} are defined as the inductive datatype \begin{center} @{text "v :="} @@ -324,9 +331,11 @@ @{term "Stars vs"} \end{center} - \noindent where we use @{term vs} standing for a list of values. The string - underlying a values can be calculated by the @{const flat} function, written - @{term "flat DUMMY"} and defined as: + \noindent where we use @{term vs} standing for a list of values. (This is + similar to the approach taken by Frisch and Cardelli for GREEDY matching + \cite{Frisch2014}, and Sulzmann and Lu \cite{Sulzmann2014} for POSIX matching.) + The string underlying a value can be calculated by the @{const flat} + function, written @{term "flat DUMMY"} and defined as: \begin{center} \begin{tabular}{lcl} @@ -345,12 +354,12 @@ \begin{center} \begin{tabular}{c} - @{thm[mode=Rule] Prf.intros(1)[of "v\<^sub>1" "r\<^sub>1" "v\<^sub>2" "r\<^sub>2"]}\medskip\\ + @{thm[mode=Axiom] Prf.intros(4)} \qquad + @{thm[mode=Axiom] Prf.intros(5)[of "c"]}\medskip\\ @{thm[mode=Rule] Prf.intros(2)[of "v\<^sub>1" "r\<^sub>1" "r\<^sub>2"]} \qquad @{thm[mode=Rule] Prf.intros(3)[of "v\<^sub>2" "r\<^sub>1" "r\<^sub>2"]}\medskip\\ - @{thm[mode=Axiom] Prf.intros(4)} \qquad - @{thm[mode=Axiom] Prf.intros(5)[of "c"]}\medskip\\ - @{thm[mode=Axiom] Prf.intros(6)[of "r"]} \qquad + @{thm[mode=Rule] Prf.intros(1)[of "v\<^sub>1" "r\<^sub>1" "v\<^sub>2" "r\<^sub>2"]}\medskip\\ + @{thm[mode=Axiom] Prf.intros(6)[of "r"]} \qquad @{thm[mode=Rule] Prf.intros(7)[of "v" "r" "vs"]}\medskip\\ \end{tabular} \end{center} @@ -358,22 +367,30 @@ \noindent Note that no values are associated with the regular expression @{term ZERO}, and that the only value associated with the regular expression @{term ONE} is @{term Void}, pronounced (if one must) as {\em - ``Void''}. It is routine to stablish how values `inhabitated' by a regular + ``Void''}. It is routine to establish how values ``inhabiting'' a regular expression correspond to the language of a regular expression, namely \begin{proposition} @{thm L_flat_Prf} \end{proposition} - Graphically the algorithm by Sulzmann \& Lu can be illustrated by the - picture in Figure~\ref{Sulz} where the path from the left to the right - involving $der/nullable$ is the first phase of the algorithm and - $mkeps/inj$, the path from right to left, the second phase. This picture - shows the steps required when a regular expression, say $r_1$, matches the - string $abc$. We first build the three derivatives (according to $a$, $b$ - and $c$). We then use $nullable$ to find out whether the resulting regular - expression can match the empty string. If yes, we call the function - $mkeps$. + In general there is more than one value associated with a regular + expression. In case of POSIX matching the problem is to calculate the + unique value that satisfies the (informal) POSIX constraints from the + Introduction. 
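[Illustration, continuing the Scala sketch above and not part of the patch: values and the @{const flat} function can be transcribed in the same way. The constructor names Chr and Sequ stand for the paper's Char and Seq value constructors; they are renamed here only to avoid clashes with Scala's built-in Char type and with the SEQ regular expression constructor.]

  // Values, recording how a regular expression matches a string
  abstract class Val
  case object Void extends Val
  case class Chr(c: Char) extends Val            // the paper's Char value
  case class Sequ(v1: Val, v2: Val) extends Val  // the paper's Seq value
  case class Left(v: Val) extends Val
  case class Right(v: Val) extends Val
  case class Stars(vs: List[Val]) extends Val

  // flat returns the string (list of characters) underlying a value
  def flat(v: Val): List[Char] = v match {
    case Void => Nil
    case Chr(c) => List(c)
    case Left(v1) => flat(v1)
    case Right(v1) => flat(v1)
    case Sequ(v1, v2) => flat(v1) ++ flat(v2)
    case Stars(vs) => vs.flatMap(flat)
  }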
Graphically the regular expression matching algorithm by + Sulzmann and Lu can be illustrated by the picture in Figure~\ref{Sulz} + where the path from the left to the right involving @{const der}/@{const + nullable} is the first phase of the algorithm (calculating successive + \Brz's derivatives) and @{const mkeps}/@{text inj}, the path from right to + left, the second phase. This picture shows the steps required when a + regular expression, say @{text "r\<^sub>1"}, matches the string @{term + "[a,b,c]"}. We first build the three derivatives (according to @{term a}, + @{term b} and @{term c}). We then use @{const nullable} to find out + whether the resulting derivative regular expression @{term "r\<^sub>4"} + can match the empty string. If yes, we call the function @{const mkeps} + that produces a value @{term "v\<^sub>4"} for how @{term "r\<^sub>4"} can + match the empty string (taking into account the POSIX constraints in case + there are several ways). This function is defined by the clauses: \begin{figure}[t] \begin{center} @@ -409,15 +426,7 @@ \label{Sulz}} \end{figure} - NOT DONE YET - - We begin with the case of a nullable regular expression: from the - nullability we need to construct a value that witnesses the nullability. - The @{const mkeps} function (from \cite{Sulzmann2014}) is a partial (but - unambiguous) function from regular expressions to values, total on exactly - the set of nullable regular expressions. - - \begin{center} + \begin{center} \begin{tabular}{lcl} @{thm (lhs) mkeps.simps(1)} & $\dn$ & @{thm (rhs) mkeps.simps(1)}\\ @{thm (lhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]}\\ @@ -426,6 +435,70 @@ \end{tabular} \end{center} + \noindent Note that this function needs only to be partially defined, + namely only for regular expressions that are nullable. In case @{const + nullable} fails, the string @{term "[a,b,c]"} cannot be matched by @{term + "r\<^sub>1"} and an error is raised. Note also how this function makes + some subtle choices leading to a POSIX value: for example if the + alternative, say @{term "ALT r\<^sub>1 r\<^sub>2"}, can match the empty + string and furthermore @{term "r\<^sub>1"} can match the empty string, + then we return a @{const Left}-value. The @{const Right}-value will only + be returned if @{term "r\<^sub>1"} is not nullable. + + The most interesting novelty from Sulzmann and Lu \cite{Sulzmann2014} is + the construction of a value for how @{term "r\<^sub>1"} can match the string + @{term "[a,b,c]"} from the value for how the last derivative, @{term + "r\<^sub>4"} in Fig~\ref{Sulz}, can match the empty string. Sulzmann and + Lu achieve this by stepwise ``injecting back'' the characters into the + values, thus inverting the operation of building derivatives on the level + of values. The corresponding function, called @{term inj}, takes three + arguments, a regular expression, a character and a value. For example in + the first @{term inj}-step in Fig~\ref{Sulz} these are the regular expression @{term + "r\<^sub>3"}, the character @{term c} from the last derivative step and + @{term "v\<^sub>4"}, which is the value corresponding to the derivative + regular expression @{term "r\<^sub>4"}. The result is the new value @{term + "v\<^sub>3"}. The final result of the algorithm is the value @{term + "v\<^sub>1"} corresponding to the input regular expression. The @{term + inj} function is defined by recursion on the regular expression and by analysing + the shape of values. 
+ + \begin{center} + \begin{tabular}{lcl} + @{thm (lhs) injval.simps(1)} & $\dn$ & @{thm (rhs) injval.simps(1)}\\ + @{thm (lhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]} & $\dn$ & + @{thm (rhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]}\\ + @{thm (lhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$ & + @{thm (rhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\ + @{thm (lhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$ + & @{thm (rhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\ + @{thm (lhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$ + & @{thm (rhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\ + @{thm (lhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$ + & @{thm (rhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\ + @{thm (lhs) injval.simps(7)[of "r" "c" "v" "vs"]} & $\dn$ + & @{thm (rhs) injval.simps(7)[of "r" "c" "v" "vs"]}\\ + \end{tabular} + \end{center} + + \noindent To better understand what is going on in this definition it + might be instructive to look first at the three sequence cases (clauses + 4--6). Recall the corresponding clause of the derivative: + + \begin{center} + @{thm (lhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]} $\dn$ @{thm (rhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]} + \end{center} + + \noindent + NOT DONE YET + + Therefore there are, for example, three + cases for sequence regular expressions (for all possible shapes of the + value). + + Again the virtue of this algorithm is that it can be + implemented with ease in a functional programming language and + also in Isabelle/HOL. + The well-known idea of POSIX lexing is informally defined in (for example) \cite{posix}; as correctly argued in \cite{Sulzmann2014}, this needs formal specification. The rough idea is that, in contrast to the so-called GREEDY @@ -493,23 +566,7 @@ \noindent The @{text inj} function - \begin{center} - \begin{tabular}{lcl} - @{thm (lhs) injval.simps(1)} & $\dn$ & @{thm (rhs) injval.simps(1)}\\ - @{thm (lhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]} & $\dn$ & - @{thm (rhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]}\\ - @{thm (lhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$ & - @{thm (rhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\ - @{thm (lhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$ - & @{thm (rhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\ - @{thm (lhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$ - & @{thm (rhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\ - @{thm (lhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$ - & @{thm (rhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\ - @{thm (lhs) injval.simps(7)[of "r" "c" "v" "vs"]} & $\dn$ - & @{thm (rhs) injval.simps(7)[of "r" "c" "v" "vs"]}\\ - \end{tabular} - \end{center} + \noindent The inhabitation relation: diff -r 15ef2af1a6f2 -r 022503caa187 thys/paper.pdf Binary file thys/paper.pdf has changed
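[Illustration, continuing the Scala sketch from above and not part of the patch: the second phase of the algorithm can be transcribed from the @{const mkeps} and @{term inj} clauses shown earlier. The clause order of inj below mirrors injval.simps(1)-(7); the top-level driver, here called lex, is a hypothetical name used only for this sketch and combines the derivative phase with the injection phase.]

  // mkeps: a value witnessing how a nullable regular expression matches ""
  def mkeps(r: Rexp): Val = r match {
    case ONE => Void
    case ALT(r1, r2) => if (nullable(r1)) Left(mkeps(r1)) else Right(mkeps(r2))
    case SEQ(r1, r2) => Sequ(mkeps(r1), mkeps(r2))
    case STAR(_) => Stars(Nil)
  }

  // inj(r, c, v): injects the character c back into the value v for der(c, r),
  // producing a value for r (cases correspond to injval.simps(1)-(7))
  def inj(r: Rexp, c: Char, v: Val): Val = (r, v) match {
    case (CHAR(d), Void) => Chr(d)
    case (ALT(r1, _), Left(v1)) => Left(inj(r1, c, v1))
    case (ALT(_, r2), Right(v2)) => Right(inj(r2, c, v2))
    case (SEQ(r1, _), Sequ(v1, v2)) => Sequ(inj(r1, c, v1), v2)
    case (SEQ(r1, _), Left(Sequ(v1, v2))) => Sequ(inj(r1, c, v1), v2)
    case (SEQ(r1, r2), Right(v2)) => Sequ(mkeps(r1), inj(r2, c, v2))
    case (STAR(r1), Sequ(v1, Stars(vs))) => Stars(inj(r1, c, v1) :: vs)
  }

  // the two phases combined: derivatives forwards, injections backwards
  def lex(r: Rexp, s: List[Char]): Val = s match {
    case Nil => if (nullable(r)) mkeps(r) else sys.error("string not matched")
    case c :: cs => inj(r, c, lex(der(c, r), cs))
  }

For instance, lex(SEQ(CHAR('a'), STAR(CHAR('b'))), "ab".toList) returns Sequ(Chr('a'), Stars(List(Chr('b')))), whose flat is the matched string "ab".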