diff -r fff2e1b40dfc -r 32b222d77fa0 thys/Journal/Paper.thy --- a/thys/Journal/Paper.thy Wed Jul 19 14:55:46 2017 +0100 +++ b/thys/Journal/Paper.thy Fri Aug 11 20:29:01 2017 +0100 @@ -15,6 +15,11 @@ declare [[show_question_marks = false]] +syntax (latex output) + "_Collect" :: "pttrn => bool => 'a set" ("(1{_ \<^raw:\mbox{\boldmath$\mid$}> _})") + "_CollectIn" :: "pttrn => 'a set => bool => 'a set" ("(1{_ \ _ |e _})") + + abbreviation "der_syn r c \ der c r" @@ -26,12 +31,14 @@ "nprec v1 v2 \ \(v1 :\val v2)" + + notation (latex output) If ("(\<^raw:\textrm{>if\<^raw:}> (_)/ \<^raw:\textrm{>then\<^raw:}> (_)/ \<^raw:\textrm{>else\<^raw:}> (_))" 10) and Cons ("_\<^raw:\mbox{$\,$}>::\<^raw:\mbox{$\,$}>_" [75,73] 73) and ZERO ("\<^bold>0" 78) and - ONE ("\<^bold>1" 78) and + ONE ("\<^bold>1" 1000) and CHAR ("_" [1000] 80) and ALT ("_ + _" [77,77] 78) and SEQ ("_ \ _" [77,77] 78) and @@ -53,8 +60,10 @@ mkeps ("mkeps _" [79] 76) and length ("len _" [73] 73) and intlen ("len _" [73] 73) and + set ("_" [73] 73) and - Prf ("_ : _" [75,75] 75) and + Prf ("_ : _" [75,75] 75) and + CPrf ("_ \<^raw:\mbox{\textbf{\textlengthmark}}> _" [75,75] 75) and Posix ("'(_, _') \ _" [63,75,75] 75) and lexer ("lexer _ _" [78,78] 77) and @@ -84,6 +93,13 @@ definition "match r s \ nullable (ders s r)" + +lemma CV_STAR_ONE_empty: + shows "CV (STAR ONE) [] = {Stars []}" +by(auto simp add: CV_def elim: CPrf.cases intro: CPrf.intros) + + + (* comments not implemented @@ -94,6 +110,8 @@ (*>*) + + section {* Introduction *} @@ -153,7 +171,7 @@ not match an empty string unless this is the only match for the repetition.\smallskip \item[$\bullet$] \emph{Empty String Rule:} An empty string shall be considered to -be longer than no match at all. +be longer than no match at all.\marginpar{Explain its purpose} \end{itemize} \noindent Consider for example a regular expression @{text "r\<^bsub>key\<^esub>"} for recognising keywords @@ -165,7 +183,9 @@ by the Longest Match Rule a single identifier token, not a keyword followed by an identifier. For @{text "if"} we obtain by the Priority Rule a keyword token, not an identifier token---even if @{text "r\<^bsub>id\<^esub>"} -matches also. +matches also. By the Star Rule we know @{text "(r\<^bsub>key\<^esub> + r\<^bsub>id\<^esub>)\<^sup>\"} matches @{text "iffoo"}, respectively @{text "if"}, in exactly one +`iteration' of the star. + One limitation of Brzozowski's matcher is that it only generates a YES/NO answer for whether a string is being matched by a regular @@ -185,7 +205,8 @@ algorithm. This proof idea is inspired by work of Frisch and Cardelli \cite{Frisch2004} on a GREEDY regular expression matching algorithm. However, we were not able to establish transitivity and -totality for the ``order relation'' by Sulzmann and Lu. ??In Section +totality for the ``order relation'' by Sulzmann and Lu. \marginpar{We probably drop this section} +??In Section \ref{argu} we identify some inherent problems with their approach (of which some of the proofs are not published in \cite{Sulzmann2014}); perhaps more importantly, we give a simple inductive (and @@ -230,6 +251,8 @@ informal proof contains gaps, and possible fixes are not fully worked out.} Our specification of a POSIX value consists of a simple inductive definition that given a string and a regular expression uniquely determines this value. +We also show that our definition is equivalent to an ordering +of values based on positions by Okui and Suzuki \cite{OkuiSuzuki2013}. Derivatives as calculated by Brzozowski's method are usually more complex regular expressions than the initial one; various optimisations are possible. We prove the correctness when simplifications of @{term "ALT ZERO @@ -267,13 +290,10 @@ recursive function @{term L} with the six clauses: \begin{center} - \begin{tabular}{l@ {\hspace{3mm}}rcl} + \begin{tabular}{l@ {\hspace{4mm}}rcl} (1) & @{thm (lhs) L.simps(1)} & $\dn$ & @{thm (rhs) L.simps(1)}\\ (2) & @{thm (lhs) L.simps(2)} & $\dn$ & @{thm (rhs) L.simps(2)}\\ (3) & @{thm (lhs) L.simps(3)} & $\dn$ & @{thm (rhs) L.simps(3)}\\ - \end{tabular} - \hspace{14mm} - \begin{tabular}{l@ {\hspace{3mm}}rcl} (4) & @{thm (lhs) L.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) L.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\ (5) & @{thm (lhs) L.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) L.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\ (6) & @{thm (lhs) L.simps(6)} & $\dn$ & @{thm (rhs) L.simps(6)}\\ @@ -325,14 +345,12 @@ @{thm (lhs) nullable.simps(3)} & $\dn$ & @{thm (rhs) nullable.simps(3)}\\ @{thm (lhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\ @{thm (lhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\ - @{thm (lhs) nullable.simps(6)} & $\dn$ & @{thm (rhs) nullable.simps(6)}\medskip\\ - - %\end{tabular} - %\end{center} + @{thm (lhs) nullable.simps(6)} & $\dn$ & @{thm (rhs) nullable.simps(6)}%\medskip\\ + \end{tabular} + \end{center} - %\begin{center} - %\begin{tabular}{lcl} - + \begin{center} + \begin{tabular}{lcl} @{thm (lhs) der.simps(1)} & $\dn$ & @{thm (rhs) der.simps(1)}\\ @{thm (lhs) der.simps(2)} & $\dn$ & @{thm (rhs) der.simps(2)}\\ @{thm (lhs) der.simps(3)} & $\dn$ & @{thm (rhs) der.simps(3)}\\ @@ -384,7 +402,7 @@ text {* - The clever idea by Sulzmann and Lu \cite{Sulzmann2014} is to define + The clever idea by Sulzmann and Lu \cite{Sulzmann2014} is to use values for encoding \emph{how} a regular expression matches a string and then define a function on values that mirrors (but inverts) the construction of the derivative on regular expressions. \emph{Values} @@ -422,7 +440,7 @@ \end{center} \noindent Sulzmann and Lu also define inductively an inhabitation relation - that associates values to regular expressions: + that associates values to regular expressions \begin{center} \begin{tabular}{c} @@ -436,7 +454,10 @@ \end{tabular} \end{center} - \noindent Note that no values are associated with the regular expression + \noindent + where in the clause for @{const "Stars"} we use the notation @{term "v \ set vs"} + for indicating that @{text v} is a member in the list @{text vs}. + Note that no values are associated with the regular expression @{term ZERO}, and that the only value associated with the regular expression @{term ONE} is @{term Void}, pronounced (if one must) as @{text "Void"}. It is routine to establish how values ``inhabiting'' a regular @@ -446,10 +467,58 @@ @{thm L_flat_Prf} \end{proposition} - In general there is more than one value associated with a regular - expression. In case of POSIX matching the problem is to calculate the - unique value that satisfies the (informal) POSIX rules from the - Introduction. Graphically the POSIX value calculation algorithm by + \noindent + Given a regular expression @{text r} and a string @{text s}, we can define the + set of all \emph{Lexical Values} inhabited by @{text r} with the underlying string + being @{text s} by + + \begin{center} + @{thm LV_def} + \end{center} + + \noindent However, later on it will sometimes be necessary to + restrict the set of lexical values to a subset called + \emph{Canonical Values}. The idea of canonical values is that they + satisfy the Star Rule (see Introduction) where the $^\star$ does not + match the empty string unless this is the only match for the + repetition. One way to define canonical values formally is to use a + stronger inhabitation relation, written @{term "\ DUMMY : DUMMY"}, which has the same rules as @{term + "\ DUMMY : DUMMY"} shown above, except that the rule for + @{term Stars} has + the additional side-condition of flattened values not being the + empty string, namely + + \begin{center} + @{thm [mode=Rule] CPrf.intros(6)} + \end{center} + + \noindent + With this we can define + + \begin{center} + @{thm CV_def} + \end{center} + + \noindent + Clearly we have @{thm LV_CV_subset}. + The main point of canonical values is that for every regular expression @{text r} and every + string @{text s}, the set @{term "CV r s"} is finite. + + \begin{lemma} + @{thm CV_finite} + \end{lemma} + + \noindent This finiteness property does not generally hold for lexical values where + for example @{term "LV (STAR ONE) []"} contains infinitely many + values, but @{thm CV_STAR_ONE_empty}. However, if a regular + expression @{text r} matches a string @{text s}, then in general the + set @{term "CV r s"} is not just a + singleton set. In case of POSIX matching the problem is to + calculate the unique value that satisfies the (informal) POSIX rules + from the Introduction. It will turn out that this POSIX value is in fact a + canonical value. + + Graphically the POSIX value calculation algorithm by Sulzmann and Lu can be illustrated by the picture in Figure~\ref{Sulz} where the path from the left to the right involving @{term derivatives}/@{const nullable} is the first phase of the algorithm (calculating successive @@ -614,7 +683,7 @@ \end{proof} Having defined the @{const mkeps} and @{text inj} function we can extend - \Brz's matcher so that a [lexical] value is constructed (assuming the + \Brz's matcher so that a value is constructed (assuming the regular expression matches the string). The clauses of the Sulzmann and Lu lexer are \begin{center} @@ -633,19 +702,25 @@ functional programming language and also in Isabelle/HOL. In the remaining part of this section we prove that this algorithm is correct. - The well-known idea of POSIX matching is informally defined by the longest - match and priority rule (see Introduction); as correctly argued in \cite{Sulzmann2014}, this + The well-known idea of POSIX matching is informally defined by some + rules such as the longest match and priority rule (see + Introduction); as correctly argued in \cite{Sulzmann2014}, this needs formal specification. Sulzmann and Lu define an ``ordering - relation'' between values and argue - that there is a maximum value, as given by the derivative-based algorithm. - In contrast, we shall introduce a simple inductive definition that - specifies directly what a \emph{POSIX value} is, incorporating the - POSIX-specific choices into the side-conditions of our rules. Our - definition is inspired by the matching relation given by Vansummeren - \cite{Vansummeren2006}. The relation we define is ternary and written as - \mbox{@{term "s \ r \ v"}}, relating strings, regular expressions and - values. + relation'' between values and argue that there is a maximum value, + as given by the derivative-based algorithm. In contrast, we shall + introduce a simple inductive definition that specifies directly what + a \emph{POSIX value} is, incorporating the POSIX-specific choices + into the side-conditions of our rules. Our definition is inspired by + the matching relation given by Vansummeren + \cite{Vansummeren2006}. The relation we define is ternary and + written as \mbox{@{term "s \ r \ v"}}, relating + strings, regular expressions and values; the inductive rules are given in + Figure~\ref{POSIXrules}. + We can prove that given a string @{term s} and regular expression @{term + r}, the POSIX value @{term v} is uniquely determined by @{term "s \ r \ v"}. + % + \begin{figure}[t] \begin{center} \begin{tabular}{c} @{thm[mode=Axiom] Posix.intros(1)}@{text "P"}@{term "ONE"} \qquad @@ -668,10 +743,10 @@ {@{thm (concl) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}$@{text "P\"} \end{tabular} \end{center} + \caption{Our inductive definition of POSIX values.}\label{POSIXrules} + \end{figure} - \noindent - We can prove that given a string @{term s} and regular expression @{term - r}, the POSIX value @{term v} is uniquely determined by @{term "s \ r \ v"}. + \begin{theorem}\mbox{}\smallskip\\\label{posixdeterm} \begin{tabular}{ll} @@ -687,7 +762,7 @@ \end{proof} \noindent - We claim that our @{term "s \ r \ v"} relation captures the idea behind the two + We claim that our @{term "s \ r \ v"} relation captures the idea behind the four informal POSIX rules shown in the Introduction: Consider for example the rules @{text "P+L"} and @{text "P+R"} where the POSIX value for a string and an alternative regular expression, that is @{term "(s, ALT r\<^sub>1 r\<^sub>2)"}, @@ -718,8 +793,19 @@ @{term v} cannot be flattened to the empty string. In effect, we require that in each ``iteration'' of the star, some non-empty substring needs to be ``chipped'' away; only in case of the empty string we accept @{term - "Stars []"} as the POSIX value. + "Stars []"} as the POSIX value. Indeed we can show that our POSIX value + is a canonical value which excludes those @{text Stars} containing values + that flatten to the empty string. + \begin{lemma} + @{thm [mode=IfThen] Posix_CV} + \end{lemma} + + \begin{proof} + By routine induction on @{thm (prem 1) Posix_CV}.\qed + \end{proof} + + \noindent Next is the lemma that shows the function @{term "mkeps"} calculates the POSIX value for the empty string and a nullable regular expression. @@ -1117,17 +1203,11 @@ text {* - Theorems: - - @{thm [mode=IfThen] Posix_CPT} + Theorem 1: @{thm [mode=IfThen] Posix_PosOrd[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]} - Corrollary from the last one - - @{thm [mode=IfThen] Posix_PosOrd_stronger[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]} - - Theorem + Theorem 2: @{thm [mode=IfThen] PosOrd_Posix[where ?v1.0="v\<^sub>1"]} *}