thys/Paper/Paper.thy
changeset 121 4c85af262ee7
parent 120 d74bfa11802c
child 122 7c6c907660d8
equal deleted inserted replaced
120:d74bfa11802c 121:4c85af262ee7
    81 value, in an algorithm-independent fashion, and to show that Sulzmann and
    81 value, in an algorithm-independent fashion, and to show that Sulzmann and
    82 Lu's derivative-based algorithm does indeed calculate a value that is
    82 Lu's derivative-based algorithm does indeed calculate a value that is
    83 correct according to the specification.
    83 correct according to the specification.
    84 
    84 
    85 The answer given by Sulzmann and Lu \cite{Sulzmann2014} is to define a
    85 The answer given by Sulzmann and Lu \cite{Sulzmann2014} is to define a
    86 relation (called an ``Order Relation'') on the set of values of @{term r},
    86 relation (called an ``order relation'') on the set of values of @{term r},
    87 and to show that (once a string to be matched is chosen) there is a maximum
    87 and to show that (once a string to be matched is chosen) there is a maximum
    88 element and that it is computed by their derivative-based algorithm. This
    88 element and that it is computed by their derivative-based algorithm. This
    89 proof idea is inspired by work of Frisch and Cardelli \cite{Frisch2004} on a
    89 proof idea is inspired by work of Frisch and Cardelli \cite{Frisch2004} on a
    90 GREEDY regular expression matching algorithm. Beginning with our
    90 GREEDY regular expression matching algorithm. Beginning with our
    91 observations that, without evidence that it is transitive, it cannot be
    91 observations that, without evidence that it is transitive, it cannot be
   150 identifiers (say, a single character followed by characters or numbers).
   150 identifiers (say, a single character followed by characters or numbers).
   151 Then we can form the regular expression @{text "(r\<^bsub>key\<^esub> + r\<^bsub>id\<^esub>)\<^sup>\<star>"} and use
   151 Then we can form the regular expression @{text "(r\<^bsub>key\<^esub> + r\<^bsub>id\<^esub>)\<^sup>\<star>"} and use
   152 POSIX matching to tokenise strings, say @{text "iffoo"} and @{text "if"}.
   152 POSIX matching to tokenise strings, say @{text "iffoo"} and @{text "if"}.
   153 For @{text "iffoo"} we obtain by the longest match rule a single identifier
   153 For @{text "iffoo"} we obtain by the longest match rule a single identifier
   154 token, not a keyword followed by an identifier. For @{text "if"} we obtain by
   154 token, not a keyword followed by an identifier. For @{text "if"} we obtain by
   155 rule priority a keyword token, not an identifier token---even if @{text
   155 the priority rule a keyword token, not an identifier token---even if @{text
   156 "r\<^bsub>id\<^esub>"} matches also.\bigskip
   156 "r\<^bsub>id\<^esub>"} matches also.\bigskip
   157 
   157 
   158 \noindent {\bf Contributions:} (NOT DONE YET) We have implemented in
   158 \noindent {\bf Contributions:} (NOT DONE YET) We have implemented in
   159 Isabelle/HOL the derivative-based regular expression matching algorithm as
   159 Isabelle/HOL the derivative-based regular expression matching algorithm as
   160 described by Sulzmann and Lu \cite{Sulzmann2014}. We have proved the
   160 described by Sulzmann and Lu \cite{Sulzmann2014}. We have proved the
   211   only the empty string and @{term c} for matching a character literal. The
   211   only the empty string and @{term c} for matching a character literal. The
   212   language of a regular expression is also defined as usual by the
   212   language of a regular expression is also defined as usual by the
   213   recursive function @{term L} with the clauses:
   213   recursive function @{term L} with the clauses:
   214 
   214 
   215   \begin{center}
   215   \begin{center}
   216   \begin{tabular}{rcl}
   216   \begin{tabular}{l@ {\hspace{5mm}}rcl}
   217   @{thm (lhs) L.simps(1)} & $\dn$ & @{thm (rhs) L.simps(1)}\\
   217   (1) & @{thm (lhs) L.simps(1)} & $\dn$ & @{thm (rhs) L.simps(1)}\\
   218   @{thm (lhs) L.simps(2)} & $\dn$ & @{thm (rhs) L.simps(2)}\\
   218   (2) & @{thm (lhs) L.simps(2)} & $\dn$ & @{thm (rhs) L.simps(2)}\\
   219   @{thm (lhs) L.simps(3)} & $\dn$ & @{thm (rhs) L.simps(3)}\\
   219   (3) & @{thm (lhs) L.simps(3)} & $\dn$ & @{thm (rhs) L.simps(3)}\\
   220   @{thm (lhs) L.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) L.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
   220   (4) & @{thm (lhs) L.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) L.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
   221   @{thm (lhs) L.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) L.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
   221   (5) & @{thm (lhs) L.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) L.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
   222   @{thm (lhs) L.simps(6)} & $\dn$ & @{thm (rhs) L.simps(6)}\\
   222   (6) & @{thm (lhs) L.simps(6)} & $\dn$ & @{thm (rhs) L.simps(6)}\\
   223   \end{tabular}
   223   \end{tabular}
   224   \end{center}
   224   \end{center}
   225   
   225   
   226   \noindent In the fourth clause we use the operation @{term "DUMMY ;;
   226   \noindent In clause (4) we use the operation @{term "DUMMY ;;
   227   DUMMY"} for the concatenation of two languages (it is also list-append for
   227   DUMMY"} for the concatenation of two languages (it is also list-append for
   228   strings). We use the star-notation for regular expressions and for
   228   strings). We use the star-notation for regular expressions and for
   229   languages (in the last clause above). The star for languages is defined
   229   languages (in the last clause above). The star for languages is defined
   230   inductively by two clauses: @{text "(i)"} the empty string being in
   230   inductively by two clauses: @{text "(i)"} the empty string being in
   231   the star of a language and @{text "(ii)"} if @{term "s\<^sub>1"} is in a
   231   the star of a language and @{text "(ii)"} if @{term "s\<^sub>1"} is in a
   308   @{thm match_def}
   308   @{thm match_def}
   309   \end{center}
   309   \end{center}
   310 
   310 
   311   \noindent gives a positive answer if and only if @{term "s \<in> L r"}.
   311   \noindent gives a positive answer if and only if @{term "s \<in> L r"}.
   312   Consequently, this regular expression matching algorithm satisfies the
   312   Consequently, this regular expression matching algorithm satisfies the
   313   usual specification. While the matcher above calculates a provably correct
   313   usual specification for regular expression matching. While the matcher
   314   YES/NO answer for whether a regular expression matches a string, the novel
   314   above calculates a provably correct YES/NO answer for whether a regular
   315   idea of Sulzmann and Lu \cite{Sulzmann2014} is to append another phase to
   315   expression matches a string or not, the novel idea of Sulzmann and Lu
   316   this algorithm in order to calculate a [lexical] value. We will explain
   316   \cite{Sulzmann2014} is to append another phase to this algorithm in order
   317   the details next.
   317   to calculate a [lexical] value. We will explain the details next.
   318 
   318 
   319 *}
   319 *}
   320 
   320 
   321 section {* POSIX Regular Expression Matching\label{posixsec} *}
   321 section {* POSIX Regular Expression Matching\label{posixsec} *}
   322 
   322 
   336   @{term "Right v"} $\mid$
   336   @{term "Right v"} $\mid$
   337   @{term "Seq v\<^sub>1 v\<^sub>2"} $\mid$ 
   337   @{term "Seq v\<^sub>1 v\<^sub>2"} $\mid$ 
   338   @{term "Stars vs"} 
   338   @{term "Stars vs"} 
   339   \end{center}  
   339   \end{center}  
   340 
   340 
   341   \noindent where we use @{term vs} standing for a list of values. (This is
   341   \noindent where we use @{term vs} to stand for a list of values. (This is
   342   similar to the approach taken by Frisch and Cardelli for GREEDY matching
   342   similar to the approach taken by Frisch and Cardelli for GREEDY matching
   343   \cite{Frisch2004}, and Sulzmann and Lu \cite{Sulzmann2014} for POSIX matching).
   343   \cite{Frisch2004}, and Sulzmann and Lu \cite{Sulzmann2014} for POSIX
   344   The string underlying a value can be calculated by the @{const flat}
   344   matching). The string underlying a value can be calculated by the @{const
   345   function, written @{term "flat DUMMY"} and defined as:
   345   flat} function, written @{term "flat DUMMY"} and defined as:
   346 
   346 
   347   \begin{center}
   347   \begin{center}
   348   \begin{tabular}{lcl}
   348   \begin{tabular}{lcl}
   349   @{thm (lhs) flat.simps(1)} & $\dn$ & @{thm (rhs) flat.simps(1)}\\
   349   @{thm (lhs) flat.simps(1)} & $\dn$ & @{thm (rhs) flat.simps(1)}\\
   350   @{thm (lhs) flat.simps(2)} & $\dn$ & @{thm (rhs) flat.simps(2)}\\
   350   @{thm (lhs) flat.simps(2)} & $\dn$ & @{thm (rhs) flat.simps(2)}\\
   420 \node (v1) [left=of v2] {@{term "v\<^sub>1"}};
   420 \node (v1) [left=of v2] {@{term "v\<^sub>1"}};
   421 \draw[->,line width=1mm](v2)--(v1) node[below,midway] {@{text "inj r\<^sub>1 a"}};
   421 \draw[->,line width=1mm](v2)--(v1) node[below,midway] {@{text "inj r\<^sub>1 a"}};
   422 \draw (r4) node[anchor=north west] {\;\raisebox{-8mm}{@{term "mkeps"}}};
   422 \draw (r4) node[anchor=north west] {\;\raisebox{-8mm}{@{term "mkeps"}}};
   423 \end{tikzpicture}
   423 \end{tikzpicture}
   424 \end{center}
   424 \end{center}
   425 \caption{The two phases of the algorithm by Sulzmann \& Lu \cite{Sulzmann2014}
   425 \caption{The two phases of the algorithm by Sulzmann \& Lu \cite{Sulzmann2014},
   426 matching the string @{term "[a,b,c]"}. The first phase (the arrows from 
   426 matching the string @{term "[a,b,c]"}. The first phase (the arrows from 
   427 left to right) is \Brz's matcher building succesive derivatives. If at the 
   427 left to right) is \Brz's matcher building succesive derivatives. If the 
   428 last regular expression is @{term nullable}, then functions of the 
   428 last regular expression is @{term nullable}, then the functions of the 
   429 second phase are called: first @{term mkeps} calculates a value witnessing
   429 second phase are called (the top-down and right-to-left arrows): first 
       
   430 @{term mkeps} calculates a value witnessing
   430 how the empty string has been recognised by @{term "r\<^sub>4"}. After
   431 how the empty string has been recognised by @{term "r\<^sub>4"}. After
   431 that the function @{term inj} `injects back' the characters of the string into
   432 that the function @{term inj} `injects back' the characters of the string into
   432 the values (the arrows from right to left).
   433 the values.
   433 \label{Sulz}}
   434 \label{Sulz}}
   434 \end{figure} 
   435 \end{figure} 
   435 
   436 
   436   \begin{center}
   437   \begin{center}
   437   \begin{tabular}{lcl}
   438   \begin{tabular}{lcl}
   448   "r\<^sub>1"} and an error is raised instead. Note also how this function
   449   "r\<^sub>1"} and an error is raised instead. Note also how this function
   449   makes some subtle choices leading to a POSIX value: for example if an
   450   makes some subtle choices leading to a POSIX value: for example if an
   450   alternative regular expression, say @{term "ALT r\<^sub>1 r\<^sub>2"}, can
   451   alternative regular expression, say @{term "ALT r\<^sub>1 r\<^sub>2"}, can
   451   match the empty string and furthermore @{term "r\<^sub>1"} can match the
   452   match the empty string and furthermore @{term "r\<^sub>1"} can match the
   452   empty string, then we return a @{text Left}-value. The @{text
   453   empty string, then we return a @{text Left}-value. The @{text
   453   Right}-value will only be returned if @{term "r\<^sub>1"} is not nullable.
   454   Right}-value will only be returned if @{term "r\<^sub>1"} cannot match the empty
   454 
   455   string.
   455   The most interesting novelty from Sulzmann and Lu \cite{Sulzmann2014} is
   456 
       
   457   The most interesting idea from Sulzmann and Lu \cite{Sulzmann2014} is
   456   the construction of a value for how @{term "r\<^sub>1"} can match the
   458   the construction of a value for how @{term "r\<^sub>1"} can match the
   457   string @{term "[a,b,c]"} from the value how the last derivative, @{term
   459   string @{term "[a,b,c]"} from the value how the last derivative, @{term
   458   "r\<^sub>4"} in Fig~\ref{Sulz}, can match the empty string. Sulzmann and
   460   "r\<^sub>4"} in Fig~\ref{Sulz}, can match the empty string. Sulzmann and
   459   Lu achieve this by stepwise ``injecting back'' the characters into the
   461   Lu achieve this by stepwise ``injecting back'' the characters into the
   460   values thus inverting the operation of building derivatives on the level
   462   values thus inverting the operation of building derivatives on the level
   508   @{text Right}-value. In case of the @{text Left}-value we know further it
   510   @{text Right}-value. In case of the @{text Left}-value we know further it
   509   must be a value for a sequence regular expression. Therefore the pattern
   511   must be a value for a sequence regular expression. Therefore the pattern
   510   we match in the clause (5) is @{term "Left (Seq v\<^sub>1 v\<^sub>2)"},
   512   we match in the clause (5) is @{term "Left (Seq v\<^sub>1 v\<^sub>2)"},
   511   while in (6) it is just @{term "Right v\<^sub>2"}. One more interesting
   513   while in (6) it is just @{term "Right v\<^sub>2"}. One more interesting
   512   point is in the right-hand side of clause (6): since in this case the
   514   point is in the right-hand side of clause (6): since in this case the
   513   regular expression @{text "r\<^sub>1"} does not ``contribute'' for
   515   regular expression @{text "r\<^sub>1"} does not ``contribute'' to
   514   matching the string, that is only matches the empty string, we need to
   516   matching the string, that means it only matches the empty string, we need to
   515   call @{const mkeps} in order to construct a value how @{term "r\<^sub>1"}
   517   call @{const mkeps} in order to construct a value for how @{term "r\<^sub>1"}
   516   can match this empty string. A similar argument applies for why we can
   518   can match this empty string. A similar argument applies for why we can
   517   expect in the left-hand side of clause (7) that the value is of the form
   519   expect in the left-hand side of clause (7) that the value is of the form
   518   @{term "Seq v (Stars vs)"}---the derivative of a star is @{term "SEQ r
   520   @{term "Seq v (Stars vs)"}---the derivative of a star is @{term "SEQ r
   519   (STAR r)"}. Finally, the reason for why we can ignore the second argument
   521   (STAR r)"}. Finally, the reason for why we can ignore the second argument
   520   in clause (1) of @{term inj} is that it will only ever be called in cases
   522   in clause (1) of @{term inj} is that it will only ever be called in cases
   521   where @{term "c=d"}, but the usual linearity restrictions in patterns do
   523   where @{term "c=d"}, but the usual linearity restrictions in patterns do
   522   not allow is to build this constraint explicitly into our function
   524   not allow is to build this constraint explicitly into our function
   523   definition.\footnote{Sulzmann and Lu state this clause as @{thm (lhs)
   525   definition.\footnote{Sulzmann and Lu state this clause as @{thm (lhs)
   524   injval.simps(1)[of "c" "c"]} $\dn$ @{thm (rhs) injval.simps(1)[of "c"]},
   526   injval.simps(1)[of "c" "c"]} $\dn$ @{thm (rhs) injval.simps(1)[of "c"]},
   525   but our deviation is harmless.}
   527   but our deviation is harmless.}
       
   528 
       
   529   The idea of @{term inj} to ``inject back'' a character into a value can
       
   530   be made precise by the first part of the following lemma; the second
       
   531   part shows that the underlying string of an @{const mkeps}-value is
       
   532   the empty string.
       
   533 
       
   534   \begin{lemma}\mbox{}\\\label{Prf_injval_flat}
       
   535   \begin{tabular}{ll}
       
   536   (1) & @{thm[mode=IfThen] Prf_injval_flat}\\
       
   537   (2) & @{thm[mode=IfThen] mkeps_flat}
       
   538   \end{tabular}
       
   539   \end{lemma}
   526 
   540 
   527   Having defined the @{const mkeps} and @{text inj} function we can extend
   541   Having defined the @{const mkeps} and @{text inj} function we can extend
   528   \Brz's matcher so that a [lexical] value is constructed (assuming the
   542   \Brz's matcher so that a [lexical] value is constructed (assuming the
   529   regular expression matches the string). The clauses of the lexer are
   543   regular expression matches the string). The clauses of the lexer are
   530 
   544 
   597 
   611 
   598   \begin{lemma}
   612   \begin{lemma}
   599   @{thm[mode=IfThen] PMatch_mkeps}
   613   @{thm[mode=IfThen] PMatch_mkeps}
   600   \end{lemma}
   614   \end{lemma}
   601 
   615 
   602   \begin{lemma}
   616   
   603   @{thm[mode=IfThen] Prf_injval_flat}
       
   604   \end{lemma}
       
   605 
   617 
   606   \begin{lemma}
   618   \begin{lemma}
   607   @{thm[mode=IfThen] PMatch2_roy_version}
   619   @{thm[mode=IfThen] PMatch2_roy_version}
   608   \end{lemma}
   620   \end{lemma}
   609 
   621