thys/Paper/Paper.thy
changeset 108 73f7dc60c285
parent 107 6adda4a667b1
child 109 2c38f10643ae
equal deleted inserted replaced
107:6adda4a667b1 108:73f7dc60c285
     2 theory Paper
     2 theory Paper
     3 imports "../ReStar" "~~/src/HOL/Library/LaTeXsugar"
     3 imports "../ReStar" "~~/src/HOL/Library/LaTeXsugar"
     4 begin
     4 begin
     5 
     5 
     6 declare [[show_question_marks = false]]
     6 declare [[show_question_marks = false]]
       
     7 
       
     8 abbreviation 
       
     9  "der_syn r c \<equiv> der c r"
     7 
    10 
     8 notation (latex output)
    11 notation (latex output)
     9    If  ("(\<^raw:\textrm{>if\<^raw:}> (_)/ \<^raw:\textrm{>then\<^raw:}> (_)/ \<^raw:\textrm{>else\<^raw:}> (_))" 10) and
    12    If  ("(\<^raw:\textrm{>if\<^raw:}> (_)/ \<^raw:\textrm{>then\<^raw:}> (_)/ \<^raw:\textrm{>else\<^raw:}> (_))" 10) and
    10   Cons ("_\<^raw:\mbox{$\,$}>::\<^raw:\mbox{$\,$}>_" [78,77] 73) and
    13   Cons ("_\<^raw:\mbox{$\,$}>::\<^raw:\mbox{$\,$}>_" [78,77] 73) and
    11   ZERO ("\<^raw:\textrm{0}>" 78) and 
    14   ZERO ("\<^raw:\textrm{0}>" 78) and 
    12   ONE ("\<^raw:\textrm{1}>" 78) and 
    15   ONE ("\<^raw:\textrm{1}>" 78) and 
    13   CHAR ("_" [1000] 10) and
    16   CHAR ("_" [1000] 10) and
    14   ALT ("_ + _" [1000,1000] 78) and
    17   ALT ("_ + _" [77,77] 78) and
    15   SEQ ("_ \<cdot> _" [1000,1000] 78) and
    18   SEQ ("_ \<cdot> _" [77,77] 78) and
    16   STAR ("_\<^sup>\<star>" [1000] 78) and
    19   STAR ("_\<^sup>\<star>" [1000] 78) and
    17   val.Char ("Char _" [1000] 78) and
    20   val.Char ("Char _" [1000] 78) and
    18   val.Left ("Left _" [1000] 78) and
    21   val.Left ("Left _" [1000] 78) and
    19   val.Right ("Right _" [1000] 78) and
    22   val.Right ("Right _" [1000] 78) and
    20   L ("L _" [1000] 0) and
    23   L ("L'(_')" [10] 78) and
       
    24   der_syn ("_\\_" [1000, 1000] 78) and
    21   flat ("|_|" [70] 73) and
    25   flat ("|_|" [70] 73) and
    22   Sequ ("_ @ _" [78,77] 63) and
    26   Sequ ("_ @ _" [78,77] 63) and
    23   injval ("inj _ _ _" [1000,77,1000] 77) and 
    27   injval ("inj _ _ _" [1000,77,1000] 77) and 
    24   projval ("proj _ _ _" [1000,77,1000] 77) and 
    28   projval ("proj _ _ _" [1000,77,1000] 77) and 
    25   length ("len _" [78] 73) 
    29   length ("len _" [78] 73) 
    27 (*>*)
    31 (*>*)
    28 
    32 
    29 section {* Introduction *}
    33 section {* Introduction *}
    30 
    34 
    31 text {*
    35 text {*
    32   
    36 
       
    37 Brzozowski \cite{Brzozowski1964} introduced the notion of the {\em
       
    38 derivative} @{term "der c r"} of a regular expression @{text r} w.r.t.\ a
       
    39 character~@{text c}, and showed that it gave a simple solution to the
       
    40 problem of matching a string @{term s} with a regular expression @{term r}:
       
    41 if the derivative of @{term r} w.r.t.\ (in succession) all the characters of
       
    42 the string matches the empty string $\mts$, then @{term r} matches @{term s}
       
    43 (and {\em vice versa}). The derivative has the property (which may be
       
    44 regarded as its specification) that, for every string @{term s} and regular
       
    45 expression @{term r} and character @{term c}, one has @{term "cs \<in> L(r)"} if
       
    46 and only if \mbox{@{term "s \<in> L(der c r)"}}. The beauty of Brzozowski's
       
    47 derivatives is that they are neatly expressible in any functional language, and
       
    48 very easy to be defined and reasoned about in a theorem prover---the
       
    49 definitions consist just of inductive datatypes and recursive functions. A
       
    50 completely formalised proof of this matcher has for example been given in
       
    51 \cite{Owens2008}.
       
    52 
       
    53 One limitation of Brzozowski's matcher is that it only generates a YES/NO
       
    54 answer for a string being matched by a regular expression. Sulzmann and Lu
       
    55 \cite{Sulzmann2014} extended this matcher to allow generation not just of a
       
    56 YES/NO answer but of an actual matching, called a [lexical] {\em value}.
       
    57 They give a still very simple algorithm to calculate a value that appears to
       
    58 be the value associated with POSIX lexing (posix) %%\cite{posix}. The
       
    59 challenge then is to specify that value, in an algorithm-independent
       
    60 fashion, and to show that Sulzamman and Lu's derivative-based algorithm does
       
    61 indeed calculate a value that is correct according to the specification.
       
    62 
       
    63 Inspired by work of Frisch and Cardelli \cite{Frisch2004} on a GREEDY
       
    64 regular expression matching algorithm, the answer given in
       
    65 \cite{Sulzmann2014} is to define a relation (called an ``Order Relation'')
       
    66 on the set of values of @{term r}, and to show that (once a string to be
       
    67 matched is chosen) there is a maximum element and that it is computed by the
       
    68 derivative-based algorithm. Beginning with our observations that, without
       
    69 evidence that it is transitive, it cannot be called an ``order relation'',
       
    70 and that the relation is called a ``total order'' despite being evidently
       
    71 not total\footnote{\textcolor{green}{Why is it not total?}}, we identify
       
    72 problems with this approach (of which some of the proofs are not published
       
    73 in \cite{Sulzmann2014}); perhaps more importantly, we give a simple
       
    74 inductive (and algorithm-independent) definition of what we call being a
       
    75 {\em POSIX value} for a regular expression @{term r} and a string @{term s};
       
    76 we show that the algorithm computes such a value and that such a value is
       
    77 unique. Proofs are both done by hand and checked in {\em Isabelle}
       
    78 %\cite{isabelle}. Our experience of doing our proofs has been that this
       
    79 mechanical checking was absolutely essential: this subject area has hidden
       
    80 snares. This was also noted by Kuklewitz \cite{Kuklewicz} who found that
       
    81 nearly all POSIX matching implementations are ``buggy'' \cite[Page
       
    82 203]{Sulzmann2014}.
       
    83 
       
    84 \textcolor{green}{Say something about POSIX lexing}
       
    85 
       
    86 An extended version of \cite{Sulzmann2014} is available at the website
       
    87 of its first author; this includes some ``proofs'', claimed in
       
    88 \cite{Sulzmann2014} to be ``rigorous''. Since these are evidently not in
       
    89 final form, we make no comment thereon, preferring to give general
       
    90 reasons for our belief that the approach of \cite{Sulzmann2014} is
       
    91 problematic rather than to discuss details of unpublished work.
       
    92 
       
    93 Derivatives as calculated by Brzozowski's method are usually more
       
    94 complex regular expressions than the initial one; various optimisations
       
    95 are possible, such as the simplifications of @{term "ALT ZERO r"},
       
    96 @{term "ALT r ZERO"}, @{term "SEQ ONE r"} and @{term "SEQ r ONE"} to
       
    97 @{term r}. One of the advantages of having a simple specification and
       
    98 correctness proof is that the latter can be refined to allow for such
       
    99 optimisations and simple correctness proof.
       
   100 
    33 
   101 
    34 Sulzmann and Lu \cite{Sulzmann2014}
   102 Sulzmann and Lu \cite{Sulzmann2014}
       
   103 \cite{Brzozowski1964}
    35 
   104 
    36 there are two commonly used
   105 there are two commonly used
    37 disambiguation strategies to create a unique matching tree:
   106 disambiguation strategies to create a unique matching tree:
    38 one is called \emph{greedy} matching \cite{Frisch2004} and the
   107 one is called \emph{greedy} matching \cite{Frisch2004} and the
    39 other is \emph{POSIX} matching~\cite{Kuklewicz,Sulzmann2014}.
   108 other is \emph{POSIX} matching~\cite{Kuklewicz,Sulzmann2014}.
    94 Isabelle. In fact, the purported proof they outline does not
   163 Isabelle. In fact, the purported proof they outline does not
    95 work in central places. We discovered this when filling in
   164 work in central places. We discovered this when filling in
    96 many gaps and attempting to formalise the proof in Isabelle. 
   165 many gaps and attempting to formalise the proof in Isabelle. 
    97 
   166 
    98 
   167 
    99 
   168 \medskip\noindent
   100 {\bf Contributions:}
   169 {\bf Contributions:}
   101 
   170 
   102 *}
   171 *}
   103 
   172 
   104 section {* Preliminaries *}
   173 section {* Preliminaries *}
   105 
   174 
   106 text {* \noindent Strings in Isabelle/HOL are lists of characters with
   175 text {* \noindent Strings in Isabelle/HOL are lists of characters with
   107   the empty string being represented by the empty list, written @{term
   176   the empty string being represented by the empty list, written @{term
   108   "[]"}, and list-cons being written as @{term "DUMMY # DUMMY"}. By
   177   "[]"}, and list-cons being written as @{term "DUMMY # DUMMY"}.
   109   using the type char we have a supply of finitely many characters
   178   Often we use the usual bracket notation for strings; for example a
   110   roughly corresponding to the ASCII character set.
   179   string consiting of a single character is written @{term "[c]"}.  By
   111   Regular exprtessions
   180   using the type @{type char} for characters we have a supply of
       
   181   finitely many characters roughly corresponding to the ASCII
       
   182   character set.  Regular expression are as usual and defined as the
       
   183   following inductive datatype:
   112 
   184 
   113   \begin{center}
   185   \begin{center}
   114   @{text "r :="}
   186   @{text "r :="}
   115   @{const "ZERO"} $\mid$
   187   @{const "ZERO"} $\mid$
   116   @{const "ONE"} $\mid$
   188   @{const "ONE"} $\mid$
   118   @{term "ALT r\<^sub>1 r\<^sub>2"} $\mid$
   190   @{term "ALT r\<^sub>1 r\<^sub>2"} $\mid$
   119   @{term "SEQ r\<^sub>1 r\<^sub>2"} $\mid$
   191   @{term "SEQ r\<^sub>1 r\<^sub>2"} $\mid$
   120   @{term "STAR r"} 
   192   @{term "STAR r"} 
   121   \end{center}
   193   \end{center}
   122 
   194 
       
   195   \noindent where @{const ZERO} stands for the regular expression that
       
   196   does not macth any string and @{const ONE} for the regular
       
   197   expression that matches only the empty string. The language of a regular expression
       
   198   is again defined as usual by the following clauses
       
   199 
       
   200   \begin{center}
       
   201   \begin{tabular}{rcl}
       
   202   @{thm (lhs) L.simps(1)} & $\dn$ & @{thm (rhs) L.simps(1)}\\
       
   203   @{thm (lhs) L.simps(2)} & $\dn$ & @{thm (rhs) L.simps(2)}\\
       
   204   @{thm (lhs) L.simps(3)} & $\dn$ & @{thm (rhs) L.simps(3)}\\
       
   205   @{thm (lhs) L.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) L.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
       
   206   @{thm (lhs) L.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) L.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
       
   207   @{thm (lhs) L.simps(6)} & $\dn$ & @{thm (rhs) L.simps(6)}\\
       
   208   \end{tabular}
       
   209   \end{center}
       
   210   
       
   211   \noindent We use the star-notation for regular expressions and sets of strings.
       
   212   The Kleene-star on sets is defined inductively.
       
   213   
       
   214   \emph{Semantic derivatives} of sets of strings are defined as
       
   215 
       
   216   \begin{center}
       
   217   \begin{tabular}{lcl}
       
   218   @{thm (lhs) Der_def} & $\dn$ & @{thm (rhs) Der_def}\\
       
   219   @{thm (lhs) Ders_def} & $\dn$ & @{thm (rhs) Ders_def}\\
       
   220   \end{tabular}
       
   221   \end{center}
       
   222   
       
   223   \noindent where the second definitions lifts the notion of semantic
       
   224   derivatives from characters to strings.  
       
   225 
       
   226   \noindent
       
   227   The nullable function
       
   228 
       
   229   \begin{center}
       
   230   \begin{tabular}{lcl}
       
   231   @{thm (lhs) nullable.simps(1)} & $\dn$ & @{thm (rhs) nullable.simps(1)}\\
       
   232   @{thm (lhs) nullable.simps(2)} & $\dn$ & @{thm (rhs) nullable.simps(2)}\\
       
   233   @{thm (lhs) nullable.simps(3)} & $\dn$ & @{thm (rhs) nullable.simps(3)}\\
       
   234   @{thm (lhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
       
   235   @{thm (lhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
       
   236   @{thm (lhs) nullable.simps(6)} & $\dn$ & @{thm (rhs) nullable.simps(6)}\\
       
   237   \end{tabular}
       
   238   \end{center}
       
   239 
       
   240   \noindent
       
   241   The derivative function for characters and strings
       
   242 
       
   243   \begin{center}
       
   244   \begin{tabular}{lcp{7.5cm}}
       
   245   @{thm (lhs) der.simps(1)} & $\dn$ & @{thm (rhs) der.simps(1)}\\
       
   246   @{thm (lhs) der.simps(2)} & $\dn$ & @{thm (rhs) der.simps(2)}\\
       
   247   @{thm (lhs) der.simps(3)} & $\dn$ & @{thm (rhs) der.simps(3)}\\
       
   248   @{thm (lhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]}\\
       
   249   @{thm (lhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]}\\
       
   250   @{thm (lhs) der.simps(6)} & $\dn$ & @{thm (rhs) der.simps(6)}\medskip\\
       
   251 
       
   252   @{thm (lhs) ders.simps(1)} & $\dn$ & @{thm (rhs) ders.simps(1)}\\
       
   253   @{thm (lhs) ders.simps(2)} & $\dn$ & @{thm (rhs) ders.simps(2)}\\
       
   254   \end{tabular}
       
   255   \end{center}
       
   256 
       
   257   It is a relatively easy exercise to prove that
       
   258 
       
   259   \begin{center}
       
   260   \begin{tabular}{l}
       
   261   @{thm[mode=IfThen] nullable_correctness}\\
       
   262   @{thm[mode=IfThen] der_correctness}\\
       
   263   \end{tabular}
       
   264   \end{center}
   123 *}
   265 *}
   124 
   266 
   125 section {* POSIX Regular Expression Matching *}
   267 section {* POSIX Regular Expression Matching *}
   126 
   268 
   127 section {* The Argument by Sulzmmann and Lu *}
   269 section {* The Argument by Sulzmmann and Lu *}
   165   @{thm (lhs) L.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) L.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
   307   @{thm (lhs) L.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) L.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
   166   @{thm (lhs) L.simps(6)} & $\dn$ & @{thm (rhs) L.simps(6)}\\
   308   @{thm (lhs) L.simps(6)} & $\dn$ & @{thm (rhs) L.simps(6)}\\
   167   \end{tabular}
   309   \end{tabular}
   168   \end{center}
   310   \end{center}
   169 
   311 
   170   \noindent
   312  
   171   The nullable function
       
   172 
       
   173   \begin{center}
       
   174   \begin{tabular}{lcl}
       
   175   @{thm (lhs) nullable.simps(1)} & $\dn$ & @{thm (rhs) nullable.simps(1)}\\
       
   176   @{thm (lhs) nullable.simps(2)} & $\dn$ & @{thm (rhs) nullable.simps(2)}\\
       
   177   @{thm (lhs) nullable.simps(3)} & $\dn$ & @{thm (rhs) nullable.simps(3)}\\
       
   178   @{thm (lhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
       
   179   @{thm (lhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
       
   180   @{thm (lhs) nullable.simps(6)} & $\dn$ & @{thm (rhs) nullable.simps(6)}\\
       
   181   \end{tabular}
       
   182   \end{center}
       
   183 
       
   184   \noindent
       
   185   The derivative function for characters and strings
       
   186 
       
   187   \begin{center}
       
   188   \begin{tabular}{lcp{7.5cm}}
       
   189   @{thm (lhs) der.simps(1)} & $\dn$ & @{thm (rhs) der.simps(1)}\\
       
   190   @{thm (lhs) der.simps(2)} & $\dn$ & @{thm (rhs) der.simps(2)}\\
       
   191   @{thm (lhs) der.simps(3)} & $\dn$ & @{thm (rhs) der.simps(3)}\\
       
   192   @{thm (lhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]}\\
       
   193   @{thm (lhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]}\\
       
   194   @{thm (lhs) der.simps(6)} & $\dn$ & @{thm (rhs) der.simps(6)}\medskip\\
       
   195 
       
   196   @{thm (lhs) ders.simps(1)} & $\dn$ & @{thm (rhs) ders.simps(1)}\\
       
   197   @{thm (lhs) ders.simps(2)} & $\dn$ & @{thm (rhs) ders.simps(2)}\\
       
   198   \end{tabular}
       
   199   \end{center}
       
   200 
   313 
   201   \noindent
   314   \noindent
   202   The @{const flat} function for values
   315   The @{const flat} function for values
   203 
   316 
   204   \begin{center}
   317   \begin{center}