# HG changeset patch # User Christian Urban # Date 1456834211 0 # Node ID 73f7dc60c285b52b0dd6f1edec920db41a1ff8de # Parent 6adda4a667b13b17c0acf7b67c44b93df09cbbb9 updated paper diff -r 6adda4a667b1 -r 73f7dc60c285 thys/Paper/Paper.thy --- a/thys/Paper/Paper.thy Sun Feb 28 14:01:12 2016 +0000 +++ b/thys/Paper/Paper.thy Tue Mar 01 12:10:11 2016 +0000 @@ -5,19 +5,23 @@ declare [[show_question_marks = false]] +abbreviation + "der_syn r c \ der c r" + notation (latex output) If ("(\<^raw:\textrm{>if\<^raw:}> (_)/ \<^raw:\textrm{>then\<^raw:}> (_)/ \<^raw:\textrm{>else\<^raw:}> (_))" 10) and Cons ("_\<^raw:\mbox{$\,$}>::\<^raw:\mbox{$\,$}>_" [78,77] 73) and ZERO ("\<^raw:\textrm{0}>" 78) and ONE ("\<^raw:\textrm{1}>" 78) and CHAR ("_" [1000] 10) and - ALT ("_ + _" [1000,1000] 78) and - SEQ ("_ \ _" [1000,1000] 78) and + ALT ("_ + _" [77,77] 78) and + SEQ ("_ \ _" [77,77] 78) and STAR ("_\<^sup>\" [1000] 78) and val.Char ("Char _" [1000] 78) and val.Left ("Left _" [1000] 78) and val.Right ("Right _" [1000] 78) and - L ("L _" [1000] 0) and + L ("L'(_')" [10] 78) and + der_syn ("_\\_" [1000, 1000] 78) and flat ("|_|" [70] 73) and Sequ ("_ @ _" [78,77] 63) and injval ("inj _ _ _" [1000,77,1000] 77) and @@ -29,9 +33,74 @@ section {* Introduction *} text {* - + +Brzozowski \cite{Brzozowski1964} introduced the notion of the {\em +derivative} @{term "der c r"} of a regular expression @{text r} w.r.t.\ a +character~@{text c}, and showed that it gave a simple solution to the +problem of matching a string @{term s} with a regular expression @{term r}: +if the derivative of @{term r} w.r.t.\ (in succession) all the characters of +the string matches the empty string $\mts$, then @{term r} matches @{term s} +(and {\em vice versa}). The derivative has the property (which may be +regarded as its specification) that, for every string @{term s} and regular +expression @{term r} and character @{term c}, one has @{term "cs \ L(r)"} if +and only if \mbox{@{term "s \ L(der c r)"}}. The beauty of Brzozowski's +derivatives is that they are neatly expressible in any functional language, and +very easy to be defined and reasoned about in a theorem prover---the +definitions consist just of inductive datatypes and recursive functions. A +completely formalised proof of this matcher has for example been given in +\cite{Owens2008}. + +One limitation of Brzozowski's matcher is that it only generates a YES/NO +answer for a string being matched by a regular expression. Sulzmann and Lu +\cite{Sulzmann2014} extended this matcher to allow generation not just of a +YES/NO answer but of an actual matching, called a [lexical] {\em value}. +They give a still very simple algorithm to calculate a value that appears to +be the value associated with POSIX lexing (posix) %%\cite{posix}. The +challenge then is to specify that value, in an algorithm-independent +fashion, and to show that Sulzamman and Lu's derivative-based algorithm does +indeed calculate a value that is correct according to the specification. + +Inspired by work of Frisch and Cardelli \cite{Frisch2004} on a GREEDY +regular expression matching algorithm, the answer given in +\cite{Sulzmann2014} is to define a relation (called an ``Order Relation'') +on the set of values of @{term r}, and to show that (once a string to be +matched is chosen) there is a maximum element and that it is computed by the +derivative-based algorithm. Beginning with our observations that, without +evidence that it is transitive, it cannot be called an ``order relation'', +and that the relation is called a ``total order'' despite being evidently +not total\footnote{\textcolor{green}{Why is it not total?}}, we identify +problems with this approach (of which some of the proofs are not published +in \cite{Sulzmann2014}); perhaps more importantly, we give a simple +inductive (and algorithm-independent) definition of what we call being a +{\em POSIX value} for a regular expression @{term r} and a string @{term s}; +we show that the algorithm computes such a value and that such a value is +unique. Proofs are both done by hand and checked in {\em Isabelle} +%\cite{isabelle}. Our experience of doing our proofs has been that this +mechanical checking was absolutely essential: this subject area has hidden +snares. This was also noted by Kuklewitz \cite{Kuklewicz} who found that +nearly all POSIX matching implementations are ``buggy'' \cite[Page +203]{Sulzmann2014}. + +\textcolor{green}{Say something about POSIX lexing} + +An extended version of \cite{Sulzmann2014} is available at the website +of its first author; this includes some ``proofs'', claimed in +\cite{Sulzmann2014} to be ``rigorous''. Since these are evidently not in +final form, we make no comment thereon, preferring to give general +reasons for our belief that the approach of \cite{Sulzmann2014} is +problematic rather than to discuss details of unpublished work. + +Derivatives as calculated by Brzozowski's method are usually more +complex regular expressions than the initial one; various optimisations +are possible, such as the simplifications of @{term "ALT ZERO r"}, +@{term "ALT r ZERO"}, @{term "SEQ ONE r"} and @{term "SEQ r ONE"} to +@{term r}. One of the advantages of having a simple specification and +correctness proof is that the latter can be refined to allow for such +optimisations and simple correctness proof. + Sulzmann and Lu \cite{Sulzmann2014} +\cite{Brzozowski1964} there are two commonly used disambiguation strategies to create a unique matching tree: @@ -96,7 +165,7 @@ many gaps and attempting to formalise the proof in Isabelle. - +\medskip\noindent {\bf Contributions:} *} @@ -105,10 +174,13 @@ text {* \noindent Strings in Isabelle/HOL are lists of characters with the empty string being represented by the empty list, written @{term - "[]"}, and list-cons being written as @{term "DUMMY # DUMMY"}. By - using the type char we have a supply of finitely many characters - roughly corresponding to the ASCII character set. - Regular exprtessions + "[]"}, and list-cons being written as @{term "DUMMY # DUMMY"}. + Often we use the usual bracket notation for strings; for example a + string consiting of a single character is written @{term "[c]"}. By + using the type @{type char} for characters we have a supply of + finitely many characters roughly corresponding to the ASCII + character set. Regular expression are as usual and defined as the + following inductive datatype: \begin{center} @{text "r :="} @@ -120,6 +192,76 @@ @{term "STAR r"} \end{center} + \noindent where @{const ZERO} stands for the regular expression that + does not macth any string and @{const ONE} for the regular + expression that matches only the empty string. The language of a regular expression + is again defined as usual by the following clauses + + \begin{center} + \begin{tabular}{rcl} + @{thm (lhs) L.simps(1)} & $\dn$ & @{thm (rhs) L.simps(1)}\\ + @{thm (lhs) L.simps(2)} & $\dn$ & @{thm (rhs) L.simps(2)}\\ + @{thm (lhs) L.simps(3)} & $\dn$ & @{thm (rhs) L.simps(3)}\\ + @{thm (lhs) L.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) L.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\ + @{thm (lhs) L.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) L.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\ + @{thm (lhs) L.simps(6)} & $\dn$ & @{thm (rhs) L.simps(6)}\\ + \end{tabular} + \end{center} + + \noindent We use the star-notation for regular expressions and sets of strings. + The Kleene-star on sets is defined inductively. + + \emph{Semantic derivatives} of sets of strings are defined as + + \begin{center} + \begin{tabular}{lcl} + @{thm (lhs) Der_def} & $\dn$ & @{thm (rhs) Der_def}\\ + @{thm (lhs) Ders_def} & $\dn$ & @{thm (rhs) Ders_def}\\ + \end{tabular} + \end{center} + + \noindent where the second definitions lifts the notion of semantic + derivatives from characters to strings. + + \noindent + The nullable function + + \begin{center} + \begin{tabular}{lcl} + @{thm (lhs) nullable.simps(1)} & $\dn$ & @{thm (rhs) nullable.simps(1)}\\ + @{thm (lhs) nullable.simps(2)} & $\dn$ & @{thm (rhs) nullable.simps(2)}\\ + @{thm (lhs) nullable.simps(3)} & $\dn$ & @{thm (rhs) nullable.simps(3)}\\ + @{thm (lhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\ + @{thm (lhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\ + @{thm (lhs) nullable.simps(6)} & $\dn$ & @{thm (rhs) nullable.simps(6)}\\ + \end{tabular} + \end{center} + + \noindent + The derivative function for characters and strings + + \begin{center} + \begin{tabular}{lcp{7.5cm}} + @{thm (lhs) der.simps(1)} & $\dn$ & @{thm (rhs) der.simps(1)}\\ + @{thm (lhs) der.simps(2)} & $\dn$ & @{thm (rhs) der.simps(2)}\\ + @{thm (lhs) der.simps(3)} & $\dn$ & @{thm (rhs) der.simps(3)}\\ + @{thm (lhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]}\\ + @{thm (lhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]}\\ + @{thm (lhs) der.simps(6)} & $\dn$ & @{thm (rhs) der.simps(6)}\medskip\\ + + @{thm (lhs) ders.simps(1)} & $\dn$ & @{thm (rhs) ders.simps(1)}\\ + @{thm (lhs) ders.simps(2)} & $\dn$ & @{thm (rhs) ders.simps(2)}\\ + \end{tabular} + \end{center} + + It is a relatively easy exercise to prove that + + \begin{center} + \begin{tabular}{l} + @{thm[mode=IfThen] nullable_correctness}\\ + @{thm[mode=IfThen] der_correctness}\\ + \end{tabular} + \end{center} *} section {* POSIX Regular Expression Matching *} @@ -167,36 +309,7 @@ \end{tabular} \end{center} - \noindent - The nullable function - - \begin{center} - \begin{tabular}{lcl} - @{thm (lhs) nullable.simps(1)} & $\dn$ & @{thm (rhs) nullable.simps(1)}\\ - @{thm (lhs) nullable.simps(2)} & $\dn$ & @{thm (rhs) nullable.simps(2)}\\ - @{thm (lhs) nullable.simps(3)} & $\dn$ & @{thm (rhs) nullable.simps(3)}\\ - @{thm (lhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\ - @{thm (lhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\ - @{thm (lhs) nullable.simps(6)} & $\dn$ & @{thm (rhs) nullable.simps(6)}\\ - \end{tabular} - \end{center} - - \noindent - The derivative function for characters and strings - - \begin{center} - \begin{tabular}{lcp{7.5cm}} - @{thm (lhs) der.simps(1)} & $\dn$ & @{thm (rhs) der.simps(1)}\\ - @{thm (lhs) der.simps(2)} & $\dn$ & @{thm (rhs) der.simps(2)}\\ - @{thm (lhs) der.simps(3)} & $\dn$ & @{thm (rhs) der.simps(3)}\\ - @{thm (lhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]}\\ - @{thm (lhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]}\\ - @{thm (lhs) der.simps(6)} & $\dn$ & @{thm (rhs) der.simps(6)}\medskip\\ - - @{thm (lhs) ders.simps(1)} & $\dn$ & @{thm (rhs) ders.simps(1)}\\ - @{thm (lhs) ders.simps(2)} & $\dn$ & @{thm (rhs) ders.simps(2)}\\ - \end{tabular} - \end{center} + \noindent The @{const flat} function for values diff -r 6adda4a667b1 -r 73f7dc60c285 thys/Paper/document/root.tex --- a/thys/Paper/document/root.tex Sun Feb 28 14:01:12 2016 +0000 +++ b/thys/Paper/document/root.tex Tue Mar 01 12:10:11 2016 +0000 @@ -1,4 +1,5 @@ \documentclass[runningheads]{llncs} +\usepackage{times} \usepackage{isabelle} \usepackage{isabellesym} \usepackage{amsmath} @@ -8,16 +9,15 @@ \usepackage{pgf} \usepackage{pdfsetup} \usepackage{ot1patch} -\usepackage{times} \usepackage{stmaryrd} \usepackage{url} \usepackage{color} -\titlerunning{BLA BLA} +\titlerunning{POSIX Lexing with Derivatives} \urlstyle{rm} \isabellestyle{it} -\renewcommand{\isastyleminor}{\it}% +\renewcommand{\isastyleminor}{\it}% \renewcommand{\isastyle}{\normalsize\it}% @@ -28,25 +28,31 @@ \definecolor{mygrey}{rgb}{.80,.80,.80} \def\Brz{Brzozowski} - +\def\der{\backslash} +\newcommand{\eps}{\varepsilon} +\newcommand{\mts}{\eps} \begin{document} \title{POSIX {L}exing with {D}erivatives of {R}egular {E}xpressions (Proof Pearl)} -\author{Fahad Ausaf\inst{1} \and Roy Dyckhoff\inst{2} \and Christian Urban\inst{1}} -\institute{King's College London, United Kingdom \and - St Andrews} +\author{Fahad Ausaf\inst{1} \and Roy Dyckhoff\inst{2} \and Christian Urban\inst{3}} +\institute{King's College London\\ + \email{fahad.ausaf@icloud.com} +\and University of St Andrews\\ + \email{roy.dyckhoff@st-andrews.ac.uk} +\and King's College London\\ + \email{christian.urban@kcl.ac.uk}} \maketitle \begin{abstract} -\Brz{} introduced the notion of derivatives for regular +Brzozowski introduced the notion of derivatives for regular expressions. They can be used for a very simple regular expression matching algorithm. Sulzmann and Lu cleverly extended this algorithm in order to deal with POSIX matching, which is the underlying -disambiguation strategy for regular expressions needed in -lexers. Sulzmann and Lu have made available on-line what they call a -``rigorous proof'' of the correctness of their algorithm w.r.t.~their +disambiguation strategy for regular expressions needed in lexers. +Sulzmann and Lu have made available on-line what they call a +``rigorous proof'' of the correctness of their algorithm w.r.t.\ their specification; regrettably, it appears to us to have unfillable gaps. In the first part of this paper we give our inductive definition of what a POSIX value is and show $(i)$ that such a value is unique (for @@ -55,10 +61,10 @@ that the regular expression matches the string). We also prove the correctness of an optimised version of the POSIX matching algorithm. Our definitions and proof are much simpler than those by -Sulzmann and Lu and can be easily formalised in Isabelle/HOL. In the -second part we analyse the correctness argument by Sulzmann and Lu in -more detail and explain why it seems hard to turn it into a proof -rigorous enough to be accepted by a system such as Isabelle/HOL. +Sulzmann and Lu and can be easily formalised in Isabelle/HOL. In the +second part we analyse the correctness argument by Sulzmann and Lu and +explain why it seems hard to turn it into a proof rigorous enough to +be accepted by a system such as Isabelle/HOL.\smallskip {\bf Keywords:} POSIX matching, Derivatives of Regular Expressions, Isabelle/HOL diff -r 6adda4a667b1 -r 73f7dc60c285 thys/ReStar.thy --- a/thys/ReStar.thy Sun Feb 28 14:01:12 2016 +0000 +++ b/thys/ReStar.thy Tue Mar 01 12:10:11 2016 +0000 @@ -184,6 +184,9 @@ | Left val | Stars "val list" +datatype_compat val + + section {* The string behind a value *} fun @@ -352,7 +355,7 @@ apply(erule Prf.cases) apply(simp_all)[7] (* CHAR *) -apply(case_tac "c = char") +apply(case_tac "c = x") apply(simp) apply(erule Prf.cases) apply(simp_all)[7] @@ -361,7 +364,7 @@ apply(erule Prf.cases) apply(simp_all)[7] (* SEQ *) -apply(case_tac "nullable rexp1") +apply(case_tac "nullable x1") apply(simp) apply(erule Prf.cases) apply(simp_all)[7] @@ -491,10 +494,14 @@ apply(metis PMatch.intros(7)) done +find_theorems Stars +thm compat_val_list.induct compat_val.induct + + lemma PMatch_determ: shows "\s \ r \ v1; s \ r \ v2\ \ v1 = v2" and "\s \ (STAR r) \ Stars vs1; s \ (STAR r) \ Stars vs2\ \ vs1 = vs2" -apply(induct v1 and vs1 arbitrary: s r v2 and s r vs2 rule: val.inducts) +apply(induct v1 and vs1 arbitrary: s r v2 and s r vs2 rule: compat_val.induct compat_val_list.induct) apply(erule PMatch.cases) apply(simp_all)[7] apply(erule PMatch.cases) @@ -604,7 +611,7 @@ apply(erule PMatch.cases) apply(simp_all)[7] (* CHAR case *) -apply(case_tac "c = char") +apply(case_tac "c = x") apply(simp) apply(erule PMatch.cases) apply(simp_all)[7] @@ -622,7 +629,7 @@ apply metis apply(simp add: der_correctness Der_def) (* SEQ case *) -apply(case_tac "nullable rexp1") +apply(case_tac "nullable x1") apply(simp) prefer 2 (* not-nullbale case *) @@ -997,7 +1004,7 @@ shows "\ v : r \ \v'. flat v' = flat v \ \ v' : r" and "\ Stars vs : r \ \vs'. flat (Stars vs') = flat (Stars vs) \ \ Stars vs' : r" using assms -apply(induct v and vs arbitrary: r and r rule: val.inducts) +apply(induct v and vs arbitrary: r and r rule: compat_val.induct compat_val_list.induct) apply(auto)[1] apply(erule Prf.cases) apply(simp_all)[7] diff -r 6adda4a667b1 -r 73f7dc60c285 thys/paper.pdf Binary file thys/paper.pdf has changed