# HG changeset patch # User Christian Urban # Date 1456892005 0 # Node ID 2c38f10643ae9bf1c870b88df5e86c21342843a8 # Parent 73f7dc60c285b52b0dd6f1edec920db41a1ff8de updated diff -r 73f7dc60c285 -r 2c38f10643ae thys/Paper/Paper.thy --- a/thys/Paper/Paper.thy Tue Mar 01 12:10:11 2016 +0000 +++ b/thys/Paper/Paper.thy Wed Mar 02 04:13:25 2016 +0000 @@ -11,9 +11,9 @@ notation (latex output) If ("(\<^raw:\textrm{>if\<^raw:}> (_)/ \<^raw:\textrm{>then\<^raw:}> (_)/ \<^raw:\textrm{>else\<^raw:}> (_))" 10) and Cons ("_\<^raw:\mbox{$\,$}>::\<^raw:\mbox{$\,$}>_" [78,77] 73) and - ZERO ("\<^raw:\textrm{0}>" 78) and - ONE ("\<^raw:\textrm{1}>" 78) and - CHAR ("_" [1000] 10) and + ZERO ("\<^bold>0" 80) and + ONE ("\<^bold>1" 80) and + CHAR ("_" [1000] 80) and ALT ("_ + _" [77,77] 78) and SEQ ("_ \ _" [77,77] 78) and STAR ("_\<^sup>\" [1000] 78) and @@ -21,7 +21,7 @@ val.Left ("Left _" [1000] 78) and val.Right ("Right _" [1000] 78) and L ("L'(_')" [10] 78) and - der_syn ("_\\_" [1000, 1000] 78) and + der_syn ("_\\_" [79, 1000] 76) and flat ("|_|" [70] 73) and Sequ ("_ @ _" [78,77] 63) and injval ("inj _ _ _" [1000,77,1000] 77) and @@ -44,44 +44,99 @@ regarded as its specification) that, for every string @{term s} and regular expression @{term r} and character @{term c}, one has @{term "cs \ L(r)"} if and only if \mbox{@{term "s \ L(der c r)"}}. The beauty of Brzozowski's -derivatives is that they are neatly expressible in any functional language, and -very easy to be defined and reasoned about in a theorem prover---the -definitions consist just of inductive datatypes and recursive functions. A -completely formalised proof of this matcher has for example been given in -\cite{Owens2008}. +derivatives is that they are neatly expressible in any functional language, +and easy to define and reason about in theorem provers---the definitions +just consist of inductive datatypes and simple recursive functions. A +completely formalised correctness proof of this matcher has, for example, +been given in HOL4~\cite{Owens2008}. One limitation of Brzozowski's matcher is that it only generates a YES/NO -answer for a string being matched by a regular expression. Sulzmann and Lu -\cite{Sulzmann2014} extended this matcher to allow generation not just of a -YES/NO answer but of an actual matching, called a [lexical] {\em value}. -They give a still very simple algorithm to calculate a value that appears to -be the value associated with POSIX lexing (posix) %%\cite{posix}. The -challenge then is to specify that value, in an algorithm-independent -fashion, and to show that Sulzamann and Lu's derivative-based algorithm does -indeed calculate a value that is correct according to the specification. +answer for whether a string is matched by a regular expression. +Sulzmann and Lu \cite{Sulzmann2014} extended this matcher to allow +generation not just of a YES/NO answer but of an actual matching, called a +[lexical] {\em value}. They give a simple algorithm to calculate a value +that appears to be the value associated with POSIX lexing +\cite{Kuklewicz,Vansummeren2006}. The challenge then is to specify that +value, in an algorithm-independent fashion, and to show that Sulzmann and +Lu's derivative-based algorithm does indeed calculate a value that is +correct according to the specification.
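To give an impression of how compact these definitions are, the following is a minimal sketch of the matcher in Isabelle/HOL notation. The constructor names are the ones set up in the notation above; the function names @{text nullable}, @{text der}, @{text ders} and @{text matcher} are merely illustrative labels and may differ from the names used in the formalisation.

datatype rexp = ZERO | ONE | CHAR char | ALT rexp rexp | SEQ rexp rexp | STAR rexp

(* does a regular expression match the empty string? *)
fun nullable :: "rexp ⇒ bool" where
  "nullable ZERO = False"
| "nullable ONE = True"
| "nullable (CHAR c) = False"
| "nullable (ALT r1 r2) = (nullable r1 ∨ nullable r2)"
| "nullable (SEQ r1 r2) = (nullable r1 ∧ nullable r2)"
| "nullable (STAR r) = True"

(* Brzozowski derivative of r with respect to the character c *)
fun der :: "char ⇒ rexp ⇒ rexp" where
  "der c ZERO = ZERO"
| "der c ONE = ZERO"
| "der c (CHAR d) = (if c = d then ONE else ZERO)"
| "der c (ALT r1 r2) = ALT (der c r1) (der c r2)"
| "der c (SEQ r1 r2) =
    (if nullable r1 then ALT (SEQ (der c r1) r2) (der c r2)
     else SEQ (der c r1) r2)"
| "der c (STAR r) = SEQ (der c r) (STAR r)"

(* derivative with respect to a whole string *)
fun ders :: "string ⇒ rexp ⇒ rexp" where
  "ders [] r = r"
| "ders (c # s) r = ders s (der c r)"

(* s is matched by r iff the iterated derivative is nullable *)
definition matcher :: "rexp ⇒ string ⇒ bool" where
  "matcher r s ≡ nullable (ders s r)"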
+ +The answer given by Sulzmann and Lu \cite{Sulzmann2014} is to define a +relation (called an ``Order Relation'') on the set of values of @{term r}, +and to show that (once a string to be matched is chosen) there is a maximum +element and that it is computed by their derivative-based algorithm. This +proof idea is inspired by work of Frisch and Cardelli \cite{Frisch2004} on a +GREEDY regular expression matching algorithm. Beginning with our +observations that, without evidence that it is transitive, it cannot be +called an ``order relation'', and that the relation is called a ``total +order'' despite being evidently not total\footnote{\textcolor{green}{We +should give an argument as footnote}}, we identify problems with this +approach (some of whose proofs are not published in +\cite{Sulzmann2014}); perhaps more importantly, we give a simple inductive +(and algorithm-independent) definition of what we call being a {\em POSIX +value} for a regular expression @{term r} and a string @{term s}; we show +that the algorithm computes such a value and that such a value is unique. +Our proofs are done both by hand and machine-checked in {\em Isabelle/HOL}. +The experience of doing our proofs has been that this mechanical checking +was absolutely essential: this subject area has hidden snares. This was also +noted by Kuklewicz \cite{Kuklewicz} who found that nearly all POSIX matching +implementations are ``buggy'' \cite[Page 203]{Sulzmann2014}. -Inspired by work of Frisch and Cardelli \cite{Frisch2004} on a GREEDY -regular expression matching algorithm, the answer given in -\cite{Sulzmann2014} is to define a relation (called an ``Order Relation'') -on the set of values of @{term r}, and to show that (once a string to be -matched is chosen) there is a maximum element and that it is computed by the -derivative-based algorithm. Beginning with our observations that, without -evidence that it is transitive, it cannot be called an ``order relation'', -and that the relation is called a ``total order'' despite being evidently -not total\footnote{\textcolor{green}{Why is it not total?}}, we identify -problems with this approach (of which some of the proofs are not published -in \cite{Sulzmann2014}); perhaps more importantly, we give a simple -inductive (and algorithm-independent) definition of what we call being a -{\em POSIX value} for a regular expression @{term r} and a string @{term s}; -we show that the algorithm computes such a value and that such a value is -unique. Proofs are both done by hand and checked in {\em Isabelle} -%\cite{isabelle}. Our experience of doing our proofs has been that this -mechanical checking was absolutely essential: this subject area has hidden -snares. This was also noted by Kuklewitz \cite{Kuklewicz} who found that -nearly all POSIX matching implementations are ``buggy'' \cite[Page -203]{Sulzmann2014}. +If a regular expression matches a string, then in general there is more +than one way in which the string can be matched. There are two commonly +used disambiguation strategies to generate a unique answer: one is called +GREEDY matching \cite{Frisch2004} and the other is POSIX +matching~\cite{Kuklewicz,Sulzmann2014}. For example, consider the string +@{term xy} and the regular expression @{term "STAR (ALT (ALT x y) xy)"}. +The string can be matched either in two `iterations' by the single +letter-regular expressions @{term x} and @{term y}, or directly in one +iteration by @{term xy}. The first case corresponds to GREEDY matching, +which first matches with the left-most alternative and only tries the next +alternative in case of a mismatch. The second case is POSIX matching, which +prefers the longest match.
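The difference between the two matchings can be made explicit with lexical values. The sketch below assumes a value datatype in the style suggested by the notation set up above (with constructors @{text Left}, @{text Right} and @{text Stars}, and the flattening function @{text flat}); the exact constructors of the formalisation may differ.

datatype val = Void | Char char | Seq val val | Left val | Right val | Stars "val list"

(* flattening: the underlying string that a value describes *)
fun flat :: "val ⇒ string" where
  "flat Void = []"
| "flat (Char c) = [c]"
| "flat (Left v) = flat v"
| "flat (Right v) = flat v"
| "flat (Seq v1 v2) = flat v1 @ flat v2"
| "flat (Stars vs) = concat (map flat vs)"

(* the two matchings of the string xy against ((x + y) + xy)*,
   both flattening to xy (writing x, y for the character values):
     GREEDY: Stars [Left (Left (Char x)), Left (Right (Char y))]
     POSIX:  Stars [Right (Seq (Char x) (Char y))]               *)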
+ +In the context of lexing, where an input string is separated into a sequence +of tokens, POSIX is the more natural disambiguation strategy, since it +captures what programmers consider the basic syntactic building blocks of +their programs. There are two underlying rules behind tokenising using +POSIX matching, both illustrated by the sketch following the example below: + +\begin{itemize} +\item[$\bullet$] \underline{The Longest Match Rule (or ``maximal munch rule''):} + +The longest initial substring matched by any regular expression is taken as +next token.\smallskip + +\item[$\bullet$] \underline{Rule Priority:} -\textcolor{green}{Say something about POSIX lexing} +For a particular longest initial substring, the first regular expression +that can match determines the token. +\end{itemize} + +\noindent Consider for example a regular expression @{text +"r\<^bsub>key\<^esub>"} that can recognise keywords, like @{text "if"}, +@{text "then"} and so on; and another regular expression @{text +"r\<^bsub>id\<^esub>"} that can recognise identifiers (say, single +characters followed by further characters or numbers). Then we can form the +regular expression @{text "(r\<^bsub>key\<^esub> + r\<^bsub>id\<^esub>)\<^sup>\"} and +use POSIX matching to, for example, tokenise the strings @{text "iffoo"} and +@{text "if"}. In the first case we obtain by the longest match rule a +single identifier token, not a keyword token followed by an identifier +token. In the second case we obtain by rule priority a keyword token, not +an identifier token. +\bigskip
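Both rules can be prototyped directly on top of the matcher sketched earlier; here @{text prefixes} and @{text next_token} are hypothetical names used only for illustration, not part of any formalisation. The outer search (over prefixes, longest first) implements the Longest Match Rule; the inner search (over the regular expressions in the given order) implements Rule Priority.

(* all non-empty prefixes of a string, longest first *)
definition prefixes :: "string ⇒ string list" where
  "prefixes s ≡ rev (map (λn. take n s) [1..<Suc (length s)])"

(* the next token: the longest prefix matched by some regular expression,
   paired with the first regular expression in the list that matches it *)
definition next_token :: "rexp list ⇒ string ⇒ (rexp × string) option" where
  "next_token rs s ≡
     (case find (λp. ∃r∈set rs. matcher r p) (prefixes s) of
        None ⇒ None
      | Some p ⇒ map_option (λr. (r, p)) (find (λr. matcher r p) rs))"

A full tokeniser would iterate @{text next_token}, continuing with the rest of the string after the matched prefix; since only non-empty prefixes are considered, every step consumes at least one character.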
+ +\noindent\textcolor{green}{Not Done Yet} + + +\medskip\noindent +{\bf Contributions:} + +Derivatives as calculated by Brzozowski's method are usually more complex +regular expressions than the initial one; various optimisations are +possible, such as the simplifications of @{term "ALT ZERO r"}, @{term "ALT r +ZERO"}, @{term "SEQ ONE r"} and @{term "SEQ r ONE"} to @{term r}. One of the +advantages of having a simple specification and correctness proof is that +the latter can be refined to allow for such optimisations while the +correctness proof remains simple.
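The four simplifications just mentioned can be sketched as a structurally recursive function interleaved with the derivative steps; @{text simp} and @{text matcher_simp} are again only illustrative names, the formalisation may organise this differently, and in the lexing algorithm the simplifications additionally have to be reflected in the computed values.

(* bottom-up application of the four simplifications *)
fun simp :: "rexp ⇒ rexp" where
  "simp (ALT r1 r2) =
     (let r1' = simp r1; r2' = simp r2 in
      if r1' = ZERO then r2'
      else if r2' = ZERO then r1' else ALT r1' r2')"
| "simp (SEQ r1 r2) =
     (let r1' = simp r1; r2' = simp r2 in
      if r1' = ONE then r2'
      else if r2' = ONE then r1' else SEQ r1' r2')"
| "simp r = r"

(* matching with simplification after each derivative step *)
definition matcher_simp :: "rexp ⇒ string ⇒ bool" where
  "matcher_simp r s ≡ nullable (fold (λc r. simp (der c r)) s r)"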
An extended version of \cite{Sulzmann2014} is available at the website of its first author; this includes some ``proofs'', claimed in @@ -90,83 +145,6 @@ reasons for our belief that the approach of \cite{Sulzmann2014} is problematic rather than to discuss details of unpublished work. -Derivatives as calculated by Brzozowski's method are usually more -complex regular expressions than the initial one; various optimisations -are possible, such as the simplifications of @{term "ALT ZERO r"}, -@{term "ALT r ZERO"}, @{term "SEQ ONE r"} and @{term "SEQ r ONE"} to -@{term r}. One of the advantages of having a simple specification and -correctness proof is that the latter can be refined to allow for such -optimisations and simple correctness proof. - - -Sulzmann and Lu \cite{Sulzmann2014} -\cite{Brzozowski1964} - -there are two commonly used -disambiguation strategies to create a unique matching tree: -one is called \emph{greedy} matching \cite{Frisch2004} and the -other is \emph{POSIX} matching~\cite{Kuklewicz,Sulzmann2014}. -For the latter there are two rough rules: - -\begin{itemize} -\item The Longest Match Rule (or ``maximal munch rule''):\smallskip\\ The - longest initial substring matched by any regular - expression is taken as next token. - -\item Rule Priority:\smallskip\\ For a particular longest initial - substring, the first regular expression that can match - determines the token. -\end{itemize} - -\noindent In the context of lexing, POSIX is the more -interesting disambiguation strategy as it produces longest -matches, which is necessary for tokenising programs. For -example the string \textit{iffoo} should not match the keyword -\textit{if} and the rest, but as one string \textit{iffoo}, -which might be a variable name in a program. As another -example consider the string $xy$ and the regular expression -\mbox{$(x + y + xy)^*$}. Either the input string can be -matched in two `iterations' by the single letter-regular -expressions $x$ and $y$, or directly in one iteration by $xy$. -The first case corresponds to greedy matching, which first -matches with the left-most symbol and only matches the next -symbol in case of a mismatch. The second case is POSIX -matching, which prefers the longest match. In case more than -one (longest) matches exist, only then it prefers the -left-most match. While POSIX matching seems natural, it turns -out to be much more subtle than greedy matching in terms of -implementations and in terms of proving properties about it. -If POSIX matching is implemented using automata, then one has -to follow transitions (according to the input string) until -one finds an accepting state, record this state and look for -further transition which might lead to another accepting state -that represents a longer input initial substring to be -matched. Only if none can be found, the last accepting state -is returned. - - -Sulzmann and Lu's paper \cite{Sulzmann2014} targets POSIX -regular expression matching. They write that it is known to be -to be a non-trivial problem and nearly all POSIX matching -implementations are ``buggy'' \cite[Page 203]{Sulzmann2014}. -For this they cite a study by Kuklewicz \cite{Kuklewicz}. My -current work is about formalising the proofs in the paper by -Sulzmann and Lu. Specifically, they propose in this paper a -POSIX matching algorithm and give some details of a -correctness proof for this algorithm inside the paper and some -more details in an appendix. This correctness proof is -unformalised, meaning it is just a ``pencil-and-paper'' proof, -not done in a theorem prover. Though, the paper and presumably -the proof have been peer-reviewed. Unfortunately their proof -does not give enough details such that it can be -straightforwardly implemented in a theorem prover, say -Isabelle. In fact, the purported proof they outline does not -work in central places. We discovered this when filling in -many gaps and attempting to formalise the proof in Isabelle. - - -\medskip\noindent -{\bf Contributions:} *} diff -r 73f7dc60c285 -r 2c38f10643ae thys/Paper/document/root.bib --- a/thys/Paper/document/root.bib Tue Mar 01 12:10:11 2016 +0000 +++ b/thys/Paper/document/root.bib Wed Mar 02 04:13:25 2016 +0000 @@ -14,6 +14,16 @@ howpublished = "\url{https://wiki.haskell.org/Regex_Posix}" } +@article{Vansummeren2006, + author = {S.~Vansummeren}, + title = {{T}ype {I}nference for {U}nique {P}attern {M}atching}, + year = {2006}, + journal = {ACM Transactions on Programming Languages and Systems}, + volume = {28}, + number = {3}, + pages = {389--428} +} + @InProceedings{Asperti12, author = {A.~Asperti}, title = {{A} {C}ompact {P}roof of {D}ecidability for {R}egular {E}xpression {E}quivalence}, diff -r 73f7dc60c285 -r 2c38f10643ae thys/Paper/document/root.tex --- a/thys/Paper/document/root.tex Tue Mar 01 12:10:11 2016 +0000 +++ b/thys/Paper/document/root.tex Wed Mar 02 04:13:25 2016 +0000 @@ -13,7 +13,7 @@ \usepackage{url} \usepackage{color} -\titlerunning{POSIX Lexing with Derivatives} +\titlerunning{POSIX Lexing with Derivatives of Regular Expressions} \urlstyle{rm} \isabellestyle{it} diff -r 73f7dc60c285 -r 2c38f10643ae thys/paper.pdf Binary file thys/paper.pdf has changed