--- a/thys/Journal/Paper.thy Wed Aug 15 13:48:57 2018 +0100
+++ b/thys/Journal/Paper.thy Thu Aug 16 01:12:00 2018 +0100
@@ -4,6 +4,7 @@
"../Lexer"
"../Simplifying"
"../Positions"
+ "../Sulzmann"
"~~/src/HOL/Library/LaTeXsugar"
begin
@@ -46,7 +47,7 @@
CHAR ("_" [1000] 80) and
ALT ("_ + _" [77,77] 78) and
SEQ ("_ \<cdot> _" [77,77] 78) and
- STAR ("_\<^sup>\<star>" [78] 78) and
+ STAR ("_\<^sup>\<star>" [79] 78) and
val.Void ("Empty" 78) and
val.Char ("Char _" [1000] 78) and
@@ -122,36 +123,38 @@
text {*
Brzozowski \cite{Brzozowski1964} introduced the notion of the {\em
-derivative} @{term "der c r"} of a regular expression @{text r} w.r.t.\ a
-character~@{text c}, and showed that it gave a simple solution to the
-problem of matching a string @{term s} with a regular expression @{term r}:
-if the derivative of @{term r} w.r.t.\ (in succession) all the characters of
-the string matches the empty string, then @{term r} matches @{term s} (and
-{\em vice versa}). The derivative has the property (which may almost be
-regarded as its specification) that, for every string @{term s} and regular
-expression @{term r} and character @{term c}, one has @{term "cs \<in> L(r)"} if
-and only if \mbox{@{term "s \<in> L(der c r)"}}. The beauty of Brzozowski's
-derivatives is that they are neatly expressible in any functional language,
-and easily definable and reasoned about in theorem provers---the definitions
-just consist of inductive datatypes and simple recursive functions. A
+derivative} @{term "der c r"} of a regular expression @{text r} w.r.t.\
+a character~@{text c}, and showed that it gave a simple solution to the
+problem of matching a string @{term s} with a regular expression @{term
+r}: if the derivative of @{term r} w.r.t.\ (in succession) all the
+characters of the string matches the empty string, then @{term r}
+matches @{term s} (and {\em vice versa}). The derivative has the
+property (which may almost be regarded as its specification) that, for
+every string @{term s} and regular expression @{term r} and character
+@{term c}, one has @{term "cs \<in> L(r)"} if and only if \mbox{@{term "s \<in> L(der c r)"}}.
+The beauty of Brzozowski's derivatives is that
+they are neatly expressible in any functional language, and easily
+definable and reasoned about in theorem provers---the definitions just
+consist of inductive datatypes and simple recursive functions. A
mechanised correctness proof of Brzozowski's matcher in for example HOL4
-has been mentioned by Owens and Slind~\cite{Owens2008}. Another one in Isabelle/HOL is part
-of the work by Krauss and Nipkow \cite{Krauss2011}. And another one in Coq is given
-by Coquand and Siles \cite{Coquand2012}.
+has been mentioned by Owens and Slind~\cite{Owens2008}. Another one in
+Isabelle/HOL is part of the work by Krauss and Nipkow \cite{Krauss2011}.
+And another one in Coq is given by Coquand and Siles \cite{Coquand2012}.
-If a regular expression matches a string, then in general there is more than
-one way of how the string is matched. There are two commonly used
-disambiguation strategies to generate a unique answer: one is called GREEDY
-matching \cite{Frisch2004} and the other is POSIX
-matching~\cite{POSIX,Kuklewicz,OkuiSuzuki2010,Sulzmann2014,Vansummeren2006}. For example consider
-the string @{term xy} and the regular expression \mbox{@{term "STAR (ALT
-(ALT x y) xy)"}}. Either the string can be matched in two `iterations' by
-the single letter-regular expressions @{term x} and @{term y}, or directly
-in one iteration by @{term xy}. The first case corresponds to GREEDY
-matching, which first matches with the left-most symbol and only matches the
-next symbol in case of a mismatch (this is greedy in the sense of preferring
-instant gratification to delayed repletion). The second case is POSIX
-matching, which prefers the longest match.
+If a regular expression matches a string, then in general there is more
+than one way of how the string is matched. There are two commonly used
+disambiguation strategies to generate a unique answer: one is called
+GREEDY matching \cite{Frisch2004} and the other is POSIX
+matching~\cite{POSIX,Kuklewicz,OkuiSuzuki2010,Sulzmann2014,Vansummeren2006}.
+For example consider the string @{term xy} and the regular expression
+\mbox{@{term "STAR (ALT (ALT x y) xy)"}}. Either the string can be
+matched in two `iterations' by the single letter-regular expressions
+@{term x} and @{term y}, or directly in one iteration by @{term xy}. The
+first case corresponds to GREEDY matching, which first matches with the
+left-most symbol and only matches the next symbol in case of a mismatch
+(this is greedy in the sense of preferring instant gratification to
+delayed repletion). The second case is POSIX matching, which prefers the
+longest match.
In the context of lexing, where an input string needs to be split up
into a sequence of tokens, POSIX is the more natural disambiguation
@@ -160,8 +163,7 @@
regular expressions, say @{text "r\<^bsub>key\<^esub>"} and @{text
"r\<^bsub>id\<^esub>"} for recognising keywords and identifiers,
respectively. There are a few underlying (informal) rules behind
-tokenising a string in a POSIX \cite{POSIX} fashion according to a
-collection of regular expressions:
+tokenising a string in a POSIX \cite{POSIX} fashion:
\begin{itemize}
\item[$\bullet$] \emph{The Longest Match Rule} (or \emph{``{M}aximal {M}unch {R}ule''}):
@@ -226,7 +228,8 @@
alternative is used. This `tree view' leads naturally to the idea that
regular expressions act as types and values as inhabiting those types
(see, for example, \cite{HosoyaVouillonPierce2005}). The value for
-the single `iteration', i.e.~the POSIX value, would look as follows
+matching @{text "xy"} in a single `iteration', i.e.~the POSIX value,
+would look as follows
\begin{center}
@{term "Stars [Seq (Char x) (Char y)]"}
@@ -234,8 +237,10 @@
\noindent where @{const Stars} has only a single-element list for the
single iteration and @{const Seq} indicates that @{term xy} is matched
-by a sequence regular expression, which we will in what follows
-write more formally as @{term "SEQ x y"}.
+by a sequence regular expression.
+
+%, which we will in what follows
+%write more formally as @{term "SEQ x y"}.
Sulzmann and Lu give a simple algorithm to calculate a value that
@@ -271,16 +276,6 @@
various subexpressions.''
\end{quote}
-%\footnote{The relation @{text "\<ge>\<^bsub>r\<^esub>"} defined by Sulzmann and Lu \cite{Sulzmann2014}
-%is a relation on the
-%values for the regular expression @{term r}; but it only holds between
-%@{term "v\<^sub>1"} and @{term "v\<^sub>2"} in cases where @{term "v\<^sub>1"} and @{term "v\<^sub>2"} have
-%the same flattening (underlying string). So a counterexample to totality is
-%given by taking two values @{term "v\<^sub>1"} and @{term "v\<^sub>2"} for @{term r} that
-%have different flattenings (see Section~\ref{posixsec}). A different
-%relation @{text "\<ge>\<^bsub>r,s\<^esub>"} on the set of values for @{term r}
-%with flattening @{term s} is definable by the same approach, and is indeed
-%total; but that is not what Proposition 1 of \cite{Sulzmann2014} does.}
\noindent {\bf Contributions:} We have implemented in Isabelle/HOL the
@@ -297,11 +292,14 @@
that given a string and a regular expression uniquely determines this value.
We also show that our definition is equivalent to an ordering
of values based on positions by Okui and Suzuki \cite{OkuiSuzuki2010}.
-Derivatives as calculated by Brzozowski's method are usually more complex
-regular expressions than the initial one; various optimisations are
-possible. We prove the correctness when simplifications of @{term "ALT ZERO
-r"}, @{term "ALT r ZERO"}, @{term "SEQ ONE r"} and @{term "SEQ r ONE"} to
-@{term r} are applied. We extend our results to ???
+
+%Derivatives as calculated by Brzozowski's method are usually more complex
+%regular expressions than the initial one; various optimisations are
+%possible. We prove the correctness when simplifications of @{term "ALT ZERO r"},
+%@{term "ALT r ZERO"}, @{term "SEQ ONE r"} and @{term "SEQ r ONE"} to
+%@{term r} are applied.
+
+We extend our results to ???
*}
--- a/thys/Journal/document/root.tex Wed Aug 15 13:48:57 2018 +0100
+++ b/thys/Journal/document/root.tex Thu Aug 16 01:12:00 2018 +0100
@@ -50,8 +50,8 @@
The advantage of the definition based on the
ordering is that it implements more directly the informal rules from the
POSIX standard.
- We also
- extend our results to additional constructors of regular
+ We also prove Sulzmann \& Lu's conjecture that their bitcoded version
+ of the POSIX algorithm is correct. Furthermore we extend our results to additional constructors of regular
expressions.} \renewcommand{\thefootnote}{\arabic{footnote}}
@@ -82,7 +82,7 @@
definition of a POSIX value is equivalent to an alternative definition
by Okui and Suzuki which identifies POSIX values as least elements
according to an ordering of values. We also prove the correctness of
-an optimised version of the POSIX matching algorithm and extend the
+Sulzmann's bitcoded version of the POSIX matching algorithm and extend the
results to additional constructors for regular expressions. \smallskip
{\bf Keywords:} POSIX matching, Derivatives of Regular Expressions,
--- a/thys/Lexer.thy Wed Aug 15 13:48:57 2018 +0100
+++ b/thys/Lexer.thy Thu Aug 16 01:12:00 2018 +0100
@@ -290,18 +290,14 @@
lemma flex_append:
shows "flex r f (s1 @ s2) = flex (ders s1 r) (flex r f s1) s2"
apply(induct s1 arbitrary: s2 r f)
- apply(simp)
- apply(drule_tac x="s2" in meta_spec)
- apply(drule_tac x="der a r" in meta_spec)
- apply(simp)
+ apply(simp_all)
done
lemma lexer_flex:
shows "lexer r s = (if nullable (ders s r)
then Some(flex r id s (mkeps (ders s r))) else None)"
apply(induct s arbitrary: r)
- apply(simp)
- apply(simp add: flex_fun_apply)
+ apply(simp_all add: flex_fun_apply)
done
unused_thms
--- a/thys/ROOT Wed Aug 15 13:48:57 2018 +0100
+++ b/thys/ROOT Thu Aug 16 01:12:00 2018 +0100
@@ -5,7 +5,7 @@
"Lexer"
(*"LexerExt"*)
"Simplifying"
- (*"Sulzmann"*)
+ "Sulzmann"
"Positions"
(*"PositionsExt"*)
"Exercises"
--- a/thys/Spec.thy Wed Aug 15 13:48:57 2018 +0100
+++ b/thys/Spec.thy Thu Aug 16 01:12:00 2018 +0100
@@ -174,6 +174,11 @@
by (induct s arbitrary: r)
(simp_all add: Ders_def der_correctness Der_def)
+lemma ders_append:
+ shows "ders (s1 @ s2) r = ders s2 (ders s1 r)"
+ apply(induct s1 arbitrary: s2 r)
+ apply(auto)
+ done
section {* Values *}
--- a/thys/Sulzmann.thy Wed Aug 15 13:48:57 2018 +0100
+++ b/thys/Sulzmann.thy Thu Aug 16 01:12:00 2018 +0100
@@ -167,16 +167,16 @@
fun
- aders :: "string \<Rightarrow> arexp \<Rightarrow> arexp"
+ aders :: "arexp \<Rightarrow> string \<Rightarrow> arexp"
where
- "aders [] r = r"
-| "aders (c # s) r = aders s (ader c r)"
+ "aders r [] = r"
+| "aders r (c#s) = aders (ader c r) s"
-fun
- alex :: "arexp \<Rightarrow> string \<Rightarrow> arexp"
-where
- "alex r [] = r"
-| "alex r (c#s) = alex (ader c r) s"
+lemma aders_append:
+ "aders r (s1 @ s2) = aders (aders r s1) s2"
+ apply(induct s1 arbitrary: r s2)
+ apply(simp_all)
+ done
lemma anullable_correctness:
shows "nullable (aerase r) = anullable r"
@@ -190,6 +190,11 @@
apply(simp_all)
done
+lemma aerase_internalise:
+ shows "aerase (internalise r) = r"
+ apply(induct r)
+ apply(simp_all add: aerase_fuse)
+ done
lemma aerase_ader:
shows "aerase (ader a r) = der a (aerase r)"
@@ -197,15 +202,8 @@
apply(simp_all add: aerase_fuse anullable_correctness)
done
-lemma aerase_internalise:
- shows "aerase (internalise r) = r"
- apply(induct r)
- apply(simp_all add: aerase_fuse)
- done
-
-
-lemma aerase_alex:
- shows "aerase (alex r s) = ders s (aerase r)"
+lemma aerase_aders:
+ shows "aerase (aders r s) = ders s (aerase r)"
apply(induct s arbitrary: r )
apply(simp_all add: aerase_ader)
done
@@ -255,18 +253,6 @@
done
-lemma alex_append:
- "alex r (s1 @ s2) = alex (alex r s1) s2"
- apply(induct s1 arbitrary: r s2)
- apply(simp_all)
- done
-
-lemma ders_append:
- shows "ders (s1 @ s2) r = ders s2 (ders s1 r)"
- apply(induct s1 arbitrary: s2 r)
- apply(auto)
- done
-
@@ -275,7 +261,7 @@
shows "\<Turnstile> v : r"
using assms
apply(induct)
- apply(auto intro: Prf.intros)
+ apply(auto intro: Prf.intros)
by (metis Prf.intros(6) Prf_elims(6) insert_iff list.simps(15) val.inject(5))
lemma Qa:
@@ -320,38 +306,34 @@
apply(auto elim!: Prf_elims)[1]
by (simp add: retrieve_afuse2 aerase_ader)
-
-
-
lemma MAIN:
assumes "\<Turnstile> v : ders s r"
- shows "code (flex r id s v) = retrieve (alex (internalise r) s) v"
+ shows "code (flex r id s v) = retrieve (aders (internalise r) s) v"
using assms
- apply(induct s arbitrary: r v rule: rev_induct)
- apply(simp)
- apply (simp add: retrieve_encode)
- apply(simp add: flex_append alex_append)
+ apply(induct s arbitrary: v rule: rev_induct)
+ apply(simp)
+ apply(simp add: retrieve_encode)
+ apply(simp add: flex_append aders_append)
apply(subst Qb)
- apply (simp add: aerase_internalise ders_append aerase_alex)
- apply(simp add: aerase_alex aerase_internalise)
- apply(drule_tac x="r" in meta_spec)
+ apply(simp add: aerase_internalise ders_append aerase_aders)
+ apply(simp add: aerase_aders aerase_internalise)
apply(drule_tac x="injval (ders xs r) x v" in meta_spec)
apply(drule meta_mp)
- apply (simp add: Prf_injval ders_append)
+ apply(simp add: Prf_injval ders_append)
apply(simp)
done
fun alexer where
- "alexer r s = (if anullable (alex (internalise r) s) then
- decode (amkeps (alex (internalise r) s)) r else None)"
+ "alexer r s = (if anullable (aders (internalise r) s) then
+ decode (amkeps (aders (internalise r) s)) r else None)"
lemma FIN:
"alexer r s = lexer r s"
- apply(auto split: prod.splits)
- apply (smt MAIN Q00 Qa aerase_alex aerase_internalise anullable_correctness decode_code lexer_correctness(1) lexer_flex mkeps_nullable)
- apply (simp add: aerase_internalise anullable_correctness[symmetric] lexer_flex aerase_alex)
- done
+ apply(auto simp add: lexer_flex)
+ apply (smt MAIN Q00 Qa aerase_aders aerase_internalise decode_code lexer_correctness(1) lexer_flex mkeps_nullable)
+ apply (metis aerase_aders aerase_internalise anullable_correctness)
+ using aerase_aders aerase_internalise anullable_correctness by force
unused_thms
Binary file thys/journal.pdf has changed