# HG changeset patch # User Christian Urban # Date 1643500401 0 # Node ID 46e5566ad4ba2860e45f0cb2709b44277a20c961 # Parent d73f19be3ce66847e470bd5875c8c4f8b8f36399 updated diff -r d73f19be3ce6 -r 46e5566ad4ba thys/BitCodedCT.thy --- a/thys/BitCodedCT.thy Sat Jan 29 16:43:51 2022 +0000 +++ b/thys/BitCodedCT.thy Sat Jan 29 23:53:21 2022 +0000 @@ -29,7 +29,7 @@ where "decode' ds ZERO = (Void, [])" | "decode' ds ONE = (Void, ds)" -| "decode' ds (CHAR d) = (Char d, ds)" +| "decode' ds (CH d) = (Char d, ds)" | "decode' [] (ALT r1 r2) = (Void, [])" | "decode' (Z # ds) (ALT r1 r2) = (let (v, ds') = decode' ds r1 in (Left v, ds'))" | "decode' (S # ds) (ALT r1 r2) = (let (v, ds') = decode' ds r2 in (Right v, ds'))" diff -r d73f19be3ce6 -r 46e5566ad4ba thys2/Paper/Paper.thy --- a/thys2/Paper/Paper.thy Sat Jan 29 16:43:51 2022 +0000 +++ b/thys2/Paper/Paper.thy Sat Jan 29 23:53:21 2022 +0000 @@ -34,24 +34,31 @@ text {* -Brzozowski \cite{Brzozowski1964} introduced the notion of the {\em -derivative} @{term "der c r"} of a regular expression \r\ w.r.t.\ -a character~\c\, and showed that it gave a simple solution to the -problem of matching a string @{term s} with a regular expression @{term -r}: if the derivative of @{term r} w.r.t.\ (in succession) all the -characters of the string matches the empty string, then @{term r} -matches @{term s} (and {\em vice versa}). The derivative has the -property (which may almost be regarded as its specification) that, for -every string @{term s} and regular expression @{term r} and character -@{term c}, one has @{term "cs \ L(r)"} if and only if \mbox{@{term "s \ L(der c r)"}}. -The beauty of Brzozowski's derivatives is that -they are neatly expressible in any functional language, and easily -definable and reasoned about in theorem provers---the definitions just -consist of inductive datatypes and simple recursive functions. A -mechanised correctness proof of Brzozowski's matcher in for example HOL4 -has been mentioned by Owens and Slind~\cite{Owens2008}. Another one in -Isabelle/HOL is part of the work by Krauss and Nipkow \cite{Krauss2011}. -And another one in Coq is given by Coquand and Siles \cite{Coquand2012}. +In the last fifteen or so years, Brzozowski's derivatives of regular +expressions have sparked quite a bit of interest in the functional +programming and theorem prover communities. The beauty of +Brzozowski's derivatives \cite{Brzozowski1964} is that they are neatly +expressible in any functional language, and easily definable and +reasoned about in theorem provers---the definitions just consist of +inductive datatypes and simple recursive functions. A mechanised +correctness proof of Brzozowski's matcher in for example HOL4 has been +mentioned by Owens and Slind~\cite{Owens2008}. Another one in +Isabelle/HOL is part of the work by Krauss and Nipkow +\cite{Krauss2011}. And another one in Coq is given by Coquand and +Siles \cite{Coquand2012}. + + +The notion of derivatives +\cite{Brzozowski1964}, written @{term "der c r"}, of a regular +expression give a simple solution to the problem of matching a string +@{term s} with a regular expression @{term r}: if the derivative of +@{term r} w.r.t.\ (in succession) all the characters of the string +matches the empty string, then @{term r} matches @{term s} (and {\em +vice versa}). The derivative has the property (which may almost be +regarded as its specification) that, for every string @{term s} and +regular expression @{term r} and character @{term c}, one has @{term +"cs \ L(r)"} if and only if \mbox{@{term "s \ L(der c r)"}}. + If a regular expression matches a string, then in general there is more than one way of how the string is matched. There are two commonly used @@ -158,6 +165,8 @@ section {* Simplification *} text {* + Sulzmann \& Lu apply simplification via a fixpoint operation; also does not use erase to filter out duplicates. + not direct correspondence with PDERs, because of example problem with retrieve diff -r d73f19be3ce6 -r 46e5566ad4ba thys2/Paper/document/root.tex --- a/thys2/Paper/document/root.tex Sat Jan 29 16:43:51 2022 +0000 +++ b/thys2/Paper/document/root.tex Sat Jan 29 23:53:21 2022 +0000 @@ -59,37 +59,45 @@ \begin{abstract} Sulzmann and Lu described a lexing algorithm that calculates - Brzozowski derivatives using bit-sequences annotated to regular + Brzozowski derivatives using bitcodes annotated to regular expressions. Their algorithm generates POSIX values which encode the information of \emph{how} a regular expression matches a string---that is, which part of the string is matched by which part - of the regular expression. The purpose of the bit-sequences in - Sulzmann and Lu's algorithm is to keep the size of derivatives small - which is achieved by `aggressively' simplifying regular expressions. - In this paper we describe a slight variant of Sulzmann and Lu's - algorithm and \textit{(i)} prove that this algorithm generates - unique POSIX values; \textit{(ii)} we also establish a cubic bound - for the size of the derivatives---in earlier works, derivatives can - grow exponentially even after simplification. + of the regular expression. The purpose of the bitcodes in Sulzmann + and Lu's algorithm is to generate POSIX values incrementally while + derivatives are calculated. However they also help with designing + `aggressive' simplification methods that keep the size of + derivatives small. Without simplification derivatives can grow + exponentially resulting in an extremely slow lexing algorithm. In this + paper we describe a variant of Sulzmann and Lu's algorithm: Our + algorithm is a small, recursive functional program, whereas Sulzmann + and Lu's version involves a fixpoint construction. We \textit{(i)} + prove in Isabelle/HOL that our program is correct and generates + unique POSIX values; we also \textit{(ii)} establish a polynomial + bound for the size of the derivatives. The size can be seen as a + proxy measure for the effeciency of the lexing algorithm---that means + our algorithm does not suffer from the exponential blowup. -%Brzozowski introduced the notion of derivatives for regular -%expressions. They can be used for a very simple regular expression -%matching algorithm. Sulzmann and Lu cleverly extended this algorithm -%in order to deal with POSIX matching, which is the underlying -%disambiguation strategy for regular expressions needed in lexers. -%Their algorithm generates POSIX values which encode the information of -%\emph{how} a regular expression matches a string---that is, which part -%of the string is matched by which part of the regular expression. In -%this paper we give our inductive definition of what a POSIX value is -%and show $(i)$ that such a value is unique (for given regular -%expression and string being matched) and $(ii)$ that Sulzmann and Lu's -%algorithm always generates such a value (provided that the regular -%expression matches the string). We show that $(iii)$ our inductive -%definition of a POSIX value is equivalent to an alternative definition -%by Okui and Suzuki which identifies POSIX values as least elements -%according to an ordering of values. We also prove the correctness of -%Sulzmann's bitcoded version of the POSIX matching algorithm and extend the -%results to additional constructors for regular expressions. \smallskip + % Brzozowski introduced the notion of derivatives for regular + % expressions. They can be used for a very simple regular expression + % matching algorithm. Sulzmann and Lu cleverly extended this + % algorithm in order to deal with POSIX matching, which is the + % underlying disambiguation strategy for regular expressions needed + % in lexers. Their algorithm generates POSIX values which encode + % the information of \emph{how} a regular expression matches a + % string---that is, which part of the string is matched by which + % part of the regular expression. In this paper we give our + % inductive definition of what a POSIX value is and show $(i)$ that + % such a value is unique (for given regular expression and string + % being matched) and $(ii)$ that Sulzmann and Lu's algorithm always + % generates such a value (provided that the regular expression + % matches the string). We show that $(iii)$ our inductive definition + % of a POSIX value is equivalent to an alternative definition by + % Okui and Suzuki which identifies POSIX values as least elements + % according to an ordering of values. We also prove the correctness + % of Sulzmann's bitcoded version of the POSIX matching algorithm and + % extend the results to additional constructors for regular + % expressions. \smallskip \end{abstract} diff -r d73f19be3ce6 -r 46e5566ad4ba thys2/SizeBound4.thy --- a/thys2/SizeBound4.thy Sat Jan 29 16:43:51 2022 +0000 +++ b/thys2/SizeBound4.thy Sat Jan 29 23:53:21 2022 +0000 @@ -1032,12 +1032,78 @@ using blexer_correctness main_blexer_simp by simp +(* below is the idempotency of bsimp *) -lemma asize_idem: - shows "asize (bsimp (bsimp r)) = asize (bsimp r)" +lemma bsimp_ASEQ_fuse: + shows "fuse bs1 (bsimp_ASEQ bs2 r1 r2) = bsimp_ASEQ (bs1 @ bs2) r1 r2" + apply(induct r1 r2 arbitrary: bs1 bs2 rule: bsimp_ASEQ.induct) + apply(auto) + done + +lemma bsimp_AALTs_fuse: + assumes "\r \ set rs. fuse bs1 (fuse bs2 r) = fuse (bs1 @ bs2) r" + shows "fuse bs1 (bsimp_AALTs bs2 rs) = bsimp_AALTs (bs1 @ bs2) rs" + using assms + apply(induct bs2 rs arbitrary: bs1 rule: bsimp_AALTs.induct) + apply(auto) + done + +lemma bsimp_fuse: + shows "fuse bs (bsimp r) = bsimp (fuse bs r)" + apply(induct r arbitrary: bs) + apply(simp_all add: bsimp_ASEQ_fuse bsimp_AALTs_fuse fuse_append) + done + +lemma bsimp_ASEQ_idem: + assumes "bsimp (bsimp r1) = bsimp r1" "bsimp (bsimp r2) = bsimp r2" + shows "bsimp (bsimp_ASEQ x1 (bsimp r1) (bsimp r2)) = bsimp_ASEQ x1 (bsimp r1) (bsimp r2)" + using assms + apply(case_tac "bsimp r1 = AZERO") + apply(simp) + apply(case_tac "bsimp r2 = AZERO") + apply(simp) + apply(case_tac "\bs. bsimp r1 = AONE bs") + apply(auto)[1] + apply (metis bsimp_fuse) + apply(simp add: bsimp_ASEQ1) + done + +lemma bsimp_AALTs_idem: + assumes "\r \ set rs. bsimp (bsimp r) = bsimp r" + shows "bsimp (bsimp_AALTs bs rs) = bsimp_AALTs bs (map bsimp rs)" + using assms + apply(induct bs rs rule: bsimp_AALTs.induct) + apply(simp) + apply(simp) + using bsimp_fuse apply presburger + oops + +lemma bsimp_idem_rev: + shows "\r2. bsimp r1 \ r2" + apply(induct r1 rule: bsimp.induct) + apply(auto) + defer + defer + using rrewrite.simps apply blast + using rrewrite.cases apply blast + using rrewrite.simps apply blast + using rrewrite.cases apply blast + apply(case_tac "bsimp r1 = AZERO") + apply(simp) + apply(case_tac "bsimp r2 = AZERO") + apply(simp) + apply(case_tac "\bs. bsimp r1 = AONE bs") + apply(auto)[1] + prefer 2 + apply (smt (verit, best) arexp.distinct(25) arexp.inject(3) bsimp_ASEQ1 rrewrite.simps) + defer + oops + +lemma bsimp_idem: + shows "bsimp (bsimp r) = bsimp r" apply(induct r rule: bsimp.induct) apply(auto) - prefer 2 + using bsimp_ASEQ_idem apply presburger oops export_code blexer_simp blexer lexer bders bders_simp in Scala module_name VerifiedLexers diff -r d73f19be3ce6 -r 46e5566ad4ba thys2/paper.pdf Binary file thys2/paper.pdf has changed