# HG changeset patch # User Christian Urban # Date 1643325926 0 # Node ID e1b74d618f1b013a023bf9d35e4b3fccd689bdbe # Parent cc8e231529fb408fffde99bda9d3803aafc4bb75 updated Sizebound4 diff -r cc8e231529fb -r e1b74d618f1b thys/LexerExt.thy --- a/thys/LexerExt.thy Tue Jan 25 13:12:50 2022 +0000 +++ b/thys/LexerExt.thy Thu Jan 27 23:25:26 2022 +0000 @@ -17,10 +17,15 @@ | "mkeps(NTIMES r n) = Stars (replicate n (mkeps r))" | "mkeps(FROMNTIMES r n) = Stars (replicate n (mkeps r))" | "mkeps(NMTIMES r n m) = Stars (replicate n (mkeps r))" - +| "mkeps(NOT ZERO) = Nt Void" +| "mkeps(NOT (CH _ )) = Nt Void" +| "mkeps(NOT (SEQ r1 r2)) = Seq (mkeps (NOT r1)) (mkeps (NOT r1))" +| "mkeps(NOT (ALT r1 r2)) = (if nullable(r1) then Right (mkeps (NOT r2)) else (mkeps (NOT r1)))" + + fun injval :: "rexp \ char \ val \ val" where - "injval (CHAR d) c Void = Char d" + "injval (CH d) c Void = Char d" | "injval (ALT r1 r2) c (Left v1) = Left(injval r1 c v1)" | "injval (ALT r1 r2) c (Right v2) = Right(injval r2 c v2)" | "injval (SEQ r1 r2) c (Seq v1 v2) = Seq (injval r1 c v1) v2" @@ -49,9 +54,11 @@ shows "flat (mkeps r) = []" using assms apply(induct rule: nullable.induct) - apply(auto) - by presburger - + apply(auto) + apply presburger + apply(case_tac r) + apply(auto) + sorry lemma mkeps_nullable: assumes "nullable(r)" @@ -81,7 +88,7 @@ apply (simp add: mkeps_flat) apply(simp) apply(simp) -done + sorry lemma Prf_injval_flat: @@ -90,7 +97,7 @@ using assms apply(induct arbitrary: v rule: der.induct) apply(auto elim!: Prf_elims intro: mkeps_flat split: if_splits) -done + sorry lemma Prf_injval: assumes "\ v : der c r" @@ -170,7 +177,8 @@ apply(simp add: Prf_injval_flat) apply(simp) apply(simp) -done + sorry + @@ -186,8 +194,10 @@ apply(auto intro: Posix.intros simp add: nullable_correctness Sequ_def) apply(subst append.simps(1)[symmetric]) apply(rule Posix.intros) - apply(auto) - done + apply(auto) + apply(case_tac r) + apply(auto) + sorry lemma Posix_injval: @@ -207,22 +217,22 @@ then have "False" by cases then show "(c # s) \ ONE \ (injval ONE c v)" by simp next - case (CHAR d) + case (CH d) consider (eq) "c = d" | (ineq) "c \ d" by blast - then show "(c # s) \ (CHAR d) \ (injval (CHAR d) c v)" + then show "(c # s) \ (CH d) \ (injval (CH d) c v)" proof (cases) case eq - have "s \ der c (CHAR d) \ v" by fact + have "s \ der c (CH d) \ v" by fact then have "s \ ONE \ v" using eq by simp then have eqs: "s = [] \ v = Void" by cases simp - show "(c # s) \ CHAR d \ injval (CHAR d) c v" using eq eqs + show "(c # s) \ CH d \ injval (CH d) c v" using eq eqs by (auto intro: Posix.intros) next case ineq - have "s \ der c (CHAR d) \ v" by fact + have "s \ der c (CH d) \ v" by fact then have "s \ ZERO \ v" using ineq by simp then have "False" by cases - then show "(c # s) \ CHAR d \ injval (CHAR d) c v" by simp + then show "(c # s) \ CH d \ injval (CH d) c v" by simp qed next case (ALT r1 r2) @@ -319,7 +329,6 @@ apply(simp) apply(subgoal_tac "\vss. v2 = Stars vss") apply(clarify) - apply(drule_tac x="v1" in meta_spec) apply(drule_tac x="vss" in meta_spec) apply(drule_tac x="s1" in meta_spec) apply(drule_tac x="s2" in meta_spec) @@ -401,7 +410,6 @@ apply(simp) apply(subgoal_tac "\vss. v2 = Stars vss") apply(clarify) - apply(drule_tac x="v1" in meta_spec) apply(drule_tac x="vss" in meta_spec) apply(drule_tac x="s1" in meta_spec) apply(drule_tac x="s2" in meta_spec) @@ -454,7 +462,6 @@ apply(simp) apply(subgoal_tac "\vss. v2 = Stars vss") apply(clarify) - apply(drule_tac x="v1" in meta_spec) apply(drule_tac x="vss" in meta_spec) apply(drule_tac x="s1" in meta_spec) apply(drule_tac x="s2" in meta_spec) @@ -544,7 +551,6 @@ apply(simp) apply(subgoal_tac "\vss. v2 = Stars vss") apply(clarify) - apply(drule_tac x="v1" in meta_spec) apply(drule_tac x="vss" in meta_spec) apply(drule_tac x="s1" in meta_spec) apply(drule_tac x="s2" in meta_spec) @@ -618,6 +624,9 @@ apply(simp) done qed + next + case (NOT r s v) + then show ?case sorry qed section {* Lexer Correctness *} diff -r cc8e231529fb -r e1b74d618f1b thys/SpecExt.thy --- a/thys/SpecExt.thy Tue Jan 25 13:12:50 2022 +0000 +++ b/thys/SpecExt.thy Thu Jan 27 23:25:26 2022 +0000 @@ -1,6 +1,6 @@ theory SpecExt - imports Main (*"~~/src/HOL/Library/Sublist"*) + imports Main "HOL-Library.Sublist" (*"~~/src/HOL/Library/Sublist"*) begin section {* Sequential Composition of Languages *} @@ -165,10 +165,6 @@ apply(auto simp add: Sequ_def) done -lemma - assumes "[] \ A" "n \ 0" "A \ {}" - shows "A \ (Suc n) = A \ n" - lemma Der_Pow_0: shows "Der c (A \ 0) = {}" by(simp add: Der_def) @@ -221,7 +217,7 @@ datatype rexp = ZERO | ONE -| CHAR char +| CH char | SEQ rexp rexp | ALT rexp rexp | STAR rexp @@ -238,7 +234,7 @@ where "L (ZERO) = {}" | "L (ONE) = {[]}" -| "L (CHAR c) = {[c]}" +| "L (CH c) = {[c]}" | "L (SEQ r1 r2) = (L r1) ;; (L r2)" | "L (ALT r1 r2) = (L r1) \ (L r2)" | "L (STAR r) = (L r)\" @@ -255,7 +251,7 @@ where "nullable (ZERO) = False" | "nullable (ONE) = True" -| "nullable (CHAR c) = False" +| "nullable (CH c) = False" | "nullable (ALT r1 r2) = (nullable r1 \ nullable r2)" | "nullable (SEQ r1 r2) = (nullable r1 \ nullable r2)" | "nullable (STAR r) = True" @@ -270,7 +266,7 @@ where "der c (ZERO) = ZERO" | "der c (ONE) = ZERO" -| "der c (CHAR d) = (if c = d then ONE else ZERO)" +| "der c (CH d) = (if c = d then ONE else ZERO)" | "der c (ALT r1 r2) = ALT (der c r1) (der c r2)" | "der c (SEQ r1 r2) = (if nullable r1 @@ -340,48 +336,6 @@ by (metis append.assoc) - -lemma - "L(FROMNTIMES r n) = L(SEQ (NTIMES r n) (STAR r))" - apply(auto simp add: Sequ_def) - defer - apply(subgoal_tac "\m. s2 \ (L r) \ m") - prefer 2 - apply (simp add: Star_Pow) - apply(auto) - apply(rule_tac x="n + m" in bexI) - apply (simp add: pow_add) - apply simp - apply(subgoal_tac "\m. m + n = xa") - apply(auto) - prefer 2 - using le_add_diff_inverse2 apply auto[1] - by (smt Pow_Star Sequ_def add.commute mem_Collect_eq pow_add2) - -lemma - "L (der c (FROMNTIMES r n)) = - L (SEQ (der c r) (FROMNTIMES r (n - 1)))" - apply(auto simp add: Sequ_def) - using Star_Pow apply blast - using Pow_Star by blast - -lemma - "L (der c (UPNTIMES r n)) = - L(if n = 0 then ZERO else - ALT (der c r) (SEQ (der c r) (UPNTIMES r (n - 1))))" - apply(auto simp add: Sequ_def) - using SpecExt.Pow_empty by blast - -abbreviation "FROM \ FROMNTIMES" - -lemma - shows "L (der c (FROM r n)) = - L (if n <= 0 then SEQ (der c r) (ALT ONE (FROM r 0)) - else SEQ (der c r) (ALT ZERO (FROM r (n -1))))" - apply(auto simp add: Sequ_def) - oops - - fun ders :: "string \ rexp \ rexp" where @@ -454,8 +408,10 @@ apply(auto simp add: Sequ_def Der_def Cons_eq_append_conv)[1] apply(rule_tac x="Suc xa" in bexI) apply(auto simp add: Sequ_def)[2] -apply (metis append_Cons) -apply (metis (no_types, hide_lams) Pow_decomp atMost_iff diff_Suc_eq_diff_pred diff_is_0_eq) + apply (metis append_Cons) + apply(rule_tac x="xa - 1" in bexI) + apply(auto simp add: Sequ_def)[2] + apply (metis One_nat_def Pow_decomp) apply(rule impI)+ apply(subst Sequ_Union_in) apply(subst Der_Pow_Sequ[symmetric]) @@ -485,7 +441,7 @@ | Right val | Left val | Stars "val list" - +| Nt val section {* The string behind a value *} @@ -499,6 +455,7 @@ | "flat (Seq v1 v2) = (flat v1) @ (flat v2)" | "flat (Stars []) = []" | "flat (Stars (v#vs)) = (flat v) @ (flat (Stars vs))" +| "flat (Nt v) = flat v" abbreviation "flats vs \ concat (map flat vs)" @@ -597,7 +554,7 @@ | "\ v1 : r1 \ \ Left v1 : ALT r1 r2" | "\ v2 : r2 \ \ Right v2 : ALT r1 r2" | "\ Void : ONE" -| "\ Char c : CHAR c" +| "\ Char c : CH c" | "\\v \ set vs. \ v : r \ flat v \ []\ \ \ Stars vs : STAR r" | "\\v \ set vs. \ v : r \ flat v \ []; length vs \ n\ \ \ Stars vs : UPNTIMES r n" | "\\v \ set vs1. \ v : r \ flat v \ []; @@ -622,7 +579,7 @@ "\ v : SEQ r1 r2" "\ v : ALT r1 r2" "\ v : ONE" - "\ v : CHAR c" + "\ v : CH c" "\ vs : STAR r" "\ vs : UPNTIMES r n" "\ vs : NTIMES r n" @@ -867,7 +824,10 @@ then obtain vs where "flats vs = s" "\v\set vs. \ v : r \ flat v \ []" "length vs \ n" using IH flats_cval_nonempty by (smt order.trans) then show "\v. \ v : UPNTIMES r n \ flat v = s" - using Prf.intros(7) flat_Stars by blast + using Prf.intros(7) flat_Stars by blast +next + case (NOT r) + then show ?case sorry qed (auto intro: Prf.intros) @@ -917,7 +877,7 @@ lemma LV_simps: shows "LV ZERO s = {}" and "LV ONE s = (if s = [] then {Void} else {})" - and "LV (CHAR c) s = (if s = [c] then {Char c} else {})" + and "LV (CH c) s = (if s = [c] then {Char c} else {})" and "LV (ALT r1 r2) s = Left ` LV r1 s \ Right ` LV r2 s" unfolding LV_def apply(auto intro: Prf.intros elim: Prf.cases) @@ -1226,8 +1186,8 @@ case (ONE s) show "finite (LV ONE s)" by (simp add: LV_simps) next - case (CHAR c s) - show "finite (LV (CHAR c) s)" by (simp add: LV_simps) + case (CH c s) + show "finite (LV (CH c) s)" by (simp add: LV_simps) next case (ALT r1 r2 s) then show "finite (LV (ALT r1 r2) s)" by (simp add: LV_simps) @@ -1270,7 +1230,10 @@ case (NMTIMES r n m s) have "\s. finite (LV r s)" by fact then show "finite (LV (NMTIMES r n m) s)" - by (simp add: LV_NMTIMES_6) + by (simp add: LV_NMTIMES_6) +next + case (NOT r s) + then show ?case sorry qed @@ -1281,7 +1244,7 @@ Posix :: "string \ rexp \ val \ bool" ("_ \ _ \ _" [100, 100, 100] 100) where Posix_ONE: "[] \ ONE \ Void" -| Posix_CHAR: "[c] \ (CHAR c) \ (Char c)" +| Posix_CHAR: "[c] \ (CH c) \ (Char c)" | Posix_ALT1: "s \ r1 \ v \ s \ (ALT r1 r2) \ (Left v)" | Posix_ALT2: "\s \ r2 \ v; s \ L(r1)\ \ s \ (ALT r1 r2) \ (Right v)" | Posix_SEQ: "\s1 \ r1 \ v1; s2 \ r2 \ v2; @@ -1320,7 +1283,7 @@ inductive_cases Posix_elims: "s \ ZERO \ v" "s \ ONE \ v" - "s \ CHAR c \ v" + "s \ CH c \ v" "s \ ALT r1 r2 \ v" "s \ SEQ r1 r2 \ v" "s \ STAR r \ v" @@ -1396,7 +1359,7 @@ then show "Void = v2" by cases auto next case (Posix_CHAR c v2) - have "[c] \ CHAR c \ v2" by fact + have "[c] \ CH c \ v2" by fact then show "Char c = v2" by cases auto next case (Posix_ALT1 s r1 v r2 v2) @@ -1503,14 +1466,9 @@ using Posix1(1) Posix1(2) apply blast apply(case_tac n) apply(simp) - apply(simp) - apply(drule_tac x="va" in meta_spec) - apply(drule_tac x="vs" in meta_spec) - apply(simp) - apply(drule meta_mp) - apply (metis L.simps(9) Posix1(1) UN_E append.right_neutral append_Nil diff_Suc_1 local.Posix_FROMNTIMES1(4) val.inject(5)) - apply (metis L.simps(9) Posix1(1) UN_E append.right_neutral append_Nil) - by (metis One_nat_def Posix1(1) Posix_FROMNTIMES1.hyps(7) self_append_conv self_append_conv2) + apply(simp) + apply (smt (verit, ccfv_threshold) L.simps(9) Posix1(1) UN_E append_eq_append_conv2) + by (metis One_nat_def Posix1(1) Posix_FROMNTIMES1.hyps(7) append_Nil2 append_self_conv2) moreover have IHs: "\v2. s1 \ r \ v2 \ v = v2" "\v2. s2 \ FROMNTIMES r (n - 1) \ v2 \ Stars vs = v2" by fact+ @@ -1555,18 +1513,8 @@ apply(case_tac m) apply(simp) apply(simp) - apply(drule_tac x="va" in meta_spec) - apply(drule_tac x="vs" in meta_spec) - apply(simp) - apply(drule meta_mp) - apply(drule Posix1(1)) - apply(drule Posix1(1)) - apply(drule Posix1(1)) - apply(frule Posix1(1)) - apply(simp) - using Posix_NMTIMES1.hyps(4) apply force - apply (metis L.simps(10) Posix1(1) UN_E append_Nil2 append_self_conv2) - by (metis One_nat_def Posix1(1) Posix_NMTIMES1.hyps(8) append.right_neutral append_Nil) + apply (metis L.simps(10) Posix1(1) UN_E append.right_neutral append_Nil) + by (metis One_nat_def Posix1(1) Posix_NMTIMES1.hyps(8) append_Nil self_append_conv) moreover have IHs: "\v2. s1 \ r \ v2 \ v = v2" "\v2. s2 \ NMTIMES r (n - 1) (m - 1) \ v2 \ Stars vs = v2" by fact+ @@ -1616,7 +1564,6 @@ apply(auto simp add: intro!: Prf.intros elim!: Prf_elims)[2] apply (metis (mono_tags, lifting) Prf.intros(9) append_Nil empty_iff flat_Stars flats_empty list.set(1) mem_Collect_eq) apply(simp) - apply(clarify) apply(case_tac n) apply(simp) apply(simp) @@ -1632,7 +1579,6 @@ apply(simp) apply(simp) apply(simp) - apply(clarify) apply(erule Prf_elims) apply(simp) apply(rule Prf.intros) @@ -1660,7 +1606,6 @@ apply(simp) apply (metis Prf.intros(11) append_Nil empty_iff list.set(1)) apply(simp) - apply(clarify) apply(rotate_tac 6) apply(erule Prf_elims) apply(simp) @@ -1675,7 +1620,6 @@ apply(simp) apply(simp) apply(simp) - apply(clarify) apply(rotate_tac 6) apply(erule Prf_elims) apply(simp) diff -r cc8e231529fb -r e1b74d618f1b thys2/Paper/Paper.thy --- a/thys2/Paper/Paper.thy Tue Jan 25 13:12:50 2022 +0000 +++ b/thys2/Paper/Paper.thy Thu Jan 27 23:25:26 2022 +0000 @@ -7,7 +7,164 @@ "../SizeBound4" "HOL-Library.LaTeXsugar" begin + +declare [[show_question_marks = false]] + +abbreviation + "der_syn r c \ der c r" + +notation (latex output) + der_syn ("_\\_" [79, 1000] 76) and + + ZERO ("\<^bold>0" 81) and + ONE ("\<^bold>1" 81) and + CH ("_" [1000] 80) and + ALT ("_ + _" [77,77] 78) and + SEQ ("_ \ _" [77,77] 78) and + STAR ("_\<^sup>\" [79] 78) + (*>*) + +section {* Introduction *} + +text {* + +Brzozowski \cite{Brzozowski1964} introduced the notion of the {\em +derivative} @{term "der c r"} of a regular expression \r\ w.r.t.\ +a character~\c\, and showed that it gave a simple solution to the +problem of matching a string @{term s} with a regular expression @{term +r}: if the derivative of @{term r} w.r.t.\ (in succession) all the +characters of the string matches the empty string, then @{term r} +matches @{term s} (and {\em vice versa}). The derivative has the +property (which may almost be regarded as its specification) that, for +every string @{term s} and regular expression @{term r} and character +@{term c}, one has @{term "cs \ L(r)"} if and only if \mbox{@{term "s \ L(der c r)"}}. +The beauty of Brzozowski's derivatives is that +they are neatly expressible in any functional language, and easily +definable and reasoned about in theorem provers---the definitions just +consist of inductive datatypes and simple recursive functions. A +mechanised correctness proof of Brzozowski's matcher in for example HOL4 +has been mentioned by Owens and Slind~\cite{Owens2008}. Another one in +Isabelle/HOL is part of the work by Krauss and Nipkow \cite{Krauss2011}. +And another one in Coq is given by Coquand and Siles \cite{Coquand2012}. + +If a regular expression matches a string, then in general there is more +than one way of how the string is matched. There are two commonly used +disambiguation strategies to generate a unique answer: one is called +GREEDY matching \cite{Frisch2004} and the other is POSIX +matching~\cite{POSIX,Kuklewicz,OkuiSuzuki2010,Sulzmann2014,Vansummeren2006}. +For example consider the string @{term xy} and the regular expression +\mbox{@{term "STAR (ALT (ALT x y) xy)"}}. Either the string can be +matched in two `iterations' by the single letter-regular expressions +@{term x} and @{term y}, or directly in one iteration by @{term xy}. The +first case corresponds to GREEDY matching, which first matches with the +left-most symbol and only matches the next symbol in case of a mismatch +(this is greedy in the sense of preferring instant gratification to +delayed repletion). The second case is POSIX matching, which prefers the +longest match. + + + +\begin{figure}[t] +\begin{center} +\begin{tikzpicture}[scale=2,node distance=1.3cm, + every node/.style={minimum size=6mm}] +\node (r1) {@{term "r\<^sub>1"}}; +\node (r2) [right=of r1]{@{term "r\<^sub>2"}}; +\draw[->,line width=1mm](r1)--(r2) node[above,midway] {@{term "der a DUMMY"}}; +\node (r3) [right=of r2]{@{term "r\<^sub>3"}}; +\draw[->,line width=1mm](r2)--(r3) node[above,midway] {@{term "der b DUMMY"}}; +\node (r4) [right=of r3]{@{term "r\<^sub>4"}}; +\draw[->,line width=1mm](r3)--(r4) node[above,midway] {@{term "der c DUMMY"}}; +\draw (r4) node[anchor=west] {\;\raisebox{3mm}{@{term nullable}}}; +\node (v4) [below=of r4]{@{term "v\<^sub>4"}}; +\draw[->,line width=1mm](r4) -- (v4); +\node (v3) [left=of v4] {@{term "v\<^sub>3"}}; +\draw[->,line width=1mm](v4)--(v3) node[below,midway] {\inj r\<^sub>3 c\}; +\node (v2) [left=of v3]{@{term "v\<^sub>2"}}; +\draw[->,line width=1mm](v3)--(v2) node[below,midway] {\inj r\<^sub>2 b\}; +\node (v1) [left=of v2] {@{term "v\<^sub>1"}}; +\draw[->,line width=1mm](v2)--(v1) node[below,midway] {\inj r\<^sub>1 a\}; +\draw (r4) node[anchor=north west] {\;\raisebox{-8mm}{@{term "mkeps"}}}; +\end{tikzpicture} +\end{center} +\mbox{}\\[-13mm] + +\caption{The two phases of the algorithm by Sulzmann \& Lu \cite{Sulzmann2014}, +matching the string @{term "[a,b,c]"}. The first phase (the arrows from +left to right) is \Brz's matcher building successive derivatives. If the +last regular expression is @{term nullable}, then the functions of the +second phase are called (the top-down and right-to-left arrows): first +@{term mkeps} calculates a value @{term "v\<^sub>4"} witnessing +how the empty string has been recognised by @{term "r\<^sub>4"}. After +that the function @{term inj} ``injects back'' the characters of the string into +the values. +\label{Sulz}} +\end{figure} + + +*} + +section {* Background *} + +text {* + Sulzmann-Lu algorithm with inj. State that POSIX rules. + metion slg is correct. + + + \begin{figure}[t] + \begin{center} + \begin{tabular}{c} + @{thm[mode=Axiom] Posix.intros(1)}\P\@{term "ONE"} \qquad + @{thm[mode=Axiom] Posix.intros(2)}\P\@{term "c"}\medskip\\ + @{thm[mode=Rule] Posix.intros(3)[of "s" "r\<^sub>1" "v" "r\<^sub>2"]}\P+L\\qquad + @{thm[mode=Rule] Posix.intros(4)[of "s" "r\<^sub>2" "v" "r\<^sub>1"]}\P+R\\medskip\\ + $\mprset{flushleft} + \inferrule + {@{thm (prem 1) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \qquad + @{thm (prem 2) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \\\\ + @{thm (prem 3) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}} + {@{thm (concl) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}$\PS\\\ + @{thm[mode=Axiom] Posix.intros(7)}\P[]\\medskip\\ + $\mprset{flushleft} + \inferrule + {@{thm (prem 1) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad + @{thm (prem 2) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad + @{thm (prem 3) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \\\\ + @{thm (prem 4) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}} + {@{thm (concl) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}$\P\\ + \end{tabular} + \end{center} + \caption{Our inductive definition of POSIX values.}\label{POSIXrules} + \end{figure} + + +*} + +section {* Bitcoded Derivatives *} + +text {* + bitcoded regexes / decoding / bmkeps + gets rid of the second phase (only single phase) + correctness +*} + + +section {* Simplification *} + +text {* + not direct correspondence with PDERs, because of example + problem with retrieve + + correctness +*} + +section {* Bound - NO *} + +section {* Bounded Regex / Not *} + +section {* Conclusion *} + text {* \cite{AusafDyckhoffUrban2016} diff -r cc8e231529fb -r e1b74d618f1b thys2/Paper/document/root.tex --- a/thys2/Paper/document/root.tex Tue Jan 25 13:12:50 2022 +0000 +++ b/thys2/Paper/document/root.tex Thu Jan 27 23:25:26 2022 +0000 @@ -45,33 +45,50 @@ \titlerunning{POSIX Lexing with Bitcoded Derivatives} \author{Chengsong Tan}{King's College London}{chengsong.tan@kcl.ac.uk}{}{} \author{Christian Urban}{King's College London}{christian.urban@kcl.ac.uk}{}{} +\keywords{POSIX matching, Derivatives of Regular Expressions, Isabelle/HOL} +\category{} +\ccsdesc[100]{Design and analysis of algorithms} +\ccsdesc[100]{Formal languages and automata theory} +\Copyright{\mbox{}} +\nolinenumbers \begin{document} \maketitle \begin{abstract} -Brzozowski introduced the notion of derivatives for regular -expressions. They can be used for a very simple regular expression -matching algorithm. Sulzmann and Lu cleverly extended this algorithm -in order to deal with POSIX matching, which is the underlying -disambiguation strategy for regular expressions needed in lexers. -Their algorithm generates POSIX values which encode the information of -\emph{how} a regular expression matches a string---that is, which part -of the string is matched by which part of the regular expression. In -this paper we give our inductive definition of what a POSIX value is -and show $(i)$ that such a value is unique (for given regular -expression and string being matched) and $(ii)$ that Sulzmann and Lu's -algorithm always generates such a value (provided that the regular -expression matches the string). We show that $(iii)$ our inductive -definition of a POSIX value is equivalent to an alternative definition -by Okui and Suzuki which identifies POSIX values as least elements -according to an ordering of values. We also prove the correctness of -Sulzmann's bitcoded version of the POSIX matching algorithm and extend the -results to additional constructors for regular expressions. \smallskip - -{\bf Keywords:} POSIX matching, Derivatives of Regular Expressions, -Isabelle/HOL + Sulzmann and Lu described a lexing algorithm that calculates + Brzozowski derivatives using bit-sequences annotated to regular + expressions. Their algorithm generates POSIX values which encode + the information of \emph{how} a regular expression matches a + string---that is, which part of the string is matched by which part + of the regular expression. The purpose of the bit-sequences in + Sulzmann and Lu's algorithm is to keep the size of derivatives small + which is achieved by `aggressively' simplifying regular expressions. + In this paper we describe a slight variant of Sulzmann and Lu's + algorithm and \textit{(i)} prove that this algorithm generates + unique POSIX values; \textit{(ii)} we also establish a cubic bound + for the size of the derivatives---in earlier works, derivatives can + grow exponentially even after simplification. + +%Brzozowski introduced the notion of derivatives for regular +%expressions. They can be used for a very simple regular expression +%matching algorithm. Sulzmann and Lu cleverly extended this algorithm +%in order to deal with POSIX matching, which is the underlying +%disambiguation strategy for regular expressions needed in lexers. +%Their algorithm generates POSIX values which encode the information of +%\emph{how} a regular expression matches a string---that is, which part +%of the string is matched by which part of the regular expression. In +%this paper we give our inductive definition of what a POSIX value is +%and show $(i)$ that such a value is unique (for given regular +%expression and string being matched) and $(ii)$ that Sulzmann and Lu's +%algorithm always generates such a value (provided that the regular +%expression matches the string). We show that $(iii)$ our inductive +%definition of a POSIX value is equivalent to an alternative definition +%by Okui and Suzuki which identifies POSIX values as least elements +%according to an ordering of values. We also prove the correctness of +%Sulzmann's bitcoded version of the POSIX matching algorithm and extend the +%results to additional constructors for regular expressions. \smallskip \end{abstract} diff -r cc8e231529fb -r e1b74d618f1b thys2/SizeBound4.thy --- a/thys2/SizeBound4.thy Tue Jan 25 13:12:50 2022 +0000 +++ b/thys2/SizeBound4.thy Thu Jan 27 23:25:26 2022 +0000 @@ -132,15 +132,6 @@ apply(auto) done -lemma fuse_Nil: - shows "fuse [] r = r" - by (induct r)(simp_all) - -(* -lemma map_fuse_Nil: - shows "map (fuse []) rs = rs" - by (induct rs)(simp_all add: fuse_Nil) -*) fun intern :: "rexp \ arexp" where "intern ZERO = AZERO" @@ -280,23 +271,23 @@ shows "retrieve (fuse bs r) v = bs @ retrieve r v" using assms apply(induct r arbitrary: v bs) - apply(auto elim: Prf_elims)[4] - defer - using retrieve_encode_STARS - apply(auto elim!: Prf_elims)[1] - apply(case_tac vs) - apply(simp) - apply(simp) - (* AALTs case *) + apply(auto elim: Prf_elims)[4] + apply(case_tac x2a) + apply(simp) + using Prf_elims(1) apply blast + apply(case_tac x2a) + apply(simp) + apply(simp) + apply(case_tac list) apply(simp) - apply(case_tac x2a) - apply(simp) - apply(auto elim!: Prf_elims)[1] + apply(simp) + apply (smt (verit, best) Prf_elims(3) append_assoc retrieve.simps(4) retrieve.simps(5)) apply(simp) - apply(case_tac list) - apply(simp) - apply(auto) + using retrieve_encode_STARS apply(auto elim!: Prf_elims)[1] + apply(case_tac vs) + apply(simp) + apply(simp) done lemma retrieve_fuse: @@ -314,139 +305,92 @@ apply(simp_all add: retrieve_fuse retrieve_encode_STARS) done -(* -lemma bnullable_Hdbmkeps_Hd: - assumes "bnullable a" - shows "bmkeps (AALTs bs (a # rs)) = bs @ (bmkeps a)" - using assms by simp -*) - -lemma r2: - assumes "x \ set rs" "bnullable x" - shows "bnullable (AALTs bs rs)" +lemma retrieve_AALTs_bnullable1: + assumes "bnullable r" + shows "retrieve (AALTs bs (r # rs)) (mkeps (erase (AALTs bs (r # rs)))) + = bs @ retrieve r (mkeps (erase r))" using assms - apply(induct rs) - apply(auto) + apply(case_tac rs) + apply(auto simp add: bnullable_correctness) done -lemma r3: - assumes "\ bnullable r" - "bnullables rs" - shows "retrieve (AALTs bs rs) (mkeps (erase (AALTs bs rs))) = - retrieve (AALTs bs (r # rs)) (mkeps (erase (AALTs bs (r # rs))))" +lemma retrieve_AALTs_bnullable2: + assumes "\bnullable r" "bnullables rs" + shows "retrieve (AALTs bs (r # rs)) (mkeps (erase (AALTs bs (r # rs)))) + = retrieve (AALTs bs rs) (mkeps (erase (AALTs bs rs)))" using assms apply(induct rs arbitrary: r bs) - apply(auto)[1] apply(auto) using bnullable_correctness apply blast - apply(auto simp add: bnullable_correctness mkeps_nullable retrieve_fuse2) - apply(subst retrieve_fuse2[symmetric]) - apply (smt bnullable.simps(4) bnullable_correctness erase.simps(5) erase.simps(6) insert_iff list.exhaust list.set(2) mkeps.simps(3) mkeps_nullable) - apply(simp) - apply(case_tac "bnullable a") - apply (smt append_Nil2 bnullable.simps(4) bnullable_correctness erase.simps(5) erase.simps(6) fuse.simps(4) insert_iff list.exhaust list.set(2) mkeps.simps(3) mkeps_nullable retrieve_fuse2) - apply(drule_tac x="a" in meta_spec) - apply(drule_tac x="bs" in meta_spec) - apply(drule meta_mp) - apply(simp) - apply(drule meta_mp) - apply(auto) - apply(subst retrieve_fuse2[symmetric]) apply(case_tac rs) - apply(simp) - apply(auto)[1] - apply (simp add: bnullable_correctness) - - apply (metis append_Nil2 bnullable_correctness erase_fuse fuse.simps(4) list.set_intros(1) mkeps.simps(3) mkeps_nullable nullable.simps(4) r2) - apply (simp add: bnullable_correctness) - apply (metis append_Nil2 bnullable_correctness erase.simps(6) erase_fuse fuse.simps(4) list.set_intros(2) mkeps.simps(3) mkeps_nullable r2) - apply(simp) + apply(auto) + using bnullable_correctness apply blast + apply(case_tac rs) + apply(auto) done -lemma t: +lemma bmkeps_retrieve_AALTs: assumes "\r \ set rs. bnullable r \ bmkeps r = retrieve r (mkeps (erase r))" "bnullables rs" shows "bs @ bmkepss rs = retrieve (AALTs bs rs) (mkeps (erase (AALTs bs rs)))" using assms apply(induct rs arbitrary: bs) apply(auto) - apply (metis (no_types, opaque_lifting) bmkepss.cases bnullable_correctness erase.simps(5) erase.simps(6) mkeps.simps(3) retrieve.simps(3) retrieve.simps(4)) - apply (metis r3) - apply (metis (no_types, lifting) bmkepss.cases bnullable_correctness empty_iff erase.simps(6) list.set(1) mkeps.simps(3) retrieve.simps(4)) - apply (metis r3) - done - + using retrieve_AALTs_bnullable1 apply presburger + apply (metis retrieve_AALTs_bnullable2) + apply (simp add: retrieve_AALTs_bnullable1) + by (metis retrieve_AALTs_bnullable2) + + lemma bmkeps_retrieve: assumes "bnullable r" shows "bmkeps r = retrieve r (mkeps (erase r))" using assms apply(induct r) - apply(auto) - using t by auto + apply(auto) + using bmkeps_retrieve_AALTs by auto lemma bder_retrieve: assumes "\ v : der c (erase r)" shows "retrieve (bder c r) v = retrieve r (injval (erase r) c v)" using assms apply(induct r arbitrary: v rule: erase.induct) - apply(simp) - apply(erule Prf_elims) - apply(simp) - apply(erule Prf_elims) - apply(simp) - apply(case_tac "c = ca") - apply(simp) - apply(erule Prf_elims) - apply(simp) - apply(simp) - apply(erule Prf_elims) - apply(simp) - apply(erule Prf_elims) - apply(simp) - apply(simp) - apply(rename_tac "r\<^sub>1" "r\<^sub>2" rs v) - apply(erule Prf_elims) - apply(simp) - apply(simp) - apply(case_tac rs) - apply(simp) - apply(simp) - apply (smt Prf_elims(3) injval.simps(2) injval.simps(3) retrieve.simps(4) retrieve.simps(5) same_append_eq) - apply(simp) - apply(case_tac "nullable (erase r1)") - apply(simp) - apply(erule Prf_elims) - apply(subgoal_tac "bnullable r1") - prefer 2 - using bnullable_correctness apply blast - apply(simp) - apply(erule Prf_elims) - apply(simp) - apply(subgoal_tac "bnullable r1") - prefer 2 - using bnullable_correctness apply blast - apply(simp) - apply(simp add: retrieve_fuse2) - apply(simp add: bmkeps_retrieve) - apply(simp) - apply(erule Prf_elims) - apply(simp) - using bnullable_correctness apply blast - apply(rename_tac bs r v) + using Prf_elims(1) apply auto[1] + using Prf_elims(1) apply auto[1] + apply(auto)[1] + apply (metis Prf_elims(4) injval.simps(1) retrieve.simps(1) retrieve.simps(2)) + using Prf_elims(1) apply blast + (* AALTs case *) apply(simp) apply(erule Prf_elims) - apply(clarify) + apply(simp) + apply(simp) + apply(rename_tac "r\<^sub>1" "r\<^sub>2" rs v) + apply(erule Prf_elims) + apply(simp) + apply(simp) + apply(case_tac rs) + apply(simp) + apply(simp) + using Prf_elims(3) apply fastforce + (* ASEQ case *) + apply(simp) + apply(case_tac "nullable (erase r1)") + apply(simp) + apply(erule Prf_elims) + using Prf_elims(2) bnullable_correctness apply force + apply (simp add: bmkeps_retrieve bnullable_correctness retrieve_fuse2) + apply (simp add: bmkeps_retrieve bnullable_correctness retrieve_fuse2) + using Prf_elims(2) apply force + (* ASTAR case *) + apply(rename_tac bs r v) + apply(simp) apply(erule Prf_elims) apply(clarify) - apply(subst injval.simps) - apply(simp del: retrieve.simps) - apply(subst retrieve.simps) - apply(subst retrieve.simps) - apply(simp) - apply(simp add: retrieve_fuse2) - done - + apply(erule Prf_elims) + apply(clarify) + by (simp add: retrieve_fuse2) lemma MAIN_decode: @@ -581,79 +525,6 @@ apply(simp_all) done -lemma L_bsimp_ASEQ: - "L (erase (ASEQ bs r1 r2)) = L (erase (bsimp_ASEQ bs r1 r2))" - apply(induct bs r1 r2 rule: bsimp_ASEQ.induct) - apply(simp_all) - by (metis erase_fuse fuse.simps(4)) - -lemma L_bsimp_AALTs: - "L (erase (AALTs bs rs)) = L (erase (bsimp_AALTs bs rs))" - apply(induct bs rs rule: bsimp_AALTs.induct) - apply(simp_all add: erase_fuse) - done - -lemma L_erase_AALTs: - shows "L (erase (AALTs bs rs)) = \ (L ` erase ` (set rs))" - apply(induct rs) - apply(simp) - apply(simp) - apply(case_tac rs) - apply(simp) - apply(simp) - done - -lemma L_erase_flts: - shows "\ (L ` erase ` (set (flts rs))) = \ (L ` erase ` (set rs))" - apply(induct rs rule: flts.induct) - apply(simp_all) - apply(auto) - using L_erase_AALTs erase_fuse apply auto[1] - by (simp add: L_erase_AALTs erase_fuse) - -lemma L_erase_dB_acc: - shows "(\ (L ` acc) \ (\ (L ` erase ` (set (distinctBy rs erase acc))))) - = \ (L ` acc) \ \ (L ` erase ` (set rs))" - apply(induction rs arbitrary: acc) - apply simp_all - by (smt (z3) SUP_absorb UN_insert sup_assoc sup_commute) - - -lemma L_erase_dB: - shows "(\ (L ` erase ` (set (distinctBy rs erase {})))) = \ (L ` erase ` (set rs))" - by (metis L_erase_dB_acc Un_commute Union_image_empty) - -lemma L_bsimp_erase: - shows "L (erase r) = L (erase (bsimp r))" - apply(induct r) - apply(simp) - apply(simp) - apply(simp) - apply(auto simp add: Sequ_def)[1] - apply(subst L_bsimp_ASEQ[symmetric]) - apply(auto simp add: Sequ_def)[1] - apply(subst (asm) L_bsimp_ASEQ[symmetric]) - apply(auto simp add: Sequ_def)[1] - apply(simp) - apply(subst L_bsimp_AALTs[symmetric]) - defer - apply(simp) - apply(subst (2)L_erase_AALTs) - apply(subst L_erase_dB) - apply(subst L_erase_flts) - apply (simp add: L_erase_AALTs) - done - -lemma L_bders_simp: - shows "L (erase (bders_simp r s)) = L (erase (bders r s))" - apply(induct s arbitrary: r rule: rev_induct) - apply(simp) - apply(simp) - apply(simp add: ders_append) - apply(simp add: bders_simp_append) - apply(simp add: L_bsimp_erase[symmetric]) - by (simp add: der_correctness) - lemma bmkeps_fuse: assumes "bnullable r" @@ -668,17 +539,6 @@ apply(auto simp add: bmkeps_fuse bnullable_fuse) done - -lemma b4: - shows "bnullable (bders_simp r s) = bnullable (bders r s)" -proof - - have "L (erase (bders_simp r s)) = L (erase (bders r s))" - using L_bders_simp by force - then show "bnullable (bders_simp r s) = bnullable (bders r s)" - using bnullable_correctness nullable_correctness by blast -qed - - lemma bder_fuse: shows "bder c (fuse bs a) = fuse bs (bder c a)" apply(induct a arbitrary: bs c) @@ -860,7 +720,7 @@ apply(rule rrewrite_srewrite.ss6[simplified]) apply(simp add: erase_fuse) done - qed (auto intro: rrewrite_srewrite.intros) +qed (auto intro: rrewrite_srewrite.intros) lemma rewrites_fuse: @@ -1173,7 +1033,7 @@ theorem main_blexer_simp: shows "blexer r s = blexer_simp r s" unfolding blexer_def blexer_simp_def - using b4 main_aux by simp + by (metis central main_aux rewrites_bnullable_eq) theorem blexersimp_correctness: diff -r cc8e231529fb -r e1b74d618f1b thys2/paper.pdf Binary file thys2/paper.pdf has changed