# HG changeset patch # User Christian Urban # Date 1643813561 0 # Node ID 3cfea5bb5e2306d0dc72f97b82cabee977fa8826 # Parent 1500f96707b0315240b653d5ab0b3c00b0c6ca9a updated some of the text and cardinality proof diff -r 1500f96707b0 -r 3cfea5bb5e23 thys2/PDerivs.thy --- a/thys2/PDerivs.thy Sun Jan 30 23:37:29 2022 +0000 +++ b/thys2/PDerivs.thy Wed Feb 02 14:52:41 2022 +0000 @@ -12,7 +12,7 @@ "SEQs rs r = (\r'. SEQ r' r) ` rs" by auto -primrec +fun pder :: "char \ rexp \ rexp set" where "pder c ZERO = {}" @@ -23,7 +23,7 @@ (if nullable r1 then SEQs (pder c r1) r2 \ pder c r2 else SEQs (pder c r1) r2)" | "pder c (STAR r) = SEQs (pder c r) (STAR r)" -primrec +fun pders :: "char list \ rexp \ rexp set" where "pders [] r = {r}" diff -r 1500f96707b0 -r 3cfea5bb5e23 thys2/Paper/Paper.thy --- a/thys2/Paper/Paper.thy Sun Jan 30 23:37:29 2022 +0000 +++ b/thys2/Paper/Paper.thy Wed Feb 02 14:52:41 2022 +0000 @@ -45,17 +45,19 @@ set ("_" [73] 73) and AZERO ("ZERO" 81) and - AONE ("ONE _" [79] 81) and + AONE ("ONE _" [79] 78) and ACHAR ("CHAR _ _" [79, 79] 80) and AALTs ("ALTs _ _" [77,77] 78) and - ASEQ ("SEQ _ _ _" [79, 77,77] 78) and + ASEQ ("SEQ _ _ _" [79, 79,79] 78) and ASTAR ("STAR _ _" [79, 79] 78) and code ("code _" [79] 74) and intern ("_\<^latex>\\\mbox{$^\\uparrow$}\" [900] 80) and erase ("_\<^latex>\\\mbox{$^\\downarrow$}\" [1000] 74) and bnullable ("nullable\<^latex>\\\mbox{$_b$}\ _" [1000] 80) and - bmkeps ("mkeps\<^latex>\\\mbox{$_b$}\ _" [1000] 80) + bmkeps ("mkeps\<^latex>\\\mbox{$_b$}\ _" [1000] 80) and + + srewrite ("_\<^latex>\\\mbox{$\\,\\stackrel{s}{\\leadsto}$}\ _" [71, 71] 80) lemma better_retrieve: @@ -244,10 +246,32 @@ section {* Bitcoded Regular Expressions and Derivatives *} text {* - bitcoded regexes / decoding / bmkeps - gets rid of the second phase (only single phase) - correctness + + Sulzmann and Lu describe another algorithm that generates POSIX + values but dispences with the second phase where characters are + injected ``back'' into values. For this they annotate bitcodes to + regular expressions, which we define in Isabelle/HOL as the datatype + \begin{center} + \begin{tabular}{lcl} + @{term breg} & $::=$ & @{term "AZERO"}\\ + & $\mid$ & @{term "AONE bs"}\\ + & $\mid$ & @{term "ACHAR bs c"}\\ + & $\mid$ & @{term "AALTs bs rs"}\\ + & $\mid$ & @{term "ASEQ bs r\<^sub>1 r\<^sub>2"}\\ + & $\mid$ & @{term "ASTAR bs r"} + \end{tabular} + \end{center} + + \noindent where @{text bs} stands for a bitsequences; @{text r}, + @{text "r\<^sub>1"} and @{text "r\<^sub>2"} for annotated regular + expressions; and @{text rs} for a list of annotated regular + expressions. In contrast to Sulzmann and Lu we generalise the + alternative regular expressions to lists, instead of just having + binary regular expressions. The idea with annotated regular + expressions is to incrementally generate the value information by + recording bitsequences. Sulzmann and Lu then + define a coding function for how values can be coded into bitsequences. \begin{center} \begin{tabular}{lcl} @@ -261,6 +285,26 @@ \end{tabular} \end{center} + There is also a corresponding decoding function that takes a bitsequence + and generates back a value. However, since the bitsequences are a ``lossy'' + coding (@{term Seq}s are not coded) the decoding function depends also + on a regular expression in order to decode values. + + \begin{center} + \begin{tabular}{lcll} + %@{thm (lhs) decode'.simps(1)} & $\dn$ & @{thm (rhs) decode'.simps(1)}\\ + @{thm (lhs) decode'.simps(2)} & $\dn$ & @{thm (rhs) decode'.simps(2)}\\ + @{thm (lhs) decode'.simps(3)} & $\dn$ & @{thm (rhs) decode'.simps(3)}\\ + @{thm (lhs) decode'.simps(4)} & $\dn$ & @{thm (rhs) decode'.simps(4)}\\ + @{thm (lhs) decode'.simps(5)} & $\dn$ & @{thm (rhs) decode'.simps(5)}\\ + @{thm (lhs) decode'.simps(6)} & $\dn$ & @{thm (rhs) decode'.simps(6)}\\ + @{thm (lhs) decode'.simps(7)} & $\dn$ & @{thm (rhs) decode'.simps(7)}\\ + @{thm (lhs) decode'.simps(8)} & $\dn$ & @{thm (rhs) decode'.simps(8)}\\ + @{thm (lhs) decode'.simps(9)} & $\dn$ & @{thm (rhs) decode'.simps(9)}\\ + @{thm (lhs) decode'.simps(10)} & $\dn$ & @{thm (rhs) decode'.simps(10)} & fix + \end{tabular} + \end{center} + The idea of the bitcodes is to annotate them to regular expressions and generate values incrementally. The bitcodes can be read off from the @{text breg} and then decoded into a value. @@ -296,6 +340,11 @@ \begin{theorem} @{thm blexer_correctness} \end{theorem} + + + bitcoded regexes / decoding / bmkeps + gets rid of the second phase (only single phase) + correctness *} diff -r 1500f96707b0 -r 3cfea5bb5e23 thys2/Paper/document/root.tex --- a/thys2/Paper/document/root.tex Sun Jan 30 23:37:29 2022 +0000 +++ b/thys2/Paper/document/root.tex Wed Feb 02 14:52:41 2022 +0000 @@ -65,8 +65,8 @@ string---that is, which part of the string is matched by which part of the regular expression. The purpose of the bitcodes is to generate POSIX values incrementally while derivatives are calculated. They also help with designing - an `aggressive' simplification function that keeps the size of - derivatives small. Without simplification derivatives can grow + an ``aggressive'' simplification function that keeps the size of + derivatives small. Without simplification the size derivatives can grow exponentially resulting in an extremely slow lexing algorithm. In this paper we describe a variant of Sulzmann and Lu's algorithm: Our algorithm is a recursive functional program, whereas Sulzmann diff -r 1500f96707b0 -r 3cfea5bb5e23 thys2/SizeBound4.thy --- a/thys2/SizeBound4.thy Sun Jan 30 23:37:29 2022 +0000 +++ b/thys2/SizeBound4.thy Wed Feb 02 14:52:41 2022 +0000 @@ -1,6 +1,6 @@ theory SizeBound4 - imports "Lexer" + imports "Lexer" "PDerivs" begin section \Bit-Encodings\ @@ -26,26 +26,26 @@ function decode' :: "bit list \ rexp \ (val * bit list)" where - "decode' ds ZERO = (Void, [])" -| "decode' ds ONE = (Void, ds)" -| "decode' ds (CH d) = (Char d, ds)" + "decode' bs ZERO = (undefined, bs)" +| "decode' bs ONE = (Void, bs)" +| "decode' bs (CH d) = (Char d, bs)" | "decode' [] (ALT r1 r2) = (Void, [])" -| "decode' (Z # ds) (ALT r1 r2) = (let (v, ds') = decode' ds r1 in (Left v, ds'))" -| "decode' (S # ds) (ALT r1 r2) = (let (v, ds') = decode' ds r2 in (Right v, ds'))" -| "decode' ds (SEQ r1 r2) = (let (v1, ds') = decode' ds r1 in - let (v2, ds'') = decode' ds' r2 in (Seq v1 v2, ds''))" +| "decode' (Z # bs) (ALT r1 r2) = (let (v, bs') = decode' bs r1 in (Left v, bs'))" +| "decode' (S # bs) (ALT r1 r2) = (let (v, bs') = decode' bs r2 in (Right v, bs'))" +| "decode' bs (SEQ r1 r2) = (let (v1, bs') = decode' bs r1 in + let (v2, bs'') = decode' bs' r2 in (Seq v1 v2, bs''))" | "decode' [] (STAR r) = (Void, [])" -| "decode' (S # ds) (STAR r) = (Stars [], ds)" -| "decode' (Z # ds) (STAR r) = (let (v, ds') = decode' ds r in - let (vs, ds'') = decode' ds' (STAR r) - in (Stars_add v vs, ds''))" +| "decode' (S # bs) (STAR r) = (Stars [], bs)" +| "decode' (Z # bs) (STAR r) = (let (v, bs') = decode' bs r in + let (vs, bs'') = decode' bs' (STAR r) + in (Stars_add v vs, bs''))" by pat_completeness auto lemma decode'_smaller: - assumes "decode'_dom (ds, r)" - shows "length (snd (decode' ds r)) \ length ds" + assumes "decode'_dom (bs, r)" + shows "length (snd (decode' bs r)) \ length bs" using assms -apply(induct ds r) +apply(induct bs r) apply(auto simp add: decode'.psimps split: prod.split) using dual_order.trans apply blast by (meson dual_order.trans le_SucI) @@ -1032,6 +1032,152 @@ using blexer_correctness main_blexer_simp by simp +(* some tests *) + +fun awidth :: "arexp \ nat" where + "awidth AZERO = 1" +| "awidth (AONE cs) = 1" +| "awidth (ACHAR cs c) = 1" +| "awidth (AALTs cs rs) = (sum_list (map awidth rs))" +| "awidth (ASEQ cs r1 r2) = (awidth r1 + awidth r2)" +| "awidth (ASTAR cs r) = (awidth r)" + + + +lemma + shows "s \ L r \ blexer_simp r s = None" + by (simp add: blexersimp_correctness lexer_correct_None) + +lemma g1: + "bders_simp AZERO s = AZERO" + apply(induct s) + apply(simp) + apply(simp) + done + +lemma g2: + "s \ Nil \ bders_simp (AONE bs) s = AZERO" + apply(induct s) + apply(simp) + apply(simp) + apply(case_tac s) + apply(simp) + apply(simp) + done + +lemma finite_pder: + shows "finite (pder c r)" + apply(induct c r rule: pder.induct) + apply(auto) + done + +lemma asize_fuse: + shows "asize (fuse bs r) = asize r" + apply(induct r arbitrary: bs) + apply(auto) + done + +lemma awidth_fuse: + shows "awidth (fuse bs r) = awidth r" + apply(induct r arbitrary: bs) + apply(auto) + done + +lemma pders_SEQs: + assumes "finite A" + shows "card (SEQs A (STAR r)) \ card A" + using assms + by (simp add: SEQs_eq_image card_image_le) + +lemma binullable_intern: + shows "bnullable (intern r) = nullable r" + apply(induct r) + apply(auto simp add: bnullable_fuse) + done + +lemma + "asize (bsimp (bder c (intern r))) \ Suc ((Suc (card (pder c r))) * (size r) * (size r))" + oops + +lemma + "card (pder c r) \ awidth (bsimp (bder c (intern r)))" + apply(induct c r rule: pder.induct) + apply(simp) + apply(simp) + apply(simp) + apply(simp) + oops + +lemma + "card (pder c r) \ awidth (bder c (intern r))" + apply(induct c r rule: pder.induct) + apply(simp) + apply(simp) + apply(simp) + apply(simp) + apply(rule order_trans) + apply(rule card_Un_le) + apply (simp add: awidth_fuse bder_fuse) + defer + apply(simp) + apply(rule order_trans) + apply(rule pders_SEQs) + using finite_pder apply presburger + apply (simp add: awidth_fuse) + apply(auto) + apply(rule order_trans) + apply(rule card_Un_le) + apply(simp add: awidth_fuse) + defer + using binullable_intern apply blast + using binullable_intern apply blast + apply (smt (verit, best) SEQs_eq_image add.commute add_Suc_right card_image_le dual_order.trans finite_pder trans_le_add2) + apply(subgoal_tac "card (SEQs (pder c r1) r2) \ card (pder c r1)") + apply(linarith) + by (simp add: UNION_singleton_eq_range card_image_le finite_pder) + +lemma + "card (pder c r) \ asize (bder c (intern r))" + apply(induct c r rule: pder.induct) + apply(simp) + apply(simp) + apply(simp) + apply(simp) + apply (metis add_mono_thms_linordered_semiring(1) asize_fuse bder_fuse card_Un_le le_Suc_eq order_trans) + defer + apply(simp) + apply(rule order_trans) + apply(rule pders_SEQs) + using finite_pder apply presburger + apply (simp add: asize_fuse) + apply(simp) + apply(auto) + apply(rule order_trans) + apply(rule card_Un_le) + apply (smt (z3) SEQs_eq_image add.commute add_Suc_right add_mono_thms_linordered_semiring(1) asize_fuse card_image_le dual_order.trans finite_pder le_add1) + apply(rule order_trans) + apply(rule card_Un_le) + using binullable_intern apply blast + using binullable_intern apply blast + by (smt (verit, best) SEQs_eq_image add.commute add_Suc_right card_image_le dual_order.trans finite_pder trans_le_add2) + +lemma + "card (pder c r) \ asize (bsimp (bder c (intern r)))" + apply(induct c r rule: pder.induct) + apply(simp) + apply(simp) + apply(simp) + apply(simp) + apply(rule order_trans) + apply(rule card_Un_le) + prefer 3 + apply(simp) + apply(rule order_trans) + apply(rule pders_SEQs) + using finite_pder apply blast + oops + + (* below is the idempotency of bsimp *) lemma bsimp_ASEQ_fuse: diff -r 1500f96707b0 -r 3cfea5bb5e23 thys2/paper.pdf Binary file thys2/paper.pdf has changed