# HG changeset patch # User Chengsong # Date 1646479919 0 # Node ID 426a93160f4a39d72d39e78091308a38083c8571 # Parent 0856fbf8bda74786dea8ed85d54f8171f706d450 6ct diff -r 0856fbf8bda7 -r 426a93160f4a ChengsongPhdThesis/ChengsongPhDThesis.tex --- a/ChengsongPhdThesis/ChengsongPhDThesis.tex Wed Mar 02 23:53:11 2022 +0000 +++ b/ChengsongPhdThesis/ChengsongPhDThesis.tex Sat Mar 05 11:31:59 2022 +0000 @@ -38,6 +38,10 @@ \def\lexer{\mathit{lexer}} \def\mkeps{\mathit{mkeps}} +\def\bmkeps{\textit{bmkeps}} +\def\retrieve{\textit{retrieve}} +\def\blexer{\textit{blexer}} +\def\flex{\textit{flex}} \def\inj{\mathit{inj}} \def\Empty{\mathit{Empty}} \def\Left{\mathit{Left}} @@ -49,6 +53,7 @@ \def\nullable{\mathit{nullable}} \def\Z{\mathit{Z}} \def\S{\mathit{S}} +\def\rup{r^\uparrow} \def\awidth{\mathit{awidth}} @@ -146,7 +151,61 @@ \subsubsection{ NFA's} +$\bold{Problems With This:}$ +\begin{itemize} +\item Can be slow especially when many states are active. +\item +Want Lexing Results: Can have Exponential different matching results. +\end{itemize} + + +One regular expression can have multiple lexical values. For example +for the regular expression $(a+b)^*$, it has a infinite list of +values corresponding to it: $\Stars\,[]$, $\Stars\,[\Left(Char(a))]$, +$\Stars\,[\Right(Char(b))]$, $\Stars\,[\Left(Char(a),\,\Right(Char(b))]$, +$\ldots$, and vice versa. +Even for the regular expression matching a certain string, there could +still be more than one value corresponding to it. +Take the example where $r= (a^*\cdot a^*)^*$ and the string +$s=\underbrace{aa\ldots a}_\text{n \textit{a}s}$. +The number of different ways of matching +without allowing any value under a star to be flattened +to an empty string can be given by the following formula: +\begin{center} + $C_n = (n+1)+n C_1+\ldots + 2 C_{n-1}$ +\end{center} +and a closed form formula can be calculated to be +\begin{equation} + C_n =\frac{(2+\sqrt{2})^n - (2-\sqrt{2})^n}{4\sqrt{2}} +\end{equation} +which is clearly in exponential order. +A lexer aimed at getting all the possible values has an exponential +worst case runtime. Therefore it is impractical to try to generate +all possible matches in a run. In practice, we are usually +interested about POSIX values, which by intuition always +match the leftmost regular expression when there is a choice +and always match a sub part as much as possible before proceeding +to the next token. For example, the above example has the POSIX value +$ \Stars\,[\Seq(Stars\,[\underbrace{\Char(a),\ldots,\Char(a)}_\text{n iterations}], Stars\,[])]$. +The output of an algorithm we want would be a POSIX matching +encoded as a value.\\ +$\mathbf{TODO:}$ +\begin{itemize} +\item +Illustrate graphically how you can match $a*a**$ with $aaa$ in different ways. +\item +Give a backtracking algorithm, and explain briefly why this can be exponentially slow. +(When there is a matching, it finds straight away; where there is not one, this fails to +recognize immediately that a match cannot be possibly found, and tries out all remaining +possibilities, etc.) +\item +From the above point, are there statical analysis tools that single out those malicious +patterns and tell before a lexer is even run? +Have a more thorough survey of the Birmingham paper. +Give out the suitable scenarios for such static analysis algorithms. + +\end{itemize} \subsubsection{DFAs} The tool JFLEX uses it. @@ -183,8 +242,137 @@ +bitcodes! Built on top of derivatives, but with auxiliary bits \subsection{Correctness Proof} -unproven by Sulzmann and Lu -by Ausaf and Urban + +Not proven by Sulzmann and Lu + +Proven by Ausaf and Urban!! + + +For this we have started with looking at the proof of +\begin{equation}\label{lexer} +\blexer \; (r^\uparrow) s = \lexer \;r \;s, +\end{equation} + +%\noindent +%might provide us insight into proving +%\begin{center} +%$\blexer \; r^\uparrow \;s = \blexers \; r^\uparrow \;s$ +%\end{center} + +\noindent +which established that the bit-sequence algorithm produces the same +result as the original algorithm, which does not use +bit-sequences. +The proof uses two ``tricks''. One is that it uses a \flex-function + +\begin{center} +\begin{tabular}{lcl} +$\textit{flex} \;r\; f\; (c\!::\!s) $ & $\dn$ & $\textit{flex} \; (r\backslash c) \;(\lambda v. f (inj \; r \; c \; v)) \;s$ \\ +$\textit{flex} \;r\; f\; [\,] $ & $\dn$ & $f$ +\end{tabular} +\end{center} + +\noindent + +The intuition behind the $\flex$ function is that + it accumulates a series of $\inj$ function applications when doing derivatives + in a $\mathit{LIFO}$ manner. The arguments of the $\inj$ functions are kept by + remembering which character + was chopped off and what the regular expression looks like before + chopping off that character. + The $\mathit{LIFO}$ order was achieved by putting the newest $\inj$ application + always before the application of $f$, the previously accumulated function applications.\\ +Therefore, the function $\flex$, when acted on a string $s@[c]$ where the last +character is $c$, by nature can have its last injection function revealed already: +\begin{equation}\label{flex} +\flex \; r \; id \; (s@[c]) \; v = \flex \; r \; id \; s \; (inj \; (r\backslash s) \; c\; v). +\end{equation} +that the last character can be taken off, and the injection it causes be applied to +the argument value $v$. + +Ausaf and Urban proved that the Sulzmann and Lu's lexers +can be charactarized by the $\flex$ function: +\begin{center} +$\lexer \;r\; s = \flex \;\textit{id} \; r\;s \;(\mkeps \; (r\backslash s))$. +\end{center} + +\noindent +This property says that the Sulzmann and Lu's $\lexer$ does lexing by +stacking up injection functions while doing derivatives, +explicitly showing the order of characters being +injected back in each step. + +\noindent +The other trick, which is the crux in the existing proof, +is the use of the $\retrieve$-function: +\begin{center} +\begin{tabular}{@{}l@{\hspace{2mm}}c@{\hspace{2mm}}l@{}} + $\textit{retrieve}\,(_{bs}\ONE)\,\Empty$ & $\dn$ & $bs$\\ + $\textit{retrieve}\,(_{bs}{\bf c})\,(\Char\,d)$ & $\dn$ & $bs$\\ + $\textit{retrieve}\,(_{bs}\sum a::as)\,(\Left\,v)$ & $\dn$ & + $bs \,@\, \textit{retrieve}\,a\,v$\\ + $\textit{retrieve}\,(_{bs}\sum a::as)\,(\Right\,v)$ & $\dn$ & + $\textit{bs} \,@\, \textit{retrieve}\,(_{[]}\sum as)\,v$\\ + $\textit{retrieve}\,(_{bs}a_1\cdot a_2)\,(\Seq\,v_1\,v_2)$ & $\dn$ & + $bs \,@\,\textit{retrieve}\,a_1\,v_1\,@\, \textit{retrieve}\,a_2\,v_2$\\ + $\textit{retrieve}\,(_{bs}a^*)\,(\Stars\,[])$ & $\dn$ & + $bs \,@\, [0]$\\ + $\textit{retrieve}\,(_{bs}a^*)\,(\Stars\,(v\!::\!vs))$ & $\dn$ &\\ + \multicolumn{3}{l}{ + \hspace{3cm}$bs \,@\, [1] \,@\, \textit{retrieve}\,a\,v\,@\, + \textit{retrieve}\,(_{[]}a^*)\,(\Stars\,vs)$}\\ +\end{tabular} +\end{center} + +\noindent +Sulzmann and Lu proposed this function, but did not prove +anything about it. Ausaf and Urban made use of the +fact about $\retrieve$ in their proof: + \begin{equation}\label{retrieve_reversible} + \retrieve\; \rup \backslash c \; v = \retrieve \; \rup (\inj \;r \;c \; v) + \end{equation} +This says that $\retrieve$ will always pick up +partial information about a lexing value value and transform it into suitable bitcodes. +If the information is in the regular expression (stored as bitcodes), it will keep those +bitcodes with the guidance of the value, +if the information is in the value, which has been injected back to the value, +it will "digest" and transform that part of the value to bitcodes. + +\noindent + +Using this together with ~\eqref{flex}, we can prove that the bitcoded version of +lexer is the same as Sulzmann and Lu's lexer: +\begin{center} +$\lexer \; r \; s = \flex \; r\; id\; s\; v = \textit{decode} \;( \textit{bmkeps}\; (\rup \backslash s) ) r = \blexer \; r \; s$ +\end{center} +\noindent +\begin{proof} +We express $\bmkeps$ using $\retrieve$, and the theorem to prove becomes: +\begin{center} +$ \flex \; r\; id\; s\; v = \textit{decode} \;( \textit{retrieve}\; (\rup \backslash s) \; v \;) r$ +\end{center} +\noindent +We prove the above by reverse induction on string $s$(meaning that the inductive step is on +$s @ [c]$ rather than $c :: s$). +$v$ takes arbitrary values.\\ +The base case goes through trivially.\\ +For the inductive step, assuming +$ \flex \; r\; id\; s\; v = \textit{decode} \;( \textit{retrieve}\; (\rup \backslash s) \; v \;) r$ +holds for all values $v$. Now we need to show that +$ \flex \; r\; id\; s@[c]\; v = \textit{decode} \;( \textit{retrieve}\; (\rup \backslash (s@[c])) \; v \;) r$.\\ +~\eqref{flex} allows us to do the following rewrite: +\begin{center} +$ \flex \; r\; id\; (s@[c])\; v = \flex \; r \; id\; s\; (\inj \; (r \backslash s) \; c\; v)= \textit{decode} \;( \textit{retrieve}\; (\rup \backslash s) \; (\inj \; (r\backslash s) \;c\;v)\;) r$ +\end{center} +~\eqref{retrieve_reversible} allows us to further rewrite the $\mathit{RHS}$ of the above to +\begin{center} +$\textit{decode} \; (\textit{retrieve}\; (\rup \backslash (s @ [c])) \; v\;) \;r$ +\end{center} + + +\end{proof} + + \section{My Work} diff -r 0856fbf8bda7 -r 426a93160f4a thys2/SizeBound6CT.thy --- a/thys2/SizeBound6CT.thy Wed Mar 02 23:53:11 2022 +0000 +++ b/thys2/SizeBound6CT.thy Sat Mar 05 11:31:59 2022 +0000 @@ -116,16 +116,11 @@ *) -datatype cchar = -Achar -|Bchar -|Cchar -|Dchar datatype rrexp = RZERO | RONE -| RCHAR cchar +| RCHAR char | RSEQ rrexp rrexp | RALTS "rrexp list" | RSTAR rrexp @@ -145,19 +140,13 @@ | "rnullable (RSEQ r1 r2) = (rnullable r1 \ rnullable r2)" | "rnullable (RSTAR r) = True" -fun convert_cchar_char :: "cchar \ char" - where -"convert_cchar_char Achar = (CHR 0x41) " -| "convert_cchar_char Bchar = (CHR 0x42) " -| "convert_cchar_char Cchar = (CHR 0x43) " -| "convert_cchar_char Dchar = (CHR 0x44) " fun rder :: "char \ rrexp \ rrexp" where "rder c (RZERO) = RZERO" | "rder c (RONE) = RZERO" -| "rder c (RCHAR d) = (if c = (convert_cchar_char d) then RONE else RZERO)" +| "rder c (RCHAR d) = (if c = d then RONE else RZERO)" | "rder c (RALTS rs) = RALTS (map (rder c) rs)" | "rder c (RSEQ r1 r2) = (if rnullable r1 @@ -879,11 +868,17 @@ |"rexp_enum (Suc n) = [(RSEQ r1 r2). r1 \ set (rexp_enum i) \ r2 \ set (rexp_enum j) \ i + j = n]" *) +definition SEQ_set where + "SEQ_set A n \ {RSEQ r1 r2 | r1 r2. r1 \ A \ r2 \ A \ rsize r1 + rsize r2 \ n}" + +definition ALT_set where +"ALT_set A n \ {RALTS rs | rs. set rs \ A \ sum_list (map rsize rs) \ n}" + context notes rev_conj_cong[fundef_cong] begin function (sequential) rexp_enum :: "nat \ rrexp set" where "rexp_enum 0 = {}" -|"rexp_enum (Suc 0) = {RALTS []} \ {RZERO} \ {RONE} \ { (RCHAR c) |c. c \{Achar, Bchar, Cchar, Dchar} }" +|"rexp_enum (Suc 0) = {RALTS []} \ {RZERO} \ {RONE} \ { (RCHAR c) |c::char. True }" |"rexp_enum (Suc n) = {(RSEQ r1 r2)|r1 r2 i j. r1 \ (rexp_enum i) \ r2 \ (rexp_enum j) \ i + j = n} \ {(RALTS (a # rs)) | a rs i j. a \ (rexp_enum i) \ (RALTS rs) \ (rexp_enum j) \ i + j = Suc n \ i \ n \ j \ n} \ {RSTAR r0 | r0. r0 \ (rexp_enum n)} \ @@ -904,16 +899,146 @@ shows "n \ m \ (rexp_enum n) \ (rexp_enum m)" by (simp add: lift_Suc_mono_le rexp_enum_inclusion) +lemma zero_in_Suc0: + shows "RZERO \ rexp_enum (Suc 0)" +and "RZERO \ rexp_enum 1" + apply simp + by simp + +lemma one_in_Suc0: + shows "RONE \ rexp_enum (Suc 0)" +and "RONE \ rexp_enum 1" + apply simp + by simp + +lemma char_in_Suc0: + shows "RCHAR c \ rexp_enum (Suc 0)" + apply simp + done + + +lemma char_in1: + shows "RCHAR c \ rexp_enum 1" + using One_nat_def char_in_Suc0 by presburger + +lemma alts_nil_in_Suc0: + shows "RALTS [] \ rexp_enum (Suc 0)" + and "RALTS [] \ rexp_enum 1" + apply simp + by simp + + +lemma zero_in_positive: + shows "RZERO \ rexp_enum (Suc N)" + by (metis le_add1 plus_1_eq_Suc rexp_enum_mono subsetD zero_in_Suc0(2)) + +lemma one_in_positive: + shows "RONE \ rexp_enum (Suc N)" + by (metis le_add1 plus_1_eq_Suc rexp_enum_mono subsetD one_in_Suc0(2)) + +lemma alts_in_positive: + shows "RALTS [] \ rexp_enum (Suc N)" + by (metis One_nat_def alts_nil_in_Suc0(1) le_add_same_cancel1 less_Suc_eq_le plus_1_eq_Suc rexp_enum_mono subsetD zero_less_Suc) + +lemma char_in_positive: + shows "RCHAR c \ rexp_enum (Suc N)" + apply(cases c) + apply (metis Suc_eq_plus1 char_in1 le_add2 rexp_enum_mono subsetD)+ + done + lemma enum_inductive_cases: shows "rsize (RSEQ r1 r2) = Suc n \ \i j. rsize r1 = i \ rsize r2 = j\ i + j = n" by (metis Suc_inject rsize.simps(5)) -thm rsize.simps(5) + lemma enumeration_finite: shows "\Nn. card (rexp_enum n) < Nn" apply(simp add:no_top_class.gt_ex) done + +lemma s1: +"{r::rexp . size r = 0} = ({ZERO, ONE} \ {CH c| c. True})" +apply(auto) +apply(case_tac x) +apply(simp_all) +done + + + + +lemma enum_Suc0: + shows " rexp_enum (Suc 0) = {RZERO} \ {RONE} \ {RCHAR c | c. True} \ {RALTS []}" + by auto + +lemma enumeration_chars_finite: + shows "finite {RCHAR c |c. True}" + apply(subgoal_tac "finite (RCHAR ` (UNIV::char set))") + prefer 2 + using finite_code apply blast + by (simp add: full_SetCompr_eq) + +lemma enum_Suc0_finite: + shows "finite (rexp_enum (Suc 0))" + apply(subgoal_tac "finite ( {RZERO} \ {RONE} \ {RCHAR c | c. True} \ {RALTS []})") + using enum_Suc0 apply presburger + using enumeration_chars_finite by blast + +lemma enum_1_finite: + shows "finite (rexp_enum 1)" + using enum_Suc0_finite by force + +lemma enum_stars_finite: + shows " finite (rexp_enum n) \ finite {RSTAR r0 | r0. r0 \ (rexp_enum n)}" + apply(induct n) + apply simp + apply simp + done + +definition RSEQ_set + where + "RSEQ_set A B \ (\(r1, r2) . (RSEQ r1 r2 )) ` (A \ B)" + + +lemma enum_seq_finite: + shows "(\k. k < n \ finite (rexp_enum k)) \ finite +{(RSEQ r1 r2)|r1 r2 i j. r1 \ (rexp_enum i) \ r2 \ (rexp_enum j) \ i + j = n}" + apply(induct n) + apply simp + apply(subgoal_tac "{(RSEQ r1 r2)|r1 r2 i j. r1 \ (rexp_enum i) \ r2 \ (rexp_enum j) \ i + j = Suc n} +\ RSEQ_set (rexp_enum n) (rexp_enum n)") + apply(subgoal_tac "finite ( RSEQ_set (rexp_enum n) (rexp_enum n))") + using rev_finite_subset + apply fastforce + + sorry + + + +lemma enum_induct_finite: + shows " finite ( {(RSEQ r1 r2)|r1 r2 i j. r1 \ (rexp_enum i) \ r2 \ (rexp_enum j) \ i + j = n} \ +{(RALTS (a # rs)) | a rs i j. a \ (rexp_enum i) \ (RALTS rs) \ (rexp_enum j) \ i + j = Suc n \ i \ n \ j \ n} \ +{RSTAR r0 | r0. r0 \ (rexp_enum n)} \ +(rexp_enum n))" + apply(induct n) + apply simp + sorry + +lemma enumeration_finite2: + shows "finite (rexp_enum n)" + apply(cases n) + apply auto[1] + apply(case_tac nat) + using enum_Suc0_finite apply blast + apply(subgoal_tac "rexp_enum ( Suc n) = {(RSEQ r1 r2)|r1 r2 i j. r1 \ (rexp_enum i) \ r2 \ (rexp_enum j) \ i + j = n} \ +{(RALTS (a # rs)) | a rs i j. a \ (rexp_enum i) \ (RALTS rs) \ (rexp_enum j) \ i + j = Suc n \ i \ n \ j \ n} \ +{RSTAR r0 | r0. r0 \ (rexp_enum n)} \ +(rexp_enum n)") + prefer 2 + using rexp_enum.simps(3) apply presburger + using enum_induct_finite by auto + + lemma size1_rexps: shows "RCHAR x \ rexp_enum 1" apply(cases x) @@ -978,15 +1103,14 @@ apply auto[1] apply(subgoal_tac "RALTS (a # list) \ {uu. \a rs i j. uu = RALTS (a # rs) \ a \ rexp_enum i \ RALTS rs \ rexp_enum j \ i + j = Suc N \ i \ N \ j \ N}") - - apply fastforce apply(subgoal_tac "a \ rexp_enum i") prefer 2 - apply linarith by blast +thm rsize.elims + lemma rexp_enum_covers: shows " rsize r \ N \ r \ rexp_enum N \ r \ rexp_enum (rsize r)" apply(induct N arbitrary : r) @@ -1016,14 +1140,77 @@ apply(subgoal_tac "RALTS list \ rexp_enum N") prefer 2 apply presburger + using def_enum_alts rexp_size_induct apply presburger + using rexp_size_induct apply presburger + using rexp_size_induct apply presburger + using rexp_size_induct apply presburger + apply(subgoal_tac "r \ rexp_enum 1") + apply (metis rsize.simps(1)) + apply(subgoal_tac "rsize r = Suc 0") + prefer 2 + using rsize.simps(1) apply presburger + apply(subgoal_tac "r \ rexp_enum (Suc 0)") + apply force + using zero_in_Suc0 apply blast + apply simp - sorry + using one_in_positive apply auto[1] + + apply (metis char_in_positive) + apply(subgoal_tac "rsize x41 \ N") + apply(subgoal_tac "rsize x42 \ N") + prefer 2 + apply auto[1] + prefer 2 + using enum_inductive_cases nat_le_iff_add apply blast + apply(subgoal_tac "x41 \ rexp_enum (rsize x41)") + prefer 2 + apply blast + apply(subgoal_tac "x42 \ rexp_enum (rsize x42)") + prefer 2 + apply blast + apply(subgoal_tac "rsize x42 + rsize x41 = N") + prefer 2 + using add.commute enum_inductive_cases apply blast + apply(subgoal_tac "rexp_enum (Suc N) = {(RSEQ r1 r2)|r1 r2 i j. r1 \ (rexp_enum i) \ r2 \ (rexp_enum j) \ i + j = N} \ +{(RALTS (a # rs)) | a rs i j. a \ (rexp_enum i) \ (RALTS rs) \ (rexp_enum j) \ i + j = Suc N \ i \ N \ j \ N} \ +{RSTAR r0 | r0. r0 \ (rexp_enum N)} \ +(rexp_enum N)") + apply (smt (verit, del_insts) UnCI mem_Collect_eq old.nat.inject rsize.simps(5)) + apply (smt (verit, ccfv_threshold) One_nat_def nle_le not_less_eq_eq rexp_enum_case3 size_geq1) + apply(subgoal_tac "x6 \ rexp_enum N") + prefer 2 + + apply force + apply(subgoal_tac "N \ Suc 0") + prefer 2 + apply (metis less_Suc_eq_le non_zero_size rsize.simps(6)) + apply(subgoal_tac "rexp_enum (Suc N) = {(RSEQ r1 r2)|r1 r2 i j. r1 \ (rexp_enum i) \ r2 \ (rexp_enum j) \ i + j = N} \ +{(RALTS (a # rs)) | a rs i j. a \ (rexp_enum i) \ (RALTS rs) \ (rexp_enum j) \ i + j = Suc N \ i \ N \ j \ N} \ +{RSTAR r0 | r0. r0 \ (rexp_enum N)} \ +(rexp_enum N)") + prefer 2 + using rexp_enum_case3 apply presburger + by (metis (mono_tags, lifting) Un_iff mem_Collect_eq) -lemma finite_size_finite_regx: - shows " \l. \rs. ((\r \ (set rs). rsize r < N) \ (distinct rs) \ (length rs) < l) " + + + +definition + "sizeNregex N \ {r. rsize r \ N}" + + - sorry +lemma sizeNregex_covered: + shows "sizeNregex N \ rexp_enum N" + using rexp_enum_covers sizeNregex_def by auto + +lemma finiteness_of_sizeN_regex: + shows "finite (sizeNregex N)" + by (meson enumeration_finite2 rev_finite_subset sizeNregex_covered) + + (*below probably needs proved concurrently*)