diff -r 09a57446696a -r c6351a96bf80 ChengsongPhdThesis/ChengsongPhDThesis.tex --- a/ChengsongPhdThesis/ChengsongPhDThesis.tex Mon Mar 07 12:27:27 2022 +0000 +++ b/ChengsongPhdThesis/ChengsongPhDThesis.tex Tue Mar 08 00:50:40 2022 +0000 @@ -378,10 +378,99 @@ \subsection{an improved version of bit-coded algorithm: with simp!} -\subsection{a correctness proof for bitcoded} +\subsection{a correctness proof for bitcoded algorithm} \subsection{finiteness proof } +\subsubsection{closed form} +We can give the derivative of regular expressions +with respect to string a closed form with respect to simplification: +\begin{itemize} +\item +closed form for sequences: +\begin{verbatim} +lemma seq_closed_form: shows +"rsimp (rders_simp (RSEQ r1 r2) s) = +rsimp ( RALTS ( (RSEQ (rders_simp r1 s) r2) # + (map (rders r2) (vsuf s r1)) + ) + )" +\end{verbatim} +where the recursive function $\textit{vsuf}$ is defined as +\begin{verbatim} +fun vsuf :: "char list -> rrexp -> char list list" where +"vsuf [] _ = []" +|"vsuf (c#cs) r1 = (if (rnullable r1) then (vsuf cs (rder c r1)) @ [c # cs] + else (vsuf cs (rder c r1)) + ) " +\end{verbatim} +\item +closed form for stars: +\begin{verbatim} +lemma star_closed_form: + shows "rders_simp (RSTAR r0) (c#s) = +rsimp ( RALTS ( +(map (\lambda s1. RSEQ (rders_simp r0 s1) (RSTAR r0) ) +(star_updates s r [[c]]) ) ))" +\end{verbatim} +where the recursive function $\textit{star}\_\textit{updates}$ is defined as +\begin{verbatim} +fun star_update :: "char -> rrexp -> char list list -> char list list" where +"star_update c r [] = []" +|"star_update c r (s # Ss) = (if (rnullable (rders_simp r s)) + then (s@[c]) # [c] # (star_update c r Ss) + else (s@[c]) # (star_update c r Ss) )" + +fun star_updates :: "char list -> rrexp -> char list list -> char list list" + where +"star_updates [] r Ss = Ss" +| "star_updates (c # cs) r Ss = star_updates cs r (star_update c r Ss)" + +\end{verbatim} + + +\end{itemize} +These closed form is a formalization of the intuition + that we can push in the derivatives +of compound regular expressions to its sub-expressions, and the resulting +expression is a linear combination of those sub-expressions' derivatives. +\subsubsection{Estimation of closed forms' size} +And thanks to $\textit{distinctBy}$ helping with deduplication, +the linear combination can be bounded by the set enumerating all +regular expressions up to a certain size : +\begin{verbatim} + +lemma star_closed_form_bounded_by_rdistinct_list_estimate: + shows "rsize (rsimp ( RALTS ( (map (\lambda s1. RSEQ (rders_simp r0 s1) (RSTAR r0) ) + (star_updates s r [[c]]) ) ))) <= + Suc (sum_list (map rsize (rdistinct (map (\lambda s1. RSEQ (rders_simp r0 s1) (RSTAR r0) ) + (star_updates s r [[c]]) ) {}) ) )" + + lemma distinct_list_rexp_up_to_certain_size_bouded_by_set_enumerating_up_to_that_size: + shows "\forallr\in set rs. (rsize r ) <= N ==> sum_list (map rsize (rdistinct rs {})) <= + (card (rexp_enum N))* N" + + lemma ind_hypo_on_ders_leads_to_stars_bounded: + shows "\foralls. rsize (rders_simp r0 s) <= N ==> + (sum_list (map rsize (rdistinct (map (\lambda s1. RSEQ (rders_simp r0 s1) (RSTAR r0) ) + (star_updates s r [[c]]) ) {}) ) ) <= +(card (rexp_enum (Suc (N + rsize (RSTAR r0))))) * (Suc (N + rsize (RSTAR r0))) +" +\end{verbatim} + +With the above 3 lemmas, we can argue that the inductive hypothesis +$r_0$'s derivatives is bounded above leads to $r_0^*$'s +derivatives being bounded above. +\begin{verbatim} + +lemma r0_bounded_star_bounded: + shows "\foralls. rsize (rders_simp r0 s) <= N ==> + \foralls. rsize (rders_simp (RSTAR r0) s) <= +(card (rexp_enum (Suc (N + rsize (RSTAR r0))))) * (Suc (N + rsize (RSTAR r0)))" +\end{verbatim} + + +And we have a similar argument for the sequence case. \subsection{stronger simplification} @@ -401,32 +490,7 @@ Deciding whether a string is in the language of the regex can be intuitively done by constructing an NFA\cite{Thompson_1968}: -and simulate the running of it: -\begin{figure} -\centering -\includegraphics[scale=0.5]{pics/regex_nfa_base.png} -\end{figure} - -\begin{figure} -\centering -\includegraphics[scale=0.5]{pics/regex_nfa_seq1.png} -\end{figure} - -\begin{figure} -\centering -\includegraphics[scale=0.5]{pics/regex_nfa_seq2.png} -\end{figure} - -\begin{figure} -\centering -\includegraphics[scale=0.5]{pics/regex_nfa_alt.png} -\end{figure} - - -\begin{figure} -\centering -\includegraphics[scale=0.5]{pics/regex_nfa_star.png} -\end{figure} +and simulate the running of it. Which should be simple enough that modern programmers have no problems with it at all? @@ -1898,6 +1962,9 @@ analysis approach by implementing them in the same language and then compare their performance. + +\section{discarded} +haha \bibliographystyle{plain} \bibliography{root,regex_time_complexity}