ChengsongPhdThesis/ChengsongPhDThesis.tex
changeset 443 c6351a96bf80
parent 441 426a93160f4a
child 444 a7e98deebb5c
equal deleted inserted replaced
442:09a57446696a 443:c6351a96bf80
   376 
   376 
   377 \section{My Work}
   377 \section{My Work}
   378 
   378 
   379 \subsection{an improved version of bit-coded algorithm: with simp!}
   379 \subsection{an improved version of bit-coded algorithm: with simp!}
   380 
   380 
   381 \subsection{a correctness proof for bitcoded}
   381 \subsection{a correctness proof for bitcoded algorithm}
   382 
   382 
   383 \subsection{finiteness proof }
   383 \subsection{finiteness proof }
   384 
   384 \subsubsection{closed form}
       
   385 We can give the derivative of regular expressions
       
   386 with respect to string a closed form with respect to simplification:
       
   387 \begin{itemize}
       
   388 \item
       
   389 closed form for sequences:
       
   390 \begin{verbatim}
       
   391 lemma seq_closed_form: shows 
       
   392 "rsimp (rders_simp (RSEQ r1 r2) s) = 
       
   393 rsimp ( RALTS ( (RSEQ (rders_simp r1 s) r2) # 
       
   394                 (map (rders r2) (vsuf s r1)) 
       
   395               )  
       
   396       )"
       
   397 \end{verbatim}
       
   398 where the recursive function $\textit{vsuf}$ is defined as
       
   399 \begin{verbatim}
       
   400 fun vsuf :: "char list -> rrexp -> char list list" where
       
   401 "vsuf [] _ = []"
       
   402 |"vsuf (c#cs) r1 = (if (rnullable r1) then  (vsuf cs (rder c r1)) @ [c # cs]
       
   403                                       else  (vsuf cs (rder c r1))
       
   404                    ) "
       
   405 
       
   406 \end{verbatim}
       
   407 \item
       
   408 closed form for stars:
       
   409 \begin{verbatim}
       
   410 lemma star_closed_form:
       
   411   shows "rders_simp (RSTAR r0) (c#s) = 
       
   412 rsimp ( RALTS ( 
       
   413 (map (\lambda s1. RSEQ (rders_simp r0 s1) (RSTAR r0) ) 
       
   414 (star_updates s r [[c]]) ) ))"
       
   415 \end{verbatim}
       
   416 where the recursive function $\textit{star}\_\textit{updates}$ is defined as
       
   417 \begin{verbatim}
       
   418 fun star_update :: "char -> rrexp -> char list list -> char list list" where
       
   419 "star_update c r [] = []"
       
   420 |"star_update c r (s # Ss) = (if (rnullable (rders_simp r s)) 
       
   421                                 then (s@[c]) # [c] # (star_update c r Ss) 
       
   422                                else   (s@[c]) # (star_update c r Ss) )"
       
   423 
       
   424 fun star_updates :: "char list -> rrexp -> char list list -> char list list"
       
   425   where
       
   426 "star_updates [] r Ss = Ss"
       
   427 | "star_updates (c # cs) r Ss = star_updates cs r (star_update c r Ss)"
       
   428 
       
   429 \end{verbatim}
       
   430 
       
   431 
       
   432 \end{itemize}
       
   433 These closed form is a formalization of the intuition 
       
   434  that we can push in the derivatives
       
   435 of compound regular expressions to its sub-expressions, and the resulting
       
   436 expression is a linear combination of those sub-expressions' derivatives.
       
   437 \subsubsection{Estimation of closed forms' size}
       
   438 And thanks to $\textit{distinctBy}$ helping with deduplication,
       
   439 the linear combination can be  bounded by the set enumerating all 
       
   440 regular expressions up to a certain size :
       
   441 \begin{verbatim}
       
   442 
       
   443 lemma star_closed_form_bounded_by_rdistinct_list_estimate:
       
   444   shows "rsize (rsimp ( RALTS ( (map (\lambda s1. RSEQ (rders_simp r0 s1) (RSTAR r0) )
       
   445          (star_updates s r [[c]]) ) ))) <=
       
   446         Suc (sum_list (map rsize (rdistinct (map (\lambda s1. RSEQ (rders_simp r0 s1) (RSTAR r0) )
       
   447          (star_updates s r [[c]]) ) {})  ) )"
       
   448          
       
   449          lemma distinct_list_rexp_up_to_certain_size_bouded_by_set_enumerating_up_to_that_size:
       
   450   shows "\forallr\in set rs. (rsize r ) <= N ==> sum_list (map rsize (rdistinct rs {})) <=
       
   451          (card (rexp_enum N))* N"
       
   452          
       
   453          lemma ind_hypo_on_ders_leads_to_stars_bounded:
       
   454   shows "\foralls. rsize (rders_simp r0 s) <= N ==>        
       
   455       (sum_list (map rsize (rdistinct (map (\lambda s1. RSEQ (rders_simp r0 s1) (RSTAR r0) )
       
   456          (star_updates s r [[c]]) ) {})  ) ) <= 
       
   457 (card (rexp_enum (Suc (N + rsize (RSTAR r0))))) * (Suc (N + rsize (RSTAR r0)))
       
   458 "
       
   459 \end{verbatim}
       
   460 
       
   461 With the above 3 lemmas, we can argue that the inductive hypothesis
       
   462 $r_0$'s derivatives is bounded above  leads to $r_0^*$'s
       
   463 derivatives being bounded above.
       
   464 \begin{verbatim}
       
   465 
       
   466 lemma r0_bounded_star_bounded:
       
   467   shows "\foralls. rsize (rders_simp r0 s) <= N ==>
       
   468              \foralls. rsize (rders_simp (RSTAR r0) s) <= 
       
   469 (card (rexp_enum (Suc (N + rsize (RSTAR r0))))) * (Suc (N + rsize (RSTAR r0)))"
       
   470 \end{verbatim}
       
   471 
       
   472 
       
   473 And we have a similar argument for the sequence case.
   385 \subsection{stronger simplification}
   474 \subsection{stronger simplification}
   386 
   475 
   387 
   476 
   388 \subsection{cubic bound}
   477 \subsection{cubic bound}
   389 
   478 
   399 
   488 
   400 \subsection{Too Detailed too basic? regex to NFA translation}
   489 \subsection{Too Detailed too basic? regex to NFA translation}
   401 
   490 
   402 Deciding whether a string is in the language of the regex
   491 Deciding whether a string is in the language of the regex
   403 can be intuitively done by constructing an NFA\cite{Thompson_1968}:
   492 can be intuitively done by constructing an NFA\cite{Thompson_1968}:
   404 and simulate the running of it:
   493 and simulate the running of it.
   405 \begin{figure}
       
   406 \centering
       
   407 \includegraphics[scale=0.5]{pics/regex_nfa_base.png}
       
   408 \end{figure}
       
   409 
       
   410 \begin{figure}
       
   411 \centering
       
   412 \includegraphics[scale=0.5]{pics/regex_nfa_seq1.png}
       
   413 \end{figure}
       
   414 
       
   415 \begin{figure}
       
   416 \centering
       
   417 \includegraphics[scale=0.5]{pics/regex_nfa_seq2.png}
       
   418 \end{figure}
       
   419 
       
   420 \begin{figure}
       
   421 \centering
       
   422 \includegraphics[scale=0.5]{pics/regex_nfa_alt.png}
       
   423 \end{figure}
       
   424 
       
   425 
       
   426 \begin{figure}
       
   427 \centering
       
   428 \includegraphics[scale=0.5]{pics/regex_nfa_star.png}
       
   429 \end{figure}
       
   430 
   494 
   431 Which should be simple enough that modern programmers
   495 Which should be simple enough that modern programmers
   432 have no problems with it at all?
   496 have no problems with it at all?
   433 Not really:
   497 Not really:
   434 
   498 
  1896 to contrast our (faster) version of bitcoded algorithm with the
  1960 to contrast our (faster) version of bitcoded algorithm with the
  1897 Symbolic Regex Matcher, the RE2, the Rust Regex Engine, and the static
  1961 Symbolic Regex Matcher, the RE2, the Rust Regex Engine, and the static
  1898 analysis approach by implementing them in the same language and then compare
  1962 analysis approach by implementing them in the same language and then compare
  1899 their performance.
  1963 their performance.
  1900 
  1964 
       
  1965 
       
  1966 \section{discarded}
       
  1967 haha
  1901 \bibliographystyle{plain}
  1968 \bibliographystyle{plain}
  1902 \bibliography{root,regex_time_complexity}
  1969 \bibliography{root,regex_time_complexity}
  1903 
  1970 
  1904 
  1971 
  1905 
  1972