thys3/Paper.thy
changeset 599 a5f666410101
parent 578 e71a6e2aca2d
child 615 8881a09a06fd
equal deleted inserted replaced
598:2c9a3aba8ebc 599:a5f666410101
   261   and @{text "r"}$^{\{n..m\}}$ which specify intervals for how many
   261   and @{text "r"}$^{\{n..m\}}$ which specify intervals for how many
   262   times @{text r} should match. The results presented in this paper
   262   times @{text r} should match. The results presented in this paper
   263   extend straightforwardly to them too. The importance of the bounded
   263   extend straightforwardly to them too. The importance of the bounded
   264   regular expressions is that they are often used in practical
   264   regular expressions is that they are often used in practical
   265   applications, such as Snort (a system for detecting network
   265   applications, such as Snort (a system for detecting network
   266   intrusion) and also in XML Schema definitions. According to Bj\"{o}rklund et
   266   intrusions) and also in XML Schema definitions. According to Bj\"{o}rklund et
   267   al~\cite{BjorklundMartensTimm2015}, bounded regular expressions 
   267   al~\cite{BjorklundMartensTimm2015}, bounded regular expressions 
   268   occur frequently in the latter and can have counters of up to
   268   occur frequently in the latter and can have counters of up to
   269   ten million.  The problem is that tools based on the classic notion
   269   ten million.  The problem is that tools based on the classic notion
   270   of automata need to expand @{text "r"}$^{\{n\}}$ into @{text n}
   270   of automata need to expand @{text "r"}$^{\{n\}}$ into @{text n}
   271   connected copies of the automaton for @{text r}. This leads to very
   271   connected copies of the automaton for @{text r}. This leads to very
   716   %\textit{decode}:
   716   %\textit{decode}:
   717 
   717 
   718   \begin{figure}
   718   \begin{figure}
   719   \begin{center}
   719   \begin{center}
   720   \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
   720   \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
   721   \multicolumn{3}{@ {}l}{$\textit{decode}'\,bs\,(\ONE)$ $\;\dn\;$ $(\Empty, bs)$ \quad\qquad
   721   $\textit{decode}'\,bs\,(\ONE)$ & $\;\dn\;$ & $(\Empty, bs)$\\
   722   $\textit{decode}'\,bs\,(c)$ $\;\dn\;$ $(\Char\,c, bs)$}\\
   722   $\textit{decode}'\,bs\,(c)$ & $\;\dn\;$ & $(\Char\,c, bs)$\\
   723   $\textit{decode}'\,(\Z\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
   723   $\textit{decode}'\,(\Z\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
   724      $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}\;
   724      $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}\;
   725        (\Left\,v, bs_1)$\\
   725        (\Left\,v, bs_1)$\\
   726   $\textit{decode}'\,(\S\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
   726   $\textit{decode}'\,(\S\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
   727      $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_2\;\textit{in}\;
   727      $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_2\;\textit{in}\;
   733   $\textit{decode}'\,(\S\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\
   733   $\textit{decode}'\,(\S\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\
   734   $\textit{decode}'\,(\Z\!::\!bs)\,(r^*)$ & $\dn$ & 
   734   $\textit{decode}'\,(\Z\!::\!bs)\,(r^*)$ & $\dn$ & 
   735          $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r\;\textit{in}$\\
   735          $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r\;\textit{in}$\\
   736   & &   $\textit{let}\,(\Stars\,vs, bs_2) = \textit{decode}'\,bs_1\,r^*$
   736   & &   $\textit{let}\,(\Stars\,vs, bs_2) = \textit{decode}'\,bs_1\,r^*$
   737         \hspace{2mm}$\textit{in}\;(\Stars\,v\!::\!vs, bs_2)$\\
   737         \hspace{2mm}$\textit{in}\;(\Stars\,v\!::\!vs, bs_2)$\\
   738   $\textit{decode}'\,(\S\!::\!bs)\,(r^{\{n\}})$ & $\dn$ & $(\Stars\,[], bs)$\\
   738   $\textit{decode}'\,bs\,(r^{\{n\}})$ & $\dn$ & $\textit{decode}'\,bs\,r^*$\smallskip\medskip\\
   739   $\textit{decode}'\,(\Z\!::\!bs)\,(r^{\{n\}})$ & $\dn$ & 
   739 
   740          $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r\;\textit{in}$\\
   740   $\textit{decode}\,bs\,r$ & $\dn$ & 
   741   & &   $\textit{let}\,(\Stars\,vs, bs_2) = \textit{decode}'\,bs_1\,r^{\{n - 1\}}$
   741      $\textit{let}\,(v, bs') = \textit{decode}'\,bs\,r\;\textit{in}$\\
   742         \hspace{2mm}$\textit{in}\;(\Stars\,v\!::\!vs, bs_2)$\medskip\\	
   742   & & $\;\;\;\,\textit{if}\;bs' = []\;\textit{then}\;\textit{Some}\,v\;\textit{else}\;\textit{None}$\\[-4mm]   
   743   \multicolumn{3}{@ {}l}{$\textit{decode}\,bs\,r$ $\dn$ 
       
   744      $\textit{let}\,(v, bs') = \textit{decode}'\,bs\,r\;\textit{in}$
       
   745      $\;\,\textit{if}\;bs' = []\;\textit{then}\;\textit{Some}\,v\;
       
   746        \textit{else}\;\textit{None}$}\\[-4mm]   
       
   747   \end{tabular}    
   743   \end{tabular}    
   748   \end{center}
   744   \end{center}
   749   \caption{Two functions, called $\textit{decode}'$ and \textit{decode}, for decoding a value from a bitsequence with the help of a regular expression.\\[-5mm]}\label{decode}
   745   \caption{Two functions, called $\textit{decode}'$ and \textit{decode}, for decoding a value from a bitsequence with the help of a regular expression.\\[-5mm]}\label{decode}
   750   \end{figure}
   746   \end{figure}
   751 
   747 
  1035 We can then prove the correctness of \textit{blexer}---it indeed
  1031 We can then prove the correctness of \textit{blexer}---it indeed
  1036 produces the same result as \textit{lexer}.
  1032 produces the same result as \textit{lexer}.
  1037 
  1033 
  1038 
  1034 
  1039 \begin{theorem}\label{thmone}
  1035 \begin{theorem}\label{thmone}
  1040 $\textit{lexer}\,r\,s = \textit{blexer}\,r\,s$
  1036 $\textit{blexer}\,r\,s = \textit{lexer}\,r\,s$
  1041 \end{theorem}  
  1037 \end{theorem}  
  1042 
  1038 
  1043 
  1039 
  1044 
  1040 
  1045 \noindent This establishes that the bitcoded algorithm \emph{without}
  1041 \noindent This establishes that the bitcoded algorithm \emph{without}
  1099      expressions is that they can be easily modified such that simplification does not
  1095      expressions is that they can be easily modified such that simplification does not
  1100      interfere with the value constructions. For example we can ``flatten'', or
  1096      interfere with the value constructions. For example we can ``flatten'', or
  1101      de-nest, or spill out, @{text ALTs} as follows
  1097      de-nest, or spill out, @{text ALTs} as follows
  1102      %
  1098      %
  1103      \[
  1099      \[
  1104      @{term "ALTs bs\<^sub>1 ((ALTs bs\<^sub>2 rs\<^sub>2) # rs\<^sub>1)"}
  1100      @{term "ALTs bs\<^sub>1 (((ALTs bs\<^sub>2 rs\<^sub>2)) # rs\<^sub>1)"}
  1105      \quad\xrightarrow{bsimp}\quad
  1101      \quad\xrightarrow{bsimp}\quad
  1106      @{term "ALTs bs\<^sub>1 ((map (fuse bs\<^sub>2) rs\<^sub>2) @ rs\<^sub>1)"}
  1102      @{term "ALTs bs\<^sub>1 ((map (fuse bs\<^sub>2) rs\<^sub>2) @ rs\<^sub>1)"}
  1107      \]
  1103      \]
  1108 
  1104 
  1109      \noindent
  1105      \noindent
  1375      %\end{proof}
  1371      %\end{proof}
  1376 
  1372 
  1377      \noindent
  1373      \noindent
  1378      With these lemmas in place we can finally establish that @{term "blexer_simp"} and @{term "blexer"}
  1374      With these lemmas in place we can finally establish that @{term "blexer_simp"} and @{term "blexer"}
  1379      generate the same value, and using Theorem~\ref{thmone} from the previous section that this value
  1375      generate the same value, and using Theorem~\ref{thmone} from the previous section that this value
  1380      is indeed the POSIX value.
  1376      is indeed the POSIX value as generated by \textit{lexer}.
  1381      
  1377      
  1382      \begin{theorem}
  1378      \begin{theorem}
  1383      @{thm[mode=IfThen] main_blexer_simp}
  1379      @{thm[mode=IfThen] main_blexer_simp[symmetric]} \; (@{text "= lexer r s"}\; by Thm.~\ref{thmone})
  1384      \end{theorem}
  1380      \end{theorem}
  1385 
  1381 
  1386      %\begin{proof}
  1382      %\begin{proof}
  1387      %By unfolding the definitions and using Lemmas~\ref{lemtwo} and \ref{lemthree}. 	
  1383      %By unfolding the definitions and using Lemmas~\ref{lemtwo} and \ref{lemthree}. 	
  1388      %\end{proof}
  1384      %\end{proof}
  1541    obscure, examples.
  1537    obscure, examples.
  1542    %We found that from an implementation
  1538    %We found that from an implementation
  1543    %point-of-view it is really important to have the formal proofs of
  1539    %point-of-view it is really important to have the formal proofs of
  1544    %the corresponding properties at hand.
  1540    %the corresponding properties at hand.
  1545 
  1541 
  1546    We can of course only make a claim about the correctness and the sizes of the
  1542    With the results reported here, we can of course only make a claim about the correctness
       
  1543    of the algorithm and the sizes of the
  1547    derivatives, not about the efficiency or runtime of our version of
  1544    derivatives, not about the efficiency or runtime of our version of
  1548    Sulzman and Lu's algorithm. But we found the size is an important
  1545    Sulzman and Lu's algorithm. But we found the size is an important
  1549    first indicator about efficiency: clearly if the derivatives can
  1546    first indicator about efficiency: clearly if the derivatives can
  1550    grow to arbitrarily big sizes and the algorithm needs to traverse
  1547    grow to arbitrarily big sizes and the algorithm needs to traverse
  1551    the derivatives possibly several times, then the algorithm will be
  1548    the derivatives possibly several times, then the algorithm will be