thys2/Paper/Paper.thy
author Christian Urban <christian.urban@kcl.ac.uk>
Sat, 29 Jan 2022 23:53:21 +0000
changeset 400 46e5566ad4ba
parent 398 dac6d27c99c6
child 402 1612f2a77bf6
permissions -rw-r--r--
updated

(*<*)
theory Paper
imports 
   "../Lexer"
   "../Simplifying" 
   "../Positions"
   "../SizeBound4" 
   "HOL-Library.LaTeXsugar"
begin

declare [[show_question_marks = false]]

notation (latex output)
  If  ("(\<^latex>\<open>\\textrm{\<close>if\<^latex>\<open>}\<close> (_)/ \<^latex>\<open>\\textrm{\<close>then\<^latex>\<open>}\<close> (_)/ \<^latex>\<open>\\textrm{\<close>else\<^latex>\<open>}\<close> (_))" 10) and
  Cons ("_\<^latex>\<open>\\mbox{$\\,$}\<close>::\<^latex>\<open>\\mbox{$\\,$}\<close>_" [75,73] 73) 


abbreviation 
  "der_syn r c \<equiv> der c r"

notation (latex output)
  der_syn ("_\\_" [79, 1000] 76) and

  ZERO ("\<^bold>0" 81) and 
  ONE ("\<^bold>1" 81) and 
  CH ("_" [1000] 80) and
  ALT ("_ + _" [77,77] 78) and
  SEQ ("_ \<cdot> _" [77,77] 78) and
  STAR ("_\<^sup>\<star>" [79] 78) 

(*>*)

section {* Introduction *}

text {*

In the last fifteen or so years, Brzozowski's derivatives of regular
expressions have sparked quite a bit of interest in the functional
programming and theorem prover communities.  The beauty of
Brzozowski's derivatives \cite{Brzozowski1964} is that they are neatly
expressible in any functional language, and easily definable and
reasoned about in theorem provers---the definitions just consist of
inductive datatypes and simple recursive functions. A mechanised
correctness proof of Brzozowski's matcher in for example HOL4 has been
mentioned by Owens and Slind~\cite{Owens2008}. Another one in
Isabelle/HOL is part of the work by Krauss and Nipkow
\cite{Krauss2011}.  And another one in Coq is given by Coquand and
Siles \cite{Coquand2012}.


The notion of derivatives
\cite{Brzozowski1964}, written @{term "der c r"}, of a regular
expression give a simple solution to the problem of matching a string
@{term s} with a regular expression @{term r}: if the derivative of
@{term r} w.r.t.\ (in succession) all the characters of the string
matches the empty string, then @{term r} matches @{term s} (and {\em
vice versa}). The derivative has the property (which may almost be
regarded as its specification) that, for every string @{term s} and
regular expression @{term r} and character @{term c}, one has @{term
"cs \<in> L(r)"} if and only if \mbox{@{term "s \<in> L(der c r)"}}.


If a regular expression matches a string, then in general there is more
than one way of how the string is matched. There are two commonly used
disambiguation strategies to generate a unique answer: one is called
GREEDY matching \cite{Frisch2004} and the other is POSIX
matching~\cite{POSIX,Kuklewicz,OkuiSuzuki2010,Sulzmann2014,Vansummeren2006}.
For example consider the string @{term xy} and the regular expression
\mbox{@{term "STAR (ALT (ALT x y) xy)"}}. Either the string can be
matched in two `iterations' by the single letter-regular expressions
@{term x} and @{term y}, or directly in one iteration by @{term xy}. The
first case corresponds to GREEDY matching, which first matches with the
left-most symbol and only matches the next symbol in case of a mismatch
(this is greedy in the sense of preferring instant gratification to
delayed repletion). The second case is POSIX matching, which prefers the
longest match.



\begin{figure}[t]
\begin{center}
\begin{tikzpicture}[scale=2,node distance=1.3cm,
                    every node/.style={minimum size=6mm}]
\node (r1)  {@{term "r\<^sub>1"}};
\node (r2) [right=of r1]{@{term "r\<^sub>2"}};
\draw[->,line width=1mm](r1)--(r2) node[above,midway] {@{term "der a DUMMY"}};
\node (r3) [right=of r2]{@{term "r\<^sub>3"}};
\draw[->,line width=1mm](r2)--(r3) node[above,midway] {@{term "der b DUMMY"}};
\node (r4) [right=of r3]{@{term "r\<^sub>4"}};
\draw[->,line width=1mm](r3)--(r4) node[above,midway] {@{term "der c DUMMY"}};
\draw (r4) node[anchor=west] {\;\raisebox{3mm}{@{term nullable}}};
\node (v4) [below=of r4]{@{term "v\<^sub>4"}};
\draw[->,line width=1mm](r4) -- (v4);
\node (v3) [left=of v4] {@{term "v\<^sub>3"}};
\draw[->,line width=1mm](v4)--(v3) node[below,midway] {\<open>inj r\<^sub>3 c\<close>};
\node (v2) [left=of v3]{@{term "v\<^sub>2"}};
\draw[->,line width=1mm](v3)--(v2) node[below,midway] {\<open>inj r\<^sub>2 b\<close>};
\node (v1) [left=of v2] {@{term "v\<^sub>1"}};
\draw[->,line width=1mm](v2)--(v1) node[below,midway] {\<open>inj r\<^sub>1 a\<close>};
\draw (r4) node[anchor=north west] {\;\raisebox{-8mm}{@{term "mkeps"}}};
\end{tikzpicture}
\end{center}
\mbox{}\\[-13mm]

\caption{The two phases of the algorithm by Sulzmann \& Lu \cite{Sulzmann2014},
matching the string @{term "[a,b,c]"}. The first phase (the arrows from 
left to right) is \Brz's matcher building successive derivatives. If the 
last regular expression is @{term nullable}, then the functions of the 
second phase are called (the top-down and right-to-left arrows): first 
@{term mkeps} calculates a value @{term "v\<^sub>4"} witnessing
how the empty string has been recognised by @{term "r\<^sub>4"}. After
that the function @{term inj} ``injects back'' the characters of the string into
the values.
\label{Sulz}}
\end{figure} 


*}

section {* Background *}

text {*
  Sulzmann-Lu algorithm with inj. State that POSIX rules.
  metion slg is correct.


  \begin{figure}[t]
  \begin{center}
  \begin{tabular}{c}
  @{thm[mode=Axiom] Posix.intros(1)}\<open>P\<close>@{term "ONE"} \qquad
  @{thm[mode=Axiom] Posix.intros(2)}\<open>P\<close>@{term "c"}\medskip\\
  @{thm[mode=Rule] Posix.intros(3)[of "s" "r\<^sub>1" "v" "r\<^sub>2"]}\<open>P+L\<close>\qquad
  @{thm[mode=Rule] Posix.intros(4)[of "s" "r\<^sub>2" "v" "r\<^sub>1"]}\<open>P+R\<close>\medskip\\
  $\mprset{flushleft}
   \inferrule
   {@{thm (prem 1) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \qquad
    @{thm (prem 2) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \\\\
    @{thm (prem 3) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}
   {@{thm (concl) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}$\<open>PS\<close>\\
  @{thm[mode=Axiom] Posix.intros(7)}\<open>P[]\<close>\medskip\\
  $\mprset{flushleft}
   \inferrule
   {@{thm (prem 1) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
    @{thm (prem 2) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
    @{thm (prem 3) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \\\\
    @{thm (prem 4) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}
   {@{thm (concl) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}$\<open>P\<star>\<close>
  \end{tabular}
  \end{center}
  \caption{Our inductive definition of POSIX values.}\label{POSIXrules}
  \end{figure}


*}

section {* Bitcoded Derivatives *}

text {*
  bitcoded regexes / decoding / bmkeps
  gets rid of the second phase (only single phase)   
  correctness
*}


section {* Simplification *}

text {*
     Sulzmann \& Lu apply simplification via a fixpoint operation; also does not use erase to filter out duplicates.

   not direct correspondence with PDERs, because of example
   problem with retrieve 

   correctness



   \begin{figure}[t]
  \begin{center}
  \begin{tabular}{c}
  @{thm[mode=Axiom] bs1}\qquad
  @{thm[mode=Axiom] bs2}\qquad
  @{thm[mode=Axiom] bs3}\\
  @{thm[mode=Rule] bs4}\qquad
  @{thm[mode=Rule] bs5}\\
  @{thm[mode=Rule] bs6}\qquad
  @{thm[mode=Rule] bs7}\\
  @{thm[mode=Rule] bs8}\\
  @{thm[mode=Axiom] ss1}\qquad
  @{thm[mode=Rule] ss2}\qquad
  @{thm[mode=Rule] ss3}\\
  @{thm[mode=Axiom] ss4}\qquad
  @{thm[mode=Axiom] ss5}\qquad
  @{thm[mode=Rule] ss6}\\
  \end{tabular}
  \end{center}
  \caption{???}\label{SimpRewrites}
  \end{figure}
*}

section {* Bound - NO *}

section {* Bounded Regex / Not *}

section {* Conclusion *}

text {*

\cite{AusafDyckhoffUrban2016}

%%\bibliographystyle{plain}
\bibliography{root}
*}

(*<*)
end
(*>*)