diff -r cc8e231529fb -r e1b74d618f1b thys2/Paper/Paper.thy --- a/thys2/Paper/Paper.thy Tue Jan 25 13:12:50 2022 +0000 +++ b/thys2/Paper/Paper.thy Thu Jan 27 23:25:26 2022 +0000 @@ -7,7 +7,164 @@ "../SizeBound4" "HOL-Library.LaTeXsugar" begin + +declare [[show_question_marks = false]] + +abbreviation + "der_syn r c \ der c r" + +notation (latex output) + der_syn ("_\\_" [79, 1000] 76) and + + ZERO ("\<^bold>0" 81) and + ONE ("\<^bold>1" 81) and + CH ("_" [1000] 80) and + ALT ("_ + _" [77,77] 78) and + SEQ ("_ \ _" [77,77] 78) and + STAR ("_\<^sup>\" [79] 78) + (*>*) + +section {* Introduction *} + +text {* + +Brzozowski \cite{Brzozowski1964} introduced the notion of the {\em +derivative} @{term "der c r"} of a regular expression \r\ w.r.t.\ +a character~\c\, and showed that it gave a simple solution to the +problem of matching a string @{term s} with a regular expression @{term +r}: if the derivative of @{term r} w.r.t.\ (in succession) all the +characters of the string matches the empty string, then @{term r} +matches @{term s} (and {\em vice versa}). The derivative has the +property (which may almost be regarded as its specification) that, for +every string @{term s} and regular expression @{term r} and character +@{term c}, one has @{term "cs \ L(r)"} if and only if \mbox{@{term "s \ L(der c r)"}}. +The beauty of Brzozowski's derivatives is that +they are neatly expressible in any functional language, and easily +definable and reasoned about in theorem provers---the definitions just +consist of inductive datatypes and simple recursive functions. A +mechanised correctness proof of Brzozowski's matcher in for example HOL4 +has been mentioned by Owens and Slind~\cite{Owens2008}. Another one in +Isabelle/HOL is part of the work by Krauss and Nipkow \cite{Krauss2011}. +And another one in Coq is given by Coquand and Siles \cite{Coquand2012}. + +If a regular expression matches a string, then in general there is more +than one way of how the string is matched. There are two commonly used +disambiguation strategies to generate a unique answer: one is called +GREEDY matching \cite{Frisch2004} and the other is POSIX +matching~\cite{POSIX,Kuklewicz,OkuiSuzuki2010,Sulzmann2014,Vansummeren2006}. +For example consider the string @{term xy} and the regular expression +\mbox{@{term "STAR (ALT (ALT x y) xy)"}}. Either the string can be +matched in two `iterations' by the single letter-regular expressions +@{term x} and @{term y}, or directly in one iteration by @{term xy}. The +first case corresponds to GREEDY matching, which first matches with the +left-most symbol and only matches the next symbol in case of a mismatch +(this is greedy in the sense of preferring instant gratification to +delayed repletion). The second case is POSIX matching, which prefers the +longest match. + + + +\begin{figure}[t] +\begin{center} +\begin{tikzpicture}[scale=2,node distance=1.3cm, + every node/.style={minimum size=6mm}] +\node (r1) {@{term "r\<^sub>1"}}; +\node (r2) [right=of r1]{@{term "r\<^sub>2"}}; +\draw[->,line width=1mm](r1)--(r2) node[above,midway] {@{term "der a DUMMY"}}; +\node (r3) [right=of r2]{@{term "r\<^sub>3"}}; +\draw[->,line width=1mm](r2)--(r3) node[above,midway] {@{term "der b DUMMY"}}; +\node (r4) [right=of r3]{@{term "r\<^sub>4"}}; +\draw[->,line width=1mm](r3)--(r4) node[above,midway] {@{term "der c DUMMY"}}; +\draw (r4) node[anchor=west] {\;\raisebox{3mm}{@{term nullable}}}; +\node (v4) [below=of r4]{@{term "v\<^sub>4"}}; +\draw[->,line width=1mm](r4) -- (v4); +\node (v3) [left=of v4] {@{term "v\<^sub>3"}}; +\draw[->,line width=1mm](v4)--(v3) node[below,midway] {\inj r\<^sub>3 c\}; +\node (v2) [left=of v3]{@{term "v\<^sub>2"}}; +\draw[->,line width=1mm](v3)--(v2) node[below,midway] {\inj r\<^sub>2 b\}; +\node (v1) [left=of v2] {@{term "v\<^sub>1"}}; +\draw[->,line width=1mm](v2)--(v1) node[below,midway] {\inj r\<^sub>1 a\}; +\draw (r4) node[anchor=north west] {\;\raisebox{-8mm}{@{term "mkeps"}}}; +\end{tikzpicture} +\end{center} +\mbox{}\\[-13mm] + +\caption{The two phases of the algorithm by Sulzmann \& Lu \cite{Sulzmann2014}, +matching the string @{term "[a,b,c]"}. The first phase (the arrows from +left to right) is \Brz's matcher building successive derivatives. If the +last regular expression is @{term nullable}, then the functions of the +second phase are called (the top-down and right-to-left arrows): first +@{term mkeps} calculates a value @{term "v\<^sub>4"} witnessing +how the empty string has been recognised by @{term "r\<^sub>4"}. After +that the function @{term inj} ``injects back'' the characters of the string into +the values. +\label{Sulz}} +\end{figure} + + +*} + +section {* Background *} + +text {* + Sulzmann-Lu algorithm with inj. State that POSIX rules. + metion slg is correct. + + + \begin{figure}[t] + \begin{center} + \begin{tabular}{c} + @{thm[mode=Axiom] Posix.intros(1)}\P\@{term "ONE"} \qquad + @{thm[mode=Axiom] Posix.intros(2)}\P\@{term "c"}\medskip\\ + @{thm[mode=Rule] Posix.intros(3)[of "s" "r\<^sub>1" "v" "r\<^sub>2"]}\P+L\\qquad + @{thm[mode=Rule] Posix.intros(4)[of "s" "r\<^sub>2" "v" "r\<^sub>1"]}\P+R\\medskip\\ + $\mprset{flushleft} + \inferrule + {@{thm (prem 1) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \qquad + @{thm (prem 2) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \\\\ + @{thm (prem 3) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}} + {@{thm (concl) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}$\PS\\\ + @{thm[mode=Axiom] Posix.intros(7)}\P[]\\medskip\\ + $\mprset{flushleft} + \inferrule + {@{thm (prem 1) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad + @{thm (prem 2) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad + @{thm (prem 3) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \\\\ + @{thm (prem 4) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}} + {@{thm (concl) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}$\P\\ + \end{tabular} + \end{center} + \caption{Our inductive definition of POSIX values.}\label{POSIXrules} + \end{figure} + + +*} + +section {* Bitcoded Derivatives *} + +text {* + bitcoded regexes / decoding / bmkeps + gets rid of the second phase (only single phase) + correctness +*} + + +section {* Simplification *} + +text {* + not direct correspondence with PDERs, because of example + problem with retrieve + + correctness +*} + +section {* Bound - NO *} + +section {* Bounded Regex / Not *} + +section {* Conclusion *} + text {* \cite{AusafDyckhoffUrban2016}