\documentclass{beamer}\usepackage{tikz}\usepackage[english]{babel}\usepackage{proof}\usetheme{Luebeck}\usetikzlibrary{positioning}\usetikzlibrary{decorations.pathreplacing}\definecolor{darkblue}{rgb}{0,0,.803}\definecolor{cream}{rgb}{1,1,.8}\newcommand{\smath}[1]{\textcolor{darkblue}{\ensuremath{#1}}}\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}\newcommand{\Zero}{{\bf 0}}\newcommand{\One}{{\bf 1}}\newenvironment{bubble}[1][]{%\addtolength{\leftmargini}{4mm}%\begin{tikzpicture}[baseline=(current bounding box.north)]%\draw (0,0) node[inner sep=2mm,fill=cream,ultra thick,draw=red,rounded corners=2mm]% \bgroup\begin{minipage}{#1}\raggedright{}}{\end{minipage}\egroup;%\end{tikzpicture}\bigskip}\newcommand\grid[1]{% \begin{tikzpicture}[baseline=(char.base)] \path[use as bounding box] (0,0) rectangle (1em,1em); \draw[red!50, fill=red!20] (0,0) rectangle (1em,1em); \node[inner sep=1pt,anchor=base west] (char) at (0em,\gridraiseamount) {#1}; \end{tikzpicture}}\newcommand\gridraiseamount{0.12em}\makeatletter\newcommand\Grid[1]{% \@tfor\z:=#1\do{\grid{\z}}}\makeatother \newcommand\Vspace[1][.3em]{% \mbox{\kern.06em\vrule height.3ex}% \vbox{\hrule width#1}% \hbox{\vrule height.3ex}}\def\VS{\Vspace[0.6em]}\title[POSIX Lexing with Derivatives of Regexes]{\bf POSIX Lexing with\\ \bf Derivatives of Regular Expressions\\ \bf (Proof Pearl)}\author{Fahad Ausaf, Roy Dyckhoff and Christian Urban}\date{King's College London, University of St Andrews}\begin{document}\maketitle %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Brzozowski's Derivatives of Regular Expressions} Idea: If \smath{r} matches the string \smath{c\!::\!s}, what is a regular expression that matches just \smath{s}? \\ \begin{center} \begin{tabular}{l@{\hspace{5mm}}lcl} chars: &\smath{\Zero \backslash c} & \smath{\dn} & \smath{\Zero}\\ &\smath{\One \backslash c} & \smath{\dn} & \smath{\Zero}\\ &\smath{d \backslash c} & \smath{\dn} & \smath{\textit{if}\;d = c\;\textit{then}\;\One\;\textit{else}\;\Zero}\\ &\smath{r_1 + r_2 \backslash c} & \smath{\dn} & \smath{r_1 \backslash c \,+\, r_2 \backslash c}\\ &\smath{r_1 \cdot r_2 \backslash c} & \smath{\dn} & \smath{\textit{if}\;\textit{nullable}\;r_1}\\ && & \smath{\textit{then}\;r_1\backslash c \cdot r_2 \,+\, r_2\backslash c \;\textit{else}\;r_1\backslash c \cdot r_2}\\ &\smath{r^* \backslash c} & \smath{\dn} & \smath{r\backslash c \,\cdot\, r^*}\bigskip\\ strings: &\smath{r\backslash []} & \smath{\dn} & \smath{r}\\ &\smath{r\backslash c\!::\!s} & \smath{\dn} & \smath{(r\backslash c)\backslash s}\\ \end{tabular} \end{center} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Brzozowski's Matcher} Does \smath{r_1} match string \smath{abc}? \begin{center} \begin{tikzpicture}[scale=2,node distance=1.3cm,every node/.style={minimum size=8mm}] \node (r1) {\smath{r_1}}; \node (r2) [right=of r1] {\smath{r_2}}; \draw[->,line width=1mm] (r1) -- (r2) node[above,midway] {\smath{\_\backslash a}}; \node (r3) [right=of r2] {\smath{r_3}}; \draw[->,line width=1mm] (r2) -- (r3) node[above,midway] {\smath{\_\backslash b}}; \node (r4) [right=of r3] {\smath{r_4}}; \draw[->,line width=1mm] (r3) -- (r4) node[above,midway] {\smath{\_\backslash c}}; \draw (r4) node[anchor=west] {\;\raisebox{3mm}{\smath{\;\;\textit{nullable}?}}}; \end{tikzpicture} \end{center}\pause It leads to an elegant functional program: \begin{center} \smath{\textit{matches}\,(r, s) \dn \textit{nullable}\,(r\backslash s)} \end{center}\pause It is an easy exercise to formally prove (e.g.~Coq, HOL, Isabelle): \begin{center} \smath{\textit{matches}\,(r, s)} if and only if \smath{s \in L(r)} \end{center}\pause {\bf But Brzozowski's matcher gives only a yes/no-answer.} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}% \frametitle{Sulzmann and Lu's Matcher}% \begin{itemize}% \item Sulzmann \& Lu came up with a beautiful idea for how % to extend the simple regular expression matcher to POSIX % matching/lexing (FLOPS 2014)% \item The idea: define an inverse operation to the derivatives% \end{itemize}% \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Sulzmann and Lu's Matcher} Sulzmann and Lu added a second phase in order to answer \alert{\textbf{how}} the regular expression matched the string. \begin{center}\small \begin{tikzpicture}[scale=1,node distance=0.8cm,every node/.style={minimum size=7mm}] \node (r1) {\smath{r_1}}; \node (r2) [right=of r1] {\smath{r_2}}; \draw[->,line width=1mm] (r1) -- (r2) node[above,midway] {\smath{\_\backslash a}}; \node (r3) [right=of r2] {\smath{r_3}}; \draw[->,line width=1mm] (r2) -- (r3) node[above,midway] {\smath{\_\backslash b}}; \node (r4) [right=of r3] {\smath{r_4}}; \draw[->,line width=1mm] (r3) -- (r4) node[above,midway] {\smath{\_\backslash c}}; \draw (r4) node[anchor=west] {\;\raisebox{3mm}{\smath{\;\;nullable?}}}; \node (v4) [below=of r4] {\smath{v_4}}; \draw[->,line width=1mm] (r4) -- (v4); \node (v3) [left=of v4] {\smath{v_3}}; \draw[->,line width=1mm] (v4) -- (v3) node[below,midway] {\smath{inj\,c}}; \node (v2) [left=of v3] {\smath{v_2}}; \draw[->,line width=1mm] (v3) -- (v2) node[below,midway] {\smath{inj\,b}}; \node (v1) [left=of v2] {\smath{v_1}}; \draw[->,line width=1mm] (v2) -- (v1) node[below,midway] {\smath{inj\,a}}; \draw (r4) node[anchor=north west] {\;\raisebox{-8mm}{\smath{mkeps}}}; \draw [decorate,decoration={brace,amplitude=10pt,raise=8mm}, line width=1.5mm] (r1) -- (r4) node [black,midway,above, yshift=12mm] {\large\bf first phase}; \draw [decorate,decoration={brace,amplitude=10pt,raise=8mm}, line width=1.5mm] (v4) -- (v1) node [black,midway,below, yshift=-12mm] {\large\bf second phase}; %% first phase \draw[line width=14mm, rounded corners, opacity=0.1, cap=round,join=round,color=yellow!30] (r1.center) -- (r4.center); %% second phase \draw[line width=14.1mm, rounded corners, opacity=0.2, cap=round,join=round,draw=black, fill=white] (r4) -- (v4.center) -- (v1.center); \draw[line width=14mm, rounded corners, opacity=0.2, cap=round,join=round,color=yellow!30] (r4) -- (v4.center) -- (v1.center); \end{tikzpicture} \end{center} There are several possible answers for \alert{\textbf{how}}: POSIX, GREEDY, \ldots \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{POSIX Matching (needed for Lexing)} \begin{bubble}[10cm] {\bf Longest Match Rule:} The longest initial substring matched by any regular expression is taken as the next token. \end{bubble} \begin{bubble}[10cm] {\bf Rule Priority:} For a particular longest initial substring, the first regular expression that can match determines the token. \end{bubble} For example: \smath{r_{keywords} + r_{identifiers}} \begin{itemize} \item \smath{\texttt{\Grid{iffoo\VS bla}}} \item \smath{\texttt{\Grid{if\VS bla}}} \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Problems with POSIX} Grathwohl, Henglein and Rasmussen wrote: \begin{bubble}[10cm] \it ``The POSIX strategy is more complicated than the greedy because of the dependence on information about the length of matched strings in the various subexpressions.'' \end{bubble}\bigskip Also Kuklewicz maintains a unit-test repository for POSIX matching, which indicates that most POSIX matchcers are buggy. \begin{center} \url{http://www.haskell.org/haskellwiki/Regex_Posix} \end{center} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}% \frametitle{Correctness}% \begin{itemize}% \item Sulzmann \& Lu came up with a beautiful idea for how % to extend the simple regular expression matcher to POSIX % matching/lexing (FLOPS 2014)% \item The idea: define an inverse operation to the derivatives% \end{itemize}% \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Sulzmann and Lu Matcher} We want to match the string $abc$ using $r_1$\\ \begin{center} \begin{tikzpicture}[scale=2,node distance=1.3cm,every node/.style={minimum size=8mm}] \node (r1) {$r_1$}; \node (r2) [right=of r1] {$r_2$}; \draw[->,line width=1mm] (r1) -- (r2) node[above,midway] {$der\,a$}; \node (r3) [right=of r2] {$r_3$}; \draw[->,line width=1mm] (r2) -- (r3) node[above,midway] {$der\,b$}; \node (r4) [right=of r3] {$r_4$}; \draw[->,line width=1mm] (r3) -- (r4) node[above,midway] {$der\,c$}; \draw (r4) node[anchor=west] {\;\raisebox{3mm}{$\;\;nullable?$}};\pause \node (v4) [below=of r4] {$v_4$}; \draw[->,line width=1mm] (r4) -- (v4); \draw (r4) node[anchor=north west] {\;\raisebox{-8mm}{$mkeps$}};\pause \node (v3) [left=of v4] {$v_3$}; \draw[->,line width=1mm] (v4) -- (v3) node[below,midway] {$inj\,c$};\pause \node (v2) [left=of v3] {$v_2$}; \draw[->,line width=1mm] (v3) -- (v2) node[below,midway] {$inj\,b$};\pause \node (v1) [left=of v2] {$v_1$}; \draw[->,line width=1mm] (v2) -- (v1) node[below,midway] {$inj\,a$};\pause %\draw[->,line width=0.5mm] (r3) -- (v3); %\draw[->,line width=0.5mm] (r2) -- (v2); %\draw[->,line width=0.5mm] (r1) -- (v1); \onslide<1-> \end{tikzpicture} \end{center} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Regular Expressions and Values} Regular expressions and their corresponding values (for how a regular expression matched string): \vspace{5 mm} \begin{columns}[c] % the "c" option specifies center vertical alignment \column{.4\textwidth} % column designated by a command \begin{tabular}{ l l l } \smath{r} & ::= & \smath{\Zero} \\ & $\mid$ & \smath{\One} \\ & $\mid$ & \smath{c} \\ & $\mid$ & \smath{r_1 \cdot r_2} \\ & $\mid$ & \smath{r_1 + r_2} \\ \\ & $\mid$ & \smath{r^*} \\ \end{tabular} \column{.4\textwidth} \begin{tabular}{ l l l } \smath{v} & ::= & \\ & $\mid$ & \smath{Empty} \\ & $\mid$ & \smath{Char(c)} \\ & $\mid$ & \smath{Seq(v_1\cdot v_2)} \\ & $\mid$ & \smath{Left(v)} \\ & $\mid$ & \smath{Right(v)} \\ & $\mid$ & \smath{[v_1,...,v_n]} \end{tabular} \end{columns} \vspace{5 mm} There is also a notion of a string behind a value $\mid v \mid$ \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Mkeps and Injection Functions} \textbf{Mkeps Function} \begin{tabular}{ l l l } \textit{mkeps} (\textbf{1}) & $\dn$ & \textit{Empty} \\ \textit{mkeps} ($r_1 \cdot r_2$) & $\dn$ & \textit{Seq} (\textit{mkeps} $r_1$) (\textit{mkeps} $r_2$)\\ \textit{mkeps} ($r_1 + r_2$) & $\dn$ & \textit{if nullable $r_1$ then Left} (\textit{mkeps $r_1$})\\ & & \textit{else Right} (\textit{mkeps $r_2$})\\ \textit{mkeps} ($r^*$) & $\dn$ & \textit{Stars} [] \\ \end{tabular}\\ \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Mkeps and Injection Functions} \textbf{Injection Function} \begin{tabular}{ l l l } \textit{inj d c} () & $\dn$ & \textit{Char d}\\ \textit{inj} ($r_1 + r_2$) \textit{c} (\textit{Left $v_1$}) & $\dn$ & \textit{Left} (\textit{inj $r_1$ c $v_1$})\\ \textit{inj} ($r_1 + r_2$) \textit{c} (\textit{Right $v_2$}) & $\dn$ & \textit{Right} (\textit{inj $r_2$ c $v_2$})\\ \textit{inj} ($r_1 \cdot r_2$) \textit{c} (\textit{Seq $v_1$ $v_2$}) & $\dn$ & \textit{Seq} (\textit{inj $r_1$ c $v_1$}) $v_2$\\ \textit{inj} ($r_1 \cdot r_2$) \textit{c} (\textit{Left} (\textit{Seq $v_1$ $v_2$})) & $\dn$ & \textit{Seq} (\textit{inj $r_1$ c $v_1$}) $v_2$\\ \textit{inj} ($r_1 \cdot r_2$) \textit{c} (\textit{Right $v_2$}) & $\dn$ & \textit{Seq} (\textit{mkeps $r_1$}) (\textit{inj $r_2$ c $v_2$})\\ \textit{inj} ($r^*$) \textit{c} (\textit{Seq v} (\textit{Stars vs})) & $\dn$ & \textit{Stars} (\textit{inj r c v}::\textit{vs}) \end{tabular} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{POSIX Ordering Relation by Sulzmann \& Lu} \begin{itemize} \item Introduce an inductive defined ordering relation $v \succ_r v'$ which captures the idea of POSIX matching. \item The algorithm returns the maximum of all possible values that are possible for a regular expression. \item The idea is from a paper by Frisch \& Cardelli about GREEDY matching (GREEDY = preferring instant gratification to delayed repletion) %\item e.g. given $(a + (b + ab))^{*}$ and string $ab$\\ %\begin{center} %\begin{tabular}{ll} %GREEDY: & $[Left(a), Right(Left(b))]$\\ %POSIX: & $[Right(Right(Seq(a, b)))]$ %\end{tabular} %\end{center} \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}% \frametitle{POSIX Ordering Relation by Sulzmann \& Lu}% % \begin{center}% $\infer{\vdash Empty : \epsilon}{}$\hspace{15mm} % $\infer{\vdash Char(c): c}{}$\bigskip% % $\infer{\vdash Seq(v_1, v_2) : r_1\cdot r_2}{\vdash v_1 : r_1 \quad \vdash v_2 : r_2}$% \bigskip% % $\infer{\vdash Left(v) : r_1 + r_2}{\vdash v : r_1}$\hspace{15mm}% $\infer{\vdash Right(v): r_1 + r_2}{\vdash v : r_2}$\bigskip% % $\infer{\vdash [] : r^*}{}$\hspace{15mm}% $\infer{\vdash [v_1,\ldots, v_n] : r^*}% {\vdash v_1 : r \quad\ldots\quad \vdash v_n : r}$% \end{center}% % \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Problems} \begin{itemize} \item Sulzmann: \ldots Let's assume $v$ is not a $POSIX$ value, then there must be another one \ldots contradiction. \item Exists ? \begin{center} $L(r) \not= \varnothing \;\Rightarrow\; \exists v.\;POSIX(v, r)$ \end{center} \item In the sequence case $Seq(v_1,v_2)\succ_{r_1\cdot r_2} Seq(v_1', v_2')$, the induction hypotheses require $|v_1| = |v_1'|$ and $|v_2| = |v_2'|$, but you only know \begin{center} $|v_1| @ |v_2| = |v_1'| @ |v_2'|$ \end{center} \item Although one begins with the assumption that the two values have the same flattening, this cannot be maintained as one descends into the induction (alternative, sequence) \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}% \frametitle{Problems}% % \begin{itemize}% \item I have no doubt the algorithm is correct --- % the problem is I do not believe their proof.% % \begin{center}% \small% ``How could I miss this? Well, I was rather careless when % stating this Lemma :)\smallskip% % Great example how formal machine checked proofs (and % proof assistants) can help to spot flawed reasoning steps.''% % \end{center}% % \begin{center}% \small% ``Well, I don't think there's any flaw. The issue is how to % come up with a mechanical proof. In my world mathematical % proof $=$ mechanical proof doesn't necessarily hold.''% % \end{center}% % \end{itemize} % % \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Our Solution} \begin{itemize} \item A direct definition of what a POSIX value is, using the relation \smath{s \in r \to v} (our specification)\bigskip \begin{center} \smath{\infer{[] \in \One \to Empty}{}}\hspace{15mm} \smath{\infer{[c] \in c \to Char(c)}{}}\bigskip\medskip \smath{\infer{s \in r_1 + r_2 \to Left(v)} {s \in r_1 \to v}}\hspace{10mm} \smath{\infer{s \in r_1 + r_2 \to Right(v)} {s \in r_2 \to v & s \not\in L(r_1)}}\bigskip\medskip \smath{\infer{s_1 @ s_2 \in r_1 \cdot r_2 \to Seq(v_1, v_2)} {\small\begin{array}{l} s_1 \in r_1 \to v_1 \\ s_2 \in r_2 \to v_2 \\ \neg(\exists s_3\,s_4.\; s_3 \not= [] \wedge s_3 @ s_4 = s_2 \wedge s_1 @ s_3 \in L(r_1) \wedge s_4 \in L(r_2)) \end{array}}} {\ldots} \end{center} \end{itemize} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}[c] \frametitle{Properties} It is almost trival to prove: \begin{itemize} \item Uniqueness \begin{center} If $s \in r \to v_1$ and $s \in r \to v_2$ then $v_1 = v_2$ \end{center}\bigskip \item Correctness \begin{center} $lexer(r, s) = v$ if and only if $s \in r \to v$ \end{center} \end{itemize}\bigskip\bigskip\pause You can now start to implement optimisations and derive correctness proofs for them.% But we still do not know whether% % \begin{center}% $s \in r \to v$ % \end{center}% % is a POSIX value according to Sulzmann \& Lu's definition% (biggest value for $s$ and $r$) \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{Conclusions} \begin{itemize} \item Sulzmann and Lu's informal proof contained small gaps (acknowledged) but we believe it had also fundamental flaws\medskip \item We replaced the POSIX definition of Sulzmann \& Lu by a new definition (ours is inspired by work of Vansummeren, 2006)\medskip \item Now, its a nice exercise for theorem proving\medskip \item Some optimisations need to be applied to the algorithm in order to become fast enough\medskip \item Can be used for lexing, is a small beautiful functional program \end{itemize} \end{frame}\end{document}