# HG changeset patch # User Christian Urban # Date 1471696759 -3600 # Node ID 599b2bfcebf674145d7894a67eea94430964adf9 # Parent 097f396a1ae62d6312262d66ea951c30eef0675c updated slides diff -r 097f396a1ae6 -r 599b2bfcebf6 Slides/slides04.pdf Binary file Slides/slides04.pdf has changed diff -r 097f396a1ae6 -r 599b2bfcebf6 Slides/slides04.tex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Slides/slides04.tex Sat Aug 20 13:39:19 2016 +0100 @@ -0,0 +1,442 @@ +\documentclass{beamer} +\usepackage{tikz} +\usepackage[english]{babel} +\usepackage{proof} +\usetheme{Luebeck} +\usetikzlibrary{positioning} +\usetikzlibrary{decorations.pathreplacing} +\definecolor{darkblue}{rgb}{0,0,.803} +\definecolor{cream}{rgb}{1,1,.8} + +\newcommand{\smath}[1]{\textcolor{darkblue}{\ensuremath{#1}}} +\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}} +\newcommand{\Zero}{{\bf 0}} +\newcommand{\One}{{\bf 1}} + +\newenvironment{bubble}[1][]{% +\addtolength{\leftmargini}{4mm}% +\begin{tikzpicture}[baseline=(current bounding box.north)]% +\draw (0,0) node[inner sep=2mm,fill=cream,ultra thick,draw=red,rounded corners=2mm]% +\bgroup\begin{minipage}{#1}\raggedright{}} +{\end{minipage}\egroup;% +\end{tikzpicture}\bigskip} + +\newcommand\grid[1]{% +\begin{tikzpicture}[baseline=(char.base)] + \path[use as bounding box] + (0,0) rectangle (1em,1em); + \draw[red!50, fill=red!20] + (0,0) rectangle (1em,1em); + \node[inner sep=1pt,anchor=base west] + (char) at (0em,\gridraiseamount) {#1}; +\end{tikzpicture}} +\newcommand\gridraiseamount{0.12em} + +\makeatletter +\newcommand\Grid[1]{% + \@tfor\z:=#1\do{\grid{\z}}} +\makeatother + +\newcommand\Vspace[1][.3em]{% + \mbox{\kern.06em\vrule height.3ex}% + \vbox{\hrule width#1}% + \hbox{\vrule height.3ex}} + +\def\VS{\Vspace[0.6em]} + + +\title[POSIX Lexing with Derivatives of Regexes] + {\bf POSIX Lexing with\\ + \bf Derivatives of Regular Expressions\\ + \bf (Proof Pearl)} +\author{Fahad Ausaf, Roy Dyckhoff and Christian Urban} +\date{King's College London, University of St Andrews} + +\begin{document} +\maketitle + + + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + \begin{frame} + \frametitle{Brzozowski's Derivatives of Regular Expressions} + + Idea: If \smath{r} matches the string \smath{c\!::\!s}, + what is a regular expression that matches just \smath{s}? \\ + + \begin{center} + \begin{tabular}{l@{\hspace{5mm}}lcl} + chars: + &\smath{\Zero \backslash c} & \smath{\dn} & \smath{\Zero}\\ + &\smath{\One \backslash c} & \smath{\dn} & \smath{\Zero}\\ + &\smath{d \backslash c} & \smath{\dn} & + \smath{\textit{if}\;d = c\;\textit{then}\;\One\;\textit{else}\;\Zero}\\ + &\smath{r_1 + r_2 \backslash c} & \smath{\dn} & + \smath{r_1 \backslash c \,+\, r_2 \backslash c}\\ + &\smath{r_1 \cdot r_2 \backslash c} & \smath{\dn} & + \smath{\textit{if}\;\textit{nullable}\;r_1}\\ + && & \smath{\textit{then}\;r_1\backslash c \cdot r_2 \,+\, r_2\backslash c + \;\textit{else}\;r_1\backslash c \cdot r_2}\\ + &\smath{r^* \backslash c} & \smath{\dn} & + \smath{r\backslash c \,\cdot\, r^*}\bigskip\\ + + strings: + &\smath{r\backslash []} & \smath{\dn} & \smath{r}\\ + &\smath{r\backslash c\!::\!s} & \smath{\dn} & + \smath{(r\backslash c)\backslash s}\\ + \end{tabular} + \end{center} + \end{frame} + + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + + \begin{frame} + \frametitle{Brzozowski's Matcher} + + Does \smath{r_1} match string \smath{abc}? + + \begin{center} + \begin{tikzpicture}[scale=2,node distance=1.3cm,every node/.style={minimum size=8mm}] + \node (r1) {\smath{r_1}}; + \node (r2) [right=of r1] {\smath{r_2}}; + \draw[->,line width=1mm] (r1) -- (r2) node[above,midway] {\smath{\_\backslash a}}; + \node (r3) [right=of r2] {\smath{r_3}}; + \draw[->,line width=1mm] (r2) -- (r3) node[above,midway] {\smath{\_\backslash b}}; + \node (r4) [right=of r3] {\smath{r_4}}; + \draw[->,line width=1mm] (r3) -- (r4) node[above,midway] {\smath{\_\backslash c}}; + \draw (r4) node[anchor=west] {\;\raisebox{3mm}{\smath{\;\;\textit{nullable}?}}}; + \end{tikzpicture} + \end{center}\pause + + It leads to an elegant functional program: + + \begin{center} + \smath{\textit{matches}\,(r, s) \dn \textit{nullable}\,(r\backslash s)} + \end{center}\pause + + It is an easy exercise to formally prove (e.g.~Coq, HOL, Isabelle): + + \begin{center} + \smath{\textit{matches}\,(r, s)} if and only if + \smath{s \in L(r)} + \end{center}\pause + + {\bf But Brzozowski's matcher gives only a yes/no-answer.} + \end{frame} + + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + + \begin{frame} + \frametitle{Sulzmann and Lu's Matcher} + + Sulzmann and Lu added a second phase in order to answer + \alert{\textbf{how}} the regular expression matched the string. + + \begin{center}\small + \begin{tikzpicture}[scale=1,node distance=0.8cm,every node/.style={minimum size=7mm}] + \node (r1) {\smath{r_1}}; + \node (r2) [right=of r1] {\smath{r_2}}; + \draw[->,line width=1mm] (r1) -- (r2) node[above,midway] {\smath{\_\backslash a}}; + \node (r3) [right=of r2] {\smath{r_3}}; + \draw[->,line width=1mm] (r2) -- (r3) node[above,midway] {\smath{\_\backslash b}}; + \node (r4) [right=of r3] {\smath{r_4}}; + \draw[->,line width=1mm] (r3) -- (r4) node[above,midway] {\smath{\_\backslash c}}; + \draw (r4) node[anchor=west] {\;\raisebox{3mm}{\smath{\;\;nullable?}}}; + \node (v4) [below=of r4] {\smath{v_4}}; + \draw[->,line width=1mm] (r4) -- (v4); + \node (v3) [left=of v4] {\smath{v_3}}; + \draw[->,line width=1mm] (v4) -- (v3) node[below,midway] {\smath{inj\,c}}; + \node (v2) [left=of v3] {\smath{v_2}}; + \draw[->,line width=1mm] (v3) -- (v2) node[below,midway] {\smath{inj\,b}}; + \node (v1) [left=of v2] {\smath{v_1}}; + \draw[->,line width=1mm] (v2) -- (v1) node[below,midway] {\smath{inj\,a}}; + \draw (r4) node[anchor=north west] {\;\raisebox{-8mm}{\smath{mkeps}}}; + + \draw [decorate,decoration={brace,amplitude=10pt,raise=8mm}, + line width=1.5mm] + (r1) -- (r4) node [black,midway,above, yshift=12mm] + {\large\bf first phase}; + \draw [decorate,decoration={brace,amplitude=10pt,raise=8mm}, + line width=1.5mm] + (v4) -- (v1) node [black,midway,below, yshift=-12mm] + {\large\bf second phase}; + %% first phase + \draw[line width=14mm, rounded corners, opacity=0.1, + cap=round,join=round,color=yellow!30] + (r1.center) -- (r4.center); + %% second phase + \draw[line width=14.1mm, rounded corners, opacity=0.2, + cap=round,join=round,draw=black, fill=white] + (r4) -- (v4.center) -- (v1.center); + \draw[line width=14mm, rounded corners, opacity=0.2, + cap=round,join=round,color=yellow!30] + (r4) -- (v4.center) -- (v1.center); + \end{tikzpicture} + \end{center} + + There are several possible answers for + \alert{\textbf{how}}: POSIX, GREEDY, \ldots + + \end{frame} + + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + + \begin{frame}{Regular Expressions and Values} + Regular expressions and their corresponding values (for how a + regular expression matched a string):\bigskip + + \begin{columns}[c] % the "c" option specifies center vertical alignment + \column{.4\textwidth} % column designated by a command + \begin{tabular}{ l l l } + \smath{r} & ::= & \smath{\Zero} \\ + & $\mid$ & \smath{\One} \\ + & $\mid$ & \smath{c} \\ + & $\mid$ & \smath{r_1 \cdot r_2} \\ + & $\mid$ & \smath{r_1 + r_2} \\ \\ + & $\mid$ & \smath{r^*} \\ + \end{tabular} + \column{.4\textwidth} + \begin{tabular}{ l l l } + + \smath{v} & ::= & \\ + & $\mid$ & \smath{Empty} \\ + & $\mid$ & \smath{Char(c)} \\ + & $\mid$ & \smath{Seq(v_1\cdot v_2)} \\ + & $\mid$ & \smath{Left(v)} \\ + & $\mid$ & \smath{Right(v)} \\ + & $\mid$ & \smath{[v_1,...,v_n]} + \end{tabular} + \end{columns} + \end{frame} + + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + + + + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + + \begin{frame} + \frametitle{POSIX Matching (needed for Lexing)} + + + \begin{bubble}[10cm] + {\bf Longest Match Rule:} The longest + initial substring matched by any regular expression is taken + as the next token. + \end{bubble} + + \begin{bubble}[10cm] + {\bf Rule Priority:} For a particular longest initial substring, + the first regular expression that can match determines the + token. + \end{bubble} + + For example: \smath{r_{keywords} + r_{identifiers}}:\bigskip + + \begin{itemize} + \item \smath{\texttt{\Grid{iffoo\VS bla}}} + + \item \smath{\texttt{\Grid{if\VS bla}}} + \end{itemize} + + \end{frame} + + + + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + + \begin{frame} + \frametitle{Problems with POSIX} + + Grathwohl, Henglein and Rasmussen wrote: + + \begin{bubble}[10cm] + \it ``The POSIX strategy is more complicated than the greedy because + of the dependence on information + about the length of matched strings in the various subexpressions.'' + \end{bubble}\bigskip + + Also Kuklewicz maintains a unit-test repository for POSIX + matching, which indicates that most POSIX mathcers are buggy. + + \begin{center} + \url{http://www.haskell.org/haskellwiki/Regex_Posix} + \end{center} + + \end{frame} + + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + + \begin{frame} + \frametitle{``Correctness'' by Sulzmann and Lu} + \begin{itemize} + \item Sulzmann \& Lu's idea is to order all possible + answer such that they can prove the correct answer is + the maximum + \item The idea is taken from a GREEDY algorithm (and it + works there)\bigskip\pause + + \item {\bf But} we made no progress in formalising + Sulzmann \& Lu's idea, because + + \begin{itemize} + \item transitivity, existence of maxima etc all fail to + turn into real proofs + \item the reason: the ordering works only if .... + \item though we did find mistakes: +\begin{center} + \small + ``How could I miss this? Well, I was rather careless when + stating this Lemma :)\smallskip + + Great example how formal machine checked proofs (and + proof assistants) can help to spot flawed reasoning steps.'' + + \end{center}\pause + + \begin{center} + \small + ``Well, I don't think there's any flaw. The issue is how to + come up with a mechanical proof. In my world mathematical + proof $=$ mechanical proof doesn't necessarily hold.'' + + \end{center}\pause + \end{itemize} + + \end{itemize} + \end{frame} + + + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + + \begin{frame} + \frametitle{Sulzmann and Lu Matcher} + + We want to match the string $abc$ using $r_1$\\ + + \begin{center} + \begin{tikzpicture}[scale=2,node distance=1.3cm,every node/.style={minimum size=8mm}] + \node (r1) {$r_1$}; + \node (r2) [right=of r1] {$r_2$}; + \draw[->,line width=1mm] (r1) -- (r2) node[above,midway] {$der\,a$};\pause + \node (r3) [right=of r2] {$r_3$}; + \draw[->,line width=1mm] (r2) -- (r3) node[above,midway] {$der\,b$};\pause + \node (r4) [right=of r3] {$r_4$}; + \draw[->,line width=1mm] (r3) -- (r4) node[above,midway] {$der\,c$};\pause + \draw (r4) node[anchor=west] {\;\raisebox{3mm}{$\;\;nullable?$}};\pause + \node (v4) [below=of r4] {$v_4$}; + \draw[->,line width=1mm] (r4) -- (v4);\pause + \node (v3) [left=of v4] {$v_3$}; + \draw[->,line width=1mm] (v4) -- (v3) node[below,midway] {$inj\,c$};\pause + \node (v2) [left=of v3] {$v_2$}; + \draw[->,line width=1mm] (v3) -- (v2) node[below,midway] {$inj\,b$};\pause + \node (v1) [left=of v2] {$v_1$}; + \draw[->,line width=1mm] (v2) -- (v1) node[below,midway] {$inj\,a$};\pause + \draw[->,line width=0.5mm] (r3) -- (v3); + \draw[->,line width=0.5mm] (r2) -- (v2); + \draw[->,line width=0.5mm] (r1) -- (v1); + \draw (r4) node[anchor=north west] {\;\raisebox{-8mm}{$mkeps$}}; + \end{tikzpicture} + \end{center} + + \end{frame} + + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + + + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + + \begin{frame} + \frametitle{Problems} + \begin{itemize} + + \item Sulzmann: \ldots Let's assume $v$ is not a $POSIX$ value, + then there must be another one \ldots contradiction.\pause + + \item Exists ? + \begin{center} + $L(r) \not= \varnothing \;\Rightarrow\; \exists v.\;POSIX(v, r)$ + \end{center}\pause + + \item In the sequence case + $Seq(v_1,v_2)\succ_{r_1\cdot r_2} Seq(v_1', v_2')$, + the induction hypotheses require + $|v_1| = |v_1'|$ and $|v_2| = |v_2'|$, but you only know + + \begin{center} + $|v_1| @ |v_2| = |v_1'| @ |v_2'|$ + \end{center}\pause + + \item Although one begins with the assumption that the two + values have the same flattening, this cannot be maintained + as one descends into the induction (alternative, sequence) + + \end{itemize} + \end{frame} + + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + + \begin{frame} + \frametitle{Our Solution} + + \begin{itemize} + \item A direct definition of what a POSIX value is, using the + relation \smath{s \in r \to v} (our specification)\bigskip + + \begin{center} + \smath{\infer{[] \in \epsilon \to Empty}{}}\hspace{15mm} + \smath{\infer{[c] \in c \to Char(c)}{}}\bigskip\medskip + + \smath{\infer{s \in r_1 + r_2 \to Left(v)} + {s \in r_1 \to v}}\hspace{10mm} + \smath{\infer{s \in r_1 + r_2 \to Right(v)} + {s \in r_2 \to v & s \not\in L(r_1)}}\bigskip\medskip + + \smath{\infer{s_1 @ s_2 \in r_1 \cdot r_2 \to Seq(v_1, v_2)} + {\small\begin{array}{l} + s_1 \in r_1 \to v_1 \\ + s_2 \in r_2 \to v_2 \\ + \neg(\exists s_3\,s_4.\; s_3 \not= [] + \wedge s_3 @ s_4 = s_2 \wedge + s_1 @ s_3 \in L(r_1) \wedge + s_4 \in L(r_2)) + \end{array}}} + + {\ldots} + \end{center} + + \end{itemize} + \end{frame} + + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + \begin{frame}[c] + \frametitle{Properties} + + It is almost trival to prove: + + \begin{itemize} + \item Uniqueness + \begin{center} + If $s \in r \to v_1$ and $s \in r \to v_2$ then + $v_1 = v_2$ + \end{center}\bigskip + + \item Correctness + \begin{center} + $lexer(r, s) = v$ if and only if $s \in r \to v$ + \end{center} + \end{itemize}\bigskip\bigskip\pause + + + You can now start to implement optimisations and derive + correctness proofs for them. But we still do not know whether + + \begin{center} + $s \in r \to v$ + \end{center} + + is a POSIX value according to Sulzmann \& Lu's definition + (biggest value for $s$ and $r$) + \end{frame} + + +\end{document}