lexing: changeset 207:599b2bfcebf6

Binary file Slides/slides04.pdf has changed

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Slides/slides04.tex	Sat Aug 20 13:39:19 2016 +0100
@@ -0,0 +1,442 @@
+\documentclass{beamer}
+\usepackage{tikz}
+\usepackage[english]{babel}
+\usepackage{proof}
+\usetheme{Luebeck}
+\usetikzlibrary{positioning}
+\usetikzlibrary{decorations.pathreplacing}
+\definecolor{darkblue}{rgb}{0,0,.803}
+\definecolor{cream}{rgb}{1,1,.8}
+
+\newcommand{\smath}[1]{\textcolor{darkblue}{\ensuremath{#1}}}
+\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}
+\newcommand{\Zero}{{\bf 0}}
+\newcommand{\One}{{\bf 1}}
+
+\newenvironment{bubble}[1][]{%
+\addtolength{\leftmargini}{4mm}%
+\begin{tikzpicture}[baseline=(current bounding box.north)]%
+\draw (0,0) node[inner sep=2mm,fill=cream,ultra thick,draw=red,rounded corners=2mm]% 
+\bgroup\begin{minipage}{#1}\raggedright{}}
+{\end{minipage}\egroup;%
+\end{tikzpicture}\bigskip}
+
+\newcommand\grid[1]{%
+\begin{tikzpicture}[baseline=(char.base)]
+  \path[use as bounding box]
+    (0,0) rectangle (1em,1em);
+  \draw[red!50, fill=red!20]
+    (0,0) rectangle (1em,1em);
+  \node[inner sep=1pt,anchor=base west]
+    (char) at (0em,\gridraiseamount) {#1};
+\end{tikzpicture}}
+\newcommand\gridraiseamount{0.12em}
+
+\makeatletter
+\newcommand\Grid[1]{%
+  \@tfor\z:=#1\do{\grid{\z}}}
+\makeatother	
+
+\newcommand\Vspace[1][.3em]{%
+  \mbox{\kern.06em\vrule height.3ex}%
+  \vbox{\hrule width#1}%
+  \hbox{\vrule height.3ex}}
+
+\def\VS{\Vspace[0.6em]}
+
+
+\title[POSIX Lexing with Derivatives of Regexes]
+      {\bf POSIX Lexing with\\ 
+       \bf Derivatives of Regular Expressions\\
+       \bf (Proof Pearl)}
+\author{Fahad Ausaf, Roy Dyckhoff and Christian Urban}
+\date{King's College London, University of St Andrews}
+
+\begin{document}
+\maketitle
+
+
+    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+	\begin{frame}
+	\frametitle{Brzozowski's Derivatives of Regular Expressions}
+		
+	Idea: If \smath{r} matches the string \smath{c\!::\!s}, 
+	what is a regular expression that matches just \smath{s}? \\
+		
+   \begin{center}
+   \begin{tabular}{l@{\hspace{5mm}}lcl}	
+   chars:
+   &\smath{\Zero \backslash c} & \smath{\dn} & \smath{\Zero}\\
+   &\smath{\One \backslash c}  & \smath{\dn} & \smath{\Zero}\\
+   &\smath{d \backslash c}  & \smath{\dn} & 
+   \smath{\textit{if}\;d = c\;\textit{then}\;\One\;\textit{else}\;\Zero}\\
+   &\smath{r_1 + r_2 \backslash c} & \smath{\dn} & 
+   \smath{r_1 \backslash c \,+\, r_2 \backslash c}\\
+   &\smath{r_1 \cdot r_2 \backslash c} & \smath{\dn} & 
+   \smath{\textit{if}\;\textit{nullable}\;r_1}\\
+   && & \smath{\textit{then}\;r_1\backslash c \cdot r_2 \,+\, r_2\backslash c
+   \;\textit{else}\;r_1\backslash c \cdot r_2}\\
+   &\smath{r^* \backslash c} & \smath{\dn} & 
+   \smath{r\backslash c \,\cdot\, r^*}\bigskip\\
+   
+   strings:
+   &\smath{r\backslash []} & \smath{\dn} & \smath{r}\\  
+   &\smath{r\backslash c\!::\!s} & \smath{\dn} & 
+   \smath{(r\backslash c)\backslash s}\\
+   \end{tabular}
+   \end{center}
+	\end{frame}
+	
+	%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+	
+	\begin{frame}
+	\frametitle{Brzozowski's Matcher}
+	
+	Does \smath{r_1} match string \smath{abc}? 
+	
+	\begin{center}
+	\begin{tikzpicture}[scale=2,node distance=1.3cm,every node/.style={minimum size=8mm}]
+		\node (r1)  {\smath{r_1}};
+		\node (r2) [right=of r1] {\smath{r_2}};
+		\draw[->,line width=1mm]  (r1) -- (r2) node[above,midway] {\smath{\_\backslash a}};
+		\node (r3) [right=of r2] {\smath{r_3}};
+		\draw[->,line width=1mm]  (r2) -- (r3) node[above,midway] {\smath{\_\backslash b}};
+		\node (r4) [right=of r3] {\smath{r_4}};
+		\draw[->,line width=1mm]  (r3) -- (r4) node[above,midway] {\smath{\_\backslash c}};
+		\draw (r4) node[anchor=west] {\;\raisebox{3mm}{\smath{\;\;\textit{nullable}?}}};
+	\end{tikzpicture}
+	\end{center}\pause
+	
+	It leads to an elegant functional program:
+	
+	\begin{center}
+	\smath{\textit{matches}\,(r, s) \dn \textit{nullable}\,(r\backslash s)}
+	\end{center}\pause
+	
+	It is an easy exercise to formally prove (e.g.~Coq, HOL, Isabelle):
+		
+	\begin{center}	
+		\smath{\textit{matches}\,(r, s)} if and only if 
+		\smath{s \in L(r)}
+   \end{center}\pause 
+		
+	{\bf But Brzozowski's matcher gives only a yes/no-answer.}	
+	\end{frame}
+	
+	%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+	
+	\begin{frame}
+	\frametitle{Sulzmann and Lu's Matcher}
+
+   Sulzmann and Lu added a second phase in order to answer
+   \alert{\textbf{how}} the regular expression matched the string.
+   
+   \begin{center}\small
+   \begin{tikzpicture}[scale=1,node distance=0.8cm,every node/.style={minimum size=7mm}]
+		\node (r1)  {\smath{r_1}};
+		\node (r2) [right=of r1] {\smath{r_2}};
+		\draw[->,line width=1mm]  (r1) -- (r2) node[above,midway] {\smath{\_\backslash a}};
+		\node (r3) [right=of r2] {\smath{r_3}};
+		\draw[->,line width=1mm]  (r2) -- (r3) node[above,midway] {\smath{\_\backslash b}};
+		\node (r4) [right=of r3] {\smath{r_4}};
+		\draw[->,line width=1mm]  (r3) -- (r4) node[above,midway] {\smath{\_\backslash c}};
+		\draw (r4) node[anchor=west] {\;\raisebox{3mm}{\smath{\;\;nullable?}}};
+		\node (v4) [below=of r4] {\smath{v_4}};
+		\draw[->,line width=1mm]  (r4) -- (v4);
+		\node (v3) [left=of v4] {\smath{v_3}};
+		\draw[->,line width=1mm]  (v4) -- (v3) node[below,midway] {\smath{inj\,c}};
+		\node (v2) [left=of v3] {\smath{v_2}};
+		\draw[->,line width=1mm]  (v3) -- (v2) node[below,midway] {\smath{inj\,b}};
+		\node (v1) [left=of v2] {\smath{v_1}};
+		\draw[->,line width=1mm]  (v2) -- (v1) node[below,midway] {\smath{inj\,a}};
+		\draw (r4) node[anchor=north west] {\;\raisebox{-8mm}{\smath{mkeps}}};
+
+       \draw [decorate,decoration={brace,amplitude=10pt,raise=8mm},
+              line width=1.5mm]
+              (r1) -- (r4) node [black,midway,above, yshift=12mm] 
+             {\large\bf  first phase};
+       \draw [decorate,decoration={brace,amplitude=10pt,raise=8mm},
+              line width=1.5mm]
+              (v4) -- (v1) node [black,midway,below, yshift=-12mm] 
+             {\large\bf  second phase}; 
+       %% first phase      
+       \draw[line width=14mm, rounded corners, opacity=0.1,
+             cap=round,join=round,color=yellow!30]
+            (r1.center)  -- (r4.center);
+       %% second phase     
+       \draw[line width=14.1mm, rounded corners, opacity=0.2,
+             cap=round,join=round,draw=black, fill=white]
+            (r4)  -- (v4.center) -- (v1.center);
+       \draw[line width=14mm, rounded corners, opacity=0.2,
+             cap=round,join=round,color=yellow!30]
+            (r4)  -- (v4.center) -- (v1.center);     
+	\end{tikzpicture}
+   \end{center}
+   
+   There are several possible answers for 
+   \alert{\textbf{how}}: POSIX, GREEDY, \ldots
+
+   \end{frame}
+   
+   %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+	
+	\begin{frame}{Regular Expressions and Values}
+	Regular expressions and their corresponding values (for how a 
+	regular expression matched a string):\bigskip 
+	
+    \begin{columns}[c] % the "c" option specifies center vertical alignment
+    \column{.4\textwidth} % column designated by a command
+     	\begin{tabular}{ l l l }
+			\smath{r} & ::= 		& \smath{\Zero} 	\\
+			    & $\mid$	& \smath{\One} 		\\
+			    & $\mid$	& \smath{c} 				\\
+			    & $\mid$	& \smath{r_1 \cdot r_2}	\\
+			    & $\mid$	& \smath{r_1 + r_2}		\\ \\
+			    & $\mid$	& \smath{r^*}				\\
+		\end{tabular}
+    \column{.4\textwidth}
+     	\begin{tabular}{ l l l }
+     	
+			\smath{v} & ::= 		& \\
+			    & $\mid$	& \smath{Empty} 			\\
+			    & $\mid$	& \smath{Char(c)} 		\\
+			    & $\mid$	& \smath{Seq(v_1\cdot v_2)}	\\
+			    & $\mid$	& \smath{Left(v)}		\\
+			    & $\mid$	& \smath{Right(v)}		\\
+			    & $\mid$	& \smath{[v_1,...,v_n]}
+		\end{tabular}
+    \end{columns}
+	\end{frame}
+	
+	%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+	
+
+
+	%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+	
+	\begin{frame}
+	\frametitle{POSIX Matching (needed for Lexing)}
+
+
+    \begin{bubble}[10cm]
+    {\bf Longest Match Rule:} The longest 
+    initial substring matched by any regular expression is taken 
+    as the next token.
+    \end{bubble}
+    
+    \begin{bubble}[10cm]
+    {\bf Rule Priority:} For a particular longest initial substring, 
+    the first regular expression that can match determines the 
+	token.
+    \end{bubble}
+  
+	For example: \smath{r_{keywords} + r_{identifiers}}:\bigskip
+
+		\begin{itemize}
+		\item \smath{\texttt{\Grid{iffoo\VS bla}}} 
+
+		\item \smath{\texttt{\Grid{if\VS bla}}}
+		\end{itemize}	
+	
+	\end{frame}
+	
+	
+	
+	%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+	
+	\begin{frame}
+	\frametitle{Problems with POSIX}
+	
+	Grathwohl, Henglein and Rasmussen wrote:
+	
+	\begin{bubble}[10cm]   
+	\it ``The POSIX strategy is more complicated than the greedy because 
+	of the dependence on information 
+	about the length of matched strings in the various subexpressions.''   
+	\end{bubble}\bigskip   
+	
+	Also Kuklewicz maintains a unit-test repository for POSIX
+	matching, which indicates that most POSIX mathcers are buggy.
+	
+	\begin{center}
+	\url{http://www.haskell.org/haskellwiki/Regex_Posix}
+   \end{center}
+
+	\end{frame}
+	
+	%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+	
+	\begin{frame}
+	\frametitle{``Correctness'' by Sulzmann and Lu}
+	\begin{itemize}
+		\item Sulzmann \& Lu's idea is to order all possible
+		answer such that they can prove the correct answer is
+		the maximum
+		\item The idea is taken from a GREEDY algorithm (and it 
+		works there)\bigskip\pause
+		
+		\item {\bf But} we made no progress in formalising 
+		Sulzmann \& Lu's idea, because
+		
+      \begin{itemize}
+      \item transitivity, existence of maxima etc all fail to
+      turn into real proofs
+      \item the reason: the ordering works only if ....
+      \item though we did find mistakes:
+\begin{center}
+	\small
+	``How could I miss this? Well, I was rather careless when 
+	stating this Lemma :)\smallskip
+	
+	Great example how formal machine checked proofs (and 
+	proof assistants) can help to spot flawed reasoning steps.''
+	
+	\end{center}\pause
+	  
+	\begin{center}
+	\small
+	``Well, I don't think there's any flaw. The issue is how to 
+	come up with a mechanical proof. In my world mathematical 
+	proof $=$ mechanical proof doesn't necessarily hold.''
+	
+	\end{center}\pause
+	  \end{itemize}		
+   
+	\end{itemize}
+	\end{frame}
+	
+
+	%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+	
+	\begin{frame}
+	\frametitle{Sulzmann and Lu Matcher}
+	
+	We want to match the string $abc$ using $r_1$\\
+	
+	\begin{center}	
+	\begin{tikzpicture}[scale=2,node distance=1.3cm,every node/.style={minimum size=8mm}]
+		\node (r1)  {$r_1$};
+		\node (r2) [right=of r1] {$r_2$};
+		\draw[->,line width=1mm]  (r1) -- (r2) node[above,midway] {$der\,a$};\pause
+		\node (r3) [right=of r2] {$r_3$};
+		\draw[->,line width=1mm]  (r2) -- (r3) node[above,midway] {$der\,b$};\pause
+		\node (r4) [right=of r3] {$r_4$};
+		\draw[->,line width=1mm]  (r3) -- (r4) node[above,midway] {$der\,c$};\pause
+		\draw (r4) node[anchor=west] {\;\raisebox{3mm}{$\;\;nullable?$}};\pause
+		\node (v4) [below=of r4] {$v_4$};
+		\draw[->,line width=1mm]  (r4) -- (v4);\pause
+		\node (v3) [left=of v4] {$v_3$};
+		\draw[->,line width=1mm]  (v4) -- (v3) node[below,midway] {$inj\,c$};\pause
+		\node (v2) [left=of v3] {$v_2$};
+		\draw[->,line width=1mm]  (v3) -- (v2) node[below,midway] {$inj\,b$};\pause
+		\node (v1) [left=of v2] {$v_1$};
+		\draw[->,line width=1mm]  (v2) -- (v1) node[below,midway] {$inj\,a$};\pause
+		\draw[->,line width=0.5mm]  (r3) -- (v3);
+		\draw[->,line width=0.5mm]  (r2) -- (v2);
+		\draw[->,line width=0.5mm]  (r1) -- (v1);
+		\draw (r4) node[anchor=north west] {\;\raisebox{-8mm}{$mkeps$}};
+	\end{tikzpicture}
+	\end{center}
+	
+	\end{frame}	
+	
+	%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+	
+	
+	%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+	
+	\begin{frame}
+	\frametitle{Problems}
+	\begin{itemize}
+	
+	\item Sulzmann: \ldots Let's assume $v$ is not a $POSIX$ value, 
+	then there must be another one \ldots contradiction.\pause
+	
+	\item Exists ?
+	\begin{center}
+	$L(r) \not= \varnothing \;\Rightarrow\; \exists v.\;POSIX(v, r)$
+	\end{center}\pause
+	
+	\item In the sequence case 
+	$Seq(v_1,v_2)\succ_{r_1\cdot r_2} Seq(v_1', v_2')$, 
+	the induction hypotheses require 
+	$|v_1| = |v_1'|$ and $|v_2| = |v_2'|$, but you only know 
+	
+	\begin{center}
+	$|v_1| @ |v_2| = |v_1'| @ |v_2'|$
+	\end{center}\pause
+	
+	\item Although one begins with the assumption that the two 
+	values have the same flattening, this cannot be maintained 
+	as one descends into the induction (alternative, sequence)
+		
+	\end{itemize}
+	\end{frame}
+	
+	%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+	
+	\begin{frame}
+	\frametitle{Our Solution}
+
+	\begin{itemize}
+	\item A direct definition of what a POSIX value is, using the 
+	relation \smath{s \in r \to v} (our specification)\bigskip
+	
+	\begin{center}
+	\smath{\infer{[] \in \epsilon \to Empty}{}}\hspace{15mm}
+	\smath{\infer{[c] \in c \to Char(c)}{}}\bigskip\medskip
+	
+	\smath{\infer{s \in r_1 + r_2 \to Left(v)}
+	      {s \in r_1 \to v}}\hspace{10mm}
+	\smath{\infer{s \in r_1 + r_2 \to Right(v)}
+	      {s \in r_2 \to v & s \not\in L(r_1)}}\bigskip\medskip
+	
+	\smath{\infer{s_1 @ s_2 \in r_1 \cdot r_2 \to Seq(v_1, v_2)}
+	          {\small\begin{array}{l}
+	           s_1 \in r_1 \to v_1 \\
+	           s_2 \in r_2 \to v_2 \\
+	           \neg(\exists s_3\,s_4.\; s_3 \not= []
+	           \wedge s_3 @ s_4 = s_2 \wedge
+	           s_1 @ s_3 \in L(r_1) \wedge
+	           s_4 \in L(r_2))
+	           \end{array}}}
+	           
+	{\ldots}           
+	\end{center}	
+	
+	\end{itemize}
+	\end{frame}
+	
+	%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+	\begin{frame}[c]
+	\frametitle{Properties}
+	
+	It is almost trival to prove:
+	
+	\begin{itemize}
+	\item Uniqueness
+	\begin{center}
+	If $s \in r \to v_1$ and $s \in r \to v_2$ then
+	$v_1 = v_2$
+	\end{center}\bigskip
+	
+	\item Correctness
+	\begin{center}
+	$lexer(r, s) = v$ if and only if $s \in r \to v$
+	\end{center}
+	\end{itemize}\bigskip\bigskip\pause
+	
+	
+	You can now start to implement optimisations and derive
+	correctness proofs for them. But we still do not know whether
+	
+	\begin{center}
+	$s \in r \to v$ 
+	\end{center}
+	
+	is a POSIX value according to Sulzmann \& Lu's definition
+	(biggest value for $s$ and $r$)
+	\end{frame}
+	
+
+\end{document}

author	Christian Urban <christian dot urban at kcl dot ac dot uk>
	Sat, 20 Aug 2016 13:39:19 +0100
changeset 207	599b2bfcebf6
parent 206	097f396a1ae6
child 208	02568e85a394

Slides/slides04.pdf		file \| annotate \| diff \| comparison \| revisions
Slides/slides04.tex		file \| annotate \| diff \| comparison \| revisions