Slides/slides04.tex
changeset 207 599b2bfcebf6
child 208 02568e85a394
equal deleted inserted replaced
206:097f396a1ae6 207:599b2bfcebf6
       
     1 \documentclass{beamer}
       
     2 \usepackage{tikz}
       
     3 \usepackage[english]{babel}
       
     4 \usepackage{proof}
       
     5 \usetheme{Luebeck}
       
     6 \usetikzlibrary{positioning}
       
     7 \usetikzlibrary{decorations.pathreplacing}
       
     8 \definecolor{darkblue}{rgb}{0,0,.803}
       
     9 \definecolor{cream}{rgb}{1,1,.8}
       
    10 
       
    11 \newcommand{\smath}[1]{\textcolor{darkblue}{\ensuremath{#1}}}
       
    12 \newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}
       
    13 \newcommand{\Zero}{{\bf 0}}
       
    14 \newcommand{\One}{{\bf 1}}
       
    15 
       
    16 \newenvironment{bubble}[1][]{%
       
    17 \addtolength{\leftmargini}{4mm}%
       
    18 \begin{tikzpicture}[baseline=(current bounding box.north)]%
       
    19 \draw (0,0) node[inner sep=2mm,fill=cream,ultra thick,draw=red,rounded corners=2mm]% 
       
    20 \bgroup\begin{minipage}{#1}\raggedright{}}
       
    21 {\end{minipage}\egroup;%
       
    22 \end{tikzpicture}\bigskip}
       
    23 
       
    24 \newcommand\grid[1]{%
       
    25 \begin{tikzpicture}[baseline=(char.base)]
       
    26   \path[use as bounding box]
       
    27     (0,0) rectangle (1em,1em);
       
    28   \draw[red!50, fill=red!20]
       
    29     (0,0) rectangle (1em,1em);
       
    30   \node[inner sep=1pt,anchor=base west]
       
    31     (char) at (0em,\gridraiseamount) {#1};
       
    32 \end{tikzpicture}}
       
    33 \newcommand\gridraiseamount{0.12em}
       
    34 
       
    35 \makeatletter
       
    36 \newcommand\Grid[1]{%
       
    37   \@tfor\z:=#1\do{\grid{\z}}}
       
    38 \makeatother	
       
    39 
       
    40 \newcommand\Vspace[1][.3em]{%
       
    41   \mbox{\kern.06em\vrule height.3ex}%
       
    42   \vbox{\hrule width#1}%
       
    43   \hbox{\vrule height.3ex}}
       
    44 
       
    45 \def\VS{\Vspace[0.6em]}
       
    46 
       
    47 
       
    48 \title[POSIX Lexing with Derivatives of Regexes]
       
    49       {\bf POSIX Lexing with\\ 
       
    50        \bf Derivatives of Regular Expressions\\
       
    51        \bf (Proof Pearl)}
       
    52 \author{Fahad Ausaf, Roy Dyckhoff and Christian Urban}
       
    53 \date{King's College London, University of St Andrews}
       
    54 
       
    55 \begin{document}
       
    56 \maketitle
       
    57 
       
    58 
       
    59     %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
       
    60 	\begin{frame}
       
    61 	\frametitle{Brzozowski's Derivatives of Regular Expressions}
       
    62 		
       
    63 	Idea: If \smath{r} matches the string \smath{c\!::\!s}, 
       
    64 	what is a regular expression that matches just \smath{s}? \\
       
    65 		
       
    66    \begin{center}
       
    67    \begin{tabular}{l@{\hspace{5mm}}lcl}	
       
    68    chars:
       
    69    &\smath{\Zero \backslash c} & \smath{\dn} & \smath{\Zero}\\
       
    70    &\smath{\One \backslash c}  & \smath{\dn} & \smath{\Zero}\\
       
    71    &\smath{d \backslash c}  & \smath{\dn} & 
       
    72    \smath{\textit{if}\;d = c\;\textit{then}\;\One\;\textit{else}\;\Zero}\\
       
    73    &\smath{r_1 + r_2 \backslash c} & \smath{\dn} & 
       
    74    \smath{r_1 \backslash c \,+\, r_2 \backslash c}\\
       
    75    &\smath{r_1 \cdot r_2 \backslash c} & \smath{\dn} & 
       
    76    \smath{\textit{if}\;\textit{nullable}\;r_1}\\
       
    77    && & \smath{\textit{then}\;r_1\backslash c \cdot r_2 \,+\, r_2\backslash c
       
    78    \;\textit{else}\;r_1\backslash c \cdot r_2}\\
       
    79    &\smath{r^* \backslash c} & \smath{\dn} & 
       
    80    \smath{r\backslash c \,\cdot\, r^*}\bigskip\\
       
    81    
       
    82    strings:
       
    83    &\smath{r\backslash []} & \smath{\dn} & \smath{r}\\  
       
    84    &\smath{r\backslash c\!::\!s} & \smath{\dn} & 
       
    85    \smath{(r\backslash c)\backslash s}\\
       
    86    \end{tabular}
       
    87    \end{center}
       
    88 	\end{frame}
       
    89 	
       
    90 	%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
       
    91 	
       
    92 	\begin{frame}
       
    93 	\frametitle{Brzozowski's Matcher}
       
    94 	
       
    95 	Does \smath{r_1} match string \smath{abc}? 
       
    96 	
       
    97 	\begin{center}
       
    98 	\begin{tikzpicture}[scale=2,node distance=1.3cm,every node/.style={minimum size=8mm}]
       
    99 		\node (r1)  {\smath{r_1}};
       
   100 		\node (r2) [right=of r1] {\smath{r_2}};
       
   101 		\draw[->,line width=1mm]  (r1) -- (r2) node[above,midway] {\smath{\_\backslash a}};
       
   102 		\node (r3) [right=of r2] {\smath{r_3}};
       
   103 		\draw[->,line width=1mm]  (r2) -- (r3) node[above,midway] {\smath{\_\backslash b}};
       
   104 		\node (r4) [right=of r3] {\smath{r_4}};
       
   105 		\draw[->,line width=1mm]  (r3) -- (r4) node[above,midway] {\smath{\_\backslash c}};
       
   106 		\draw (r4) node[anchor=west] {\;\raisebox{3mm}{\smath{\;\;\textit{nullable}?}}};
       
   107 	\end{tikzpicture}
       
   108 	\end{center}\pause
       
   109 	
       
   110 	It leads to an elegant functional program:
       
   111 	
       
   112 	\begin{center}
       
   113 	\smath{\textit{matches}\,(r, s) \dn \textit{nullable}\,(r\backslash s)}
       
   114 	\end{center}\pause
       
   115 	
       
   116 	It is an easy exercise to formally prove (e.g.~Coq, HOL, Isabelle):
       
   117 		
       
   118 	\begin{center}	
       
   119 		\smath{\textit{matches}\,(r, s)} if and only if 
       
   120 		\smath{s \in L(r)}
       
   121    \end{center}\pause 
       
   122 		
       
   123 	{\bf But Brzozowski's matcher gives only a yes/no-answer.}	
       
   124 	\end{frame}
       
   125 	
       
   126 	%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
       
   127 	
       
   128 	\begin{frame}
       
   129 	\frametitle{Sulzmann and Lu's Matcher}
       
   130 
       
   131    Sulzmann and Lu added a second phase in order to answer
       
   132    \alert{\textbf{how}} the regular expression matched the string.
       
   133    
       
   134    \begin{center}\small
       
   135    \begin{tikzpicture}[scale=1,node distance=0.8cm,every node/.style={minimum size=7mm}]
       
   136 		\node (r1)  {\smath{r_1}};
       
   137 		\node (r2) [right=of r1] {\smath{r_2}};
       
   138 		\draw[->,line width=1mm]  (r1) -- (r2) node[above,midway] {\smath{\_\backslash a}};
       
   139 		\node (r3) [right=of r2] {\smath{r_3}};
       
   140 		\draw[->,line width=1mm]  (r2) -- (r3) node[above,midway] {\smath{\_\backslash b}};
       
   141 		\node (r4) [right=of r3] {\smath{r_4}};
       
   142 		\draw[->,line width=1mm]  (r3) -- (r4) node[above,midway] {\smath{\_\backslash c}};
       
   143 		\draw (r4) node[anchor=west] {\;\raisebox{3mm}{\smath{\;\;nullable?}}};
       
   144 		\node (v4) [below=of r4] {\smath{v_4}};
       
   145 		\draw[->,line width=1mm]  (r4) -- (v4);
       
   146 		\node (v3) [left=of v4] {\smath{v_3}};
       
   147 		\draw[->,line width=1mm]  (v4) -- (v3) node[below,midway] {\smath{inj\,c}};
       
   148 		\node (v2) [left=of v3] {\smath{v_2}};
       
   149 		\draw[->,line width=1mm]  (v3) -- (v2) node[below,midway] {\smath{inj\,b}};
       
   150 		\node (v1) [left=of v2] {\smath{v_1}};
       
   151 		\draw[->,line width=1mm]  (v2) -- (v1) node[below,midway] {\smath{inj\,a}};
       
   152 		\draw (r4) node[anchor=north west] {\;\raisebox{-8mm}{\smath{mkeps}}};
       
   153 
       
   154        \draw [decorate,decoration={brace,amplitude=10pt,raise=8mm},
       
   155               line width=1.5mm]
       
   156               (r1) -- (r4) node [black,midway,above, yshift=12mm] 
       
   157              {\large\bf  first phase};
       
   158        \draw [decorate,decoration={brace,amplitude=10pt,raise=8mm},
       
   159               line width=1.5mm]
       
   160               (v4) -- (v1) node [black,midway,below, yshift=-12mm] 
       
   161              {\large\bf  second phase}; 
       
   162        %% first phase      
       
   163        \draw[line width=14mm, rounded corners, opacity=0.1,
       
   164              cap=round,join=round,color=yellow!30]
       
   165             (r1.center)  -- (r4.center);
       
   166        %% second phase     
       
   167        \draw[line width=14.1mm, rounded corners, opacity=0.2,
       
   168              cap=round,join=round,draw=black, fill=white]
       
   169             (r4)  -- (v4.center) -- (v1.center);
       
   170        \draw[line width=14mm, rounded corners, opacity=0.2,
       
   171              cap=round,join=round,color=yellow!30]
       
   172             (r4)  -- (v4.center) -- (v1.center);     
       
   173 	\end{tikzpicture}
       
   174    \end{center}
       
   175    
       
   176    There are several possible answers for 
       
   177    \alert{\textbf{how}}: POSIX, GREEDY, \ldots
       
   178 
       
   179    \end{frame}
       
   180    
       
   181    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
       
   182 	
       
   183 	\begin{frame}{Regular Expressions and Values}
       
   184 	Regular expressions and their corresponding values (for how a 
       
   185 	regular expression matched a string):\bigskip 
       
   186 	
       
   187     \begin{columns}[c] % the "c" option specifies center vertical alignment
       
   188     \column{.4\textwidth} % column designated by a command
       
   189      	\begin{tabular}{ l l l }
       
   190 			\smath{r} & ::= 		& \smath{\Zero} 	\\
       
   191 			    & $\mid$	& \smath{\One} 		\\
       
   192 			    & $\mid$	& \smath{c} 				\\
       
   193 			    & $\mid$	& \smath{r_1 \cdot r_2}	\\
       
   194 			    & $\mid$	& \smath{r_1 + r_2}		\\ \\
       
   195 			    & $\mid$	& \smath{r^*}				\\
       
   196 		\end{tabular}
       
   197     \column{.4\textwidth}
       
   198      	\begin{tabular}{ l l l }
       
   199      	
       
   200 			\smath{v} & ::= 		& \\
       
   201 			    & $\mid$	& \smath{Empty} 			\\
       
   202 			    & $\mid$	& \smath{Char(c)} 		\\
       
   203 			    & $\mid$	& \smath{Seq(v_1\cdot v_2)}	\\
       
   204 			    & $\mid$	& \smath{Left(v)}		\\
       
   205 			    & $\mid$	& \smath{Right(v)}		\\
       
   206 			    & $\mid$	& \smath{[v_1,...,v_n]}
       
   207 		\end{tabular}
       
   208     \end{columns}
       
   209 	\end{frame}
       
   210 	
       
   211 	%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
       
   212 	
       
   213 
       
   214 
       
   215 	%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
       
   216 	
       
   217 	\begin{frame}
       
   218 	\frametitle{POSIX Matching (needed for Lexing)}
       
   219 
       
   220 
       
   221     \begin{bubble}[10cm]
       
   222     {\bf Longest Match Rule:} The longest 
       
   223     initial substring matched by any regular expression is taken 
       
   224     as the next token.
       
   225     \end{bubble}
       
   226     
       
   227     \begin{bubble}[10cm]
       
   228     {\bf Rule Priority:} For a particular longest initial substring, 
       
   229     the first regular expression that can match determines the 
       
   230 	token.
       
   231     \end{bubble}
       
   232   
       
   233 	For example: \smath{r_{keywords} + r_{identifiers}}:\bigskip
       
   234 
       
   235 		\begin{itemize}
       
   236 		\item \smath{\texttt{\Grid{iffoo\VS bla}}} 
       
   237 
       
   238 		\item \smath{\texttt{\Grid{if\VS bla}}}
       
   239 		\end{itemize}	
       
   240 	
       
   241 	\end{frame}
       
   242 	
       
   243 	
       
   244 	
       
   245 	%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
       
   246 	
       
   247 	\begin{frame}
       
   248 	\frametitle{Problems with POSIX}
       
   249 	
       
   250 	Grathwohl, Henglein and Rasmussen wrote:
       
   251 	
       
   252 	\begin{bubble}[10cm]   
       
   253 	\it ``The POSIX strategy is more complicated than the greedy because 
       
   254 	of the dependence on information 
       
   255 	about the length of matched strings in the various subexpressions.''   
       
   256 	\end{bubble}\bigskip   
       
   257 	
       
   258 	Also Kuklewicz maintains a unit-test repository for POSIX
       
   259 	matching, which indicates that most POSIX mathcers are buggy.
       
   260 	
       
   261 	\begin{center}
       
   262 	\url{http://www.haskell.org/haskellwiki/Regex_Posix}
       
   263    \end{center}
       
   264 
       
   265 	\end{frame}
       
   266 	
       
   267 	%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
       
   268 	
       
   269 	\begin{frame}
       
   270 	\frametitle{``Correctness'' by Sulzmann and Lu}
       
   271 	\begin{itemize}
       
   272 		\item Sulzmann \& Lu's idea is to order all possible
       
   273 		answer such that they can prove the correct answer is
       
   274 		the maximum
       
   275 		\item The idea is taken from a GREEDY algorithm (and it 
       
   276 		works there)\bigskip\pause
       
   277 		
       
   278 		\item {\bf But} we made no progress in formalising 
       
   279 		Sulzmann \& Lu's idea, because
       
   280 		
       
   281       \begin{itemize}
       
   282       \item transitivity, existence of maxima etc all fail to
       
   283       turn into real proofs
       
   284       \item the reason: the ordering works only if ....
       
   285       \item though we did find mistakes:
       
   286 \begin{center}
       
   287 	\small
       
   288 	``How could I miss this? Well, I was rather careless when 
       
   289 	stating this Lemma :)\smallskip
       
   290 	
       
   291 	Great example how formal machine checked proofs (and 
       
   292 	proof assistants) can help to spot flawed reasoning steps.''
       
   293 	
       
   294 	\end{center}\pause
       
   295 	  
       
   296 	\begin{center}
       
   297 	\small
       
   298 	``Well, I don't think there's any flaw. The issue is how to 
       
   299 	come up with a mechanical proof. In my world mathematical 
       
   300 	proof $=$ mechanical proof doesn't necessarily hold.''
       
   301 	
       
   302 	\end{center}\pause
       
   303 	  \end{itemize}		
       
   304    
       
   305 	\end{itemize}
       
   306 	\end{frame}
       
   307 	
       
   308 
       
   309 	%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
       
   310 	
       
   311 	\begin{frame}
       
   312 	\frametitle{Sulzmann and Lu Matcher}
       
   313 	
       
   314 	We want to match the string $abc$ using $r_1$\\
       
   315 	
       
   316 	\begin{center}	
       
   317 	\begin{tikzpicture}[scale=2,node distance=1.3cm,every node/.style={minimum size=8mm}]
       
   318 		\node (r1)  {$r_1$};
       
   319 		\node (r2) [right=of r1] {$r_2$};
       
   320 		\draw[->,line width=1mm]  (r1) -- (r2) node[above,midway] {$der\,a$};\pause
       
   321 		\node (r3) [right=of r2] {$r_3$};
       
   322 		\draw[->,line width=1mm]  (r2) -- (r3) node[above,midway] {$der\,b$};\pause
       
   323 		\node (r4) [right=of r3] {$r_4$};
       
   324 		\draw[->,line width=1mm]  (r3) -- (r4) node[above,midway] {$der\,c$};\pause
       
   325 		\draw (r4) node[anchor=west] {\;\raisebox{3mm}{$\;\;nullable?$}};\pause
       
   326 		\node (v4) [below=of r4] {$v_4$};
       
   327 		\draw[->,line width=1mm]  (r4) -- (v4);\pause
       
   328 		\node (v3) [left=of v4] {$v_3$};
       
   329 		\draw[->,line width=1mm]  (v4) -- (v3) node[below,midway] {$inj\,c$};\pause
       
   330 		\node (v2) [left=of v3] {$v_2$};
       
   331 		\draw[->,line width=1mm]  (v3) -- (v2) node[below,midway] {$inj\,b$};\pause
       
   332 		\node (v1) [left=of v2] {$v_1$};
       
   333 		\draw[->,line width=1mm]  (v2) -- (v1) node[below,midway] {$inj\,a$};\pause
       
   334 		\draw[->,line width=0.5mm]  (r3) -- (v3);
       
   335 		\draw[->,line width=0.5mm]  (r2) -- (v2);
       
   336 		\draw[->,line width=0.5mm]  (r1) -- (v1);
       
   337 		\draw (r4) node[anchor=north west] {\;\raisebox{-8mm}{$mkeps$}};
       
   338 	\end{tikzpicture}
       
   339 	\end{center}
       
   340 	
       
   341 	\end{frame}	
       
   342 	
       
   343 	%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
       
   344 	
       
   345 	
       
   346 	%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
       
   347 	
       
   348 	\begin{frame}
       
   349 	\frametitle{Problems}
       
   350 	\begin{itemize}
       
   351 	
       
   352 	\item Sulzmann: \ldots Let's assume $v$ is not a $POSIX$ value, 
       
   353 	then there must be another one \ldots contradiction.\pause
       
   354 	
       
   355 	\item Exists ?
       
   356 	\begin{center}
       
   357 	$L(r) \not= \varnothing \;\Rightarrow\; \exists v.\;POSIX(v, r)$
       
   358 	\end{center}\pause
       
   359 	
       
   360 	\item In the sequence case 
       
   361 	$Seq(v_1,v_2)\succ_{r_1\cdot r_2} Seq(v_1', v_2')$, 
       
   362 	the induction hypotheses require 
       
   363 	$|v_1| = |v_1'|$ and $|v_2| = |v_2'|$, but you only know 
       
   364 	
       
   365 	\begin{center}
       
   366 	$|v_1| @ |v_2| = |v_1'| @ |v_2'|$
       
   367 	\end{center}\pause
       
   368 	
       
   369 	\item Although one begins with the assumption that the two 
       
   370 	values have the same flattening, this cannot be maintained 
       
   371 	as one descends into the induction (alternative, sequence)
       
   372 		
       
   373 	\end{itemize}
       
   374 	\end{frame}
       
   375 	
       
   376 	%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
       
   377 	
       
   378 	\begin{frame}
       
   379 	\frametitle{Our Solution}
       
   380 
       
   381 	\begin{itemize}
       
   382 	\item A direct definition of what a POSIX value is, using the 
       
   383 	relation \smath{s \in r \to v} (our specification)\bigskip
       
   384 	
       
   385 	\begin{center}
       
   386 	\smath{\infer{[] \in \epsilon \to Empty}{}}\hspace{15mm}
       
   387 	\smath{\infer{[c] \in c \to Char(c)}{}}\bigskip\medskip
       
   388 	
       
   389 	\smath{\infer{s \in r_1 + r_2 \to Left(v)}
       
   390 	      {s \in r_1 \to v}}\hspace{10mm}
       
   391 	\smath{\infer{s \in r_1 + r_2 \to Right(v)}
       
   392 	      {s \in r_2 \to v & s \not\in L(r_1)}}\bigskip\medskip
       
   393 	
       
   394 	\smath{\infer{s_1 @ s_2 \in r_1 \cdot r_2 \to Seq(v_1, v_2)}
       
   395 	          {\small\begin{array}{l}
       
   396 	           s_1 \in r_1 \to v_1 \\
       
   397 	           s_2 \in r_2 \to v_2 \\
       
   398 	           \neg(\exists s_3\,s_4.\; s_3 \not= []
       
   399 	           \wedge s_3 @ s_4 = s_2 \wedge
       
   400 	           s_1 @ s_3 \in L(r_1) \wedge
       
   401 	           s_4 \in L(r_2))
       
   402 	           \end{array}}}
       
   403 	           
       
   404 	{\ldots}           
       
   405 	\end{center}	
       
   406 	
       
   407 	\end{itemize}
       
   408 	\end{frame}
       
   409 	
       
   410 	%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
       
   411 	\begin{frame}[c]
       
   412 	\frametitle{Properties}
       
   413 	
       
   414 	It is almost trival to prove:
       
   415 	
       
   416 	\begin{itemize}
       
   417 	\item Uniqueness
       
   418 	\begin{center}
       
   419 	If $s \in r \to v_1$ and $s \in r \to v_2$ then
       
   420 	$v_1 = v_2$
       
   421 	\end{center}\bigskip
       
   422 	
       
   423 	\item Correctness
       
   424 	\begin{center}
       
   425 	$lexer(r, s) = v$ if and only if $s \in r \to v$
       
   426 	\end{center}
       
   427 	\end{itemize}\bigskip\bigskip\pause
       
   428 	
       
   429 	
       
   430 	You can now start to implement optimisations and derive
       
   431 	correctness proofs for them. But we still do not know whether
       
   432 	
       
   433 	\begin{center}
       
   434 	$s \in r \to v$ 
       
   435 	\end{center}
       
   436 	
       
   437 	is a POSIX value according to Sulzmann \& Lu's definition
       
   438 	(biggest value for $s$ and $r$)
       
   439 	\end{frame}
       
   440 	
       
   441 
       
   442 \end{document}