|
1 \documentclass{beamer} |
|
2 \usepackage{tikz} |
|
3 \usepackage[english]{babel} |
|
4 \usepackage{proof} |
|
5 \usetheme{Luebeck} |
|
6 \usetikzlibrary{positioning} |
|
7 \usetikzlibrary{decorations.pathreplacing} |
|
8 \definecolor{darkblue}{rgb}{0,0,.803} |
|
9 \definecolor{cream}{rgb}{1,1,.8} |
|
10 |
|
11 \newcommand{\smath}[1]{\textcolor{darkblue}{\ensuremath{#1}}} |
|
12 \newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}} |
|
13 \newcommand{\Zero}{{\bf 0}} |
|
14 \newcommand{\One}{{\bf 1}} |
|
15 |
|
16 \newenvironment{bubble}[1][]{% |
|
17 \addtolength{\leftmargini}{4mm}% |
|
18 \begin{tikzpicture}[baseline=(current bounding box.north)]% |
|
19 \draw (0,0) node[inner sep=2mm,fill=cream,ultra thick,draw=red,rounded corners=2mm]% |
|
20 \bgroup\begin{minipage}{#1}\raggedright{}} |
|
21 {\end{minipage}\egroup;% |
|
22 \end{tikzpicture}\bigskip} |
|
23 |
|
24 \newcommand\grid[1]{% |
|
25 \begin{tikzpicture}[baseline=(char.base)] |
|
26 \path[use as bounding box] |
|
27 (0,0) rectangle (1em,1em); |
|
28 \draw[red!50, fill=red!20] |
|
29 (0,0) rectangle (1em,1em); |
|
30 \node[inner sep=1pt,anchor=base west] |
|
31 (char) at (0em,\gridraiseamount) {#1}; |
|
32 \end{tikzpicture}} |
|
33 \newcommand\gridraiseamount{0.12em} |
|
34 |
|
35 \makeatletter |
|
36 \newcommand\Grid[1]{% |
|
37 \@tfor\z:=#1\do{\grid{\z}}} |
|
38 \makeatother |
|
39 |
|
40 \newcommand\Vspace[1][.3em]{% |
|
41 \mbox{\kern.06em\vrule height.3ex}% |
|
42 \vbox{\hrule width#1}% |
|
43 \hbox{\vrule height.3ex}} |
|
44 |
|
45 \def\VS{\Vspace[0.6em]} |
|
46 |
|
47 |
|
48 \title[POSIX Lexing with Derivatives of Regexes] |
|
49 {\bf POSIX Lexing with\\ |
|
50 \bf Derivatives of Regular Expressions\\ |
|
51 \bf (Proof Pearl)} |
|
52 \author{Fahad Ausaf, Roy Dyckhoff and Christian Urban} |
|
53 \date{King's College London, University of St Andrews} |
|
54 |
|
55 \begin{document} |
|
56 \maketitle |
|
57 |
|
58 |
|
59 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
|
60 \begin{frame} |
|
61 \frametitle{Brzozowski's Derivatives of Regular Expressions} |
|
62 |
|
63 Idea: If \smath{r} matches the string \smath{c\!::\!s}, |
|
64 what is a regular expression that matches just \smath{s}? \\ |
|
65 |
|
66 \begin{center} |
|
67 \begin{tabular}{l@{\hspace{5mm}}lcl} |
|
68 chars: |
|
69 &\smath{\Zero \backslash c} & \smath{\dn} & \smath{\Zero}\\ |
|
70 &\smath{\One \backslash c} & \smath{\dn} & \smath{\Zero}\\ |
|
71 &\smath{d \backslash c} & \smath{\dn} & |
|
72 \smath{\textit{if}\;d = c\;\textit{then}\;\One\;\textit{else}\;\Zero}\\ |
|
73 &\smath{r_1 + r_2 \backslash c} & \smath{\dn} & |
|
74 \smath{r_1 \backslash c \,+\, r_2 \backslash c}\\ |
|
75 &\smath{r_1 \cdot r_2 \backslash c} & \smath{\dn} & |
|
76 \smath{\textit{if}\;\textit{nullable}\;r_1}\\ |
|
77 && & \smath{\textit{then}\;r_1\backslash c \cdot r_2 \,+\, r_2\backslash c |
|
78 \;\textit{else}\;r_1\backslash c \cdot r_2}\\ |
|
79 &\smath{r^* \backslash c} & \smath{\dn} & |
|
80 \smath{r\backslash c \,\cdot\, r^*}\bigskip\\ |
|
81 |
|
82 strings: |
|
83 &\smath{r\backslash []} & \smath{\dn} & \smath{r}\\ |
|
84 &\smath{r\backslash c\!::\!s} & \smath{\dn} & |
|
85 \smath{(r\backslash c)\backslash s}\\ |
|
86 \end{tabular} |
|
87 \end{center} |
|
88 \end{frame} |
|
89 |
|
90 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
|
91 |
|
92 \begin{frame} |
|
93 \frametitle{Brzozowski's Matcher} |
|
94 |
|
95 Does \smath{r_1} match string \smath{abc}? |
|
96 |
|
97 \begin{center} |
|
98 \begin{tikzpicture}[scale=2,node distance=1.3cm,every node/.style={minimum size=8mm}] |
|
99 \node (r1) {\smath{r_1}}; |
|
100 \node (r2) [right=of r1] {\smath{r_2}}; |
|
101 \draw[->,line width=1mm] (r1) -- (r2) node[above,midway] {\smath{\_\backslash a}}; |
|
102 \node (r3) [right=of r2] {\smath{r_3}}; |
|
103 \draw[->,line width=1mm] (r2) -- (r3) node[above,midway] {\smath{\_\backslash b}}; |
|
104 \node (r4) [right=of r3] {\smath{r_4}}; |
|
105 \draw[->,line width=1mm] (r3) -- (r4) node[above,midway] {\smath{\_\backslash c}}; |
|
106 \draw (r4) node[anchor=west] {\;\raisebox{3mm}{\smath{\;\;\textit{nullable}?}}}; |
|
107 \end{tikzpicture} |
|
108 \end{center}\pause |
|
109 |
|
110 It leads to an elegant functional program: |
|
111 |
|
112 \begin{center} |
|
113 \smath{\textit{matches}\,(r, s) \dn \textit{nullable}\,(r\backslash s)} |
|
114 \end{center}\pause |
|
115 |
|
116 It is an easy exercise to formally prove (e.g.~Coq, HOL, Isabelle): |
|
117 |
|
118 \begin{center} |
|
119 \smath{\textit{matches}\,(r, s)} if and only if |
|
120 \smath{s \in L(r)} |
|
121 \end{center}\pause |
|
122 |
|
123 {\bf But Brzozowski's matcher gives only a yes/no-answer.} |
|
124 \end{frame} |
|
125 |
|
126 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
|
127 |
|
128 \begin{frame} |
|
129 \frametitle{Sulzmann and Lu's Matcher} |
|
130 |
|
131 Sulzmann and Lu added a second phase in order to answer |
|
132 \alert{\textbf{how}} the regular expression matched the string. |
|
133 |
|
134 \begin{center}\small |
|
135 \begin{tikzpicture}[scale=1,node distance=0.8cm,every node/.style={minimum size=7mm}] |
|
136 \node (r1) {\smath{r_1}}; |
|
137 \node (r2) [right=of r1] {\smath{r_2}}; |
|
138 \draw[->,line width=1mm] (r1) -- (r2) node[above,midway] {\smath{\_\backslash a}}; |
|
139 \node (r3) [right=of r2] {\smath{r_3}}; |
|
140 \draw[->,line width=1mm] (r2) -- (r3) node[above,midway] {\smath{\_\backslash b}}; |
|
141 \node (r4) [right=of r3] {\smath{r_4}}; |
|
142 \draw[->,line width=1mm] (r3) -- (r4) node[above,midway] {\smath{\_\backslash c}}; |
|
143 \draw (r4) node[anchor=west] {\;\raisebox{3mm}{\smath{\;\;nullable?}}}; |
|
144 \node (v4) [below=of r4] {\smath{v_4}}; |
|
145 \draw[->,line width=1mm] (r4) -- (v4); |
|
146 \node (v3) [left=of v4] {\smath{v_3}}; |
|
147 \draw[->,line width=1mm] (v4) -- (v3) node[below,midway] {\smath{inj\,c}}; |
|
148 \node (v2) [left=of v3] {\smath{v_2}}; |
|
149 \draw[->,line width=1mm] (v3) -- (v2) node[below,midway] {\smath{inj\,b}}; |
|
150 \node (v1) [left=of v2] {\smath{v_1}}; |
|
151 \draw[->,line width=1mm] (v2) -- (v1) node[below,midway] {\smath{inj\,a}}; |
|
152 \draw (r4) node[anchor=north west] {\;\raisebox{-8mm}{\smath{mkeps}}}; |
|
153 |
|
154 \draw [decorate,decoration={brace,amplitude=10pt,raise=8mm}, |
|
155 line width=1.5mm] |
|
156 (r1) -- (r4) node [black,midway,above, yshift=12mm] |
|
157 {\large\bf first phase}; |
|
158 \draw [decorate,decoration={brace,amplitude=10pt,raise=8mm}, |
|
159 line width=1.5mm] |
|
160 (v4) -- (v1) node [black,midway,below, yshift=-12mm] |
|
161 {\large\bf second phase}; |
|
162 %% first phase |
|
163 \draw[line width=14mm, rounded corners, opacity=0.1, |
|
164 cap=round,join=round,color=yellow!30] |
|
165 (r1.center) -- (r4.center); |
|
166 %% second phase |
|
167 \draw[line width=14.1mm, rounded corners, opacity=0.2, |
|
168 cap=round,join=round,draw=black, fill=white] |
|
169 (r4) -- (v4.center) -- (v1.center); |
|
170 \draw[line width=14mm, rounded corners, opacity=0.2, |
|
171 cap=round,join=round,color=yellow!30] |
|
172 (r4) -- (v4.center) -- (v1.center); |
|
173 \end{tikzpicture} |
|
174 \end{center} |
|
175 |
|
176 There are several possible answers for |
|
177 \alert{\textbf{how}}: POSIX, GREEDY, \ldots |
|
178 |
|
179 \end{frame} |
|
180 |
|
181 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
|
182 |
|
183 \begin{frame}{Regular Expressions and Values} |
|
184 Regular expressions and their corresponding values (for how a |
|
185 regular expression matched a string):\bigskip |
|
186 |
|
187 \begin{columns}[c] % the "c" option specifies center vertical alignment |
|
188 \column{.4\textwidth} % column designated by a command |
|
189 \begin{tabular}{ l l l } |
|
190 \smath{r} & ::= & \smath{\Zero} \\ |
|
191 & $\mid$ & \smath{\One} \\ |
|
192 & $\mid$ & \smath{c} \\ |
|
193 & $\mid$ & \smath{r_1 \cdot r_2} \\ |
|
194 & $\mid$ & \smath{r_1 + r_2} \\ \\ |
|
195 & $\mid$ & \smath{r^*} \\ |
|
196 \end{tabular} |
|
197 \column{.4\textwidth} |
|
198 \begin{tabular}{ l l l } |
|
199 |
|
200 \smath{v} & ::= & \\ |
|
201 & $\mid$ & \smath{Empty} \\ |
|
202 & $\mid$ & \smath{Char(c)} \\ |
|
203 & $\mid$ & \smath{Seq(v_1\cdot v_2)} \\ |
|
204 & $\mid$ & \smath{Left(v)} \\ |
|
205 & $\mid$ & \smath{Right(v)} \\ |
|
206 & $\mid$ & \smath{[v_1,...,v_n]} |
|
207 \end{tabular} |
|
208 \end{columns} |
|
209 \end{frame} |
|
210 |
|
211 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
|
212 |
|
213 |
|
214 |
|
215 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
|
216 |
|
217 \begin{frame} |
|
218 \frametitle{POSIX Matching (needed for Lexing)} |
|
219 |
|
220 |
|
221 \begin{bubble}[10cm] |
|
222 {\bf Longest Match Rule:} The longest |
|
223 initial substring matched by any regular expression is taken |
|
224 as the next token. |
|
225 \end{bubble} |
|
226 |
|
227 \begin{bubble}[10cm] |
|
228 {\bf Rule Priority:} For a particular longest initial substring, |
|
229 the first regular expression that can match determines the |
|
230 token. |
|
231 \end{bubble} |
|
232 |
|
233 For example: \smath{r_{keywords} + r_{identifiers}}:\bigskip |
|
234 |
|
235 \begin{itemize} |
|
236 \item \smath{\texttt{\Grid{iffoo\VS bla}}} |
|
237 |
|
238 \item \smath{\texttt{\Grid{if\VS bla}}} |
|
239 \end{itemize} |
|
240 |
|
241 \end{frame} |
|
242 |
|
243 |
|
244 |
|
245 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
|
246 |
|
247 \begin{frame} |
|
248 \frametitle{Problems with POSIX} |
|
249 |
|
250 Grathwohl, Henglein and Rasmussen wrote: |
|
251 |
|
252 \begin{bubble}[10cm] |
|
253 \it ``The POSIX strategy is more complicated than the greedy because |
|
254 of the dependence on information |
|
255 about the length of matched strings in the various subexpressions.'' |
|
256 \end{bubble}\bigskip |
|
257 |
|
258 Also Kuklewicz maintains a unit-test repository for POSIX |
|
259 matching, which indicates that most POSIX mathcers are buggy. |
|
260 |
|
261 \begin{center} |
|
262 \url{http://www.haskell.org/haskellwiki/Regex_Posix} |
|
263 \end{center} |
|
264 |
|
265 \end{frame} |
|
266 |
|
267 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
|
268 |
|
269 \begin{frame} |
|
270 \frametitle{``Correctness'' by Sulzmann and Lu} |
|
271 \begin{itemize} |
|
272 \item Sulzmann \& Lu's idea is to order all possible |
|
273 answer such that they can prove the correct answer is |
|
274 the maximum |
|
275 \item The idea is taken from a GREEDY algorithm (and it |
|
276 works there)\bigskip\pause |
|
277 |
|
278 \item {\bf But} we made no progress in formalising |
|
279 Sulzmann \& Lu's idea, because |
|
280 |
|
281 \begin{itemize} |
|
282 \item transitivity, existence of maxima etc all fail to |
|
283 turn into real proofs |
|
284 \item the reason: the ordering works only if .... |
|
285 \item though we did find mistakes: |
|
286 \begin{center} |
|
287 \small |
|
288 ``How could I miss this? Well, I was rather careless when |
|
289 stating this Lemma :)\smallskip |
|
290 |
|
291 Great example how formal machine checked proofs (and |
|
292 proof assistants) can help to spot flawed reasoning steps.'' |
|
293 |
|
294 \end{center}\pause |
|
295 |
|
296 \begin{center} |
|
297 \small |
|
298 ``Well, I don't think there's any flaw. The issue is how to |
|
299 come up with a mechanical proof. In my world mathematical |
|
300 proof $=$ mechanical proof doesn't necessarily hold.'' |
|
301 |
|
302 \end{center}\pause |
|
303 \end{itemize} |
|
304 |
|
305 \end{itemize} |
|
306 \end{frame} |
|
307 |
|
308 |
|
309 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
|
310 |
|
311 \begin{frame} |
|
312 \frametitle{Sulzmann and Lu Matcher} |
|
313 |
|
314 We want to match the string $abc$ using $r_1$\\ |
|
315 |
|
316 \begin{center} |
|
317 \begin{tikzpicture}[scale=2,node distance=1.3cm,every node/.style={minimum size=8mm}] |
|
318 \node (r1) {$r_1$}; |
|
319 \node (r2) [right=of r1] {$r_2$}; |
|
320 \draw[->,line width=1mm] (r1) -- (r2) node[above,midway] {$der\,a$};\pause |
|
321 \node (r3) [right=of r2] {$r_3$}; |
|
322 \draw[->,line width=1mm] (r2) -- (r3) node[above,midway] {$der\,b$};\pause |
|
323 \node (r4) [right=of r3] {$r_4$}; |
|
324 \draw[->,line width=1mm] (r3) -- (r4) node[above,midway] {$der\,c$};\pause |
|
325 \draw (r4) node[anchor=west] {\;\raisebox{3mm}{$\;\;nullable?$}};\pause |
|
326 \node (v4) [below=of r4] {$v_4$}; |
|
327 \draw[->,line width=1mm] (r4) -- (v4);\pause |
|
328 \node (v3) [left=of v4] {$v_3$}; |
|
329 \draw[->,line width=1mm] (v4) -- (v3) node[below,midway] {$inj\,c$};\pause |
|
330 \node (v2) [left=of v3] {$v_2$}; |
|
331 \draw[->,line width=1mm] (v3) -- (v2) node[below,midway] {$inj\,b$};\pause |
|
332 \node (v1) [left=of v2] {$v_1$}; |
|
333 \draw[->,line width=1mm] (v2) -- (v1) node[below,midway] {$inj\,a$};\pause |
|
334 \draw[->,line width=0.5mm] (r3) -- (v3); |
|
335 \draw[->,line width=0.5mm] (r2) -- (v2); |
|
336 \draw[->,line width=0.5mm] (r1) -- (v1); |
|
337 \draw (r4) node[anchor=north west] {\;\raisebox{-8mm}{$mkeps$}}; |
|
338 \end{tikzpicture} |
|
339 \end{center} |
|
340 |
|
341 \end{frame} |
|
342 |
|
343 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
|
344 |
|
345 |
|
346 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
|
347 |
|
348 \begin{frame} |
|
349 \frametitle{Problems} |
|
350 \begin{itemize} |
|
351 |
|
352 \item Sulzmann: \ldots Let's assume $v$ is not a $POSIX$ value, |
|
353 then there must be another one \ldots contradiction.\pause |
|
354 |
|
355 \item Exists ? |
|
356 \begin{center} |
|
357 $L(r) \not= \varnothing \;\Rightarrow\; \exists v.\;POSIX(v, r)$ |
|
358 \end{center}\pause |
|
359 |
|
360 \item In the sequence case |
|
361 $Seq(v_1,v_2)\succ_{r_1\cdot r_2} Seq(v_1', v_2')$, |
|
362 the induction hypotheses require |
|
363 $|v_1| = |v_1'|$ and $|v_2| = |v_2'|$, but you only know |
|
364 |
|
365 \begin{center} |
|
366 $|v_1| @ |v_2| = |v_1'| @ |v_2'|$ |
|
367 \end{center}\pause |
|
368 |
|
369 \item Although one begins with the assumption that the two |
|
370 values have the same flattening, this cannot be maintained |
|
371 as one descends into the induction (alternative, sequence) |
|
372 |
|
373 \end{itemize} |
|
374 \end{frame} |
|
375 |
|
376 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
|
377 |
|
378 \begin{frame} |
|
379 \frametitle{Our Solution} |
|
380 |
|
381 \begin{itemize} |
|
382 \item A direct definition of what a POSIX value is, using the |
|
383 relation \smath{s \in r \to v} (our specification)\bigskip |
|
384 |
|
385 \begin{center} |
|
386 \smath{\infer{[] \in \epsilon \to Empty}{}}\hspace{15mm} |
|
387 \smath{\infer{[c] \in c \to Char(c)}{}}\bigskip\medskip |
|
388 |
|
389 \smath{\infer{s \in r_1 + r_2 \to Left(v)} |
|
390 {s \in r_1 \to v}}\hspace{10mm} |
|
391 \smath{\infer{s \in r_1 + r_2 \to Right(v)} |
|
392 {s \in r_2 \to v & s \not\in L(r_1)}}\bigskip\medskip |
|
393 |
|
394 \smath{\infer{s_1 @ s_2 \in r_1 \cdot r_2 \to Seq(v_1, v_2)} |
|
395 {\small\begin{array}{l} |
|
396 s_1 \in r_1 \to v_1 \\ |
|
397 s_2 \in r_2 \to v_2 \\ |
|
398 \neg(\exists s_3\,s_4.\; s_3 \not= [] |
|
399 \wedge s_3 @ s_4 = s_2 \wedge |
|
400 s_1 @ s_3 \in L(r_1) \wedge |
|
401 s_4 \in L(r_2)) |
|
402 \end{array}}} |
|
403 |
|
404 {\ldots} |
|
405 \end{center} |
|
406 |
|
407 \end{itemize} |
|
408 \end{frame} |
|
409 |
|
410 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
|
411 \begin{frame}[c] |
|
412 \frametitle{Properties} |
|
413 |
|
414 It is almost trival to prove: |
|
415 |
|
416 \begin{itemize} |
|
417 \item Uniqueness |
|
418 \begin{center} |
|
419 If $s \in r \to v_1$ and $s \in r \to v_2$ then |
|
420 $v_1 = v_2$ |
|
421 \end{center}\bigskip |
|
422 |
|
423 \item Correctness |
|
424 \begin{center} |
|
425 $lexer(r, s) = v$ if and only if $s \in r \to v$ |
|
426 \end{center} |
|
427 \end{itemize}\bigskip\bigskip\pause |
|
428 |
|
429 |
|
430 You can now start to implement optimisations and derive |
|
431 correctness proofs for them. But we still do not know whether |
|
432 |
|
433 \begin{center} |
|
434 $s \in r \to v$ |
|
435 \end{center} |
|
436 |
|
437 is a POSIX value according to Sulzmann \& Lu's definition |
|
438 (biggest value for $s$ and $r$) |
|
439 \end{frame} |
|
440 |
|
441 |
|
442 \end{document} |