204 \path (v2) |
273 \path (v2) |
205 edge [] node {$\inj \; r_1 \; c_2\;v_2$} (v1); |
274 edge [] node {$\inj \; r_1 \; c_2\;v_2$} (v1); |
206 |
275 |
207 \path (v1) |
276 \path (v1) |
208 edge [] node {$\inj \; r \; c_1 \; v_1$} (v); |
277 edge [] node {$\inj \; r \; c_1 \; v_1$} (v); |
209 \path (r) |
278 |
210 edge [dashed, bend right] node {} (stack); |
279 \path (stack) |
211 |
280 edge [dashed] node {} (-4.2, -5.2); |
212 \path (r1) |
281 \path (stack) |
213 edge [dashed, ] node {} (stack); |
282 edge [dashed] node {} (-4, -5.2); |
214 |
283 \path (stack) |
215 \path (c1) |
284 edge [dashed] node {} (-0.1, -5.2); |
216 edge [dashed, bend right] node {} (stack); |
285 \path (stack) |
217 |
286 edge [dashed] node {} (0.2, -5.26); |
218 \path (c2) |
287 \path (stack) |
219 edge [dashed] node {} (stack); |
288 edge [dashed] node {} (3.2, -5); |
220 \path (4.5, 5) |
289 \path (stack) |
221 edge [dashed, bend left] node {} (stack); |
290 edge [dashed] node {} (2.7, -5); |
222 \path (4.9, 5) |
291 \path (stack) |
223 edge [dashed, bend left] node {} (stack); |
292 edge [dashed] node {} (3.7, -5); |
224 \path (5.3, 5) |
|
225 edge [dashed, bend left] node {} (stack); |
|
226 \path (r2) |
|
227 edge [dashed, ] node {} (stack); |
|
228 \path (rn) |
|
229 edge [dashed, bend left] node {} (stack); |
|
230 \end{tikzpicture} |
293 \end{tikzpicture} |
231 %\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick] |
294 \caption{Stored $r_i, c_{i+1}$ Used by $\inj$} |
232 %%\draw (-6,-6) grid (6,6); |
295 \end{figure} |
233 %\node [radius = 0.5, circle, draw] (r) at (-6, 5) {$r$}; |
296 \noindent |
234 % |
297 Storing all $r_i, c_{i+1}$ pairs recursively |
235 %%\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$}; |
298 allows the algorithm to work in an elegant way, at the expense of |
236 %\node [circle, minimum size = 0.1] (c1) at (-4, 5.4) {$c_1$}; |
|
237 %% |
|
238 %%\node (-2, 5) (r1) circle [radius = 0.5] {$r_1$}; |
|
239 %\node [minimum size = 0.5, circle, draw] (r1) at (-2, 5) {$r_1$}; |
|
240 %% |
|
241 %%\node (0, 6) (c2) circle [radius = 0.3] {$c_2$}; |
|
242 %\node [circle, minimum size = 0.1] (c2) at (0, 5.4) {$c_2$}; |
|
243 %% |
|
244 %%\node (2, 5) (r2) circle [radius = 0.5] {$r_2$}; |
|
245 %\node [circle, draw] (r2) at (2, 5) {$r_2$}; |
|
246 %% |
|
247 %% |
|
248 %\node [] (ldots) at (4.5, 5) {}; |
|
249 %%\node (6, 5) (rn) circle [radius = 0.5] {$r_n$}; |
|
250 % |
|
251 %\node [minimum size = 0.5, circle, draw] (rn) at (6, 5) {$r_n$}; |
|
252 % |
|
253 %\node at ($(ldots)!.4!(rn)$) {\ldots}; |
|
254 % |
|
255 %\node [minimum size = 0.5, circle, draw] (vn) at (6, -5) {$v_n$}; |
|
256 % |
|
257 %\node [] (ldots2) at (3.5, -5) {}; |
|
258 % |
|
259 %\node [minimum size = 0.5, circle, draw] (v2) at (2, -5) {$v_2$}; |
|
260 % |
|
261 %\node at ($(ldots2)!.4!(v2)$) {\ldots}; |
|
262 % |
|
263 % |
|
264 %\node [circle, draw] (v1) at (-2, -5) {$v_1$}; |
|
265 % |
|
266 %\node [radius = 0.5, circle, draw] (v) at (-6, -5) {$v$}; |
|
267 % |
|
268 %\node [minimum size = 6cm, rectangle, draw] (stack) at (0, 0) {\Huge Stack}; |
|
269 % |
|
270 %\path |
|
271 % (r) |
|
272 % edge [->, >=stealth',shorten >=1pt] node[left] {} (r1); |
|
273 %\path |
|
274 % (r1) |
|
275 % edge [] node {} (r2); |
|
276 %\path (r2) |
|
277 % edge [] node {} (ldots); |
|
278 %\path (rn) |
|
279 % edge [] node {$\mkeps$} (vn); |
|
280 %\path (vn) |
|
281 % edge [] node {} (ldots2); |
|
282 %\path (v2) |
|
283 % edge [] node {} (v1); |
|
284 % |
|
285 %\path (v1) |
|
286 % edge [] node {} (v); |
|
287 %\path (r) |
|
288 % edge [] node {saved} (stack); |
|
289 % |
|
290 %\path (r1) |
|
291 % edge [] node {saved} (stack); |
|
292 %\end{tikzpicture} |
|
293 \noindent |
|
294 The information stored in characters and regular expressions |
|
295 make the algorithm work in an elegant way, at the expense of being |
|
296 storing quite a bit of verbose information. |
299 storing quite a bit of verbose information. |
297 |
300 The stack seems to grow at least quadratically fast with respect |
298 |
301 to the input (not taking into account the size bloat of $r_i$), |
299 |
302 which can be inefficient and prone to stack overflow. |
300 The lexer algorithm in Chapter \ref{Inj}, |
303 \section{Bitcoded Algorithm} |
301 stores information of previous lexing steps |
304 To address this, |
302 on a stack, in the form of regular expressions |
305 Sulzmann and Lu chose to define a new datatype |
303 and characters: $r_0$, $c_0$, $r_1$, $c_1$, etc. |
306 called \emph{annotated regular expression}, |
304 The red part represents what we already know during the first |
307 which condenses all the partial lexing information |
305 derivative phase, |
308 (that was originally stored in $r_i, c_{i+1}$ pairs) |
306 and the blue part represents the unknown part of input. |
309 into bitcodes. |
307 \begin{ceqn} |
310 Annotated regular expressions |
308 \begin{equation}%\label{graph:injLexer} |
311 are defined as the following |
309 \begin{tikzcd}[ampersand replacement=\&, execute at end picture={ |
312 Isabelle datatype \footnote{ We use subscript notation to indicate |
310 \begin{scope}[on background layer] |
313 that the bitcodes are auxiliary information that do not |
311 \node[rectangle, fill={red!30}, |
314 interfere with the structure of the regular expressions }: |
312 pattern=north east lines, pattern color=red, |
315 \begin{center} |
313 fit={(-3,-1) (-3, 1) (1, -1) |
316 \begin{tabular}{lcl} |
314 (1, 1)} |
317 $\textit{a}$ & $::=$ & $\ZERO$\\ |
315 ] |
318 & $\mid$ & $_{bs}\ONE$\\ |
316 {}; , |
319 & $\mid$ & $_{bs}{\bf c}$\\ |
317 \node[rectangle, fill={blue!20}, |
320 & $\mid$ & $_{bs}\sum\,as$\\ |
318 pattern=north east lines, pattern color=blue, |
321 & $\mid$ & $_{bs}a_1\cdot a_2$\\ |
319 fit= {(1, -1) (1, 1) (3, -1) (3, 1)} |
322 & $\mid$ & $_{bs}a^*$ |
320 ] |
323 \end{tabular} |
321 {}; |
324 \end{center} |
322 \end{scope}} |
325 \noindent |
323 ] |
326 where $bs$ stands for bit-codes, $a$ for $\mathbf{a}$nnotated regular |
324 r_0 \arrow[r, "\backslash c_0"] \arrow[d] \& r_1 \arrow[r, "\backslash c_1"] \arrow[d] \& r_2 \arrow[r, dashed] \arrow[d] \& r_n \arrow[d, "mkeps" description] \\ |
327 expressions and $as$ for lists of annotated regular expressions. |
325 v_0 \& v_1 \arrow[l,"inj_{r_0} c_0"] \& v_2 \arrow[l, "inj_{r_1} c_1"] \& v_n \arrow[l, dashed] \\ |
328 The alternative constructor, written, $\sum$, has been generalised to |
326 \end{tikzcd} |
329 accept a list of annotated regular expressions rather than just two. |
327 \end{equation} |
330 Why is it generalised? This is because when we open up nested |
328 \end{ceqn} |
331 alternatives, there could be more than two elements at the same level |
329 \noindent |
332 after de-duplication, which can no longer be stored in a binary |
330 The red area expands as we move towards $r_n$, |
333 constructor. |
331 indicating more is known about the lexing result. |
|
332 Despite knowing this partial lexing information during |
|
333 the forward derivative phase, we choose to store them |
|
334 all the way until $r_n$ is reached. |
|
335 Then we reconstruct the value character by character |
|
336 values at a later stage, using information in a Last-In-First-Out |
|
337 manner. Although the algorithm is elegant and natural, |
|
338 it can be inefficient and prone to stack overflow.\\ |
|
339 It turns out we can store lexing |
|
340 information on the fly, while still using regular expression |
|
341 derivatives. |
|
342 If we remove the individual |
|
343 lexing steps, and use red and blue areas as before |
|
344 to indicate consumed (seen) input and constructed |
|
345 partial value (before recovering the rest of the stack), |
|
346 one could see that the seen part's lexical information |
|
347 is stored in the form of a regular expression. |
|
348 Consider the regular expression $(aa)^* \cdot bc$ matching the string $aabc$ |
|
349 and assume we have just read the two characters $aa$: |
|
350 \begin{center} |
|
351 \begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}] |
|
352 \node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},] |
|
353 {Partial lexing info: $\ONE \cdot a \cdot (aa)^* \cdot bc$ etc. |
|
354 \nodepart{two} $\Seq(\ldots, \Seq(\Char(b), \Char(c)))$}; |
|
355 \end{tikzpicture} |
|
356 \end{center} |
|
357 \noindent |
|
358 In the injection-based lexing algorithm, we ``neglect" the red area |
|
359 by putting all the characters we have consumed and |
|
360 intermediate regular expressions on the stack when |
|
361 we go from left to right in the derivative phase. |
|
362 The red area grows till the string is exhausted. |
|
363 During the injection phase, the value in the blue area |
|
364 is built up incrementally, while the red area shrinks. |
|
365 Before we have recovered all characters and intermediate |
|
366 derivative regular expressions from the stack, |
|
367 what values these characters and regular expressions correspond |
|
368 to are unknown: |
|
369 \begin{center} |
|
370 \begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}] |
|
371 \node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={white!30,blue!20},] |
|
372 {$(\ONE \cdot \ONE) \cdot (aa)^* \cdot bc $ correspond to:$???$ |
|
373 \nodepart{two} $b c$ corresponds to $\Seq(\ldots, \Seq(\Char(b), \Char(c)))$}; |
|
374 %\caption{term 1 \ref{term:1}'s matching configuration} |
|
375 \end{tikzpicture} |
|
376 \end{center} |
|
377 \noindent |
|
378 However, they should be calculable, |
|
379 as characters and regular expression shapes |
|
380 after taking derivative w.r.t those characters |
|
381 have already been known, therefore in our example, |
|
382 we know that the value starts with two $a$s, |
|
383 and makes up to an iteration in a Kleene star: |
|
384 (We have put the injection-based lexing's partial |
|
385 result in the right part of the split rectangle |
|
386 to contrast it with the partial valued produced |
|
387 in a forward manner) |
|
388 \begin{center} |
|
389 \begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}] |
|
390 \node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},] |
|
391 {$\stackrel{Bitcoded}{\longrightarrow} \Seq(\Stars[\Char(a), \Char(a)], ???)$ |
|
392 \nodepart{two} $\Seq(\ldots, \Seq(\Char(b), \Char(c)))$ $\stackrel{Inj}{\longleftarrow}$}; |
|
393 %\caption{term 1 \ref{term:1}'s matching configuration} |
|
394 \end{tikzpicture} |
|
395 \end{center} |
|
396 \noindent |
|
397 If we do this kind of "attachment" |
|
398 and each time augment the attached partially |
|
399 constructed value when taking off a |
|
400 character: |
|
401 \begin{center} |
|
402 \begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}] |
|
403 \node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},] (spPoint) |
|
404 {$\Seq(\Stars[\Char(a), \Char(a)], \ldots)$ |
|
405 \nodepart{two} Remaining: $b c$}; |
|
406 \end{tikzpicture}\\ |
|
407 $\downarrow$\\ |
|
408 \begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}] |
|
409 \node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},] |
|
410 {$\Seq(\Stars[\Char(a), \Char(a)], \Seq(\Char(b), \ldots))$ |
|
411 \nodepart{two} Remaining: $c$}; |
|
412 \end{tikzpicture}\\ |
|
413 $\downarrow$\\ |
|
414 \begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}] |
|
415 \node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},] |
|
416 {$\Seq(\Stars[\Char(a), \Char(a)], \Seq(\Char(b), \Char(c)))$ |
|
417 \nodepart{two} EOF}; |
|
418 \end{tikzpicture} |
|
419 \end{center} |
|
420 \noindent |
|
421 In the end we could recover the value without a backward phase. |
|
422 But (partial) values are a bit clumsy to stick together with a regular expression, so |
|
423 we instead use bit-codes to encode them. |
|
424 |
|
425 Bits and bitcodes (lists of bits) are defined as: |
334 Bits and bitcodes (lists of bits) are defined as: |
426 \begin{center} |
335 \begin{center} |
427 $b ::= S \mid Z \qquad |
336 $b ::= S \mid Z \qquad |
428 bs ::= [] \mid b::bs |
337 bs ::= [] \mid b::bs |
429 $ |
338 $ |
430 \end{center} |
339 \end{center} |
431 |
340 \noindent |
432 \noindent |
341 We use $S$ and $Z$ rather than $1$ and $0$ is to avoid |
433 Using $S$ and $Z$ rather than $1$ and $0$ is to avoid |
|
434 confusion with the regular expressions $\ZERO$ and $\ONE$. |
342 confusion with the regular expressions $\ZERO$ and $\ONE$. |
435 Bitcodes (or |
343 The idea is to use the attached bitcodes |
436 bit-lists) can be used to encode values (or potentially incomplete values) in a |
344 to indicate which choice was made at a certain point |
437 compact form. This can be straightforwardly seen in the following |
345 during lexing. |
438 coding function from values to bitcodes: |
346 For example, |
|
347 $(_{Z}a+_{S}b) \backslash a$ gives us |
|
348 $_{Z}\ONE + \ZERO$, this $Z$ bitcode will |
|
349 later be used to decode that a left branch was |
|
350 selected at the time when the part $a+b$ is being taken |
|
351 derivative of. |
|
352 \subsection{A Bird's Eye View of the Bit-coded Lexer} |
|
353 Before we give out the rest of the functions and definitions |
|
354 related to our |
|
355 $\blexer$ (\emph{b}-itcoded lexer), we first provide a high-level |
|
356 view of the algorithm, so the reader does not get lost in |
|
357 the details. |
|
358 \begin{figure}[H] |
|
359 \begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick] |
|
360 %\draw (-6,-6) grid (6,6); |
|
361 |
|
362 \node [circle, draw] (r0) at (-6, 2) {$r$}; |
|
363 |
|
364 \node [radius = 0.5, circle, draw] (r) at (-6, 5) {$_{bs}a$}; |
|
365 \path (r0) |
|
366 edge [] node {$\internalise$} (r); |
|
367 %\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$}; |
|
368 \node [circle, minimum size = 0.1, draw] (c1) at (-4, 5.4) {$c_1$}; |
|
369 % |
|
370 %\node (-2, 5) (r1) circle [radius = 0.5] {$r_1$}; |
|
371 \node [minimum size = 1.0cm, circle, draw] (r1) at (-2, 5) {$_{bs_1}a_1$}; |
|
372 % |
|
373 %\node (0, 6) (c2) circle [radius = 0.3] {$c_2$}; |
|
374 \node [circle, minimum size = 0.1, draw] (c2) at (0, 5.4) {$c_2$}; |
|
375 % |
|
376 %\node (2, 5) (r2) circle [radius = 0.5] {$r_2$}; |
|
377 \node [circle, draw, minimum size = 1.4cm] (r2) at (2, 5) {$_{bs_2}a_2$}; |
|
378 % |
|
379 % |
|
380 \node [] (ldots) at (4.5, 5) {}; |
|
381 %\node (6, 5) (rn) circle [radius = 0.5] {$r_n$}; |
|
382 |
|
383 \node [minimum size = 2.2cm, circle, draw] (rn) at (6, 5) {$_{bs_n}a_n$}; |
|
384 |
|
385 \node at ($(ldots)!.1!(rn)$) {\ldots}; |
|
386 |
|
387 \node [minimum size = 0.5, circle, ] (v) at (6, 2) {$v$}; |
|
388 |
|
389 %\node [] (v2) at (4, -5) {}; |
|
390 % |
|
391 %\node [draw, cross out] (ldots2) at (5, -5) {}; |
|
392 % |
|
393 %\node at ($(ldots2)!.4!(v2)$) {\ldots}; |
|
394 |
|
395 |
|
396 \node [align = center] (decode) at (6.6, 3.2) {$\bmkeps$\\$\decode$}; |
|
397 |
|
398 \path (c1) |
|
399 edge [dashed, bend left] node {} (r0); |
|
400 |
|
401 \path (c2) |
|
402 edge [dashed, bend left] node {} (r0); |
|
403 |
|
404 \path (r1) |
|
405 edge [dashed, bend right] node {} (r2); |
|
406 |
|
407 |
|
408 \path |
|
409 (r) |
|
410 edge [dashed, bend right] node[left] {} (r1); |
|
411 |
|
412 \path |
|
413 (r) |
|
414 edge [->, >=stealth',shorten >=1pt] node[left] {} (r1); |
|
415 |
|
416 \path |
|
417 (r1) |
|
418 edge [] node {} (r2); |
|
419 \path (r2) |
|
420 edge [] node {} (ldots); |
|
421 \path (rn) |
|
422 edge [] node {} (v); |
|
423 |
|
424 \path (r0) |
|
425 edge [dashed, bend right] node {} (decode); |
|
426 %\path (v) |
|
427 %edge [] node {} (ldots2); |
|
428 |
|
429 |
|
430 |
|
431 \end{tikzpicture} |
|
432 \caption{Bird's Eye View of $\blexer$} |
|
433 \end{figure} |
|
434 \noindent |
|
435 The plain regular expressions |
|
436 are first ``lifted'' to an annotated regular expression, |
|
437 with the function called \emph{internalise}. |
|
438 Then the annotated regular expression $_{bs}a$ will |
|
439 go through successive derivatives with respect |
|
440 to the input stream of characters $c_1, c_2$ etc. |
|
441 Each time a derivative is taken, the bitcodes |
|
442 are moved around, discarded or augmented together |
|
443 with the regular expression they are attached to. |
|
444 After all input has been consumed, the |
|
445 bitcodes are collected by $\bmkeps$, |
|
446 which traverses the nullable part of the regular expression |
|
447 and collect the bitcodes along the way. |
|
448 The collected bitcodes are then $\decode$d with the guidance |
|
449 of the input regular expression $r$. |
|
450 The most notable improvements of $\blexer$ |
|
451 over $\lexer$ are |
|
452 \begin{itemize} |
|
453 \item |
|
454 |
|
455 An absence of the second injection phase. |
|
456 \item |
|
457 One need not store each pair of the |
|
458 intermediate regular expressions $_{bs_i}a_i$ and $c_{i+1}$. |
|
459 The previous annotated regular expression $_{bs_i}a_i$'s information is passed |
|
460 on without loss to its successor $_{bs_{i+1}}a_{i+1}$, |
|
461 and $c_{i+1}$'s already contained in the strings in $L\;r$ \footnote{ |
|
462 which can be easily recovered if we decode in the correct order}. |
|
463 \item |
|
464 The simplification works much better--one can maintain correctness |
|
465 while applying quite strong simplifications, as we shall wee. |
|
466 \end{itemize} |
|
467 \noindent |
|
468 In the next section we will |
|
469 give the operations needed in $\blexer$, |
|
470 with some remarks on the idea behind their definitions. |
|
471 \subsection{Operations in $\textit{Blexer}$} |
|
472 The first operation we define related to bit-coded regular expressions |
|
473 is how we move bits to the inside of regular expressions. |
|
474 Called $\fuse$, this operation attaches bit-codes |
|
475 to the front of an annotated regular expression: |
439 \begin{center} |
476 \begin{center} |
440 \begin{tabular}{lcl} |
477 \begin{tabular}{lcl} |
441 $\textit{code}(\Empty)$ & $\dn$ & $[]$\\ |
478 $\textit{fuse}\;bs \; \ZERO$ & $\dn$ & $\ZERO$\\ |
442 $\textit{code}(\Char\,c)$ & $\dn$ & $[]$\\ |
479 $\textit{fuse}\;bs\; _{bs'}\ONE$ & $\dn$ & |
443 $\textit{code}(\Left\,v)$ & $\dn$ & $Z :: code(v)$\\ |
480 $_{bs @ bs'}\ONE$\\ |
444 $\textit{code}(\Right\,v)$ & $\dn$ & $S :: code(v)$\\ |
481 $\textit{fuse}\;bs\;_{bs'}{\bf c}$ & $\dn$ & |
445 $\textit{code}(\Seq\,v_1\,v_2)$ & $\dn$ & $code(v_1) \,@\, code(v_2)$\\ |
482 $_{bs@bs'}{\bf c}$\\ |
446 $\textit{code}(\Stars\,[])$ & $\dn$ & $[Z]$\\ |
483 $\textit{fuse}\;bs\,_{bs'}\sum\textit{as}$ & $\dn$ & |
447 $\textit{code}(\Stars\,(v\!::\!vs))$ & $\dn$ & $S :: code(v) \;@\; |
484 $_{bs@bs'}\sum\textit{as}$\\ |
448 code(\Stars\,vs)$ |
485 $\textit{fuse}\;bs\; _{bs'}a_1\cdot a_2$ & $\dn$ & |
|
486 $_{bs@bs'}a_1 \cdot a_2$\\ |
|
487 $\textit{fuse}\;bs\,_{bs'}a^*$ & $\dn$ & |
|
488 $_{bs @ bs'}a^*$ |
449 \end{tabular} |
489 \end{tabular} |
450 \end{center} |
490 \end{center} |
451 \noindent |
491 |
452 Here $\textit{code}$ encodes a value into a bit-code by converting |
492 \noindent |
453 $\Left$ into $Z$, $\Right$ into $S$, and marks the start of any non-empty |
493 With \emph{fuse} we are able to define the $\internalise$ function |
454 star iteration by $S$. The border where a local star terminates |
494 that translates a ``standard'' regular expression into an |
455 is marked by $Z$. |
495 annotated regular expression. |
456 This coding is lossy, as it throws away the information about |
496 This function will be applied before we start |
457 characters, and also does not encode the ``boundary'' between two |
497 with the derivative phase of the algorithm. |
458 sequence values. Moreover, with only the bitcode we cannot even tell |
498 |
459 whether the $S$s and $Z$s are for $\Left/\Right$ or $\Stars$. The |
499 \begin{center} |
460 reason for choosing this compact way of storing information is that the |
500 \begin{tabular}{lcl} |
461 relatively small size of bits can be easily manipulated and ``moved |
501 $(\ZERO)^\uparrow$ & $\dn$ & $\ZERO$\\ |
462 around" in a regular expression. |
502 $(\ONE)^\uparrow$ & $\dn$ & $_{[]}\ONE$\\ |
463 |
503 $(c)^\uparrow$ & $\dn$ & $_{[]}{\bf c}$\\ |
464 Because of the lossiness, the process of decoding a bitlist requires additionally |
504 $(r_1 + r_2)^\uparrow$ & $\dn$ & |
465 a regular expression. The function $\decode$ is defined as: |
505 $_{[]}\sum[\textit{fuse}\,[Z]\,r_1^\uparrow,\, |
466 |
506 \textit{fuse}\,[S]\,r_2^\uparrow]$\\ |
|
507 $(r_1\cdot r_2)^\uparrow$ & $\dn$ & |
|
508 $_{[]}r_1^\uparrow \cdot r_2^\uparrow$\\ |
|
509 $(r^*)^\uparrow$ & $\dn$ & |
|
510 $_{[]}(r^\uparrow)^*$\\ |
|
511 \end{tabular} |
|
512 \end{center} |
|
513 \noindent |
|
514 We use an up arrow with postfix notation |
|
515 to denote this operation |
|
516 for convenience. |
|
517 The opposite of $\textit{internalise}$ is |
|
518 $\erase$, where all the bit-codes are removed, |
|
519 and the alternative operator $\sum$ for annotated |
|
520 regular expressions is transformed to the binary version |
|
521 in plain regular expressions. |
|
522 \begin{center} |
|
523 \begin{tabular}{lcl} |
|
524 $\ZERO_\downarrow$ & $\dn$ & $\ZERO$\\ |
|
525 $( _{bs}\ONE )_\downarrow$ & $\dn$ & $\ONE$\\ |
|
526 $( _{bs}\mathbf{c} )_\downarrow$ & $\dn$ & $\mathbf{c}$\\ |
|
527 $( _{bs} a_1 \cdot a_2 )_\downarrow$ & $\dn$ & |
|
528 $ (a_1) _\downarrow \cdot (a_2) _\downarrow$\\ |
|
529 $( _{bs} [])_\downarrow $ & $\dn$ & $\ZERO $\\ |
|
530 $( _{bs} [a] )_\downarrow$ & $\dn$ & $a_\downarrow$\\ |
|
531 $_{bs} \sum [a_1, \; a_2]$ & $\dn$ & $ (a_1) _\downarrow + ( a_2 ) _\downarrow $\\ |
|
532 $(_{bs} \sum (a :: as))_\downarrow$ & $\dn$ & $ a_\downarrow + \; (_{[]} \sum as)_\downarrow$\\ |
|
533 $( _{bs} a^* )_\downarrow$ & $\dn$ & $(a_\downarrow)^*$ |
|
534 \end{tabular} |
|
535 \end{center} |
|
536 \noindent |
|
537 We also abbreviate the $\erase\; a$ operation |
|
538 as $a_\downarrow$, for conciseness. |
|
539 |
|
540 Testing whether an annotated regular expression |
|
541 contains the empty string in its lauguage requires |
|
542 a dedicated function $\bnullable$. |
|
543 $\bnullable$ simply calls $\erase$ before $\nullable$. |
|
544 \begin{definition} |
|
545 $\bnullable \; a \dn \nullable \; (a_\downarrow)$ |
|
546 \end{definition} |
|
547 The function for collecting |
|
548 bitcodes from a |
|
549 (b)nullable regular expression is called $\bmkeps$. |
|
550 $\bmkeps$ is a variation of the $\textit{mkeps}$ function, |
|
551 which follows the path $\mkeps$ takes to traverse the |
|
552 $\nullable$ branches when visiting a regular expression, |
|
553 but gives back bitcodes instead of a value. |
|
554 \begin{center} |
|
555 \begin{tabular}{lcl} |
|
556 $\textit{bmkeps}\,(_{bs}\ONE)$ & $\dn$ & $bs$\\ |
|
557 $\textit{bmkeps}\,(_{bs}\sum a::\textit{as})$ & $\dn$ & |
|
558 $\textit{if}\;\textit{bnullable}\,a$\\ |
|
559 & &$\textit{then}\;bs\,@\,\textit{bmkeps}\,a$\\ |
|
560 & &$\textit{else}\;bs\,@\,\textit{bmkeps}\,(_{[]}\sum \textit{as})$\\ |
|
561 $\textit{bmkeps}\,(_{bs} a_1 \cdot a_2)$ & $\dn$ & |
|
562 $bs \,@\,\textit{bmkeps}\,a_1\,@\, \textit{bmkeps}\,a_2$\\ |
|
563 $\textit{bmkeps}\,(_{bs}a^*)$ & $\dn$ & |
|
564 $bs \,@\, [Z]$ |
|
565 \end{tabular} |
|
566 \end{center} |
|
567 \noindent |
|
568 $\bmkeps$ retrieves the value $v$'s |
|
569 information in the format |
|
570 of bitcodes, by travelling along the |
|
571 path of the regular expression that corresponds to a POSIX match, |
|
572 collecting all the bitcodes, and attaching $S$ to indicate the end of star |
|
573 iterations. \\ |
|
574 The bitcodes extracted by $\bmkeps$ need to be |
|
575 $\decode$d (with the guidance of a plain regular expression): |
467 %\begin{definition}[Bitdecoding of Values]\mbox{} |
576 %\begin{definition}[Bitdecoding of Values]\mbox{} |
468 \begin{center} |
577 \begin{center} |
469 \begin{tabular}{@{}l@{\hspace{1mm}}c@{\hspace{1mm}}l@{}} |
578 \begin{tabular}{@{}l@{\hspace{1mm}}c@{\hspace{1mm}}l@{}} |
470 $\textit{decode}'\,bs\,(\ONE)$ & $\dn$ & $(\Empty, bs)$\\ |
579 $\textit{decode}'\,bs\,(\ONE)$ & $\dn$ & $(\Empty, bs)$\\ |
471 $\textit{decode}'\,bs\,(c)$ & $\dn$ & $(\Char\,c, bs)$\\ |
580 $\textit{decode}'\,bs\,(c)$ & $\dn$ & $(\Char\,c, bs)$\\ |
514 By proving a more general version of the lemma, on $\decode'$: |
643 By proving a more general version of the lemma, on $\decode'$: |
515 \[\vdash v : r \implies \decode' \; ((\code \; v) @ ds) \; r = (v, ds) \] |
644 \[\vdash v : r \implies \decode' \; ((\code \; v) @ ds) \; r = (v, ds) \] |
516 Then setting $ds$ to be $[]$ and unfolding $\decode$ definition, |
645 Then setting $ds$ to be $[]$ and unfolding $\decode$ definition, |
517 we obtain the property. |
646 we obtain the property. |
518 \end{proof} |
647 \end{proof} |
519 With the $\code$ and $\decode$ functions in hand, we know how to |
648 \noindent |
520 switch between bit-codes and values. |
|
521 The next step is to integrate this information into regular expression. |
|
522 Attaching bits to the front of regular expressions is the solution Sulzamann and Lu |
|
523 gave for storing partial values in regular expressions. |
|
524 Annotated regular expressions are therefore defined as the Isabelle |
|
525 datatype: |
|
526 |
|
527 \begin{center} |
|
528 \begin{tabular}{lcl} |
|
529 $\textit{a}$ & $::=$ & $\ZERO$\\ |
|
530 & $\mid$ & $_{bs}\ONE$\\ |
|
531 & $\mid$ & $_{bs}{\bf c}$\\ |
|
532 & $\mid$ & $_{bs}\sum\,as$\\ |
|
533 & $\mid$ & $_{bs}a_1\cdot a_2$\\ |
|
534 & $\mid$ & $_{bs}a^*$ |
|
535 \end{tabular} |
|
536 \end{center} |
|
537 %(in \textit{ALTS}) |
|
538 |
|
539 \noindent |
|
540 where $bs$ stands for bit-codes, $a$ for $\mathbf{a}$nnotated regular |
|
541 expressions and $as$ for lists of annotated regular expressions. |
|
542 The alternative constructor, written, $\sum$, has been generalised to |
|
543 accept a list of annotated regular expressions rather than just two. |
|
544 Why is it generalised? This is because when we open up nested |
|
545 alternatives, there could be more than two elements at the same level |
|
546 after de-duplication, which can no longer be stored in a binary |
|
547 constructor. |
|
548 |
|
549 The first operation we define related to bit-coded regular expressions |
|
550 is how we move bits to the inside of regular expressions. |
|
551 Called $\fuse$, this operation is attaches bit-codes |
|
552 to the front of an annotated regular expression: |
|
553 \begin{center} |
|
554 \begin{tabular}{lcl} |
|
555 $\textit{fuse}\;bs \; \ZERO$ & $\dn$ & $\ZERO$\\ |
|
556 $\textit{fuse}\;bs\; _{bs'}\ONE$ & $\dn$ & |
|
557 $_{bs @ bs'}\ONE$\\ |
|
558 $\textit{fuse}\;bs\;_{bs'}{\bf c}$ & $\dn$ & |
|
559 $_{bs@bs'}{\bf c}$\\ |
|
560 $\textit{fuse}\;bs\,_{bs'}\sum\textit{as}$ & $\dn$ & |
|
561 $_{bs@bs'}\sum\textit{as}$\\ |
|
562 $\textit{fuse}\;bs\; _{bs'}a_1\cdot a_2$ & $\dn$ & |
|
563 $_{bs@bs'}a_1 \cdot a_2$\\ |
|
564 $\textit{fuse}\;bs\,_{bs'}a^*$ & $\dn$ & |
|
565 $_{bs @ bs'}a^*$ |
|
566 \end{tabular} |
|
567 \end{center} |
|
568 |
|
569 \noindent |
|
570 With \emph{fuse} we are able to define the $\internalise$ function |
|
571 that translates a ``standard'' regular expression into an |
|
572 annotated regular expression. |
|
573 This function will be applied before we start |
|
574 with the derivative phase of the algorithm. |
|
575 |
|
576 \begin{center} |
|
577 \begin{tabular}{lcl} |
|
578 $(\ZERO)^\uparrow$ & $\dn$ & $\ZERO$\\ |
|
579 $(\ONE)^\uparrow$ & $\dn$ & $_{[]}\ONE$\\ |
|
580 $(c)^\uparrow$ & $\dn$ & $_{[]}{\bf c}$\\ |
|
581 $(r_1 + r_2)^\uparrow$ & $\dn$ & |
|
582 $_{[]}\sum[\textit{fuse}\,[Z]\,r_1^\uparrow,\, |
|
583 \textit{fuse}\,[S]\,r_2^\uparrow]$\\ |
|
584 $(r_1\cdot r_2)^\uparrow$ & $\dn$ & |
|
585 $_{[]}r_1^\uparrow \cdot r_2^\uparrow$\\ |
|
586 $(r^*)^\uparrow$ & $\dn$ & |
|
587 $_{[]}(r^\uparrow)^*$\\ |
|
588 \end{tabular} |
|
589 \end{center} |
|
590 %\end{definition} |
|
591 |
|
592 \noindent |
|
593 We use an up arrow with postfix notation |
|
594 to denote this operation. |
|
595 for convenience. The $\textit{internalise} \; r$ |
|
596 notation is more cumbersome. |
|
597 The opposite of $\textit{internalise}$ is |
|
598 $\erase$, where all the bit-codes are removed, |
|
599 and the alternative operator $\sum$ for annotated |
|
600 regular expressions is transformed to the binary alternatives |
|
601 for plain regular expressions. |
|
602 \begin{center} |
|
603 \begin{tabular}{lcl} |
|
604 $\ZERO_\downarrow$ & $\dn$ & $\ZERO$\\ |
|
605 $( _{bs}\ONE )_\downarrow$ & $\dn$ & $\ONE$\\ |
|
606 $( _{bs}\mathbf{c} )_\downarrow$ & $\dn$ & $\mathbf{c}$\\ |
|
607 $( _{bs} a_1 \cdot a_2 )_\downarrow$ & $\dn$ & |
|
608 $ (a_1) _\downarrow \cdot (a_2) _\downarrow$\\ |
|
609 $( _{bs} [])_\downarrow $ & $\dn$ & $\ZERO $\\ |
|
610 $( _{bs} [a] )_\downarrow$ & $\dn$ & $a_\downarrow$\\ |
|
611 $_{bs} \sum [a_1, \; a_2]$ & $\dn$ & $ (a_1) _\downarrow + ( a_2 ) _\downarrow $\\ |
|
612 $(_{bs} \sum (a :: as))_\downarrow$ & $\dn$ & $ a_\downarrow + \; (_{[]} \sum as)_\downarrow$\\ |
|
613 $( _{bs} a^* )_\downarrow$ & $\dn$ & $(a_\downarrow)^*$ |
|
614 \end{tabular} |
|
615 \end{center} |
|
616 \noindent |
|
617 We also abbreviate the $\erase\; a$ operation |
|
618 as $a_\downarrow$, for conciseness. |
|
619 For bit-coded regular expressions, as a different datatype, |
|
620 testing whether they contain empty string in their lauguage requires |
|
621 a dedicated function $\bnullable$ |
|
622 which simply calls $\erase$ first before testing whether it is $\nullable$. |
|
623 \begin{definition} |
|
624 $\bnullable \; a \dn \nullable \; (a_\downarrow)$ |
|
625 \end{definition} |
|
626 The function for collecting the |
|
627 bitcodes at the end of the derivative |
|
628 phase from a (b)nullable regular expression |
|
629 is a generalised version of the $\textit{mkeps}$ function |
|
630 for annotated regular expressions, called $\textit{bmkeps}$: |
|
631 |
|
632 |
|
633 %\begin{definition}[\textit{bmkeps}]\mbox{} |
|
634 \begin{center} |
|
635 \begin{tabular}{lcl} |
|
636 $\textit{bmkeps}\,(_{bs}\ONE)$ & $\dn$ & $bs$\\ |
|
637 $\textit{bmkeps}\,(_{bs}\sum a::\textit{as})$ & $\dn$ & |
|
638 $\textit{if}\;\textit{bnullable}\,a$\\ |
|
639 & &$\textit{then}\;bs\,@\,\textit{bmkeps}\,a$\\ |
|
640 & &$\textit{else}\;bs\,@\,\textit{bmkeps}\,(_{[]}\sum \textit{as})$\\ |
|
641 $\textit{bmkeps}\,(_{bs} a_1 \cdot a_2)$ & $\dn$ & |
|
642 $bs \,@\,\textit{bmkeps}\,a_1\,@\, \textit{bmkeps}\,a_2$\\ |
|
643 $\textit{bmkeps}\,(_{bs}a^*)$ & $\dn$ & |
|
644 $bs \,@\, [Z]$ |
|
645 \end{tabular} |
|
646 \end{center} |
|
647 %\end{definition} |
|
648 |
|
649 \noindent |
|
650 $\bmkeps$ completes the value information by travelling along the |
|
651 path of the regular expression that corresponds to a POSIX value and |
|
652 collecting all the bitcodes, and attaching $S$ to indicate the end of star |
|
653 iterations. |
|
654 |
|
655 Now we give out the central part of this lexing algorithm, |
649 Now we give out the central part of this lexing algorithm, |
656 the $\bder$ function (stands for \emph{b}itcoded-derivative). |
650 the $\bder$ function (stands for \emph{b}itcoded-derivative): |
657 For most time we use the infix notation $(\_\backslash\_)$ |
|
658 to mean $\bder$ for brevity when |
|
659 there is no danger of confusion with derivatives on plain regular expressions. |
|
660 For example, we write $( _{[]}r^* ) \backslash c$ instead of $\bder \;c \; _{[]}r^*$, |
|
661 as the bitcodes at the front of $r^*$ indicates that it is |
|
662 a bit-coded regular expression, not a plain one. |
|
663 $\bder$ tells us how regular expressions can be recursively traversed, |
|
664 where the bitcodes are augmented and carried around |
|
665 when a derivative is taken. |
|
666 \begin{center} |
651 \begin{center} |
667 \begin{tabular}{@{}lcl@{}} |
652 \begin{tabular}{@{}lcl@{}} |
668 $(\ZERO)\,\backslash c$ & $\dn$ & $\ZERO$\\ |
653 $(\ZERO)\,\backslash c$ & $\dn$ & $\ZERO$\\ |
669 $(_{bs}\ONE)\,\backslash c$ & $\dn$ & $\ZERO$\\ |
654 $(_{bs}\ONE)\,\backslash c$ & $\dn$ & $\ZERO$\\ |
670 $(_{bs}{\bf d})\,\backslash c$ & $\dn$ & |
655 $(_{bs}{\bf d})\,\backslash c$ & $\dn$ & |
816 \end{tabular} |
812 \end{tabular} |
817 \end{center} |
813 \end{center} |
818 |
814 |
819 \noindent |
815 \noindent |
820 Ausaf and Urban formally proved the correctness of the $\blexer$, namely |
816 Ausaf and Urban formally proved the correctness of the $\blexer$, namely |
821 \begin{conjecture} |
817 \begin{property} |
822 $\blexer \;r \; s = \lexer \; r \; s$. |
818 $\blexer \;r \; s = \lexer \; r \; s$. |
823 \end{conjecture} |
819 \end{property} |
824 This was claimed but not formalised in Sulzmann and Lu's work. |
820 This was claimed but not formalised in Sulzmann and Lu's work. |
825 We introduce the proof later, after we give out all |
821 We introduce the proof later, after we give all |
826 the needed auxiliary functions and definitions |
822 the needed auxiliary functions and definitions. |
827 |
823 But before this we shall first walk the reader |
|
824 through a concrete example of our $\blexer$ calculating the right |
|
825 lexical information through bit-coded regular expressions.\\ |
|
826 Consider the regular expression $(aa)^* \cdot bc$ matching the string $aabc$ |
|
827 and assume we have just read the first character $a$: |
|
828 \begin{center} |
|
829 \begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}] |
|
830 \node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},] |
|
831 {$\ONE \cdot a \cdot (aa)^* \cdot bc$ |
|
832 \nodepart{two} $[] \iff \Seq \; (\Stars \; [\Seq\; a \; ??, \;]), ??$}; |
|
833 \end{tikzpicture} |
|
834 \end{center} |
|
835 \noindent |
|
836 We use the red area for (annotated) regular expressions and the blue |
|
837 area the (partial) bitcodes and (partial) values. |
|
838 In the injection-based lexing algorithm, we ``neglect" the red area |
|
839 by putting all the characters we have consumed and |
|
840 intermediate regular expressions on the stack when |
|
841 we go from left to right in the derivative phase. |
|
842 The red area grows till the string is exhausted. |
|
843 During the injection phase, the value in the blue area |
|
844 is built up incrementally, while the red area shrinks. |
|
845 Before we have recovered all characters and intermediate |
|
846 derivative regular expressions from the stack, |
|
847 what values these characters and regular expressions correspond |
|
848 to are unknown: |
|
849 \begin{center} |
|
850 \begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}] |
|
851 \node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={white!30,blue!20},] |
|
852 {$(\ONE \cdot \ONE) \cdot (aa)^* \cdot bc $ correspond to:$???$ |
|
853 \nodepart{two} $b c$ corresponds to $\Seq(\ldots, \Seq(\Char(b), \Char(c)))$}; |
|
854 %\caption{term 1 \ref{term:1}'s matching configuration} |
|
855 \end{tikzpicture} |
|
856 \end{center} |
|
857 \noindent |
|
858 However, they should be calculable, |
|
859 as characters and regular expression shapes |
|
860 after taking derivative w.r.t those characters |
|
861 have already been known, therefore in our example, |
|
862 we know that the value starts with two $a$s, |
|
863 and makes up to an iteration in a Kleene star: |
|
864 (We have put the injection-based lexing's partial |
|
865 result in the right part of the split rectangle |
|
866 to contrast it with the partial valued produced |
|
867 in a forward manner) |
|
868 \begin{center} |
|
869 \begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}] |
|
870 \node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},] |
|
871 {$\stackrel{Bitcoded}{\longrightarrow} \Seq(\Stars[\Char(a), \Char(a)], ???)$ |
|
872 \nodepart{two} $\Seq(\ldots, \Seq(\Char(b), \Char(c)))$ $\stackrel{Inj}{\longleftarrow}$}; |
|
873 %\caption{term 1 \ref{term:1}'s matching configuration} |
|
874 \end{tikzpicture} |
|
875 \end{center} |
|
876 \noindent |
|
877 If we do this kind of "attachment" |
|
878 and each time augment the attached partially |
|
879 constructed value when taking off a |
|
880 character: |
|
881 \begin{center} |
|
882 \begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}] |
|
883 \node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},] (spPoint) |
|
884 {$\Seq(\Stars[\Char(a), \Char(a)], \ldots)$ |
|
885 \nodepart{two} Remaining: $b c$}; |
|
886 \end{tikzpicture}\\ |
|
887 $\downarrow$\\ |
|
888 \begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}] |
|
889 \node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},] |
|
890 {$\Seq(\Stars[\Char(a), \Char(a)], \Seq(\Char(b), \ldots))$ |
|
891 \nodepart{two} Remaining: $c$}; |
|
892 \end{tikzpicture}\\ |
|
893 $\downarrow$\\ |
|
894 \begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}] |
|
895 \node [rectangle split, rectangle split horizontal, rectangle split parts=2, rectangle split part fill={red!30,blue!20},] |
|
896 {$\Seq(\Stars[\Char(a), \Char(a)], \Seq(\Char(b), \Char(c)))$ |
|
897 \nodepart{two} EOF}; |
|
898 \end{tikzpicture} |
|
899 \end{center} |
|
900 \noindent |
|
901 In the end we could recover the value without a backward phase. |
|
902 But (partial) values are a bit clumsy to stick together with a regular expression, so |
|
903 we instead use bit-codes to encode them. |
|
904 |
|
905 Bits and bitcodes (lists of bits) are defined as: |
|
906 \begin{center} |
|
907 $b ::= S \mid Z \qquad |
|
908 bs ::= [] \mid b::bs |
|
909 $ |
|
910 \end{center} |
|
911 |
|
912 \noindent |
828 |
913 |
829 %----------------------------------- |
914 %----------------------------------- |
830 % SUBSECTION 1 |
915 % SUBSECTION 1 |
831 %----------------------------------- |
916 %----------------------------------- |
832 \section{Specifications of Some Helper Functions} |
917 \section{Specifications of Some Helper Functions} |