# HG changeset patch
# User Chengsong
# Date 1660694953 -3600
# Node ID 35df9cdd36ca00a49af21cd3fbf3db5304c26e9a
# Parent  e71a6e2aca2dc23f8d8f5c5a7dc27312a037e3c5
more chap3

diff -r e71a6e2aca2d -r 35df9cdd36ca ChengsongTanPhdThesis/Chapters/Bitcoded1.tex
--- a/ChengsongTanPhdThesis/Chapters/Bitcoded1.tex	Mon Aug 15 17:26:08 2022 +0200
+++ b/ChengsongTanPhdThesis/Chapters/Bitcoded1.tex	Wed Aug 17 01:09:13 2022 +0100
@@ -11,11 +11,299 @@
 In this chapter, we are going to describe the bit-coded algorithm
 introduced by Sulzmann and Lu \parencite{Sulzmann2014} to address the growth problem of
 regular expressions.
+We have implemented their algorithm in Scala and found inefficiencies
+in it, such as de-duplication not working properly and a redundant
+fixpoint construction. We improve their algorithm and verify the
+improved version with formal proofs.
 \section{Bit-coded Algorithm}
-The lexer algorithm in Chapter \ref{Inj}, as shown in \ref{InjFigure},
+We first recap the lexer algorithm from Chapter \ref{Inj}:
+\begin{center}
+\begin{tabular}{lcl}
+	$\lexer \; r \; [] $ & $=$ & $\textit{if} \; (\nullable \; r)\; \textit{then}\;  \Some(\mkeps \; r) \; \textit{else} \; \None$\\
+	$\lexer \; r \;(c::s)$ & $=$ & $\textit{case}\; (\lexer \; (r\backslash c) \; s) \;\textit{of}\; $\\
+	& & $\quad \phantom{\mid}\; \None \implies \None$\\
+	& & $\quad \mid \Some(v) \implies \Some(\inj \; r\; c\; v)$
+\end{tabular}
+\end{center}
+\noindent
+The algorithm recursively calls $\lexer$ on
+each new character of the input,
+and before starting a child call
+it stores the information of previous lexing steps
+on a stack, in the form of regular expressions
+and characters: $r_0$, $c_0$, $r_1$, $c_1$, etc.
+Each descent into a deeper recursive call of $\lexer$
+pushes a new pair $r_i, c_i$ onto the call stack.
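+\noindent
+To make the recursive structure, and in particular the implicit
+call stack, more concrete, the following Scala snippet sketches
+$\lexer$ (we call it \texttt{lex}).  It is only a schematic
+illustration, not the verified implementation of this thesis: it
+assumes functions \textit{der}, \textit{nullable}, \textit{mkeps} and
+\textit{inj} defined as in Chapter \ref{Inj}, as well as datatypes
+\texttt{Rexp} and \texttt{Val} for regular expressions and values.
+\begin{verbatim}
+// schematic sketch only -- der, nullable, mkeps, inj, Rexp and Val
+// are assumed to be defined as in Chapter Inj
+def lex(r: Rexp, s: List[Char]): Option[Val] = s match {
+  case Nil => if (nullable(r)) Some(mkeps(r)) else None
+  case c :: cs =>
+    // r and c stay on the call stack while the child call runs ...
+    lex(der(c, r), cs) match {
+      case None    => None
+      // ... and are only used after the child call has returned
+      case Some(v) => Some(inj(r, c, v))
+    }
+}
+\end{verbatim}
+\noindent
+Every character of the input therefore leaves behind one stack frame,
+holding one regular expression and one character, until the injection
+phase unwinds the recursion.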
+\begin{figure} +\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick] +%\draw (-6,-6) grid (6,6); +\node [ circle ] (r) at (-6, 5) {$r$}; + +%\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$}; +\node [circle, minimum size = 0.1, draw] (c1) at (-4, 5.4) {$c_1$}; +% +%\node (-2, 5) (r1) circle [radius = 0.5] {$r_1$}; +\node [minimum size = 0.5, circle, draw] (r1) at (-2, 5) {$r_1$}; + +\node [minimum width = 2cm, rectangle, draw] (stack) at (0, 3) {Stack}; + +\path + (r) + edge [->, >=stealth',shorten >=1pt] node[left] {} (r1); + +\path (r1) + edge [bend right, dashed] node {saved} (stack); +\path (c1) + edge [bend right, dashed] node {} (stack); + + +\end{tikzpicture} +\caption{First Derivative Taken} +\end{figure} + + + +\begin{figure} +\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick] +%\draw (-6,-6) grid (6,6); +\node [ circle ] (r) at (-6, 5) {$r$}; + +%\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$}; +\node [circle, minimum size = 0.1, ] (c1) at (-4, 5.4) {$c_1$}; +% +%\node (-2, 5) (r1) circle [radius = 0.5] {$r_1$}; +\node [minimum size = 0.5, circle, ] (r1) at (-2, 5) {$r_1$}; + + +\node [circle, minimum size = 0.1, draw] (c2) at (0, 5.4) {$c_2$}; +% +%\node (2, 5) (r2) circle [radius = 0.5] {$r_2$}; +\node [circle, draw] (r2) at (2, 5) {$r_2$}; +\node [minimum width = 3cm, minimum height = 1cm, rectangle, draw] (stack) at (0, 2) {\large Stack}; + +\path + (r) + edge [->, >=stealth',shorten >=1pt] node[left] {} (r1); + +\path (r2) + edge [bend right, dashed] node {} (stack); +\path (c2) + edge [bend right, dashed] node {} (stack); + +\path (r1) + edge [] node {} (r2); + +\end{tikzpicture} +\caption{Second Derivative Taken} +\end{figure} +\noindent +As the number of derivative steps increase, +the stack would increase: + +\begin{figure} +\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick] +%\draw (-6,-6) grid (6,6); +\node [ circle ] (r) at (-6, 5) {$r$}; + +%\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$}; +\node [circle, minimum size = 0.1, ] (c1) at (-4, 5.4) {$c_1$}; +% +%\node (-2, 5) (r1) circle [radius = 0.5] {$r_1$}; +\node [minimum size = 0.5, circle, ] (r1) at (-2, 5) {$r_1$}; + + +\node [circle, minimum size = 0.1, ] (c2) at (0, 5.4) {$c_2$}; +% +%\node (2, 5) (r2) circle [radius = 0.5] {$r_2$}; +\node [circle, ] (r2) at (2, 5) {$r_2$}; +\node [minimum width = 4cm, minimum height = 2.5cm, rectangle, draw] (stack) at (0, 1) { \large Stack}; + +\node [] (ldots) at (3.5, 5) {}; +%\node (6, 5) (rn) circle [radius = 0.5] {$r_n$}; + +\node [minimum size = 0.5, circle, ] (rn) at (6, 5) {}; + +\node (rldots) at ($(ldots)!.4!(rn)$) {\ldots}; + +\path + (r) + edge [->, >=stealth',shorten >=1pt] node[left] {} (r1); + +\path (rldots) + edge [bend left, dashed] node {} (stack); + +\path (r1) + edge [] node {} (r2); + +\path (r2) + edge [] node {} (ldots); +\path (ldots) + edge [bend left, dashed] node {} (stack); +\path (5.03, 4.9) + edge [bend left, dashed] node {} (stack); +\end{tikzpicture} +\caption{More Derivatives Taken} +\end{figure} +\noindent +After all derivatives have been taken, the stack grows to a maximum size +and the pair of regular expressions and characters $r_i, c_{i+1}$ +are then popped out and used in the injection phase: + +\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick] +%\draw (-6,-6) grid (6,6); +\node [radius = 0.5, circle, draw] (r) at (-6, 5) {$r$}; + +%\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$}; +\node [circle, minimum size = 0.1, draw] (c1) at (-4, 5.4) {$c_1$}; +% +%\node (-2, 5) (r1) circle [radius = 0.5] 
{$r_1$}; +\node [minimum size = 0.5, circle, draw] (r1) at (-2, 5) {$r_1$}; +% +%\node (0, 6) (c2) circle [radius = 0.3] {$c_2$}; +\node [circle, minimum size = 0.1, draw] (c2) at (0, 5.4) {$c_2$}; +% +%\node (2, 5) (r2) circle [radius = 0.5] {$r_2$}; +\node [circle, draw] (r2) at (2, 5) {$r_2$}; +% +% +\node [] (ldots) at (4.5, 5) {}; +%\node (6, 5) (rn) circle [radius = 0.5] {$r_n$}; + +\node [minimum size = 0.5, circle, draw] (rn) at (6, 5) {$r_n$}; + +\node at ($(ldots)!.4!(rn)$) {\ldots}; + +\node [minimum size = 0.5, circle, draw] (vn) at (6, -5) {$v_n$}; + +\node [] (ldots2) at (3.5, -5) {}; + +\node [minimum size = 0.5, circle, draw] (v2) at (2, -5) {$v_2$}; + +\node at ($(ldots2)!.4!(v2)$) {\ldots}; + + +\node [circle, draw] (v1) at (-2, -5) {$v_1$}; + +\node [radius = 0.5, circle, draw] (v) at (-6, -5) {$v$}; + +\node [minimum size = 6cm, rectangle, draw] (stack) at (0, 0) {\Huge Stack}; + +\path + (r) + edge [->, >=stealth',shorten >=1pt] node[left] {} (r1); +\path + (r1) + edge [] node {} (r2); +\path (r2) + edge [] node {} (ldots); +\path (rn) + edge [] node {$\mkeps$} (vn); +\path (vn) + edge [] node {} (ldots2); +\path (v2) + edge [] node {$\inj \; r_1 \; c_2\;v_2$} (v1); + +\path (v1) + edge [] node {$\inj \; r \; c_1 \; v_1$} (v); +\path (r) + edge [dashed, bend right] node {} (stack); + +\path (r1) + edge [dashed, ] node {} (stack); + +\path (c1) + edge [dashed, bend right] node {} (stack); + +\path (c2) + edge [dashed] node {} (stack); +\path (4.5, 5) + edge [dashed, bend left] node {} (stack); +\path (4.9, 5) + edge [dashed, bend left] node {} (stack); +\path (5.3, 5) + edge [dashed, bend left] node {} (stack); +\path (r2) + edge [dashed, ] node {} (stack); +\path (rn) + edge [dashed, bend left] node {} (stack); +\end{tikzpicture} +%\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,thick] +%%\draw (-6,-6) grid (6,6); +%\node [radius = 0.5, circle, draw] (r) at (-6, 5) {$r$}; +% +%%\node (-4, 6) (c1) circle [radius = 0.3] {$c_1$}; +%\node [circle, minimum size = 0.1] (c1) at (-4, 5.4) {$c_1$}; +%% +%%\node (-2, 5) (r1) circle [radius = 0.5] {$r_1$}; +%\node [minimum size = 0.5, circle, draw] (r1) at (-2, 5) {$r_1$}; +%% +%%\node (0, 6) (c2) circle [radius = 0.3] {$c_2$}; +%\node [circle, minimum size = 0.1] (c2) at (0, 5.4) {$c_2$}; +%% +%%\node (2, 5) (r2) circle [radius = 0.5] {$r_2$}; +%\node [circle, draw] (r2) at (2, 5) {$r_2$}; +%% +%% +%\node [] (ldots) at (4.5, 5) {}; +%%\node (6, 5) (rn) circle [radius = 0.5] {$r_n$}; +% +%\node [minimum size = 0.5, circle, draw] (rn) at (6, 5) {$r_n$}; +% +%\node at ($(ldots)!.4!(rn)$) {\ldots}; +% +%\node [minimum size = 0.5, circle, draw] (vn) at (6, -5) {$v_n$}; +% +%\node [] (ldots2) at (3.5, -5) {}; +% +%\node [minimum size = 0.5, circle, draw] (v2) at (2, -5) {$v_2$}; +% +%\node at ($(ldots2)!.4!(v2)$) {\ldots}; +% +% +%\node [circle, draw] (v1) at (-2, -5) {$v_1$}; +% +%\node [radius = 0.5, circle, draw] (v) at (-6, -5) {$v$}; +% +%\node [minimum size = 6cm, rectangle, draw] (stack) at (0, 0) {\Huge Stack}; +% +%\path +% (r) +% edge [->, >=stealth',shorten >=1pt] node[left] {} (r1); +%\path +% (r1) +% edge [] node {} (r2); +%\path (r2) +% edge [] node {} (ldots); +%\path (rn) +% edge [] node {$\mkeps$} (vn); +%\path (vn) +% edge [] node {} (ldots2); +%\path (v2) +% edge [] node {} (v1); +% +%\path (v1) +% edge [] node {} (v); +%\path (r) +% edge [] node {saved} (stack); +% +%\path (r1) +% edge [] node {saved} (stack); +%\end{tikzpicture} +\noindent +The information stored in characters and regular expressions +make 
the algorithm work in an elegant way, at the expense of
+storing quite a bit of verbose information.
+
+
+
+The lexer algorithm in Chapter \ref{Inj} stores information of previous lexing steps on a stack, in the form of regular expressions and characters: $r_0$, $c_0$, $r_1$, $c_1$, etc.
+The red part represents what we already know during the first
+derivative phase,
+and the blue part represents the unknown part of the input.
 \begin{ceqn}
 \begin{equation}%\label{graph:injLexer}
 	\begin{tikzcd}[ampersand replacement=\&, execute at end picture={
@@ -39,22 +327,19 @@
 \end{equation}
 \end{ceqn}
 \noindent
-The red part represents what we already know during the first
-derivative phase,
-and the blue part represents the unknown part of input.
 The red area expands as we move towards $r_n$,
-indicating an increasing stack size during lexing.
-Despite having some partial lexing information during
+indicating that more is known about the lexing result.
+Despite knowing this partial lexing information during
 the forward derivative phase, we choose to store them
-temporarily, only to convert the information to lexical
-values at a later stage. In essence we are repeating work we
-have already done.
-This is both inefficient and prone to stack overflow.
-A natural question arises as to whether we can store lexing
+all the way until $r_n$ is reached.
+Only then do we reconstruct the value, character by character,
+using the stored information in a Last-In-First-Out
+manner. Although the algorithm is elegant and natural,
+it can be inefficient and prone to stack overflow.\\
+It turns out we can store lexing
 information on the fly, while still using regular expression
 derivatives.
-
-If we remove the details of the individual
+If we remove the individual
 lexing steps, and use red and blue areas as before
 to indicate consumed (seen) input and constructed
 partial value (before recovering the rest of the stack),
diff -r e71a6e2aca2d -r 35df9cdd36ca ChengsongTanPhdThesis/Chapters/Bitcoded2.tex
--- a/ChengsongTanPhdThesis/Chapters/Bitcoded2.tex	Mon Aug 15 17:26:08 2022 +0200
+++ b/ChengsongTanPhdThesis/Chapters/Bitcoded2.tex	Wed Aug 17 01:09:13 2022 +0100
@@ -17,13 +17,82 @@
 \section{Simplification for Annotated Regular Expressions}
 The first thing we notice in the fast growth of examples such as $(a^*a^*)^*$'s
 and $(a^* + (aa)^*)^*$'s derivatives is that a lot of duplicated sub-patterns
-are scattered around different levels:
-
+are scattered around different levels, and therefore require
+de-duplication at different levels:
 \begin{center}
-	$(a^*a^*)^* \rightarrow (a^*a^* + a^*)\cdot(a^*a^*)^*$\\
+	$(a^*a^*)^* \rightarrow (a^*a^* + a^*)\cdot(a^*a^*)^* \rightarrow $\\
 	$((a^*a^* + a^*) + a^*)\cdot(a^*a^*)^* + (a^*a^* + a^*)\cdot(a^*a^*)^*$
 \end{center}
 \noindent
+As we have already mentioned in equation \ref{eqn:growth2},
+a simple-minded simplification function cannot simplify
+$((a^*a^* + a^*) + a^*)\cdot(a^*a^*)^* + (a^*a^* + a^*)\cdot(a^*a^*)^*$
+any further.
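+To see concretely why a single-level de-duplication is not enough,
+consider the following small Scala sketch.  It is our own
+illustration (not Sulzmann and Lu's code): alternatives are
+represented as a list, mirroring the $\sum$ constructor used below,
+and duplicates are removed only among the immediate children of each
+sum, in the style of \textit{nub}/\textit{distinct}.
+\begin{verbatim}
+// illustration only: a naive, single-level de-duplication
+sealed trait Rexp
+case object ZERO extends Rexp
+case object ONE extends Rexp
+case class CHAR(c: Char) extends Rexp
+case class ALTS(rs: List[Rexp]) extends Rexp   // sum of alternatives
+case class SEQ(r1: Rexp, r2: Rexp) extends Rexp
+case class STAR(r: Rexp) extends Rexp
+
+// remove duplicate alternatives of each sum, but never flatten a
+// nested sum into its parent
+def simpNaive(r: Rexp): Rexp = r match {
+  case ALTS(rs)    => ALTS(rs.map(simpNaive).distinct)
+  case SEQ(r1, r2) => SEQ(simpNaive(r1), simpNaive(r2))
+  case _           => r
+}
+
+val astar = STAR(CHAR('a'))
+val inner = ALTS(List(SEQ(astar, astar), astar))   // a*a* + a*
+val lhs   = SEQ(ALTS(List(inner, astar)), STAR(SEQ(astar, astar)))
+val rhs   = SEQ(inner, STAR(SEQ(astar, astar)))
+val bigR  = ALTS(List(lhs, rhs))
+
+assert(simpNaive(bigR) == bigR)   // nothing is removed
+\end{verbatim}
+\noindent
+The duplicate $a^*$ sits one level deeper than the sum on which
+\textit{distinct} operates, and the two big summands only become
+syntactically equal after the inner sum has been cleaned up, so a
+single-level de-duplication removes nothing.  Given that duplicates
+can hide arbitrarily deep inside nested alternatives,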
+we would expect a better simplification function to work in the +following way: +\begin{gather*} + ((a^*a^* + \underbrace{a^*}_\text{A})+\underbrace{a^*}_\text{duplicate of A})\cdot(a^*a^*)^* + + \underbrace{(a^*a^* + a^*)\cdot(a^*a^*)^*}_\text{further simp removes this}.\\ + \bigg\downarrow \\ + (a^*a^* + a^* + \color{gray} + a^* \color{black})\cdot(a^*a^*)^* + + \underbrace{(a^*a^* + a^*)\cdot(a^*a^*)^*}_\text{further simp removes this} \\ + \bigg\downarrow \\ + (a^*a^* + a^* + )\cdot(a^*a^*)^* + \color{gray} + (a^*a^* + a^*) \cdot(a^*a^*)^* +\end{gather*} +\noindent +This motivating example came from testing Sulzmann and Lu's +algorithm: their simplification does +not work! +We quote their $\textit{simp}$ function verbatim here: +\begin{center} + \begin{tabular}{lcl} + $\simp\; _{bs}(_{bs'}\ONE \cdot r)$ & $\dn$ & + $\textit{if} \; (\textit{zeroable} \; r)\; \textit{then} \;\; \ZERO$\\ + & &$\textit{else}\;\; \fuse \; (bs@ bs') \; r$\\ + $\simp\;(_{bs}r_1\cdot r_2)$ & $\dn$ & $\textit{if} + \; (\textit{zeroable} \; r_1 \; \textit{or} \; \textit{zeroable}\; r_2)\; + \textit{then} \;\; \ZERO$\\ + & & $\textit{else}\;\;_{bs}((\simp\;r_1)\cdot + (\simp\; r_2))$\\ + $\simp \; _{bs}\sum []$ & $\dn$ & $\ZERO$\\ + $\simp \; _{bs}\sum ((_{bs'}\sum rs_1) :: rs_2)$ & $\dn$ & + $_{bs}\sum ((\map \; (\fuse \; bs')\; rs_1) @ rs_2)$\\ + $\simp \; _{bs}\sum[r]$ & $\dn$ & $\fuse \; bs \; (\simp \; r)$\\ + $\simp \; _{bs}\sum(r::rs)$ & $\dn$ & $_{bs}\sum + (\nub \; (\filter \; (\not \circ \zeroable)\;((\simp \; r) :: \map \; \simp \; rs)))$\\ + + \end{tabular} +\end{center} +\noindent +the $\textit{zeroable}$ predicate +which tests whether the regular expression +is equivalent to $\ZERO$, +is defined as: +\begin{center} + \begin{tabular}{lcl} + $\zeroable \; _{bs}\sum (r::rs)$ & $\dn$ & $\zeroable \; r\;\; \land \;\; + \zeroable \;_{[]}\sum\;rs $\\ + $\zeroable\;_{bs}(r_1 \cdot r_2)$ & $\dn$ & $\zeroable\; r_1 \;\; \lor \;\; \zeroable \; r_2$\\ + $\zeroable\;_{bs}r^*$ & $\dn$ & $\textit{false}$ \\ + $\zeroable\;_{bs}c$ & $\dn$ & $\textit{false}$\\ + $\zeroable\;_{bs}\ONE$ & $\dn$ & $\textit{false}$\\ + $\zeroable\;_{bs}\ZERO$ & $\dn$ & $\textit{true}$ + \end{tabular} +\end{center} +%\[ +%\begin{aligned} +%&\textit{isPhi} \left(b s @ r i^{*}\right)=\text { False } \\ +%&\textit{isPhi}\left(b s @ r i_{1} r i_{2}\right)=\textit{isPhi} r i_{1} \vee \textit{isPhi} r i_{2} \\ +%&\textit{isPhi} \left(b s @ r i_{1} \oplus r i_{2}\right)=\textit{isPhi} r i_{1} \wedge \textit{isPhi} r i_{2} \\ +%&\textit{isPhi}(b s @ l)= False \\ +%&\textit{isPhi}(b s @ \epsilon)= False \\ +%&\textit{isPhi} \; \ZERO = True +%\end{aligned} +%\] +\noindent Despite that we have already implemented the simple-minded simplifications such as throwing away useless $\ONE$s and $\ZERO$s. The simplification rule $r + r \rightarrow $ cannot make a difference either diff -r e71a6e2aca2d -r 35df9cdd36ca ChengsongTanPhdThesis/Chapters/Inj.tex --- a/ChengsongTanPhdThesis/Chapters/Inj.tex Mon Aug 15 17:26:08 2022 +0200 +++ b/ChengsongTanPhdThesis/Chapters/Inj.tex Wed Aug 17 01:09:13 2022 +0100 @@ -278,7 +278,7 @@ \end{center} \noindent Brzozowski noticed that this operation -can be ``mirrored" on regular expressions which +can be ``mirrored'' on regular expressions which he calls the derivative of a regular expression $r$ with respect to a character $c$, written $r \backslash c$. 
This infix operator @@ -288,9 +288,17 @@ The derivative operation on regular expression is defined such that the language of the derivative result coincides with the language of the original -regular expression's language being taken the language +regular expression being taken derivative with respect to the same character: -\begin{center} +\begin{property} + +\[ + L \; (r \backslash c) = \Der \; c \; (L \; r) +\] +\end{property} +\noindent +Pictorially, this looks as follows: + \parskip \baselineskip \def\myupbracefill#1{\rotatebox{90}{\stretchto{\{}{#1}}} \def\rlwd{.5pt} @@ -308,25 +316,98 @@ \;\ldots\}$} } \Longstack{ - $\stackrel{\backslash c}{\longrightarrow}$ + $\stackrel{\backslash c}{\xrightarrow{\hspace*{8cm}}}$ } \Longstack{ - \notate{$r\backslash c$}{2}{$L \; (r\backslash c)= + \notate{$r\backslash c$}{1}{$L \; (r\backslash c)= \{\ldots,\;s_1,\;\ldots\}$} } -\end{center} -\begin{center} +\\ +The derivatives on regular expression can again be +generalised to a string. +One could compute $r\backslash s$ and test membership of $s$ +in $L \; r$ by checking +whether the empty string is in the language of +$r\backslash s$. + -\[ -L(r \backslash c) = \Der \; c \; L(r) -\] +\Longstack{ + \notate{$r_{start}$}{4}{ + \Longstack{$L \; r_{start} = \{\ldots, \;$ + \notate{$s$}{1}{$c_1::s_1$} + $, \ldots\} $ + } + } +} +\Longstack{ + $\stackrel{\backslash c_1}{ \xrightarrow{\hspace*{1.8cm}} }$ +} +\Longstack{ + \notate{$r_1$}{3}{ + $r_1 = r_{start}\backslash c_1$, + $L \; r_1 = $ + \Longstack{ + $\{ \ldots,\;$ \notate{$s_1$}{1}{$c_2::s_2$} + $,\; \ldots \}$ + } +} +} +\Longstack{ + $\stackrel{\backslash c_2}{\xrightarrow{\hspace*{1.8cm}}}$ +} +\Longstack{ + $r_2$ +} +\Longstack{ + $ \xdashrightarrow{\hspace*{0.3cm} \backslash c_3 \ldots \ldots \ldots \hspace*{0.3cm}} $ +} +\Longstack{ + \notate{$r_{end}$}{1}{ + $L \; r_{end} = \{\ldots, \; [], \ldots\}$} +} + + + +\begin{property} + $s \in L \; r_{start} \iff [] \in L \; r_{end}$ +\end{property} +\noindent +Now we give the recursive definition of derivative on +regular expressions, so that it satisfies the properties above. +The derivative function, written $r\backslash c$, +defines how a regular expression evolves into +a new one after all the string it contains is acted on: +if it starts with $c$, then the character is chopped of, +if not, that string is removed. +\begin{center} +\begin{tabular}{lcl} + $\ZERO \backslash c$ & $\dn$ & $\ZERO$\\ + $\ONE \backslash c$ & $\dn$ & $\ZERO$\\ + $d \backslash c$ & $\dn$ & + $\mathit{if} \;c = d\;\mathit{then}\;\ONE\;\mathit{else}\;\ZERO$\\ +$(r_1 + r_2)\backslash c$ & $\dn$ & $r_1 \backslash c \,+\, r_2 \backslash c$\\ +$(r_1 \cdot r_2)\backslash c$ & $\dn$ & $\mathit{if} \, [] \in L(r_1)$\\ + & & $\mathit{then}\;(r_1\backslash c) \cdot r_2 \,+\, r_2\backslash c$\\ + & & $\mathit{else}\;(r_1\backslash c) \cdot r_2$\\ + $(r^*)\backslash c$ & $\dn$ & $(r\backslash c) \cdot r^*$\\ +\end{tabular} \end{center} \noindent -where we do derivatives on the regular expression -$r$ and test membership of $s$ by checking -whether the empty string is in the language of -$r\backslash s$. -For example in the sequence case we have +The most involved cases are the sequence case +and the star case. 
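+Before examining these two cases in more detail, it is worth noting
+that the clauses above can be transcribed almost literally into
+executable code.  The following Scala sketch is only an illustration
+(the names are ours, and alternatives are kept binary); the function
+\textit{nullable} implements the $[]\in L(r)$ test and is discussed
+further below.
+\begin{verbatim}
+// illustration only: a direct transcription of the derivative clauses
+sealed trait Rexp
+case object ZERO extends Rexp
+case object ONE extends Rexp
+case class CHAR(d: Char) extends Rexp
+case class ALT(r1: Rexp, r2: Rexp) extends Rexp
+case class SEQ(r1: Rexp, r2: Rexp) extends Rexp
+case class STAR(r: Rexp) extends Rexp
+
+// does r accept the empty string, i.e. [] in L(r)?
+def nullable(r: Rexp): Boolean = r match {
+  case ZERO        => false
+  case ONE         => true
+  case CHAR(_)     => false
+  case ALT(r1, r2) => nullable(r1) || nullable(r2)
+  case SEQ(r1, r2) => nullable(r1) && nullable(r2)
+  case STAR(_)     => true
+}
+
+// the derivative r \ c
+def der(c: Char, r: Rexp): Rexp = r match {
+  case ZERO        => ZERO
+  case ONE         => ZERO
+  case CHAR(d)     => if (c == d) ONE else ZERO
+  case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))
+  case SEQ(r1, r2) =>
+    if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
+    else SEQ(der(c, r1), r2)
+  case STAR(r1)    => SEQ(der(c, r1), STAR(r1))
+}
+\end{verbatim}
+\noindent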
+The sequence case says that if the first regular expression +contains an empty string, then the second component of the sequence +needs to be considered, as its derivative will contribute to the +result of this derivative: +\begin{center} + \begin{tabular}{lcl} + $(r_1 \cdot r_2 ) \backslash c$ & $\dn$ & $\textit{if}\;\,([] \in L(r_1))\; r_1 \backslash c \cdot r_2 + r_2 \backslash c$ \\ + & & $\textit{else} \; (r_1 \backslash c) \cdot r_2$ + \end{tabular} +\end{center} +\noindent +Notice how this closely resembles +the language derivative operation $\Der$: \begin{center} \begin{tabular}{lcl} $\Der \; c \; (A @ B)$ & $\dn$ & @@ -337,26 +418,19 @@ \end{tabular} \end{center} \noindent -This can be translated to -regular expressions in the following -manner: -\begin{center} - \begin{tabular}{lcl} - $(r_1 \cdot r_2 ) \backslash c$ & $\dn$ & $\textit{if}\;\,([] \in L(r_1)) r_1 \backslash c \cdot r_2 + r_2 \backslash c$ \\ - & & $\textit{else} \; (r_1 \backslash c) \cdot r_2$ - \end{tabular} -\end{center} - -\noindent -And similarly, the Kleene star's semantic derivative -can be expressed as +The star regular expression $r^*$'s derivative +unwraps one iteration of $r$, turns it into $r\backslash c$, +and attaches the original $r^*$ +after $r\backslash c$, so that +we can further unfold it as many times as needed: +\[ + (r^*) \backslash c \dn (r \backslash c)\cdot r^*. +\] +Again, +the structure is the same as the semantic derivative of Kleene star: \[ \textit{Der} \;c \;(A*) \dn (\textit{Der}\; c A) @ (A*) \] -which translates to -\[ - (r^*) \backslash c \dn (r \backslash c)\cdot r^*. -\] In the above definition of $(r_1\cdot r_2) \backslash c$, the $\textit{if}$ clause's boolean condition @@ -390,54 +464,28 @@ would require both children to have the empty string to compose an empty string, and the Kleene star is always nullable because it naturally -contains the empty string. - -The derivative function, written $r\backslash c$, -defines how a regular expression evolves into -a new one after all the string it contains is acted on: -if it starts with $c$, then the character is chopped of, -if not, that string is removed. -\begin{center} -\begin{tabular}{lcl} - $\ZERO \backslash c$ & $\dn$ & $\ZERO$\\ - $\ONE \backslash c$ & $\dn$ & $\ZERO$\\ - $d \backslash c$ & $\dn$ & - $\mathit{if} \;c = d\;\mathit{then}\;\ONE\;\mathit{else}\;\ZERO$\\ -$(r_1 + r_2)\backslash c$ & $\dn$ & $r_1 \backslash c \,+\, r_2 \backslash c$\\ -$(r_1 \cdot r_2)\backslash c$ & $\dn$ & $\mathit{if} \, [] \in L(r_1)$\\ - & & $\mathit{then}\;(r_1\backslash c) \cdot r_2 \,+\, r_2\backslash c$\\ - & & $\mathit{else}\;(r_1\backslash c) \cdot r_2$\\ - $(r^*)\backslash c$ & $\dn$ & $(r\backslash c) \cdot r^*$\\ -\end{tabular} -\end{center} +contains the empty string. \noindent -The most involved cases are the sequence case -and the star case. -The sequence case says that if the first regular expression -contains an empty string, then the second component of the sequence -needs to be considered, as its derivative will contribute to the -result of this derivative. -The star regular expression $r^*$'s derivative -unwraps one iteration of $r$, turns it into $r\backslash c$, -and attaches the original $r^*$ -after $r\backslash c$, so that -we can further unfold it as many times as needed. 
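+Continuing the Scala sketch from above (again only an illustration;
+\textit{ders} and \textit{matcher} are our names), the derivative can
+be extended from single characters to whole strings, and a matcher is
+obtained by checking nullability of the final derivative:
+\begin{verbatim}
+// derivative w.r.t. a whole string, character by character
+def ders(s: List[Char], r: Rexp): Rexp = s match {
+  case Nil     => r
+  case c :: cs => ders(cs, der(c, r))
+}
+
+// s is matched by r iff the final derivative is nullable
+def matcher(r: Rexp, s: String): Boolean =
+  nullable(ders(s.toList, r))
+\end{verbatim}
+\noindent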
We have the following correspondence between derivatives on regular expressions and derivatives on a set of strings: \begin{lemma}\label{derDer} + \begin{itemize} + \item $\textit{Der} \; c \; L(r) = L (r\backslash c)$ +\item + $c\!::\!s \in L(r)$ \textit{iff} $s \in L(r\backslash c)$. + \end{itemize} \end{lemma} - +\begin{proof} + By induction on $r$. +\end{proof} \noindent The main property of the derivative operation (that enables us to reason about the correctness of derivative-based matching) is -\begin{lemma}\label{derStepwise} - $c\!::\!s \in L(r)$ \textit{iff} $s \in L(r\backslash c)$. -\end{lemma} \noindent We can generalise the derivative operation shown above for single characters @@ -478,8 +526,8 @@ $\textit{match} \; s\; r = \textit{true} \; \textit{iff} \; s \in L(r)$ \end{lemma} \begin{proof} - By the stepwise property of derivatives (lemma \ref{derStepwise}) - and lemma \ref{derDer}. + By the stepwise property of derivatives + (lemma \ref{derDer}). \end{proof} \noindent \begin{center} @@ -689,6 +737,34 @@ \caption{POSIX Lexing Rules} \end{figure} \noindent + +\begin{figure} +\begin{tikzpicture}[] + \node [minimum width = 6cm, rectangle split, rectangle split horizontal, + rectangle split parts=2, rectangle split part fill={red!30,blue!20}, style={draw, rounded corners, inner sep=10pt}] + (node1) + {$r_{token1}$ + \nodepart{two} $\;\;\; \quad r_{token2}\;\;\;\quad$ }; + %\node [left = 6.0cm of node1] (start1) {hi}; + \node [left = 0.2cm of node1] (middle) {$v.s.$}; + \node [minimum width = 6cm, left = 0.2cm of middle, rectangle split, rectangle split horizontal, + rectangle split parts=2, rectangle split part fill={red!30,blue!20}, style={draw, rounded corners, inner sep=10pt}] + (node2) + {$\quad\;\;\;r_{token1}\quad\;\;\;$ + \nodepart{two} $r_{token2}$ }; + \node [below = 0.1cm of node2] (text1) {\checkmark preferred by POSIX}; + \node [above = 1.5cm of middle, minimum width = 6cm, + rectangle, style={draw, rounded corners, inner sep=10pt}] + (topNode) {$s$}; + \path[->,draw] + (topNode) edge node {split $A$} (node2) + (topNode) edge node {split $B$} (node1) + ; + + +\end{tikzpicture} +\caption{Maximum munch example: $s$ matches $r_{token1} \cdot r_{token2}$}\label{munch} +\end{figure} The above $\POSIX$ rules follows the intuition described below: \begin{itemize} \item (Left Priority)\\ @@ -697,6 +773,13 @@ \item (Maximum munch)\\ Always match a subpart as much as possible before proceeding to the next token. + For example, when the string $s$ matches + $r_{token1}\cdot r_{token2}$, and we have two ways $s$ can be split: + Then the split that matches a longer string for the first token + $r_{token1}$ is preferred by this maximum munch rule (See + \ref{munch} for an illustration). +\noindent + \end{itemize} \noindent These disambiguation strategies can be @@ -1205,10 +1288,10 @@ and one can remove ones that are absolutely not possible to be POSIX. In the above example, -\[ +\begin{equation}\label{eqn:growth2} ((a^*a^* + \underbrace{a^*}_\text{A})+\underbrace{a^*}_\text{duplicate of A})\cdot(a^*a^*)^* + \underbrace{(a^*a^* + a^*)\cdot(a^*a^*)^*}_\text{further simp removes this}. 
-\] +\end{equation} can be further simplified by removing the underlined term first, which would open up possibilities diff -r e71a6e2aca2d -r 35df9cdd36ca ChengsongTanPhdThesis/Chapters/Introduction.tex --- a/ChengsongTanPhdThesis/Chapters/Introduction.tex Mon Aug 15 17:26:08 2022 +0200 +++ b/ChengsongTanPhdThesis/Chapters/Introduction.tex Wed Aug 17 01:09:13 2022 +0100 @@ -132,6 +132,13 @@ \def\flts{\textit{flts}} +\def\zeroable{\textit{zeroable}} +\def\nub{\textit{nub}} +\def\filter{\textit{filter}} +\def\not{\textit{not}} + + + \def\RZERO{\mathbf{0}_r } \def\RONE{\mathbf{1}_r} \newcommand\RCHAR[1]{\mathbf{#1}_r} diff -r e71a6e2aca2d -r 35df9cdd36ca ChengsongTanPhdThesis/main.tex --- a/ChengsongTanPhdThesis/main.tex Mon Aug 15 17:26:08 2022 +0200 +++ b/ChengsongTanPhdThesis/main.tex Wed Aug 17 01:09:13 2022 +0100 @@ -81,6 +81,7 @@ #3$% } \makeatother +\def\checkmark{\tikz\fill[scale=0.4](0,.35) -- (.25,0) -- (1,.7) -- (.25,.15) -- cycle;} \usepackage{amsthm} \usepackage{amssymb} @@ -92,6 +93,7 @@ \usepackage{tikz-cd} \usepackage{tikz} \usetikzlibrary{automata, positioning, calc} +\usetikzlibrary{arrows} \usetikzlibrary{fit, shapes.geometric, patterns,