diff -r 8dbc83ecea3b -r dd7f719c451d etnms/etnms.tex --- a/etnms/etnms.tex Fri Jan 24 18:44:52 2020 +0000 +++ b/etnms/etnms.tex Wed Feb 05 10:45:15 2020 +0000 @@ -2,6 +2,10 @@ \usepackage{graphic} \usepackage{data} \usepackage{tikz-cd} +\usepackage{tikz} +\usetikzlibrary{graphs} +\usetikzlibrary{graphdrawing} +\usegdlibrary{trees} %\usepackage{algorithm} \usepackage{amsmath} \usepackage{xcolor} @@ -348,13 +352,13 @@ Here bits and bitcodes (lists of bits) are defined as: \begin{center} - $b ::= S \mid Z \qquad + $b ::= 1 \mid 0 \qquad bs ::= [] \mid b:bs $ \end{center} \noindent -The $S$ and $Z$ are arbitrary names for the bits in order to avoid +The $1$ and $0$ are not in bold in order to avoid confusion with the regular expressions $\ZERO$ and $\ONE$. Bitcodes (or bit-lists) can be used to encode values (or incomplete values) in a compact form. This can be straightforwardly seen in the following @@ -364,23 +368,23 @@ \begin{tabular}{lcl} $\textit{code}(\Empty)$ & $\dn$ & $[]$\\ $\textit{code}(\Char\,c)$ & $\dn$ & $[]$\\ - $\textit{code}(\Left\,v)$ & $\dn$ & $\Z :: code(v)$\\ - $\textit{code}(\Right\,v)$ & $\dn$ & $\S :: code(v)$\\ + $\textit{code}(\Left\,v)$ & $\dn$ & $0 :: code(v)$\\ + $\textit{code}(\Right\,v)$ & $\dn$ & $1 :: code(v)$\\ $\textit{code}(\Seq\,v_1\,v_2)$ & $\dn$ & $code(v_1) \,@\, code(v_2)$\\ - $\textit{code}(\Stars\,[])$ & $\dn$ & $[\Z]$\\ - $\textit{code}(\Stars\,(v\!::\!vs))$ & $\dn$ & $\S :: code(v) \;@\; + $\textit{code}(\Stars\,[])$ & $\dn$ & $[0]$\\ + $\textit{code}(\Stars\,(v\!::\!vs))$ & $\dn$ & $1 :: code(v) \;@\; code(\Stars\,vs)$ \end{tabular} \end{center} \noindent Here $\textit{code}$ encodes a value into a bitcodes by converting -$\Left$ into $\Z$, $\Right$ into $\S$, the start point of a non-empty -star iteration into $\S$, and the border where a local star terminates -into $\Z$. This coding is lossy, as it throws away the information about +$\Left$ into $0$, $\Right$ into $1$, and marks the start of a non-empty +star iteration by $1$. The border where a local star terminates +is marked by $0$. This coding is lossy, as it throws away the information about characters, and also does not encode the ``boundary'' between two sequence values. Moreover, with only the bitcode we cannot even tell -whether the $\S$s and $\Z$s are for $\Left/\Right$ or $\Stars$. The +whether the $1$s and $0$s are for $\Left/\Right$ or $\Stars$. The reason for choosing this compact way of storing information is that the relatively small size of bits can be easily manipulated and ``moved around'' in a regular expression. In order to recover values, we will @@ -1327,24 +1331,60 @@ \begin{center} \begin{tabular}{@{}lcl@{}} - $\textit{simp} \; (_{\textit{bs}}(a_1 \cdot a_2))$ & $\dn$ & $ (\textit{simp} \; a_1, \textit{simp} \; a_2) \; \textit{match} $ \\ - &&$\quad\textit{case} \; (\ZERO, \_) \Rightarrow \ZERO$ \\ - &&$\quad\textit{case} \; (\_, \ZERO) \Rightarrow \ZERO$ \\ - &&$\quad\textit{case} \; (\ONE, a_2') \Rightarrow \textit{fuse} \; bs \; a_2'$ \\ - &&$\quad\textit{case} \; (a_1', \ONE) \Rightarrow \textit{fuse} \; bs \; a_1'$ \\ - &&$\quad\textit{case} \; (a_1', a_2') \Rightarrow _{\textit{bs}}(a_1' \cdot a_2')$ \\ + $\textit{simp}' \; (_{\textit{bs}}(a_1 \cdot a_2))$ & $\dn$ & $\textit{as} \; \simp \; \textit{was} \; \textit{before} $ \\ - $\textit{simp} \; (bs_\oplus as)$ & $\dn$ & $\textit{distinct}( \textit{flatten} ( \textit{map simp as})) \; \textit{match} $ \\ - &&$\quad\textit{case} \; [] \Rightarrow \ZERO$ \\ - &&$\quad\textit{case} \; a :: [] \Rightarrow \textit{fuse bs a}$ \\ - &&$\quad\textit{case} \; as' \Rightarrow \textit{ALTS}\;bs\;as'$\\ + $\textit{simp}' \; (_{bs}\oplus as)$ & $\dn$ & \st{$\textit{distinct}( \textit{flatten} ( \textit{map simp as})) \; \textit{match} $} \\ + &&\st{$\quad\textit{case} \; [] \Rightarrow \ZERO$} \\ + &&\st{$\quad\textit{case} \; a :: [] \Rightarrow \textit{fuse bs a}$} \\ + &&\st{$\quad\textit{case} \; as' \Rightarrow \textit{ALTS}\;bs\;as'$}\\ + &&$\textit{if}(\textit{hollowAlternatives}( \textit{map \; simp \; as}))$\\ + &&$\textit{then} \; \fuse \; \textit{bs}\; \textit{extractAlt}(\textit{map} \; \simp \; \textit{as} )$\\ + &&$\textit{else} \; \simp(_{bs} \oplus \textit{as})$\\ + - $\textit{simp} \; a$ & $\dn$ & $\textit{a} \qquad \textit{otherwise}$ + $\textit{simp}' \; a$ & $\dn$ & $\textit{a} \qquad \textit{otherwise}$ \end{tabular} \end{center} \noindent +given the definition of $\textit{hollowAlternatives}$ and $\textit{extractAlt}$ : +\begin{center} +$\textit{hollowAlternatives}( \textit{rs}) \dn +\exists r = (_{\textit{bs}_1}\oplus \textit{rs}_1) \in \textit{rs}. \forall r' \in \textit{rs}, \; +\textit{either} \; r' = \ZERO \; \textit{or} \; r' = r $ +$\textit{extractAlt}( \textit{rs}) \dn \textit{if}\big( +\exists r = (_{\textit{bs}_1}\oplus \textit{rs}_1) \in \textit{rs}. \forall r' \in \textit{rs}, \; +\textit{either} \; r' = \ZERO \; \textit{or} \; r' = r \big)\; \textit{then} \; \textit{return} \; r$ +\end{center} +\noindent +Basically, $\textit{hollowAlternatives}$ captures the feature of +a list of regular expression of the shape +\begin{center} +$ \oplus[\ZERO \;, \; _{bs}\oplus \textit{rs} ]$ +\end{center} +and this means we can simply elevate the +inner regular expression $_{bs}\oplus \textit{rs}$ + to the outmost +and throw away the useless $\ZERO$s and +the outer $\oplus$ wrapper. +Using this new definition of simp, +under the example where $r$ is the regular expression +$(a+b)(a+a*)$ and $s$ is the string $aa$ +the problem of $\rup\backslash_{simp} \, s \neq \simp(\rup\backslash s)$ +is resolved. +Unfortunately this causes new problems: +for the counterexample where +\begin{center} +$r$ is the regular expression +$(ab+(a^*+aa))$ and $s$ is the string $aa$ +\end{center} +$\rup\backslash_{simp} \, s \neq \simp(\rup\backslash s)$ +happens again, whereas this does not happen for the old +version of $\simp$. +This dilemma causes this amendment not a successful +attempt to make $\rup\backslash_{simp} \, s = \simp(\rup\backslash s)$ +under every possible regular expression and string. \subsection{Properties of the Function $\simp$} We have proved in Isabelle quite a few properties @@ -1378,7 +1418,141 @@ $\textit{if} r = \simp(r') \textit{then} \; \textit{good}(r) $ \end{center} \end{itemize} +\subsection{the Contains relation} +$\retrieve$ is a too strong relation in that +it only extracts one bitcode instead of a set of them. +Therefore we try to define another relation(predicate) +to capture the fact the regular expression has bits +being moved around but still has all the bits needed. +The contains symbol, written$\gg$, is a relation that +takes two arguments in an infix form +and returns a truth value. + +In other words, from the set of regular expression and +bitcode pairs +$\textit{RV} = \{(r, v) \mid r \text{r is a regular expression, v is a value}\}$, +those that satisfy the following requirements are in the set +$\textit{RV}_Contains$. +Unlike the $\retrieve$ +function, which takes two arguments $r$ and $v$ and +produces an only answer $\textit{bs}$, it takes only +one argument $r$ and returns a set of bitcodes that +can be generated by $r$. +\begin{center} +\begin{tabular}{llclll} +& & & $_{bs}\ONE$ & $\gg$ & $\textit{bs}$\\ +& & & $_{bs}{\bf c}$ & $\gg$ & $\textit{bs}$\\ +$\textit{if} \; r_1 \gg \textit{bs}_1$ & $r_2 \; \gg \textit{bs}_2$ +& $\textit{then}$ & + $_{bs}{r_1 \cdot r_2}$ & + $\gg$ & + $\textit{bs} @ \textit{bs}_1 @ \textit{bs}_2$\\ + + $\textit{if} \; r \gg \textit{bs}_1$ & & $\textit{then}$ & + $_{bs}{\oplus(r :: \textit{rs}})$ & + $\gg$ & + $\textit{bs} @ \textit{bs}_1 $\\ + + $\textit{if} \; _{bs}(\oplus \textit{rs}) \gg \textit{bs} @ \textit{bs}_1$ & & $\textit{then}$ & + $_{bs}{\oplus(r :: \textit{rs}})$ & + $\gg$ & + $\textit{bs} @ \textit{bs}_1 $\\ + + $\textit{if} \; r \gg \textit{bs}_1\; \textit{and}$ & $_{bs}r^* \gg \textit{bs} @ \textit{bs}_2$ & $\textit{then}$ & + $_{bs}r^* $ & + $\gg$ & + $\textit{bs} @ [0] @ \textit{bs}_1@ \textit{bs}_2 $\\ + + & & & $_{bs}r^*$ & $\gg$ & $\textit{bs} @ [1]$\\ +\end{tabular} +\end{center} +It is a predicate in the sense that given +a regular expression and a bitcode, it +returns true or false, depending on whether +or not the regular expression can actually produce that +value. If the predicates returns a true, then +we say that the regular expression $r$ contains +the bitcode $\textit{bs}$, written +$r \gg \textit{bs}$. +The $\gg$ operator with the +regular expression $r$ may also be seen as a +machine that does a derivative of regular expressions +on all strings simultaneously, taking +the bits by going throught the regular expression tree + structure in a depth first manner, regardless of whether + the part being traversed is nullable or not. + It put all possible bits that can be produced on such a traversal + into a set. + For example, if we are given the regular expression +$((a+b)(c+d))^*$, the tree structure may be written as +\begin{center} +\begin{tikzpicture} +\tikz[tree layout]\graph[nodes={draw, circle}] { +* -> + {@-> { + {+1 -> + {a , b} + }, + {+ -> + {c , d } + } + } + } +}; +\end{tikzpicture} +\end{center} +\subsection{the $\textit{ders}_2$ Function} +If we want to prove the result +\begin{center} + $ \textit{blexer}\_{simp}(r, \; s) = \textit{blexer}(r, \; s)$ +\end{center} +inductively +on the structure of the regular expression, +then we need to induct on the case $r_1 \cdot r_2$, +it can be good if we could express $(r_1 \cdot r_2) \backslash s$ +in terms of $r_1 \backslash s$ and $r_2 \backslash s$, +and this naturally induces the $ders2$ function, +which does a "convolution" on $r_1$ and $r_2$ using the string +$s$. +It is based on the observation that the derivative of $r_1 \cdot r_2$ +with respect to a string $s$ can actually be written in an "explicit form" +composed of $r_1$'s derivative of $s$ and $r_2$'s derivative of $s$. +This can be illustrated in the following procedure execution +\begin{center} + $ (r_1 \cdot r_2) \backslash [c_1c_2] = (\textit{if} \; \nullable(r_1)\; \textit{then} \; ((r_1 \backslash c_1) \cdot r_2 + r_2 \backslash c_1) \; \textit{else} \; (r_1\backslash c_1) \cdot r_2) \backslash c_2$ +\end{center} +which can also be written in a "convoluted sum" +format: +\begin{center} + $ (r_1 \cdot r_2) \backslash [c_1c_2] = \sum{r_1 \backslash s_i \cdot r_2 \backslash s_j}$ +\end{center} +In a more serious manner, we should write +\begin{center} + $ (r_1 \cdot r_2) \backslash [c_1c_2] = \sum{r_1 \backslash s_i \cdot r_2 \backslash s_j}$ +\end{center} +Note this differentiates from the previous form in the sense that +it calculates the results $r_1\backslash s_i , r_2 \backslash s_j$ first and then glue them together +through nested alternatives whereas the $r_1 \cdot r_2 \backslash s$ procedure, +used by algorithm $\lexer$, can only produce each component of the +resulting alternatives regular expression altogether rather than +generating each of the children nodes one by one +n a single recursive call that is only for generating that +very expression itself. +We have this +\section{Conclusion} +Under the exhaustive tests we believe the main +result holds, yet a proof still seems elusive. +We have tried out different approaches, and +found a lot of properties of the function $\simp$. +The counterexamples where $\rup\backslash_{simp} \, s \neq \simp(\rup\backslash s)$ +are also valuable in the sense that +we get to know better why they are not equal and what +are the subtle differences between a +nested simplified regular expression and a +regular expression that is simplified at the final moment. +We are almost there, but a last step is needed to make the proof work. +Hopefully in the next few weeks we will be able to find one. %CONSTRUCTION SITE HERE that is to say, despite the bits are being moved around on the regular expression (difference in bits), the structure of the (unannotated)regular expression @@ -1400,7 +1574,7 @@ for the regular expression $(a+b)(a+a*)$, if we do derivative with respect to string $aa$, -we get +we get \subsection{Another Proof Strategy} sdddddr does not equal sdsdsdsr sometimes.\\ @@ -2266,13 +2440,14 @@ \begin{center} - $b ::= S \mid Z \qquad + $b ::= 0 \mid 1 \qquad bs ::= [] \mid b:bs $ \end{center} \noindent -The $S$ and $Z$ are arbitrary names for the bits in order to avoid +The $0$ and $1$ are arbitrary names for the bits and not in bold +in order to avoid confusion with the regular expressions $\ZERO$ and $\ONE$. Bitcodes (or bit-lists) can be used to encode values (or incomplete values) in a compact form. This can be straightforwardly seen in the following @@ -2282,23 +2457,23 @@ \begin{tabular}{lcl} $\textit{code}(\Empty)$ & $\dn$ & $[]$\\ $\textit{code}(\Char\,c)$ & $\dn$ & $[]$\\ - $\textit{code}(\Left\,v)$ & $\dn$ & $\Z :: code(v)$\\ - $\textit{code}(\Right\,v)$ & $\dn$ & $\S :: code(v)$\\ + $\textit{code}(\Left\,v)$ & $\dn$ & $0 :: code(v)$\\ + $\textit{code}(\Right\,v)$ & $\dn$ & $1 :: code(v)$\\ $\textit{code}(\Seq\,v_1\,v_2)$ & $\dn$ & $code(v_1) \,@\, code(v_2)$\\ - $\textit{code}(\Stars\,[])$ & $\dn$ & $[\Z]$\\ - $\textit{code}(\Stars\,(v\!::\!vs))$ & $\dn$ & $\S :: code(v) \;@\; + $\textit{code}(\Stars\,[])$ & $\dn$ & $[0]$\\ + $\textit{code}(\Stars\,(v\!::\!vs))$ & $\dn$ & $1 :: code(v) \;@\; code(\Stars\,vs)$ \end{tabular} \end{center} \noindent Here $\textit{code}$ encodes a value into a bitcodes by converting -$\Left$ into $\Z$, $\Right$ into $\S$, the start point of a non-empty +$\Left$ into $0$, $\Right$ into $1$, the start point of a non-empty star iteration into $\S$, and the border where a local star terminates -into $\Z$. This coding is lossy, as it throws away the information about +into $0$. This coding is lossy, as it throws away the information about characters, and also does not encode the ``boundary'' between two sequence values. Moreover, with only the bitcode we cannot even tell -whether the $\S$s and $\Z$s are for $\Left/\Right$ or $\Stars$. The +whether the $1$s and $0$s are for $\Left/\Right$ or $\Stars$. The reason for choosing this compact way of storing information is that the relatively small size of bits can be easily manipulated and ``moved around'' in a regular expression. In order to recover values, we will @@ -2311,18 +2486,18 @@ \begin{tabular}{@{}l@{\hspace{1mm}}c@{\hspace{1mm}}l@{}} $\textit{decode}'\,bs\,(\ONE)$ & $\dn$ & $(\Empty, bs)$\\ $\textit{decode}'\,bs\,(c)$ & $\dn$ & $(\Char\,c, bs)$\\ - $\textit{decode}'\,(\Z\!::\!bs)\;(r_1 + r_2)$ & $\dn$ & + $\textit{decode}'\,(0\!::\!bs)\;(r_1 + r_2)$ & $\dn$ & $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}\; (\Left\,v, bs_1)$\\ - $\textit{decode}'\,(\S\!::\!bs)\;(r_1 + r_2)$ & $\dn$ & + $\textit{decode}'\,(1\!::\!bs)\;(r_1 + r_2)$ & $\dn$ & $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_2\;\textit{in}\; (\Right\,v, bs_1)$\\ $\textit{decode}'\,bs\;(r_1\cdot r_2)$ & $\dn$ & $\textit{let}\,(v_1, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}$\\ & & $\textit{let}\,(v_2, bs_2) = \textit{decode}'\,bs_1\,r_2$\\ & & \hspace{35mm}$\textit{in}\;(\Seq\,v_1\,v_2, bs_2)$\\ - $\textit{decode}'\,(\Z\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\ - $\textit{decode}'\,(\S\!::\!bs)\,(r^*)$ & $\dn$ & + $\textit{decode}'\,(0\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\ + $\textit{decode}'\,(1\!::\!bs)\,(r^*)$ & $\dn$ & $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r\;\textit{in}$\\ & & $\textit{let}\,(\Stars\,vs, bs_2) = \textit{decode}'\,bs_1\,r^*$\\ & & \hspace{35mm}$\textit{in}\;(\Stars\,v\!::\!vs, bs_2)$\bigskip\\ @@ -2342,20 +2517,20 @@ \begin{center} \begin{tabular}{lcl} - $\textit{a}$ & $::=$ & $\textit{ZERO}$\\ - & $\mid$ & $\textit{ONE}\;\;bs$\\ - & $\mid$ & $\textit{CHAR}\;\;bs\,c$\\ - & $\mid$ & $\textit{ALTS}\;\;bs\,as$\\ - & $\mid$ & $\textit{SEQ}\;\;bs\,a_1\,a_2$\\ - & $\mid$ & $\textit{STAR}\;\;bs\,a$ + $\textit{a}$ & $::=$ & $\ZERO$\\ + & $\mid$ & $_{bs}\ONE$\\ + & $\mid$ & $_{bs}{\bf c}$\\ + & $\mid$ & $_{bs}\oplus \textit{as}$\\ + & $\mid$ & $_{bs}a_1\cdot a_2$\\ + & $\mid$ & $_{bs}a^*$ \end{tabular} \end{center} %(in \textit{ALTS}) \noindent -where $bs$ stands for bitcodes, $a$ for $\bold{a}$nnotated regular -expressions and $as$ for a list of annotated regular expressions. -The alternative constructor($\textit{ALTS}$) has been generalized to +where $\textit{bs}$ stands for bitcodes, $a$ for $\textit{annotated}$ regular +expressions and $\textit{as}$ for a list of annotated regular expressions. +The alternative constructor($\oplus$) has been generalized to accept a list of annotated regular expressions rather than just 2. We will show that these bitcodes encode information about the (POSIX) value that should be generated by the Sulzmann and Lu @@ -2370,16 +2545,16 @@ %\begin{definition} \begin{center} \begin{tabular}{lcl} - $(\ZERO)^\uparrow$ & $\dn$ & $\textit{ZERO}$\\ - $(\ONE)^\uparrow$ & $\dn$ & $\textit{ONE}\,[]$\\ - $(c)^\uparrow$ & $\dn$ & $\textit{CHAR}\,[]\,c$\\ + $(\ZERO)^\uparrow$ & $\dn$ & $\ZERO$\\ + $(\ONE)^\uparrow$ & $\dn$ & $_{[]}\ONE$\\ + $(c)^\uparrow$ & $\dn$ & $_{bs}\textit{\bf c}$\\ $(r_1 + r_2)^\uparrow$ & $\dn$ & - $\textit{ALTS}\;[]\,List((\textit{fuse}\,[\Z]\,r_1^\uparrow),\, - (\textit{fuse}\,[\S]\,r_2^\uparrow))$\\ + $_{[]}\oplus\,[(\textit{fuse}\,[0]\,r_1^\uparrow),\, + (\textit{fuse}\,[1]\,r_2^\uparrow)]$\\ $(r_1\cdot r_2)^\uparrow$ & $\dn$ & - $\textit{SEQ}\;[]\,r_1^\uparrow\,r_2^\uparrow$\\ + $_{[]}r_1^\uparrow \cdot r_2^\uparrow$\\ $(r^*)^\uparrow$ & $\dn$ & - $\textit{STAR}\;[]\,r^\uparrow$\\ + $_{[]}r^\uparrow$\\ \end{tabular} \end{center} %\end{definition} @@ -2393,7 +2568,7 @@ \begin{center} \begin{tabular}{lcl} - $\textit{fuse}\;bs\,(\textit{ZERO})$ & $\dn$ & $\textit{ZERO}$\\ + $\textit{fuse}\;bs\,(\ZERO)$ & $\dn$ & $\textit{ZERO}$\\ $\textit{fuse}\;bs\,(\textit{ONE}\,bs')$ & $\dn$ & $\textit{ONE}\,(bs\,@\,bs')$\\ $\textit{fuse}\;bs\,(\textit{CHAR}\,bs'\,c)$ & $\dn$ &