--- a/etnms/etnms.tex Fri Jan 24 18:44:52 2020 +0000
+++ b/etnms/etnms.tex Wed Feb 05 10:45:15 2020 +0000
@@ -2,6 +2,10 @@
\usepackage{graphic}
\usepackage{data}
\usepackage{tikz-cd}
+\usepackage{tikz}
+\usetikzlibrary{graphs}
+\usetikzlibrary{graphdrawing}
+\usegdlibrary{trees}
%\usepackage{algorithm}
\usepackage{amsmath}
\usepackage{xcolor}
@@ -348,13 +352,13 @@
Here bits and bitcodes (lists of bits) are defined as:
\begin{center}
- $b ::= S \mid Z \qquad
+ $b ::= 1 \mid 0 \qquad
bs ::= [] \mid b:bs
$
\end{center}
\noindent
-The $S$ and $Z$ are arbitrary names for the bits in order to avoid
+The $1$ and $0$ are not in bold in order to avoid
confusion with the regular expressions $\ZERO$ and $\ONE$. Bitcodes (or
bit-lists) can be used to encode values (or incomplete values) in a
compact form. This can be straightforwardly seen in the following
@@ -364,23 +368,23 @@
\begin{tabular}{lcl}
$\textit{code}(\Empty)$ & $\dn$ & $[]$\\
$\textit{code}(\Char\,c)$ & $\dn$ & $[]$\\
- $\textit{code}(\Left\,v)$ & $\dn$ & $\Z :: code(v)$\\
- $\textit{code}(\Right\,v)$ & $\dn$ & $\S :: code(v)$\\
+ $\textit{code}(\Left\,v)$ & $\dn$ & $0 :: code(v)$\\
+ $\textit{code}(\Right\,v)$ & $\dn$ & $1 :: code(v)$\\
$\textit{code}(\Seq\,v_1\,v_2)$ & $\dn$ & $code(v_1) \,@\, code(v_2)$\\
- $\textit{code}(\Stars\,[])$ & $\dn$ & $[\Z]$\\
- $\textit{code}(\Stars\,(v\!::\!vs))$ & $\dn$ & $\S :: code(v) \;@\;
+ $\textit{code}(\Stars\,[])$ & $\dn$ & $[0]$\\
+ $\textit{code}(\Stars\,(v\!::\!vs))$ & $\dn$ & $1 :: code(v) \;@\;
code(\Stars\,vs)$
\end{tabular}
\end{center}
\noindent
Here $\textit{code}$ encodes a value into a bitcodes by converting
-$\Left$ into $\Z$, $\Right$ into $\S$, the start point of a non-empty
-star iteration into $\S$, and the border where a local star terminates
-into $\Z$. This coding is lossy, as it throws away the information about
+$\Left$ into $0$, $\Right$ into $1$, and marks the start of a non-empty
+star iteration by $1$. The border where a local star terminates
+is marked by $0$. This coding is lossy, as it throws away the information about
characters, and also does not encode the ``boundary'' between two
sequence values. Moreover, with only the bitcode we cannot even tell
-whether the $\S$s and $\Z$s are for $\Left/\Right$ or $\Stars$. The
+whether the $1$s and $0$s are for $\Left/\Right$ or $\Stars$. The
reason for choosing this compact way of storing information is that the
relatively small size of bits can be easily manipulated and ``moved
around'' in a regular expression. In order to recover values, we will
@@ -1327,24 +1331,60 @@
\begin{center}
\begin{tabular}{@{}lcl@{}}
- $\textit{simp} \; (_{\textit{bs}}(a_1 \cdot a_2))$ & $\dn$ & $ (\textit{simp} \; a_1, \textit{simp} \; a_2) \; \textit{match} $ \\
- &&$\quad\textit{case} \; (\ZERO, \_) \Rightarrow \ZERO$ \\
- &&$\quad\textit{case} \; (\_, \ZERO) \Rightarrow \ZERO$ \\
- &&$\quad\textit{case} \; (\ONE, a_2') \Rightarrow \textit{fuse} \; bs \; a_2'$ \\
- &&$\quad\textit{case} \; (a_1', \ONE) \Rightarrow \textit{fuse} \; bs \; a_1'$ \\
- &&$\quad\textit{case} \; (a_1', a_2') \Rightarrow _{\textit{bs}}(a_1' \cdot a_2')$ \\
+ $\textit{simp}' \; (_{\textit{bs}}(a_1 \cdot a_2))$ & $\dn$ & $\textit{as} \; \simp \; \textit{was} \; \textit{before} $ \\
- $\textit{simp} \; (bs_\oplus as)$ & $\dn$ & $\textit{distinct}( \textit{flatten} ( \textit{map simp as})) \; \textit{match} $ \\
- &&$\quad\textit{case} \; [] \Rightarrow \ZERO$ \\
- &&$\quad\textit{case} \; a :: [] \Rightarrow \textit{fuse bs a}$ \\
- &&$\quad\textit{case} \; as' \Rightarrow \textit{ALTS}\;bs\;as'$\\
+ $\textit{simp}' \; (_{bs}\oplus as)$ & $\dn$ & \st{$\textit{distinct}( \textit{flatten} ( \textit{map simp as})) \; \textit{match} $} \\
+ &&\st{$\quad\textit{case} \; [] \Rightarrow \ZERO$} \\
+ &&\st{$\quad\textit{case} \; a :: [] \Rightarrow \textit{fuse bs a}$} \\
+ &&\st{$\quad\textit{case} \; as' \Rightarrow \textit{ALTS}\;bs\;as'$}\\
+ &&$\textit{if}(\textit{hollowAlternatives}( \textit{map \; simp \; as}))$\\
+ &&$\textit{then} \; \fuse \; \textit{bs}\; \textit{extractAlt}(\textit{map} \; \simp \; \textit{as} )$\\
+ &&$\textit{else} \; \simp(_{bs} \oplus \textit{as})$\\
+
- $\textit{simp} \; a$ & $\dn$ & $\textit{a} \qquad \textit{otherwise}$
+ $\textit{simp}' \; a$ & $\dn$ & $\textit{a} \qquad \textit{otherwise}$
\end{tabular}
\end{center}
\noindent
+given the definition of $\textit{hollowAlternatives}$ and $\textit{extractAlt}$ :
+\begin{center}
+$\textit{hollowAlternatives}( \textit{rs}) \dn
+\exists r = (_{\textit{bs}_1}\oplus \textit{rs}_1) \in \textit{rs}. \forall r' \in \textit{rs}, \;
+\textit{either} \; r' = \ZERO \; \textit{or} \; r' = r $
+$\textit{extractAlt}( \textit{rs}) \dn \textit{if}\big(
+\exists r = (_{\textit{bs}_1}\oplus \textit{rs}_1) \in \textit{rs}. \forall r' \in \textit{rs}, \;
+\textit{either} \; r' = \ZERO \; \textit{or} \; r' = r \big)\; \textit{then} \; \textit{return} \; r$
+\end{center}
+\noindent
+Basically, $\textit{hollowAlternatives}$ captures the feature of
+a list of regular expression of the shape
+\begin{center}
+$ \oplus[\ZERO \;, \; _{bs}\oplus \textit{rs} ]$
+\end{center}
+and this means we can simply elevate the
+inner regular expression $_{bs}\oplus \textit{rs}$
+ to the outmost
+and throw away the useless $\ZERO$s and
+the outer $\oplus$ wrapper.
+Using this new definition of simp,
+under the example where $r$ is the regular expression
+$(a+b)(a+a*)$ and $s$ is the string $aa$
+the problem of $\rup\backslash_{simp} \, s \neq \simp(\rup\backslash s)$
+is resolved.
+Unfortunately this causes new problems:
+for the counterexample where
+\begin{center}
+$r$ is the regular expression
+$(ab+(a^*+aa))$ and $s$ is the string $aa$
+\end{center}
+$\rup\backslash_{simp} \, s \neq \simp(\rup\backslash s)$
+happens again, whereas this does not happen for the old
+version of $\simp$.
+This dilemma causes this amendment not a successful
+attempt to make $\rup\backslash_{simp} \, s = \simp(\rup\backslash s)$
+under every possible regular expression and string.
\subsection{Properties of the Function $\simp$}
We have proved in Isabelle quite a few properties
@@ -1378,7 +1418,141 @@
$\textit{if} r = \simp(r') \textit{then} \; \textit{good}(r) $
\end{center}
\end{itemize}
+\subsection{the Contains relation}
+$\retrieve$ is a too strong relation in that
+it only extracts one bitcode instead of a set of them.
+Therefore we try to define another relation(predicate)
+to capture the fact the regular expression has bits
+being moved around but still has all the bits needed.
+The contains symbol, written$\gg$, is a relation that
+takes two arguments in an infix form
+and returns a truth value.
+
+In other words, from the set of regular expression and
+bitcode pairs
+$\textit{RV} = \{(r, v) \mid r \text{r is a regular expression, v is a value}\}$,
+those that satisfy the following requirements are in the set
+$\textit{RV}_Contains$.
+Unlike the $\retrieve$
+function, which takes two arguments $r$ and $v$ and
+produces an only answer $\textit{bs}$, it takes only
+one argument $r$ and returns a set of bitcodes that
+can be generated by $r$.
+\begin{center}
+\begin{tabular}{llclll}
+& & & $_{bs}\ONE$ & $\gg$ & $\textit{bs}$\\
+& & & $_{bs}{\bf c}$ & $\gg$ & $\textit{bs}$\\
+$\textit{if} \; r_1 \gg \textit{bs}_1$ & $r_2 \; \gg \textit{bs}_2$
+& $\textit{then}$ &
+ $_{bs}{r_1 \cdot r_2}$ &
+ $\gg$ &
+ $\textit{bs} @ \textit{bs}_1 @ \textit{bs}_2$\\
+
+ $\textit{if} \; r \gg \textit{bs}_1$ & & $\textit{then}$ &
+ $_{bs}{\oplus(r :: \textit{rs}})$ &
+ $\gg$ &
+ $\textit{bs} @ \textit{bs}_1 $\\
+
+ $\textit{if} \; _{bs}(\oplus \textit{rs}) \gg \textit{bs} @ \textit{bs}_1$ & & $\textit{then}$ &
+ $_{bs}{\oplus(r :: \textit{rs}})$ &
+ $\gg$ &
+ $\textit{bs} @ \textit{bs}_1 $\\
+
+ $\textit{if} \; r \gg \textit{bs}_1\; \textit{and}$ & $_{bs}r^* \gg \textit{bs} @ \textit{bs}_2$ & $\textit{then}$ &
+ $_{bs}r^* $ &
+ $\gg$ &
+ $\textit{bs} @ [0] @ \textit{bs}_1@ \textit{bs}_2 $\\
+
+ & & & $_{bs}r^*$ & $\gg$ & $\textit{bs} @ [1]$\\
+\end{tabular}
+\end{center}
+It is a predicate in the sense that given
+a regular expression and a bitcode, it
+returns true or false, depending on whether
+or not the regular expression can actually produce that
+value. If the predicates returns a true, then
+we say that the regular expression $r$ contains
+the bitcode $\textit{bs}$, written
+$r \gg \textit{bs}$.
+The $\gg$ operator with the
+regular expression $r$ may also be seen as a
+machine that does a derivative of regular expressions
+on all strings simultaneously, taking
+the bits by going throught the regular expression tree
+ structure in a depth first manner, regardless of whether
+ the part being traversed is nullable or not.
+ It put all possible bits that can be produced on such a traversal
+ into a set.
+ For example, if we are given the regular expression
+$((a+b)(c+d))^*$, the tree structure may be written as
+\begin{center}
+\begin{tikzpicture}
+\tikz[tree layout]\graph[nodes={draw, circle}] {
+* ->
+ {@-> {
+ {+1 ->
+ {a , b}
+ },
+ {+ ->
+ {c , d }
+ }
+ }
+ }
+};
+\end{tikzpicture}
+\end{center}
+\subsection{the $\textit{ders}_2$ Function}
+If we want to prove the result
+\begin{center}
+ $ \textit{blexer}\_{simp}(r, \; s) = \textit{blexer}(r, \; s)$
+\end{center}
+inductively
+on the structure of the regular expression,
+then we need to induct on the case $r_1 \cdot r_2$,
+it can be good if we could express $(r_1 \cdot r_2) \backslash s$
+in terms of $r_1 \backslash s$ and $r_2 \backslash s$,
+and this naturally induces the $ders2$ function,
+which does a "convolution" on $r_1$ and $r_2$ using the string
+$s$.
+It is based on the observation that the derivative of $r_1 \cdot r_2$
+with respect to a string $s$ can actually be written in an "explicit form"
+composed of $r_1$'s derivative of $s$ and $r_2$'s derivative of $s$.
+This can be illustrated in the following procedure execution
+\begin{center}
+ $ (r_1 \cdot r_2) \backslash [c_1c_2] = (\textit{if} \; \nullable(r_1)\; \textit{then} \; ((r_1 \backslash c_1) \cdot r_2 + r_2 \backslash c_1) \; \textit{else} \; (r_1\backslash c_1) \cdot r_2) \backslash c_2$
+\end{center}
+which can also be written in a "convoluted sum"
+format:
+\begin{center}
+ $ (r_1 \cdot r_2) \backslash [c_1c_2] = \sum{r_1 \backslash s_i \cdot r_2 \backslash s_j}$
+\end{center}
+In a more serious manner, we should write
+\begin{center}
+ $ (r_1 \cdot r_2) \backslash [c_1c_2] = \sum{r_1 \backslash s_i \cdot r_2 \backslash s_j}$
+\end{center}
+Note this differentiates from the previous form in the sense that
+it calculates the results $r_1\backslash s_i , r_2 \backslash s_j$ first and then glue them together
+through nested alternatives whereas the $r_1 \cdot r_2 \backslash s$ procedure,
+used by algorithm $\lexer$, can only produce each component of the
+resulting alternatives regular expression altogether rather than
+generating each of the children nodes one by one
+n a single recursive call that is only for generating that
+very expression itself.
+We have this
+\section{Conclusion}
+Under the exhaustive tests we believe the main
+result holds, yet a proof still seems elusive.
+We have tried out different approaches, and
+found a lot of properties of the function $\simp$.
+The counterexamples where $\rup\backslash_{simp} \, s \neq \simp(\rup\backslash s)$
+are also valuable in the sense that
+we get to know better why they are not equal and what
+are the subtle differences between a
+nested simplified regular expression and a
+regular expression that is simplified at the final moment.
+We are almost there, but a last step is needed to make the proof work.
+Hopefully in the next few weeks we will be able to find one.
%CONSTRUCTION SITE HERE
that is to say, despite the bits are being moved around on the regular expression
(difference in bits), the structure of the (unannotated)regular expression
@@ -1400,7 +1574,7 @@
for the regular expression
$(a+b)(a+a*)$,
if we do derivative with respect to string $aa$,
-we get
+we get
\subsection{Another Proof Strategy}
sdddddr does not equal sdsdsdsr sometimes.\\
@@ -2266,13 +2440,14 @@
\begin{center}
- $b ::= S \mid Z \qquad
+ $b ::= 0 \mid 1 \qquad
bs ::= [] \mid b:bs
$
\end{center}
\noindent
-The $S$ and $Z$ are arbitrary names for the bits in order to avoid
+The $0$ and $1$ are arbitrary names for the bits and not in bold
+in order to avoid
confusion with the regular expressions $\ZERO$ and $\ONE$. Bitcodes (or
bit-lists) can be used to encode values (or incomplete values) in a
compact form. This can be straightforwardly seen in the following
@@ -2282,23 +2457,23 @@
\begin{tabular}{lcl}
$\textit{code}(\Empty)$ & $\dn$ & $[]$\\
$\textit{code}(\Char\,c)$ & $\dn$ & $[]$\\
- $\textit{code}(\Left\,v)$ & $\dn$ & $\Z :: code(v)$\\
- $\textit{code}(\Right\,v)$ & $\dn$ & $\S :: code(v)$\\
+ $\textit{code}(\Left\,v)$ & $\dn$ & $0 :: code(v)$\\
+ $\textit{code}(\Right\,v)$ & $\dn$ & $1 :: code(v)$\\
$\textit{code}(\Seq\,v_1\,v_2)$ & $\dn$ & $code(v_1) \,@\, code(v_2)$\\
- $\textit{code}(\Stars\,[])$ & $\dn$ & $[\Z]$\\
- $\textit{code}(\Stars\,(v\!::\!vs))$ & $\dn$ & $\S :: code(v) \;@\;
+ $\textit{code}(\Stars\,[])$ & $\dn$ & $[0]$\\
+ $\textit{code}(\Stars\,(v\!::\!vs))$ & $\dn$ & $1 :: code(v) \;@\;
code(\Stars\,vs)$
\end{tabular}
\end{center}
\noindent
Here $\textit{code}$ encodes a value into a bitcodes by converting
-$\Left$ into $\Z$, $\Right$ into $\S$, the start point of a non-empty
+$\Left$ into $0$, $\Right$ into $1$, the start point of a non-empty
star iteration into $\S$, and the border where a local star terminates
-into $\Z$. This coding is lossy, as it throws away the information about
+into $0$. This coding is lossy, as it throws away the information about
characters, and also does not encode the ``boundary'' between two
sequence values. Moreover, with only the bitcode we cannot even tell
-whether the $\S$s and $\Z$s are for $\Left/\Right$ or $\Stars$. The
+whether the $1$s and $0$s are for $\Left/\Right$ or $\Stars$. The
reason for choosing this compact way of storing information is that the
relatively small size of bits can be easily manipulated and ``moved
around'' in a regular expression. In order to recover values, we will
@@ -2311,18 +2486,18 @@
\begin{tabular}{@{}l@{\hspace{1mm}}c@{\hspace{1mm}}l@{}}
$\textit{decode}'\,bs\,(\ONE)$ & $\dn$ & $(\Empty, bs)$\\
$\textit{decode}'\,bs\,(c)$ & $\dn$ & $(\Char\,c, bs)$\\
- $\textit{decode}'\,(\Z\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
+ $\textit{decode}'\,(0\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
$\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}\;
(\Left\,v, bs_1)$\\
- $\textit{decode}'\,(\S\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
+ $\textit{decode}'\,(1\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
$\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_2\;\textit{in}\;
(\Right\,v, bs_1)$\\
$\textit{decode}'\,bs\;(r_1\cdot r_2)$ & $\dn$ &
$\textit{let}\,(v_1, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}$\\
& & $\textit{let}\,(v_2, bs_2) = \textit{decode}'\,bs_1\,r_2$\\
& & \hspace{35mm}$\textit{in}\;(\Seq\,v_1\,v_2, bs_2)$\\
- $\textit{decode}'\,(\Z\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\
- $\textit{decode}'\,(\S\!::\!bs)\,(r^*)$ & $\dn$ &
+ $\textit{decode}'\,(0\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\
+ $\textit{decode}'\,(1\!::\!bs)\,(r^*)$ & $\dn$ &
$\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r\;\textit{in}$\\
& & $\textit{let}\,(\Stars\,vs, bs_2) = \textit{decode}'\,bs_1\,r^*$\\
& & \hspace{35mm}$\textit{in}\;(\Stars\,v\!::\!vs, bs_2)$\bigskip\\
@@ -2342,20 +2517,20 @@
\begin{center}
\begin{tabular}{lcl}
- $\textit{a}$ & $::=$ & $\textit{ZERO}$\\
- & $\mid$ & $\textit{ONE}\;\;bs$\\
- & $\mid$ & $\textit{CHAR}\;\;bs\,c$\\
- & $\mid$ & $\textit{ALTS}\;\;bs\,as$\\
- & $\mid$ & $\textit{SEQ}\;\;bs\,a_1\,a_2$\\
- & $\mid$ & $\textit{STAR}\;\;bs\,a$
+ $\textit{a}$ & $::=$ & $\ZERO$\\
+ & $\mid$ & $_{bs}\ONE$\\
+ & $\mid$ & $_{bs}{\bf c}$\\
+ & $\mid$ & $_{bs}\oplus \textit{as}$\\
+ & $\mid$ & $_{bs}a_1\cdot a_2$\\
+ & $\mid$ & $_{bs}a^*$
\end{tabular}
\end{center}
%(in \textit{ALTS})
\noindent
-where $bs$ stands for bitcodes, $a$ for $\bold{a}$nnotated regular
-expressions and $as$ for a list of annotated regular expressions.
-The alternative constructor($\textit{ALTS}$) has been generalized to
+where $\textit{bs}$ stands for bitcodes, $a$ for $\textit{annotated}$ regular
+expressions and $\textit{as}$ for a list of annotated regular expressions.
+The alternative constructor($\oplus$) has been generalized to
accept a list of annotated regular expressions rather than just 2.
We will show that these bitcodes encode information about
the (POSIX) value that should be generated by the Sulzmann and Lu
@@ -2370,16 +2545,16 @@
%\begin{definition}
\begin{center}
\begin{tabular}{lcl}
- $(\ZERO)^\uparrow$ & $\dn$ & $\textit{ZERO}$\\
- $(\ONE)^\uparrow$ & $\dn$ & $\textit{ONE}\,[]$\\
- $(c)^\uparrow$ & $\dn$ & $\textit{CHAR}\,[]\,c$\\
+ $(\ZERO)^\uparrow$ & $\dn$ & $\ZERO$\\
+ $(\ONE)^\uparrow$ & $\dn$ & $_{[]}\ONE$\\
+ $(c)^\uparrow$ & $\dn$ & $_{bs}\textit{\bf c}$\\
$(r_1 + r_2)^\uparrow$ & $\dn$ &
- $\textit{ALTS}\;[]\,List((\textit{fuse}\,[\Z]\,r_1^\uparrow),\,
- (\textit{fuse}\,[\S]\,r_2^\uparrow))$\\
+ $_{[]}\oplus\,[(\textit{fuse}\,[0]\,r_1^\uparrow),\,
+ (\textit{fuse}\,[1]\,r_2^\uparrow)]$\\
$(r_1\cdot r_2)^\uparrow$ & $\dn$ &
- $\textit{SEQ}\;[]\,r_1^\uparrow\,r_2^\uparrow$\\
+ $_{[]}r_1^\uparrow \cdot r_2^\uparrow$\\
$(r^*)^\uparrow$ & $\dn$ &
- $\textit{STAR}\;[]\,r^\uparrow$\\
+ $_{[]}r^\uparrow$\\
\end{tabular}
\end{center}
%\end{definition}
@@ -2393,7 +2568,7 @@
\begin{center}
\begin{tabular}{lcl}
- $\textit{fuse}\;bs\,(\textit{ZERO})$ & $\dn$ & $\textit{ZERO}$\\
+ $\textit{fuse}\;bs\,(\ZERO)$ & $\dn$ & $\textit{ZERO}$\\
$\textit{fuse}\;bs\,(\textit{ONE}\,bs')$ & $\dn$ &
$\textit{ONE}\,(bs\,@\,bs')$\\
$\textit{fuse}\;bs\,(\textit{CHAR}\,bs'\,c)$ & $\dn$ &