etnms/etnms.tex
changeset 114 dd7f719c451d
parent 113 8dbc83ecea3b
child 115 5c8afe4a8090
--- a/etnms/etnms.tex	Fri Jan 24 18:44:52 2020 +0000
+++ b/etnms/etnms.tex	Wed Feb 05 10:45:15 2020 +0000
@@ -2,6 +2,10 @@
 \usepackage{graphic}
 \usepackage{data}
 \usepackage{tikz-cd}
+\usepackage{tikz}
+\usetikzlibrary{graphs}
+\usetikzlibrary{graphdrawing}
+\usegdlibrary{trees}
 %\usepackage{algorithm}
 \usepackage{amsmath}
 \usepackage{xcolor}
@@ -348,13 +352,13 @@
 Here bits and bitcodes (lists of bits) are defined as:
 
 \begin{center}
-		$b ::=   S \mid  Z \qquad
+		$b ::=   1 \mid  0 \qquad
 bs ::= [] \mid b:bs    
 $
 \end{center}
 
 \noindent
-The $S$ and $Z$ are arbitrary names for the bits in order to avoid 
+The $1$ and $0$ are not in bold in order to avoid 
 confusion with the regular expressions $\ZERO$ and $\ONE$. Bitcodes (or
 bit-lists) can be used to encode values (or incomplete values) in a
 compact form. This can be straightforwardly seen in the following
@@ -364,23 +368,23 @@
 \begin{tabular}{lcl}
   $\textit{code}(\Empty)$ & $\dn$ & $[]$\\
   $\textit{code}(\Char\,c)$ & $\dn$ & $[]$\\
-  $\textit{code}(\Left\,v)$ & $\dn$ & $\Z :: code(v)$\\
-  $\textit{code}(\Right\,v)$ & $\dn$ & $\S :: code(v)$\\
+  $\textit{code}(\Left\,v)$ & $\dn$ & $0 :: code(v)$\\
+  $\textit{code}(\Right\,v)$ & $\dn$ & $1 :: code(v)$\\
   $\textit{code}(\Seq\,v_1\,v_2)$ & $\dn$ & $code(v_1) \,@\, code(v_2)$\\
-  $\textit{code}(\Stars\,[])$ & $\dn$ & $[\Z]$\\
-  $\textit{code}(\Stars\,(v\!::\!vs))$ & $\dn$ & $\S :: code(v) \;@\;
+  $\textit{code}(\Stars\,[])$ & $\dn$ & $[0]$\\
+  $\textit{code}(\Stars\,(v\!::\!vs))$ & $\dn$ & $1 :: code(v) \;@\;
                                                  code(\Stars\,vs)$
 \end{tabular}    
 \end{center} 
 
 \noindent
 Here $\textit{code}$ encodes a value into a bitcodes by converting
-$\Left$ into $\Z$, $\Right$ into $\S$, the start point of a non-empty
-star iteration into $\S$, and the border where a local star terminates
-into $\Z$. This coding is lossy, as it throws away the information about
+$\Left$ into $0$, $\Right$ into $1$, and marks the start of a non-empty
+star iteration by $1$. The border where a local star terminates
+is marked by $0$. This coding is lossy, as it throws away the information about
 characters, and also does not encode the ``boundary'' between two
 sequence values. Moreover, with only the bitcode we cannot even tell
-whether the $\S$s and $\Z$s are for $\Left/\Right$ or $\Stars$. The
+whether the $1$s and $0$s are for $\Left/\Right$ or $\Stars$. The
 reason for choosing this compact way of storing information is that the
 relatively small size of bits can be easily manipulated and ``moved
 around'' in a regular expression. In order to recover values, we will 
@@ -1327,24 +1331,60 @@
 \begin{center}
   \begin{tabular}{@{}lcl@{}}
    
-  $\textit{simp} \; (_{\textit{bs}}(a_1 \cdot a_2))$ & $\dn$ & $ (\textit{simp} \; a_1, \textit{simp}  \; a_2) \; \textit{match} $ \\
-   &&$\quad\textit{case} \; (\ZERO, \_) \Rightarrow  \ZERO$ \\
-   &&$\quad\textit{case} \; (\_, \ZERO) \Rightarrow  \ZERO$ \\
-   &&$\quad\textit{case} \;  (\ONE, a_2') \Rightarrow  \textit{fuse} \; bs \;  a_2'$ \\
-   &&$\quad\textit{case} \; (a_1', \ONE) \Rightarrow  \textit{fuse} \; bs \;  a_1'$ \\
-   &&$\quad\textit{case} \; (a_1', a_2') \Rightarrow  _{\textit{bs}}(a_1' \cdot a_2')$ \\
+  $\textit{simp}' \; (_{\textit{bs}}(a_1 \cdot a_2))$ & $\dn$ & $\textit{as} \; \simp \; \textit{was} \; \textit{before} $ \\
 
-  $\textit{simp} \; (bs_\oplus as)$ & $\dn$ & $\textit{distinct}( \textit{flatten} ( \textit{map simp as})) \; \textit{match} $ \\
-  &&$\quad\textit{case} \; [] \Rightarrow  \ZERO$ \\
-   &&$\quad\textit{case} \; a :: [] \Rightarrow  \textit{fuse bs a}$ \\
-   &&$\quad\textit{case} \;  as' \Rightarrow  \textit{ALTS}\;bs\;as'$\\ 
+  $\textit{simp}' \; (_{bs}\oplus as)$ & $\dn$ & \st{$\textit{distinct}( \textit{flatten} ( \textit{map simp as})) \; \textit{match} $} \\
+  &&\st{$\quad\textit{case} \; [] \Rightarrow  \ZERO$} \\
+   &&\st{$\quad\textit{case} \; a :: [] \Rightarrow  \textit{fuse bs a}$} \\
+   &&\st{$\quad\textit{case} \;  as' \Rightarrow  \textit{ALTS}\;bs\;as'$}\\ 
+   &&$\textit{if}(\textit{hollowAlternatives}( \textit{map \; simp \; as}))$\\
+   &&$\textit{then} \; \fuse  \; \textit{bs}\; \textit{extractAlt}(\textit{map} \; \simp \; \textit{as} )$\\
+   &&$\textit{else} \; \simp(_{bs} \oplus \textit{as})$\\
+   
 
-   $\textit{simp} \; a$ & $\dn$ & $\textit{a} \qquad \textit{otherwise}$   
+   $\textit{simp}' \; a$ & $\dn$ & $\textit{a} \qquad \textit{otherwise}$   
 \end{tabular}    
 \end{center}    
 
 \noindent
+given the definition of $\textit{hollowAlternatives}$ and  $\textit{extractAlt}$ :
+\begin{center}
+$\textit{hollowAlternatives}( \textit{rs}) \dn 
+\exists r = (_{\textit{bs}_1}\oplus \textit{rs}_1)  \in \textit{rs}.  \forall r' \in \textit{rs}, \;
+\textit{either} \; r' = \ZERO \; \textit{or} \; r' = r $
+$\textit{extractAlt}( \textit{rs}) \dn \textit{if}\big(
+\exists r = (_{\textit{bs}_1}\oplus \textit{rs}_1)  \in \textit{rs}.  \forall r' \in \textit{rs}, \;
+\textit{either} \; r' = \ZERO \; \textit{or} \; r' = r \big)\; \textit{then} \; \textit{return} \; r$
+\end{center}
+\noindent
+Basically, $\textit{hollowAlternatives}$ captures the feature of
+a list of regular expression of the shape 
+\begin{center}
+$  \oplus[\ZERO \;, \; _{bs}\oplus \textit{rs} ]$
+\end{center}
+and this means we can simply elevate the 
+inner regular expression $_{bs}\oplus \textit{rs}$
+ to the outmost
+and throw away the useless $\ZERO$s and
+the outer $\oplus$ wrapper.
+Using this new definition of simp, 
+under the example where  $r$ is the regular expression
+$(a+b)(a+a*)$ and $s$  is the string $aa$
+the problem of $\rup\backslash_{simp} \, s \neq \simp(\rup\backslash s)$
+is resolved.
 
+Unfortunately this causes new problems:
+for the counterexample where 
+\begin{center}
+$r$ is the regular expression
+$(ab+(a^*+aa))$ and $s$  is the string $aa$
+\end{center}
+$\rup\backslash_{simp} \, s \neq \simp(\rup\backslash s)$
+happens again, whereas this does not happen for the old
+version of $\simp$.
+This dilemma causes this amendment not a successful 
+attempt to make $\rup\backslash_{simp} \, s = \simp(\rup\backslash s)$
+under every possible regular expression and string.
 \subsection{Properties of the Function $\simp$}
 
 We have proved in Isabelle quite a few properties
@@ -1378,7 +1418,141 @@
 $\textit{if} r = \simp(r') \textit{then} \; \textit{good}(r) $
 \end{center}
 \end{itemize}
+\subsection{the Contains relation}
+$\retrieve$ is a too strong relation in that
+it only extracts one bitcode instead of a set of them.
+Therefore we try to define another relation(predicate)
+to capture the fact the regular expression has bits
+being moved around but still has all the bits needed.
+The contains symbol, written$\gg$, is  a relation that
+takes two arguments in an infix form 
+and returns a truth value. 
 
+
+In other words, from the set of regular expression and 
+bitcode pairs 
+$\textit{RV} = \{(r, v) \mid r \text{r is a regular expression, v is a value}\}$,
+those that satisfy the following requirements are in the set
+$\textit{RV}_Contains$.
+Unlike the $\retrieve$
+function, which takes two arguments $r$ and $v$ and 
+produces an only answer $\textit{bs}$, it takes only 
+one argument $r$ and returns a set of bitcodes that 
+can be generated by $r$.
+\begin{center}
+\begin{tabular}{llclll}
+& & & $_{bs}\ONE$ & $\gg$ & $\textit{bs}$\\
+& & & $_{bs}{\bf c}$ & $\gg$ & $\textit{bs}$\\
+$\textit{if} \; r_1 \gg \textit{bs}_1$ & $r_2 \; \gg \textit{bs}_2$ 
+& $\textit{then}$  &
+ $_{bs}{r_1 \cdot r_2}$ & 
+ $\gg$ & 
+ $\textit{bs} @ \textit{bs}_1 @ \textit{bs}_2$\\
+
+ $\textit{if} \; r \gg \textit{bs}_1$ &  & $\textit{then}$  &
+ $_{bs}{\oplus(r :: \textit{rs}})$ & 
+ $\gg$ & 
+ $\textit{bs} @ \textit{bs}_1 $\\ 
+
+ $\textit{if} \; _{bs}(\oplus \textit{rs}) \gg \textit{bs} @ \textit{bs}_1$ &  & $\textit{then}$  &
+ $_{bs}{\oplus(r :: \textit{rs}})$ & 
+ $\gg$ & 
+ $\textit{bs} @ \textit{bs}_1 $\\  
+
+ $\textit{if} \; r \gg \textit{bs}_1\; \textit{and}$ &  $_{bs}r^* \gg \textit{bs} @ \textit{bs}_2$ & $\textit{then}$  &
+ $_{bs}r^* $ & 
+ $\gg$ & 
+ $\textit{bs} @ [0] @ \textit{bs}_1@ \textit{bs}_2 $\\
+ 
+ & & & $_{bs}r^*$ & $\gg$ & $\textit{bs} @ [1]$\\
+\end{tabular}
+\end{center}
+It is a predicate in the sense that given 
+a regular expression and a bitcode, it 
+returns true or false, depending on whether 
+or not the regular expression can actually produce that
+value. If the predicates returns a true, then 
+we say that the regular expression $r$ contains
+the bitcode $\textit{bs}$, written 
+$r \gg \textit{bs}$.
+The $\gg$ operator with the
+regular expression $r$ may also be seen as a 
+machine that does a derivative of regular expressions
+on all strings simultaneously, taking 
+the bits by going throught the regular expression tree
+ structure in a depth first manner, regardless of whether
+ the part being traversed is nullable or not.
+ It put all possible bits that can be produced on such a traversal
+ into a set.
+ For example, if we are given the regular expression 
+$((a+b)(c+d))^*$, the tree structure may be written as
+\begin{center}
+\begin{tikzpicture}
+\tikz[tree layout]\graph[nodes={draw, circle}] {
+* -> 
+    {@-> {
+    {+1 ->
+        {a , b}
+        },
+    {+ ->
+        {c , d }
+        }
+        }
+    }
+};
+\end{tikzpicture}
+\end{center}
+\subsection{the $\textit{ders}_2$ Function}
+If we want to prove the result 
+\begin{center}
+	$ \textit{blexer}\_{simp}(r, \; s) =  \textit{blexer}(r, \; s)$
+\end{center}
+inductively 
+on the structure of the regular expression,
+then we need to induct on the  case $r_1 \cdot r_2$,
+it can be good if we could express $(r_1 \cdot r_2) \backslash s$
+in terms of $r_1 \backslash s$ and $r_2 \backslash s$,
+and this naturally induces the $ders2$ function,
+which does a "convolution" on $r_1$ and $r_2$ using the string
+$s$.
+It is based on the observation that the derivative of $r_1 \cdot r_2$
+with respect to a string $s$ can actually be written in an "explicit form"
+composed of $r_1$'s derivative of $s$ and $r_2$'s derivative of $s$.
+This can be illustrated in the following procedure execution 
+\begin{center}
+	$ (r_1 \cdot r_2) \backslash [c_1c_2] =  (\textit{if} \; \nullable(r_1)\;  \textit{then} \; ((r_1 \backslash c_1) \cdot r_2 + r_2 \backslash c_1) \; \textit{else} \; (r_1\backslash c_1) \cdot r_2) \backslash c_2$
+\end{center}
+which can also be written in a "convoluted sum"
+format:
+\begin{center}
+	$ (r_1 \cdot r_2) \backslash [c_1c_2] =  \sum{r_1 \backslash s_i \cdot r_2 \backslash s_j}$
+\end{center}
+In a more serious manner, we should write
+\begin{center}
+	$ (r_1 \cdot r_2) \backslash [c_1c_2] =  \sum{r_1 \backslash s_i \cdot r_2 \backslash s_j}$
+\end{center}
+Note this differentiates from the previous form in the sense that
+it calculates the results $r_1\backslash s_i , r_2 \backslash s_j$ first and then glue them together
+through nested alternatives whereas the $r_1 \cdot r_2 \backslash s$ procedure,
+used by algorithm $\lexer$, can only produce each component of the 
+resulting alternatives regular expression altogether rather than 
+generating each of the children nodes one by one
+n a single recursive call that is only for generating that
+very expression itself.
+We have this 
+\section{Conclusion}
+Under the exhaustive tests we believe the main
+result holds, yet a proof still seems elusive.
+We have tried out different approaches, and 
+found a lot of properties of the function $\simp$.
+The counterexamples where $\rup\backslash_{simp} \, s \neq \simp(\rup\backslash s)$
+are also valuable in the sense that 
+we get to know better why they are not equal and what 
+are the subtle differences between a 
+nested simplified regular expression and a 
+regular expression that is simplified at the final moment.
+We are almost there, but a last step is needed to make the proof work.
+Hopefully in the next few weeks we will be able to find one.
 %CONSTRUCTION SITE HERE
 that is to say, despite the bits are being moved around on the regular expression
 (difference in bits), the structure of the (unannotated)regular expression
@@ -1400,7 +1574,7 @@
 for the regular expression  
 $(a+b)(a+a*)$,
 if we do derivative with respect to string $aa$,
-we get
+we get 
 
 \subsection{Another Proof Strategy}
 sdddddr does not equal sdsdsdsr sometimes.\\
@@ -2266,13 +2440,14 @@
 
 
 \begin{center}
-		$b ::=   S \mid  Z \qquad
+		$b ::=   0 \mid  1 \qquad
 bs ::= [] \mid b:bs    
 $
 \end{center}
 
 \noindent
-The $S$ and $Z$ are arbitrary names for the bits in order to avoid 
+The $0$ and $1$ are arbitrary names for the bits  and not in bold
+in order to avoid 
 confusion with the regular expressions $\ZERO$ and $\ONE$. Bitcodes (or
 bit-lists) can be used to encode values (or incomplete values) in a
 compact form. This can be straightforwardly seen in the following
@@ -2282,23 +2457,23 @@
 \begin{tabular}{lcl}
   $\textit{code}(\Empty)$ & $\dn$ & $[]$\\
   $\textit{code}(\Char\,c)$ & $\dn$ & $[]$\\
-  $\textit{code}(\Left\,v)$ & $\dn$ & $\Z :: code(v)$\\
-  $\textit{code}(\Right\,v)$ & $\dn$ & $\S :: code(v)$\\
+  $\textit{code}(\Left\,v)$ & $\dn$ & $0 :: code(v)$\\
+  $\textit{code}(\Right\,v)$ & $\dn$ & $1 :: code(v)$\\
   $\textit{code}(\Seq\,v_1\,v_2)$ & $\dn$ & $code(v_1) \,@\, code(v_2)$\\
-  $\textit{code}(\Stars\,[])$ & $\dn$ & $[\Z]$\\
-  $\textit{code}(\Stars\,(v\!::\!vs))$ & $\dn$ & $\S :: code(v) \;@\;
+  $\textit{code}(\Stars\,[])$ & $\dn$ & $[0]$\\
+  $\textit{code}(\Stars\,(v\!::\!vs))$ & $\dn$ & $1 :: code(v) \;@\;
                                                  code(\Stars\,vs)$
 \end{tabular}    
 \end{center} 
 
 \noindent
 Here $\textit{code}$ encodes a value into a bitcodes by converting
-$\Left$ into $\Z$, $\Right$ into $\S$, the start point of a non-empty
+$\Left$ into $0$, $\Right$ into $1$, the start point of a non-empty
 star iteration into $\S$, and the border where a local star terminates
-into $\Z$. This coding is lossy, as it throws away the information about
+into $0$. This coding is lossy, as it throws away the information about
 characters, and also does not encode the ``boundary'' between two
 sequence values. Moreover, with only the bitcode we cannot even tell
-whether the $\S$s and $\Z$s are for $\Left/\Right$ or $\Stars$. The
+whether the $1$s and $0$s are for $\Left/\Right$ or $\Stars$. The
 reason for choosing this compact way of storing information is that the
 relatively small size of bits can be easily manipulated and ``moved
 around'' in a regular expression. In order to recover values, we will 
@@ -2311,18 +2486,18 @@
 \begin{tabular}{@{}l@{\hspace{1mm}}c@{\hspace{1mm}}l@{}}
   $\textit{decode}'\,bs\,(\ONE)$ & $\dn$ & $(\Empty, bs)$\\
   $\textit{decode}'\,bs\,(c)$ & $\dn$ & $(\Char\,c, bs)$\\
-  $\textit{decode}'\,(\Z\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
+  $\textit{decode}'\,(0\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
      $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}\;
        (\Left\,v, bs_1)$\\
-  $\textit{decode}'\,(\S\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
+  $\textit{decode}'\,(1\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
      $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_2\;\textit{in}\;
        (\Right\,v, bs_1)$\\                           
   $\textit{decode}'\,bs\;(r_1\cdot r_2)$ & $\dn$ &
         $\textit{let}\,(v_1, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}$\\
   & &   $\textit{let}\,(v_2, bs_2) = \textit{decode}'\,bs_1\,r_2$\\
   & &   \hspace{35mm}$\textit{in}\;(\Seq\,v_1\,v_2, bs_2)$\\
-  $\textit{decode}'\,(\Z\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\
-  $\textit{decode}'\,(\S\!::\!bs)\,(r^*)$ & $\dn$ & 
+  $\textit{decode}'\,(0\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\
+  $\textit{decode}'\,(1\!::\!bs)\,(r^*)$ & $\dn$ & 
          $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r\;\textit{in}$\\
   & &   $\textit{let}\,(\Stars\,vs, bs_2) = \textit{decode}'\,bs_1\,r^*$\\
   & &   \hspace{35mm}$\textit{in}\;(\Stars\,v\!::\!vs, bs_2)$\bigskip\\
@@ -2342,20 +2517,20 @@
 
 \begin{center}
 \begin{tabular}{lcl}
-  $\textit{a}$ & $::=$  & $\textit{ZERO}$\\
-                  & $\mid$ & $\textit{ONE}\;\;bs$\\
-                  & $\mid$ & $\textit{CHAR}\;\;bs\,c$\\
-                  & $\mid$ & $\textit{ALTS}\;\;bs\,as$\\
-                  & $\mid$ & $\textit{SEQ}\;\;bs\,a_1\,a_2$\\
-                  & $\mid$ & $\textit{STAR}\;\;bs\,a$
+  $\textit{a}$ & $::=$  & $\ZERO$\\
+                  & $\mid$ & $_{bs}\ONE$\\
+                  & $\mid$ & $_{bs}{\bf c}$\\
+                  & $\mid$ & $_{bs}\oplus \textit{as}$\\
+                  & $\mid$ & $_{bs}a_1\cdot a_2$\\
+                  & $\mid$ & $_{bs}a^*$
 \end{tabular}    
 \end{center}  
 %(in \textit{ALTS})
 
 \noindent
-where $bs$ stands for bitcodes, $a$  for $\bold{a}$nnotated regular
-expressions and $as$ for a list of annotated regular expressions.
-The alternative constructor($\textit{ALTS}$) has been generalized to 
+where $\textit{bs}$ stands for bitcodes, $a$  for $\textit{annotated}$ regular
+expressions and $\textit{as}$ for a list of annotated regular expressions.
+The alternative constructor($\oplus$) has been generalized to 
 accept a list of annotated regular expressions rather than just 2.
 We will show that these bitcodes encode information about
 the (POSIX) value that should be generated by the Sulzmann and Lu
@@ -2370,16 +2545,16 @@
 %\begin{definition}
 \begin{center}
 \begin{tabular}{lcl}
-  $(\ZERO)^\uparrow$ & $\dn$ & $\textit{ZERO}$\\
-  $(\ONE)^\uparrow$ & $\dn$ & $\textit{ONE}\,[]$\\
-  $(c)^\uparrow$ & $\dn$ & $\textit{CHAR}\,[]\,c$\\
+  $(\ZERO)^\uparrow$ & $\dn$ & $\ZERO$\\
+  $(\ONE)^\uparrow$ & $\dn$ & $_{[]}\ONE$\\
+  $(c)^\uparrow$ & $\dn$ & $_{bs}\textit{\bf c}$\\
   $(r_1 + r_2)^\uparrow$ & $\dn$ &
-  $\textit{ALTS}\;[]\,List((\textit{fuse}\,[\Z]\,r_1^\uparrow),\,
-  (\textit{fuse}\,[\S]\,r_2^\uparrow))$\\
+  $_{[]}\oplus\,[(\textit{fuse}\,[0]\,r_1^\uparrow),\,
+  (\textit{fuse}\,[1]\,r_2^\uparrow)]$\\
   $(r_1\cdot r_2)^\uparrow$ & $\dn$ &
-         $\textit{SEQ}\;[]\,r_1^\uparrow\,r_2^\uparrow$\\
+         $_{[]}r_1^\uparrow \cdot r_2^\uparrow$\\
   $(r^*)^\uparrow$ & $\dn$ &
-         $\textit{STAR}\;[]\,r^\uparrow$\\
+         $_{[]}r^\uparrow$\\
 \end{tabular}    
 \end{center}    
 %\end{definition}
@@ -2393,7 +2568,7 @@
 
 \begin{center}
 \begin{tabular}{lcl}
-  $\textit{fuse}\;bs\,(\textit{ZERO})$ & $\dn$ & $\textit{ZERO}$\\
+  $\textit{fuse}\;bs\,(\ZERO)$ & $\dn$ & $\textit{ZERO}$\\
   $\textit{fuse}\;bs\,(\textit{ONE}\,bs')$ & $\dn$ &
      $\textit{ONE}\,(bs\,@\,bs')$\\
   $\textit{fuse}\;bs\,(\textit{CHAR}\,bs'\,c)$ & $\dn$ &