lexing: changeset 416:57182b36ec01

--- a/progs/scala/re-bit2.scala	Sat Feb 05 18:24:37 2022 +0000
+++ b/progs/scala/re-bit2.scala	Sun Feb 06 00:02:04 2022 +0000
@@ -73,6 +73,7 @@
   case AALTS(bs, r::Nil) => erase(r)
   case AALTS(bs, r::rs) => ALT(erase(r), erase(AALTS(bs, rs)))
   case ASEQ(bs, r1, r2) => SEQ (erase(r1), erase(r2))
+  case ASTAR(cs, ASTAR(ds, r))=> STAR(erase(r))
   case ASTAR(cs, r)=> STAR(erase(r))
 }
 
@@ -217,13 +218,78 @@
   }
 } 
 
+def flats(rs: List[ARexp]): List[ARexp] = rs match {
+      case Nil => Nil
+      case AZERO :: rs1 => flats(rs1)
+      case AALTS(bs, rs1) :: rs2 => rs1.map(fuse(bs, _)) ::: flats(rs2)
+      case r1 :: rs2 => r1 :: flats(rs2)
+    }
+
+def projectFirstChild(rs: List[ARexp]) : List[ARexp] = rs match {
+    case (ASEQ(bs, r1, r2p)::rs1) => r1::projectFirstChild(rs1)
+    case Nil => Nil
+  }
+
+
+
+def distinctBy2(xs: List[ARexp], acc: List[Rexp] = Nil): List[ARexp] = xs match {
+    case Nil => Nil
+    case (x::xs) => {
+      val res = erase(x)
+      if(acc.contains(res))
+        distinctBy2(xs, acc)
+      else
+        x match {
+          case ASEQ(bs0, AALTS(bs1, rs), r2) => 
+            val newTerms =  distinctBy2(rs.map(r1 => ASEQ(Nil, r1, r2)), acc)
+            val rsPrime = projectFirstChild(newTerms)
+            newTerms match {
+              case Nil => distinctBy2(xs, acc)
+              case t::Nil => ASEQ(bs0, fuse(bs1, rsPrime.head), r2)::distinctBy2(xs, erase(t)::acc)
+              case ts => ASEQ(bs0, AALTS(bs1, rsPrime), r2)::distinctBy2(xs, newTerms.map(erase(_)):::acc)
+            }
+          case x => x::distinctBy2(xs, res::acc)
+        }
+    }
+  } 
+
+/*
+def strongBsimp(r: ARexp): ARexp =
+  {
+    r match {
+      case ASEQ(bs1, r1, r2) => (strongBsimp(r1), strongBsimp(r2)) match {
+          case (AZERO, _) => AZERO
+          case (_, AZERO) => AZERO
+          case (AONE(bs2), r2s) => fuse(bs1 ++ bs2, r2s)
+          case (r1s, r2s) => ASEQ(bs1, r1s, r2s)
+      }
+      case AALTS(bs1, rs) => {
+            val rs_simp = rs.map(strongBsimp(_))
+            val flat_res = flats(rs_simp)
+            val dist_res = distinctBy2(flat_res)//distinctBy(flat_res, erase)
+            dist_res match {
+              case Nil => AZERO
+              case s :: Nil => fuse(bs1, s)
+              case rs => AALTS(bs1, rs)  
+            }
+          
+      }
+      case r => r
+    }
+  }
+*/
 
 def bsimp(r: ARexp): ARexp = r match {
   case ASEQ(bs1, r1, r2) => (bsimp(r1), bsimp(r2)) match {
       case (AZERO, _) => AZERO
       case (_, AZERO) => AZERO
       case (AONE(bs2), r2s) => fuse(bs1 ++ bs2, r2s)
-      //case (AALTS(bs, rs), r2) => AALTS(bs, rs.map(ASEQ(Nil, _, r2)))
+      //case (AALTS(bs, rs), r) => 
+      //   AALTS(Nil, rs.map(ASEQ(bs1 ++ bs, _, r)))
+      //case (ASEQ(bs2, r1, ASTAR(bs3, r2)), ASTAR(bs4, r3)) if erase(r2) == erase(r3) =>
+      //  ASEQ(bs1 ++ bs2, r1, ASTAR(bs3, r2))
+      //case (ASEQ(bs2, r1, r2), r3)  =>
+      //      ASEQ(bs1 ++ bs2, r1, ASEQ(Nil, r2, r3))
       case (r1s, r2s) => ASEQ(bs1, r1s, r2s)
   }
   case AALTS(bs1, rs) => (flts(rs.map(bsimp))).distinctBy(erase) match {
@@ -231,9 +297,16 @@
       case r::Nil => fuse(bs1, r)
       case rs => AALTS(bs1, rs)
   }
+  //case (ASTAR(bs1, ASTAR(bs2, r))) => bsimp(ASTAR(bs1 ++ bs2, r))
   case r => r
 }
 
+def bders_simp(r: ARexp, s: List[Char]) : ARexp = s match {
+  case Nil => r
+  case c::cs => bders_simp(bsimp(bder(c, r)), cs)
+}
+
+
 def blex_simp(r: ARexp, s: List[Char]) : Bits = s match {
   case Nil => if (bnullable(r)) bmkeps(r) 
               else throw new Exception("Not matched")
@@ -266,8 +339,151 @@
   case Stars(vs) => vs.flatMap(env)
 }
 
+def nullable(r: Rexp) : Boolean = r match {
+  case ZERO => false
+  case ONE => true
+  case CHAR(_) => false
+  case ALT(r1, r2) => nullable(r1) || nullable(r2)
+  case SEQ(r1, r2) => nullable(r1) && nullable(r2)
+  case STAR(_) => true
+}
+
+
+def pder(c: Char, r: Rexp) : Set[Rexp] = r match {
+  case ZERO => Set()
+  case ONE => Set()
+  case CHAR(d) => if (c == d) Set(ONE) else Set()
+  case ALT(r1, r2) => pder(c, r1) ++ pder(c, r2)
+  case SEQ(r1, r2) => {
+    (for (pr1 <- pder(c, r1)) yield SEQ(pr1, r2)) ++
+    (if (nullable(r1)) pder(c, r2) else Set())
+  }  
+  case STAR(r1) => {
+    for (pr1 <- pder(c, r1)) yield SEQ(pr1, STAR(r1))
+  }
+}
+
+def pders(s: List[Char], rs: Set[Rexp]) : Set[Rexp] = s match {
+  case Nil => rs
+  case c::s => pders(s, rs.flatMap(pder(c, _)))
+}
+
+def size(r: Rexp): Int = r match {
+  case ZERO => 1
+  case ONE => 1
+  case CHAR(_) => 1
+  case ALT(r1, r2) => 1 + size(r1) + size (r2)
+  case SEQ(r1, r2) => 1 + size(r1) + size (r2)
+  case STAR(r1) => 1 + size(r1)
+}
+
+def pp(r: ARexp): String = r match {
+  case ASEQ(_, ACHAR(_, a1),ASEQ(_, r1, r2)) => s"${a1}${pp(r1)}${pp(r2)}"
+  case ASEQ(_, ACHAR(_, a1),ACHAR(_, a2)) => s"${a1}${a2}"
+  case AZERO => "0"
+  case AONE(_) => "1"
+  case ACHAR(_, a) => a.toString
+  case AALTS(_, rs) => s"ALTs(${rs.map(pp(_)).mkString(",")})"
+  case ASEQ(_, r1, r2) => s"SEQ(${pp(r1)}, ${pp(r2)})"
+  case ASTAR(_, r1) => s"(${pp(r1)})*"
+}
+
+
+
 // Some Tests
 //============
+val ST = STAR(STAR("a"))
+println(pp(internalise(ST)))
+println(bders(("a" * 1).toList, internalise(ST)))
+println(bders_simp(internalise(ST), ("a" * 1).toList))
+println(pp(bders(("a" * 1).toList, internalise(ST))))
+println(pp(bders_simp(internalise(ST), ("a" * 1).toList)))
+
+
+
+println(pp(bders_simp(internalise(ST), ("a" * 1).toList)))
+println(pp(bders_simp(internalise(ST), ("a" * 2).toList)))
+println(pp(bders_simp(internalise(ST), ("a" * 3).toList)))
+println(pp(bders_simp(internalise(ST), ("a" * 4).toList)))
+
+println(blexing(ST, "a" * 1))
+println(blexing_simp(ST, "a" * 1))
+println(blexing(ST, "a" * 2))
+println(blexing_simp(ST, "a" * 2))
+println(blexing(ST, "a" * 3))
+println(blexing_simp(ST, "a" * 3))
+println(blexing(ST, "a" * 4))
+println(blexing_simp(ST, "a" * 4))
+
+val STARREG = ((STAR("a") | STAR("aaa")) | STAR("aaaaa")).%
+
+println(blexing(STARREG, "a" * 3))
+println(blexing_simp(STARREG, "a" * 3))
+
+
+size(STARREG)
+size(erase(bders_simp(internalise(STARREG), ("a" * 1).toList)))
+size(erase(bders_simp(internalise(STARREG), ("a" * 2).toList)))
+size(erase(bders_simp(internalise(STARREG), ("a" * 3).toList)))
+size(erase(bders_simp(internalise(STARREG), ("a" * 4).toList)))
+size(erase(bders_simp(internalise(STARREG), ("a" * 5).toList)))
+size(erase(bders_simp(internalise(STARREG), ("a" * 6).toList)))
+size(erase(bders_simp(internalise(STARREG), ("a" * 7).toList)))
+size(erase(bders_simp(internalise(STARREG), ("a" * 8).toList)))
+size(erase(bders_simp(internalise(STARREG), ("a" * 9).toList)))
+size(erase(bders_simp(internalise(STARREG), ("a" * 100).toList)))
+size(erase(bders_simp(internalise(STARREG), ("a" * 101).toList)))
+size(erase(bders_simp(internalise(STARREG), ("a" * 102).toList)))
+size(erase(bders_simp(internalise(STARREG), ("a" * 103).toList)))
+
+println(bders_simp(internalise(STARREG), ("a" * 1).toList))
+println(bders_simp(internalise(STARREG), ("a" * 2).toList))
+println(bders_simp(internalise(STARREG), ("a" * 3).toList))
+println(bders_simp(internalise(STARREG), ("a" * 4).toList))
+
+println(erase(bders_simp(internalise(STARREG), ("a" * 1).toList)))
+println(erase(bders_simp(internalise(STARREG), ("a" * 2).toList)))
+println(erase(bders_simp(internalise(STARREG), ("a" * 3).toList)))
+println(erase(bders_simp(internalise(STARREG), ("a" * 4).toList)))
+
+println(pp(internalise(STARREG)))
+println(pp(bders_simp(internalise(STARREG), ("a" * 1).toList)))
+println(pp(bders_simp(internalise(STARREG), ("a" * 2).toList)))
+println(pp(bders_simp(internalise(STARREG), ("a" * 3).toList)))
+println(pp(bders_simp(internalise(STARREG), ("a" * 4).toList)))
+
+val STARR = (STAR("a") | STAR("aa") | 
+             STAR("aaa") | STAR("aaaa") | 
+             STAR("aaaaa") | STAR("aaaaaa") | 
+             STAR("aaaaaaa") | STAR("aaaaaaaa")).%
+
+size(STARR)
+size(erase(bders_simp(internalise(STARR), ("a" * 1).toList)))
+size(erase(bders_simp(internalise(STARR), ("a" * 2).toList)))
+size(erase(bders_simp(internalise(STARR), ("a" * 3).toList)))
+size(erase(bders_simp(internalise(STARR), ("a" * 4).toList)))
+size(erase(bders_simp(internalise(STARR), ("a" * 5).toList)))
+size(erase(bders_simp(internalise(STARR), ("a" * 6).toList)))
+size(erase(bders_simp(internalise(STARR), ("a" * 7).toList)))
+size(erase(bders_simp(internalise(STARR), ("a" * 8).toList)))
+size(erase(bders_simp(internalise(STARR), ("a" * 9).toList)))
+size(erase(bders_simp(internalise(STARR), ("a" * 1000).toList)))
+size(erase(bders_simp(internalise(STARR), ("a" * 1001).toList)))
+size(erase(bders_simp(internalise(STARR), ("a" * 1002).toList)))
+size(erase(bders_simp(internalise(STARR), ("a" * 1010).toList)))
+size(erase(bders_simp(internalise(STARR), ("a" * 1010 ++ "b").toList)))
+
+val r0 = ("a" | "ab") ~ ("b" | "")
+println(pre_blexing(internalise(r0), "ab"))
+println(blexing(r0, "ab"))
+println(blexing_simp(r0, "ab"))
+
+println(pders("a".toList, Set(r0)))
+println(pders("ab".toList, Set(r0)))
+
+val r00 = ("a" ~ ("b" | "")) | ("ab" ~ ("b" | ""))
+
+
 
 /*
 def time_needed[T](i: Int, code: => T) = {

--- a/thys2/Paper/Paper.thy	Sat Feb 05 18:24:37 2022 +0000
+++ b/thys2/Paper/Paper.thy	Sun Feb 06 00:02:04 2022 +0000
@@ -247,6 +247,7 @@
 
 text {*
 
+  In the second part of their paper \cite{Sulzmann2014},
   Sulzmann and Lu describe another algorithm that generates POSIX
   values but dispences with the second phase where characters are
   injected ``back'' into values. For this they annotate bitcodes to
@@ -254,8 +255,7 @@
 
   \begin{center}
   \begin{tabular}{lcl}
-  @{term breg} & $::=$ & @{term "AZERO"}\\
-               & $\mid$ & @{term "AONE bs"}\\
+  @{term breg} & $::=$ & @{term "AZERO"} $\quad\mid\quad$ @{term "AONE bs"}\\
                & $\mid$ & @{term "ACHAR bs c"}\\
                & $\mid$ & @{term "AALTs bs rs"}\\
                & $\mid$ & @{term "ASEQ bs r\<^sub>1 r\<^sub>2"}\\
@@ -264,46 +264,243 @@
   \end{center}
 
   \noindent where @{text bs} stands for a bitsequences; @{text r},
-  @{text "r\<^sub>1"} and @{text "r\<^sub>2"} for annotated regular
-  expressions; and @{text rs} for a list of annotated regular
-  expressions.  In contrast to Sulzmann and Lu we generalise the
-  alternative regular expressions to lists, instead of just having
-  binary regular expressions.  The idea with annotated regular
-  expressions is to incrementally generate the value information by
-  recording bitsequences. Sulzmann and Lu then  
-  define a coding function for how values can be coded into bitsequences.
+  @{text "r\<^sub>1"} and @{text "r\<^sub>2"} for bitcoded regular
+  expressions; and @{text rs} for lists of bitcoded regular
+  expressions. The binary alternative @{text "ALT bs r\<^sub>1 r\<^sub>2"}
+  is just an abbreviation for @{text "ALTs bs [r\<^sub>1, r\<^sub>2]"}. 
+  For bitsequences we just use lists made up of the
+  constants @{text Z} and @{text S}.  The idea with bitcoded regular
+  expressions is to incrementally generate the value information (for
+  example @{text Left} and @{text Right}) as bitsequences
+  as part of the regular expression constructors. 
+  Sulzmann and Lu then define a coding
+  function for how values can be coded into bitsequences.
 
   \begin{center}
+  \begin{tabular}{cc}
   \begin{tabular}{lcl}
   @{thm (lhs) code.simps(1)} & $\dn$ & @{thm (rhs) code.simps(1)}\\
   @{thm (lhs) code.simps(2)} & $\dn$ & @{thm (rhs) code.simps(2)}\\
   @{thm (lhs) code.simps(3)} & $\dn$ & @{thm (rhs) code.simps(3)}\\
-  @{thm (lhs) code.simps(4)} & $\dn$ & @{thm (rhs) code.simps(4)}\\
+  @{thm (lhs) code.simps(4)} & $\dn$ & @{thm (rhs) code.simps(4)}
+  \end{tabular}
+  &
+  \begin{tabular}{lcl}
   @{thm (lhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]} & $\dn$ & @{thm (rhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]}\\
   @{thm (lhs) code.simps(6)} & $\dn$ & @{thm (rhs) code.simps(6)}\\
-  @{thm (lhs) code.simps(7)} & $\dn$ & @{thm (rhs) code.simps(7)}
+  @{thm (lhs) code.simps(7)} & $\dn$ & @{thm (rhs) code.simps(7)}\\
+  \mbox{\phantom{XX}}\\
+  \end{tabular}
   \end{tabular}
   \end{center}
+   
+  \noindent
+  As can be seen, this coding is ``lossy'' in the sense that we do not
+  record explicitly character values and also not sequence values (for
+  them we just append two bitsequences). We do, however, record the
+  different alternatives for @{text Left}, respectively @{text Right}, as @{text Z} and
+  @{text S} followed by some bitsequence. Similarly, we use @{text Z} to indicate
+  if there is still a value coming in the list of @{text Stars}, whereas @{text S}
+  indicates the end of the list. The lossiness makes the process of
+  decoding a bit more involved, but the point is that if we have a
+  regular expression \emph{and} a bitsequence of a corresponding value,
+  then we can always decode the value accurately. The decoding can be
+  defined by using two functions called $\textit{decode}'$ and
+  \textit{decode}:
 
+  \begin{center}
+  \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
+  $\textit{decode}'\,bs\,(\ONE)$ & $\dn$ & $(\Empty, bs)$\\
+  $\textit{decode}'\,bs\,(c)$ & $\dn$ & $(\Char\,c, bs)$\\
+  $\textit{decode}'\,(\Z\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
+     $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}\;
+       (\Left\,v, bs_1)$\\
+  $\textit{decode}'\,(\S\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
+     $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_2\;\textit{in}\;
+       (\Right\,v, bs_1)$\\                           
+  $\textit{decode}'\,bs\;(r_1\cdot r_2)$ & $\dn$ &
+        $\textit{let}\,(v_1, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}$\\
+  & &   $\textit{let}\,(v_2, bs_2) = \textit{decode}'\,bs_1\,r_2$
+        \hspace{2mm}$\textit{in}\;(\Seq\,v_1\,v_2, bs_2)$\\
+  $\textit{decode}'\,(\Z\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\
+  $\textit{decode}'\,(\S\!::\!bs)\,(r^*)$ & $\dn$ & 
+         $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r\;\textit{in}$\\
+  & &   $\textit{let}\,(\Stars\,vs, bs_2) = \textit{decode}'\,bs_1\,r^*$
+        \hspace{2mm}$\textit{in}\;(\Stars\,v\!::\!vs, bs_2)$\bigskip\\
+  $\textit{decode}\,bs\,r$ & $\dn$ &
+     $\textit{let}\,(v, bs') = \textit{decode}'\,bs\,r\;\textit{in}$\\
+  & & \hspace{7mm}$\textit{if}\;bs' = []\;\textit{then}\;\textit{Some}\,v\;
+       \textit{else}\;\textit{None}$   
+  \end{tabular}    
+  \end{center}
+
+  \noindent
+  The function \textit{decode} checks whether all of the bitsequence is
+  consumed and returns the corresponding value as @{term "Some v"}; otherwise
+  it fails with @{text "None"}. We can establish that for a value $v$
+  inhabited by a regular expression $r$, the decoding of its
+  bitsequence never fails.
+
+\begin{lemma}\label{codedecode}\it
+  If $\;\vdash v : r$ then
+  $\;\textit{decode}\,(\textit{code}\, v)\,r = \textit{Some}\, v$.
+\end{lemma}
+
+\begin{proof}
+  This follows from the property that
+  $\textit{decode}'\,((\textit{code}\,v) \,@\, bs)\,r = (v, bs)$ holds
+  for any bit-sequence $bs$ and $\vdash v : r$. This property can be
+  easily proved by induction on $\vdash v : r$.
+\end{proof}  
+
+  Sulzmann and Lu define the function \emph{internalise}
+  in order to transform standard regular expressions into annotated
+  regular expressions. We write this operation as $r^\uparrow$.
+  This internalisation uses the following
+  \emph{fuse} function.	     
+
+  \begin{center}
+  \begin{tabular}{lcl}
+  $\textit{fuse}\,bs\,(\textit{ZERO})$ & $\dn$ & $\textit{ZERO}$\\
+  $\textit{fuse}\,bs\,(\textit{ONE}\,bs')$ & $\dn$ &
+     $\textit{ONE}\,(bs\,@\,bs')$\\
+  $\textit{fuse}\,bs\,(\textit{CHAR}\,bs'\,c)$ & $\dn$ &
+     $\textit{CHAR}\,(bs\,@\,bs')\,c$\\
+  $\textit{fuse}\,bs\,(\textit{ALTs}\,bs'\,rs)$ & $\dn$ &
+     $\textit{ALTs}\,(bs\,@\,bs')\,rs$\\
+  $\textit{fuse}\,bs\,(\textit{SEQ}\,bs'\,r_1\,r_2)$ & $\dn$ &
+     $\textit{SEQ}\,(bs\,@\,bs')\,r_1\,r_2$\\
+  $\textit{fuse}\,bs\,(\textit{STAR}\,bs'\,r)$ & $\dn$ &
+     $\textit{STAR}\,(bs\,@\,bs')\,r$
+  \end{tabular}    
+  \end{center}    
+
+  \noindent
+  A regular expression can then be \emph{internalised} into a bitcoded
+  regular expression as follows.
+
+  \begin{center}
+  \begin{tabular}{lcl}
+  $(\ZERO)^\uparrow$ & $\dn$ & $\textit{ZERO}$\\
+  $(\ONE)^\uparrow$ & $\dn$ & $\textit{ONE}\,[]$\\
+  $(c)^\uparrow$ & $\dn$ & $\textit{CHAR}\,[]\,c$\\
+  $(r_1 + r_2)^\uparrow$ & $\dn$ &
+         $\textit{ALT}\;[]\,(\textit{fuse}\,[\Z]\,r_1^\uparrow)\,
+                            (\textit{fuse}\,[\S]\,r_2^\uparrow)$\\
+  $(r_1\cdot r_2)^\uparrow$ & $\dn$ &
+         $\textit{SEQ}\;[]\,r_1^\uparrow\,r_2^\uparrow$\\
+  $(r^*)^\uparrow$ & $\dn$ &
+         $\textit{STAR}\;[]\,r^\uparrow$\\
+  \end{tabular}    
+  \end{center}    
+
+  \noindent
+  There is also an \emph{erase}-function, written $a^\downarrow$, which
+  transforms a bitcoded regular expression into a (standard) regular
+  expression by just erasing the annotated bitsequences. We omit the
+  straightforward definition. For defining the algorithm, we also need
+  the functions \textit{bnullable} and \textit{bmkeps}, which are the
+  ``lifted'' versions of \textit{nullable} and \textit{mkeps} acting on
+  bitcoded regular expressions, instead of regular expressions.
+
+  \begin{center}
+  \begin{tabular}{lcl}
+  $\textit{bnullable}\,(\textit{ZERO})$ & $\dn$ & $\textit{false}$ \textbf{fix}\\
+  $\textit{bnullable}\,(\textit{ONE}\,bs)$ & $\dn$ & $\textit{true}$\\
+  $\textit{bnullable}\,(\textit{CHAR}\,bs\,c)$ & $\dn$ & $\textit{false}$\\
+  $\textit{bnullable}\,(\textit{ALT}\,bs\,a_1\,a_2)$ & $\dn$ &
+     $\textit{bnullable}\,a_1\vee \textit{bnullable}\,a_2$\\
+  $\textit{bnullable}\,(\textit{SEQ}\,bs\,a_1\,a_2)$ & $\dn$ &
+     $\textit{bnullable}\,a_1\wedge \textit{bnullable}\,a_2$\\
+  $\textit{bnullable}\,(\textit{STAR}\,bs\,a)$ & $\dn$ &
+     $\textit{true}$
+  \end{tabular}    
+  \end{center}    
+
+  \begin{center}
+  \begin{tabular}{lcl}
+  $\textit{bmkeps}\,(\textit{ONE}\,bs)$ & $\dn$ & $bs$ \textbf{fix}\\
+  $\textit{bmkeps}\,(\textit{ALT}\,bs\,a_1\,a_2)$ & $\dn$ &
+     $\textit{if}\;\textit{bnullable}\,a_1$\\
+  & &$\textit{then}\;bs\,@\,\textit{bmkeps}\,a_1$\\
+  & &$\textit{else}\;bs\,@\,\textit{bmkeps}\,a_2$\\
+  $\textit{bmkeps}\,(\textit{SEQ}\,bs\,a_1\,a_2)$ & $\dn$ &
+     $bs \,@\,\textit{bmkeps}\,a_1\,@\, \textit{bmkeps}\,a_2$\\
+  $\textit{bmkeps}\,(\textit{STAR}\,bs\,a)$ & $\dn$ &
+     $bs \,@\, [\S]$
+  \end{tabular}    
+  \end{center}    
+ 
+
+  \noindent
+  The key function in the bitcoded algorithm is the derivative of an
+  annotated regular expression. This derivative calculates the
+  derivative but at the same time also the incremental part that
+  contributes to constructing a value.	
+
+  \begin{center}
+  \begin{tabular}{@ {}lcl@ {}}
+  $(\textit{ZERO})\backslash c$ & $\dn$ & $\textit{ZERO}$ \textbf{fix}\\  
+  $(\textit{ONE}\;bs)\backslash c$ & $\dn$ & $\textit{ZERO}$\\  
+  $(\textit{CHAR}\;bs\,d)\backslash c$ & $\dn$ &
+        $\textit{if}\;c=d\; \;\textit{then}\;
+         \textit{ONE}\;bs\;\textit{else}\;\textit{ZERO}$\\  
+  $(\textit{ALT}\;bs\,a_1\,a_2)\backslash c$ & $\dn$ &
+        $\textit{ALT}\,bs\,(a_1\backslash c)\,(a_2\backslash c)$\\
+  $(\textit{SEQ}\;bs\,a_1\,a_2)\backslash c$ & $\dn$ &
+     $\textit{if}\;\textit{bnullable}\,a_1$\\
+  & &$\textit{then}\;\textit{ALT}\,bs\,(\textit{SEQ}\,[]\,(a_1\backslash c)\,a_2)$\\
+  & &$\phantom{\textit{then}\;\textit{ALT}\,bs\,}(\textit{fuse}\,(\textit{bmkeps}\,a_1)\,(a_2\backslash c))$\\
+  & &$\textit{else}\;\textit{SEQ}\,bs\,(a_1\backslash c)\,a_2$\\
+  $(\textit{STAR}\,bs\,a)\backslash c$ & $\dn$ &
+      $\textit{SEQ}\;bs\,(\textit{fuse}\, [\Z] (r\backslash c))\,
+       (\textit{STAR}\,[]\,r)$
+  \end{tabular}    
+  \end{center}
+
+
+  \noindent
+  This function can also be extended to strings, written $a\backslash s$,
+  just like the standard derivative.  We omit the details. Finally we
+  can define Sulzmann and Lu's bitcoded lexer, which we call \textit{blexer}: 
+
+  \noindent
+This bitcoded lexer first internalises the regular expression $r$ and then
+builds the annotated derivative according to $s$. If the derivative is
+nullable, then it extracts the bitcoded value using the
+$\textit{bmkeps}$ function. Finally it decodes the bitcoded value.  If
+the derivative is \emph{not} nullable, then $\textit{None}$ is
+returned. The task is to show that this way of calculating a value
+generates the same result as with \textit{lexer}.
+
+Before we can proceed we need to define a function, called
+\textit{retrieve}, which Sulzmann and Lu introduced for the proof.
+
+\textbf{fix}
+
+\noindent
+The idea behind this function is to retrieve a possibly partial
+bitcode from an annotated regular expression, where the retrieval is
+guided by a value.  For example if the value is $\Left$ then we
+descend into the left-hand side of an alternative (annotated) regular
+expression in order to assemble the bitcode. Similarly for
+$\Right$. The property we can show is that for a given $v$ and $r$
+with $\vdash v : r$, the retrieved bitsequence from the internalised
+regular expression is equal to the bitcoded version of $v$.
+
+\begin{lemma}\label{retrievecode}
+If $\vdash v : r$ then $\textit{code}\, v = \textit{retrieve}\,(r^\uparrow)\,v$.
+\end{lemma}
+
+*}
+
+text {*
   There is also a corresponding decoding function that takes a bitsequence
   and generates back a value. However, since the bitsequences are a ``lossy''
   coding (@{term Seq}s are not coded) the decoding function depends also
   on a regular expression in order to decode values. 
 
-  \begin{center}
-  \begin{tabular}{lcll}
-  %@{thm (lhs) decode'.simps(1)} & $\dn$ & @{thm (rhs) decode'.simps(1)}\\
-  @{thm (lhs) decode'.simps(2)} & $\dn$ & @{thm (rhs) decode'.simps(2)}\\
-  @{thm (lhs) decode'.simps(3)} & $\dn$ & @{thm (rhs) decode'.simps(3)}\\
-  @{thm (lhs) decode'.simps(4)} & $\dn$ & @{thm (rhs) decode'.simps(4)}\\
-  @{thm (lhs) decode'.simps(5)} & $\dn$ & @{thm (rhs) decode'.simps(5)}\\
-  @{thm (lhs) decode'.simps(6)} & $\dn$ & @{thm (rhs) decode'.simps(6)}\\
-  @{thm (lhs) decode'.simps(7)} & $\dn$ & @{thm (rhs) decode'.simps(7)}\\
-  @{thm (lhs) decode'.simps(8)} & $\dn$ & @{thm (rhs) decode'.simps(8)}\\
-  @{thm (lhs) decode'.simps(9)} & $\dn$ & @{thm (rhs) decode'.simps(9)}\\
-  @{thm (lhs) decode'.simps(10)} & $\dn$ & @{thm (rhs) decode'.simps(10)} & fix
-  \end{tabular}
-  \end{center}
+
 
 
   The idea of the bitcodes is to annotate them to regular expressions and generate values

--- a/thys2/Paper/document/root.tex	Sat Feb 05 18:24:37 2022 +0000
+++ b/thys2/Paper/document/root.tex	Sun Feb 06 00:02:04 2022 +0000
@@ -34,6 +34,23 @@
 \renewcommand{\isasymin}{\ensuremath{\,\in\,}}
 
 
+\def\lexer{\mathit{lexer}}
+\def\mkeps{\mathit{mkeps}}
+\def\inj{\mathit{inj}}
+\def\Empty{\mathit{Empty}}
+\def\Left{\mathit{Left}}
+\def\Right{\mathit{Right}}
+\def\Stars{\mathit{Stars}}
+\def\Char{\mathit{Char}}
+\def\Seq{\mathit{Seq}}
+\def\Der{\mathit{Der}}
+\def\nullable{\mathit{nullable}}
+\def\Z{\mathit{Z}}
+\def\S{\mathit{S}}
+\newcommand{\ZERO}{\mbox{\bf 0}}
+\newcommand{\ONE}{\mbox{\bf 1}}
+
+
 \def\Brz{Brzozowski}
 \def\der{\backslash}
 \newtheorem{falsehood}{Falsehood}

--- a/thys2/SizeBound4.thy	Sat Feb 05 18:24:37 2022 +0000
+++ b/thys2/SizeBound4.thy	Sun Feb 06 00:02:04 2022 +0000
@@ -206,7 +206,7 @@
      (if bnullable r1
       then AALT bs (ASEQ [] (bder c r1) r2) (fuse (bmkeps r1) (bder c r2))
       else ASEQ bs (bder c r1) r2)"
-| "bder c (ASTAR bs r) = ASEQ bs (fuse [Z] (bder c r)) (ASTAR [] r)"
+| "bder c (ASTAR bs r) = ASEQ (bs @ [Z]) (bder c r) (ASTAR [] r)"
 
 
 fun 
@@ -566,8 +566,10 @@
 | ss3:  "r1 \<leadsto> r2 \<Longrightarrow> (r1 # rs) s\<leadsto> (r2 # rs)"
 | ss4:  "(AZERO # rs) s\<leadsto> rs"
 | ss5:  "(AALTs bs1 rs1 # rsb) s\<leadsto> ((map (fuse bs1) rs1) @ rsb)"
-| ss6:  "erase a1 = erase a2 \<Longrightarrow> (rsa@[a1]@rsb@[a2]@rsc) s\<leadsto> (rsa@[a1]@rsb@rsc)"
-
+| ss6:  "L(erase a2) \<subseteq> L(erase a1) \<Longrightarrow> (rsa@[a1]@rsb@[a2]@rsc) s\<leadsto> (rsa@[a1]@rsb@rsc)"
+(*| extra: "bnullable r1 \<Longrightarrow> ASEQ bs0 (ASEQ bs1 r1 r2) r3 \<leadsto>
+            ASEQ (bs0 @ bs1) r1 (ASEQ [] r2 r3)"
+*)
 
 inductive 
   rrewrites:: "arexp \<Rightarrow> arexp \<Rightarrow> bool" ("_ \<leadsto>* _" [100, 100] 100)
@@ -586,6 +588,10 @@
   shows "r1 \<leadsto> r2 \<Longrightarrow> r1 \<leadsto>* r2"
   using rrewrites.intros(1) rrewrites.intros(2) by blast
 
+lemma rs_in_rstar: 
+  shows "r1 s\<leadsto> r2 \<Longrightarrow> r1 s\<leadsto>* r2"
+  using rrewrites.intros(1) rrewrites.intros(2) by blast
+
 lemma rrewrites_trans[trans]: 
   assumes a1: "r1 \<leadsto>* r2"  and a2: "r2 \<leadsto>* r3"
   shows "r1 \<leadsto>* r3"
@@ -674,10 +680,19 @@
   shows "(rs1 @ rs2) s\<leadsto>* (rs1 @ distinctBy rs2 erase (set (map erase rs1)))"
   apply(induct rs2 arbitrary: rs1)
    apply(auto)
-  apply (smt (verit, best) append.assoc append.right_neutral append_Cons append_Nil split_list srewrite2(2) srewrites_trans ss6)
+  prefer 2
   apply(drule_tac x="rs1 @ [a]" in meta_spec)
+   apply(simp)
+  apply(drule_tac x="rs1" in meta_spec)
+  apply(subgoal_tac "(rs1 @ a # rs2) s\<leadsto>* (rs1 @ rs2)")
+  using srewrites_trans apply blast
+   apply(subgoal_tac "\<exists>rs1a rs1b. rs1 = rs1a @ [x] @ rs1b")
+    prefer 2
+  apply (simp add: split_list)
+  apply(erule exE)+
   apply(simp)
-  done
+  using ss6[simplified]
+  using rs_in_rstar by force
 
 lemma ss6_stronger:
   shows "rs1 s\<leadsto>* distinctBy rs1 erase {}"
@@ -710,12 +725,17 @@
   finally show "map (fuse bs) (AALTs bs1 rs1 # rsb) s\<leadsto> map (fuse bs) (map (fuse bs1) rs1 @ rsb)"
     by (simp add: comp_def fuse_append)
 next
-  case (ss6 a1 a2 rsa rsb rsc)
+  case (ss6 a2 a1 rsa rsb rsc)
+  have "L (erase a2) \<subseteq> L (erase a1)" by fact
   then show "map (fuse bs) (rsa @ [a1] @ rsb @ [a2] @ rsc) s\<leadsto> map (fuse bs) (rsa @ [a1] @ rsb @ rsc)"
     apply(simp)
     apply(rule rrewrite_srewrite.ss6[simplified])
     apply(simp add: erase_fuse)
     done
+(*next 
+  case (extra bs0 bs1 r1 r2 r3)
+  then show ?case
+    by (metis append_assoc fuse.simps(5) rrewrite_srewrite.extra)*)
 qed (auto intro: rrewrite_srewrite.intros)
 
 lemma rewrites_fuse:  
@@ -787,8 +807,9 @@
   and "rs1 s\<leadsto> rs2 \<Longrightarrow> bnullables rs1 = bnullables rs2" 
   apply(induct rule: rrewrite_srewrite.inducts)
   apply(auto simp add:  bnullable_fuse)
-  apply (meson UnCI bnullable_fuse imageI)
-  by (metis bnullable_correctness)
+   apply (meson UnCI bnullable_fuse imageI)
+  
+  using bnullable_correctness nullable_correctness by blast
 
 
 lemma rewrites_bnullable_eq: 
@@ -824,11 +845,22 @@
   then show "bmkepss (AALTs bs1 rs1 # rsb) = bmkepss (map (fuse bs1) rs1 @ rsb)"
     by (simp add: bmkepss1 bmkepss2 bmkepss_fuse bnullable_fuse)
 next
-  case (ss6 a1 a2 rsa rsb rsc)
-  have as1: "erase a1 = erase a2" by fact
+  case (ss6 a2 a1 rsa rsb rsc)
+  have as1: "L(erase a2) \<subseteq> L(erase a1)" by fact
   have as3: "bnullables (rsa @ [a1] @ rsb @ [a2] @ rsc)" by fact
   show "bmkepss (rsa @ [a1] @ rsb @ [a2] @ rsc) = bmkepss (rsa @ [a1] @ rsb @ rsc)" using as1 as3
-    by (smt (verit, best) append_Cons bmkeps.simps(3) bmkepss.simps(2) bmkepss1 bmkepss2 bnullable_correctness)
+    by (smt (verit, ccfv_SIG) append_Cons bmkepss.simps(2) bmkepss1 bmkepss2 bnullable_correctness nullable_correctness subset_eq)
+(*next 
+  case (extra bs0 bs1 r1 bs2 r2 bs4 bs3)
+  then show ?case 
+    apply(subst bmkeps.simps)
+    apply(subst bmkeps.simps)
+    apply(subst bmkeps.simps)
+    apply(subst bmkeps.simps)
+    apply(subst bmkeps.simps)
+    apply(subst bmkeps.simps)
+    apply(simp)
+    done*)
 qed (auto)
 
 lemma rewrites_bmkeps: 
@@ -908,6 +940,11 @@
   apply(simp_all add: bder_fuse)
   done
 
+lemma map_single:
+  shows "map f [x] = [f x]"
+  by simp
+
+
 lemma rewrite_preserves_bder: 
   shows "r1 \<leadsto> r2 \<Longrightarrow> bder c r1 \<leadsto>* bder c r2"
   and   "rs1 s\<leadsto> rs2 \<Longrightarrow> map (bder c) rs1 s\<leadsto>* map (bder c) rs2"
@@ -977,20 +1014,39 @@
     apply(simp)
     using bder_fuse_list map_map rrewrite_srewrite.ss5 srewrites.simps by blast
 next
-  case (ss6 a1 a2 bs rsa rsb)
-  have as: "erase a1 = erase a2" by fact
+  case (ss6 a2 a1 bs rsa rsb)
+  have as: "L(erase a2) \<subseteq> L(erase a1)" by fact
   show "map (bder c) (bs @ [a1] @ rsa @ [a2] @ rsb) s\<leadsto>* map (bder c) (bs @ [a1] @ rsa @ rsb)"
     apply(simp only: map_append)
-    by (smt (verit, best) erase_bder list.simps(8) list.simps(9) as rrewrite_srewrite.ss6 srewrites.simps)
+    apply(simp only: map_single)
+    apply(rule rs_in_rstar)
+    thm rrewrite_srewrite.intros
+    apply(rule rrewrite_srewrite.ss6)
+    using as
+    apply(auto simp add: der_correctness Der_def)
+    done 
+(*next 
+  case (extra bs0 bs1 r1 r2 r3)
+  then show ?case
+    apply(auto simp add: comp_def)
+    
+       defer
+    using r_in_rstar rrewrite_srewrite.extra apply presburger
+      prefer 2
+    using r_in_rstar rrewrite_srewrite.extra apply presburger
+     prefer 2
+    sorry*)
 qed
 
+
+
 lemma rewrites_preserves_bder: 
   assumes "r1 \<leadsto>* r2"
   shows "bder c r1 \<leadsto>* bder c r2"
 using assms  
 apply(induction r1 r2 rule: rrewrites.induct)
 apply(simp_all add: rewrite_preserves_bder rrewrites_trans)
-done
+  done
 
 
 lemma central:  
@@ -1034,6 +1090,76 @@
 
 (* some tests *)
 
+definition
+  bders_simp_Set :: "string set \<Rightarrow> arexp \<Rightarrow> arexp set"
+where
+  "bders_simp_Set A r \<equiv> bders_simp r ` A"
+
+lemma pders_Set_union:
+  shows "bders_simp_Set (A \<union> B) r = (bders_simp_Set A r \<union> bders_simp_Set B r)"
+  apply (simp add: bders_simp_Set_def)
+  by (simp add: image_Un)
+  
+lemma pders_Set_subset:
+  shows "A \<subseteq> B \<Longrightarrow> bders_simp_Set A r \<subseteq> bders_simp_Set B r"
+  apply (auto simp add: bders_simp_Set_def)
+  done
+
+
+lemma AZERO_r:
+  "bders_simp AZERO s = AZERO"
+  apply(induct s)
+   apply(auto)
+  done
+
+lemma bders_simp_Set_ZERO [simp]:
+  shows "bders_simp_Set UNIV1 AZERO \<subseteq> {AZERO}"
+  unfolding UNIV1_def bders_simp_Set_def 
+  apply(auto)
+  using AZERO_r by blast
+
+lemma AONE_r:
+  shows "bders_simp (AONE bs) s = AZERO \<or> bders_simp (AONE bs) s = AONE bs"
+  apply(induct s)
+   apply(auto)
+  using AZERO_r apply blast
+  using AZERO_r by blast
+
+lemma bders_simp_Set_ONE [simp]:
+  shows "bders_simp_Set UNIV1 (AONE bs) \<subseteq> {AZERO, AONE bs}"
+  unfolding UNIV1_def bders_simp_Set_def 
+  apply(auto split: if_splits)
+  using AONE_r by blast
+
+lemma ACHAR_r:
+  shows "bders_simp (ACHAR bs c) s = AZERO \<or> 
+         bders_simp (ACHAR bs c) s = AONE bs \<or>
+         bders_simp (ACHAR bs c) s = ACHAR bs c"
+  apply(induct s)
+   apply(auto)
+  using AONE_r apply blast
+  using AZERO_r apply force
+  using AONE_r apply blast
+  using AZERO_r apply blast
+  using AONE_r apply blast
+  using AZERO_r by blast
+
+lemma bders_simp_Set_CHAR [simp]:
+  shows "bders_simp_Set UNIV1 (ACHAR bs c) \<subseteq> {AZERO, AONE bs, ACHAR bs c}"
+unfolding UNIV1_def bders_simp_Set_def
+  apply(auto)
+  using ACHAR_r by blast
+  
+lemma bders_simp_Set_ALT [simp]:
+  shows "bders_simp_Set UNIV1 (AALT bs r1 r2) = bders_simp_Set UNIV1 r1 \<union> bders_simp_Set UNIV1 r2"
+  unfolding UNIV1_def bders_simp_Set_def
+  apply(auto)
+  oops
+
+
+
+
+(*
 lemma asize_fuse:
   shows "asize (fuse bs r) = asize r"
   apply(induct r arbitrary: bs)
@@ -1372,11 +1498,13 @@
   using finite_pder apply blast
   oops
   
-
+*)
 
 
 (* below is the idempotency of bsimp *)
 
+(*
+
 lemma bsimp_ASEQ_fuse:
   shows "fuse bs1 (bsimp_ASEQ bs2 r1 r2) = bsimp_ASEQ (bs1 @ bs2) r1 r2"
   apply(induct r1 r2 arbitrary: bs1 bs2 rule: bsimp_ASEQ.induct)
@@ -1466,6 +1594,8 @@
         apply auto
 
   oops
+*)
+
 
 (*
 AALTs [] [AZERO, AALTs(bs1, [a, b]) ] 
@@ -1475,24 +1605,19 @@
 
 *)
 
-lemma normal_bsimp: 
-  shows "\<nexists>r'. bsimp r \<leadsto> r'"
-  oops
 
   (*r' size bsimp r > size r' 
     r' \<leadsto>* bsimp bsimp r
 size bsimp r > size r' \<ge> size bsimp bsimp r*)
 
-export_code blexer_simp blexer lexer bders bders_simp in Scala module_name VerifiedLexers
-
 
 unused_thms
 
-
+(*
 inductive aggressive:: "arexp \<Rightarrow> arexp \<Rightarrow> bool" ("_ \<leadsto>? _" [99, 99] 99)
   where
  "ASEQ bs (AALTs bs1 rs) r \<leadsto>? AALTs (bs@bs1) (map (\<lambda>r'. ASEQ [] r' r) rs) "
-
+*)
 
 
 end

Binary file thys2/paper.pdf has changed

author	Christian Urban <christian.urban@kcl.ac.uk>
	Sun, 06 Feb 2022 00:02:04 +0000
changeset 416	57182b36ec01
parent 415	5c96fe5306a7
child 417	a2887a9e8539
child 418	41a2a3b63853

progs/scala/re-bit2.scala		file \| annotate \| diff \| comparison \| revisions
thys2/Paper/Paper.thy		file \| annotate \| diff \| comparison \| revisions
thys2/Paper/document/root.tex		file \| annotate \| diff \| comparison \| revisions
thys2/SizeBound4.thy		file \| annotate \| diff \| comparison \| revisions
thys2/paper.pdf		file \| annotate \| diff \| comparison \| revisions