--- a/progs/scala/re-bit2.scala Wed Mar 02 23:13:59 2022 +0000
+++ b/progs/scala/re-bit2.scala Wed Mar 02 23:23:19 2022 +0000
@@ -1,3 +1,7 @@
+
+
+
+
import scala.language.implicitConversions
import scala.language.reflectiveCalls
import scala.annotation.tailrec
@@ -225,71 +229,11 @@
case r1 :: rs2 => r1 :: flats(rs2)
}
-def projectFirstChild(rs: List[ARexp]) : List[ARexp] = rs match {
- case (ASEQ(bs, r1, r2p)::rs1) => r1::projectFirstChild(rs1)
- case Nil => Nil
- }
-
-
-
-def distinctBy2(xs: List[ARexp], acc: List[Rexp] = Nil): List[ARexp] = xs match {
- case Nil => Nil
- case (x::xs) => {
- val res = erase(x)
- if(acc.contains(res))
- distinctBy2(xs, acc)
- else
- x match {
- case ASEQ(bs0, AALTS(bs1, rs), r2) =>
- val newTerms = distinctBy2(rs.map(r1 => ASEQ(Nil, r1, r2)), acc)
- val rsPrime = projectFirstChild(newTerms)
- newTerms match {
- case Nil => distinctBy2(xs, acc)
- case t::Nil => ASEQ(bs0, fuse(bs1, rsPrime.head), r2)::distinctBy2(xs, erase(t)::acc)
- case ts => ASEQ(bs0, AALTS(bs1, rsPrime), r2)::distinctBy2(xs, newTerms.map(erase(_)):::acc)
- }
- case x => x::distinctBy2(xs, res::acc)
- }
- }
- }
-
-/*
-def strongBsimp(r: ARexp): ARexp =
- {
- r match {
- case ASEQ(bs1, r1, r2) => (strongBsimp(r1), strongBsimp(r2)) match {
- case (AZERO, _) => AZERO
- case (_, AZERO) => AZERO
- case (AONE(bs2), r2s) => fuse(bs1 ++ bs2, r2s)
- case (r1s, r2s) => ASEQ(bs1, r1s, r2s)
- }
- case AALTS(bs1, rs) => {
- val rs_simp = rs.map(strongBsimp(_))
- val flat_res = flats(rs_simp)
- val dist_res = distinctBy2(flat_res)//distinctBy(flat_res, erase)
- dist_res match {
- case Nil => AZERO
- case s :: Nil => fuse(bs1, s)
- case rs => AALTS(bs1, rs)
- }
-
- }
- case r => r
- }
- }
-*/
-
def bsimp(r: ARexp): ARexp = r match {
case ASEQ(bs1, r1, r2) => (bsimp(r1), bsimp(r2)) match {
case (AZERO, _) => AZERO
case (_, AZERO) => AZERO
case (AONE(bs2), r2s) => fuse(bs1 ++ bs2, r2s)
- //case (AALTS(bs, rs), r) =>
- // AALTS(Nil, rs.map(ASEQ(bs1 ++ bs, _, r)))
- //case (ASEQ(bs2, r1, ASTAR(bs3, r2)), ASTAR(bs4, r3)) if erase(r2) == erase(r3) =>
- // ASEQ(bs1 ++ bs2, r1, ASTAR(bs3, r2))
- //case (ASEQ(bs2, r1, r2), r3) =>
- // ASEQ(bs1 ++ bs2, r1, ASEQ(Nil, r2, r3))
case (r1s, r2s) => ASEQ(bs1, r1s, r2s)
}
case AALTS(bs1, rs) => (flts(rs.map(bsimp))).distinctBy(erase) match {
@@ -297,7 +241,6 @@
case r::Nil => fuse(bs1, r)
case rs => AALTS(bs1, rs)
}
- //case (ASTAR(bs1, ASTAR(bs2, r))) => bsimp(ASTAR(bs1 ++ bs2, r))
case r => r
}
@@ -310,12 +253,101 @@
def blex_simp(r: ARexp, s: List[Char]) : Bits = s match {
case Nil => if (bnullable(r)) bmkeps(r)
else throw new Exception("Not matched")
- case c::cs => blex(bsimp(bder(c, r)), cs)
+ case c::cs => blex_simp(bsimp(bder(c, r)), cs)
}
def blexing_simp(r: Rexp, s: String) : Val =
decode(r, blex_simp(internalise(r), s.toList))
+//////////////////////
+// new simplification
+
+// collects the first components of those sequences in rs whose second component equals r
+def coll(r: Rexp, rs: List[Rexp]) : List[Rexp] = rs match {
+ case Nil => Nil
+ case SEQ(r1, r2) :: rs =>
+ if (r == r2) r1 :: coll(r, rs) else coll(r, rs)
+ case r1 :: rs => coll(r, rs)
+}
+
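+// smart constructor for SEQ that simplifies according to the first component only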
+def bsimp_ASEQ1(bs: Bits, r1: ARexp, r2: ARexp) : ARexp = r1 match {
+ case AZERO => AZERO
+ case AONE(bs1) => fuse(bs ::: bs1, r2)
+ case _ => ASEQ(bs, r1, r2)
+}
+
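+// smart constructor for ALTs: an empty list gives AZERO, a singleton is fused with bs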
+def bsimp_AALTs(bs: Bits, rs: List[ARexp]) : ARexp = rs match {
+ case Nil => AZERO
+ case r::Nil => fuse(bs, r)
+ case _ => AALTS(bs, rs)
+}
+
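+// prunes an annotated regular expression against the list all of (erased) terms
+// that should be kept: parts whose erased form no longer occurs in all are
+// replaced by AZERO (and then removed by the smart constructors)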
+def prune(r: ARexp, all: List[Rexp]) : ARexp = r match {
+ case ASEQ(bs, r1, r2) => {
+ val termsTruncated = coll(erase(r2), all)
+ val pruned = prune(r1, termsTruncated)
+ bsimp_ASEQ1(bs, pruned, r2)
+ }
+ case AALTS(bs, rs) => {
+ val rsp = rs.map(prune(_, all)).filter(_ != AZERO)
+ bsimp_AALTs(bs, rsp)
+ }
+ case r =>
+ if (all.contains(erase(r))) r else AZERO
+}
+
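+// removes a ONE in the left-most position of a sequence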
+def oneSimp(r: Rexp) : Rexp = r match {
+ case SEQ(ONE, r) => r
+ case SEQ(r1, r2) => SEQ(oneSimp(r1), r2)
+ case r => r
+}
+
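+// splits a regular expression into a list of terms: alternatives at the top
+// level, or inside the first component of a sequence, are pulled apart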
+def breakup(r: Rexp) : List[Rexp] = r match {
+ case SEQ(r1, r2) => breakup(r1).map(SEQ(_, r2))
+ case ALT(r1, r2) => breakup(r1) ::: breakup(r2)
+ case _ => r::Nil
+}
+
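+// the terms contributed by r that are not yet in the accumulator acc (compared up to oneSimp)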
+def addToAcc(r: ARexp, acc: List[Rexp]) : List[Rexp] =
+ breakup(erase(r)).filterNot(r => acc.contains(oneSimp(r)))
+
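+// strengthened version of distinctBy: removes duplicates (up to erase) and
+// additionally prunes each regular expression against the terms already seen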
+def dBStrong(rs: List[ARexp], acc: List[Rexp]) : List[ARexp] = rs match {
+  case Nil => Nil
+  case r::rs =>
+    if (acc.contains(erase(r))) dBStrong(rs, acc)
+    else {
+      val newTerms = addToAcc(r, acc)
+      prune(r, newTerms) match {
+        case AZERO => dBStrong(rs, newTerms ::: acc)
+        case r1 => r1 :: dBStrong(rs, newTerms ::: acc)
+      }
+    }
+}
+
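+// smart constructor for SEQ that simplifies according to both components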
+def bsimp_ASEQ(bs: Bits, r1: ARexp, r2: ARexp) : ARexp = (r1, r2) match {
+ case (AZERO, _) => AZERO
+ case (_, AZERO) => AZERO
+ case (AONE(bs1), r2) => fuse(bs ::: bs1, r2)
+ case _ => ASEQ(bs, r1, r2)
+}
+
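+// the new simplification function: like bsimp, but uses dBStrong instead of distinctBy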
+def bsimp2(r: ARexp): ARexp = r match {
+ case ASEQ(bs1, r1, r2) => bsimp_ASEQ(bs1, bsimp2(r1), bsimp2(r2))
+ case AALTS(bs1, rs) => bsimp_AALTs(bs1, dBStrong(flts(rs.map(bsimp2(_))), Nil))
+ case r => r
+}
+
+def bders_simp2(r: ARexp, s: List[Char]) : ARexp = s match {
+ case Nil => r
+ case c::cs => bders_simp2(bsimp2(bder(c, r)), cs)
+}
+
+
+def blex_simp2(r: ARexp, s: List[Char]) : Bits = s match {
+ case Nil => if (bnullable(r)) bmkeps(r)
+ else throw new Exception("Not matched")
+ case c::cs => blex_simp2(bsimp2(bder(c, r)), cs)
+}
+
+def blexing_simp2(r: Rexp, s: String) : Val =
+ decode(r, blex_simp2(internalise(r), s.toList))
+
//println(blexing_simp(reg, "aab"))
@@ -377,6 +409,9 @@
case STAR(r1) => 1 + size(r1)
}
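+// sizes of annotated regular expressions and of sets of regular expressions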
+def asize(r: ARexp) : Int = size(erase(r))
+def psize(rs: Set[Rexp]) : Int = rs.map(size(_)).sum
+
def pp(r: ARexp): String = r match {
case ASEQ(_, ACHAR(_, a1),ASEQ(_, r1, r2)) => s"${a1}${pp(r1)}${pp(r2)}"
case ASEQ(_, ACHAR(_, a1),ACHAR(_, a2)) => s"${a1}${a2}"
@@ -389,6 +424,27 @@
}
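+// compares the sizes of the derivatives of (a | aa)* : unsimplified (bders),
+// simplified with bsimp (bders_simp) and with the new bsimp2 (bders_simp2)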
+val TEST = STAR("a" | "aa")
+println(asize(bders(("a" * 0).toList, internalise(TEST))))
+println(asize(bders(("a" * 1).toList, internalise(TEST))))
+println(asize(bders(("a" * 2).toList, internalise(TEST))))
+println(asize(bders(("a" * 3).toList, internalise(TEST))))
+println(asize(bders(("a" * 4).toList, internalise(TEST))))
+
+println(asize(bders_simp(internalise(TEST), ("a" * 0).toList)))
+println(asize(bders_simp(internalise(TEST), ("a" * 1).toList)))
+println(asize(bders_simp(internalise(TEST), ("a" * 2).toList)))
+println(asize(bders_simp(internalise(TEST), ("a" * 3).toList)))
+println(asize(bders_simp(internalise(TEST), ("a" * 4).toList)))
+
+
+println(asize(bders_simp2(internalise(TEST), ("a" * 0).toList)))
+println(asize(bders_simp2(internalise(TEST), ("a" * 1).toList)))
+println(asize(bders_simp2(internalise(TEST), ("a" * 2).toList)))
+println(asize(bders_simp2(internalise(TEST), ("a" * 3).toList)))
+println(asize(bders_simp2(internalise(TEST), ("a" * 4).toList)))
+println(asize(bders_simp2(internalise(TEST), ("a" * 5).toList)))
+
// Some Tests
//============
@@ -419,6 +475,7 @@
println(blexing(STARREG, "a" * 3))
println(blexing_simp(STARREG, "a" * 3))
+println(pders(("a" * 3).toList, Set(STARREG)))
size(STARREG)
@@ -436,6 +493,9 @@
size(erase(bders_simp(internalise(STARREG), ("a" * 102).toList)))
size(erase(bders_simp(internalise(STARREG), ("a" * 103).toList)))
+size(erase(bders_simp2(internalise(STARREG), ("a" * 103).toList)))
+psize(pders(("a" * 103).toList, Set(STARREG)))
+
println(bders_simp(internalise(STARREG), ("a" * 1).toList))
println(bders_simp(internalise(STARREG), ("a" * 2).toList))
println(bders_simp(internalise(STARREG), ("a" * 3).toList))
@@ -709,3 +769,25 @@
encode(inj(dr, 'a', decode(dr_der, res1)))
*/
+
+
+
+
+
+
+
+/*
+def star(n: Long) = if ((n & 1L) == 1L) "*" else " "
+def stars(n: Long): String = if (n == 0L) "" else star(n) + " " + stars(n >> 1)
+def spaces(n: Int) = " " * n
+
+
+def sierpinski(n: Int) {
+ ((1 << n) - 1 to 0 by -1).foldLeft(1L) {
+ case (bitmap, remainingLines) =>
+ println(spaces(remainingLines) + stars(bitmap))
+ (bitmap << 1) ^ bitmap
+ }
+}
+
+*/
\ No newline at end of file
--- a/thys2/Paper/Paper.thy Wed Mar 02 23:13:59 2022 +0000
+++ b/thys2/Paper/Paper.thy Wed Mar 02 23:23:19 2022 +0000
@@ -675,7 +675,7 @@
transforms a bitcoded regular expression into a (standard) regular
expression by just erasing the annotated bitsequences. We omit the
straightforward definition. For defining the algorithm, we also need
- the functions \textit{bnullable} and \textit{bmkeps}, which are the
+ the functions \textit{bnullable}, \textit{bmkeps} and \textit{bmkepss}, which are the
``lifted'' versions of \textit{nullable} and \textit{mkeps} acting on
bitcoded regular expressions.
%
@@ -695,14 +695,16 @@
&
\begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
$\textit{bmkeps}\,(\textit{ONE}\,bs)$ & $\dn$ & $bs$\\
- $\textit{bmkeps}\,(\textit{ALTs}\,bs\,r\!::\!\rs)$ & $\dn$ &
- $\textit{if}\;\textit{bnullable}\,r$\\
- & &$\textit{then}\;bs\,@\,\textit{bmkeps}\,r$\\
- & &$\textit{else}\;bs\,@\,\textit{bmkeps}\,\rs$\\
+ $\textit{bmkeps}\,(\textit{ALTs}\,bs\,\rs)$ & $\dn$ &
+ $bs\,@\,\textit{bmkepss}\,\rs$\\
$\textit{bmkeps}\,(\textit{SEQ}\,bs\,r_1\,r_2)$ & $\dn$ &\\
\multicolumn{3}{r}{$bs \,@\,\textit{bmkeps}\,r_1\,@\, \textit{bmkeps}\,r_2$}\\
$\textit{bmkeps}\,(\textit{STAR}\,bs\,r)$ & $\dn$ &
- $bs \,@\, [\S]$
+ $bs \,@\, [\S]$\\
+ $\textit{bmkepss}\,(r\!::\!\rs)$ & $\dn$ &
+ $\textit{if}\;\textit{bnullable}\,r$\\
+ & &$\textit{then}\;\textit{bmkeps}\,r$\\
+ & &$\textit{else}\;\textit{bmkepss}\,\rs$
\end{tabular}
\end{tabular}
\end{center}
@@ -1040,7 +1042,7 @@
\noindent where we scan the list from left to right (because we
have to remove later copies). In @{text distinctBy}, @{text f} is a
function and @{text acc} is an accumulator for regular
- expressions---essentially a set of regular expression that we have already seen
+ expressions---essentially a set of regular expressions that we have already seen
while scanning the list. Therefore we delete an element, say @{text x},
from the list provided @{text "f x"} is already in the accumulator;
otherwise we keep @{text x} and scan the rest of the list but
@@ -1182,12 +1184,12 @@
simplified regular expressions in small steps (unlike the @{text bsimp}-function which
does the same in a big step), and show that each of
the small steps preserves the bitcodes that lead to the final POSIX value.
- The rewrite system is organised such that $\leadsto$ is for bitcode regular
+ The rewrite system is organised such that $\leadsto$ is for bitcoded regular
expressions and $\stackrel{s}{\leadsto}$ for lists of bitcoded regular
expressions. The former essentially implements the simplifications of
@{text "bsimpSEQ"} and @{text flts}; while the latter implements the
simplifications in @{text "bsimpALTs"}. We can show that any bitcoded
- regular expression reduces in one or more steps to the simplified
+ regular expression reduces in zero or more steps to the simplified
regular expression generated by @{text bsimp}:
\begin{lemma}\label{lemone}
@@ -1317,9 +1319,9 @@
text {*
-In this section let us sketch our argument on why the size of the simplified
+In this section let us sketch our argument for why the size of the simplified
derivatives with the aggressive simplification function is finite. Suppose
-we have a size functions for bitcoded regular expressions, written
+we have a size function for bitcoded regular expressions, written
@{text "|r|"}, which counts the number of nodes if we regard $r$ as a tree
(we omit the precise definition). For this we show that for every $r$
there exists a bound $N$
@@ -1418,7 +1420,7 @@
\cite[Page 14]{Sulzmann2014}.
Given the growth of the
derivatives in some cases even after aggressive simplification, this
- is a hard to believe fact. A similar claim of about a theoretical runtime
+ is a hard-to-believe fact. A similar claim about a theoretical runtime
of @{text "O(n\<^sup>2)"} is made for the Verbatim lexer, which calculates POSIX matches and is based on
derivatives \cite{verbatim}. In this case derivatives are not simplified.
Clearly our result of having finite