# HG changeset patch # User Christian Urban # Date 1644105724 0 # Node ID 57182b36ec0193d8aa91f696e6169836bf971043 # Parent 5c96fe5306a7bfd54829b93203ebdc8f2753cd8a more with the paper diff -r 5c96fe5306a7 -r 57182b36ec01 progs/scala/re-bit2.scala --- a/progs/scala/re-bit2.scala Sat Feb 05 18:24:37 2022 +0000 +++ b/progs/scala/re-bit2.scala Sun Feb 06 00:02:04 2022 +0000 @@ -73,6 +73,7 @@ case AALTS(bs, r::Nil) => erase(r) case AALTS(bs, r::rs) => ALT(erase(r), erase(AALTS(bs, rs))) case ASEQ(bs, r1, r2) => SEQ (erase(r1), erase(r2)) + case ASTAR(cs, ASTAR(ds, r))=> STAR(erase(r)) case ASTAR(cs, r)=> STAR(erase(r)) } @@ -217,13 +218,78 @@ } } +def flats(rs: List[ARexp]): List[ARexp] = rs match { + case Nil => Nil + case AZERO :: rs1 => flats(rs1) + case AALTS(bs, rs1) :: rs2 => rs1.map(fuse(bs, _)) ::: flats(rs2) + case r1 :: rs2 => r1 :: flats(rs2) + } + +def projectFirstChild(rs: List[ARexp]) : List[ARexp] = rs match { + case (ASEQ(bs, r1, r2p)::rs1) => r1::projectFirstChild(rs1) + case Nil => Nil + } + + + +def distinctBy2(xs: List[ARexp], acc: List[Rexp] = Nil): List[ARexp] = xs match { + case Nil => Nil + case (x::xs) => { + val res = erase(x) + if(acc.contains(res)) + distinctBy2(xs, acc) + else + x match { + case ASEQ(bs0, AALTS(bs1, rs), r2) => + val newTerms = distinctBy2(rs.map(r1 => ASEQ(Nil, r1, r2)), acc) + val rsPrime = projectFirstChild(newTerms) + newTerms match { + case Nil => distinctBy2(xs, acc) + case t::Nil => ASEQ(bs0, fuse(bs1, rsPrime.head), r2)::distinctBy2(xs, erase(t)::acc) + case ts => ASEQ(bs0, AALTS(bs1, rsPrime), r2)::distinctBy2(xs, newTerms.map(erase(_)):::acc) + } + case x => x::distinctBy2(xs, res::acc) + } + } + } + +/* +def strongBsimp(r: ARexp): ARexp = + { + r match { + case ASEQ(bs1, r1, r2) => (strongBsimp(r1), strongBsimp(r2)) match { + case (AZERO, _) => AZERO + case (_, AZERO) => AZERO + case (AONE(bs2), r2s) => fuse(bs1 ++ bs2, r2s) + case (r1s, r2s) => ASEQ(bs1, r1s, r2s) + } + case AALTS(bs1, rs) => { + val rs_simp = rs.map(strongBsimp(_)) + val flat_res = flats(rs_simp) + val dist_res = distinctBy2(flat_res)//distinctBy(flat_res, erase) + dist_res match { + case Nil => AZERO + case s :: Nil => fuse(bs1, s) + case rs => AALTS(bs1, rs) + } + + } + case r => r + } + } +*/ def bsimp(r: ARexp): ARexp = r match { case ASEQ(bs1, r1, r2) => (bsimp(r1), bsimp(r2)) match { case (AZERO, _) => AZERO case (_, AZERO) => AZERO case (AONE(bs2), r2s) => fuse(bs1 ++ bs2, r2s) - //case (AALTS(bs, rs), r2) => AALTS(bs, rs.map(ASEQ(Nil, _, r2))) + //case (AALTS(bs, rs), r) => + // AALTS(Nil, rs.map(ASEQ(bs1 ++ bs, _, r))) + //case (ASEQ(bs2, r1, ASTAR(bs3, r2)), ASTAR(bs4, r3)) if erase(r2) == erase(r3) => + // ASEQ(bs1 ++ bs2, r1, ASTAR(bs3, r2)) + //case (ASEQ(bs2, r1, r2), r3) => + // ASEQ(bs1 ++ bs2, r1, ASEQ(Nil, r2, r3)) case (r1s, r2s) => ASEQ(bs1, r1s, r2s) } case AALTS(bs1, rs) => (flts(rs.map(bsimp))).distinctBy(erase) match { @@ -231,9 +297,16 @@ case r::Nil => fuse(bs1, r) case rs => AALTS(bs1, rs) } + //case (ASTAR(bs1, ASTAR(bs2, r))) => bsimp(ASTAR(bs1 ++ bs2, r)) case r => r } +def bders_simp(r: ARexp, s: List[Char]) : ARexp = s match { + case Nil => r + case c::cs => bders_simp(bsimp(bder(c, r)), cs) +} + + def blex_simp(r: ARexp, s: List[Char]) : Bits = s match { case Nil => if (bnullable(r)) bmkeps(r) else throw new Exception("Not matched") @@ -266,8 +339,151 @@ case Stars(vs) => vs.flatMap(env) } +def nullable(r: Rexp) : Boolean = r match { + case ZERO => false + case ONE => true + case CHAR(_) => false + case ALT(r1, r2) => nullable(r1) || nullable(r2) + case SEQ(r1, r2) => nullable(r1) && nullable(r2) + case STAR(_) => true +} + + +def pder(c: Char, r: Rexp) : Set[Rexp] = r match { + case ZERO => Set() + case ONE => Set() + case CHAR(d) => if (c == d) Set(ONE) else Set() + case ALT(r1, r2) => pder(c, r1) ++ pder(c, r2) + case SEQ(r1, r2) => { + (for (pr1 <- pder(c, r1)) yield SEQ(pr1, r2)) ++ + (if (nullable(r1)) pder(c, r2) else Set()) + } + case STAR(r1) => { + for (pr1 <- pder(c, r1)) yield SEQ(pr1, STAR(r1)) + } +} + +def pders(s: List[Char], rs: Set[Rexp]) : Set[Rexp] = s match { + case Nil => rs + case c::s => pders(s, rs.flatMap(pder(c, _))) +} + +def size(r: Rexp): Int = r match { + case ZERO => 1 + case ONE => 1 + case CHAR(_) => 1 + case ALT(r1, r2) => 1 + size(r1) + size (r2) + case SEQ(r1, r2) => 1 + size(r1) + size (r2) + case STAR(r1) => 1 + size(r1) +} + +def pp(r: ARexp): String = r match { + case ASEQ(_, ACHAR(_, a1),ASEQ(_, r1, r2)) => s"${a1}${pp(r1)}${pp(r2)}" + case ASEQ(_, ACHAR(_, a1),ACHAR(_, a2)) => s"${a1}${a2}" + case AZERO => "0" + case AONE(_) => "1" + case ACHAR(_, a) => a.toString + case AALTS(_, rs) => s"ALTs(${rs.map(pp(_)).mkString(",")})" + case ASEQ(_, r1, r2) => s"SEQ(${pp(r1)}, ${pp(r2)})" + case ASTAR(_, r1) => s"(${pp(r1)})*" +} + + + // Some Tests //============ +val ST = STAR(STAR("a")) +println(pp(internalise(ST))) +println(bders(("a" * 1).toList, internalise(ST))) +println(bders_simp(internalise(ST), ("a" * 1).toList)) +println(pp(bders(("a" * 1).toList, internalise(ST)))) +println(pp(bders_simp(internalise(ST), ("a" * 1).toList))) + + + +println(pp(bders_simp(internalise(ST), ("a" * 1).toList))) +println(pp(bders_simp(internalise(ST), ("a" * 2).toList))) +println(pp(bders_simp(internalise(ST), ("a" * 3).toList))) +println(pp(bders_simp(internalise(ST), ("a" * 4).toList))) + +println(blexing(ST, "a" * 1)) +println(blexing_simp(ST, "a" * 1)) +println(blexing(ST, "a" * 2)) +println(blexing_simp(ST, "a" * 2)) +println(blexing(ST, "a" * 3)) +println(blexing_simp(ST, "a" * 3)) +println(blexing(ST, "a" * 4)) +println(blexing_simp(ST, "a" * 4)) + +val STARREG = ((STAR("a") | STAR("aaa")) | STAR("aaaaa")).% + +println(blexing(STARREG, "a" * 3)) +println(blexing_simp(STARREG, "a" * 3)) + + +size(STARREG) +size(erase(bders_simp(internalise(STARREG), ("a" * 1).toList))) +size(erase(bders_simp(internalise(STARREG), ("a" * 2).toList))) +size(erase(bders_simp(internalise(STARREG), ("a" * 3).toList))) +size(erase(bders_simp(internalise(STARREG), ("a" * 4).toList))) +size(erase(bders_simp(internalise(STARREG), ("a" * 5).toList))) +size(erase(bders_simp(internalise(STARREG), ("a" * 6).toList))) +size(erase(bders_simp(internalise(STARREG), ("a" * 7).toList))) +size(erase(bders_simp(internalise(STARREG), ("a" * 8).toList))) +size(erase(bders_simp(internalise(STARREG), ("a" * 9).toList))) +size(erase(bders_simp(internalise(STARREG), ("a" * 100).toList))) +size(erase(bders_simp(internalise(STARREG), ("a" * 101).toList))) +size(erase(bders_simp(internalise(STARREG), ("a" * 102).toList))) +size(erase(bders_simp(internalise(STARREG), ("a" * 103).toList))) + +println(bders_simp(internalise(STARREG), ("a" * 1).toList)) +println(bders_simp(internalise(STARREG), ("a" * 2).toList)) +println(bders_simp(internalise(STARREG), ("a" * 3).toList)) +println(bders_simp(internalise(STARREG), ("a" * 4).toList)) + +println(erase(bders_simp(internalise(STARREG), ("a" * 1).toList))) +println(erase(bders_simp(internalise(STARREG), ("a" * 2).toList))) +println(erase(bders_simp(internalise(STARREG), ("a" * 3).toList))) +println(erase(bders_simp(internalise(STARREG), ("a" * 4).toList))) + +println(pp(internalise(STARREG))) +println(pp(bders_simp(internalise(STARREG), ("a" * 1).toList))) +println(pp(bders_simp(internalise(STARREG), ("a" * 2).toList))) +println(pp(bders_simp(internalise(STARREG), ("a" * 3).toList))) +println(pp(bders_simp(internalise(STARREG), ("a" * 4).toList))) + +val STARR = (STAR("a") | STAR("aa") | + STAR("aaa") | STAR("aaaa") | + STAR("aaaaa") | STAR("aaaaaa") | + STAR("aaaaaaa") | STAR("aaaaaaaa")).% + +size(STARR) +size(erase(bders_simp(internalise(STARR), ("a" * 1).toList))) +size(erase(bders_simp(internalise(STARR), ("a" * 2).toList))) +size(erase(bders_simp(internalise(STARR), ("a" * 3).toList))) +size(erase(bders_simp(internalise(STARR), ("a" * 4).toList))) +size(erase(bders_simp(internalise(STARR), ("a" * 5).toList))) +size(erase(bders_simp(internalise(STARR), ("a" * 6).toList))) +size(erase(bders_simp(internalise(STARR), ("a" * 7).toList))) +size(erase(bders_simp(internalise(STARR), ("a" * 8).toList))) +size(erase(bders_simp(internalise(STARR), ("a" * 9).toList))) +size(erase(bders_simp(internalise(STARR), ("a" * 1000).toList))) +size(erase(bders_simp(internalise(STARR), ("a" * 1001).toList))) +size(erase(bders_simp(internalise(STARR), ("a" * 1002).toList))) +size(erase(bders_simp(internalise(STARR), ("a" * 1010).toList))) +size(erase(bders_simp(internalise(STARR), ("a" * 1010 ++ "b").toList))) + +val r0 = ("a" | "ab") ~ ("b" | "") +println(pre_blexing(internalise(r0), "ab")) +println(blexing(r0, "ab")) +println(blexing_simp(r0, "ab")) + +println(pders("a".toList, Set(r0))) +println(pders("ab".toList, Set(r0))) + +val r00 = ("a" ~ ("b" | "")) | ("ab" ~ ("b" | "")) + + /* def time_needed[T](i: Int, code: => T) = { diff -r 5c96fe5306a7 -r 57182b36ec01 thys2/Paper/Paper.thy --- a/thys2/Paper/Paper.thy Sat Feb 05 18:24:37 2022 +0000 +++ b/thys2/Paper/Paper.thy Sun Feb 06 00:02:04 2022 +0000 @@ -247,6 +247,7 @@ text {* + In the second part of their paper \cite{Sulzmann2014}, Sulzmann and Lu describe another algorithm that generates POSIX values but dispences with the second phase where characters are injected ``back'' into values. For this they annotate bitcodes to @@ -254,8 +255,7 @@ \begin{center} \begin{tabular}{lcl} - @{term breg} & $::=$ & @{term "AZERO"}\\ - & $\mid$ & @{term "AONE bs"}\\ + @{term breg} & $::=$ & @{term "AZERO"} $\quad\mid\quad$ @{term "AONE bs"}\\ & $\mid$ & @{term "ACHAR bs c"}\\ & $\mid$ & @{term "AALTs bs rs"}\\ & $\mid$ & @{term "ASEQ bs r\<^sub>1 r\<^sub>2"}\\ @@ -264,46 +264,243 @@ \end{center} \noindent where @{text bs} stands for a bitsequences; @{text r}, - @{text "r\<^sub>1"} and @{text "r\<^sub>2"} for annotated regular - expressions; and @{text rs} for a list of annotated regular - expressions. In contrast to Sulzmann and Lu we generalise the - alternative regular expressions to lists, instead of just having - binary regular expressions. The idea with annotated regular - expressions is to incrementally generate the value information by - recording bitsequences. Sulzmann and Lu then - define a coding function for how values can be coded into bitsequences. + @{text "r\<^sub>1"} and @{text "r\<^sub>2"} for bitcoded regular + expressions; and @{text rs} for lists of bitcoded regular + expressions. The binary alternative @{text "ALT bs r\<^sub>1 r\<^sub>2"} + is just an abbreviation for @{text "ALTs bs [r\<^sub>1, r\<^sub>2]"}. + For bitsequences we just use lists made up of the + constants @{text Z} and @{text S}. The idea with bitcoded regular + expressions is to incrementally generate the value information (for + example @{text Left} and @{text Right}) as bitsequences + as part of the regular expression constructors. + Sulzmann and Lu then define a coding + function for how values can be coded into bitsequences. \begin{center} + \begin{tabular}{cc} \begin{tabular}{lcl} @{thm (lhs) code.simps(1)} & $\dn$ & @{thm (rhs) code.simps(1)}\\ @{thm (lhs) code.simps(2)} & $\dn$ & @{thm (rhs) code.simps(2)}\\ @{thm (lhs) code.simps(3)} & $\dn$ & @{thm (rhs) code.simps(3)}\\ - @{thm (lhs) code.simps(4)} & $\dn$ & @{thm (rhs) code.simps(4)}\\ + @{thm (lhs) code.simps(4)} & $\dn$ & @{thm (rhs) code.simps(4)} + \end{tabular} + & + \begin{tabular}{lcl} @{thm (lhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]} & $\dn$ & @{thm (rhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]}\\ @{thm (lhs) code.simps(6)} & $\dn$ & @{thm (rhs) code.simps(6)}\\ - @{thm (lhs) code.simps(7)} & $\dn$ & @{thm (rhs) code.simps(7)} + @{thm (lhs) code.simps(7)} & $\dn$ & @{thm (rhs) code.simps(7)}\\ + \mbox{\phantom{XX}}\\ + \end{tabular} \end{tabular} \end{center} + + \noindent + As can be seen, this coding is ``lossy'' in the sense that we do not + record explicitly character values and also not sequence values (for + them we just append two bitsequences). We do, however, record the + different alternatives for @{text Left}, respectively @{text Right}, as @{text Z} and + @{text S} followed by some bitsequence. Similarly, we use @{text Z} to indicate + if there is still a value coming in the list of @{text Stars}, whereas @{text S} + indicates the end of the list. The lossiness makes the process of + decoding a bit more involved, but the point is that if we have a + regular expression \emph{and} a bitsequence of a corresponding value, + then we can always decode the value accurately. The decoding can be + defined by using two functions called $\textit{decode}'$ and + \textit{decode}: + \begin{center} + \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}} + $\textit{decode}'\,bs\,(\ONE)$ & $\dn$ & $(\Empty, bs)$\\ + $\textit{decode}'\,bs\,(c)$ & $\dn$ & $(\Char\,c, bs)$\\ + $\textit{decode}'\,(\Z\!::\!bs)\;(r_1 + r_2)$ & $\dn$ & + $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}\; + (\Left\,v, bs_1)$\\ + $\textit{decode}'\,(\S\!::\!bs)\;(r_1 + r_2)$ & $\dn$ & + $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_2\;\textit{in}\; + (\Right\,v, bs_1)$\\ + $\textit{decode}'\,bs\;(r_1\cdot r_2)$ & $\dn$ & + $\textit{let}\,(v_1, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}$\\ + & & $\textit{let}\,(v_2, bs_2) = \textit{decode}'\,bs_1\,r_2$ + \hspace{2mm}$\textit{in}\;(\Seq\,v_1\,v_2, bs_2)$\\ + $\textit{decode}'\,(\Z\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\ + $\textit{decode}'\,(\S\!::\!bs)\,(r^*)$ & $\dn$ & + $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r\;\textit{in}$\\ + & & $\textit{let}\,(\Stars\,vs, bs_2) = \textit{decode}'\,bs_1\,r^*$ + \hspace{2mm}$\textit{in}\;(\Stars\,v\!::\!vs, bs_2)$\bigskip\\ + $\textit{decode}\,bs\,r$ & $\dn$ & + $\textit{let}\,(v, bs') = \textit{decode}'\,bs\,r\;\textit{in}$\\ + & & \hspace{7mm}$\textit{if}\;bs' = []\;\textit{then}\;\textit{Some}\,v\; + \textit{else}\;\textit{None}$ + \end{tabular} + \end{center} + + \noindent + The function \textit{decode} checks whether all of the bitsequence is + consumed and returns the corresponding value as @{term "Some v"}; otherwise + it fails with @{text "None"}. We can establish that for a value $v$ + inhabited by a regular expression $r$, the decoding of its + bitsequence never fails. + +\begin{lemma}\label{codedecode}\it + If $\;\vdash v : r$ then + $\;\textit{decode}\,(\textit{code}\, v)\,r = \textit{Some}\, v$. +\end{lemma} + +\begin{proof} + This follows from the property that + $\textit{decode}'\,((\textit{code}\,v) \,@\, bs)\,r = (v, bs)$ holds + for any bit-sequence $bs$ and $\vdash v : r$. This property can be + easily proved by induction on $\vdash v : r$. +\end{proof} + + Sulzmann and Lu define the function \emph{internalise} + in order to transform standard regular expressions into annotated + regular expressions. We write this operation as $r^\uparrow$. + This internalisation uses the following + \emph{fuse} function. + + \begin{center} + \begin{tabular}{lcl} + $\textit{fuse}\,bs\,(\textit{ZERO})$ & $\dn$ & $\textit{ZERO}$\\ + $\textit{fuse}\,bs\,(\textit{ONE}\,bs')$ & $\dn$ & + $\textit{ONE}\,(bs\,@\,bs')$\\ + $\textit{fuse}\,bs\,(\textit{CHAR}\,bs'\,c)$ & $\dn$ & + $\textit{CHAR}\,(bs\,@\,bs')\,c$\\ + $\textit{fuse}\,bs\,(\textit{ALTs}\,bs'\,rs)$ & $\dn$ & + $\textit{ALTs}\,(bs\,@\,bs')\,rs$\\ + $\textit{fuse}\,bs\,(\textit{SEQ}\,bs'\,r_1\,r_2)$ & $\dn$ & + $\textit{SEQ}\,(bs\,@\,bs')\,r_1\,r_2$\\ + $\textit{fuse}\,bs\,(\textit{STAR}\,bs'\,r)$ & $\dn$ & + $\textit{STAR}\,(bs\,@\,bs')\,r$ + \end{tabular} + \end{center} + + \noindent + A regular expression can then be \emph{internalised} into a bitcoded + regular expression as follows. + + \begin{center} + \begin{tabular}{lcl} + $(\ZERO)^\uparrow$ & $\dn$ & $\textit{ZERO}$\\ + $(\ONE)^\uparrow$ & $\dn$ & $\textit{ONE}\,[]$\\ + $(c)^\uparrow$ & $\dn$ & $\textit{CHAR}\,[]\,c$\\ + $(r_1 + r_2)^\uparrow$ & $\dn$ & + $\textit{ALT}\;[]\,(\textit{fuse}\,[\Z]\,r_1^\uparrow)\, + (\textit{fuse}\,[\S]\,r_2^\uparrow)$\\ + $(r_1\cdot r_2)^\uparrow$ & $\dn$ & + $\textit{SEQ}\;[]\,r_1^\uparrow\,r_2^\uparrow$\\ + $(r^*)^\uparrow$ & $\dn$ & + $\textit{STAR}\;[]\,r^\uparrow$\\ + \end{tabular} + \end{center} + + \noindent + There is also an \emph{erase}-function, written $a^\downarrow$, which + transforms a bitcoded regular expression into a (standard) regular + expression by just erasing the annotated bitsequences. We omit the + straightforward definition. For defining the algorithm, we also need + the functions \textit{bnullable} and \textit{bmkeps}, which are the + ``lifted'' versions of \textit{nullable} and \textit{mkeps} acting on + bitcoded regular expressions, instead of regular expressions. + + \begin{center} + \begin{tabular}{lcl} + $\textit{bnullable}\,(\textit{ZERO})$ & $\dn$ & $\textit{false}$ \textbf{fix}\\ + $\textit{bnullable}\,(\textit{ONE}\,bs)$ & $\dn$ & $\textit{true}$\\ + $\textit{bnullable}\,(\textit{CHAR}\,bs\,c)$ & $\dn$ & $\textit{false}$\\ + $\textit{bnullable}\,(\textit{ALT}\,bs\,a_1\,a_2)$ & $\dn$ & + $\textit{bnullable}\,a_1\vee \textit{bnullable}\,a_2$\\ + $\textit{bnullable}\,(\textit{SEQ}\,bs\,a_1\,a_2)$ & $\dn$ & + $\textit{bnullable}\,a_1\wedge \textit{bnullable}\,a_2$\\ + $\textit{bnullable}\,(\textit{STAR}\,bs\,a)$ & $\dn$ & + $\textit{true}$ + \end{tabular} + \end{center} + + \begin{center} + \begin{tabular}{lcl} + $\textit{bmkeps}\,(\textit{ONE}\,bs)$ & $\dn$ & $bs$ \textbf{fix}\\ + $\textit{bmkeps}\,(\textit{ALT}\,bs\,a_1\,a_2)$ & $\dn$ & + $\textit{if}\;\textit{bnullable}\,a_1$\\ + & &$\textit{then}\;bs\,@\,\textit{bmkeps}\,a_1$\\ + & &$\textit{else}\;bs\,@\,\textit{bmkeps}\,a_2$\\ + $\textit{bmkeps}\,(\textit{SEQ}\,bs\,a_1\,a_2)$ & $\dn$ & + $bs \,@\,\textit{bmkeps}\,a_1\,@\, \textit{bmkeps}\,a_2$\\ + $\textit{bmkeps}\,(\textit{STAR}\,bs\,a)$ & $\dn$ & + $bs \,@\, [\S]$ + \end{tabular} + \end{center} + + + \noindent + The key function in the bitcoded algorithm is the derivative of an + annotated regular expression. This derivative calculates the + derivative but at the same time also the incremental part that + contributes to constructing a value. + + \begin{center} + \begin{tabular}{@ {}lcl@ {}} + $(\textit{ZERO})\backslash c$ & $\dn$ & $\textit{ZERO}$ \textbf{fix}\\ + $(\textit{ONE}\;bs)\backslash c$ & $\dn$ & $\textit{ZERO}$\\ + $(\textit{CHAR}\;bs\,d)\backslash c$ & $\dn$ & + $\textit{if}\;c=d\; \;\textit{then}\; + \textit{ONE}\;bs\;\textit{else}\;\textit{ZERO}$\\ + $(\textit{ALT}\;bs\,a_1\,a_2)\backslash c$ & $\dn$ & + $\textit{ALT}\,bs\,(a_1\backslash c)\,(a_2\backslash c)$\\ + $(\textit{SEQ}\;bs\,a_1\,a_2)\backslash c$ & $\dn$ & + $\textit{if}\;\textit{bnullable}\,a_1$\\ + & &$\textit{then}\;\textit{ALT}\,bs\,(\textit{SEQ}\,[]\,(a_1\backslash c)\,a_2)$\\ + & &$\phantom{\textit{then}\;\textit{ALT}\,bs\,}(\textit{fuse}\,(\textit{bmkeps}\,a_1)\,(a_2\backslash c))$\\ + & &$\textit{else}\;\textit{SEQ}\,bs\,(a_1\backslash c)\,a_2$\\ + $(\textit{STAR}\,bs\,a)\backslash c$ & $\dn$ & + $\textit{SEQ}\;bs\,(\textit{fuse}\, [\Z] (r\backslash c))\, + (\textit{STAR}\,[]\,r)$ + \end{tabular} + \end{center} + + + \noindent + This function can also be extended to strings, written $a\backslash s$, + just like the standard derivative. We omit the details. Finally we + can define Sulzmann and Lu's bitcoded lexer, which we call \textit{blexer}: + + \noindent +This bitcoded lexer first internalises the regular expression $r$ and then +builds the annotated derivative according to $s$. If the derivative is +nullable, then it extracts the bitcoded value using the +$\textit{bmkeps}$ function. Finally it decodes the bitcoded value. If +the derivative is \emph{not} nullable, then $\textit{None}$ is +returned. The task is to show that this way of calculating a value +generates the same result as with \textit{lexer}. + +Before we can proceed we need to define a function, called +\textit{retrieve}, which Sulzmann and Lu introduced for the proof. + +\textbf{fix} + +\noindent +The idea behind this function is to retrieve a possibly partial +bitcode from an annotated regular expression, where the retrieval is +guided by a value. For example if the value is $\Left$ then we +descend into the left-hand side of an alternative (annotated) regular +expression in order to assemble the bitcode. Similarly for +$\Right$. The property we can show is that for a given $v$ and $r$ +with $\vdash v : r$, the retrieved bitsequence from the internalised +regular expression is equal to the bitcoded version of $v$. + +\begin{lemma}\label{retrievecode} +If $\vdash v : r$ then $\textit{code}\, v = \textit{retrieve}\,(r^\uparrow)\,v$. +\end{lemma} + +*} + +text {* There is also a corresponding decoding function that takes a bitsequence and generates back a value. However, since the bitsequences are a ``lossy'' coding (@{term Seq}s are not coded) the decoding function depends also on a regular expression in order to decode values. - \begin{center} - \begin{tabular}{lcll} - %@{thm (lhs) decode'.simps(1)} & $\dn$ & @{thm (rhs) decode'.simps(1)}\\ - @{thm (lhs) decode'.simps(2)} & $\dn$ & @{thm (rhs) decode'.simps(2)}\\ - @{thm (lhs) decode'.simps(3)} & $\dn$ & @{thm (rhs) decode'.simps(3)}\\ - @{thm (lhs) decode'.simps(4)} & $\dn$ & @{thm (rhs) decode'.simps(4)}\\ - @{thm (lhs) decode'.simps(5)} & $\dn$ & @{thm (rhs) decode'.simps(5)}\\ - @{thm (lhs) decode'.simps(6)} & $\dn$ & @{thm (rhs) decode'.simps(6)}\\ - @{thm (lhs) decode'.simps(7)} & $\dn$ & @{thm (rhs) decode'.simps(7)}\\ - @{thm (lhs) decode'.simps(8)} & $\dn$ & @{thm (rhs) decode'.simps(8)}\\ - @{thm (lhs) decode'.simps(9)} & $\dn$ & @{thm (rhs) decode'.simps(9)}\\ - @{thm (lhs) decode'.simps(10)} & $\dn$ & @{thm (rhs) decode'.simps(10)} & fix - \end{tabular} - \end{center} + The idea of the bitcodes is to annotate them to regular expressions and generate values diff -r 5c96fe5306a7 -r 57182b36ec01 thys2/Paper/document/root.tex --- a/thys2/Paper/document/root.tex Sat Feb 05 18:24:37 2022 +0000 +++ b/thys2/Paper/document/root.tex Sun Feb 06 00:02:04 2022 +0000 @@ -34,6 +34,23 @@ \renewcommand{\isasymin}{\ensuremath{\,\in\,}} +\def\lexer{\mathit{lexer}} +\def\mkeps{\mathit{mkeps}} +\def\inj{\mathit{inj}} +\def\Empty{\mathit{Empty}} +\def\Left{\mathit{Left}} +\def\Right{\mathit{Right}} +\def\Stars{\mathit{Stars}} +\def\Char{\mathit{Char}} +\def\Seq{\mathit{Seq}} +\def\Der{\mathit{Der}} +\def\nullable{\mathit{nullable}} +\def\Z{\mathit{Z}} +\def\S{\mathit{S}} +\newcommand{\ZERO}{\mbox{\bf 0}} +\newcommand{\ONE}{\mbox{\bf 1}} + + \def\Brz{Brzozowski} \def\der{\backslash} \newtheorem{falsehood}{Falsehood} diff -r 5c96fe5306a7 -r 57182b36ec01 thys2/SizeBound4.thy --- a/thys2/SizeBound4.thy Sat Feb 05 18:24:37 2022 +0000 +++ b/thys2/SizeBound4.thy Sun Feb 06 00:02:04 2022 +0000 @@ -206,7 +206,7 @@ (if bnullable r1 then AALT bs (ASEQ [] (bder c r1) r2) (fuse (bmkeps r1) (bder c r2)) else ASEQ bs (bder c r1) r2)" -| "bder c (ASTAR bs r) = ASEQ bs (fuse [Z] (bder c r)) (ASTAR [] r)" +| "bder c (ASTAR bs r) = ASEQ (bs @ [Z]) (bder c r) (ASTAR [] r)" fun @@ -566,8 +566,10 @@ | ss3: "r1 \ r2 \ (r1 # rs) s\ (r2 # rs)" | ss4: "(AZERO # rs) s\ rs" | ss5: "(AALTs bs1 rs1 # rsb) s\ ((map (fuse bs1) rs1) @ rsb)" -| ss6: "erase a1 = erase a2 \ (rsa@[a1]@rsb@[a2]@rsc) s\ (rsa@[a1]@rsb@rsc)" - +| ss6: "L(erase a2) \ L(erase a1) \ (rsa@[a1]@rsb@[a2]@rsc) s\ (rsa@[a1]@rsb@rsc)" +(*| extra: "bnullable r1 \ ASEQ bs0 (ASEQ bs1 r1 r2) r3 \ + ASEQ (bs0 @ bs1) r1 (ASEQ [] r2 r3)" +*) inductive rrewrites:: "arexp \ arexp \ bool" ("_ \* _" [100, 100] 100) @@ -586,6 +588,10 @@ shows "r1 \ r2 \ r1 \* r2" using rrewrites.intros(1) rrewrites.intros(2) by blast +lemma rs_in_rstar: + shows "r1 s\ r2 \ r1 s\* r2" + using rrewrites.intros(1) rrewrites.intros(2) by blast + lemma rrewrites_trans[trans]: assumes a1: "r1 \* r2" and a2: "r2 \* r3" shows "r1 \* r3" @@ -674,10 +680,19 @@ shows "(rs1 @ rs2) s\* (rs1 @ distinctBy rs2 erase (set (map erase rs1)))" apply(induct rs2 arbitrary: rs1) apply(auto) - apply (smt (verit, best) append.assoc append.right_neutral append_Cons append_Nil split_list srewrite2(2) srewrites_trans ss6) + prefer 2 apply(drule_tac x="rs1 @ [a]" in meta_spec) + apply(simp) + apply(drule_tac x="rs1" in meta_spec) + apply(subgoal_tac "(rs1 @ a # rs2) s\* (rs1 @ rs2)") + using srewrites_trans apply blast + apply(subgoal_tac "\rs1a rs1b. rs1 = rs1a @ [x] @ rs1b") + prefer 2 + apply (simp add: split_list) + apply(erule exE)+ apply(simp) - done + using ss6[simplified] + using rs_in_rstar by force lemma ss6_stronger: shows "rs1 s\* distinctBy rs1 erase {}" @@ -710,12 +725,17 @@ finally show "map (fuse bs) (AALTs bs1 rs1 # rsb) s\ map (fuse bs) (map (fuse bs1) rs1 @ rsb)" by (simp add: comp_def fuse_append) next - case (ss6 a1 a2 rsa rsb rsc) + case (ss6 a2 a1 rsa rsb rsc) + have "L (erase a2) \ L (erase a1)" by fact then show "map (fuse bs) (rsa @ [a1] @ rsb @ [a2] @ rsc) s\ map (fuse bs) (rsa @ [a1] @ rsb @ rsc)" apply(simp) apply(rule rrewrite_srewrite.ss6[simplified]) apply(simp add: erase_fuse) done +(*next + case (extra bs0 bs1 r1 r2 r3) + then show ?case + by (metis append_assoc fuse.simps(5) rrewrite_srewrite.extra)*) qed (auto intro: rrewrite_srewrite.intros) lemma rewrites_fuse: @@ -787,8 +807,9 @@ and "rs1 s\ rs2 \ bnullables rs1 = bnullables rs2" apply(induct rule: rrewrite_srewrite.inducts) apply(auto simp add: bnullable_fuse) - apply (meson UnCI bnullable_fuse imageI) - by (metis bnullable_correctness) + apply (meson UnCI bnullable_fuse imageI) + + using bnullable_correctness nullable_correctness by blast lemma rewrites_bnullable_eq: @@ -824,11 +845,22 @@ then show "bmkepss (AALTs bs1 rs1 # rsb) = bmkepss (map (fuse bs1) rs1 @ rsb)" by (simp add: bmkepss1 bmkepss2 bmkepss_fuse bnullable_fuse) next - case (ss6 a1 a2 rsa rsb rsc) - have as1: "erase a1 = erase a2" by fact + case (ss6 a2 a1 rsa rsb rsc) + have as1: "L(erase a2) \ L(erase a1)" by fact have as3: "bnullables (rsa @ [a1] @ rsb @ [a2] @ rsc)" by fact show "bmkepss (rsa @ [a1] @ rsb @ [a2] @ rsc) = bmkepss (rsa @ [a1] @ rsb @ rsc)" using as1 as3 - by (smt (verit, best) append_Cons bmkeps.simps(3) bmkepss.simps(2) bmkepss1 bmkepss2 bnullable_correctness) + by (smt (verit, ccfv_SIG) append_Cons bmkepss.simps(2) bmkepss1 bmkepss2 bnullable_correctness nullable_correctness subset_eq) +(*next + case (extra bs0 bs1 r1 bs2 r2 bs4 bs3) + then show ?case + apply(subst bmkeps.simps) + apply(subst bmkeps.simps) + apply(subst bmkeps.simps) + apply(subst bmkeps.simps) + apply(subst bmkeps.simps) + apply(subst bmkeps.simps) + apply(simp) + done*) qed (auto) lemma rewrites_bmkeps: @@ -908,6 +940,11 @@ apply(simp_all add: bder_fuse) done +lemma map_single: + shows "map f [x] = [f x]" + by simp + + lemma rewrite_preserves_bder: shows "r1 \ r2 \ bder c r1 \* bder c r2" and "rs1 s\ rs2 \ map (bder c) rs1 s\* map (bder c) rs2" @@ -977,20 +1014,39 @@ apply(simp) using bder_fuse_list map_map rrewrite_srewrite.ss5 srewrites.simps by blast next - case (ss6 a1 a2 bs rsa rsb) - have as: "erase a1 = erase a2" by fact + case (ss6 a2 a1 bs rsa rsb) + have as: "L(erase a2) \ L(erase a1)" by fact show "map (bder c) (bs @ [a1] @ rsa @ [a2] @ rsb) s\* map (bder c) (bs @ [a1] @ rsa @ rsb)" apply(simp only: map_append) - by (smt (verit, best) erase_bder list.simps(8) list.simps(9) as rrewrite_srewrite.ss6 srewrites.simps) + apply(simp only: map_single) + apply(rule rs_in_rstar) + thm rrewrite_srewrite.intros + apply(rule rrewrite_srewrite.ss6) + using as + apply(auto simp add: der_correctness Der_def) + done +(*next + case (extra bs0 bs1 r1 r2 r3) + then show ?case + apply(auto simp add: comp_def) + + defer + using r_in_rstar rrewrite_srewrite.extra apply presburger + prefer 2 + using r_in_rstar rrewrite_srewrite.extra apply presburger + prefer 2 + sorry*) qed + + lemma rewrites_preserves_bder: assumes "r1 \* r2" shows "bder c r1 \* bder c r2" using assms apply(induction r1 r2 rule: rrewrites.induct) apply(simp_all add: rewrite_preserves_bder rrewrites_trans) -done + done lemma central: @@ -1034,6 +1090,76 @@ (* some tests *) +definition + bders_simp_Set :: "string set \ arexp \ arexp set" +where + "bders_simp_Set A r \ bders_simp r ` A" + +lemma pders_Set_union: + shows "bders_simp_Set (A \ B) r = (bders_simp_Set A r \ bders_simp_Set B r)" + apply (simp add: bders_simp_Set_def) + by (simp add: image_Un) + +lemma pders_Set_subset: + shows "A \ B \ bders_simp_Set A r \ bders_simp_Set B r" + apply (auto simp add: bders_simp_Set_def) + done + + +lemma AZERO_r: + "bders_simp AZERO s = AZERO" + apply(induct s) + apply(auto) + done + +lemma bders_simp_Set_ZERO [simp]: + shows "bders_simp_Set UNIV1 AZERO \ {AZERO}" + unfolding UNIV1_def bders_simp_Set_def + apply(auto) + using AZERO_r by blast + +lemma AONE_r: + shows "bders_simp (AONE bs) s = AZERO \ bders_simp (AONE bs) s = AONE bs" + apply(induct s) + apply(auto) + using AZERO_r apply blast + using AZERO_r by blast + +lemma bders_simp_Set_ONE [simp]: + shows "bders_simp_Set UNIV1 (AONE bs) \ {AZERO, AONE bs}" + unfolding UNIV1_def bders_simp_Set_def + apply(auto split: if_splits) + using AONE_r by blast + +lemma ACHAR_r: + shows "bders_simp (ACHAR bs c) s = AZERO \ + bders_simp (ACHAR bs c) s = AONE bs \ + bders_simp (ACHAR bs c) s = ACHAR bs c" + apply(induct s) + apply(auto) + using AONE_r apply blast + using AZERO_r apply force + using AONE_r apply blast + using AZERO_r apply blast + using AONE_r apply blast + using AZERO_r by blast + +lemma bders_simp_Set_CHAR [simp]: + shows "bders_simp_Set UNIV1 (ACHAR bs c) \ {AZERO, AONE bs, ACHAR bs c}" +unfolding UNIV1_def bders_simp_Set_def + apply(auto) + using ACHAR_r by blast + +lemma bders_simp_Set_ALT [simp]: + shows "bders_simp_Set UNIV1 (AALT bs r1 r2) = bders_simp_Set UNIV1 r1 \ bders_simp_Set UNIV1 r2" + unfolding UNIV1_def bders_simp_Set_def + apply(auto) + oops + + + + +(* lemma asize_fuse: shows "asize (fuse bs r) = asize r" apply(induct r arbitrary: bs) @@ -1372,11 +1498,13 @@ using finite_pder apply blast oops - +*) (* below is the idempotency of bsimp *) +(* + lemma bsimp_ASEQ_fuse: shows "fuse bs1 (bsimp_ASEQ bs2 r1 r2) = bsimp_ASEQ (bs1 @ bs2) r1 r2" apply(induct r1 r2 arbitrary: bs1 bs2 rule: bsimp_ASEQ.induct) @@ -1466,6 +1594,8 @@ apply auto oops +*) + (* AALTs [] [AZERO, AALTs(bs1, [a, b]) ] @@ -1475,24 +1605,19 @@ *) -lemma normal_bsimp: - shows "\r'. bsimp r \ r'" - oops (*r' size bsimp r > size r' r' \* bsimp bsimp r size bsimp r > size r' \ size bsimp bsimp r*) -export_code blexer_simp blexer lexer bders bders_simp in Scala module_name VerifiedLexers - unused_thms - +(* inductive aggressive:: "arexp \ arexp \ bool" ("_ \? _" [99, 99] 99) where "ASEQ bs (AALTs bs1 rs) r \? AALTs (bs@bs1) (map (\r'. ASEQ [] r' r) rs) " - +*) end diff -r 5c96fe5306a7 -r 57182b36ec01 thys2/paper.pdf Binary file thys2/paper.pdf has changed