lexing: thys2/Paper/Paper.thy@57182b36ec01


(*<*)
theory Paper
imports 
   "../Lexer"
   "../Simplifying" 
   "../Positions"
   "../SizeBound4" 
   "HOL-Library.LaTeXsugar"
begin

declare [[show_question_marks = false]]

notation (latex output)
  If  ("(\<^latex>\<open>\\textrm{\<close>if\<^latex>\<open>}\<close> (_)/ \<^latex>\<open>\\textrm{\<close>then\<^latex>\<open>}\<close> (_)/ \<^latex>\<open>\\textrm{\<close>else\<^latex>\<open>}\<close> (_))" 10) and
  Cons ("_\<^latex>\<open>\\mbox{$\\,$}\<close>::\<^latex>\<open>\\mbox{$\\,$}\<close>_" [75,73] 73) 


abbreviation 
  "der_syn r c \<equiv> der c r"

notation (latex output)
  der_syn ("_\\_" [79, 1000] 76) and

  ZERO ("\<^bold>0" 81) and 
  ONE ("\<^bold>1" 81) and 
  CH ("_" [1000] 80) and
  ALT ("_ + _" [77,77] 78) and
  SEQ ("_ \<cdot> _" [77,77] 78) and
  STAR ("_\<^sup>\<star>" [79] 78) and

  val.Void ("Empty" 78) and
  val.Char ("Char _" [1000] 78) and
  val.Left ("Left _" [79] 78) and
  val.Right ("Right _" [1000] 78) and
  val.Seq ("Seq _ _" [79,79] 78) and
  val.Stars ("Stars _" [79] 78) and

  Posix ("'(_, _') \<rightarrow> _" [63,75,75] 75) and

  flat ("|_|" [75] 74) and
  flats ("|_|" [72] 74) and
  injval ("inj _ _ _" [79,77,79] 76) and 
  mkeps ("mkeps _" [79] 76) and 
  length ("len _" [73] 73) and
  set ("_" [73] 73) and

  AZERO ("ZERO" 81) and 
  AONE ("ONE _" [79] 78) and 
  ACHAR ("CHAR _ _" [79, 79] 80) and
  AALTs ("ALTs _ _" [77,77] 78) and
  ASEQ ("SEQ _ _ _" [79, 79,79] 78) and
  ASTAR ("STAR _ _" [79, 79] 78) and

  code ("code _" [79] 74) and
  intern ("_\<^latex>\<open>\\mbox{$^\\uparrow$}\<close>" [900] 80) and
  erase ("_\<^latex>\<open>\\mbox{$^\\downarrow$}\<close>" [1000] 74) and
  bnullable ("nullable\<^latex>\<open>\\mbox{$_b$}\<close> _" [1000] 80) and
  bmkeps ("mkeps\<^latex>\<open>\\mbox{$_b$}\<close> _" [1000] 80) and

  srewrite ("_\<^latex>\<open>\\mbox{$\\,\\stackrel{s}{\\leadsto}$}\<close> _" [71, 71] 80) 


lemma better_retrieve:
   shows "rs \<noteq> Nil ==> retrieve (AALTs bs (r#rs)) (Left v) = bs @ retrieve r v"
   and   "rs \<noteq> Nil ==> retrieve (AALTs bs (r#rs)) (Right v) = bs @ retrieve (AALTs [] rs) v"
  apply (metis list.exhaust retrieve.simps(4))
  by (metis list.exhaust retrieve.simps(5))

(*>*)

section {* Introduction *}

text {*

In the last fifteen or so years, Brzozowski's derivatives of regular
expressions have sparked quite a bit of interest in the functional
programming and theorem prover communities.  The beauty of
Brzozowski's derivatives \cite{Brzozowski1964} is that they are neatly
expressible in any functional language, and easily definable and
reasoned about in theorem provers---the definitions just consist of
inductive datatypes and simple recursive functions. A mechanised
correctness proof of Brzozowski's matcher in for example HOL4 has been
mentioned by Owens and Slind~\cite{Owens2008}. Another one in
Isabelle/HOL is part of the work by Krauss and Nipkow
\cite{Krauss2011}.  And another one in Coq is given by Coquand and
Siles \cite{Coquand2012}.


The notion of derivatives
\cite{Brzozowski1964}, written @{term "der c r"}, of a regular
expression give a simple solution to the problem of matching a string
@{term s} with a regular expression @{term r}: if the derivative of
@{term r} w.r.t.\ (in succession) all the characters of the string
matches the empty string, then @{term r} matches @{term s} (and {\em
vice versa}). The derivative has the property (which may almost be
regarded as its specification) that, for every string @{term s} and
regular expression @{term r} and character @{term c}, one has @{term
"cs \<in> L(r)"} if and only if \mbox{@{term "s \<in> L(der c r)"}}.


If a regular expression matches a string, then in general there is more
than one way of how the string is matched. There are two commonly used
disambiguation strategies to generate a unique answer: one is called
GREEDY matching \cite{Frisch2004} and the other is POSIX
matching~\cite{POSIX,Kuklewicz,OkuiSuzuki2010,Sulzmann2014,Vansummeren2006}.
For example consider the string @{term xy} and the regular expression
\mbox{@{term "STAR (ALT (ALT x y) xy)"}}. Either the string can be
matched in two `iterations' by the single letter-regular expressions
@{term x} and @{term y}, or directly in one iteration by @{term xy}. The
first case corresponds to GREEDY matching, which first matches with the
left-most symbol and only matches the next symbol in case of a mismatch
(this is greedy in the sense of preferring instant gratification to
delayed repletion). The second case is POSIX matching, which prefers the
longest match.


\begin{center}
\begin{tabular}{cc}
  \begin{tabular}{r@ {\hspace{2mm}}c@ {\hspace{2mm}}l}
  @{thm (lhs) der.simps(1)} & $\dn$ & @{thm (rhs) der.simps(1)}\\
  @{thm (lhs) der.simps(2)} & $\dn$ & @{thm (rhs) der.simps(2)}\\
  @{thm (lhs) der.simps(3)} & $\dn$ & @{thm (rhs) der.simps(3)}\\
  @{thm (lhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]}\\
  @{thm (lhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{text "if"} @{term "nullable(r\<^sub>1)"}\\
  & & @{text "then"} @{term "ALT (SEQ (der c r\<^sub>1) r\<^sub>2) (der c r\<^sub>2)"}\\
  & & @{text "else"} @{term "SEQ (der c r\<^sub>1) r\<^sub>2"}\\
  % & & @{thm (rhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]}\\
  @{thm (lhs) der.simps(6)} & $\dn$ & @{thm (rhs) der.simps(6)}
  \end{tabular}
  &
  \begin{tabular}{l@ {\hspace{1mm}}c@ {\hspace{1mm}}l}
  @{thm (lhs) nullable.simps(1)} & $\dn$ & @{thm (rhs) nullable.simps(1)}\\
  @{thm (lhs) nullable.simps(2)} & $\dn$ & @{thm (rhs) nullable.simps(2)}\\
  @{thm (lhs) nullable.simps(3)} & $\dn$ & @{thm (rhs) nullable.simps(3)}\\
  @{thm (lhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
  @{thm (lhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
  @{thm (lhs) nullable.simps(6)} & $\dn$ & @{thm (rhs) nullable.simps(6)}\medskip\\
  \end{tabular}  
\end{tabular}  
\end{center}


\begin{figure}[t]
\begin{center}
\begin{tikzpicture}[scale=2,node distance=1.3cm,
                    every node/.style={minimum size=6mm}]
\node (r1)  {@{term "r\<^sub>1"}};
\node (r2) [right=of r1]{@{term "r\<^sub>2"}};
\draw[->,line width=1mm](r1)--(r2) node[above,midway] {@{term "der a DUMMY"}};
\node (r3) [right=of r2]{@{term "r\<^sub>3"}};
\draw[->,line width=1mm](r2)--(r3) node[above,midway] {@{term "der b DUMMY"}};
\node (r4) [right=of r3]{@{term "r\<^sub>4"}};
\draw[->,line width=1mm](r3)--(r4) node[above,midway] {@{term "der c DUMMY"}};
\draw (r4) node[anchor=west] {\;\raisebox{3mm}{@{term nullable}}};
\node (v4) [below=of r4]{@{term "v\<^sub>4"}};
\draw[->,line width=1mm](r4) -- (v4);
\node (v3) [left=of v4] {@{term "v\<^sub>3"}};
\draw[->,line width=1mm](v4)--(v3) node[below,midway] {\<open>inj r\<^sub>3 c\<close>};
\node (v2) [left=of v3]{@{term "v\<^sub>2"}};
\draw[->,line width=1mm](v3)--(v2) node[below,midway] {\<open>inj r\<^sub>2 b\<close>};
\node (v1) [left=of v2] {@{term "v\<^sub>1"}};
\draw[->,line width=1mm](v2)--(v1) node[below,midway] {\<open>inj r\<^sub>1 a\<close>};
\draw (r4) node[anchor=north west] {\;\raisebox{-8mm}{@{term "mkeps"}}};
\end{tikzpicture}
\end{center}
\mbox{}\\[-13mm]

\caption{The two phases of the algorithm by Sulzmann \& Lu \cite{Sulzmann2014},
matching the string @{term "[a,b,c]"}. The first phase (the arrows from 
left to right) is \Brz's matcher building successive derivatives. If the 
last regular expression is @{term nullable}, then the functions of the 
second phase are called (the top-down and right-to-left arrows): first 
@{term mkeps} calculates a value @{term "v\<^sub>4"} witnessing
how the empty string has been recognised by @{term "r\<^sub>4"}. After
that the function @{term inj} ``injects back'' the characters of the string into
the values.
\label{Sulz}}
\end{figure} 


*}

section {* Background *}

text {*
  Sulzmann-Lu algorithm with inj. State that POSIX rules.
  metion slg is correct.


  \begin{figure}[t]
  \begin{center}
  \begin{tabular}{c}
  @{thm[mode=Axiom] Posix.intros(1)}\<open>P\<close>@{term "ONE"} \qquad
  @{thm[mode=Axiom] Posix.intros(2)}\<open>P\<close>@{term "c"}\medskip\\
  @{thm[mode=Rule] Posix.intros(3)[of "s" "r\<^sub>1" "v" "r\<^sub>2"]}\<open>P+L\<close>\qquad
  @{thm[mode=Rule] Posix.intros(4)[of "s" "r\<^sub>2" "v" "r\<^sub>1"]}\<open>P+R\<close>\medskip\\
  $\mprset{flushleft}
   \inferrule
   {@{thm (prem 1) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \qquad
    @{thm (prem 2) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \\\\
    @{thm (prem 3) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}
   {@{thm (concl) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}$\<open>PS\<close>\\
  @{thm[mode=Axiom] Posix.intros(7)}\<open>P[]\<close>\medskip\\
  $\mprset{flushleft}
   \inferrule
   {@{thm (prem 1) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
    @{thm (prem 2) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
    @{thm (prem 3) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \\\\
    @{thm (prem 4) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}
   {@{thm (concl) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}$\<open>P\<star>\<close>
  \end{tabular}
  \end{center}
  \caption{Our inductive definition of POSIX values.}\label{POSIXrules}
  \end{figure}


  \begin{center}
  \begin{tabular}{lcl}
  @{thm (lhs) mkeps.simps(1)} & $\dn$ & @{thm (rhs) mkeps.simps(1)}\\
  @{thm (lhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]}\\
  @{thm (lhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]}\\
  @{thm (lhs) mkeps.simps(4)} & $\dn$ & @{thm (rhs) mkeps.simps(4)}\\
  \end{tabular}
  \end{center}

  \begin{center}
  \begin{tabular}{l@ {\hspace{5mm}}lcl}
  \textit{(1)} & @{thm (lhs) injval.simps(1)} & $\dn$ & @{thm (rhs) injval.simps(1)}\\
  \textit{(2)} & @{thm (lhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]} & $\dn$ & 
      @{thm (rhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]}\\
  \textit{(3)} & @{thm (lhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$ & 
      @{thm (rhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\
  \textit{(4)} & @{thm (lhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$ 
      & @{thm (rhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\
  \textit{(5)} & @{thm (lhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$ 
      & @{thm (rhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\
  \textit{(6)} & @{thm (lhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$ 
      & @{thm (rhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\
  \textit{(7)} & @{thm (lhs) injval.simps(7)[of "r" "c" "v" "vs"]} & $\dn$ 
      & @{thm (rhs) injval.simps(7)[of "r" "c" "v" "vs"]}\\
  \end{tabular}
  \end{center}

*}

section {* Bitcoded Regular Expressions and Derivatives *}

text {*

  In the second part of their paper \cite{Sulzmann2014},
  Sulzmann and Lu describe another algorithm that generates POSIX
  values but dispences with the second phase where characters are
  injected ``back'' into values. For this they annotate bitcodes to
  regular expressions, which we define in Isabelle/HOL as the datatype

  \begin{center}
  \begin{tabular}{lcl}
  @{term breg} & $::=$ & @{term "AZERO"} $\quad\mid\quad$ @{term "AONE bs"}\\
               & $\mid$ & @{term "ACHAR bs c"}\\
               & $\mid$ & @{term "AALTs bs rs"}\\
               & $\mid$ & @{term "ASEQ bs r\<^sub>1 r\<^sub>2"}\\
               & $\mid$ & @{term "ASTAR bs r"}
  \end{tabular}
  \end{center}

  \noindent where @{text bs} stands for a bitsequences; @{text r},
  @{text "r\<^sub>1"} and @{text "r\<^sub>2"} for bitcoded regular
  expressions; and @{text rs} for lists of bitcoded regular
  expressions. The binary alternative @{text "ALT bs r\<^sub>1 r\<^sub>2"}
  is just an abbreviation for @{text "ALTs bs [r\<^sub>1, r\<^sub>2]"}. 
  For bitsequences we just use lists made up of the
  constants @{text Z} and @{text S}.  The idea with bitcoded regular
  expressions is to incrementally generate the value information (for
  example @{text Left} and @{text Right}) as bitsequences
  as part of the regular expression constructors. 
  Sulzmann and Lu then define a coding
  function for how values can be coded into bitsequences.

  \begin{center}
  \begin{tabular}{cc}
  \begin{tabular}{lcl}
  @{thm (lhs) code.simps(1)} & $\dn$ & @{thm (rhs) code.simps(1)}\\
  @{thm (lhs) code.simps(2)} & $\dn$ & @{thm (rhs) code.simps(2)}\\
  @{thm (lhs) code.simps(3)} & $\dn$ & @{thm (rhs) code.simps(3)}\\
  @{thm (lhs) code.simps(4)} & $\dn$ & @{thm (rhs) code.simps(4)}
  \end{tabular}
  &
  \begin{tabular}{lcl}
  @{thm (lhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]} & $\dn$ & @{thm (rhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]}\\
  @{thm (lhs) code.simps(6)} & $\dn$ & @{thm (rhs) code.simps(6)}\\
  @{thm (lhs) code.simps(7)} & $\dn$ & @{thm (rhs) code.simps(7)}\\
  \mbox{\phantom{XX}}\\
  \end{tabular}
  \end{tabular}
  \end{center}
   
  \noindent
  As can be seen, this coding is ``lossy'' in the sense that we do not
  record explicitly character values and also not sequence values (for
  them we just append two bitsequences). We do, however, record the
  different alternatives for @{text Left}, respectively @{text Right}, as @{text Z} and
  @{text S} followed by some bitsequence. Similarly, we use @{text Z} to indicate
  if there is still a value coming in the list of @{text Stars}, whereas @{text S}
  indicates the end of the list. The lossiness makes the process of
  decoding a bit more involved, but the point is that if we have a
  regular expression \emph{and} a bitsequence of a corresponding value,
  then we can always decode the value accurately. The decoding can be
  defined by using two functions called $\textit{decode}'$ and
  \textit{decode}:

  \begin{center}
  \begin{tabular}{@ {}l@ {\hspace{1mm}}c@ {\hspace{1mm}}l@ {}}
  $\textit{decode}'\,bs\,(\ONE)$ & $\dn$ & $(\Empty, bs)$\\
  $\textit{decode}'\,bs\,(c)$ & $\dn$ & $(\Char\,c, bs)$\\
  $\textit{decode}'\,(\Z\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
     $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}\;
       (\Left\,v, bs_1)$\\
  $\textit{decode}'\,(\S\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
     $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_2\;\textit{in}\;
       (\Right\,v, bs_1)$\\                           
  $\textit{decode}'\,bs\;(r_1\cdot r_2)$ & $\dn$ &
        $\textit{let}\,(v_1, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}$\\
  & &   $\textit{let}\,(v_2, bs_2) = \textit{decode}'\,bs_1\,r_2$
        \hspace{2mm}$\textit{in}\;(\Seq\,v_1\,v_2, bs_2)$\\
  $\textit{decode}'\,(\Z\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\
  $\textit{decode}'\,(\S\!::\!bs)\,(r^*)$ & $\dn$ & 
         $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r\;\textit{in}$\\
  & &   $\textit{let}\,(\Stars\,vs, bs_2) = \textit{decode}'\,bs_1\,r^*$
        \hspace{2mm}$\textit{in}\;(\Stars\,v\!::\!vs, bs_2)$\bigskip\\
  $\textit{decode}\,bs\,r$ & $\dn$ &
     $\textit{let}\,(v, bs') = \textit{decode}'\,bs\,r\;\textit{in}$\\
  & & \hspace{7mm}$\textit{if}\;bs' = []\;\textit{then}\;\textit{Some}\,v\;
       \textit{else}\;\textit{None}$   
  \end{tabular}    
  \end{center}

  \noindent
  The function \textit{decode} checks whether all of the bitsequence is
  consumed and returns the corresponding value as @{term "Some v"}; otherwise
  it fails with @{text "None"}. We can establish that for a value $v$
  inhabited by a regular expression $r$, the decoding of its
  bitsequence never fails.

\begin{lemma}\label{codedecode}\it
  If $\;\vdash v : r$ then
  $\;\textit{decode}\,(\textit{code}\, v)\,r = \textit{Some}\, v$.
\end{lemma}

\begin{proof}
  This follows from the property that
  $\textit{decode}'\,((\textit{code}\,v) \,@\, bs)\,r = (v, bs)$ holds
  for any bit-sequence $bs$ and $\vdash v : r$. This property can be
  easily proved by induction on $\vdash v : r$.
\end{proof}  

  Sulzmann and Lu define the function \emph{internalise}
  in order to transform standard regular expressions into annotated
  regular expressions. We write this operation as $r^\uparrow$.
  This internalisation uses the following
  \emph{fuse} function.	     

  \begin{center}
  \begin{tabular}{lcl}
  $\textit{fuse}\,bs\,(\textit{ZERO})$ & $\dn$ & $\textit{ZERO}$\\
  $\textit{fuse}\,bs\,(\textit{ONE}\,bs')$ & $\dn$ &
     $\textit{ONE}\,(bs\,@\,bs')$\\
  $\textit{fuse}\,bs\,(\textit{CHAR}\,bs'\,c)$ & $\dn$ &
     $\textit{CHAR}\,(bs\,@\,bs')\,c$\\
  $\textit{fuse}\,bs\,(\textit{ALTs}\,bs'\,rs)$ & $\dn$ &
     $\textit{ALTs}\,(bs\,@\,bs')\,rs$\\
  $\textit{fuse}\,bs\,(\textit{SEQ}\,bs'\,r_1\,r_2)$ & $\dn$ &
     $\textit{SEQ}\,(bs\,@\,bs')\,r_1\,r_2$\\
  $\textit{fuse}\,bs\,(\textit{STAR}\,bs'\,r)$ & $\dn$ &
     $\textit{STAR}\,(bs\,@\,bs')\,r$
  \end{tabular}    
  \end{center}    

  \noindent
  A regular expression can then be \emph{internalised} into a bitcoded
  regular expression as follows.

  \begin{center}
  \begin{tabular}{lcl}
  $(\ZERO)^\uparrow$ & $\dn$ & $\textit{ZERO}$\\
  $(\ONE)^\uparrow$ & $\dn$ & $\textit{ONE}\,[]$\\
  $(c)^\uparrow$ & $\dn$ & $\textit{CHAR}\,[]\,c$\\
  $(r_1 + r_2)^\uparrow$ & $\dn$ &
         $\textit{ALT}\;[]\,(\textit{fuse}\,[\Z]\,r_1^\uparrow)\,
                            (\textit{fuse}\,[\S]\,r_2^\uparrow)$\\
  $(r_1\cdot r_2)^\uparrow$ & $\dn$ &
         $\textit{SEQ}\;[]\,r_1^\uparrow\,r_2^\uparrow$\\
  $(r^*)^\uparrow$ & $\dn$ &
         $\textit{STAR}\;[]\,r^\uparrow$\\
  \end{tabular}    
  \end{center}    

  \noindent
  There is also an \emph{erase}-function, written $a^\downarrow$, which
  transforms a bitcoded regular expression into a (standard) regular
  expression by just erasing the annotated bitsequences. We omit the
  straightforward definition. For defining the algorithm, we also need
  the functions \textit{bnullable} and \textit{bmkeps}, which are the
  ``lifted'' versions of \textit{nullable} and \textit{mkeps} acting on
  bitcoded regular expressions, instead of regular expressions.

  \begin{center}
  \begin{tabular}{lcl}
  $\textit{bnullable}\,(\textit{ZERO})$ & $\dn$ & $\textit{false}$ \textbf{fix}\\
  $\textit{bnullable}\,(\textit{ONE}\,bs)$ & $\dn$ & $\textit{true}$\\
  $\textit{bnullable}\,(\textit{CHAR}\,bs\,c)$ & $\dn$ & $\textit{false}$\\
  $\textit{bnullable}\,(\textit{ALT}\,bs\,a_1\,a_2)$ & $\dn$ &
     $\textit{bnullable}\,a_1\vee \textit{bnullable}\,a_2$\\
  $\textit{bnullable}\,(\textit{SEQ}\,bs\,a_1\,a_2)$ & $\dn$ &
     $\textit{bnullable}\,a_1\wedge \textit{bnullable}\,a_2$\\
  $\textit{bnullable}\,(\textit{STAR}\,bs\,a)$ & $\dn$ &
     $\textit{true}$
  \end{tabular}    
  \end{center}    

  \begin{center}
  \begin{tabular}{lcl}
  $\textit{bmkeps}\,(\textit{ONE}\,bs)$ & $\dn$ & $bs$ \textbf{fix}\\
  $\textit{bmkeps}\,(\textit{ALT}\,bs\,a_1\,a_2)$ & $\dn$ &
     $\textit{if}\;\textit{bnullable}\,a_1$\\
  & &$\textit{then}\;bs\,@\,\textit{bmkeps}\,a_1$\\
  & &$\textit{else}\;bs\,@\,\textit{bmkeps}\,a_2$\\
  $\textit{bmkeps}\,(\textit{SEQ}\,bs\,a_1\,a_2)$ & $\dn$ &
     $bs \,@\,\textit{bmkeps}\,a_1\,@\, \textit{bmkeps}\,a_2$\\
  $\textit{bmkeps}\,(\textit{STAR}\,bs\,a)$ & $\dn$ &
     $bs \,@\, [\S]$
  \end{tabular}    
  \end{center}    
 

  \noindent
  The key function in the bitcoded algorithm is the derivative of an
  annotated regular expression. This derivative calculates the
  derivative but at the same time also the incremental part that
  contributes to constructing a value.	

  \begin{center}
  \begin{tabular}{@ {}lcl@ {}}
  $(\textit{ZERO})\backslash c$ & $\dn$ & $\textit{ZERO}$ \textbf{fix}\\  
  $(\textit{ONE}\;bs)\backslash c$ & $\dn$ & $\textit{ZERO}$\\  
  $(\textit{CHAR}\;bs\,d)\backslash c$ & $\dn$ &
        $\textit{if}\;c=d\; \;\textit{then}\;
         \textit{ONE}\;bs\;\textit{else}\;\textit{ZERO}$\\  
  $(\textit{ALT}\;bs\,a_1\,a_2)\backslash c$ & $\dn$ &
        $\textit{ALT}\,bs\,(a_1\backslash c)\,(a_2\backslash c)$\\
  $(\textit{SEQ}\;bs\,a_1\,a_2)\backslash c$ & $\dn$ &
     $\textit{if}\;\textit{bnullable}\,a_1$\\
  & &$\textit{then}\;\textit{ALT}\,bs\,(\textit{SEQ}\,[]\,(a_1\backslash c)\,a_2)$\\
  & &$\phantom{\textit{then}\;\textit{ALT}\,bs\,}(\textit{fuse}\,(\textit{bmkeps}\,a_1)\,(a_2\backslash c))$\\
  & &$\textit{else}\;\textit{SEQ}\,bs\,(a_1\backslash c)\,a_2$\\
  $(\textit{STAR}\,bs\,a)\backslash c$ & $\dn$ &
      $\textit{SEQ}\;bs\,(\textit{fuse}\, [\Z] (r\backslash c))\,
       (\textit{STAR}\,[]\,r)$
  \end{tabular}    
  \end{center}


  \noindent
  This function can also be extended to strings, written $a\backslash s$,
  just like the standard derivative.  We omit the details. Finally we
  can define Sulzmann and Lu's bitcoded lexer, which we call \textit{blexer}: 

  \noindent
This bitcoded lexer first internalises the regular expression $r$ and then
builds the annotated derivative according to $s$. If the derivative is
nullable, then it extracts the bitcoded value using the
$\textit{bmkeps}$ function. Finally it decodes the bitcoded value.  If
the derivative is \emph{not} nullable, then $\textit{None}$ is
returned. The task is to show that this way of calculating a value
generates the same result as with \textit{lexer}.

Before we can proceed we need to define a function, called
\textit{retrieve}, which Sulzmann and Lu introduced for the proof.

\textbf{fix}

\noindent
The idea behind this function is to retrieve a possibly partial
bitcode from an annotated regular expression, where the retrieval is
guided by a value.  For example if the value is $\Left$ then we
descend into the left-hand side of an alternative (annotated) regular
expression in order to assemble the bitcode. Similarly for
$\Right$. The property we can show is that for a given $v$ and $r$
with $\vdash v : r$, the retrieved bitsequence from the internalised
regular expression is equal to the bitcoded version of $v$.

\begin{lemma}\label{retrievecode}
If $\vdash v : r$ then $\textit{code}\, v = \textit{retrieve}\,(r^\uparrow)\,v$.
\end{lemma}

*}

text {*
  There is also a corresponding decoding function that takes a bitsequence
  and generates back a value. However, since the bitsequences are a ``lossy''
  coding (@{term Seq}s are not coded) the decoding function depends also
  on a regular expression in order to decode values. 




  The idea of the bitcodes is to annotate them to regular expressions and generate values
  incrementally. The bitcodes can be read off from the @{text breg} and then decoded into a value.

  \begin{center}
  \begin{tabular}{lcl}
  @{term breg} & $::=$ & @{term "AZERO"}\\
               & $\mid$ & @{term "AONE bs"}\\
               & $\mid$ & @{term "ACHAR bs c"}\\
               & $\mid$ & @{term "AALTs bs rs"}\\
               & $\mid$ & @{term "ASEQ bs r\<^sub>1 r\<^sub>2"}\\
               & $\mid$ & @{term "ASTAR bs r"}
  \end{tabular}
  \end{center}



  \begin{center}
  \begin{tabular}{lcl}
  @{thm (lhs) retrieve.simps(1)} & $\dn$ & @{thm (rhs) retrieve.simps(1)}\\
  @{thm (lhs) retrieve.simps(2)} & $\dn$ & @{thm (rhs) retrieve.simps(2)}\\
  @{thm (lhs) retrieve.simps(3)} & $\dn$ & @{thm (rhs) retrieve.simps(3)}\\
  @{thm (lhs) better_retrieve(1)} & $\dn$ & @{thm (rhs) better_retrieve(1)}\\
  @{thm (lhs) better_retrieve(2)} & $\dn$ & @{thm (rhs) better_retrieve(2)}\\
  @{thm (lhs) retrieve.simps(6)[of _ "r\<^sub>1" "r\<^sub>2" "v\<^sub>1" "v\<^sub>2"]}
      & $\dn$ & @{thm (rhs) retrieve.simps(6)[of _ "r\<^sub>1" "r\<^sub>2" "v\<^sub>1" "v\<^sub>2"]}\\
  @{thm (lhs) retrieve.simps(7)} & $\dn$ & @{thm (rhs) retrieve.simps(7)}\\
  @{thm (lhs) retrieve.simps(8)} & $\dn$ & @{thm (rhs) retrieve.simps(8)}
  \end{tabular}
  \end{center}


  \begin{theorem}
  @{thm blexer_correctness} 
  \end{theorem}


  bitcoded regexes / decoding / bmkeps
  gets rid of the second phase (only single phase)   
  correctness
*}


section {* Simplification *}

text {*
     Sulzmann \& Lu apply simplification via a fixpoint operation; also does not use erase to filter out duplicates.
  
   not direct correspondence with PDERs, because of example
   problem with retrieve 

   correctness

   
    



   \begin{figure}[t]
  \begin{center}
  \begin{tabular}{c}
  @{thm[mode=Axiom] bs1[of _ "r\<^sub>2"]}\qquad
  @{thm[mode=Axiom] bs2[of _ "r\<^sub>1"]}\qquad
  @{thm[mode=Axiom] bs3[of "bs\<^sub>1" "bs\<^sub>2"]}\\
  @{thm[mode=Rule] bs4[of "r\<^sub>1" "r\<^sub>2" _ "r\<^sub>3"]}\qquad
  @{thm[mode=Rule] bs5[of "r\<^sub>3" "r\<^sub>4" _ "r\<^sub>1"]}\\
  @{thm[mode=Axiom] bs6}\qquad
  @{thm[mode=Axiom] bs7}\\
  @{thm[mode=Rule] bs8[of "rs\<^sub>1" "rs\<^sub>2"]}\\
  %@ { t hm[mode=Axiom] ss1}\qquad
  @{thm[mode=Rule] ss2[of "rs\<^sub>1" "rs\<^sub>2"]}\qquad
  @{thm[mode=Rule] ss3[of "r\<^sub>1" "r\<^sub>2"]}\\
  @{thm[mode=Axiom] ss4}\qquad
  @{thm[mode=Axiom] ss5[of "bs" "rs\<^sub>1" "rs\<^sub>2"]}\\
  @{thm[mode=Rule] ss6[of "r\<^sub>1" "r\<^sub>2" "rs\<^sub>1" "rs\<^sub>2" "rs\<^sub>3"]}\\
  \end{tabular}
  \end{center}
  \caption{???}\label{SimpRewrites}
  \end{figure}
*}

section {* Bound - NO *}

section {* Bounded Regex / Not *}

section {* Conclusion *}

text {*

\cite{AusafDyckhoffUrban2016}

%%\bibliographystyle{plain}
\bibliography{root}
*}

(*<*)
end
(*>*)
author	Christian Urban <christian.urban@kcl.ac.uk>
	Sun, 06 Feb 2022 00:02:04 +0000
changeset 416	57182b36ec01
parent 410	9261d980225d
child 418	41a2a3b63853
permissions	-rw-r--r--