thys2/Journal/Paper.thy~
changeset 382 aef235b965bb
parent 381 0c666a0c57d7
child 383 aa0a2a3f90a0
equal deleted inserted replaced
381:0c666a0c57d7 382:aef235b965bb
     1 (*<*)
       
     2 theory Paper
       
     3 imports 
       
     4    "../Lexer"
       
     5    "../Simplifying" 
       
     6    "../Positions"
       
     7 
       
     8    "../SizeBound" 
       
     9    "HOL-Library.LaTeXsugar"
       
    10 begin
       
    11 
       
        (* Fold rule: rewrites the successor form "Suc 0" back into the numeral 1.
           Useful for tidying up goals/statements before LaTeX output. *)
     12 lemma Suc_0_fold:
        
     13   "Suc 0 = 1"
        
     14 by simp
       
    15 
       
    16 
       
    17 
       
    18 declare [[show_question_marks = false]]
       
    19 
       
        (* LaTeX-only syntax: prints set comprehensions with a boldmath mid-bar
           instead of the plain "|".  NOTE(review): "_CollectIn" uses the literal
           "|e" as separator, unlike "_Collect" — looks like a leftover; confirm
           the intended LaTeX output for bounded comprehensions. *)
     20 syntax (latex output)
        
     21   "_Collect" :: "pttrn => bool => 'a set"              ("(1{_ \<^latex>\<open>\\mbox{\\boldmath$\\mid$}\<close> _})")
        
     22   "_CollectIn" :: "pttrn => 'a set => bool => 'a set"   ("(1{_ \<in> _ |e _})")
        
     23 
        
        (* Negated existential quantifiers printed with \<nexists>. *)
     24 syntax
        
     25   "_Not_Ex" :: "idts \<Rightarrow> bool \<Rightarrow> bool"  ("(3\<nexists>_.a./ _)" [0, 10] 10)
        
     26   "_Not_Ex1" :: "pttrn \<Rightarrow> bool \<Rightarrow> bool"  ("(3\<nexists>!_.a./ _)" [0, 10] 10)
       
    27 
       
    28 
       
        (* Argument-swapped synonyms: der/ders take the character/string first,
           but the paper prints derivatives infix as r\c, so these flip the
           arguments for the notation declarations below.  bders_syn performs no
           swap — presumably bders already takes the regular expression as its
           first argument (TODO confirm against SizeBound). *)
     29 abbreviation 
        
     30   "der_syn r c \<equiv> der c r"
        
     31 
        
     32 abbreviation 
        
     33   "ders_syn r s \<equiv> ders s r"
        
     34 
        
     35   abbreviation 
        
     36   "bder_syn r c \<equiv> bder c r"
        
     37 
        
     38 abbreviation 
        
     39   "bders_syn r s \<equiv> bders r s"
        
     40 
        
     41 
        
        (* "not precedes" on values, printed as a crossed-out \prec below. *)
     42 abbreviation
        
     43   "nprec v1 v2 \<equiv> \<not>(v1 :\<sqsubset>val v2)"
       
    44 
       
    45 
       
    46 
       
    47 
       
        (* LaTeX-only notation for the paper: regular expressions print in the
           usual mathematical syntax (+, \<cdot>, star), values with capitalised
           constructors, and the bit-coded operations with a subscript b.
           Note that length and intlen deliberately share the display "len _";
           der_syn/ders_syn both print as backslash so single-character and
           string derivatives look alike in the paper. *)
     48 notation (latex output)
        
     49   If  ("(\<^latex>\<open>\\textrm{\<close>if\<^latex>\<open>}\<close> (_)/ \<^latex>\<open>\\textrm{\<close>then\<^latex>\<open>}\<close> (_)/ \<^latex>\<open>\\textrm{\<close>else\<^latex>\<open>}\<close> (_))" 10) and
        
     50   Cons ("_\<^latex>\<open>\\mbox{$\\,$}\<close>::\<^latex>\<open>\\mbox{$\\,$}\<close>_" [75,73] 73) and  
        
     51 
        
     52   ZERO ("\<^bold>0" 81) and 
        
     53   ONE ("\<^bold>1" 81) and 
        
     54   CH ("_" [1000] 80) and
        
     55   ALT ("_ + _" [77,77] 78) and
        
     56   SEQ ("_ \<cdot> _" [77,77] 78) and
        
     57   STAR ("_\<^sup>\<star>" [79] 78) and
        
     58   
        
     59   val.Void ("Empty" 78) and
        
     60   val.Char ("Char _" [1000] 78) and
        
     61   val.Left ("Left _" [79] 78) and
        
     62   val.Right ("Right _" [1000] 78) and
        
     63   val.Seq ("Seq _ _" [79,79] 78) and
        
     64   val.Stars ("Stars _" [79] 78) and
        
     65 
        
     66   L ("L'(_')" [10] 78) and
        
     67   LV ("LV _ _" [80,73] 78) and
        
     68   der_syn ("_\\_" [79, 1000] 76) and  
        
     69   ders_syn ("_\\_" [79, 1000] 76) and
        
     70   flat ("|_|" [75] 74) and
        
     71   flats ("|_|" [72] 74) and
        
     72   Sequ ("_ @ _" [78,77] 63) and
        
     73   injval ("inj _ _ _" [79,77,79] 76) and 
        
     74   mkeps ("mkeps _" [79] 76) and 
        
     75   length ("len _" [73] 73) and
        
     76   intlen ("len _" [73] 73) and
        
     77   set ("_" [73] 73) and
        
     78  
        
     79   Prf ("_ : _" [75,75] 75) and
        
     80   Posix ("'(_, _') \<rightarrow> _" [63,75,75] 75) and
        
     81  
        
     82   lexer ("lexer _ _" [78,78] 77) and 
        
     83   F_RIGHT ("F\<^bsub>Right\<^esub> _") and
        
     84   F_LEFT ("F\<^bsub>Left\<^esub> _") and  
        
     85   F_ALT ("F\<^bsub>Alt\<^esub> _ _") and
        
     86   F_SEQ1 ("F\<^bsub>Seq1\<^esub> _ _") and
        
     87   F_SEQ2 ("F\<^bsub>Seq2\<^esub> _ _") and
        
     88   F_SEQ ("F\<^bsub>Seq\<^esub> _ _") and
        
     89   simp_SEQ ("simp\<^bsub>Seq\<^esub> _ _" [1000, 1000] 1) and
        
     90   simp_ALT ("simp\<^bsub>Alt\<^esub> _ _" [1000, 1000] 1) and
        
     91   slexer ("lexer\<^sup>+" 1000) and
        
     92 
        
     93   at ("_\<^latex>\<open>\\mbox{$\\downharpoonleft$}\<close>\<^bsub>_\<^esub>") and
        
     94   lex_list ("_ \<prec>\<^bsub>lex\<^esub> _") and
        
     95   PosOrd ("_ \<prec>\<^bsub>_\<^esub> _" [77,77,77] 77) and
        
     96   PosOrd_ex ("_ \<prec> _" [77,77] 77) and
        
     97   PosOrd_ex_eq ("_ \<^latex>\<open>\\mbox{$\\preccurlyeq$}\<close> _" [77,77] 77) and
        
     98   pflat_len ("\<parallel>_\<parallel>\<^bsub>_\<^esub>") and
        
     99   nprec ("_ \<^latex>\<open>\\mbox{$\\not\\prec$}\<close> _" [77,77] 77) and
        
    100 
        
    101   bder_syn ("_\<^latex>\<open>\\mbox{$\\bbslash$}\<close>_" [79, 1000] 76) and  
        
    102   bders_syn ("_\<^latex>\<open>\\mbox{$\\bbslash$}\<close>_" [79, 1000] 76) and
        
    103   intern ("_\<^latex>\<open>\\mbox{$^\\uparrow$}\<close>" [900] 80) and
        
    104   erase ("_\<^latex>\<open>\\mbox{$^\\downarrow$}\<close>" [1000] 74) and
        
    105   bnullable ("nullable\<^latex>\<open>\\mbox{$_b$}\<close> _" [1000] 80) and
        
    106   bmkeps ("mkeps\<^latex>\<open>\\mbox{$_b$}\<close> _" [1000] 80) and
        
    107   blexer ("lexer\<^latex>\<open>\\mbox{$_b$}\<close> _ _" [77, 77] 80) and
        
    108   code ("code _" [79] 74) and
        
    109 
        
    110   DUMMY ("\<^latex>\<open>\\underline{\\hspace{2mm}}\<close>")
       
   111 
       
   112 
       
        (* Brzozowski matcher: r matches s iff the derivative of r w.r.t. the
           whole string s can match the empty string. *)
    113 definition 
        
    114   "match r s \<equiv> nullable (ders s r)"
       
   115 
       
   116 
       
        (* The only lexical value for (ONE)* and the empty string is Stars []:
           follows by inversion/introduction on the inhabitation relation Prf. *)
    117 lemma LV_STAR_ONE_empty: 
        
    118   shows "LV (STAR ONE) [] = {Stars []}"
        
    119 by(auto simp add: LV_def elim: Prf.cases intro: Prf.intros)
       
   120 
       
   121 
       
   122 
       
   123 (*
       
   124 comments not implemented
       
   125 
       
   126 p9. The condition "not exists s3 s4..." appears often enough (in particular in
       
   127 the proof of Lemma 3) to warrant a definition.
       
   128 
       
   129 *)
       
   130 
       
   131 
       
   132 (*>*)
       
   133 
       
   134 section\<open>Core of the proof\<close>
       
   135 text \<open>
       
   136 This paper builds on previous work by Ausaf and Urban using 
       
    137 bit-coded regular expression derivatives to do lexing that 
       
   138 is both fast and satisfies the POSIX specification.
       
   139 In their work, a bit-coded algorithm introduced by Sulzmann and Lu
       
   140 was formally verified in Isabelle, by a very clever use of
       
   141 flex function and retrieve to carefully mimic the way a value is 
       
    142 built up by the injection function.
       
   143 
       
    144 In their previous work, Ausaf and Urban established the following equality:
       
   145 \begin{lemma}
       
   146 @{thm [mode=IfThen] MAIN_decode}
       
   147 \end{lemma}
       
   148 
       
   149 This lemma establishes a link with the lexer without bit-codes.
       
   150 
       
   151 With it we get the correctness of bit-coded algorithm.
       
   152 \begin{lemma}
       
   153 @{thm [mode=IfThen] blexer_correctness}
       
   154 \end{lemma}
       
   155 
       
   156 However what is not certain is whether we can add simplification
       
   157 to the bit-coded algorithm, without breaking the correct lexing output.
       
   158 
       
   159 
       
   160 The reason that we do need to add a simplification phase
       
   161 after each derivative step of  $\textit{blexer}$ is
       
   162 because it produces intermediate
       
   163 regular expressions that can grow exponentially.
       
   164 For example, the regular expression $(a+aa)^*$ after taking
       
   165 derivative against just 10 $a$s will have size 8192.
       
   166 
       
   167 %TODO: add figure for this?
       
   168 
       
   169 
       
   170 Therefore, we insert a simplification phase
       
    171 after each derivative step, as defined below:
       
   172 \begin{lemma}
       
   173 @{thm blexer_simp_def}
       
   174 \end{lemma}
       
   175 
       
   176 The simplification function is given as follows:
       
   177 
       
   178 \begin{center}
       
   179   \begin{tabular}{lcl}
       
   180   @{thm (lhs) bsimp.simps(1)[of  "bs" "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bsimp.simps(1)[of  "bs" "r\<^sub>1" "r\<^sub>2"]}\\
       
   181   @{thm (lhs) bsimp.simps(2)} & $\dn$ & @{thm (rhs) bsimp.simps(2)}\\
       
   182   @{thm (lhs) bsimp.simps(3)} & $\dn$ & @{thm (rhs) bsimp.simps(3)}\\
       
   183 
       
   184 \end{tabular}
       
   185 \end{center}
       
   186 
       
   187 And the two helper functions are:
       
   188 \begin{center}
       
   189   \begin{tabular}{lcl}
       
    190   @{thm (lhs) bsimp_AALTs.simps(1)} & $\dn$ & @{thm (rhs) bsimp_AALTs.simps(1)}\\
        
    191   @{thm (lhs) bsimp_AALTs.simps(2)[of  "bs\<^sub>1" "r" ]} & $\dn$ & @{thm (rhs) bsimp_AALTs.simps(2)[of  "bs\<^sub>1" "r" ]}\\
        
    192   @{thm (lhs) bsimp_AALTs.simps(3)} & $\dn$ & @{thm (rhs) bsimp_AALTs.simps(3)}\\
       
   193 
       
   194 \end{tabular}
       
   195 \end{center}
       
   196 
       
   197 
       
   198 This might sound trivial in the case of producing a YES/NO answer,
       
   199 but once we require a lexing output to be produced (which is required
       
   200 in applications like compiler front-end, malicious attack domain extraction, 
       
   201 etc.), it is not straightforward if we still extract what is needed according
       
   202 to the POSIX standard.
       
   203 
       
   204 
       
   205 
       
   206 
       
   207 
       
   208 By simplification, we mean specifically the following rules:
       
   209 
       
   210 \begin{center}
       
   211   \begin{tabular}{lcl}
       
   212   @{thm[mode=Axiom] rrewrite.intros(1)[of "bs" "r\<^sub>2"]}\\
       
   213   @{thm[mode=Axiom] rrewrite.intros(2)[of "bs" "r\<^sub>1"]}\\
       
   214   @{thm[mode=Axiom] rrewrite.intros(3)[of  "bs" "bs\<^sub>1" "r\<^sub>1"]}\\
       
   215   @{thm[mode=Rule] rrewrite.intros(4)[of  "r\<^sub>1" "r\<^sub>2" "bs" "r\<^sub>3"]}\\
       
   216   @{thm[mode=Rule] rrewrite.intros(5)[of "r\<^sub>3" "r\<^sub>4" "bs" "r\<^sub>1"]}\\
       
   217   @{thm[mode=Rule] rrewrite.intros(6)[of "r" "r'" "bs" "rs\<^sub>1" "rs\<^sub>2"]}\\
       
   218   @{thm[mode=Axiom] rrewrite.intros(7)[of "bs" "rs\<^sub>a" "rs\<^sub>b"]}\\
       
   219   @{thm[mode=Axiom] rrewrite.intros(8)[of "bs" "rs\<^sub>a" "bs\<^sub>1" "rs\<^sub>1" "rs\<^sub>b"]}\\
       
   220   @{thm[mode=Axiom] rrewrite.intros(9)[of "bs" "bs\<^sub>1" "rs"]}\\
       
   221   @{thm[mode=Axiom] rrewrite.intros(10)[of "bs" ]}\\
       
   222   @{thm[mode=Axiom] rrewrite.intros(11)[of "bs" "r\<^sub>1"]}\\
       
   223   @{thm[mode=Rule] rrewrite.intros(12)[of "a\<^sub>1" "a\<^sub>2" "bs" "rs\<^sub>a" "rs\<^sub>b" "rs\<^sub>c"]}\\
       
   224 
       
   225   \end{tabular}
       
   226 \end{center}
       
   227 
       
   228 
       
   229 And these can be made compact by the following simplification function:
       
   230 
       
    231 where the function $\mathit{bsimp\_AALTs}$ is one of the helper functions given above.
       
   232 
       
   233 The core idea of the proof is that two regular expressions,
       
   234 if "isomorphic" up to a finite number of rewrite steps, will
       
   235 remain "isomorphic" when we take the same sequence of
       
   236 derivatives on both of them.
       
   237 This can be expressed by the following rewrite relation lemma:
       
   238 \begin{lemma}
       
   239 @{thm [mode=IfThen] central}
       
   240 \end{lemma}
       
   241 
       
   242 This isomorphic relation implies a property that leads to the 
       
   243 correctness result: 
       
   244 if two (nullable) regular expressions are "rewritable" in many steps
       
   245 from one another, 
       
   246 then a call to function $\textit{bmkeps}$ gives the same
       
    247 bit-sequence:
       
   248 \begin{lemma}
       
   249 @{thm [mode=IfThen] rewrites_bmkeps}
       
   250 \end{lemma}
       
   251 
       
   252 Given the same bit-sequence, the decode function
       
   253 will give out the same value, which is the output
       
   254 of both lexers:
       
   255 \begin{lemma}
       
   256 @{thm blexer_def}
       
   257 \end{lemma}
       
   258 
       
   259 \begin{lemma}
       
   260 @{thm blexer_simp_def}
       
   261 \end{lemma}
       
   262 
       
   263 And that yields the correctness result:
       
   264 \begin{lemma}
       
   265 @{thm blexersimp_correctness}
       
   266 \end{lemma}
       
   267 
       
    268 The nice thing about the above
       
   269 \<close>
       
   270 
       
   271 
       
   272 
       
   273 section \<open>Introduction\<close>
       
   274 
       
   275 
       
   276 text \<open>
       
   277 
       
   278 
       
   279 Brzozowski \cite{Brzozowski1964} introduced the notion of the {\em
       
   280 derivative} @{term "der c r"} of a regular expression \<open>r\<close> w.r.t.\
       
   281 a character~\<open>c\<close>, and showed that it gave a simple solution to the
       
   282 problem of matching a string @{term s} with a regular expression @{term
       
   283 r}: if the derivative of @{term r} w.r.t.\ (in succession) all the
       
   284 characters of the string matches the empty string, then @{term r}
       
   285 matches @{term s} (and {\em vice versa}). The derivative has the
       
   286 property (which may almost be regarded as its specification) that, for
       
   287 every string @{term s} and regular expression @{term r} and character
       
   288 @{term c}, one has @{term "cs \<in> L(r)"} if and only if \mbox{@{term "s \<in> L(der c r)"}}. 
       
   289 The beauty of Brzozowski's derivatives is that
       
   290 they are neatly expressible in any functional language, and easily
       
   291 definable and reasoned about in theorem provers---the definitions just
       
   292 consist of inductive datatypes and simple recursive functions. A
       
   293 mechanised correctness proof of Brzozowski's matcher in for example HOL4
       
   294 has been mentioned by Owens and Slind~\cite{Owens2008}. Another one in
       
   295 Isabelle/HOL is part of the work by Krauss and Nipkow \cite{Krauss2011}.
       
   296 And another one in Coq is given by Coquand and Siles \cite{Coquand2012}.
       
   297 
       
   298 If a regular expression matches a string, then in general there is more
       
   299 than one way of how the string is matched. There are two commonly used
       
   300 disambiguation strategies to generate a unique answer: one is called
       
   301 GREEDY matching \cite{Frisch2004} and the other is POSIX
       
   302 matching~\cite{POSIX,Kuklewicz,OkuiSuzuki2010,Sulzmann2014,Vansummeren2006}.
       
   303 For example consider the string @{term xy} and the regular expression
       
   304 \mbox{@{term "STAR (ALT (ALT x y) xy)"}}. Either the string can be
       
   305 matched in two `iterations' by the single letter-regular expressions
       
   306 @{term x} and @{term y}, or directly in one iteration by @{term xy}. The
       
   307 first case corresponds to GREEDY matching, which first matches with the
       
   308 left-most symbol and only matches the next symbol in case of a mismatch
       
   309 (this is greedy in the sense of preferring instant gratification to
       
   310 delayed repletion). The second case is POSIX matching, which prefers the
       
   311 longest match.
       
   312 
       
   313 In the context of lexing, where an input string needs to be split up
       
   314 into a sequence of tokens, POSIX is the more natural disambiguation
       
   315 strategy for what programmers consider basic syntactic building blocks
       
   316 in their programs.  These building blocks are often specified by some
       
   317 regular expressions, say \<open>r\<^bsub>key\<^esub>\<close> and \<open>r\<^bsub>id\<^esub>\<close> for recognising keywords and identifiers,
       
   318 respectively. There are a few underlying (informal) rules behind
       
   319 tokenising a string in a POSIX \cite{POSIX} fashion:
       
   320 
       
   321 \begin{itemize} 
       
   322 \item[$\bullet$] \emph{The Longest Match Rule} (or \emph{``{M}aximal {M}unch {R}ule''}):
       
   323 The longest initial substring matched by any regular expression is taken as
       
   324 next token.\smallskip
       
   325 
       
   326 \item[$\bullet$] \emph{Priority Rule:}
       
   327 For a particular longest initial substring, the first (leftmost) regular expression
       
   328 that can match determines the token.\smallskip
       
   329 
       
   330 \item[$\bullet$] \emph{Star Rule:} A subexpression repeated by ${}^\star$ shall 
       
   331 not match an empty string unless this is the only match for the repetition.\smallskip
       
   332 
       
   333 \item[$\bullet$] \emph{Empty String Rule:} An empty string shall be considered to 
       
   334 be longer than no match at all.
       
   335 \end{itemize}
       
   336 
       
   337 \noindent Consider for example a regular expression \<open>r\<^bsub>key\<^esub>\<close> for recognising keywords such as \<open>if\<close>,
       
   338 \<open>then\<close> and so on; and \<open>r\<^bsub>id\<^esub>\<close>
       
   339 recognising identifiers (say, a single character followed by
       
   340 characters or numbers).  Then we can form the regular expression
       
   341 \<open>(r\<^bsub>key\<^esub> + r\<^bsub>id\<^esub>)\<^sup>\<star>\<close>
       
   342 and use POSIX matching to tokenise strings, say \<open>iffoo\<close> and
       
   343 \<open>if\<close>.  For \<open>iffoo\<close> we obtain by the Longest Match Rule
       
   344 a single identifier token, not a keyword followed by an
       
   345 identifier. For \<open>if\<close> we obtain by the Priority Rule a keyword
       
   346 token, not an identifier token---even if \<open>r\<^bsub>id\<^esub>\<close>
       
   347 matches also. By the Star Rule we know \<open>(r\<^bsub>key\<^esub> +
       
   348 r\<^bsub>id\<^esub>)\<^sup>\<star>\<close> matches \<open>iffoo\<close>,
       
   349 respectively \<open>if\<close>, in exactly one `iteration' of the star. The
       
   350 Empty String Rule is for cases where, for example, the regular expression 
       
   351 \<open>(a\<^sup>\<star>)\<^sup>\<star>\<close> matches against the
       
   352 string \<open>bc\<close>. Then the longest initial matched substring is the
       
   353 empty string, which is matched by both the whole regular expression
       
   354 and the parenthesised subexpression.
       
   355 
       
   356 
       
   357 One limitation of Brzozowski's matcher is that it only generates a
       
   358 YES/NO answer for whether a string is being matched by a regular
       
   359 expression.  Sulzmann and Lu~\cite{Sulzmann2014} extended this matcher
       
   360 to allow generation not just of a YES/NO answer but of an actual
       
   361 matching, called a [lexical] {\em value}. Assuming a regular
       
   362 expression matches a string, values encode the information of
       
   363 \emph{how} the string is matched by the regular expression---that is,
       
   364 which part of the string is matched by which part of the regular
       
   365 expression. For this consider again the string \<open>xy\<close> and
       
   366 the regular expression \mbox{\<open>(x + (y + xy))\<^sup>\<star>\<close>}
       
   367 (this time fully parenthesised). We can view this regular expression
       
   368 as tree and if the string \<open>xy\<close> is matched by two Star
       
   369 `iterations', then the \<open>x\<close> is matched by the left-most
       
   370 alternative in this tree and the \<open>y\<close> by the right-left alternative. This
       
   371 suggests to record this matching as
       
   372 
       
   373 \begin{center}
       
   374 @{term "Stars [Left(Char x), Right(Left(Char y))]"}
       
   375 \end{center}
       
   376 
       
   377 \noindent where @{const Stars}, \<open>Left\<close>, \<open>Right\<close> and \<open>Char\<close> are constructors for values. \<open>Stars\<close> records how many
       
   378 iterations were used; \<open>Left\<close>, respectively \<open>Right\<close>, which
       
   379 alternative is used. This `tree view' leads naturally to the idea that
       
   380 regular expressions act as types and values as inhabiting those types
       
   381 (see, for example, \cite{HosoyaVouillonPierce2005}).  The value for
       
   382 matching \<open>xy\<close> in a single `iteration', i.e.~the POSIX value,
       
   383 would look as follows
       
   384 
       
   385 \begin{center}
       
   386 @{term "Stars [Seq (Char x) (Char y)]"}
       
   387 \end{center}
       
   388 
       
   389 \noindent where @{const Stars} has only a single-element list for the
       
   390 single iteration and @{const Seq} indicates that @{term xy} is matched 
       
   391 by a sequence regular expression.
       
   392 
       
   393 %, which we will in what follows 
       
   394 %write more formally as @{term "SEQ x y"}.
       
   395 
       
   396 
       
   397 Sulzmann and Lu give a simple algorithm to calculate a value that
       
   398 appears to be the value associated with POSIX matching.  The challenge
       
   399 then is to specify that value, in an algorithm-independent fashion,
       
   400 and to show that Sulzmann and Lu's derivative-based algorithm does
       
   401 indeed calculate a value that is correct according to the
       
   402 specification.  The answer given by Sulzmann and Lu
       
   403 \cite{Sulzmann2014} is to define a relation (called an ``order
       
   404 relation'') on the set of values of @{term r}, and to show that (once
       
   405 a string to be matched is chosen) there is a maximum element and that
       
   406 it is computed by their derivative-based algorithm. This proof idea is
       
   407 inspired by work of Frisch and Cardelli \cite{Frisch2004} on a GREEDY
       
   408 regular expression matching algorithm. However, we were not able to
       
   409 establish transitivity and totality for the ``order relation'' by
       
   410 Sulzmann and Lu.  There are some inherent problems with their approach
       
   411 (of which some of the proofs are not published in
       
   412 \cite{Sulzmann2014}); perhaps more importantly, we give in this paper
       
   413 a simple inductive (and algorithm-independent) definition of what we
       
   414 call being a {\em POSIX value} for a regular expression @{term r} and
       
   415 a string @{term s}; we show that the algorithm by Sulzmann and Lu
       
   416 computes such a value and that such a value is unique. Our proofs are
       
   417 both done by hand and checked in Isabelle/HOL.  The experience of
       
   418 doing our proofs has been that this mechanical checking was absolutely
       
   419 essential: this subject area has hidden snares. This was also noted by
       
   420 Kuklewicz \cite{Kuklewicz} who found that nearly all POSIX matching
       
   421 implementations are ``buggy'' \cite[Page 203]{Sulzmann2014} and by
       
   422 Grathwohl et al \cite[Page 36]{CrashCourse2014} who wrote:
       
   423 
       
   424 \begin{quote}
       
   425 \it{}``The POSIX strategy is more complicated than the greedy because of 
       
   426 the dependence on information about the length of matched strings in the 
       
   427 various subexpressions.''
       
   428 \end{quote}
       
   429 
       
   430 
       
   431 
       
   432 \noindent {\bf Contributions:} We have implemented in Isabelle/HOL the
       
   433 derivative-based regular expression matching algorithm of
       
   434 Sulzmann and Lu \cite{Sulzmann2014}. We have proved the correctness of this
       
   435 algorithm according to our specification of what a POSIX value is (inspired
       
   436 by work of Vansummeren \cite{Vansummeren2006}). Sulzmann
       
   437 and Lu sketch in \cite{Sulzmann2014} an informal correctness proof: but to
       
   438 us it contains unfillable gaps.\footnote{An extended version of
       
   439 \cite{Sulzmann2014} is available at the website of its first author; this
       
   440 extended version already includes remarks in the appendix that their
       
   441 informal proof contains gaps, and possible fixes are not fully worked out.}
       
   442 Our specification of a POSIX value consists of a simple inductive definition
       
   443 that given a string and a regular expression uniquely determines this value.
       
   444 We also show that our definition is equivalent to an ordering 
       
   445 of values based on positions by Okui and Suzuki \cite{OkuiSuzuki2010}.
       
   446 
       
   447 %Derivatives as calculated by Brzozowski's method are usually more complex
       
   448 %regular expressions than the initial one; various optimisations are
       
   449 %possible. We prove the correctness when simplifications of @{term "ALT ZERO r"}, 
       
   450 %@{term "ALT r ZERO"}, @{term "SEQ ONE r"} and @{term "SEQ r ONE"} to
       
   451 %@{term r} are applied. 
       
   452 
       
    453 We extend our results to the bit-coded version of the algorithm.
       
   454 
       
   455 \<close>
       
   456 
       
   457 
       
   458 
       
   459 
       
   460 section \<open>Preliminaries\<close>
       
   461 
       
   462 text \<open>\noindent Strings in Isabelle/HOL are lists of characters with
       
   463 the empty string being represented by the empty list, written @{term
       
   464 "[]"}, and list-cons being written as @{term "DUMMY # DUMMY"}. Often
       
   465 we use the usual bracket notation for lists also for strings; for
       
   466 example a string consisting of just a single character @{term c} is
       
   467 written @{term "[c]"}. We use the usual definitions for 
       
   468 \emph{prefixes} and \emph{strict prefixes} of strings.  By using the
       
   469 type @{type char} for characters we have a supply of finitely many
       
   470 characters roughly corresponding to the ASCII character set. Regular
       
   471 expressions are defined as usual as the elements of the following
       
   472 inductive datatype:
       
   473 
       
   474   \begin{center}
       
   475   \<open>r :=\<close>
       
   476   @{const "ZERO"} $\mid$
       
   477   @{const "ONE"} $\mid$
       
   478   @{term "CH c"} $\mid$
       
   479   @{term "ALT r\<^sub>1 r\<^sub>2"} $\mid$
       
   480   @{term "SEQ r\<^sub>1 r\<^sub>2"} $\mid$
       
   481   @{term "STAR r"} 
       
   482   \end{center}
       
   483 
       
   484   \noindent where @{const ZERO} stands for the regular expression that does
       
   485   not match any string, @{const ONE} for the regular expression that matches
       
   486   only the empty string and @{term c} for matching a character literal. The
       
   487   language of a regular expression is also defined as usual by the
       
   488   recursive function @{term L} with the six clauses:
       
   489 
       
   490   \begin{center}
       
   491   \begin{tabular}{l@ {\hspace{4mm}}rcl}
       
   492   \textit{(1)} & @{thm (lhs) L.simps(1)} & $\dn$ & @{thm (rhs) L.simps(1)}\\
       
   493   \textit{(2)} & @{thm (lhs) L.simps(2)} & $\dn$ & @{thm (rhs) L.simps(2)}\\
       
   494   \textit{(3)} & @{thm (lhs) L.simps(3)} & $\dn$ & @{thm (rhs) L.simps(3)}\\
       
   495   \textit{(4)} & @{thm (lhs) L.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & 
       
   496         @{thm (rhs) L.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
       
   497   \textit{(5)} & @{thm (lhs) L.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & 
       
   498         @{thm (rhs) L.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
       
   499   \textit{(6)} & @{thm (lhs) L.simps(6)} & $\dn$ & @{thm (rhs) L.simps(6)}\\
       
   500   \end{tabular}
       
   501   \end{center}
       
   502   
       
   503   \noindent In clause \textit{(4)} we use the operation @{term "DUMMY ;;
       
   504   DUMMY"} for the concatenation of two languages (it is also list-append for
       
   505   strings). We use the star-notation for regular expressions and for
       
   506   languages (in the last clause above). The star for languages is defined
       
   507   inductively by two clauses: \<open>(i)\<close> the empty string being in
       
   508   the star of a language and \<open>(ii)\<close> if @{term "s\<^sub>1"} is in a
       
   509   language and @{term "s\<^sub>2"} in the star of this language, then also @{term
       
   510   "s\<^sub>1 @ s\<^sub>2"} is in the star of this language. It will also be convenient
       
   511   to use the following notion of a \emph{semantic derivative} (or \emph{left
       
   512   quotient}) of a language defined as
       
   513   %
       
   514   \begin{center}
       
   515   @{thm Der_def}\;.
       
   516   \end{center}
       
   517  
       
   518   \noindent
       
   519   For semantic derivatives we have the following equations (for example
       
   520   mechanically proved in \cite{Krauss2011}):
       
   521   %
       
   522   \begin{equation}\label{SemDer}
       
   523   \begin{array}{lcl}
       
   524   @{thm (lhs) Der_null}  & \dn & @{thm (rhs) Der_null}\\
       
   525   @{thm (lhs) Der_empty}  & \dn & @{thm (rhs) Der_empty}\\
       
   526   @{thm (lhs) Der_char}  & \dn & @{thm (rhs) Der_char}\\
       
   527   @{thm (lhs) Der_union}  & \dn & @{thm (rhs) Der_union}\\
       
   528   @{thm (lhs) Der_Sequ}  & \dn & @{thm (rhs) Der_Sequ}\\
       
   529   @{thm (lhs) Der_star}  & \dn & @{thm (rhs) Der_star}
       
   530   \end{array}
       
   531   \end{equation}
       
   532 
       
   533 
       
   534   \noindent \emph{\Brz's derivatives} of regular expressions
       
   535   \cite{Brzozowski1964} can be easily defined by two recursive functions:
       
   536   the first is from regular expressions to booleans (implementing a test
       
   537   when a regular expression can match the empty string), and the second
       
   538   takes a regular expression and a character to a (derivative) regular
       
   539   expression:
       
   540 
       
   541   \begin{center}
       
   542   \begin{tabular}{lcl}
       
   543   @{thm (lhs) nullable.simps(1)} & $\dn$ & @{thm (rhs) nullable.simps(1)}\\
       
   544   @{thm (lhs) nullable.simps(2)} & $\dn$ & @{thm (rhs) nullable.simps(2)}\\
       
   545   @{thm (lhs) nullable.simps(3)} & $\dn$ & @{thm (rhs) nullable.simps(3)}\\
       
   546   @{thm (lhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
       
   547   @{thm (lhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
       
   548   @{thm (lhs) nullable.simps(6)} & $\dn$ & @{thm (rhs) nullable.simps(6)}\medskip\\
       
   549 
       
   550 %  \end{tabular}
       
   551 %  \end{center}
       
   552 
       
   553 %  \begin{center}
       
   554 %  \begin{tabular}{lcl}
       
   555 
       
   556   @{thm (lhs) der.simps(1)} & $\dn$ & @{thm (rhs) der.simps(1)}\\
       
   557   @{thm (lhs) der.simps(2)} & $\dn$ & @{thm (rhs) der.simps(2)}\\
       
   558   @{thm (lhs) der.simps(3)} & $\dn$ & @{thm (rhs) der.simps(3)}\\
       
   559   @{thm (lhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]}\\
       
   560   @{thm (lhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]}\\
       
   561   @{thm (lhs) der.simps(6)} & $\dn$ & @{thm (rhs) der.simps(6)}
       
   562   \end{tabular}
       
   563   \end{center}
       
   564  
       
   565   \noindent
       
   566   We may extend this definition to give derivatives w.r.t.~strings:
       
   567 
       
   568   \begin{center}
       
   569   \begin{tabular}{lcl}
       
   570   @{thm (lhs) ders.simps(1)} & $\dn$ & @{thm (rhs) ders.simps(1)}\\
       
   571   @{thm (lhs) ders.simps(2)} & $\dn$ & @{thm (rhs) ders.simps(2)}\\
       
   572   \end{tabular}
       
   573   \end{center}
       
   574 
       
   575   \noindent Given the equations in \eqref{SemDer}, it is a relatively easy
       
   576   exercise in mechanical reasoning to establish that
       
   577 
       
   578   \begin{proposition}\label{derprop}\mbox{}\\ 
       
   579   \begin{tabular}{ll}
       
   580   \textit{(1)} & @{thm (lhs) nullable_correctness} if and only if
       
   581   @{thm (rhs) nullable_correctness}, and \\ 
       
   582   \textit{(2)} & @{thm[mode=IfThen] der_correctness}.
       
   583   \end{tabular}
       
   584   \end{proposition}
       
   585 
       
   586   \noindent With this in place it is also very routine to prove that the
       
   587   regular expression matcher defined as
       
   588   %
       
   589   \begin{center}
       
   590   @{thm match_def}
       
   591   \end{center}
       
   592 
       
   593   \noindent gives a positive answer if and only if @{term "s \<in> L r"}.
       
   594   Consequently, this regular expression matching algorithm satisfies the
       
   595   usual specification for regular expression matching. While the matcher
       
   596   above calculates a provably correct YES/NO answer for whether a regular
       
   597   expression matches a string or not, the novel idea of Sulzmann and Lu
       
   598   \cite{Sulzmann2014} is to append another phase to this algorithm in order
       
   599   to calculate a [lexical] value. We will explain the details next.
       
   600 
       
   601 \<close>
       
   602 
       
   603 section \<open>POSIX Regular Expression Matching\label{posixsec}\<close>
       
   604 
       
   605 text \<open>
       
   606 
       
   607   There have been many previous works that use values for encoding 
       
   608   \emph{how} a regular expression matches a string.
       
   609   The clever idea by Sulzmann and Lu \cite{Sulzmann2014} is to 
       
   610   define a function on values that mirrors (but inverts) the
       
   611   construction of the derivative on regular expressions. \emph{Values}
       
   612   are defined as the inductive datatype
       
   613 
       
   614   \begin{center}
       
   615   \<open>v :=\<close>
       
   616   @{const "Void"} $\mid$
       
   617   @{term "val.Char c"} $\mid$
       
   618   @{term "Left v"} $\mid$
       
   619   @{term "Right v"} $\mid$
       
   620   @{term "Seq v\<^sub>1 v\<^sub>2"} $\mid$ 
       
   621   @{term "Stars vs"} 
       
   622   \end{center}  
       
   623 
       
   624   \noindent where we use @{term vs} to stand for a list of
       
   625   values. (This is similar to the approach taken by Frisch and
       
   626   Cardelli for GREEDY matching \cite{Frisch2004}, and Sulzmann and Lu
       
   627   for POSIX matching \cite{Sulzmann2014}). The string underlying a
       
   628   value can be calculated by the @{const flat} function, written
       
   629   @{term "flat DUMMY"} and defined as:
       
   630 
       
   631   \begin{center}
       
   632   \begin{tabular}[t]{lcl}
       
   633   @{thm (lhs) flat.simps(1)} & $\dn$ & @{thm (rhs) flat.simps(1)}\\
       
   634   @{thm (lhs) flat.simps(2)} & $\dn$ & @{thm (rhs) flat.simps(2)}\\
       
   635   @{thm (lhs) flat.simps(3)} & $\dn$ & @{thm (rhs) flat.simps(3)}\\
       
   636   @{thm (lhs) flat.simps(4)} & $\dn$ & @{thm (rhs) flat.simps(4)}
       
   637   \end{tabular}\hspace{14mm}
       
   638   \begin{tabular}[t]{lcl}
       
   639   @{thm (lhs) flat.simps(5)[of "v\<^sub>1" "v\<^sub>2"]} & $\dn$ & @{thm (rhs) flat.simps(5)[of "v\<^sub>1" "v\<^sub>2"]}\\
       
   640   @{thm (lhs) flat.simps(6)} & $\dn$ & @{thm (rhs) flat.simps(6)}\\
       
   641   @{thm (lhs) flat.simps(7)} & $\dn$ & @{thm (rhs) flat.simps(7)}\\
       
   642   \end{tabular}
       
   643   \end{center}
       
   644 
       
   645   \noindent We will sometimes refer to the underlying string of a
       
   646   value as \emph{flattened value}.  We will also overload our notation and 
       
   647   use @{term "flats vs"} for flattening a list of values and concatenating
       
   648   the resulting strings.
       
   649   
       
   650   Sulzmann and Lu define
       
   651   inductively an \emph{inhabitation relation} that associates values to
       
   652   regular expressions. We define this relation as
       
   653   follows:\footnote{Note that the rule for @{term Stars} differs from
       
   654   our earlier paper \cite{AusafDyckhoffUrban2016}. There we used the
       
   655   original definition by Sulzmann and Lu which does not require that
       
   656   the values @{term "v \<in> set vs"} flatten to a non-empty
       
   657   string. The reason for introducing the more restricted version of
       
   658   lexical values is convenience later on when reasoning about an
       
   659   ordering relation for values.}
       
   660 
       
   661   \begin{center}
       
   662   \begin{tabular}{c@ {\hspace{12mm}}c}\label{prfintros}
       
   663   \\[-8mm]
       
   664   @{thm[mode=Axiom] Prf.intros(4)} & 
       
   665   @{thm[mode=Axiom] Prf.intros(5)[of "c"]}\\[4mm]
       
   666   @{thm[mode=Rule] Prf.intros(2)[of "v\<^sub>1" "r\<^sub>1" "r\<^sub>2"]} &
       
   667   @{thm[mode=Rule] Prf.intros(3)[of "v\<^sub>2" "r\<^sub>1" "r\<^sub>2"]}\\[4mm]
       
   668   @{thm[mode=Rule] Prf.intros(1)[of "v\<^sub>1" "r\<^sub>1" "v\<^sub>2" "r\<^sub>2"]}  &
       
   669   @{thm[mode=Rule] Prf.intros(6)[of "vs"]}
       
   670   \end{tabular}
       
   671   \end{center}
       
   672 
       
   673   \noindent where in the clause for @{const "Stars"} we use the
       
   674   notation @{term "v \<in> set vs"} for indicating that \<open>v\<close> is a
       
   675   member in the list \<open>vs\<close>.  We require in this rule that every
       
   676   value in @{term vs} flattens to a non-empty string. The idea is that
       
   677   @{term "Stars"}-values satisfy the informal Star Rule (see Introduction)
       
   678   where the $^\star$ does not match the empty string unless this is
       
   679   the only match for the repetition.  Note also that no values are
       
   680   associated with the regular expression @{term ZERO}, and that the
       
   681   only value associated with the regular expression @{term ONE} is
       
   682   @{term Void}.  It is routine to establish how values ``inhabiting''
       
   683   a regular expression correspond to the language of a regular
       
   684   expression, namely
       
   685 
       
   686   \begin{proposition}\label{inhabs}
       
   687   @{thm L_flat_Prf}
       
   688   \end{proposition}
       
   689 
       
   690   \noindent
       
   691   Given a regular expression \<open>r\<close> and a string \<open>s\<close>, we define the 
       
   692   set of all \emph{Lexical Values} inhabited by \<open>r\<close> with the underlying string 
       
   693   being \<open>s\<close>:\footnote{Okui and Suzuki refer to our lexical values 
       
   694   as \emph{canonical values} in \cite{OkuiSuzuki2010}. The notion of \emph{non-problematic
       
   695   values} by Cardelli and Frisch \cite{Frisch2004} is related, but not identical
       
   696   to our lexical values.}
       
   697   
       
   698   \begin{center}
       
   699   @{thm LV_def}
       
   700   \end{center}
       
   701 
       
   702   \noindent The main property of @{term "LV r s"} is that it is always finite.
       
   703 
       
   704   \begin{proposition}
       
   705   @{thm LV_finite}
       
   706   \end{proposition}
       
   707 
       
   708   \noindent This finiteness property does not hold in general if we
       
   709   remove the side-condition about @{term "flat v \<noteq> []"} in the
       
   710   @{term Stars}-rule above. For example using Sulzmann and Lu's
       
   711   less restrictive definition, @{term "LV (STAR ONE) []"} would contain
       
   712   infinitely many values, but according to our more restricted
       
   713   definition only a single value, namely @{thm LV_STAR_ONE_empty}.
       
   714 
       
   715   If a regular expression \<open>r\<close> matches a string \<open>s\<close>, then
       
   716   generally the set @{term "LV r s"} is not just a singleton set.  In
       
   717   case of POSIX matching the problem is to calculate the unique lexical value
       
   718   that satisfies the (informal) POSIX rules from the Introduction.
       
   719   Graphically the POSIX value calculation algorithm by Sulzmann and Lu
       
   720   can be illustrated by the picture in Figure~\ref{Sulz} where the
       
   721   path from the left to the right involving @{term
       
   722   derivatives}/@{const nullable} is the first phase of the algorithm
       
   723   (calculating successive \Brz's derivatives) and @{const
       
   724   mkeps}/\<open>inj\<close>, the path from right to left, the second
       
   725   phase. This picture shows the steps required when a regular
       
   726   expression, say \<open>r\<^sub>1\<close>, matches the string @{term
       
   727   "[a,b,c]"}. We first build the three derivatives (according to
       
   728   @{term a}, @{term b} and @{term c}). We then use @{const nullable}
       
   729   to find out whether the resulting derivative regular expression
       
   730   @{term "r\<^sub>4"} can match the empty string. If yes, we call the
       
   731   function @{const mkeps} that produces a value @{term "v\<^sub>4"}
       
   732   for how @{term "r\<^sub>4"} can match the empty string (taking into
       
   733   account the POSIX constraints in case there are several ways). This
       
   734   function is defined by the clauses:
       
   735 
       
   736 \begin{figure}[t]
       
   737 \begin{center}
       
   738 \begin{tikzpicture}[scale=2,node distance=1.3cm,
       
   739                     every node/.style={minimum size=6mm}]
       
   740 \node (r1)  {@{term "r\<^sub>1"}};
       
   741 \node (r2) [right=of r1]{@{term "r\<^sub>2"}};
       
   742 \draw[->,line width=1mm](r1)--(r2) node[above,midway] {@{term "der a DUMMY"}};
       
   743 \node (r3) [right=of r2]{@{term "r\<^sub>3"}};
       
   744 \draw[->,line width=1mm](r2)--(r3) node[above,midway] {@{term "der b DUMMY"}};
       
   745 \node (r4) [right=of r3]{@{term "r\<^sub>4"}};
       
   746 \draw[->,line width=1mm](r3)--(r4) node[above,midway] {@{term "der c DUMMY"}};
       
   747 \draw (r4) node[anchor=west] {\;\raisebox{3mm}{@{term nullable}}};
       
   748 \node (v4) [below=of r4]{@{term "v\<^sub>4"}};
       
   749 \draw[->,line width=1mm](r4) -- (v4);
       
   750 \node (v3) [left=of v4] {@{term "v\<^sub>3"}};
       
   751 \draw[->,line width=1mm](v4)--(v3) node[below,midway] {\<open>inj r\<^sub>3 c\<close>};
       
   752 \node (v2) [left=of v3]{@{term "v\<^sub>2"}};
       
   753 \draw[->,line width=1mm](v3)--(v2) node[below,midway] {\<open>inj r\<^sub>2 b\<close>};
       
   754 \node (v1) [left=of v2] {@{term "v\<^sub>1"}};
       
   755 \draw[->,line width=1mm](v2)--(v1) node[below,midway] {\<open>inj r\<^sub>1 a\<close>};
       
   756 \draw (r4) node[anchor=north west] {\;\raisebox{-8mm}{@{term "mkeps"}}};
       
   757 \end{tikzpicture}
       
   758 \end{center}
       
   759 \mbox{}\\[-13mm]
       
   760 
       
   761 \caption{The two phases of the algorithm by Sulzmann \& Lu \cite{Sulzmann2014},
       
   762 matching the string @{term "[a,b,c]"}. The first phase (the arrows from 
       
   763 left to right) is \Brz's matcher building successive derivatives. If the 
       
   764 last regular expression is @{term nullable}, then the functions of the 
       
   765 second phase are called (the top-down and right-to-left arrows): first 
       
   766 @{term mkeps} calculates a value @{term "v\<^sub>4"} witnessing
       
   767 how the empty string has been recognised by @{term "r\<^sub>4"}. After
       
   768 that the function @{term inj} ``injects back'' the characters of the string into
       
   769 the values.
       
   770 \label{Sulz}}
       
   771 \end{figure} 
       
   772 
       
   773   \begin{center}
       
   774   \begin{tabular}{lcl}
       
   775   @{thm (lhs) mkeps.simps(1)} & $\dn$ & @{thm (rhs) mkeps.simps(1)}\\
       
   776   @{thm (lhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]}\\
       
   777   @{thm (lhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]}\\
       
   778   @{thm (lhs) mkeps.simps(4)} & $\dn$ & @{thm (rhs) mkeps.simps(4)}\\
       
   779   \end{tabular}
       
   780   \end{center}
       
   781 
       
   782   \noindent Note that this function needs only to be partially defined,
       
   783   namely only for regular expressions that are nullable. In case @{const
       
   784   nullable} fails, the string @{term "[a,b,c]"} cannot be matched by @{term
       
   785   "r\<^sub>1"} and the null value @{term "None"} is returned. Note also how this function
       
   786   makes some subtle choices leading to a POSIX value: for example if an
       
   787   alternative regular expression, say @{term "ALT r\<^sub>1 r\<^sub>2"}, can
       
   788   match the empty string and furthermore @{term "r\<^sub>1"} can match the
       
   789   empty string, then we return a \<open>Left\<close>-value. The \<open>Right\<close>-value will only be returned if @{term "r\<^sub>1"} cannot match the empty
       
   790   string.
       
   791 
       
   792   The most interesting idea from Sulzmann and Lu \cite{Sulzmann2014} is
       
   793   the construction of a value for how @{term "r\<^sub>1"} can match the
       
   794   string @{term "[a,b,c]"} from the value how the last derivative, @{term
       
   795   "r\<^sub>4"} in Fig.~\ref{Sulz}, can match the empty string. Sulzmann and
       
   796   Lu achieve this by stepwise ``injecting back'' the characters into the
       
   797   values thus inverting the operation of building derivatives, but on the level
       
   798   of values. The corresponding function, called @{term inj}, takes three
       
   799   arguments, a regular expression, a character and a value. For example in
       
   800   the first (or right-most) @{term inj}-step in Fig.~\ref{Sulz} the regular
       
   801   expression @{term "r\<^sub>3"}, the character @{term c} from the last
       
   802   derivative step and @{term "v\<^sub>4"}, which is the value corresponding
       
   803   to the derivative regular expression @{term "r\<^sub>4"}. The result is
       
   804   the new value @{term "v\<^sub>3"}. The final result of the algorithm is
       
   805   the value @{term "v\<^sub>1"}. The @{term inj} function is defined by recursion on regular
       
   806   expressions and by analysing the shape of values (corresponding to 
       
   807   the derivative regular expressions).
       
   808   %
       
   809   \begin{center}
       
   810   \begin{tabular}{l@ {\hspace{5mm}}lcl}
       
   811   \textit{(1)} & @{thm (lhs) injval.simps(1)} & $\dn$ & @{thm (rhs) injval.simps(1)}\\
       
   812   \textit{(2)} & @{thm (lhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]} & $\dn$ & 
       
   813       @{thm (rhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]}\\
       
   814   \textit{(3)} & @{thm (lhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$ & 
       
   815       @{thm (rhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\
       
   816   \textit{(4)} & @{thm (lhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$ 
       
   817       & @{thm (rhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\
       
   818   \textit{(5)} & @{thm (lhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$ 
       
   819       & @{thm (rhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\
       
   820   \textit{(6)} & @{thm (lhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$ 
       
   821       & @{thm (rhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\
       
   822   \textit{(7)} & @{thm (lhs) injval.simps(7)[of "r" "c" "v" "vs"]} & $\dn$ 
       
   823       & @{thm (rhs) injval.simps(7)[of "r" "c" "v" "vs"]}\\
       
   824   \end{tabular}
       
   825   \end{center}
       
   826 
       
   827   \noindent To better understand what is going on in this definition it
       
   828   might be instructive to look first at the three sequence cases (clauses
       
   829   \textit{(4)} -- \textit{(6)}). In each case we need to construct an ``injected value'' for
       
   830   @{term "SEQ r\<^sub>1 r\<^sub>2"}. This must be a value of the form @{term
       
   831   "Seq DUMMY DUMMY"}\,. Recall the clause of the \<open>derivative\<close>-function
       
   832   for sequence regular expressions:
       
   833 
       
   834   \begin{center}
       
   835   @{thm (lhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]} $\dn$ @{thm (rhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]}
       
   836   \end{center}
       
   837 
       
   838   \noindent Consider first the \<open>else\<close>-branch where the derivative is @{term
       
   839   "SEQ (der c r\<^sub>1) r\<^sub>2"}. The corresponding value must therefore
       
   840   be of the form @{term "Seq v\<^sub>1 v\<^sub>2"}, which matches the left-hand
       
   841   side in clause~\textit{(4)} of @{term inj}. In the \<open>if\<close>-branch the derivative is an
       
   842   alternative, namely @{term "ALT (SEQ (der c r\<^sub>1) r\<^sub>2) (der c
       
   843   r\<^sub>2)"}. This means we either have to consider a \<open>Left\<close>- or
       
   844   \<open>Right\<close>-value. In case of the \<open>Left\<close>-value we know further it
       
   845   must be a value for a sequence regular expression. Therefore the pattern
       
   846   we match in the clause \textit{(5)} is @{term "Left (Seq v\<^sub>1 v\<^sub>2)"},
       
   847   while in \textit{(6)} it is just @{term "Right v\<^sub>2"}. One more interesting
       
   848   point is in the right-hand side of clause \textit{(6)}: since in this case the
       
   849   regular expression \<open>r\<^sub>1\<close> does not ``contribute'' to
       
   850   matching the string, meaning it only matches the empty string, we need to
       
   851   call @{const mkeps} in order to construct a value for how @{term "r\<^sub>1"}
       
   852   can match this empty string. A similar argument applies for why we can
       
   853   expect in the left-hand side of clause \textit{(7)} that the value is of the form
       
   854   @{term "Seq v (Stars vs)"}---the derivative of a star is @{term "SEQ (der c r)
       
   855   (STAR r)"}. Finally, the reason for why we can ignore the second argument
       
   856   in clause \textit{(1)} of @{term inj} is that it will only ever be called in cases
       
   857   where @{term "c=d"}, but the usual linearity restrictions in patterns do
       
   858   not allow us to build this constraint explicitly into our function
       
   859   definition.\footnote{Sulzmann and Lu state this clause as @{thm (lhs)
       
   860   injval.simps(1)[of "c" "c"]} $\dn$ @{thm (rhs) injval.simps(1)[of "c"]},
       
   861   but our deviation is harmless.}
       
   862 
       
   863   The idea of the @{term inj}-function to ``inject'' a character, say
       
   864   @{term c}, into a value can be made precise by the first part of the
       
   865   following lemma, which shows that the underlying string of an injected
       
   866   value has a prepended character @{term c}; the second part shows that
       
   867   the underlying string of an @{const mkeps}-value is always the empty
       
   868   string (given the regular expression is nullable since otherwise
       
   869   \<open>mkeps\<close> might not be defined).
       
   870 
       
   871   \begin{lemma}\mbox{}\smallskip\\\label{Prf_injval_flat}
       
   872   \begin{tabular}{ll}
       
   873   (1) & @{thm[mode=IfThen] Prf_injval_flat}\\
       
   874   (2) & @{thm[mode=IfThen] mkeps_flat}
       
   875   \end{tabular}
       
   876   \end{lemma}
       
   877 
       
   878   \begin{proof}
       
   879   Both properties are by routine inductions: the first one can, for example,
       
   880   be proved by induction over the definition of @{term derivatives}; the second by
       
   881   an induction on @{term r}. There are no interesting cases.\qed
       
   882   \end{proof}
       
   883 
       
   884   Having defined the @{const mkeps} and \<open>inj\<close> function we can extend
       
   885   \Brz's matcher so that a value is constructed (assuming the
       
   886   regular expression matches the string). The clauses of the Sulzmann and Lu lexer are
       
   887 
       
   888   \begin{center}
       
   889   \begin{tabular}{lcl}
       
   890   @{thm (lhs) lexer.simps(1)} & $\dn$ & @{thm (rhs) lexer.simps(1)}\\
       
   891   @{thm (lhs) lexer.simps(2)} & $\dn$ & \<open>case\<close> @{term "lexer (der c r) s"} \<open>of\<close>\\
       
   892                      & & \phantom{$|$} @{term "None"}  \<open>\<Rightarrow>\<close> @{term None}\\
       
   893                      & & $|$ @{term "Some v"} \<open>\<Rightarrow>\<close> @{term "Some (injval r c v)"}                          
       
   894   \end{tabular}
       
   895   \end{center}
       
   896 
       
   897   \noindent If the regular expression does not match the string, @{const None} is
       
   898   returned. If the regular expression \emph{does}
       
   899   match the string, then @{const Some} value is returned. One important
       
   900   virtue of this algorithm is that it can be implemented with ease in any
       
   901   functional programming language and also in Isabelle/HOL. In the remaining
       
   902   part of this section we prove that this algorithm is correct.
       
   903 
       
   904   The well-known idea of POSIX matching is informally defined by some
       
   905   rules such as the Longest Match and Priority Rules (see
       
   906   Introduction); as correctly argued in \cite{Sulzmann2014}, this
       
   907   needs formal specification. Sulzmann and Lu define an ``ordering
       
   908   relation'' between values and argue that there is a maximum value,
       
   909   as given by the derivative-based algorithm.  In contrast, we shall
       
   910   introduce a simple inductive definition that specifies directly what
       
   911   a \emph{POSIX value} is, incorporating the POSIX-specific choices
       
   912   into the side-conditions of our rules. Our definition is inspired by
       
   913   the matching relation given by Vansummeren~\cite{Vansummeren2006}. 
       
   914   The relation we define is ternary and
       
   915   written as \mbox{@{term "s \<in> r \<rightarrow> v"}}, relating
       
   916   strings, regular expressions and values; the inductive rules are given in 
       
   917   Figure~\ref{POSIXrules}.
       
   918   We can prove that given a string @{term s} and regular expression @{term
       
   919    r}, the POSIX value @{term v} is uniquely determined by @{term "s \<in> r \<rightarrow> v"}.
       
   920 
       
   921   %
       
   922   \begin{figure}[t]
       
   923   \begin{center}
       
   924   \begin{tabular}{c}
       
   925   @{thm[mode=Axiom] Posix.intros(1)}\<open>P\<close>@{term "ONE"} \qquad
       
   926   @{thm[mode=Axiom] Posix.intros(2)}\<open>P\<close>@{term "c"}\medskip\\
       
   927   @{thm[mode=Rule] Posix.intros(3)[of "s" "r\<^sub>1" "v" "r\<^sub>2"]}\<open>P+L\<close>\qquad
       
   928   @{thm[mode=Rule] Posix.intros(4)[of "s" "r\<^sub>2" "v" "r\<^sub>1"]}\<open>P+R\<close>\medskip\\
       
   929   $\mprset{flushleft}
       
   930    \inferrule
       
   931    {@{thm (prem 1) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \qquad
       
   932     @{thm (prem 2) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \\\\
       
   933     @{thm (prem 3) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}
       
   934    {@{thm (concl) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}$\<open>PS\<close>\\
       
   935   @{thm[mode=Axiom] Posix.intros(7)}\<open>P[]\<close>\medskip\\
       
   936   $\mprset{flushleft}
       
   937    \inferrule
       
   938    {@{thm (prem 1) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
       
   939     @{thm (prem 2) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
       
   940     @{thm (prem 3) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \\\\
       
   941     @{thm (prem 4) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}
       
   942    {@{thm (concl) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}$\<open>P\<star>\<close>
       
   943   \end{tabular}
       
   944   \end{center}
       
   945   \caption{Our inductive definition of POSIX values.}\label{POSIXrules}
       
   946   \end{figure}
       
   947 
       
   948    
       
   949 
       
   950   \begin{theorem}\mbox{}\smallskip\\\label{posixdeterm}
       
   951   \begin{tabular}{ll}
       
   952   (1) & If @{thm (prem 1) Posix1(1)} then @{thm (concl)
       
   953   Posix1(1)} and @{thm (concl) Posix1(2)}.\\
       
   954   (2) & @{thm[mode=IfThen] Posix_determ(1)[of _ _ "v" "v'"]}
       
   955   \end{tabular}
       
   956   \end{theorem}
       
   957 
       
   958   \begin{proof} Both by induction on the definition of @{term "s \<in> r \<rightarrow> v"}. 
       
   959   The second part follows by a case analysis of @{term "s \<in> r \<rightarrow> v'"} and
       
   960   the first part.\qed
       
   961   \end{proof}
       
   962 
       
   963   \noindent
       
   964   We claim that our @{term "s \<in> r \<rightarrow> v"} relation captures the idea behind the four
       
   965   informal POSIX rules shown in the Introduction: Consider for example the
       
   966   rules \<open>P+L\<close> and \<open>P+R\<close> where the POSIX value for a string
       
   967   and an alternative regular expression, that is @{term "(s, ALT r\<^sub>1 r\<^sub>2)"},
       
   968   is specified---it is always a \<open>Left\<close>-value, \emph{except} when the
       
   969   string to be matched is not in the language of @{term "r\<^sub>1"}; only then it
       
   970   is a \<open>Right\<close>-value (see the side-condition in \<open>P+R\<close>).
       
   971   Interesting is also the rule for sequence regular expressions (\<open>PS\<close>). The first two premises state that @{term "v\<^sub>1"} and @{term "v\<^sub>2"}
       
   972   are the POSIX values for @{term "(s\<^sub>1, r\<^sub>1)"} and @{term "(s\<^sub>2, r\<^sub>2)"}
       
   973   respectively. Consider now the third premise and note that the POSIX value
       
   974   of this rule should match the string \mbox{@{term "s\<^sub>1 @ s\<^sub>2"}}. According to the
       
   975   Longest Match Rule, we want that the @{term "s\<^sub>1"} is the longest initial
       
   976   split of \mbox{@{term "s\<^sub>1 @ s\<^sub>2"}} such that @{term "s\<^sub>2"} is still recognised
       
   977   by @{term "r\<^sub>2"}. Let us assume, contrary to the third premise, that there
       
   978   \emph{exist} an @{term "s\<^sub>3"} and @{term "s\<^sub>4"} such that @{term "s\<^sub>2"}
       
   979   can be split up into a non-empty string @{term "s\<^sub>3"} and a possibly empty
       
   980   string @{term "s\<^sub>4"}. Moreover the longer string @{term "s\<^sub>1 @ s\<^sub>3"} can be
       
   981   matched by \<open>r\<^sub>1\<close> and the shorter @{term "s\<^sub>4"} can still be
       
   982   matched by @{term "r\<^sub>2"}. In this case @{term "s\<^sub>1"} would \emph{not} be the
       
   983   longest initial split of \mbox{@{term "s\<^sub>1 @ s\<^sub>2"}} and therefore @{term "Seq v\<^sub>1
       
   984   v\<^sub>2"} cannot be a POSIX value for @{term "(s\<^sub>1 @ s\<^sub>2, SEQ r\<^sub>1 r\<^sub>2)"}. 
       
   985   The main point is that our side-condition ensures the Longest 
       
   986   Match Rule is satisfied.
       
   987 
       
   988   A similar condition is imposed on the POSIX value in the \<open>P\<star>\<close>-rule. Also there we want that @{term "s\<^sub>1"} is the longest initial
       
   989   split of @{term "s\<^sub>1 @ s\<^sub>2"} and furthermore the corresponding value
       
   990   @{term v} cannot be flattened to the empty string. In effect, we require
       
   991   that in each ``iteration'' of the star, some non-empty substring needs to
       
   992   be ``chipped'' away; only in case of the empty string we accept @{term
       
   993   "Stars []"} as the POSIX value. Indeed we can show that our POSIX values
       
   994   are lexical values which exclude those \<open>Stars\<close> that contain subvalues 
       
   995   that flatten to the empty string.
       
   996 
       
   997   \begin{lemma}\label{LVposix}
       
   998   @{thm [mode=IfThen] Posix_LV}
       
   999   \end{lemma}
       
  1000 
       
  1001   \begin{proof}
       
  1002   By routine induction on @{thm (prem 1) Posix_LV}.\qed 
       
  1003   \end{proof}
       
  1004 
       
  1005   \noindent
       
  1006   Next is the lemma that shows the function @{term "mkeps"} calculates
       
  1007   the POSIX value for the empty string and a nullable regular expression.
       
  1008 
       
  1009   \begin{lemma}\label{lemmkeps}
       
  1010   @{thm[mode=IfThen] Posix_mkeps}
       
  1011   \end{lemma}
       
  1012 
       
  1013   \begin{proof}
       
  1014   By routine induction on @{term r}.\qed 
       
  1015   \end{proof}
       
  1016 
       
  1017   \noindent
       
  1018   The central lemma for our POSIX relation is that the \<open>inj\<close>-function
       
  1019   preserves POSIX values.
       
  1020 
       
  1021   \begin{lemma}\label{Posix2}
       
  1022   @{thm[mode=IfThen] Posix_injval}
       
  1023   \end{lemma}
       
  1024 
       
  1025   \begin{proof}
       
  1026   By induction on \<open>r\<close>. We explain two cases.
       
  1027 
       
  1028   \begin{itemize}
       
  1029   \item[$\bullet$] Case @{term "r = ALT r\<^sub>1 r\<^sub>2"}. There are
       
  1030   two subcases, namely \<open>(a)\<close> \mbox{@{term "v = Left v'"}} and @{term
       
  1031   "s \<in> der c r\<^sub>1 \<rightarrow> v'"}; and \<open>(b)\<close> @{term "v = Right v'"}, @{term
       
  1032   "s \<notin> L (der c r\<^sub>1)"} and @{term "s \<in> der c r\<^sub>2 \<rightarrow> v'"}. In \<open>(a)\<close> we
       
  1033   know @{term "s \<in> der c r\<^sub>1 \<rightarrow> v'"}, from which we can infer @{term "(c # s)
       
  1034   \<in> r\<^sub>1 \<rightarrow> injval r\<^sub>1 c v'"} by induction hypothesis and hence @{term "(c #
       
  1035   s) \<in> ALT r\<^sub>1 r\<^sub>2 \<rightarrow> injval (ALT r\<^sub>1 r\<^sub>2) c (Left v')"} as needed. Similarly
       
  1036   in subcase \<open>(b)\<close> where, however, in addition we have to use
       
  1037   Proposition~\ref{derprop}(2) in order to infer @{term "c # s \<notin> L r\<^sub>1"} from @{term
       
  1038   "s \<notin> L (der c r\<^sub>1)"}.\smallskip
       
  1039 
       
  1040   \item[$\bullet$] Case @{term "r = SEQ r\<^sub>1 r\<^sub>2"}. There are three subcases:
       
  1041   
       
  1042   \begin{quote}
       
  1043   \begin{description}
       
  1044   \item[\<open>(a)\<close>] @{term "v = Left (Seq v\<^sub>1 v\<^sub>2)"} and @{term "nullable r\<^sub>1"} 
       
  1045   \item[\<open>(b)\<close>] @{term "v = Right v\<^sub>1"} and @{term "nullable r\<^sub>1"} 
       
  1046   \item[\<open>(c)\<close>] @{term "v = Seq v\<^sub>1 v\<^sub>2"} and @{term "\<not> nullable r\<^sub>1"} 
       
  1047   \end{description}
       
  1048   \end{quote}
       
  1049 
       
  1050   \noindent For \<open>(a)\<close> we know @{term "s\<^sub>1 \<in> der c r\<^sub>1 \<rightarrow> v\<^sub>1"} and
       
  1051   @{term "s\<^sub>2 \<in> r\<^sub>2 \<rightarrow> v\<^sub>2"} as well as
       
  1052   %
       
  1053   \[@{term "\<not> (\<exists>s\<^sub>3 s\<^sub>4. s\<^sub>3 \<noteq> [] \<and> s\<^sub>3 @ s\<^sub>4 = s\<^sub>2 \<and> s\<^sub>1 @ s\<^sub>3 \<in> L (der c r\<^sub>1) \<and> s\<^sub>4 \<in> L r\<^sub>2)"}\]
       
  1054 
       
  1055   \noindent From the latter we can infer by Proposition~\ref{derprop}(2):
       
  1056   %
       
  1057   \[@{term "\<not> (\<exists>s\<^sub>3 s\<^sub>4. s\<^sub>3 \<noteq> [] \<and> s\<^sub>3 @ s\<^sub>4 = s\<^sub>2 \<and> (c # s\<^sub>1) @ s\<^sub>3 \<in> L r\<^sub>1 \<and> s\<^sub>4 \<in> L r\<^sub>2)"}\]
       
  1058 
       
  1059   \noindent We can use the induction hypothesis for \<open>r\<^sub>1\<close> to obtain
       
  1060   @{term "(c # s\<^sub>1) \<in> r\<^sub>1 \<rightarrow> injval r\<^sub>1 c v\<^sub>1"}. Putting this all together allows us to infer
       
  1061   @{term "((c # s\<^sub>1) @ s\<^sub>2) \<in> SEQ r\<^sub>1 r\<^sub>2 \<rightarrow> Seq (injval r\<^sub>1 c v\<^sub>1) v\<^sub>2"}. The case \<open>(c)\<close>
       
  1062   is similar.
       
  1063 
       
  1064   For \<open>(b)\<close> we know @{term "s \<in> der c r\<^sub>2 \<rightarrow> v\<^sub>1"} and 
       
  1065   @{term "s\<^sub>1 @ s\<^sub>2 \<notin> L (SEQ (der c r\<^sub>1) r\<^sub>2)"}. From the former
       
  1066   we have @{term "(c # s) \<in> r\<^sub>2 \<rightarrow> (injval r\<^sub>2 c v\<^sub>1)"} by induction hypothesis
       
  1067   for @{term "r\<^sub>2"}. From the latter we can infer
       
  1068   %
       
  1069   \[@{term "\<not> (\<exists>s\<^sub>3 s\<^sub>4. s\<^sub>3 \<noteq> [] \<and> s\<^sub>3 @ s\<^sub>4 = c # s \<and> s\<^sub>3 \<in> L r\<^sub>1 \<and> s\<^sub>4 \<in> L r\<^sub>2)"}\]
       
  1070 
       
  1071   \noindent By Lemma~\ref{lemmkeps} we know @{term "[] \<in> r\<^sub>1 \<rightarrow> (mkeps r\<^sub>1)"}
       
  1072   holds. Putting this all together, we can conclude with @{term "(c #
       
  1073   s) \<in> SEQ r\<^sub>1 r\<^sub>2 \<rightarrow> Seq (mkeps r\<^sub>1) (injval r\<^sub>2 c v\<^sub>1)"}, as required.
       
  1074 
       
  1075   Finally suppose @{term "r = STAR r\<^sub>1"}. This case is very similar to the
       
  1076   sequence case, except that we need to also ensure that @{term "flat (injval r\<^sub>1
       
  1077   c v\<^sub>1) \<noteq> []"}. This follows from @{term "(c # s\<^sub>1)
       
  1078   \<in> r\<^sub>1 \<rightarrow> injval r\<^sub>1 c v\<^sub>1"}  (which in turn follows from @{term "s\<^sub>1 \<in> der c
       
  1079   r\<^sub>1 \<rightarrow> v\<^sub>1"} and the induction hypothesis).\qed
       
  1080   \end{itemize}
       
  1081   \end{proof}
       
  1082 
       
  1083   \noindent
       
  1084   With Lemma~\ref{Posix2} in place, it is completely routine to establish
       
  1085   that the Sulzmann and Lu lexer satisfies our specification (returning
       
  1086   the null value @{term "None"} iff the string is not in the language of the regular expression,
       
  1087   and returning a unique POSIX value iff the string \emph{is} in the language):
       
  1088 
       
  1089   \begin{theorem}\mbox{}\smallskip\\\label{lexercorrect}
       
  1090   \begin{tabular}{ll}
       
  1091   (1) & @{thm (lhs) lexer_correct_None} if and only if @{thm (rhs) lexer_correct_None}\\
       
  1092   (2) & @{thm (lhs) lexer_correct_Some} if and only if @{thm (rhs) lexer_correct_Some}\\
       
  1093   \end{tabular}
       
  1094   \end{theorem}
       
  1095 
       
  1096   \begin{proof}
       
  1097   By induction on @{term s} using Lemma~\ref{lemmkeps} and \ref{Posix2}.\qed  
       
  1098   \end{proof}
       
  1099 
       
  1100   \noindent In \textit{(2)} we further know by Theorem~\ref{posixdeterm} that the
       
  1101   value returned by the lexer must be unique.   A simple corollary 
       
  1102   of our two theorems is:
       
  1103 
       
  1104   \begin{corollary}\mbox{}\smallskip\\\label{lexercorrectcor}
       
  1105   \begin{tabular}{ll}
       
  1106   (1) & @{thm (lhs) lexer_correctness(2)} if and only if @{thm (rhs) lexer_correctness(2)}\\ 
       
  1107   (2) & @{thm (lhs) lexer_correctness(1)} if and only if @{thm (rhs) lexer_correctness(1)}\\
       
  1108   \end{tabular}
       
  1109   \end{corollary}
       
  1110 
       
  1111   \noindent This concludes our correctness proof. Note that we have
       
  1112   not changed the algorithm of Sulzmann and Lu,\footnote{All
       
  1113   deviations we introduced are harmless.} but introduced our own
       
  1114   specification for what a correct result---a POSIX value---should be.
       
  1115   In the next section we show that our specification coincides with
       
  1116   another one given by Okui and Suzuki using a different technique.
       
  1117 
       
  1118 \<close>
       
  1119 
       
  1120 section \<open>Ordering of Values according to Okui and Suzuki\<close>
       
  1121 
       
  1122 text \<open>
       
  1123   
       
  1124   While in the previous section we have defined POSIX values directly
       
  1125   in terms of a ternary relation (see inference rules in Figure~\ref{POSIXrules}),
       
  1126   Sulzmann and Lu took a different approach in \cite{Sulzmann2014}:
       
  1127   they introduced an ordering for values and identified POSIX values
       
  1128   as the maximal elements.  An extended version of \cite{Sulzmann2014}
       
  1129   is available at the website of its first author; this includes more
       
  1130   details of their proofs, which are evidently not in final form
       
  1131   yet. Unfortunately, we were not able to verify claims that their
       
  1132   ordering has properties such as being transitive or having maximal
       
  1133   elements. 
       
  1134  
       
  1135   Okui and Suzuki \cite{OkuiSuzuki2010,OkuiSuzukiTech} described
       
  1136   another ordering of values, which they use to establish the
       
  1137   correctness of their automata-based algorithm for POSIX matching.
       
  1138   Their ordering resembles some aspects of the one given by Sulzmann
       
  1139   and Lu, but overall is quite different. To begin with, Okui and
       
  1140   Suzuki identify POSIX values as minimal, rather than maximal,
       
  1141   elements in their ordering. A more substantial difference is that
       
  1142   the ordering by Okui and Suzuki uses \emph{positions} in order to
       
  1143   identify and compare subvalues. Positions are lists of natural
       
  1144   numbers. This allows them to quite naturally formalise the Longest
       
  1145   Match and Priority rules of the informal POSIX standard.  Consider
       
  1146   for example the value @{term v}
       
  1147 
       
  1148   \begin{center}
       
  1149   @{term "v == Stars [Seq (Char x) (Char y), Char z]"}
       
  1150   \end{center}
       
  1151 
       
  1152   \noindent
       
  1153   At position \<open>[0,1]\<close> of this value is the
       
  1154   subvalue \<open>Char y\<close> and at position \<open>[1]\<close> the
       
  1155   subvalue @{term "Char z"}.  At the `root' position, or empty list
       
  1156   @{term "[]"}, is the whole value @{term v}. Positions such as \<open>[0,1,0]\<close> or \<open>[2]\<close> are outside of \<open>v\<close>. If it exists, the subvalue of @{term v} at a position \<open>p\<close>, written @{term "at v p"}, can be recursively defined by
       
  1157   
       
  1158   \begin{center}
       
  1159   \begin{tabular}{r@ {\hspace{0mm}}lcl}
       
  1160   @{term v} &  \<open>\<downharpoonleft>\<^bsub>[]\<^esub>\<close> & \<open>\<equiv>\<close>& @{thm (rhs) at.simps(1)}\\
       
  1161   @{term "Left v"} & \<open>\<downharpoonleft>\<^bsub>0::ps\<^esub>\<close> & \<open>\<equiv>\<close>& @{thm (rhs) at.simps(2)}\\
       
  1162   @{term "Right v"} & \<open>\<downharpoonleft>\<^bsub>1::ps\<^esub>\<close> & \<open>\<equiv>\<close> & 
       
  1163   @{thm (rhs) at.simps(3)[simplified Suc_0_fold]}\\
       
  1164   @{term "Seq v\<^sub>1 v\<^sub>2"} & \<open>\<downharpoonleft>\<^bsub>0::ps\<^esub>\<close> & \<open>\<equiv>\<close> & 
       
  1165   @{thm (rhs) at.simps(4)[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]} \\
       
  1166   @{term "Seq v\<^sub>1 v\<^sub>2"} & \<open>\<downharpoonleft>\<^bsub>1::ps\<^esub>\<close>
       
  1167   & \<open>\<equiv>\<close> & 
       
  1168   @{thm (rhs) at.simps(5)[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2", simplified Suc_0_fold]} \\
       
  1169   @{term "Stars vs"} & \<open>\<downharpoonleft>\<^bsub>n::ps\<^esub>\<close> & \<open>\<equiv>\<close>& @{thm (rhs) at.simps(6)}\\
       
  1170   \end{tabular} 
       
  1171   \end{center}
       
  1172 
       
  1173   \noindent In the last clause we use Isabelle's notation @{term "vs ! n"} for the
       
  1174   \<open>n\<close>th element in a list.  The set of positions inside a value \<open>v\<close>,
       
  1175   written @{term "Pos v"}, is given by 
       
  1176 
       
  1177   \begin{center}
       
  1178   \begin{tabular}{lcl}
       
  1179   @{thm (lhs) Pos.simps(1)} & \<open>\<equiv>\<close> & @{thm (rhs) Pos.simps(1)}\\
       
  1180   @{thm (lhs) Pos.simps(2)} & \<open>\<equiv>\<close> & @{thm (rhs) Pos.simps(2)}\\
       
  1181   @{thm (lhs) Pos.simps(3)} & \<open>\<equiv>\<close> & @{thm (rhs) Pos.simps(3)}\\
       
  1182   @{thm (lhs) Pos.simps(4)} & \<open>\<equiv>\<close> & @{thm (rhs) Pos.simps(4)}\\
       
  1183   @{thm (lhs) Pos.simps(5)[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]}
       
  1184   & \<open>\<equiv>\<close> 
       
  1185   & @{thm (rhs) Pos.simps(5)[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]}\\
       
  1186   @{thm (lhs) Pos_stars} & \<open>\<equiv>\<close> & @{thm (rhs) Pos_stars}\\
       
  1187   \end{tabular}
       
  1188   \end{center}
       
  1189 
       
  1190   \noindent 
       
  1191   whereby \<open>len\<close> in the last clause stands for the length of a list. Clearly
       
  1192   for every position inside a value there exists a subvalue at that position.
       
  1193  
       
  1194 
       
  1195   To aid understanding of the ordering of Okui and Suzuki, consider again 
       
  1196   the earlier value
       
  1197   \<open>v\<close> and compare it with the following \<open>w\<close>:
       
  1198 
       
  1199   \begin{center}
       
  1200   \begin{tabular}{l}
       
  1201   @{term "v == Stars [Seq (Char x) (Char y), Char z]"}\\
       
  1202   @{term "w == Stars [Char x, Char y, Char z]"}  
       
  1203   \end{tabular}
       
  1204   \end{center}
       
  1205 
       
  1206   \noindent Both values match the string \<open>xyz\<close>, that means if
       
  1207   we flatten these values at their respective root position, we obtain
       
  1208   \<open>xyz\<close>. However, at position \<open>[0]\<close>, \<open>v\<close> matches
       
  1209   \<open>xy\<close> whereas \<open>w\<close> matches only the shorter \<open>x\<close>. So
       
  1210   according to the Longest Match Rule, we should prefer \<open>v\<close>,
       
  1211   rather than \<open>w\<close> as POSIX value for string \<open>xyz\<close> (and
       
  1212   corresponding regular expression). In order to
       
  1213   formalise this idea, Okui and Suzuki introduce a measure for
       
  1214   subvalues at position \<open>p\<close>, called the \emph{norm} of \<open>v\<close>
       
  1215   at position \<open>p\<close>. We can define this measure in Isabelle as an
       
  1216   integer as follows
       
  1217   
       
  1218   \begin{center}
       
  1219   @{thm pflat_len_def}
       
  1220   \end{center}
       
  1221 
       
  1222   \noindent where we take the length of the flattened value at
       
  1223   position \<open>p\<close>, provided the position is inside \<open>v\<close>; if
       
  1224   not, then the norm is \<open>-1\<close>. The default for outside
       
  1225   positions is crucial for the POSIX requirement of preferring a
       
  1226   \<open>Left\<close>-value over a \<open>Right\<close>-value (if they can match the
       
  1227   same string---see the Priority Rule from the Introduction). For this
       
  1228   consider
       
  1229 
       
  1230   \begin{center}
       
  1231   @{term "v == Left (Char x)"} \qquad and \qquad @{term "w == Right (Char x)"}
       
  1232   \end{center}
       
  1233 
       
  1234   \noindent Both values match \<open>x\<close>. At position \<open>[0]\<close>
       
  1235   the norm of @{term v} is \<open>1\<close> (the subvalue matches \<open>x\<close>),
       
  1236   but the norm of \<open>w\<close> is \<open>-1\<close> (the position is outside
       
  1237   \<open>w\<close> according to how we defined the `inside' positions of
       
  1238   \<open>Left\<close>- and \<open>Right\<close>-values).  Of course at position
       
  1239   \<open>[1]\<close>, the norms @{term "pflat_len v [1]"} and @{term
       
  1240   "pflat_len w [1]"} are reversed, but the point is that subvalues
       
  1241   will be analysed according to lexicographically ordered
       
  1242   positions. According to this ordering, the position \<open>[0]\<close>
       
  1243   takes precedence over \<open>[1]\<close> and thus also \<open>v\<close> will be 
       
  1244   preferred over \<open>w\<close>.  The lexicographic ordering of positions, written
       
  1245   @{term "DUMMY \<sqsubset>lex DUMMY"}, can be conveniently formalised
       
  1246   by three inference rules
       
  1247 
       
  1248   \begin{center}
       
  1249   \begin{tabular}{ccc}
       
  1250   @{thm [mode=Axiom] lex_list.intros(1)}\hspace{1cm} &
       
  1251   @{thm [mode=Rule] lex_list.intros(3)[where ?p1.0="p\<^sub>1" and ?p2.0="p\<^sub>2" and
       
  1252                                             ?ps1.0="ps\<^sub>1" and ?ps2.0="ps\<^sub>2"]}\hspace{1cm} &
       
  1253   @{thm [mode=Rule] lex_list.intros(2)[where ?ps1.0="ps\<^sub>1" and ?ps2.0="ps\<^sub>2"]}
       
  1254   \end{tabular}
       
  1255   \end{center}
       
  1256 
       
  1257   With the norm and lexicographic order in place,
       
  1258   we can state the key definition of Okui and Suzuki
       
  1259   \cite{OkuiSuzuki2010}: a value @{term "v\<^sub>1"} is \emph{smaller at position \<open>p\<close>} than
       
  1260   @{term "v\<^sub>2"}, written @{term "v\<^sub>1 \<sqsubset>val p v\<^sub>2"}, 
       
  1261   if and only if  $(i)$ the norm at position \<open>p\<close> is
       
  1262   greater in @{term "v\<^sub>1"} (that is the string @{term "flat (at v\<^sub>1 p)"} is longer 
       
  1263   than @{term "flat (at v\<^sub>2 p)"}) and $(ii)$ for all subvalues at
       
  1264   positions that are inside @{term "v\<^sub>1"} or @{term "v\<^sub>2"} and that are
       
  1265   lexicographically smaller than \<open>p\<close>, we have the same norm, namely
       
  1266 
       
  1267  \begin{center}
       
  1268  \begin{tabular}{c}
       
  1269  @{thm (lhs) PosOrd_def[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]} 
       
  1270  \<open>\<equiv>\<close> 
       
  1271  $\begin{cases}
       
  1272  (i) & @{term "pflat_len v\<^sub>1 p > pflat_len v\<^sub>2 p"}   \quad\text{and}\smallskip \\
       
  1273  (ii) & @{term "(\<forall>q \<in> Pos v\<^sub>1 \<union> Pos v\<^sub>2. q \<sqsubset>lex p --> pflat_len v\<^sub>1 q = pflat_len v\<^sub>2 q)"}
       
  1274  \end{cases}$
       
  1275  \end{tabular}
       
  1276  \end{center}
       
  1277 
       
  1278  \noindent The position \<open>p\<close> in this definition acts as the
       
  1279   \emph{first distinct position} of \<open>v\<^sub>1\<close> and \<open>v\<^sub>2\<close>, where both values match strings of different length
       
  1280   \cite{OkuiSuzuki2010}.  Since at \<open>p\<close> the values \<open>v\<^sub>1\<close> and \<open>v\<^sub>2\<close> match different strings, the
       
  1281   ordering is irreflexive. Derived from the definition above
       
  1282   are the following two orderings:
       
  1283   
       
  1284   \begin{center}
       
  1285   \begin{tabular}{l}
       
  1286   @{thm PosOrd_ex_def[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]}\\
       
  1287   @{thm PosOrd_ex_eq_def[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]}
       
  1288   \end{tabular}
       
  1289   \end{center}
       
  1290 
       
  1291  While we encountered a number of obstacles for establishing properties like
       
  1292  transitivity for the ordering of Sulzmann and Lu (and which we failed
       
  1293  to overcome), it is relatively straightforward to establish this
       
  1294  property for the orderings
       
  1295  @{term "DUMMY :\<sqsubset>val DUMMY"} and @{term "DUMMY :\<sqsubseteq>val DUMMY"}  
       
  1296  by Okui and Suzuki.
       
  1297 
       
  1298  \begin{lemma}[Transitivity]\label{transitivity}
       
  1299  @{thm [mode=IfThen] PosOrd_trans[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2" and ?v3.0="v\<^sub>3"]} 
       
  1300  \end{lemma}
       
  1301 
       
  1302  \begin{proof} From the assumption we obtain two positions \<open>p\<close>
       
  1303  and \<open>q\<close>, where the values \<open>v\<^sub>1\<close> and \<open>v\<^sub>2\<close> (respectively \<open>v\<^sub>2\<close> and \<open>v\<^sub>3\<close>) are `distinct'.  Since \<open>\<prec>\<^bsub>lex\<^esub>\<close> is trichotomous, we need to consider
       
  1304  three cases, namely @{term "p = q"}, @{term "p \<sqsubset>lex q"} and
       
  1305  @{term "q \<sqsubset>lex p"}. Let us look at the first case.  Clearly
       
  1306  @{term "pflat_len v\<^sub>2 p < pflat_len v\<^sub>1 p"} and @{term
       
  1307  "pflat_len v\<^sub>3 p < pflat_len v\<^sub>2 p"} imply @{term
       
  1308  "pflat_len v\<^sub>3 p < pflat_len v\<^sub>1 p"}.  It remains to show
       
  1309  that for a @{term "p' \<in> Pos v\<^sub>1 \<union> Pos v\<^sub>3"}
       
  1310  with @{term "p' \<sqsubset>lex p"} that @{term "pflat_len v\<^sub>1
       
  1311  p' = pflat_len v\<^sub>3 p'"} holds.  Suppose @{term "p' \<in> Pos
       
  1312  v\<^sub>1"}, then we can infer from the first assumption that @{term
       
  1313  "pflat_len v\<^sub>1 p' = pflat_len v\<^sub>2 p'"}.  But this means
       
  1314  that @{term "p'"} must be in @{term "Pos v\<^sub>2"} too (the norm
       
  1315  cannot be \<open>-1\<close> given @{term "p' \<in> Pos v\<^sub>1"}).  
       
  1316  Hence we can use the second assumption and
       
  1317  infer @{term "pflat_len v\<^sub>2 p' = pflat_len v\<^sub>3 p'"},
       
  1318  which concludes this case with @{term "v\<^sub>1 :\<sqsubset>val
       
  1319  v\<^sub>3"}.  The reasoning in the other cases is similar.\qed
       
  1320  \end{proof}
       
  1321 
       
  1322  \noindent 
       
  1323  The proof for $\preccurlyeq$ is similar and omitted.
       
  1324  It is also straightforward to show that \<open>\<prec>\<close> and
       
  1325  $\preccurlyeq$ are partial orders.  Okui and Suzuki furthermore show that they
       
  1326  are linear orderings for lexical values \cite{OkuiSuzuki2010} of a given
       
  1327  regular expression and given string, but we have not formalised this in Isabelle. It is
       
  1328  not essential for our results. What we are going to show below is
       
  1329  that for a given \<open>r\<close> and \<open>s\<close>, the orderings have a unique
       
  1330  minimal element on the set @{term "LV r s"}, which is the POSIX value
       
  1331  we defined in the previous section. We start with two properties that
       
  1332  show how the length of a flattened value relates to the \<open>\<prec>\<close>-ordering.
       
  1333 
       
  1334  \begin{proposition}\mbox{}\smallskip\\\label{ordlen}
       
  1335  \begin{tabular}{@ {}ll}
       
  1336  (1) &
       
  1337  @{thm [mode=IfThen] PosOrd_shorterE[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]}\\
       
  1338  (2) &
       
  1339  @{thm [mode=IfThen] PosOrd_shorterI[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]} 
       
  1340  \end{tabular} 
       
  1341  \end{proposition}
       
  1342  
       
  1343  \noindent Both properties follow from the definition of the ordering. Note that
       
  1344  \textit{(2)} entails that if the underlying string of a value, say @{term "v\<^sub>2"}, is

  1345  a strict prefix of the underlying string of another value, say @{term "v\<^sub>1"}, then
       
  1346  @{term "v\<^sub>1"} must be smaller than @{term "v\<^sub>2"}. For our proofs it
       
  1347  will be useful to have the following properties---in each case the underlying strings 
       
  1348  of the compared values are the same: 
       
  1349 
       
  1350   \begin{proposition}\mbox{}\smallskip\\\label{ordintros}
       
  1351   \begin{tabular}{ll}
       
  1352   \textit{(1)} & 
       
  1353   @{thm [mode=IfThen] PosOrd_Left_Right[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]}\\
       
  1354   \textit{(2)} & If
       
  1355   @{thm (prem 1) PosOrd_Left_eq[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]} \;then\;
       
  1356   @{thm (lhs) PosOrd_Left_eq[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]} \;iff\;
       
  1357   @{thm (rhs) PosOrd_Left_eq[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]}\\
       
  1358   \textit{(3)} & If
       
  1359   @{thm (prem 1) PosOrd_Right_eq[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]} \;then\;
       
  1360   @{thm (lhs) PosOrd_Right_eq[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]} \;iff\;
       
  1361   @{thm (rhs) PosOrd_Right_eq[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]}\\
       
  1362   \textit{(4)} & If
       
  1363   @{thm (prem 1) PosOrd_Seq_eq[where ?v2.0="v\<^sub>2" and ?w2.0="w\<^sub>2"]} \;then\;
       
  1364   @{thm (lhs) PosOrd_Seq_eq[where ?v2.0="v\<^sub>2" and ?w2.0="w\<^sub>2"]} \;iff\;
       
  1365   @{thm (rhs) PosOrd_Seq_eq[where ?v2.0="v\<^sub>2" and ?w2.0="w\<^sub>2"]}\\
       
  1366   \textit{(5)} & If
       
  1367   @{thm (prem 2) PosOrd_SeqI1[simplified, where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2" and
       
  1368                                     ?w1.0="w\<^sub>1" and ?w2.0="w\<^sub>2"]} \;and\;
       
  1369   @{thm (prem 1) PosOrd_SeqI1[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2" and
       
  1370                                     ?w1.0="w\<^sub>1" and ?w2.0="w\<^sub>2"]} \;then\;
       
  1371   @{thm (concl) PosOrd_SeqI1[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2" and
       
  1372                                    ?w1.0="w\<^sub>1" and ?w2.0="w\<^sub>2"]}\\
       
  1373   \textit{(6)} & If
       
  1374   @{thm (prem 1) PosOrd_Stars_append_eq[where ?vs1.0="vs\<^sub>1" and ?vs2.0="vs\<^sub>2"]} \;then\;
       
  1375   @{thm (lhs) PosOrd_Stars_append_eq[where ?vs1.0="vs\<^sub>1" and ?vs2.0="vs\<^sub>2"]} \;iff\;
       
  1376   @{thm (rhs) PosOrd_Stars_append_eq[where ?vs1.0="vs\<^sub>1" and ?vs2.0="vs\<^sub>2"]}\\  
       
  1377   
       
  1378   \textit{(7)} & If
       
  1379   @{thm (prem 2) PosOrd_StarsI[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2" and
       
  1380                             ?vs1.0="vs\<^sub>1" and ?vs2.0="vs\<^sub>2"]} \;and\;
       
  1381   @{thm (prem 1) PosOrd_StarsI[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2" and
       
  1382                             ?vs1.0="vs\<^sub>1" and ?vs2.0="vs\<^sub>2"]} \;then\;
       
  1383    @{thm (concl) PosOrd_StarsI[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2" and
       
  1384                             ?vs1.0="vs\<^sub>1" and ?vs2.0="vs\<^sub>2"]}\\
       
  1385   \end{tabular} 
       
  1386   \end{proposition}
       
  1387 
       
  1388   \noindent One might prefer that statements \textit{(4)} and \textit{(5)} 
       
  1389   (respectively \textit{(6)} and \textit{(7)})
       
  1390   are combined into a single \textit{iff}-statement (like the ones for \<open>Left\<close> and \<open>Right\<close>). Unfortunately this cannot be done easily: such
       
  1391   a single statement would require an additional assumption about the
       
  1392   two values @{term "Seq v\<^sub>1 v\<^sub>2"} and @{term "Seq w\<^sub>1 w\<^sub>2"}
       
  1393   being inhabited by the same regular expression. The
       
  1394   complexity of the proofs involved does not seem to justify such a
       
  1395   `cleaner' single statement. The statements given are just the properties that
       
  1396   allow us to establish our theorems without any difficulty. The proofs 
       
  1397   for Proposition~\ref{ordintros} are routine.
       
  1398  
       
  1399 
       
  1400   Next we establish how Okui and Suzuki's orderings relate to our
       
  1401   definition of POSIX values.  Given a \<open>POSIX\<close> value \<open>v\<^sub>1\<close>
       
  1402   for \<open>r\<close> and \<open>s\<close>, then any other lexical value \<open>v\<^sub>2\<close> in @{term "LV r s"} is greater than or equal to \<open>v\<^sub>1\<close>, namely:
       
  1403 
       
  1404 
       
  1405   \begin{theorem}\label{orderone}
       
  1406   @{thm [mode=IfThen] Posix_PosOrd[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]}
       
  1407   \end{theorem}
       
  1408 
       
  1409   \begin{proof} By induction on our POSIX rules. By
       
  1410   Theorem~\ref{posixdeterm} and the definition of @{const LV}, it is clear
       
  1411   that \<open>v\<^sub>1\<close> and \<open>v\<^sub>2\<close> have the same
       
  1412   underlying string @{term s}.  The three base cases are
       
  1413   straightforward: for example for @{term "v\<^sub>1 = Void"}, we have
       
  1414   that @{term "v\<^sub>2 \<in> LV ONE []"} must also be of the form
       
  1415   \mbox{@{term "v\<^sub>2 = Void"}}. Therefore we have @{term
       
  1416   "v\<^sub>1 :\<sqsubseteq>val v\<^sub>2"}.  The inductive cases for
       
  1417   \<open>r\<close> being of the form @{term "ALT r\<^sub>1 r\<^sub>2"} and
       
  1418   @{term "SEQ r\<^sub>1 r\<^sub>2"} are as follows:
       
  1419 
       
  1420 
       
  1421   \begin{itemize} 
       
  1422 
       
  1423   \item[$\bullet$] Case \<open>P+L\<close> with @{term "s \<in> (ALT r\<^sub>1 r\<^sub>2)
       
  1424   \<rightarrow> (Left w\<^sub>1)"}: In this case the value 
       
  1425   @{term "v\<^sub>2"} is either of the
       
  1426   form @{term "Left w\<^sub>2"} or @{term "Right w\<^sub>2"}. In the
       
  1427   latter case we can immediately conclude with \mbox{@{term "v\<^sub>1
       
  1428   :\<sqsubseteq>val v\<^sub>2"}} since a \<open>Left\<close>-value with the
       
  1429   same underlying string \<open>s\<close> is always smaller than a
       
  1430   \<open>Right\<close>-value by Proposition~\ref{ordintros}\textit{(1)}.  
       
  1431   In the former case we have @{term "w\<^sub>2
       
  1432   \<in> LV r\<^sub>1 s"} and can use the induction hypothesis to infer
       
  1433   @{term "w\<^sub>1 :\<sqsubseteq>val w\<^sub>2"}. Because @{term
       
  1434   "w\<^sub>1"} and @{term "w\<^sub>2"} have the same underlying string
       
  1435   \<open>s\<close>, we can conclude with @{term "Left w\<^sub>1
       
  1436   :\<sqsubseteq>val Left w\<^sub>2"} using
       
  1437   Proposition~\ref{ordintros}\textit{(2)}.\smallskip
       
  1438 
       
  1439   \item[$\bullet$] Case \<open>P+R\<close> with @{term "s \<in> (ALT r\<^sub>1 r\<^sub>2)
       
  1440   \<rightarrow> (Right w\<^sub>1)"}: This case is similar to the previous
       
  1441   case, except that we additionally know @{term "s \<notin> L
       
  1442   r\<^sub>1"}. This is needed when @{term "v\<^sub>2"} is of the form
       
  1443   \mbox{@{term "Left w\<^sub>2"}}. Since \mbox{@{term "flat v\<^sub>2 = flat
       
  1444   w\<^sub>2"} \<open>= s\<close>} and @{term "\<Turnstile> w\<^sub>2 :
       
  1445   r\<^sub>1"}, we can derive a contradiction for \mbox{@{term "s \<notin> L
       
  1446   r\<^sub>1"}} using
       
  1447   Proposition~\ref{inhabs}. So also in this case \mbox{@{term "v\<^sub>1
       
  1448   :\<sqsubseteq>val v\<^sub>2"}}.\smallskip
       
  1449 
       
  1450   \item[$\bullet$] Case \<open>PS\<close> with @{term "(s\<^sub>1 @
       
  1451   s\<^sub>2) \<in> (SEQ r\<^sub>1 r\<^sub>2) \<rightarrow> (Seq
       
  1452   w\<^sub>1 w\<^sub>2)"}: We can assume @{term "v\<^sub>2 = Seq
       
  1453   (u\<^sub>1) (u\<^sub>2)"} with @{term "\<Turnstile> u\<^sub>1 :
       
  1454   r\<^sub>1"} and \mbox{@{term "\<Turnstile> u\<^sub>2 :
       
  1455   r\<^sub>2"}}. We have @{term "s\<^sub>1 @ s\<^sub>2 = (flat
       
  1456   u\<^sub>1) @ (flat u\<^sub>2)"}.  By the side-condition of the
       
  1457   \<open>PS\<close>-rule we know that either @{term "s\<^sub>1 = flat
       
  1458   u\<^sub>1"} or that @{term "flat u\<^sub>1"} is a strict prefix of
       
  1459   @{term "s\<^sub>1"}. In the latter case we can infer @{term
       
  1460   "w\<^sub>1 :\<sqsubset>val u\<^sub>1"} by
       
  1461   Proposition~\ref{ordlen}\textit{(2)} and from this @{term "v\<^sub>1
       
  1462   :\<sqsubseteq>val v\<^sub>2"} by Proposition~\ref{ordintros}\textit{(5)}
       
  1463   (as noted above @{term "v\<^sub>1"} and @{term "v\<^sub>2"} must have the
       
  1464   same underlying string).
       
  1465   In the former case we know
       
  1466   @{term "u\<^sub>1 \<in> LV r\<^sub>1 s\<^sub>1"} and @{term
       
  1467   "u\<^sub>2 \<in> LV r\<^sub>2 s\<^sub>2"}. With this we can use the
       
  1468   induction hypotheses to infer @{term "w\<^sub>1 :\<sqsubseteq>val
       
  1469   u\<^sub>1"} and @{term "w\<^sub>2 :\<sqsubseteq>val u\<^sub>2"}. By
       
  1470   Proposition~\ref{ordintros}\textit{(4,5)} we can again infer 
       
  1471   @{term "v\<^sub>1 :\<sqsubseteq>val
       
  1472   v\<^sub>2"}.
       
  1473 
       
  1474   \end{itemize}
       
  1475 
       
  1476   \noindent The case for \<open>P\<star>\<close> is similar to the \<open>PS\<close>-case and omitted.\qed
       
  1477   \end{proof}
       
  1478 
       
  1479   \noindent This theorem shows that our \<open>POSIX\<close> value for a
       
  1480   regular expression \<open>r\<close> and string @{term s} is in fact a
       
  1481   minimal element of the values in \<open>LV r s\<close>. By
       
  1482   Proposition~\ref{ordlen}\textit{(2)} we also know that any value in
       
  1483   \<open>LV r s'\<close>, with @{term "s'"} being a strict prefix, cannot be
       
  1484   smaller than \<open>v\<^sub>1\<close>. The next theorem shows the
       
  1485   opposite---namely that any minimal element in @{term "LV r s"} must be a
       
  1486   \<open>POSIX\<close> value. This can be established by induction on \<open>r\<close>, but the proof can be drastically simplified by using the fact
       
  1487   from the previous section about the existence of a \<open>POSIX\<close> value
       
  1488   whenever a string @{term "s \<in> L r"}.
       
  1489 
       
  1490 
       
  1491   \begin{theorem}
       
  1492   @{thm [mode=IfThen] PosOrd_Posix[where ?v1.0="v\<^sub>1"]} 
       
  1493   \end{theorem}
       
  1494 
       
  1495   \begin{proof} 
       
  1496   If @{thm (prem 1) PosOrd_Posix[where ?v1.0="v\<^sub>1"]} then 
       
  1497   @{term "s \<in> L r"} by Proposition~\ref{inhabs}. Hence by Theorem~\ref{lexercorrect}(2) 
       
  1498   there exists a
       
  1499   \<open>POSIX\<close> value @{term "v\<^sub>P"} with @{term "s \<in> r \<rightarrow> v\<^sub>P"}
       
  1500   and by Lemma~\ref{LVposix} we also have \mbox{@{term "v\<^sub>P \<in> LV r s"}}.
       
  1501   By Theorem~\ref{orderone} we therefore have 
       
  1502   @{term "v\<^sub>P :\<sqsubseteq>val v\<^sub>1"}. If @{term "v\<^sub>P = v\<^sub>1"} then
       
  1503   we are done. Otherwise we have @{term "v\<^sub>P :\<sqsubset>val v\<^sub>1"}, which 
       
  1504   however contradicts the second assumption about @{term "v\<^sub>1"} being the smallest
       
  1505   element in @{term "LV r s"}. So we are done in this case too.\qed
       
  1506   \end{proof}
       
  1507 
       
  1508   \noindent
       
  1509   From this we can also show 
       
  1510   that if @{term "LV r s"} is non-empty (or equivalently @{term "s \<in> L r"}) then 
       
  1511   it has a unique minimal element:
       
  1512 
       
  1513   \begin{corollary}
       
  1514   @{thm [mode=IfThen] Least_existence1}
       
  1515   \end{corollary}
       
  1516 
       
  1517 
       
  1518 
       
  1519   \noindent To sum up, we have shown that the (unique) minimal elements 
       
  1520   of the ordering by Okui and Suzuki are exactly the \<open>POSIX\<close>
       
  1521   values we defined inductively in Section~\ref{posixsec}. This provides
       
  1522   an independent confirmation that our ternary relation formalises the
       
  1523   informal POSIX rules. 
       
  1524 
       
  1525 \<close>
       
  1526 
       
  1527 section \<open>Bitcoded Lexing\<close>
       
  1528 
       
  1529 
       
  1530 
       
  1531 
       
  1532 text \<open>
       
  1533 
       
  1534 Incremental calculation of the value. To simplify the proof we first define the function
       
  1535 @{const flex} which calculates the ``iterated'' injection function. With this we can 
       
  1536 rewrite the lexer as
       
  1537 
       
  1538 \begin{center}
       
  1539 @{thm lexer_flex}
       
  1540 \end{center}
       
  1541 
       
  1542 
       
  1543 \<close>
       
  1544 
       
  1545 section \<open>Optimisations\<close>
       
  1546 
       
  1547 text \<open>
       
  1548 
       
  1549   Derivatives as calculated by \Brz's method are usually more complex
       
  1550   regular expressions than the initial one; the result is that the
       
  1551   derivative-based matching and lexing algorithms are often abysmally slow.
       
  1552   However, various optimisations are possible, such as the simplifications
       
  1553   of @{term "ALT ZERO r"}, @{term "ALT r ZERO"}, @{term "SEQ ONE r"} and
       
  1554   @{term "SEQ r ONE"} to @{term r}. These simplifications can speed up the
       
  1555   algorithms considerably, as noted in \cite{Sulzmann2014}. One of the
       
  1556   advantages of having a simple specification and correctness proof is that
       
  1557   the latter can be refined to prove the correctness of such simplification
       
  1558   steps. While the simplification of regular expressions according to 
       
  1559   rules like
       
  1560 
       
  1561   \begin{equation}\label{Simpl}
       
  1562   \begin{array}{lcllcllcllcl}
       
  1563   @{term "ALT ZERO r"} & \<open>\<Rightarrow>\<close> & @{term r} \hspace{8mm}%\\
       
  1564   @{term "ALT r ZERO"} & \<open>\<Rightarrow>\<close> & @{term r} \hspace{8mm}%\\
       
  1565   @{term "SEQ ONE r"}  & \<open>\<Rightarrow>\<close> & @{term r} \hspace{8mm}%\\
       
  1566   @{term "SEQ r ONE"}  & \<open>\<Rightarrow>\<close> & @{term r}
       
  1567   \end{array}
       
  1568   \end{equation}
       
  1569 
       
  1570   \noindent is well understood, there is an obstacle with the POSIX value
       
  1571   calculation algorithm by Sulzmann and Lu: if we build a derivative regular
       
  1572   expression and then simplify it, we will calculate a POSIX value for this
       
  1573   simplified derivative regular expression, \emph{not} for the original (unsimplified)
       
  1574   derivative regular expression. Sulzmann and Lu \cite{Sulzmann2014} overcome this obstacle by
       
  1575   not just calculating a simplified regular expression, but also calculating
       
  1576   a \emph{rectification function} that ``repairs'' the incorrect value.
       
  1577   
       
  1578   The rectification functions can be (slightly clumsily) implemented in
       
  1579   Isabelle/HOL as follows, using some auxiliary functions:
       
  1580 
       
  1581   \begin{center}
       
  1582   \begin{tabular}{lcl}
       
  1583   @{thm (lhs) F_RIGHT.simps(1)} & $\dn$ & \<open>Right (f v)\<close>\\
       
  1584   @{thm (lhs) F_LEFT.simps(1)} & $\dn$ & \<open>Left (f v)\<close>\\
       
  1585   
       
  1586   @{thm (lhs) F_ALT.simps(1)} & $\dn$ & \<open>Right (f\<^sub>2 v)\<close>\\
       
  1587   @{thm (lhs) F_ALT.simps(2)} & $\dn$ & \<open>Left (f\<^sub>1 v)\<close>\\
       
  1588   
       
  1589   @{thm (lhs) F_SEQ1.simps(1)} & $\dn$ & \<open>Seq (f\<^sub>1 ()) (f\<^sub>2 v)\<close>\\
       
  1590   @{thm (lhs) F_SEQ2.simps(1)} & $\dn$ & \<open>Seq (f\<^sub>1 v) (f\<^sub>2 ())\<close>\\
       
  1591   @{thm (lhs) F_SEQ.simps(1)} & $\dn$ & \<open>Seq (f\<^sub>1 v\<^sub>1) (f\<^sub>2 v\<^sub>2)\<close>\medskip\\
       
  1592   %\end{tabular}
       
  1593   %
       
  1594   %\begin{tabular}{lcl}
       
  1595   @{term "simp_ALT (ZERO, DUMMY) (r\<^sub>2, f\<^sub>2)"} & $\dn$ & @{term "(r\<^sub>2, F_RIGHT f\<^sub>2)"}\\
       
  1596   @{term "simp_ALT (r\<^sub>1, f\<^sub>1) (ZERO, DUMMY)"} & $\dn$ & @{term "(r\<^sub>1, F_LEFT f\<^sub>1)"}\\
       
  1597   @{term "simp_ALT (r\<^sub>1, f\<^sub>1) (r\<^sub>2, f\<^sub>2)"} & $\dn$ & @{term "(ALT r\<^sub>1 r\<^sub>2, F_ALT f\<^sub>1 f\<^sub>2)"}\\
       
  1598   @{term "simp_SEQ (ONE, f\<^sub>1) (r\<^sub>2, f\<^sub>2)"} & $\dn$ & @{term "(r\<^sub>2, F_SEQ1 f\<^sub>1 f\<^sub>2)"}\\
       
  1599   @{term "simp_SEQ (r\<^sub>1, f\<^sub>1) (ONE, f\<^sub>2)"} & $\dn$ & @{term "(r\<^sub>1, F_SEQ2 f\<^sub>1 f\<^sub>2)"}\\
       
  1600   @{term "simp_SEQ (r\<^sub>1, f\<^sub>1) (r\<^sub>2, f\<^sub>2)"} & $\dn$ & @{term "(SEQ r\<^sub>1 r\<^sub>2, F_SEQ f\<^sub>1 f\<^sub>2)"}\\
       
  1601   \end{tabular}
       
  1602   \end{center}
       
  1603 
       
  1604   \noindent
       
  1605   The functions \<open>simp\<^bsub>Alt\<^esub>\<close> and \<open>simp\<^bsub>Seq\<^esub>\<close> encode the simplification rules
       
  1606   in \eqref{Simpl} and compose the rectification functions (simplifications can occur
       
  1607   deep inside the regular expression). The main simplification function is then 
       
  1608 
       
  1609   \begin{center}
       
  1610   \begin{tabular}{lcl}
       
  1611   @{term "simp (ALT r\<^sub>1 r\<^sub>2)"} & $\dn$ & @{term "simp_ALT (simp r\<^sub>1) (simp r\<^sub>2)"}\\
       
  1612   @{term "simp (SEQ r\<^sub>1 r\<^sub>2)"} & $\dn$ & @{term "simp_SEQ (simp r\<^sub>1) (simp r\<^sub>2)"}\\
       
  1613   @{term "simp r"} & $\dn$ & @{term "(r, id)"}\\
       
  1614   \end{tabular}
       
  1615   \end{center} 
       
  1616 
       
  1617   \noindent where @{term "id"} stands for the identity function. The
       
  1618   function @{const simp} returns a simplified regular expression and a corresponding
       
  1619   rectification function. Note that we do not simplify under stars: this
       
  1620   seems to slow down the algorithm, rather than speed it up. The optimised
       
  1621   lexer is then given by the clauses:
       
  1622   
       
  1623   \begin{center}
       
  1624   \begin{tabular}{lcl}
       
  1625   @{thm (lhs) slexer.simps(1)} & $\dn$ & @{thm (rhs) slexer.simps(1)}\\
       
  1626   @{thm (lhs) slexer.simps(2)} & $\dn$ & 
       
  1627                          \<open>let (r\<^sub>s, f\<^sub>r) = simp (r \<close>$\backslash$\<open> c) in\<close>\\
       
  1628                      & & \<open>case\<close> @{term "slexer r\<^sub>s s"} \<open>of\<close>\\
       
  1629                      & & \phantom{$|$} @{term "None"}  \<open>\<Rightarrow>\<close> @{term None}\\
       
  1630                      & & $|$ @{term "Some v"} \<open>\<Rightarrow>\<close> \<open>Some (inj r c (f\<^sub>r v))\<close>                          
       
  1631   \end{tabular}
       
  1632   \end{center}
       
  1633 
       
  1634   \noindent
       
  1635   In the second clause we first calculate the derivative @{term "der c r"}
       
  1636   and then simplify the result.
       
  1637 
       
  1638 text \<open>
       
  1639 
       
  1640 Incremental calculation of the value. To simplify the proof we first define the function
       
  1641 @{const flex} which calculates the ``iterated'' injection function. With this we can 
       
  1642 rewrite the lexer as
       
  1643 
       
  1644 \begin{center}
       
  1645 @{thm lexer_flex}
       
  1646 \end{center}
       
  1647 
       
  1648 \begin{center}
       
  1649   \begin{tabular}{lcl}
       
  1650   @{thm (lhs) code.simps(1)} & $\dn$ & @{thm (rhs) code.simps(1)}\\
       
  1651   @{thm (lhs) code.simps(2)} & $\dn$ & @{thm (rhs) code.simps(2)}\\
       
  1652   @{thm (lhs) code.simps(3)} & $\dn$ & @{thm (rhs) code.simps(3)}\\
       
  1653   @{thm (lhs) code.simps(4)} & $\dn$ & @{thm (rhs) code.simps(4)}\\
       
  1654   @{thm (lhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]} & $\dn$ & @{thm (rhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]}\\
       
  1655   @{thm (lhs) code.simps(6)} & $\dn$ & @{thm (rhs) code.simps(6)}\\
       
  1656   @{thm (lhs) code.simps(7)} & $\dn$ & @{thm (rhs) code.simps(7)}
       
  1657 \end{tabular}
       
  1658 \end{center}
       
  1659 
       
  1660 \begin{center}
       
  1661 \begin{tabular}{lcl}
       
  1662   @{term areg} & $::=$ & @{term "AZERO"}\\
       
  1663                & $\mid$ & @{term "AONE bs"}\\
       
  1664                & $\mid$ & @{term "ACHAR bs c"}\\
       
  1665                & $\mid$ & @{term "AALT bs r1 r2"}\\
       
  1666                & $\mid$ & @{term "ASEQ bs r\<^sub>1 r\<^sub>2"}\\
       
  1667                & $\mid$ & @{term "ASTAR bs r"}
       
  1668 \end{tabular}
       
  1669 \end{center}
       
  1670 
       
  1671 
       
  1672 \begin{center}
       
  1673   \begin{tabular}{lcl}
       
  1674   @{thm (lhs) intern.simps(1)} & $\dn$ & @{thm (rhs) intern.simps(1)}\\
       
  1675   @{thm (lhs) intern.simps(2)} & $\dn$ & @{thm (rhs) intern.simps(2)}\\
       
  1676   @{thm (lhs) intern.simps(3)} & $\dn$ & @{thm (rhs) intern.simps(3)}\\
       
  1677   @{thm (lhs) intern.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) intern.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
       
  1678   @{thm (lhs) intern.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) intern.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
       
  1679   @{thm (lhs) intern.simps(6)} & $\dn$ & @{thm (rhs) intern.simps(6)}\\
       
  1680 \end{tabular}
       
  1681 \end{center}
       
  1682 
       
  1683 \begin{center}
       
  1684   \begin{tabular}{lcl}
       
  1685   @{thm (lhs) erase.simps(1)} & $\dn$ & @{thm (rhs) erase.simps(1)}\\
       
  1686   @{thm (lhs) erase.simps(2)[of bs]} & $\dn$ & @{thm (rhs) erase.simps(2)[of bs]}\\
       
  1687   @{thm (lhs) erase.simps(3)[of bs]} & $\dn$ & @{thm (rhs) erase.simps(3)[of bs]}\\
       
  1688   @{thm (lhs) erase.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) erase.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  1689   @{thm (lhs) erase.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) erase.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  1690   @{thm (lhs) erase.simps(6)[of bs]} & $\dn$ & @{thm (rhs) erase.simps(6)[of bs]}\\
       
  1691 \end{tabular}
       
  1692 \end{center}
       
  1693 
       
  1694   Some simple facts about the \<open>erase\<close> function:
       
  1695 
       
  1696 \begin{lemma}\mbox{}\\
       
  1697 @{thm erase_bder}\\
       
  1698 @{thm erase_intern}
       
  1699 \end{lemma}
       
  1700 
       
  1701 \begin{center}
       
  1702   \begin{tabular}{lcl}
       
  1703   @{thm (lhs) bnullable.simps(1)} & $\dn$ & @{thm (rhs) bnullable.simps(1)}\\
       
  1704   @{thm (lhs) bnullable.simps(2)} & $\dn$ & @{thm (rhs) bnullable.simps(2)}\\
       
  1705   @{thm (lhs) bnullable.simps(3)} & $\dn$ & @{thm (rhs) bnullable.simps(3)}\\
       
  1706   @{thm (lhs) bnullable.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bnullable.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  1707   @{thm (lhs) bnullable.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bnullable.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  1708   @{thm (lhs) bnullable.simps(6)} & $\dn$ & @{thm (rhs) bnullable.simps(6)}\medskip\\
       
  1709 
       
  1710 %  \end{tabular}
       
  1711 %  \end{center}
       
  1712 
       
  1713 %  \begin{center}
       
  1714 %  \begin{tabular}{lcl}
       
  1715 
       
  1716   @{thm (lhs) bder.simps(1)} & $\dn$ & @{thm (rhs) bder.simps(1)}\\
       
  1717   @{thm (lhs) bder.simps(2)} & $\dn$ & @{thm (rhs) bder.simps(2)}\\
       
  1718   @{thm (lhs) bder.simps(3)} & $\dn$ & @{thm (rhs) bder.simps(3)}\\
       
  1719   @{thm (lhs) bder.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bder.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  1720   @{thm (lhs) bder.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bder.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  1721   @{thm (lhs) bder.simps(6)} & $\dn$ & @{thm (rhs) bder.simps(6)}
       
  1722   \end{tabular}
       
  1723   \end{center}
       
  1724 
       
  1725 
       
  1726 \begin{center}
       
  1727   \begin{tabular}{lcl}
       
  1728   @{thm (lhs) bmkeps.simps(1)} & $\dn$ & @{thm (rhs) bmkeps.simps(1)}\\
       
  1729   @{thm (lhs) bmkeps.simps(2)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bmkeps.simps(2)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  1730   @{thm (lhs) bmkeps.simps(3)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bmkeps.simps(3)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  1731   @{thm (lhs) bmkeps.simps(4)} & $\dn$ & @{thm (rhs) bmkeps.simps(4)}\medskip\\
       
  1732 \end{tabular}
       
  1733 \end{center}
       
  1734 
       
  1735 
       
  1736 @{thm [mode=IfThen] bder_retrieve}
       
  1737 
       
  1738   The proof is by induction on \<open>r\<close>.
       
  1739 
       
  1740 \begin{theorem}[Main Lemma]\mbox{}\\
       
  1741 @{thm [mode=IfThen] MAIN_decode}
       
  1742 \end{theorem}
       
  1743 
       
  1744 \noindent
       
  1745   The definition of the bitcoded lexer is as follows:
       
  1746 
       
  1747 @{thm blexer_def}
       
  1748 
       
  1749 
       
  1750 \begin{theorem}
       
  1751 @{thm blexer_correctness}
       
  1752 \end{theorem}
       
  1753 
       
  1754 \<close>
       
  1755 
       
  1756 section \<open>Optimisations\<close>
       
  1757 
       
  1758 text \<open>
       
  1759 
       
  1760   Derivatives as calculated by \Brz's method are usually more complex
       
  1761   regular expressions than the initial one; the result is that the
       
  1762   derivative-based matching and lexing algorithms are often abysmally slow.
       
  1763   However, various optimisations are possible, such as the simplifications
       
  1764   of @{term "ALT ZERO r"}, @{term "ALT r ZERO"}, @{term "SEQ ONE r"} and
       
  1765   @{term "SEQ r ONE"} to @{term r}. These simplifications can speed up the
       
  1766   algorithms considerably, as noted in \cite{Sulzmann2014}. One of the
       
  1767   advantages of having a simple specification and correctness proof is that
       
  1768   the latter can be refined to prove the correctness of such simplification
       
  1769   steps. While the simplification of regular expressions according to 
       
  1770   rules like
       
  1771 
       
  1772   \begin{equation}\label{Simpl}
       
  1773   \begin{array}{lcllcllcllcl}
       
  1774   @{term "ALT ZERO r"} & \<open>\<Rightarrow>\<close> & @{term r} \hspace{8mm}%\\
       
  1775   @{term "ALT r ZERO"} & \<open>\<Rightarrow>\<close> & @{term r} \hspace{8mm}%\\
       
  1776   @{term "SEQ ONE r"}  & \<open>\<Rightarrow>\<close> & @{term r} \hspace{8mm}%\\
       
  1777   @{term "SEQ r ONE"}  & \<open>\<Rightarrow>\<close> & @{term r}
       
  1778   \end{array}
       
  1779   \end{equation}
       
  1780 
       
  1781   \noindent is well understood, there is an obstacle with the POSIX value
       
  1782   calculation algorithm by Sulzmann and Lu: if we build a derivative regular
       
  1783   expression and then simplify it, we will calculate a POSIX value for this
       
  1784   simplified derivative regular expression, \emph{not} for the original (unsimplified)
       
  1785   derivative regular expression. Sulzmann and Lu \cite{Sulzmann2014} overcome this obstacle by
       
  1786   not just calculating a simplified regular expression, but also calculating
       
  1787   a \emph{rectification function} that ``repairs'' the incorrect value.
       
  1788   
       
  1789   The rectification functions can be (slightly clumsily) implemented in
       
  1790   Isabelle/HOL as follows, using some auxiliary functions:
       
  1791 
       
  1792   \begin{center}
       
  1793   \begin{tabular}{lcl}
       
  1794   @{thm (lhs) F_RIGHT.simps(1)} & $\dn$ & \<open>Right (f v)\<close>\\
       
  1795   @{thm (lhs) F_LEFT.simps(1)} & $\dn$ & \<open>Left (f v)\<close>\\
       
  1796   
       
  1797   @{thm (lhs) F_ALT.simps(1)} & $\dn$ & \<open>Right (f\<^sub>2 v)\<close>\\
       
  1798   @{thm (lhs) F_ALT.simps(2)} & $\dn$ & \<open>Left (f\<^sub>1 v)\<close>\\
       
  1799   
       
  1800   @{thm (lhs) F_SEQ1.simps(1)} & $\dn$ & \<open>Seq (f\<^sub>1 ()) (f\<^sub>2 v)\<close>\\
       
  1801   @{thm (lhs) F_SEQ2.simps(1)} & $\dn$ & \<open>Seq (f\<^sub>1 v) (f\<^sub>2 ())\<close>\\
       
  1802   @{thm (lhs) F_SEQ.simps(1)} & $\dn$ & \<open>Seq (f\<^sub>1 v\<^sub>1) (f\<^sub>2 v\<^sub>2)\<close>\medskip\\
       
  1803   %\end{tabular}
       
  1804   %
       
  1805   %\begin{tabular}{lcl}
       
  1806   @{term "simp_ALT (ZERO, DUMMY) (r\<^sub>2, f\<^sub>2)"} & $\dn$ & @{term "(r\<^sub>2, F_RIGHT f\<^sub>2)"}\\
       
  1807   @{term "simp_ALT (r\<^sub>1, f\<^sub>1) (ZERO, DUMMY)"} & $\dn$ & @{term "(r\<^sub>1, F_LEFT f\<^sub>1)"}\\
       
  1808   @{term "simp_ALT (r\<^sub>1, f\<^sub>1) (r\<^sub>2, f\<^sub>2)"} & $\dn$ & @{term "(ALT r\<^sub>1 r\<^sub>2, F_ALT f\<^sub>1 f\<^sub>2)"}\\
       
  1809   @{term "simp_SEQ (ONE, f\<^sub>1) (r\<^sub>2, f\<^sub>2)"} & $\dn$ & @{term "(r\<^sub>2, F_SEQ1 f\<^sub>1 f\<^sub>2)"}\\
       
  1810   @{term "simp_SEQ (r\<^sub>1, f\<^sub>1) (ONE, f\<^sub>2)"} & $\dn$ & @{term "(r\<^sub>1, F_SEQ2 f\<^sub>1 f\<^sub>2)"}\\
       
  1811   @{term "simp_SEQ (r\<^sub>1, f\<^sub>1) (r\<^sub>2, f\<^sub>2)"} & $\dn$ & @{term "(SEQ r\<^sub>1 r\<^sub>2, F_SEQ f\<^sub>1 f\<^sub>2)"}\\
       
  1812   \end{tabular}
       
  1813   \end{center}
       
  1814 
       
  1815   \noindent
       
  1816   The functions \<open>simp\<^bsub>Alt\<^esub>\<close> and \<open>simp\<^bsub>Seq\<^esub>\<close> encode the simplification rules
       
  1817   in \eqref{Simpl} and compose the rectification functions (simplifications can occur
       
  1818   deep inside the regular expression). The main simplification function is then 
       
  1819 
       
  1820   \begin{center}
       
  1821   \begin{tabular}{lcl}
       
  1822   @{term "simp (ALT r\<^sub>1 r\<^sub>2)"} & $\dn$ & @{term "simp_ALT (simp r\<^sub>1) (simp r\<^sub>2)"}\\
       
  1823   @{term "simp (SEQ r\<^sub>1 r\<^sub>2)"} & $\dn$ & @{term "simp_SEQ (simp r\<^sub>1) (simp r\<^sub>2)"}\\
       
  1824   @{term "simp r"} & $\dn$ & @{term "(r, id)"}\\
       
  1825   \end{tabular}
       
  1826   \end{center} 
       
  1827 
       
  1828   \noindent where @{term "id"} stands for the identity function. The
       
  1829   function @{const simp} returns a simplified regular expression and a corresponding
       
  1830   rectification function. Note that we do not simplify under stars: this
       
  1831   seems to slow down the algorithm, rather than speed it up. The optimised
       
  1832   lexer is then given by the clauses:
       
  1833   
       
  1834   \begin{center}
       
  1835   \begin{tabular}{lcl}
       
  1836   @{thm (lhs) slexer.simps(1)} & $\dn$ & @{thm (rhs) slexer.simps(1)}\\
       
  1837   @{thm (lhs) slexer.simps(2)} & $\dn$ & 
       
  1838                          \<open>let (r\<^sub>s, f\<^sub>r) = simp (r \<close>$\backslash$\<open> c) in\<close>\\
       
  1839                      & & \<open>case\<close> @{term "slexer r\<^sub>s s"} \<open>of\<close>\\
       
  1840                      & & \phantom{$|$} @{term "None"}  \<open>\<Rightarrow>\<close> @{term None}\\
       
  1841                      & & $|$ @{term "Some v"} \<open>\<Rightarrow>\<close> \<open>Some (inj r c (f\<^sub>r v))\<close>                          
       
  1842   \end{tabular}
       
  1843   \end{center}
       
  1844 
       
  1845   \noindent
       
  1846   In the second clause we first calculate the derivative @{term "der c r"}
       
  1847   and then simplify the result. This gives us a simplified derivative
       
  1848   \<open>r\<^sub>s\<close> and a rectification function \<open>f\<^sub>r\<close>. The lexer
       
  1849   is then recursively called with the simplified derivative, but before
       
  1850   we inject the character @{term c} into the value @{term v}, we need to rectify
       
  1851   @{term v} (that is construct @{term "f\<^sub>r v"}). Before we can establish the correctness
       
  1852   of @{term "slexer"}, we need to show that simplification preserves the language
       
  1853   and that simplification preserves our POSIX relation once the value is rectified
       
  1854   (recall @{const "simp"} generates a (regular expression, rectification function) pair):
       
  1855 
       
  1856   \begin{lemma}\mbox{}\smallskip\\\label{slexeraux}
       
  1857   \begin{tabular}{ll}
       
  1858   (1) & @{thm L_fst_simp[symmetric]}\\
       
  1859   (2) & @{thm[mode=IfThen] Posix_simp}
       
  1860   \end{tabular}
       
  1861   \end{lemma}
       
  1862 
       
  1863   \begin{proof} Both are by induction on \<open>r\<close>. There is no
       
  1864   interesting case for the first statement. For the second statement,
       
  1865   of interest are the @{term "r = ALT r\<^sub>1 r\<^sub>2"} and @{term "r = SEQ r\<^sub>1
       
  1866   r\<^sub>2"} cases. In each case we have to analyse four subcases whether
       
  1867   @{term "fst (simp r\<^sub>1)"} and @{term "fst (simp r\<^sub>2)"} equals @{const
       
  1868   ZERO} (respectively @{const ONE}). For example for @{term "r = ALT
       
  1869   r\<^sub>1 r\<^sub>2"}, consider the subcase @{term "fst (simp r\<^sub>1) = ZERO"} and
       
  1870   @{term "fst (simp r\<^sub>2) \<noteq> ZERO"}. By assumption we know @{term "s \<in>
       
  1871   fst (simp (ALT r\<^sub>1 r\<^sub>2)) \<rightarrow> v"}. From this we can infer @{term "s \<in> fst (simp r\<^sub>2) \<rightarrow> v"}
       
  1872   and by IH also (*) @{term "s \<in> r\<^sub>2 \<rightarrow> (snd (simp r\<^sub>2) v)"}. Given @{term "fst (simp r\<^sub>1) = ZERO"}
       
  1873   we know @{term "L (fst (simp r\<^sub>1)) = {}"}. By the first statement
       
  1874   @{term "L r\<^sub>1"} is the empty set, meaning (**) @{term "s \<notin> L r\<^sub>1"}.
       
  1875   Taking (*) and (**) together gives by the \mbox{\<open>P+R\<close>}-rule 
       
  1876   @{term "s \<in> ALT r\<^sub>1 r\<^sub>2 \<rightarrow> Right (snd (simp r\<^sub>2) v)"}. In turn this
       
  1877   gives @{term "s \<in> ALT r\<^sub>1 r\<^sub>2 \<rightarrow> snd (simp (ALT r\<^sub>1 r\<^sub>2)) v"} as we need to show.
       
  1878   The other cases are similar.\qed
       
  1879   \end{proof}
       
  1880 
       
  1881   \noindent We can now prove relatively straightforwardly that the
       
  1882   optimised lexer produces the expected result:
       
  1883 
       
  1884   \begin{theorem}
       
  1885   @{thm slexer_correctness}
       
  1886   \end{theorem}
       
  1887 
       
  1888   \begin{proof} By induction on @{term s} generalising over @{term
       
  1889   r}. The case @{term "[]"} is trivial. For the cons-case suppose the
       
  1890   string is of the form @{term "c # s"}. By induction hypothesis we
       
  1891   know @{term "slexer r s = lexer r s"} holds for all @{term r} (in
       
  1892   particular for @{term "r"} being the derivative @{term "der c
       
  1893   r"}). Let @{term "r\<^sub>s"} be the simplified derivative regular expression, that is @{term
       
  1894   "fst (simp (der c r))"}, and @{term "f\<^sub>r"} be the rectification
       
  1895   function, that is @{term "snd (simp (der c r))"}.  We distinguish the cases
       
  1896   whether (*) @{term "s \<in> L (der c r)"} or not. In the first case we
       
  1897   have by Theorem~\ref{lexercorrect}(2) a value @{term "v"} so that @{term
       
  1898   "lexer (der c r) s = Some v"} and @{term "s \<in> der c r \<rightarrow> v"} hold.
       
  1899   By Lemma~\ref{slexeraux}(1) we can also infer from~(*) that @{term "s
       
  1900   \<in> L r\<^sub>s"} holds.  Hence we know by Theorem~\ref{lexercorrect}(2) that
       
  1901   there exists a @{term "v'"} with @{term "lexer r\<^sub>s s = Some v'"} and
       
  1902   @{term "s \<in> r\<^sub>s \<rightarrow> v'"}. From the latter we know by
       
  1903   Lemma~\ref{slexeraux}(2) that @{term "s \<in> der c r \<rightarrow> (f\<^sub>r v')"} holds.
       
  1904   By the uniqueness of the POSIX relation (Theorem~\ref{posixdeterm}) we
       
  1905   can infer that @{term v} is equal to @{term "f\<^sub>r v'"}---that is the 
       
  1906   rectification function applied to @{term "v'"}
       
  1907   produces the original @{term "v"}.  Now the case follows by the
       
  1908   definitions of @{const lexer} and @{const slexer}.
       
  1909 
       
  1910   In the second case where @{term "s \<notin> L (der c r)"} we have that
       
  1911   @{term "lexer (der c r) s = None"} by Theorem~\ref{lexercorrect}(1).  We
       
  1912   also know by Lemma~\ref{slexeraux}(1) that @{term "s \<notin> L r\<^sub>s"}. Hence
       
  1913   @{term "lexer r\<^sub>s s = None"} by Theorem~\ref{lexercorrect}(1) and
       
  1914   by IH then also @{term "slexer r\<^sub>s s = None"}. With this we can
       
  1915   conclude in this case too.\qed   
       
  1916 
       
  1917   \end{proof} 
       
  1918 
       
  1919 \<close>
       
  1920 fy the result. This gives us a simplified derivative
       
  1921   \<open>r\<^sub>s\<close> and a rectification function \<open>f\<^sub>r\<close>. The lexer
       
  1922   is then recursively called with the simplified derivative, but before
       
  1923   we inject the character @{term c} into the value @{term v}, we need to rectify
       
  1924   @{term v} (that is construct @{term "f\<^sub>r v"}). Before we can establish the correctness
       
  1925   of @{term "slexer"}, we need to show that simplification preserves the language
       
  1926   and that simplification preserves our POSIX relation once the value is rectified
       
  1927   (recall @{const "simp"} generates a (regular expression, rectification function) pair):
       
  1928 
       
  1929   \begin{lemma}\mbox{}\smallskip\\\label{slexeraux}
       
  1930   \begin{tabular}{ll}
       
  1931   (1) & @{thm L_fst_simp[symmetric]}\\
       
  1932   (2) & @{thm[mode=IfThen] Posix_simp}
       
  1933   \end{tabular}
       
  1934   \end{lemma}
       
  1935 
       
  1936   \begin{proof} Both are by induction on \<open>r\<close>. There is no
       
  1937   interesting case for the first statement. For the second statement,
       
  1938   of interest are the @{term "r = ALT r\<^sub>1 r\<^sub>2"} and @{term "r = SEQ r\<^sub>1
       
  1939   r\<^sub>2"} cases. In each case we have to analyse four subcases whether
       
  1940   @{term "fst (simp r\<^sub>1)"} and @{term "fst (simp r\<^sub>2)"} equals @{const
       
  1941   ZERO} (respectively @{const ONE}). For example for @{term "r = ALT
       
  1942   r\<^sub>1 r\<^sub>2"}, consider the subcase @{term "fst (simp r\<^sub>1) = ZERO"} and
       
  1943   @{term "fst (simp r\<^sub>2) \<noteq> ZERO"}. By assumption we know @{term "s \<in>
       
  1944   fst (simp (ALT r\<^sub>1 r\<^sub>2)) \<rightarrow> v"}. From this we can infer @{term "s \<in> fst (simp r\<^sub>2) \<rightarrow> v"}
       
  1945   and by IH also (*) @{term "s \<in> r\<^sub>2 \<rightarrow> (snd (simp r\<^sub>2) v)"}. Given @{term "fst (simp r\<^sub>1) = ZERO"}
       
  1946   we know @{term "L (fst (simp r\<^sub>1)) = {}"}. By the first statement
       
  1947   @{term "L r\<^sub>1"} is the empty set, meaning (**) @{term "s \<notin> L r\<^sub>1"}.
       
  1948   Taking (*) and (**) together gives by the \mbox{\<open>P+R\<close>}-rule 
       
  1949   @{term "s \<in> ALT r\<^sub>1 r\<^sub>2 \<rightarrow> Right (snd (simp r\<^sub>2) v)"}. In turn this
       
  1950   gives @{term "s \<in> ALT r\<^sub>1 r\<^sub>2 \<rightarrow> snd (simp (ALT r\<^sub>1 r\<^sub>2)) v"} as we need to show.
       
  1951   The other cases are similar.\qed
       
  1952   \end{proof}
       
  1953 
       
  1954   \noindent We can now prove relatively straightforwardly that the
       
  1955   optimised lexer produces the expected result:
       
  1956 
       
  1957   \begin{theorem}
       
  1958   @{thm slexer_correctness}
       
  1959   \end{theorem}
       
  1960 
       
  1961   \begin{proof} By induction on @{term s} generalising over @{term
       
  1962   r}. The case @{term "[]"} is trivial. For the cons-case suppose the
       
  1963   string is of the form @{term "c # s"}. By induction hypothesis we
       
  1964   know @{term "slexer r s = lexer r s"} holds for all @{term r} (in
       
  1965   particular for @{term "r"} being the derivative @{term "der c
       
  1966   r"}). Let @{term "r\<^sub>s"} be the simplified derivative regular expression, that is @{term
       
  1967   "fst (simp (der c r))"}, and @{term "f\<^sub>r"} be the rectification
       
  1968   function, that is @{term "snd (simp (der c r))"}.  We distinguish the cases
       
  1969   whether (*) @{term "s \<in> L (der c r)"} or not. In the first case we
       
  1970   have by Theorem~\ref{lexercorrect}(2) a value @{term "v"} so that @{term
       
  1971   "lexer (der c r) s = Some v"} and @{term "s \<in> der c r \<rightarrow> v"} hold.
       
  1972   By Lemma~\ref{slexeraux}(1) we can also infer from~(*) that @{term "s
       
  1973   \<in> L r\<^sub>s"} holds.  Hence we know by Theorem~\ref{lexercorrect}(2) that
       
  1974   there exists a @{term "v'"} with @{term "lexer r\<^sub>s s = Some v'"} and
       
  1975   @{term "s \<in> r\<^sub>s \<rightarrow> v'"}. From the latter we know by
       
  1976   Lemma~\ref{slexeraux}(2) that @{term "s \<in> der c r \<rightarrow> (f\<^sub>r v')"} holds.
       
  1977   By the uniqueness of the POSIX relation (Theorem~\ref{posixdeterm}) we
       
  1978   can infer that @{term v} is equal to @{term "f\<^sub>r v'"}---that is the 
       
  1979   rectification function applied to @{term "v'"}
       
  1980   produces the original @{term "v"}.  Now the case follows by the
       
  1981   definitions of @{const lexer} and @{const slexer}.
       
  1982 
       
  1983   In the second case where @{term "s \<notin> L (der c r)"} we have that
       
  1984   @{term "lexer (der c r) s = None"} by Theorem~\ref{lexercorrect}(1).  We
       
  1985   also know by Lemma~\ref{slexeraux}(1) that @{term "s \<notin> L r\<^sub>s"}. Hence
       
  1986   @{term "lexer r\<^sub>s s = None"} by Theorem~\ref{lexercorrect}(1) and
       
  1987   by IH then also @{term "slexer r\<^sub>s s = None"}. With this we can
       
  1988   conclude in this case too.\qed   
       
  1989 
       
  1990   \end{proof} 
       
  1991 
       
  1992 \<close>
       
  1993 
       
  1994 
       
  1995 section \<open>HERE\<close>
       
  1996 
       
  1997 text \<open>
       
  1998 
       
  1999   \begin{lemma}
       
  2000   @{thm [mode=IfThen] bder_retrieve}
       
  2001   \end{lemma}
       
  2002 
       
  2003   \begin{proof}
       
  2004   By induction on the definition of @{term "erase r"}. The cases for rule 1) and 2) are
       
  2005   straightforward as @{term "der c ZERO"} and @{term "der c ONE"} are both equal to 
       
  2006   @{term ZERO}. This means @{term "\<Turnstile> v : ZERO"} cannot hold. Similarly in case of rule 3)
       
  2007   where @{term r} is of the form @{term "ACHAR d"} with @{term "c = d"}. Then by assumption
       
  2008   we know @{term "\<Turnstile> v : ONE"}, which implies @{term "v = Void"}. The equation follows by 
       
  2009   simplification of left- and right-hand side. In case @{term "c \<noteq> d"} we have again
       
  2010   @{term "\<Turnstile> v : ZERO"}, which cannot hold. 
       
  2011 
       
  2012   For rule 4a) we have again @{term "\<Turnstile> v : ZERO"}. The property holds by IH for rule 4b).
       
  2013   The induction hypothesis is 
       
  2014   \[
       
  2015   @{term "retrieve (bder c r) v = retrieve r (injval (erase r) c v)"}
       
  2016   \]
       
  2017   which is what left- and right-hand side simplify to.  The slightly more interesting case
       
  2018   is for 4c). By assumption we have 
       
  2019   @{term "\<Turnstile> v : ALT (der c (erase r\<^sub>1)) (der c (erase (AALTs bs (r\<^sub>2 # rs))))"}. This means we 
       
  2020   have either (*) @{term "\<Turnstile> v1 : der c (erase r\<^sub>1)"} with @{term "v = Left v1"} or
       
  2021   (**) @{term "\<Turnstile> v2 : der c (erase (AALTs bs (r\<^sub>2 # rs)))"} with @{term "v = Right v2"}.
       
  2022   The former case is straightforward by simplification. The second case is \ldots TBD.
       
  2023 
       
  2024   Rule 5) TBD.
       
  2025 
       
  2026   Finally for rule 6) the reasoning is as follows: By assumption we have
       
  2027   @{term "\<Turnstile> v : SEQ (der c (erase r)) (STAR (erase r))"}. This means we also have
       
  2028   @{term "v = Seq v1 v2"}, @{term "\<Turnstile> v1 : der c (erase r)"}  and @{term "v2 = Stars vs"}.
       
  2029   We want to prove
       
  2030   \begin{align}
       
  2031   & @{term "retrieve (ASEQ bs (fuse [Z] (bder c r)) (ASTAR [] r)) v"}\\
       
  2032   &= @{term "retrieve (ASTAR bs r) (injval (STAR (erase r)) c v)"}
       
  2033   \end{align}
       
  2034   The right-hand side @{term inj}-expression is equal to 
       
  2035   @{term "Stars (injval (erase r) c v1 # vs)"}, which means the @{term retrieve}-expression
       
  2036   simplifies to 
       
  2037   \[
       
  2038   @{term "bs @ [Z] @ retrieve r (injval (erase r) c v1) @ retrieve (ASTAR [] r) (Stars vs)"}
       
  2039   \]
       
  2040   The left-hand side (3) above simplifies to 
       
  2041   \[
       
  2042   @{term "bs @ retrieve (fuse [Z] (bder c r)) v1 @ retrieve (ASTAR [] r) (Stars vs)"} 
       
  2043   \]
       
  2044   We can move out the @{term "fuse [Z]"} and then use the IH to show that left-hand side
       
  2045   and right-hand side are equal. This completes the proof. 
       
  2046   \end{proof}   
       
  2047 
       
  2048    
       
  2049 
       
  2050   \bibliographystyle{plain}
       
  2051   \bibliography{root}
       
  2052 
       
  2053 \<close>
       
  2054 (*<*)
       
  2055 end
       
  2056 (*>*)
       
  2057 
       
  2058 (*
       
  2059 
       
  2060 \begin{center}
       
  2061   \begin{tabular}{lcl}
       
  2062   @{thm (lhs) code.simps(1)} & $\dn$ & @{thm (rhs) code.simps(1)}\\
       
  2063   @{thm (lhs) code.simps(2)} & $\dn$ & @{thm (rhs) code.simps(2)}\\
       
  2064   @{thm (lhs) code.simps(3)} & $\dn$ & @{thm (rhs) code.simps(3)}\\
       
  2065   @{thm (lhs) code.simps(4)} & $\dn$ & @{thm (rhs) code.simps(4)}\\
       
  2066   @{thm (lhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]} & $\dn$ & @{thm (rhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]}\\
       
  2067   @{thm (lhs) code.simps(6)} & $\dn$ & @{thm (rhs) code.simps(6)}\\
       
  2068   @{thm (lhs) code.simps(7)} & $\dn$ & @{thm (rhs) code.simps(7)}
       
  2069 \end{tabular}
       
  2070 \end{center}
       
  2071 
       
  2072 \begin{center}
       
  2073 \begin{tabular}{lcl}
       
  2074   @{term areg} & $::=$ & @{term "AZERO"}\\
       
  2075                & $\mid$ & @{term "AONE bs"}\\
       
  2076                & $\mid$ & @{term "ACHAR bs c"}\\
       
  2077                & $\mid$ & @{term "AALT bs r1 r2"}\\
       
  2078                & $\mid$ & @{term "ASEQ bs r\<^sub>1 r\<^sub>2"}\\
       
  2079                & $\mid$ & @{term "ASTAR bs r"}
       
  2080 \end{tabular}
       
  2081 \end{center}
       
  2082 
       
  2083 
       
  2084 \begin{center}
       
  2085   \begin{tabular}{lcl}
       
  2086   @{thm (lhs) intern.simps(1)} & $\dn$ & @{thm (rhs) intern.simps(1)}\\
       
  2087   @{thm (lhs) intern.simps(2)} & $\dn$ & @{thm (rhs) intern.simps(2)}\\
       
  2088   @{thm (lhs) intern.simps(3)} & $\dn$ & @{thm (rhs) intern.simps(3)}\\
       
  2089   @{thm (lhs) intern.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) intern.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
       
  2090   @{thm (lhs) intern.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) intern.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
       
  2091   @{thm (lhs) intern.simps(6)} & $\dn$ & @{thm (rhs) intern.simps(6)}\\
       
  2092 \end{tabular}
       
  2093 \end{center}
       
  2094 
       
  2095 \begin{center}
       
  2096   \begin{tabular}{lcl}
       
  2097   @{thm (lhs) erase.simps(1)} & $\dn$ & @{thm (rhs) erase.simps(1)}\\
       
  2098   @{thm (lhs) erase.simps(2)[of bs]} & $\dn$ & @{thm (rhs) erase.simps(2)[of bs]}\\
       
  2099   @{thm (lhs) erase.simps(3)[of bs]} & $\dn$ & @{thm (rhs) erase.simps(3)[of bs]}\\
       
  2100   @{thm (lhs) erase.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) erase.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  2101   @{thm (lhs) erase.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) erase.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  2102   @{thm (lhs) erase.simps(6)[of bs]} & $\dn$ & @{thm (rhs) erase.simps(6)[of bs]}\\
       
  2103 \end{tabular}
       
  2104 \end{center}
       
  2105 
       
  2106 Some simple facts about erase
       
  2107 
       
  2108 \begin{lemma}\mbox{}\\
       
  2109 @{thm erase_bder}\\
       
  2110 @{thm erase_intern}
       
  2111 \end{lemma}
       
  2112 
       
  2113 \begin{center}
       
  2114   \begin{tabular}{lcl}
       
  2115   @{thm (lhs) bnullable.simps(1)} & $\dn$ & @{thm (rhs) bnullable.simps(1)}\\
       
  2116   @{thm (lhs) bnullable.simps(2)} & $\dn$ & @{thm (rhs) bnullable.simps(2)}\\
       
  2117   @{thm (lhs) bnullable.simps(3)} & $\dn$ & @{thm (rhs) bnullable.simps(3)}\\
       
  2118   @{thm (lhs) bnullable.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bnullable.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  2119   @{thm (lhs) bnullable.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bnullable.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  2120   @{thm (lhs) bnullable.simps(6)} & $\dn$ & @{thm (rhs) bnullable.simps(6)}\medskip\\
       
  2121 
       
  2122 %  \end{tabular}
       
  2123 %  \end{center}
       
  2124 
       
  2125 %  \begin{center}
       
  2126 %  \begin{tabular}{lcl}
       
  2127 
       
  2128   @{thm (lhs) bder.simps(1)} & $\dn$ & @{thm (rhs) bder.simps(1)}\\
       
  2129   @{thm (lhs) bder.simps(2)} & $\dn$ & @{thm (rhs) bder.simps(2)}\\
       
  2130   @{thm (lhs) bder.simps(3)} & $\dn$ & @{thm (rhs) bder.simps(3)}\\
       
  2131   @{thm (lhs) bder.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bder.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  2132   @{thm (lhs) bder.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bder.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  2133   @{thm (lhs) bder.simps(6)} & $\dn$ & @{thm (rhs) bder.simps(6)}
       
  2134   \end{tabular}
       
  2135   \end{center}
       
  2136 
       
  2137 
       
  2138 \begin{center}
       
  2139   \begin{tabular}{lcl}
       
  2140   @{thm (lhs) bmkeps.simps(1)} & $\dn$ & @{thm (rhs) bmkeps.simps(1)}\\
       
  2141   @{thm (lhs) bmkeps.simps(2)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bmkeps.simps(2)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  2142   @{thm (lhs) bmkeps.simps(3)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bmkeps.simps(3)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  2143   @{thm (lhs) bmkeps.simps(4)} & $\dn$ & @{thm (rhs) bmkeps.simps(4)}\medskip\\
       
  2144 \end{tabular}
       
  2145 \end{center}
       
  2146 
       
  2147 
       
  2148 @{thm [mode=IfThen] bder_retrieve}
       
  2149 
       
  2150 By induction on \<open>r\<close>
       
  2151 
       
  2152 \begin{theorem}[Main Lemma]\mbox{}\\
       
  2153 @{thm [mode=IfThen] MAIN_decode}
       
  2154 \end{theorem}
       
  2155 
       
  2156 \noindent
       
  2157 Definition of the bitcoded lexer
       
  2158 
       
  2159 @{thm blexer_def}
       
  2160 
       
  2161 
       
  2162 \begin{theorem}
       
  2163 @{thm blexer_correctness}
       
  2164 \end{theorem}
       
  2165 
       
  2166 
       
  2167 
       
  2168 
       
  2169 
       
  2170 \begin{center}
       
  2171   \begin{tabular}{lcl}
       
  2172   @{thm (lhs) code.simps(1)} & $\dn$ & @{thm (rhs) code.simps(1)}\\
       
  2173   @{thm (lhs) code.simps(2)} & $\dn$ & @{thm (rhs) code.simps(2)}\\
       
  2174   @{thm (lhs) code.simps(3)} & $\dn$ & @{thm (rhs) code.simps(3)}\\
       
  2175   @{thm (lhs) code.simps(4)} & $\dn$ & @{thm (rhs) code.simps(4)}\\
       
  2176   @{thm (lhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]} & $\dn$ & @{thm (rhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]}\\
       
  2177   @{thm (lhs) code.simps(6)} & $\dn$ & @{thm (rhs) code.simps(6)}\\
       
  2178   @{thm (lhs) code.simps(7)} & $\dn$ & @{thm (rhs) code.simps(7)}
       
  2179 \end{tabular}
       
  2180 \end{center}
       
  2181 
       
  2182 \begin{center}
       
  2183 \begin{tabular}{lcl}
       
  2184   @{term areg} & $::=$ & @{term "AZERO"}\\
       
  2185                & $\mid$ & @{term "AONE bs"}\\
       
  2186                & $\mid$ & @{term "ACHAR bs c"}\\
       
  2187                & $\mid$ & @{term "AALT bs r1 r2"}\\
       
  2188                & $\mid$ & @{term "ASEQ bs r\<^sub>1 r\<^sub>2"}\\
       
  2189                & $\mid$ & @{term "ASTAR bs r"}
       
  2190 \end{tabular}
       
  2191 \end{center}
       
  2192 
       
  2193 
       
  2194 \begin{center}
       
  2195   \begin{tabular}{lcl}
       
  2196   @{thm (lhs) intern.simps(1)} & $\dn$ & @{thm (rhs) intern.simps(1)}\\
       
  2197   @{thm (lhs) intern.simps(2)} & $\dn$ & @{thm (rhs) intern.simps(2)}\\
       
  2198   @{thm (lhs) intern.simps(3)} & $\dn$ & @{thm (rhs) intern.simps(3)}\\
       
  2199   @{thm (lhs) intern.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) intern.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
       
  2200   @{thm (lhs) intern.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) intern.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
       
  2201   @{thm (lhs) intern.simps(6)} & $\dn$ & @{thm (rhs) intern.simps(6)}\\
       
  2202 \end{tabular}
       
  2203 \end{center}
       
  2204 
       
  2205 \begin{center}
       
  2206   \begin{tabular}{lcl}
       
  2207   @{thm (lhs) erase.simps(1)} & $\dn$ & @{thm (rhs) erase.simps(1)}\\
       
  2208   @{thm (lhs) erase.simps(2)[of bs]} & $\dn$ & @{thm (rhs) erase.simps(2)[of bs]}\\
       
  2209   @{thm (lhs) erase.simps(3)[of bs]} & $\dn$ & @{thm (rhs) erase.simps(3)[of bs]}\\
       
  2210   @{thm (lhs) erase.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) erase.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  2211   @{thm (lhs) erase.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) erase.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  2212   @{thm (lhs) erase.simps(6)[of bs]} & $\dn$ & @{thm (rhs) erase.simps(6)[of bs]}\\
       
  2213 \end{tabular}
       
  2214 \end{center}
       
  2215 
       
  2216 Some simple facts about erase
       
  2217 
       
  2218 \begin{lemma}\mbox{}\\
       
  2219 @{thm erase_bder}\\
       
  2220 @{thm erase_intern}
       
  2221 \end{lemma}
       
  2222 
       
  2223 \begin{center}
       
  2224   \begin{tabular}{lcl}
       
  2225   @{thm (lhs) bnullable.simps(1)} & $\dn$ & @{thm (rhs) bnullable.simps(1)}\\
       
  2226   @{thm (lhs) bnullable.simps(2)} & $\dn$ & @{thm (rhs) bnullable.simps(2)}\\
       
  2227   @{thm (lhs) bnullable.simps(3)} & $\dn$ & @{thm (rhs) bnullable.simps(3)}\\
       
  2228   @{thm (lhs) bnullable.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bnullable.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  2229   @{thm (lhs) bnullable.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bnullable.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  2230   @{thm (lhs) bnullable.simps(6)} & $\dn$ & @{thm (rhs) bnullable.simps(6)}\medskip\\
       
  2231 
       
  2232 %  \end{tabular}
       
  2233 %  \end{center}
       
  2234 
       
  2235 %  \begin{center}
       
  2236 %  \begin{tabular}{lcl}
       
  2237 
       
  2238   @{thm (lhs) bder.simps(1)} & $\dn$ & @{thm (rhs) bder.simps(1)}\\
       
  2239   @{thm (lhs) bder.simps(2)} & $\dn$ & @{thm (rhs) bder.simps(2)}\\
       
  2240   @{thm (lhs) bder.simps(3)} & $\dn$ & @{thm (rhs) bder.simps(3)}\\
       
  2241   @{thm (lhs) bder.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bder.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  2242   @{thm (lhs) bder.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bder.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  2243   @{thm (lhs) bder.simps(6)} & $\dn$ & @{thm (rhs) bder.simps(6)}
       
  2244   \end{tabular}
       
  2245   \end{center}
       
  2246 
       
  2247 
       
  2248 \begin{center}
       
  2249   \begin{tabular}{lcl}
       
  2250   @{thm (lhs) bmkeps.simps(1)} & $\dn$ & @{thm (rhs) bmkeps.simps(1)}\\
       
  2251   @{thm (lhs) bmkeps.simps(2)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bmkeps.simps(2)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  2252   @{thm (lhs) bmkeps.simps(3)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bmkeps.simps(3)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  2253   @{thm (lhs) bmkeps.simps(4)} & $\dn$ & @{thm (rhs) bmkeps.simps(4)}\medskip\\
       
  2254 \end{tabular}
       
  2255 \end{center}
       
  2256 
       
  2257 
       
  2258 @{thm [mode=IfThen] bder_retrieve}
       
  2259 
       
  2260 By induction on \<open>r\<close>
       
  2261 
       
  2262 \begin{theorem}[Main Lemma]\mbox{}\\
       
  2263 @{thm [mode=IfThen] MAIN_decode}
       
  2264 \end{theorem}
       
  2265 
       
  2266 \noindent
       
  2267 Definition of the bitcoded lexer
       
  2268 
       
  2269 @{thm blexer_def}
       
  2270 
       
  2271 
       
  2272 \begin{theorem}
       
  2273 @{thm blexer_correctness}
       
  2274 \end{theorem}
       
  2275 
       
  2276 \<close>
       
  2277 \<close>*)