thys2/Journal/Paper.thy~
changeset 384 f5866f1d6a59
child 389 d4b3b0f942f4
equal deleted inserted replaced
383:aa0a2a3f90a0 384:f5866f1d6a59
       
     1 (*<*)
       
     2 theory Paper
       
     3 imports 
       
     4    "../Lexer"
       
     5    "../Simplifying" 
       
     6    "../Positions"
       
     7 
       
     8    "../SizeBound" 
       
     9    "HOL-Library.LaTeXsugar"
       
    10 begin
       
    11 
       
(* Fold rule rewriting "Suc 0" to the numeral "1"; useful as a simp rule
   when goals mix Suc-notation with numerals. *)
     12 lemma Suc_0_fold:
       
     13   "Suc 0 = 1"
       
     14 by simp
       
    15 
       
    16 
       
    17 
       
(* Document-preparation setup: suppress the "?" prefix on schematic
   variables in printed theorems, and install LaTeX-output-only syntax
   for set comprehensions plus notation for negated existentials. *)
     18 declare [[show_question_marks = false]]
       
     19 
       
     20 syntax (latex output)
       
     21   "_Collect" :: "pttrn => bool => 'a set"              ("(1{_ \<^latex>\<open>\\mbox{\\boldmath$\\mid$}\<close> _})")
       
     22   "_CollectIn" :: "pttrn => 'a set => bool => 'a set"   ("(1{_ \<in> _ |e _})")
       
     23 
       
     24 syntax
       
     25   "_Not_Ex" :: "idts \<Rightarrow> bool \<Rightarrow> bool"  ("(3\<nexists>_.a./ _)" [0, 10] 10)
       
     26   "_Not_Ex1" :: "pttrn \<Rightarrow> bool \<Rightarrow> bool"  ("(3\<nexists>!_.a./ _)" [0, 10] 10)
       
    27 
       
    28 
       
(* Printing-only abbreviations: der/ders/bder/bders take the character or
   string as FIRST argument; these *_syn versions flip the argument order
   (except bders, which already takes the regex first) so the notation
   block below can print them infix as "r\c" and "r\s" in the paper.
   nprec abbreviates the negated value-precedence relation. *)
     29 abbreviation 
       
     30   "der_syn r c \<equiv> der c r"
       
     31 
       
     32 abbreviation 
       
     33   "ders_syn r s \<equiv> ders s r"
       
     34 
       
     35   abbreviation 
       
     36   "bder_syn r c \<equiv> bder c r"
       
     37 
       
     38 abbreviation 
       
     39   "bders_syn r s \<equiv> bders r s"
       
     40 
       
     41 
       
     42 abbreviation
       
     43   "nprec v1 v2 \<equiv> \<not>(v1 :\<sqsubset>val v2)"
       
    44 
       
    45 
       
    46 
       
    47 
       
(* LaTeX-output-only notation: pretty-prints the formal constants in the
   generated paper — regular-expression constructors, value constructors,
   language/derivative/lexing functions, the Posix and position-ordering
   relations, and the bit-coded variants (printed with a "b" subscript).
   Affects document output only; the logic is unchanged. *)
     48 notation (latex output)
       
     49   If  ("(\<^latex>\<open>\\textrm{\<close>if\<^latex>\<open>}\<close> (_)/ \<^latex>\<open>\\textrm{\<close>then\<^latex>\<open>}\<close> (_)/ \<^latex>\<open>\\textrm{\<close>else\<^latex>\<open>}\<close> (_))" 10) and
       
     50   Cons ("_\<^latex>\<open>\\mbox{$\\,$}\<close>::\<^latex>\<open>\\mbox{$\\,$}\<close>_" [75,73] 73) and  
       
     51 
       
     52   ZERO ("\<^bold>0" 81) and 
       
     53   ONE ("\<^bold>1" 81) and 
       
     54   CH ("_" [1000] 80) and
       
     55   ALT ("_ + _" [77,77] 78) and
       
     56   SEQ ("_ \<cdot> _" [77,77] 78) and
       
     57   STAR ("_\<^sup>\<star>" [79] 78) and
       
     58   
       
     59   val.Void ("Empty" 78) and
       
     60   val.Char ("Char _" [1000] 78) and
       
     61   val.Left ("Left _" [79] 78) and
       
     62   val.Right ("Right _" [1000] 78) and
       
     63   val.Seq ("Seq _ _" [79,79] 78) and
       
     64   val.Stars ("Stars _" [79] 78) and
       
     65 
       
     66   L ("L'(_')" [10] 78) and
       
     67   LV ("LV _ _" [80,73] 78) and
       
     68   der_syn ("_\\_" [79, 1000] 76) and  
       
     69   ders_syn ("_\\_" [79, 1000] 76) and
       
     70   flat ("|_|" [75] 74) and
       
     71   flats ("|_|" [72] 74) and
       
     72   Sequ ("_ @ _" [78,77] 63) and
       
     73   injval ("inj _ _ _" [79,77,79] 76) and 
       
     74   mkeps ("mkeps _" [79] 76) and 
       
     75   length ("len _" [73] 73) and
       
     76   intlen ("len _" [73] 73) and
       
     77   set ("_" [73] 73) and
       
     78  
       
     79   Prf ("_ : _" [75,75] 75) and
       
     80   Posix ("'(_, _') \<rightarrow> _" [63,75,75] 75) and
       
     81  
       
     82   lexer ("lexer _ _" [78,78] 77) and 
       
     83   F_RIGHT ("F\<^bsub>Right\<^esub> _") and
       
     84   F_LEFT ("F\<^bsub>Left\<^esub> _") and  
       
     85   F_ALT ("F\<^bsub>Alt\<^esub> _ _") and
       
     86   F_SEQ1 ("F\<^bsub>Seq1\<^esub> _ _") and
       
     87   F_SEQ2 ("F\<^bsub>Seq2\<^esub> _ _") and
       
     88   F_SEQ ("F\<^bsub>Seq\<^esub> _ _") and
       
     89   simp_SEQ ("simp\<^bsub>Seq\<^esub> _ _" [1000, 1000] 1) and
       
     90   simp_ALT ("simp\<^bsub>Alt\<^esub> _ _" [1000, 1000] 1) and
       
     91   slexer ("lexer\<^sup>+" 1000) and
       
     92 
       
     93   at ("_\<^latex>\<open>\\mbox{$\\downharpoonleft$}\<close>\<^bsub>_\<^esub>") and
       
     94   lex_list ("_ \<prec>\<^bsub>lex\<^esub> _") and
       
     95   PosOrd ("_ \<prec>\<^bsub>_\<^esub> _" [77,77,77] 77) and
       
     96   PosOrd_ex ("_ \<prec> _" [77,77] 77) and
       
     97   PosOrd_ex_eq ("_ \<^latex>\<open>\\mbox{$\\preccurlyeq$}\<close> _" [77,77] 77) and
       
     98   pflat_len ("\<parallel>_\<parallel>\<^bsub>_\<^esub>") and
       
     99   nprec ("_ \<^latex>\<open>\\mbox{$\\not\\prec$}\<close> _" [77,77] 77) and
       
    100 
       
    101   bder_syn ("_\<^latex>\<open>\\mbox{$\\bbslash$}\<close>_" [79, 1000] 76) and  
       
    102   bders_syn ("_\<^latex>\<open>\\mbox{$\\bbslash$}\<close>_" [79, 1000] 76) and
       
    103   intern ("_\<^latex>\<open>\\mbox{$^\\uparrow$}\<close>" [900] 80) and
       
    104   erase ("_\<^latex>\<open>\\mbox{$^\\downarrow$}\<close>" [1000] 74) and
       
    105   bnullable ("nullable\<^latex>\<open>\\mbox{$_b$}\<close> _" [1000] 80) and
       
    106   bmkeps ("mkeps\<^latex>\<open>\\mbox{$_b$}\<close> _" [1000] 80) and
       
    107   blexer ("lexer\<^latex>\<open>\\mbox{$_b$}\<close> _ _" [77, 77] 80) and
       
    108   code ("code _" [79] 74) and
       
    109 
       
    110   DUMMY ("\<^latex>\<open>\\underline{\\hspace{2mm}}\<close>")
       
   111 
       
   112 
       
(* Derivative-based matcher: r matches s iff the iterated derivative of r
   along s is nullable (can match the empty string). *)
    113 definition 
       
    114   "match r s \<equiv> nullable (ders s r)"
       
   115 
       
   116 
       
(* The set of lexical values of (ONE)* for the empty string is exactly
   {Stars []}: star iterations may not match the empty string, so no
   non-trivial Stars value inhabits this type/string pair. *)
    117 lemma LV_STAR_ONE_empty: 
       
    118   shows "LV (STAR ONE) [] = {Stars []}"
       
    119 by(auto simp add: LV_def elim: Prf.cases intro: Prf.intros)
       
   120 
       
   121 
       
   122 
       
   123 (*
       
   124 comments not implemented
       
   125 
       
   126 p9. The condition "not exists s3 s4..." appears often enough (in particular in
       
   127 the proof of Lemma 3) to warrant a definition.
       
   128 
       
   129 *)
       
   130 
       
   131 
       
   132 (*>*)
       
   133 
       
   134 section\<open>Core of the proof\<close>
       
   135 text \<open>
       
   136 This paper builds on previous work by Ausaf and Urban using 
       
    137 bit-coded regular expression derivatives to do lexing that 
       
   138 is both fast and satisfies the POSIX specification.
       
   139 In their work, a bit-coded algorithm introduced by Sulzmann and Lu
       
   140 was formally verified in Isabelle, by a very clever use of
       
   141 flex function and retrieve to carefully mimic the way a value is 
       
    142 built up by the injection function.
       
   143 
       
   144 In the previous work, Ausaf and Urban established the below equality:
       
   145 \begin{lemma}
       
   146 @{thm [mode=IfThen] MAIN_decode}
       
   147 \end{lemma}
       
   148 
       
   149 This lemma establishes a link with the lexer without bit-codes.
       
   150 
       
    151 With it we get the correctness of the bit-coded algorithm.
       
   152 \begin{lemma}
       
   153 @{thm [mode=IfThen] blexer_correctness}
       
   154 \end{lemma}
       
   155 
       
   156 However what is not certain is whether we can add simplification
       
   157 to the bit-coded algorithm, without breaking the correct lexing output.
       
   158 
       
   159 
       
   160 The reason that we do need to add a simplification phase
       
   161 after each derivative step of  $\textit{blexer}$ is
       
   162 because it produces intermediate
       
   163 regular expressions that can grow exponentially.
       
   164 For example, the regular expression $(a+aa)^*$ after taking
       
   165 derivative against just 10 $a$s will have size 8192.
       
   166 
       
   167 %TODO: add figure for this?
       
   168 
       
   169 
       
   170 Therefore, we insert a simplification phase
       
   171 after each derivation step, as defined below:
       
   172 \begin{lemma}
       
   173 @{thm blexer_simp_def}
       
   174 \end{lemma}
       
   175 
       
   176 The simplification function is given as follows:
       
   177 
       
   178 \begin{center}
       
   179   \begin{tabular}{lcl}
       
   180   @{thm (lhs) bsimp.simps(1)[of  "bs" "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bsimp.simps(1)[of  "bs" "r\<^sub>1" "r\<^sub>2"]}\\
       
   181   @{thm (lhs) bsimp.simps(2)} & $\dn$ & @{thm (rhs) bsimp.simps(2)}\\
       
   182   @{thm (lhs) bsimp.simps(3)} & $\dn$ & @{thm (rhs) bsimp.simps(3)}\\
       
   183 
       
   184 \end{tabular}
       
   185 \end{center}
       
   186 
       
   187 And the two helper functions are:
       
   188 \begin{center}
       
   189   \begin{tabular}{lcl}
       
    190   @{thm (lhs) bsimp_AALTs.simps(2)[of  "bs\<^sub>1" "r" ]} & $\dn$ & @{thm (rhs) bsimp_AALTs.simps(2)[of  "bs\<^sub>1" "r" ]}\\
       
    191   @{thm (lhs) bsimp_AALTs.simps(2)} & $\dn$ & @{thm (rhs) bsimp_AALTs.simps(2)}\\
       
    192   @{thm (lhs) bsimp_AALTs.simps(3)} & $\dn$ & @{thm (rhs) bsimp_AALTs.simps(3)}\\
       
   193 
       
   194 \end{tabular}
       
   195 \end{center}
       
   196 
       
   197 
       
   198 This might sound trivial in the case of producing a YES/NO answer,
       
   199 but once we require a lexing output to be produced (which is required
       
   200 in applications like compiler front-end, malicious attack domain extraction, 
       
   201 etc.), it is not straightforward if we still extract what is needed according
       
   202 to the POSIX standard.
       
   203 
       
   204 
       
   205 
       
   206 
       
   207 
       
   208 By simplification, we mean specifically the following rules:
       
   209 
       
   210 \begin{center}
       
   211   \begin{tabular}{lcl}
       
   212   @{thm[mode=Axiom] rrewrite.intros(1)[of "bs" "r\<^sub>2"]}\\
       
   213   @{thm[mode=Axiom] rrewrite.intros(2)[of "bs" "r\<^sub>1"]}\\
       
   214   @{thm[mode=Axiom] rrewrite.intros(3)[of  "bs" "bs\<^sub>1" "r\<^sub>1"]}\\
       
   215   @{thm[mode=Rule] rrewrite.intros(4)[of  "r\<^sub>1" "r\<^sub>2" "bs" "r\<^sub>3"]}\\
       
   216   @{thm[mode=Rule] rrewrite.intros(5)[of "r\<^sub>3" "r\<^sub>4" "bs" "r\<^sub>1"]}\\
       
   217   @{thm[mode=Rule] rrewrite.intros(6)[of "r" "r'" "bs" "rs\<^sub>1" "rs\<^sub>2"]}\\
       
   218   @{thm[mode=Axiom] rrewrite.intros(7)[of "bs" "rs\<^sub>a" "rs\<^sub>b"]}\\
       
   219   @{thm[mode=Axiom] rrewrite.intros(8)[of "bs" "rs\<^sub>a" "bs\<^sub>1" "rs\<^sub>1" "rs\<^sub>b"]}\\
       
   220   @{thm[mode=Axiom] rrewrite.intros(9)[of "bs" "bs\<^sub>1" "rs"]}\\
       
   221   @{thm[mode=Axiom] rrewrite.intros(10)[of "bs" ]}\\
       
   222   @{thm[mode=Axiom] rrewrite.intros(11)[of "bs" "r\<^sub>1"]}\\
       
   223   @{thm[mode=Rule] rrewrite.intros(12)[of "a\<^sub>1" "a\<^sub>2" "bs" "rs\<^sub>a" "rs\<^sub>b" "rs\<^sub>c"]}\\
       
   224 
       
   225   \end{tabular}
       
   226 \end{center}
       
   227 
       
   228 
       
   229 And these can be made compact by the following simplification function:
       
   230 
       
    231 where the function $\mathit{bsimp\_AALTs}$
       
   232 
       
   233 The core idea of the proof is that two regular expressions,
       
   234 if "isomorphic" up to a finite number of rewrite steps, will
       
   235 remain "isomorphic" when we take the same sequence of
       
   236 derivatives on both of them.
       
   237 This can be expressed by the following rewrite relation lemma:
       
   238 \begin{lemma}
       
   239 @{thm [mode=IfThen] central}
       
   240 \end{lemma}
       
   241 
       
   242 This isomorphic relation implies a property that leads to the 
       
   243 correctness result: 
       
   244 if two (nullable) regular expressions are "rewritable" in many steps
       
   245 from one another, 
       
   246 then a call to function $\textit{bmkeps}$ gives the same
       
    247 bit-sequence:
       
   248 \begin{lemma}
       
   249 @{thm [mode=IfThen] rewrites_bmkeps}
       
   250 \end{lemma}
       
   251 
       
   252 Given the same bit-sequence, the decode function
       
   253 will give out the same value, which is the output
       
   254 of both lexers:
       
   255 \begin{lemma}
       
   256 @{thm blexer_def}
       
   257 \end{lemma}
       
   258 
       
   259 \begin{lemma}
       
   260 @{thm blexer_simp_def}
       
   261 \end{lemma}
       
   262 
       
   263 And that yields the correctness result:
       
   264 \begin{lemma}
       
   265 @{thm blexersimp_correctness}
       
   266 \end{lemma}
       
   267 
       
   268 The nice thing about the above
       
   269 \<close>
       
   270 
       
   271 
       
   272 section \<open> Additional Simp Rules?\<close>
       
   273 
       
   274 
       
   275 text  \<open>
       
   276 One question someone would ask is:
       
   277 can we add more "atomic" simplification/rewriting rules,
       
   278 so the simplification is even more aggressive, making
       
   279 the intermediate results smaller, and therefore more space-efficient? 
       
    280 For example, one might want to open up alternatives which are children
       
   281 of a sequence:
       
   282 
       
   283 \begin{center}
       
   284   \begin{tabular}{lcl}
       
   285     @{thm[mode=Rule] aggressive.intros(1)[of "bs" "bs1" "rs" "r"]}\\
       
   286   \end{tabular}
       
   287 \end{center}
       
   288 
       
   289 This rule allows us to simplify \mbox{@{term "(ALT (SEQ (ALT a b) c)  (SEQ a c))"}}
       
   290  into  \mbox{@{term "ALT (SEQ a c)  (SEQ b c)"}},
       
    291 which cannot be done under the \mbox{\<leadsto>} rule because only alternatives which are
       
   292 children of another alternative can be spilled out.
       
   293 \<close>
       
   294 
       
   295 (*This rule allows us to simplify \mbox{@{term "ALT (SEQ (ALT a b) c) (SEQ a c)"}}
       
   296  into  \mbox{@{term "ALT (SEQ a c)  (SEQ b c)"}},
       
    297 which cannot be done under the \<leadsto> rule because only alternatives which are 
       
   298 children of another alternative can be spilled out.*)
       
   299 section \<open>Introduction\<close>
       
   300 
       
   301 
       
   302 text \<open>
       
   303 
       
   304 
       
   305 Brzozowski \cite{Brzozowski1964} introduced the notion of the {\em
       
   306 derivative} @{term "der c r"} of a regular expression \<open>r\<close> w.r.t.\
       
   307 a character~\<open>c\<close>, and showed that it gave a simple solution to the
       
   308 problem of matching a string @{term s} with a regular expression @{term
       
   309 r}: if the derivative of @{term r} w.r.t.\ (in succession) all the
       
   310 characters of the string matches the empty string, then @{term r}
       
   311 matches @{term s} (and {\em vice versa}). The derivative has the
       
   312 property (which may almost be regarded as its specification) that, for
       
   313 every string @{term s} and regular expression @{term r} and character
       
   314 @{term c}, one has @{term "cs \<in> L(r)"} if and only if \mbox{@{term "s \<in> L(der c r)"}}. 
       
   315 The beauty of Brzozowski's derivatives is that
       
   316 they are neatly expressible in any functional language, and easily
       
   317 definable and reasoned about in theorem provers---the definitions just
       
   318 consist of inductive datatypes and simple recursive functions. A
       
   319 mechanised correctness proof of Brzozowski's matcher in for example HOL4
       
   320 has been mentioned by Owens and Slind~\cite{Owens2008}. Another one in
       
   321 Isabelle/HOL is part of the work by Krauss and Nipkow \cite{Krauss2011}.
       
   322 And another one in Coq is given by Coquand and Siles \cite{Coquand2012}.
       
   323 
       
   324 If a regular expression matches a string, then in general there is more
       
   325 than one way of how the string is matched. There are two commonly used
       
   326 disambiguation strategies to generate a unique answer: one is called
       
   327 GREEDY matching \cite{Frisch2004} and the other is POSIX
       
   328 matching~\cite{POSIX,Kuklewicz,OkuiSuzuki2010,Sulzmann2014,Vansummeren2006}.
       
   329 For example consider the string @{term xy} and the regular expression
       
   330 \mbox{@{term "STAR (ALT (ALT x y) xy)"}}. Either the string can be
       
   331 matched in two `iterations' by the single letter-regular expressions
       
   332 @{term x} and @{term y}, or directly in one iteration by @{term xy}. The
       
   333 first case corresponds to GREEDY matching, which first matches with the
       
   334 left-most symbol and only matches the next symbol in case of a mismatch
       
   335 (this is greedy in the sense of preferring instant gratification to
       
   336 delayed repletion). The second case is POSIX matching, which prefers the
       
   337 longest match.
       
   338 
       
   339 In the context of lexing, where an input string needs to be split up
       
   340 into a sequence of tokens, POSIX is the more natural disambiguation
       
   341 strategy for what programmers consider basic syntactic building blocks
       
   342 in their programs.  These building blocks are often specified by some
       
   343 regular expressions, say \<open>r\<^bsub>key\<^esub>\<close> and \<open>r\<^bsub>id\<^esub>\<close> for recognising keywords and identifiers,
       
   344 respectively. There are a few underlying (informal) rules behind
       
   345 tokenising a string in a POSIX \cite{POSIX} fashion:
       
   346 
       
   347 \begin{itemize} 
       
   348 \item[$\bullet$] \emph{The Longest Match Rule} (or \emph{``{M}aximal {M}unch {R}ule''}):
       
   349 The longest initial substring matched by any regular expression is taken as
       
   350 next token.\smallskip
       
   351 
       
   352 \item[$\bullet$] \emph{Priority Rule:}
       
   353 For a particular longest initial substring, the first (leftmost) regular expression
       
   354 that can match determines the token.\smallskip
       
   355 
       
   356 \item[$\bullet$] \emph{Star Rule:} A subexpression repeated by ${}^\star$ shall 
       
   357 not match an empty string unless this is the only match for the repetition.\smallskip
       
   358 
       
   359 \item[$\bullet$] \emph{Empty String Rule:} An empty string shall be considered to 
       
   360 be longer than no match at all.
       
   361 \end{itemize}
       
   362 
       
   363 \noindent Consider for example a regular expression \<open>r\<^bsub>key\<^esub>\<close> for recognising keywords such as \<open>if\<close>,
       
   364 \<open>then\<close> and so on; and \<open>r\<^bsub>id\<^esub>\<close>
       
   365 recognising identifiers (say, a single character followed by
       
   366 characters or numbers).  Then we can form the regular expression
       
   367 \<open>(r\<^bsub>key\<^esub> + r\<^bsub>id\<^esub>)\<^sup>\<star>\<close>
       
   368 and use POSIX matching to tokenise strings, say \<open>iffoo\<close> and
       
   369 \<open>if\<close>.  For \<open>iffoo\<close> we obtain by the Longest Match Rule
       
   370 a single identifier token, not a keyword followed by an
       
   371 identifier. For \<open>if\<close> we obtain by the Priority Rule a keyword
       
   372 token, not an identifier token---even if \<open>r\<^bsub>id\<^esub>\<close>
       
   373 matches also. By the Star Rule we know \<open>(r\<^bsub>key\<^esub> +
       
   374 r\<^bsub>id\<^esub>)\<^sup>\<star>\<close> matches \<open>iffoo\<close>,
       
   375 respectively \<open>if\<close>, in exactly one `iteration' of the star. The
       
   376 Empty String Rule is for cases where, for example, the regular expression 
       
   377 \<open>(a\<^sup>\<star>)\<^sup>\<star>\<close> matches against the
       
   378 string \<open>bc\<close>. Then the longest initial matched substring is the
       
   379 empty string, which is matched by both the whole regular expression
       
   380 and the parenthesised subexpression.
       
   381 
       
   382 
       
   383 One limitation of Brzozowski's matcher is that it only generates a
       
   384 YES/NO answer for whether a string is being matched by a regular
       
   385 expression.  Sulzmann and Lu~\cite{Sulzmann2014} extended this matcher
       
   386 to allow generation not just of a YES/NO answer but of an actual
       
   387 matching, called a [lexical] {\em value}. Assuming a regular
       
   388 expression matches a string, values encode the information of
       
   389 \emph{how} the string is matched by the regular expression---that is,
       
   390 which part of the string is matched by which part of the regular
       
   391 expression. For this consider again the string \<open>xy\<close> and
       
   392 the regular expression \mbox{\<open>(x + (y + xy))\<^sup>\<star>\<close>}
       
   393 (this time fully parenthesised). We can view this regular expression
       
   394 as tree and if the string \<open>xy\<close> is matched by two Star
       
   395 `iterations', then the \<open>x\<close> is matched by the left-most
       
   396 alternative in this tree and the \<open>y\<close> by the right-left alternative. This
       
   397 suggests to record this matching as
       
   398 
       
   399 \begin{center}
       
   400 @{term "Stars [Left(Char x), Right(Left(Char y))]"}
       
   401 \end{center}
       
   402 
       
   403 \noindent where @{const Stars}, \<open>Left\<close>, \<open>Right\<close> and \<open>Char\<close> are constructors for values. \<open>Stars\<close> records how many
       
   404 iterations were used; \<open>Left\<close>, respectively \<open>Right\<close>, which
       
   405 alternative is used. This `tree view' leads naturally to the idea that
       
   406 regular expressions act as types and values as inhabiting those types
       
   407 (see, for example, \cite{HosoyaVouillonPierce2005}).  The value for
       
   408 matching \<open>xy\<close> in a single `iteration', i.e.~the POSIX value,
       
   409 would look as follows
       
   410 
       
   411 \begin{center}
       
   412 @{term "Stars [Seq (Char x) (Char y)]"}
       
   413 \end{center}
       
   414 
       
   415 \noindent where @{const Stars} has only a single-element list for the
       
   416 single iteration and @{const Seq} indicates that @{term xy} is matched 
       
   417 by a sequence regular expression.
       
   418 
       
   419 %, which we will in what follows 
       
   420 %write more formally as @{term "SEQ x y"}.
       
   421 
       
   422 
       
   423 Sulzmann and Lu give a simple algorithm to calculate a value that
       
   424 appears to be the value associated with POSIX matching.  The challenge
       
   425 then is to specify that value, in an algorithm-independent fashion,
       
   426 and to show that Sulzmann and Lu's derivative-based algorithm does
       
   427 indeed calculate a value that is correct according to the
       
   428 specification.  The answer given by Sulzmann and Lu
       
   429 \cite{Sulzmann2014} is to define a relation (called an ``order
       
   430 relation'') on the set of values of @{term r}, and to show that (once
       
   431 a string to be matched is chosen) there is a maximum element and that
       
   432 it is computed by their derivative-based algorithm. This proof idea is
       
   433 inspired by work of Frisch and Cardelli \cite{Frisch2004} on a GREEDY
       
   434 regular expression matching algorithm. However, we were not able to
       
   435 establish transitivity and totality for the ``order relation'' by
       
   436 Sulzmann and Lu.  There are some inherent problems with their approach
       
   437 (of which some of the proofs are not published in
       
   438 \cite{Sulzmann2014}); perhaps more importantly, we give in this paper
       
   439 a simple inductive (and algorithm-independent) definition of what we
       
   440 call being a {\em POSIX value} for a regular expression @{term r} and
       
   441 a string @{term s}; we show that the algorithm by Sulzmann and Lu
       
   442 computes such a value and that such a value is unique. Our proofs are
       
   443 both done by hand and checked in Isabelle/HOL.  The experience of
       
   444 doing our proofs has been that this mechanical checking was absolutely
       
   445 essential: this subject area has hidden snares. This was also noted by
       
   446 Kuklewicz \cite{Kuklewicz} who found that nearly all POSIX matching
       
   447 implementations are ``buggy'' \cite[Page 203]{Sulzmann2014} and by
       
   448 Grathwohl et al \cite[Page 36]{CrashCourse2014} who wrote:
       
   449 
       
   450 \begin{quote}
       
   451 \it{}``The POSIX strategy is more complicated than the greedy because of 
       
   452 the dependence on information about the length of matched strings in the 
       
   453 various subexpressions.''
       
   454 \end{quote}
       
   455 
       
   456 
       
   457 
       
   458 \noindent {\bf Contributions:} We have implemented in Isabelle/HOL the
       
   459 derivative-based regular expression matching algorithm of
       
   460 Sulzmann and Lu \cite{Sulzmann2014}. We have proved the correctness of this
       
   461 algorithm according to our specification of what a POSIX value is (inspired
       
   462 by work of Vansummeren \cite{Vansummeren2006}). Sulzmann
       
   463 and Lu sketch in \cite{Sulzmann2014} an informal correctness proof: but to
       
   464 us it contains unfillable gaps.\footnote{An extended version of
       
   465 \cite{Sulzmann2014} is available at the website of its first author; this
       
   466 extended version already includes remarks in the appendix that their
       
   467 informal proof contains gaps, and possible fixes are not fully worked out.}
       
   468 Our specification of a POSIX value consists of a simple inductive definition
       
   469 that given a string and a regular expression uniquely determines this value.
       
   470 We also show that our definition is equivalent to an ordering 
       
   471 of values based on positions by Okui and Suzuki \cite{OkuiSuzuki2010}.
       
   472 
       
   473 %Derivatives as calculated by Brzozowski's method are usually more complex
       
   474 %regular expressions than the initial one; various optimisations are
       
   475 %possible. We prove the correctness when simplifications of @{term "ALT ZERO r"}, 
       
   476 %@{term "ALT r ZERO"}, @{term "SEQ ONE r"} and @{term "SEQ r ONE"} to
       
   477 %@{term r} are applied. 
       
   478 
       
   479 We extend our results to ??? Bitcoded version??
       
   480 
       
   481 \<close>
       
   482 
       
   483 
       
   484 
       
   485 
       
   486 section \<open>Preliminaries\<close>
       
   487 
       
   488 text \<open>\noindent Strings in Isabelle/HOL are lists of characters with
       
   489 the empty string being represented by the empty list, written @{term
       
   490 "[]"}, and list-cons being written as @{term "DUMMY # DUMMY"}. Often
       
   491 we use the usual bracket notation for lists also for strings; for
       
   492 example a string consisting of just a single character @{term c} is
       
   493 written @{term "[c]"}. We use the usual definitions for 
       
   494 \emph{prefixes} and \emph{strict prefixes} of strings.  By using the
       
   495 type @{type char} for characters we have a supply of finitely many
       
   496 characters roughly corresponding to the ASCII character set. Regular
       
   497 expressions are defined as usual as the elements of the following
       
   498 inductive datatype:
       
   499 
       
   500   \begin{center}
       
   501   \<open>r :=\<close>
       
   502   @{const "ZERO"} $\mid$
       
   503   @{const "ONE"} $\mid$
       
   504   @{term "CH c"} $\mid$
       
   505   @{term "ALT r\<^sub>1 r\<^sub>2"} $\mid$
       
   506   @{term "SEQ r\<^sub>1 r\<^sub>2"} $\mid$
       
   507   @{term "STAR r"} 
       
   508   \end{center}
       
   509 
       
   510   \noindent where @{const ZERO} stands for the regular expression that does
       
   511   not match any string, @{const ONE} for the regular expression that matches
       
   512   only the empty string and @{term c} for matching a character literal. The
       
   513   language of a regular expression is also defined as usual by the
       
   514   recursive function @{term L} with the six clauses:
       
   515 
       
   516   \begin{center}
       
   517   \begin{tabular}{l@ {\hspace{4mm}}rcl}
       
   518   \textit{(1)} & @{thm (lhs) L.simps(1)} & $\dn$ & @{thm (rhs) L.simps(1)}\\
       
   519   \textit{(2)} & @{thm (lhs) L.simps(2)} & $\dn$ & @{thm (rhs) L.simps(2)}\\
       
   520   \textit{(3)} & @{thm (lhs) L.simps(3)} & $\dn$ & @{thm (rhs) L.simps(3)}\\
       
   521   \textit{(4)} & @{thm (lhs) L.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & 
       
   522         @{thm (rhs) L.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
       
   523   \textit{(5)} & @{thm (lhs) L.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & 
       
   524         @{thm (rhs) L.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
       
   525   \textit{(6)} & @{thm (lhs) L.simps(6)} & $\dn$ & @{thm (rhs) L.simps(6)}\\
       
   526   \end{tabular}
       
   527   \end{center}
       
   528   
       
   529   \noindent In clause \textit{(4)} we use the operation @{term "DUMMY ;;
       
   530   DUMMY"} for the concatenation of two languages (it is also list-append for
       
   531   strings). We use the star-notation for regular expressions and for
       
   532   languages (in the last clause above). The star for languages is defined
       
   533   inductively by two clauses: \<open>(i)\<close> the empty string being in
       
   534   the star of a language and \<open>(ii)\<close> if @{term "s\<^sub>1"} is in a
       
   535   language and @{term "s\<^sub>2"} in the star of this language, then also @{term
       
   536   "s\<^sub>1 @ s\<^sub>2"} is in the star of this language. It will also be convenient
       
   537   to use the following notion of a \emph{semantic derivative} (or \emph{left
       
   538   quotient}) of a language defined as
       
   539   %
       
   540   \begin{center}
       
   541   @{thm Der_def}\;.
       
   542   \end{center}
       
   543  
       
   544   \noindent
       
   545   For semantic derivatives we have the following equations (for example
       
   546   mechanically proved in \cite{Krauss2011}):
       
   547   %
       
   548   \begin{equation}\label{SemDer}
       
   549   \begin{array}{lcl}
       
   550   @{thm (lhs) Der_null}  & \dn & @{thm (rhs) Der_null}\\
       
   551   @{thm (lhs) Der_empty}  & \dn & @{thm (rhs) Der_empty}\\
       
   552   @{thm (lhs) Der_char}  & \dn & @{thm (rhs) Der_char}\\
       
   553   @{thm (lhs) Der_union}  & \dn & @{thm (rhs) Der_union}\\
       
   554   @{thm (lhs) Der_Sequ}  & \dn & @{thm (rhs) Der_Sequ}\\
       
   555   @{thm (lhs) Der_star}  & \dn & @{thm (rhs) Der_star}
       
   556   \end{array}
       
   557   \end{equation}
       
   558 
       
   559 
       
   560   \noindent \emph{\Brz's derivatives} of regular expressions
       
   561   \cite{Brzozowski1964} can be easily defined by two recursive functions:
       
   562   the first is from regular expressions to booleans (implementing a test
       
   563   when a regular expression can match the empty string), and the second
       
   564   takes a regular expression and a character to a (derivative) regular
       
   565   expression:
       
   566 
       
   567   \begin{center}
       
   568   \begin{tabular}{lcl}
       
   569   @{thm (lhs) nullable.simps(1)} & $\dn$ & @{thm (rhs) nullable.simps(1)}\\
       
   570   @{thm (lhs) nullable.simps(2)} & $\dn$ & @{thm (rhs) nullable.simps(2)}\\
       
   571   @{thm (lhs) nullable.simps(3)} & $\dn$ & @{thm (rhs) nullable.simps(3)}\\
       
   572   @{thm (lhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
       
   573   @{thm (lhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
       
   574   @{thm (lhs) nullable.simps(6)} & $\dn$ & @{thm (rhs) nullable.simps(6)}\medskip\\
       
   575 
       
   576 %  \end{tabular}
       
   577 %  \end{center}
       
   578 
       
   579 %  \begin{center}
       
   580 %  \begin{tabular}{lcl}
       
   581 
       
   582   @{thm (lhs) der.simps(1)} & $\dn$ & @{thm (rhs) der.simps(1)}\\
       
   583   @{thm (lhs) der.simps(2)} & $\dn$ & @{thm (rhs) der.simps(2)}\\
       
   584   @{thm (lhs) der.simps(3)} & $\dn$ & @{thm (rhs) der.simps(3)}\\
       
   585   @{thm (lhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]}\\
       
   586   @{thm (lhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]}\\
       
   587   @{thm (lhs) der.simps(6)} & $\dn$ & @{thm (rhs) der.simps(6)}
       
   588   \end{tabular}
       
   589   \end{center}
       
   590  
       
   591   \noindent
       
   592   We may extend this definition to give derivatives w.r.t.~strings:
       
   593 
       
   594   \begin{center}
       
   595   \begin{tabular}{lcl}
       
   596   @{thm (lhs) ders.simps(1)} & $\dn$ & @{thm (rhs) ders.simps(1)}\\
       
   597   @{thm (lhs) ders.simps(2)} & $\dn$ & @{thm (rhs) ders.simps(2)}\\
       
   598   \end{tabular}
       
   599   \end{center}
       
   600 
       
   601   \noindent Given the equations in \eqref{SemDer}, it is a relatively easy
       
   602   exercise in mechanical reasoning to establish that
       
   603 
       
   604   \begin{proposition}\label{derprop}\mbox{}\\ 
       
   605   \begin{tabular}{ll}
       
   606   \textit{(1)} & @{thm (lhs) nullable_correctness} if and only if
       
   607   @{thm (rhs) nullable_correctness}, and \\ 
       
   608   \textit{(2)} & @{thm[mode=IfThen] der_correctness}.
       
   609   \end{tabular}
       
   610   \end{proposition}
       
   611 
       
   612   \noindent With this in place it is also very routine to prove that the
       
   613   regular expression matcher defined as
       
   614   %
       
   615   \begin{center}
       
   616   @{thm match_def}
       
   617   \end{center}
       
   618 
       
   619   \noindent gives a positive answer if and only if @{term "s \<in> L r"}.
       
   620   Consequently, this regular expression matching algorithm satisfies the
       
   621   usual specification for regular expression matching. While the matcher
       
   622   above calculates a provably correct YES/NO answer for whether a regular
       
   623   expression matches a string or not, the novel idea of Sulzmann and Lu
       
   624   \cite{Sulzmann2014} is to append another phase to this algorithm in order
       
   625   to calculate a [lexical] value. We will explain the details next.
       
   626 
       
   627 \<close>
       
   628 
       
   629 section \<open>POSIX Regular Expression Matching\label{posixsec}\<close>
       
   630 
       
   631 text \<open>
       
   632 
       
   633   There have been many previous works that use values for encoding 
       
   634   \emph{how} a regular expression matches a string.
       
   635   The clever idea by Sulzmann and Lu \cite{Sulzmann2014} is to 
       
   636   define a function on values that mirrors (but inverts) the
       
   637   construction of the derivative on regular expressions. \emph{Values}
       
   638   are defined as the inductive datatype
       
   639 
       
   640   \begin{center}
       
   641   \<open>v :=\<close>
       
   642   @{const "Void"} $\mid$
       
   643   @{term "val.Char c"} $\mid$
       
   644   @{term "Left v"} $\mid$
       
   645   @{term "Right v"} $\mid$
       
   646   @{term "Seq v\<^sub>1 v\<^sub>2"} $\mid$ 
       
   647   @{term "Stars vs"} 
       
   648   \end{center}  
       
   649 
       
   650   \noindent where we use @{term vs} to stand for a list of
       
   651   values. (This is similar to the approach taken by Frisch and
       
   652   Cardelli for GREEDY matching \cite{Frisch2004}, and Sulzmann and Lu
       
   653   for POSIX matching \cite{Sulzmann2014}). The string underlying a
       
   654   value can be calculated by the @{const flat} function, written
       
   655   @{term "flat DUMMY"} and defined as:
       
   656 
       
   657   \begin{center}
       
   658   \begin{tabular}[t]{lcl}
       
   659   @{thm (lhs) flat.simps(1)} & $\dn$ & @{thm (rhs) flat.simps(1)}\\
       
   660   @{thm (lhs) flat.simps(2)} & $\dn$ & @{thm (rhs) flat.simps(2)}\\
       
   661   @{thm (lhs) flat.simps(3)} & $\dn$ & @{thm (rhs) flat.simps(3)}\\
       
   662   @{thm (lhs) flat.simps(4)} & $\dn$ & @{thm (rhs) flat.simps(4)}
       
   663   \end{tabular}\hspace{14mm}
       
   664   \begin{tabular}[t]{lcl}
       
   665   @{thm (lhs) flat.simps(5)[of "v\<^sub>1" "v\<^sub>2"]} & $\dn$ & @{thm (rhs) flat.simps(5)[of "v\<^sub>1" "v\<^sub>2"]}\\
       
   666   @{thm (lhs) flat.simps(6)} & $\dn$ & @{thm (rhs) flat.simps(6)}\\
       
   667   @{thm (lhs) flat.simps(7)} & $\dn$ & @{thm (rhs) flat.simps(7)}\\
       
   668   \end{tabular}
       
   669   \end{center}
       
   670 
       
   671   \noindent We will sometimes refer to the underlying string of a
       
   672   value as \emph{flattened value}.  We will also overload our notation and 
       
   673   use @{term "flats vs"} for flattening a list of values and concatenating
       
   674   the resulting strings.
       
   675   
       
   676   Sulzmann and Lu define
       
   677   inductively an \emph{inhabitation relation} that associates values to
       
   678   regular expressions. We define this relation as
       
   679   follows:\footnote{Note that the rule for @{term Stars} differs from
       
   680   our earlier paper \cite{AusafDyckhoffUrban2016}. There we used the
       
   681   original definition by Sulzmann and Lu which does not require that
       
   682   the values @{term "v \<in> set vs"} flatten to a non-empty
       
   683   string. The reason for introducing the more restricted version of
       
   684   lexical values is convenience later on when reasoning about an
       
   685   ordering relation for values.}
       
   686 
       
   687   \begin{center}
       
   688   \begin{tabular}{c@ {\hspace{12mm}}c}\label{prfintros}
       
   689   \\[-8mm]
       
   690   @{thm[mode=Axiom] Prf.intros(4)} & 
       
   691   @{thm[mode=Axiom] Prf.intros(5)[of "c"]}\\[4mm]
       
   692   @{thm[mode=Rule] Prf.intros(2)[of "v\<^sub>1" "r\<^sub>1" "r\<^sub>2"]} &
       
   693   @{thm[mode=Rule] Prf.intros(3)[of "v\<^sub>2" "r\<^sub>1" "r\<^sub>2"]}\\[4mm]
       
   694   @{thm[mode=Rule] Prf.intros(1)[of "v\<^sub>1" "r\<^sub>1" "v\<^sub>2" "r\<^sub>2"]}  &
       
   695   @{thm[mode=Rule] Prf.intros(6)[of "vs"]}
       
   696   \end{tabular}
       
   697   \end{center}
       
   698 
       
   699   \noindent where in the clause for @{const "Stars"} we use the
       
   700   notation @{term "v \<in> set vs"} for indicating that \<open>v\<close> is a
       
   701   member in the list \<open>vs\<close>.  We require in this rule that every
       
   702   value in @{term vs} flattens to a non-empty string. The idea is that
       
   703   @{term "Stars"}-values satisfy the informal Star Rule (see Introduction)
       
   704   where the $^\star$ does not match the empty string unless this is
       
   705   the only match for the repetition.  Note also that no values are
       
   706   associated with the regular expression @{term ZERO}, and that the
       
   707   only value associated with the regular expression @{term ONE} is
       
   708   @{term Void}.  It is routine to establish how values ``inhabiting''
       
   709   a regular expression correspond to the language of a regular
       
   710   expression, namely
       
   711 
       
   712   \begin{proposition}\label{inhabs}
       
   713   @{thm L_flat_Prf}
       
   714   \end{proposition}
       
   715 
       
   716   \noindent
       
   717   Given a regular expression \<open>r\<close> and a string \<open>s\<close>, we define the 
       
   718   set of all \emph{Lexical Values} inhabited by \<open>r\<close> with the underlying string 
       
   719   being \<open>s\<close>:\footnote{Okui and Suzuki refer to our lexical values 
       
   720   as \emph{canonical values} in \cite{OkuiSuzuki2010}. The notion of \emph{non-problematic
       
   721   values} by Cardelli and Frisch \cite{Frisch2004} is related, but not identical
       
   722   to our lexical values.}
       
   723   
       
   724   \begin{center}
       
   725   @{thm LV_def}
       
   726   \end{center}
       
   727 
       
   728   \noindent The main property of @{term "LV r s"} is that it is alway finite.
       
   729 
       
   730   \begin{proposition}
       
   731   @{thm LV_finite}
       
   732   \end{proposition}
       
   733 
       
   734   \noindent This finiteness property does not hold in general if we
       
   735   remove the side-condition about @{term "flat v \<noteq> []"} in the
       
   736   @{term Stars}-rule above. For example using Sulzmann and Lu's
       
   737   less restrictive definition, @{term "LV (STAR ONE) []"} would contain
       
   738   infinitely many values, but according to our more restricted
       
   739   definition only a single value, namely @{thm LV_STAR_ONE_empty}.
       
   740 
       
   741   If a regular expression \<open>r\<close> matches a string \<open>s\<close>, then
       
   742   generally the set @{term "LV r s"} is not just a singleton set.  In
       
   743   case of POSIX matching the problem is to calculate the unique lexical value
       
   744   that satisfies the (informal) POSIX rules from the Introduction.
       
   745   Graphically the POSIX value calculation algorithm by Sulzmann and Lu
       
   746   can be illustrated by the picture in Figure~\ref{Sulz} where the
       
   747   path from the left to the right involving @{term
       
   748   derivatives}/@{const nullable} is the first phase of the algorithm
       
   749   (calculating successive \Brz's derivatives) and @{const
       
   750   mkeps}/\<open>inj\<close>, the path from right to left, the second
       
   751   phase. This picture shows the steps required when a regular
       
   752   expression, say \<open>r\<^sub>1\<close>, matches the string @{term
       
   753   "[a,b,c]"}. We first build the three derivatives (according to
       
   754   @{term a}, @{term b} and @{term c}). We then use @{const nullable}
       
   755   to find out whether the resulting derivative regular expression
       
   756   @{term "r\<^sub>4"} can match the empty string. If yes, we call the
       
   757   function @{const mkeps} that produces a value @{term "v\<^sub>4"}
       
   758   for how @{term "r\<^sub>4"} can match the empty string (taking into
       
   759   account the POSIX constraints in case there are several ways). This
       
   760   function is defined by the clauses:
       
   761 
       
   762 \begin{figure}[t]
       
   763 \begin{center}
       
   764 \begin{tikzpicture}[scale=2,node distance=1.3cm,
       
   765                     every node/.style={minimum size=6mm}]
       
   766 \node (r1)  {@{term "r\<^sub>1"}};
       
   767 \node (r2) [right=of r1]{@{term "r\<^sub>2"}};
       
   768 \draw[->,line width=1mm](r1)--(r2) node[above,midway] {@{term "der a DUMMY"}};
       
   769 \node (r3) [right=of r2]{@{term "r\<^sub>3"}};
       
   770 \draw[->,line width=1mm](r2)--(r3) node[above,midway] {@{term "der b DUMMY"}};
       
   771 \node (r4) [right=of r3]{@{term "r\<^sub>4"}};
       
   772 \draw[->,line width=1mm](r3)--(r4) node[above,midway] {@{term "der c DUMMY"}};
       
   773 \draw (r4) node[anchor=west] {\;\raisebox{3mm}{@{term nullable}}};
       
   774 \node (v4) [below=of r4]{@{term "v\<^sub>4"}};
       
   775 \draw[->,line width=1mm](r4) -- (v4);
       
   776 \node (v3) [left=of v4] {@{term "v\<^sub>3"}};
       
   777 \draw[->,line width=1mm](v4)--(v3) node[below,midway] {\<open>inj r\<^sub>3 c\<close>};
       
   778 \node (v2) [left=of v3]{@{term "v\<^sub>2"}};
       
   779 \draw[->,line width=1mm](v3)--(v2) node[below,midway] {\<open>inj r\<^sub>2 b\<close>};
       
   780 \node (v1) [left=of v2] {@{term "v\<^sub>1"}};
       
   781 \draw[->,line width=1mm](v2)--(v1) node[below,midway] {\<open>inj r\<^sub>1 a\<close>};
       
   782 \draw (r4) node[anchor=north west] {\;\raisebox{-8mm}{@{term "mkeps"}}};
       
   783 \end{tikzpicture}
       
   784 \end{center}
       
   785 \mbox{}\\[-13mm]
       
   786 
       
   787 \caption{The two phases of the algorithm by Sulzmann \& Lu \cite{Sulzmann2014},
       
   788 matching the string @{term "[a,b,c]"}. The first phase (the arrows from 
       
   789 left to right) is \Brz's matcher building successive derivatives. If the 
       
   790 last regular expression is @{term nullable}, then the functions of the 
       
   791 second phase are called (the top-down and right-to-left arrows): first 
       
   792 @{term mkeps} calculates a value @{term "v\<^sub>4"} witnessing
       
   793 how the empty string has been recognised by @{term "r\<^sub>4"}. After
       
   794 that the function @{term inj} ``injects back'' the characters of the string into
       
   795 the values.
       
   796 \label{Sulz}}
       
   797 \end{figure} 
       
   798 
       
   799   \begin{center}
       
   800   \begin{tabular}{lcl}
       
   801   @{thm (lhs) mkeps.simps(1)} & $\dn$ & @{thm (rhs) mkeps.simps(1)}\\
       
   802   @{thm (lhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]}\\
       
   803   @{thm (lhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]}\\
       
   804   @{thm (lhs) mkeps.simps(4)} & $\dn$ & @{thm (rhs) mkeps.simps(4)}\\
       
   805   \end{tabular}
       
   806   \end{center}
       
   807 
       
   808   \noindent Note that this function needs only to be partially defined,
       
   809   namely only for regular expressions that are nullable. In case @{const
       
   810   nullable} fails, the string @{term "[a,b,c]"} cannot be matched by @{term
       
   811   "r\<^sub>1"} and the null value @{term "None"} is returned. Note also how this function
       
   812   makes some subtle choices leading to a POSIX value: for example if an
       
   813   alternative regular expression, say @{term "ALT r\<^sub>1 r\<^sub>2"}, can
       
   814   match the empty string and furthermore @{term "r\<^sub>1"} can match the
       
   815   empty string, then we return a \<open>Left\<close>-value. The \<open>Right\<close>-value will only be returned if @{term "r\<^sub>1"} cannot match the empty
       
   816   string.
       
   817 
       
   818   The most interesting idea from Sulzmann and Lu \cite{Sulzmann2014} is
       
   819   the construction of a value for how @{term "r\<^sub>1"} can match the
       
   820   string @{term "[a,b,c]"} from the value how the last derivative, @{term
       
   821   "r\<^sub>4"} in Fig.~\ref{Sulz}, can match the empty string. Sulzmann and
       
   822   Lu achieve this by stepwise ``injecting back'' the characters into the
       
   823   values thus inverting the operation of building derivatives, but on the level
       
   824   of values. The corresponding function, called @{term inj}, takes three
       
   825   arguments, a regular expression, a character and a value. For example in
       
   826   the first (or right-most) @{term inj}-step in Fig.~\ref{Sulz} the regular
       
   827   expression @{term "r\<^sub>3"}, the character @{term c} from the last
       
   828   derivative step and @{term "v\<^sub>4"}, which is the value corresponding
       
   829   to the derivative regular expression @{term "r\<^sub>4"}. The result is
       
   830   the new value @{term "v\<^sub>3"}. The final result of the algorithm is
       
   831   the value @{term "v\<^sub>1"}. The @{term inj} function is defined by recursion on regular
       
   832   expressions and by analysing the shape of values (corresponding to 
       
   833   the derivative regular expressions).
       
   834   %
       
   835   \begin{center}
       
   836   \begin{tabular}{l@ {\hspace{5mm}}lcl}
       
   837   \textit{(1)} & @{thm (lhs) injval.simps(1)} & $\dn$ & @{thm (rhs) injval.simps(1)}\\
       
   838   \textit{(2)} & @{thm (lhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]} & $\dn$ & 
       
   839       @{thm (rhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]}\\
       
   840   \textit{(3)} & @{thm (lhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$ & 
       
   841       @{thm (rhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\
       
   842   \textit{(4)} & @{thm (lhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$ 
       
   843       & @{thm (rhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\
       
   844   \textit{(5)} & @{thm (lhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$ 
       
   845       & @{thm (rhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\
       
   846   \textit{(6)} & @{thm (lhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$ 
       
   847       & @{thm (rhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\
       
   848   \textit{(7)} & @{thm (lhs) injval.simps(7)[of "r" "c" "v" "vs"]} & $\dn$ 
       
   849       & @{thm (rhs) injval.simps(7)[of "r" "c" "v" "vs"]}\\
       
   850   \end{tabular}
       
   851   \end{center}
       
   852 
       
   853   \noindent To better understand what is going on in this definition it
       
   854   might be instructive to look first at the three sequence cases (clauses
       
   855   \textit{(4)} -- \textit{(6)}). In each case we need to construct an ``injected value'' for
       
   856   @{term "SEQ r\<^sub>1 r\<^sub>2"}. This must be a value of the form @{term
       
   857   "Seq DUMMY DUMMY"}\,. Recall the clause of the \<open>derivative\<close>-function
       
   858   for sequence regular expressions:
       
   859 
       
   860   \begin{center}
       
   861   @{thm (lhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]} $\dn$ @{thm (rhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]}
       
   862   \end{center}
       
   863 
       
   864   \noindent Consider first the \<open>else\<close>-branch where the derivative is @{term
       
   865   "SEQ (der c r\<^sub>1) r\<^sub>2"}. The corresponding value must therefore
       
   866   be of the form @{term "Seq v\<^sub>1 v\<^sub>2"}, which matches the left-hand
       
   867   side in clause~\textit{(4)} of @{term inj}. In the \<open>if\<close>-branch the derivative is an
       
   868   alternative, namely @{term "ALT (SEQ (der c r\<^sub>1) r\<^sub>2) (der c
       
   869   r\<^sub>2)"}. This means we either have to consider a \<open>Left\<close>- or
       
   870   \<open>Right\<close>-value. In case of the \<open>Left\<close>-value we know further it
       
   871   must be a value for a sequence regular expression. Therefore the pattern
       
   872   we match in the clause \textit{(5)} is @{term "Left (Seq v\<^sub>1 v\<^sub>2)"},
       
   873   while in \textit{(6)} it is just @{term "Right v\<^sub>2"}. One more interesting
       
   874   point is in the right-hand side of clause \textit{(6)}: since in this case the
       
   875   regular expression \<open>r\<^sub>1\<close> does not ``contribute'' to
       
   876   matching the string, that means it only matches the empty string, we need to
       
   877   call @{const mkeps} in order to construct a value for how @{term "r\<^sub>1"}
       
   878   can match this empty string. A similar argument applies for why we can
       
   879   expect in the left-hand side of clause \textit{(7)} that the value is of the form
       
   880   @{term "Seq v (Stars vs)"}---the derivative of a star is @{term "SEQ (der c r)
       
   881   (STAR r)"}. Finally, the reason for why we can ignore the second argument
       
   882   in clause \textit{(1)} of @{term inj} is that it will only ever be called in cases
       
   883   where @{term "c=d"}, but the usual linearity restrictions in patterns do
       
   884   not allow us to build this constraint explicitly into our function
       
   885   definition.\footnote{Sulzmann and Lu state this clause as @{thm (lhs)
       
   886   injval.simps(1)[of "c" "c"]} $\dn$ @{thm (rhs) injval.simps(1)[of "c"]},
       
   887   but our deviation is harmless.}
       
   888 
       
   889   The idea of the @{term inj}-function to ``inject'' a character, say
       
   890   @{term c}, into a value can be made precise by the first part of the
       
   891   following lemma, which shows that the underlying string of an injected
       
   892   value has a prepended character @{term c}; the second part shows that
       
   893   the underlying string of an @{const mkeps}-value is always the empty
       
   894   string (given the regular expression is nullable since otherwise
       
   895   \<open>mkeps\<close> might not be defined).
       
   896 
       
   897   \begin{lemma}\mbox{}\smallskip\\\label{Prf_injval_flat}
       
   898   \begin{tabular}{ll}
       
   899   (1) & @{thm[mode=IfThen] Prf_injval_flat}\\
       
   900   (2) & @{thm[mode=IfThen] mkeps_flat}
       
   901   \end{tabular}
       
   902   \end{lemma}
       
   903 
       
   904   \begin{proof}
       
   905   Both properties are by routine inductions: the first one can, for example,
       
   906   be proved by induction over the definition of @{term derivatives}; the second by
       
   907   an induction on @{term r}. There are no interesting cases.\qed
       
   908   \end{proof}
       
   909 
       
   910   Having defined the @{const mkeps} and \<open>inj\<close> function we can extend
       
   911   \Brz's matcher so that a value is constructed (assuming the
       
   912   regular expression matches the string). The clauses of the Sulzmann and Lu lexer are
       
   913 
       
   914   \begin{center}
       
   915   \begin{tabular}{lcl}
       
   916   @{thm (lhs) lexer.simps(1)} & $\dn$ & @{thm (rhs) lexer.simps(1)}\\
       
   917   @{thm (lhs) lexer.simps(2)} & $\dn$ & \<open>case\<close> @{term "lexer (der c r) s"} \<open>of\<close>\\
       
   918                      & & \phantom{$|$} @{term "None"}  \<open>\<Rightarrow>\<close> @{term None}\\
       
   919                      & & $|$ @{term "Some v"} \<open>\<Rightarrow>\<close> @{term "Some (injval r c v)"}                          
       
   920   \end{tabular}
       
   921   \end{center}
       
   922 
       
   923   \noindent If the regular expression does not match the string, @{const None} is
       
   924   returned. If the regular expression \emph{does}
       
   925   match the string, then @{const Some} value is returned. One important
       
   926   virtue of this algorithm is that it can be implemented with ease in any
       
   927   functional programming language and also in Isabelle/HOL. In the remaining
       
   928   part of this section we prove that this algorithm is correct.
       
   929 
       
   930   The well-known idea of POSIX matching is informally defined by some
       
   931   rules such as the Longest Match and Priority Rules (see
       
   932   Introduction); as correctly argued in \cite{Sulzmann2014}, this
       
   933   needs formal specification. Sulzmann and Lu define an ``ordering
       
   934   relation'' between values and argue that there is a maximum value,
       
   935   as given by the derivative-based algorithm.  In contrast, we shall
       
   936   introduce a simple inductive definition that specifies directly what
       
   937   a \emph{POSIX value} is, incorporating the POSIX-specific choices
       
   938   into the side-conditions of our rules. Our definition is inspired by
       
   939   the matching relation given by Vansummeren~\cite{Vansummeren2006}. 
       
   940   The relation we define is ternary and
       
   941   written as \mbox{@{term "s \<in> r \<rightarrow> v"}}, relating
       
   942   strings, regular expressions and values; the inductive rules are given in 
       
   943   Figure~\ref{POSIXrules}.
       
   944   We can prove that given a string @{term s} and regular expression @{term
       
   945    r}, the POSIX value @{term v} is uniquely determined by @{term "s \<in> r \<rightarrow> v"}.
       
   946 
       
   947   %
       
   948   \begin{figure}[t]
       
   949   \begin{center}
       
   950   \begin{tabular}{c}
       
   951   @{thm[mode=Axiom] Posix.intros(1)}\<open>P\<close>@{term "ONE"} \qquad
       
   952   @{thm[mode=Axiom] Posix.intros(2)}\<open>P\<close>@{term "c"}\medskip\\
       
   953   @{thm[mode=Rule] Posix.intros(3)[of "s" "r\<^sub>1" "v" "r\<^sub>2"]}\<open>P+L\<close>\qquad
       
   954   @{thm[mode=Rule] Posix.intros(4)[of "s" "r\<^sub>2" "v" "r\<^sub>1"]}\<open>P+R\<close>\medskip\\
       
   955   $\mprset{flushleft}
       
   956    \inferrule
       
   957    {@{thm (prem 1) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \qquad
       
   958     @{thm (prem 2) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \\\\
       
   959     @{thm (prem 3) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}
       
   960    {@{thm (concl) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}$\<open>PS\<close>\\
       
   961   @{thm[mode=Axiom] Posix.intros(7)}\<open>P[]\<close>\medskip\\
       
   962   $\mprset{flushleft}
       
   963    \inferrule
       
   964    {@{thm (prem 1) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
       
   965     @{thm (prem 2) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
       
   966     @{thm (prem 3) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \\\\
       
   967     @{thm (prem 4) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}
       
   968    {@{thm (concl) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}$\<open>P\<star>\<close>
       
   969   \end{tabular}
       
   970   \end{center}
       
   971   \caption{Our inductive definition of POSIX values.}\label{POSIXrules}
       
   972   \end{figure}
       
   973 
       
   974    
       
   975 
       
   976   \begin{theorem}\mbox{}\smallskip\\\label{posixdeterm}
       
   977   \begin{tabular}{ll}
       
   978   (1) & If @{thm (prem 1) Posix1(1)} then @{thm (concl)
       
   979   Posix1(1)} and @{thm (concl) Posix1(2)}.\\
       
   980   (2) & @{thm[mode=IfThen] Posix_determ(1)[of _ _ "v" "v'"]}
       
   981   \end{tabular}
       
   982   \end{theorem}
       
   983 
       
   984   \begin{proof} Both by induction on the definition of @{term "s \<in> r \<rightarrow> v"}. 
       
   985   The second parts follows by a case analysis of @{term "s \<in> r \<rightarrow> v'"} and
       
   986   the first part.\qed
       
   987   \end{proof}
       
   988 
       
   989   \noindent
       
   990   We claim that our @{term "s \<in> r \<rightarrow> v"} relation captures the idea behind the four
       
   991   informal POSIX rules shown in the Introduction: Consider for example the
       
   992   rules \<open>P+L\<close> and \<open>P+R\<close> where the POSIX value for a string
       
   993   and an alternative regular expression, that is @{term "(s, ALT r\<^sub>1 r\<^sub>2)"},
       
   994   is specified---it is always a \<open>Left\<close>-value, \emph{except} when the
       
   995   string to be matched is not in the language of @{term "r\<^sub>1"}; only then it
       
   996   is a \<open>Right\<close>-value (see the side-condition in \<open>P+R\<close>).
       
   997   Interesting is also the rule for sequence regular expressions (\<open>PS\<close>). The first two premises state that @{term "v\<^sub>1"} and @{term "v\<^sub>2"}
       
   998   are the POSIX values for @{term "(s\<^sub>1, r\<^sub>1)"} and @{term "(s\<^sub>2, r\<^sub>2)"}
       
   999   respectively. Consider now the third premise and note that the POSIX value
       
  1000   of this rule should match the string \mbox{@{term "s\<^sub>1 @ s\<^sub>2"}}. According to the
       
  1001   Longest Match Rule, we want that the @{term "s\<^sub>1"} is the longest initial
       
  1002   split of \mbox{@{term "s\<^sub>1 @ s\<^sub>2"}} such that @{term "s\<^sub>2"} is still recognised
       
  1003   by @{term "r\<^sub>2"}. Let us assume, contrary to the third premise, that there
       
  1004   \emph{exist} an @{term "s\<^sub>3"} and @{term "s\<^sub>4"} such that @{term "s\<^sub>2"}
       
  1005   can be split up into a non-empty string @{term "s\<^sub>3"} and a possibly empty
       
  1006   string @{term "s\<^sub>4"}. Moreover the longer string @{term "s\<^sub>1 @ s\<^sub>3"} can be
       
  1007   matched by \<open>r\<^sub>1\<close> and the shorter @{term "s\<^sub>4"} can still be
       
  1008   matched by @{term "r\<^sub>2"}. In this case @{term "s\<^sub>1"} would \emph{not} be the
       
  1009   longest initial split of \mbox{@{term "s\<^sub>1 @ s\<^sub>2"}} and therefore @{term "Seq v\<^sub>1
       
  1010   v\<^sub>2"} cannot be a POSIX value for @{term "(s\<^sub>1 @ s\<^sub>2, SEQ r\<^sub>1 r\<^sub>2)"}. 
       
  1011   The main point is that our side-condition ensures the Longest 
       
  1012   Match Rule is satisfied.
       
  1013 
       
  1014   A similar condition is imposed on the POSIX value in the \<open>P\<star>\<close>-rule. Also there we want that @{term "s\<^sub>1"} is the longest initial
       
  1015   split of @{term "s\<^sub>1 @ s\<^sub>2"} and furthermore the corresponding value
       
  1016   @{term v} cannot be flattened to the empty string. In effect, we require
       
  1017   that in each ``iteration'' of the star, some non-empty substring needs to
       
  1018   be ``chipped'' away; only in case of the empty string we accept @{term
       
  1019   "Stars []"} as the POSIX value. Indeed we can show that our POSIX values
       
  1020   are lexical values which exclude those \<open>Stars\<close> that contain subvalues 
       
  1021   that flatten to the empty string.
       
  1022 
       
  1023   \begin{lemma}\label{LVposix}
       
  1024   @{thm [mode=IfThen] Posix_LV}
       
  1025   \end{lemma}
       
  1026 
       
  1027   \begin{proof}
       
  1028   By routine induction on @{thm (prem 1) Posix_LV}.\qed 
       
  1029   \end{proof}
       
  1030 
       
  1031   \noindent
       
  1032   Next is the lemma that shows the function @{term "mkeps"} calculates
       
  1033   the POSIX value for the empty string and a nullable regular expression.
       
  1034 
       
  1035   \begin{lemma}\label{lemmkeps}
       
  1036   @{thm[mode=IfThen] Posix_mkeps}
       
  1037   \end{lemma}
       
  1038 
       
  1039   \begin{proof}
       
  1040   By routine induction on @{term r}.\qed 
       
  1041   \end{proof}
       
  1042 
       
  1043   \noindent
       
  1044   The central lemma for our POSIX relation is that the \<open>inj\<close>-function
       
  1045   preserves POSIX values.
       
  1046 
       
  1047   \begin{lemma}\label{Posix2}
       
  1048   @{thm[mode=IfThen] Posix_injval}
       
  1049   \end{lemma}
       
  1050 
       
  1051   \begin{proof}
       
  1052   By induction on \<open>r\<close>. We explain two cases.
       
  1053 
       
  1054   \begin{itemize}
       
  1055   \item[$\bullet$] Case @{term "r = ALT r\<^sub>1 r\<^sub>2"}. There are
       
  1056   two subcases, namely \<open>(a)\<close> \mbox{@{term "v = Left v'"}} and @{term
       
  1057   "s \<in> der c r\<^sub>1 \<rightarrow> v'"}; and \<open>(b)\<close> @{term "v = Right v'"}, @{term
       
  1058   "s \<notin> L (der c r\<^sub>1)"} and @{term "s \<in> der c r\<^sub>2 \<rightarrow> v'"}. In \<open>(a)\<close> we
       
  1059   know @{term "s \<in> der c r\<^sub>1 \<rightarrow> v'"}, from which we can infer @{term "(c # s)
       
  1060   \<in> r\<^sub>1 \<rightarrow> injval r\<^sub>1 c v'"} by induction hypothesis and hence @{term "(c #
       
  1061   s) \<in> ALT r\<^sub>1 r\<^sub>2 \<rightarrow> injval (ALT r\<^sub>1 r\<^sub>2) c (Left v')"} as needed. Similarly
       
  1062   in subcase \<open>(b)\<close> where, however, in addition we have to use
       
  1063   Proposition~\ref{derprop}(2) in order to infer @{term "c # s \<notin> L r\<^sub>1"} from @{term
       
  1064   "s \<notin> L (der c r\<^sub>1)"}.\smallskip
       
  1065 
       
  1066   \item[$\bullet$] Case @{term "r = SEQ r\<^sub>1 r\<^sub>2"}. There are three subcases:
       
  1067   
       
  1068   \begin{quote}
       
  1069   \begin{description}
       
  1070   \item[\<open>(a)\<close>] @{term "v = Left (Seq v\<^sub>1 v\<^sub>2)"} and @{term "nullable r\<^sub>1"} 
       
  1071   \item[\<open>(b)\<close>] @{term "v = Right v\<^sub>1"} and @{term "nullable r\<^sub>1"} 
       
  1072   \item[\<open>(c)\<close>] @{term "v = Seq v\<^sub>1 v\<^sub>2"} and @{term "\<not> nullable r\<^sub>1"} 
       
  1073   \end{description}
       
  1074   \end{quote}
       
  1075 
       
  1076   \noindent For \<open>(a)\<close> we know @{term "s\<^sub>1 \<in> der c r\<^sub>1 \<rightarrow> v\<^sub>1"} and
       
  1077   @{term "s\<^sub>2 \<in> r\<^sub>2 \<rightarrow> v\<^sub>2"} as well as
       
  1078   %
       
  1079   \[@{term "\<not> (\<exists>s\<^sub>3 s\<^sub>4. s\<^sub>3 \<noteq> [] \<and> s\<^sub>3 @ s\<^sub>4 = s\<^sub>2 \<and> s\<^sub>1 @ s\<^sub>3 \<in> L (der c r\<^sub>1) \<and> s\<^sub>4 \<in> L r\<^sub>2)"}\]
       
  1080 
       
  1081   \noindent From the latter we can infer by Proposition~\ref{derprop}(2):
       
  1082   %
       
  1083   \[@{term "\<not> (\<exists>s\<^sub>3 s\<^sub>4. s\<^sub>3 \<noteq> [] \<and> s\<^sub>3 @ s\<^sub>4 = s\<^sub>2 \<and> (c # s\<^sub>1) @ s\<^sub>3 \<in> L r\<^sub>1 \<and> s\<^sub>4 \<in> L r\<^sub>2)"}\]
       
  1084 
       
  1085   \noindent We can use the induction hypothesis for \<open>r\<^sub>1\<close> to obtain
       
  1086   @{term "(c # s\<^sub>1) \<in> r\<^sub>1 \<rightarrow> injval r\<^sub>1 c v\<^sub>1"}. Putting this all together allows us to infer
       
  1087   @{term "((c # s\<^sub>1) @ s\<^sub>2) \<in> SEQ r\<^sub>1 r\<^sub>2 \<rightarrow> Seq (injval r\<^sub>1 c v\<^sub>1) v\<^sub>2"}. The case \<open>(c)\<close>
       
  1088   is similar.
       
  1089 
       
  1090   For \<open>(b)\<close> we know @{term "s \<in> der c r\<^sub>2 \<rightarrow> v\<^sub>1"} and 
       
  1091   @{term "s\<^sub>1 @ s\<^sub>2 \<notin> L (SEQ (der c r\<^sub>1) r\<^sub>2)"}. From the former
       
  1092   we have @{term "(c # s) \<in> r\<^sub>2 \<rightarrow> (injval r\<^sub>2 c v\<^sub>1)"} by induction hypothesis
       
  1093   for @{term "r\<^sub>2"}. From the latter we can infer
       
  1094   %
       
  1095   \[@{term "\<not> (\<exists>s\<^sub>3 s\<^sub>4. s\<^sub>3 \<noteq> [] \<and> s\<^sub>3 @ s\<^sub>4 = c # s \<and> s\<^sub>3 \<in> L r\<^sub>1 \<and> s\<^sub>4 \<in> L r\<^sub>2)"}\]
       
  1096 
       
  1097   \noindent By Lemma~\ref{lemmkeps} we know @{term "[] \<in> r\<^sub>1 \<rightarrow> (mkeps r\<^sub>1)"}
       
  1098   holds. Putting this all together, we can conclude with @{term "(c #
       
  1099   s) \<in> SEQ r\<^sub>1 r\<^sub>2 \<rightarrow> Seq (mkeps r\<^sub>1) (injval r\<^sub>2 c v\<^sub>1)"}, as required.
       
  1100 
       
  1101   Finally suppose @{term "r = STAR r\<^sub>1"}. This case is very similar to the
       
  1102   sequence case, except that we need to also ensure that @{term "flat (injval r\<^sub>1
       
  1103   c v\<^sub>1) \<noteq> []"}. This follows from @{term "(c # s\<^sub>1)
       
  1104   \<in> r\<^sub>1 \<rightarrow> injval r\<^sub>1 c v\<^sub>1"}  (which in turn follows from @{term "s\<^sub>1 \<in> der c
       
  1105   r\<^sub>1 \<rightarrow> v\<^sub>1"} and the induction hypothesis).\qed
       
  1106   \end{itemize}
       
  1107   \end{proof}
       
  1108 
       
  1109   \noindent
       
  1110   With Lemma~\ref{Posix2} in place, it is completely routine to establish
       
  1111   that the Sulzmann and Lu lexer satisfies our specification (returning
       
  1112   the null value @{term "None"} iff the string is not in the language of the regular expression,
       
  1113   and returning a unique POSIX value iff the string \emph{is} in the language):
       
  1114 
       
  1115   \begin{theorem}\mbox{}\smallskip\\\label{lexercorrect}
       
  1116   \begin{tabular}{ll}
       
  1117   (1) & @{thm (lhs) lexer_correct_None} if and only if @{thm (rhs) lexer_correct_None}\\
       
  1118   (2) & @{thm (lhs) lexer_correct_Some} if and only if @{thm (rhs) lexer_correct_Some}\\
       
  1119   \end{tabular}
       
  1120   \end{theorem}
       
  1121 
       
  1122   \begin{proof}
       
  1123   By induction on @{term s} using Lemma~\ref{lemmkeps} and \ref{Posix2}.\qed  
       
  1124   \end{proof}
       
  1125 
       
  1126   \noindent In \textit{(2)} we further know by Theorem~\ref{posixdeterm} that the
       
  1127   value returned by the lexer must be unique.   A simple corollary 
       
  1128   of our two theorems is:
       
  1129 
       
  1130   \begin{corollary}\mbox{}\smallskip\\\label{lexercorrectcor}
       
  1131   \begin{tabular}{ll}
       
  1132   (1) & @{thm (lhs) lexer_correctness(2)} if and only if @{thm (rhs) lexer_correctness(2)}\\ 
       
  1133   (2) & @{thm (lhs) lexer_correctness(1)} if and only if @{thm (rhs) lexer_correctness(1)}\\
       
  1134   \end{tabular}
       
  1135   \end{corollary}
       
  1136 
       
  1137   \noindent This concludes our correctness proof. Note that we have
       
  1138   not changed the algorithm of Sulzmann and Lu,\footnote{All
       
  1139   deviations we introduced are harmless.} but introduced our own
       
  1140   specification for what a correct result---a POSIX value---should be.
       
  1141   In the next section we show that our specification coincides with
       
  1142   another one given by Okui and Suzuki using a different technique.
       
  1143 
       
  1144 \<close>
       
  1145 
       
  1146 section \<open>Ordering of Values according to Okui and Suzuki\<close>
       
  1147 
       
  1148 text \<open>
       
  1149   
       
  1150   While in the previous section we have defined POSIX values directly
       
  1151   in terms of a ternary relation (see inference rules in Figure~\ref{POSIXrules}),
       
  1152   Sulzmann and Lu took a different approach in \cite{Sulzmann2014}:
       
  1153   they introduced an ordering for values and identified POSIX values
       
  1154   as the maximal elements.  An extended version of \cite{Sulzmann2014}
       
  1155   is available at the website of its first author; this includes more
       
  1156   details of their proofs, which are, however, evidently not in final form
       
  1157   yet. Unfortunately, we were not able to verify claims that their
       
  1158   ordering has properties such as being transitive or having maximal
       
  1159   elements. 
       
  1160  
       
  1161   Okui and Suzuki \cite{OkuiSuzuki2010,OkuiSuzukiTech} described
       
  1162   another ordering of values, which they use to establish the
       
  1163   correctness of their automata-based algorithm for POSIX matching.
       
  1164   Their ordering resembles some aspects of the one given by Sulzmann
       
  1165   and Lu, but overall is quite different. To begin with, Okui and
       
  1166   Suzuki identify POSIX values as minimal, rather than maximal,
       
  1167   elements in their ordering. A more substantial difference is that
       
  1168   the ordering by Okui and Suzuki uses \emph{positions} in order to
       
  1169   identify and compare subvalues. Positions are lists of natural
       
  1170   numbers. This allows them to quite naturally formalise the Longest
       
  1171   Match and Priority rules of the informal POSIX standard.  Consider
       
  1172   for example the value @{term v}
       
  1173 
       
  1174   \begin{center}
       
  1175   @{term "v == Stars [Seq (Char x) (Char y), Char z]"}
       
  1176   \end{center}
       
  1177 
       
  1178   \noindent
       
  1179   At position \<open>[0,1]\<close> of this value is the
       
  1180   subvalue \<open>Char y\<close> and at position \<open>[1]\<close> the
       
  1181   subvalue @{term "Char z"}.  At the `root' position, or empty list
       
  1182   @{term "[]"}, is the whole value @{term v}. Positions such as \<open>[0,1,0]\<close> or \<open>[2]\<close> are outside of \<open>v\<close>. If it exists, the subvalue of @{term v} at a position \<open>p\<close>, written @{term "at v p"}, can be recursively defined by
       
  1183   
       
  1184   \begin{center}
       
  1185   \begin{tabular}{r@ {\hspace{0mm}}lcl}
       
  1186   @{term v} &  \<open>\<downharpoonleft>\<^bsub>[]\<^esub>\<close> & \<open>\<equiv>\<close>& @{thm (rhs) at.simps(1)}\\
       
  1187   @{term "Left v"} & \<open>\<downharpoonleft>\<^bsub>0::ps\<^esub>\<close> & \<open>\<equiv>\<close>& @{thm (rhs) at.simps(2)}\\
       
  1188   @{term "Right v"} & \<open>\<downharpoonleft>\<^bsub>1::ps\<^esub>\<close> & \<open>\<equiv>\<close> & 
       
  1189   @{thm (rhs) at.simps(3)[simplified Suc_0_fold]}\\
       
  1190   @{term "Seq v\<^sub>1 v\<^sub>2"} & \<open>\<downharpoonleft>\<^bsub>0::ps\<^esub>\<close> & \<open>\<equiv>\<close> & 
       
  1191   @{thm (rhs) at.simps(4)[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]} \\
       
  1192   @{term "Seq v\<^sub>1 v\<^sub>2"} & \<open>\<downharpoonleft>\<^bsub>1::ps\<^esub>\<close>
       
  1193   & \<open>\<equiv>\<close> & 
       
  1194   @{thm (rhs) at.simps(5)[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2", simplified Suc_0_fold]} \\
       
  1195   @{term "Stars vs"} & \<open>\<downharpoonleft>\<^bsub>n::ps\<^esub>\<close> & \<open>\<equiv>\<close>& @{thm (rhs) at.simps(6)}\\
       
  1196   \end{tabular} 
       
  1197   \end{center}
       
  1198 
       
  1199   \noindent In the last clause we use Isabelle's notation @{term "vs ! n"} for the
       
  1200   \<open>n\<close>th element in a list.  The set of positions inside a value \<open>v\<close>,
       
  1201   written @{term "Pos v"}, is given by 
       
  1202 
       
  1203   \begin{center}
       
  1204   \begin{tabular}{lcl}
       
  1205   @{thm (lhs) Pos.simps(1)} & \<open>\<equiv>\<close> & @{thm (rhs) Pos.simps(1)}\\
       
  1206   @{thm (lhs) Pos.simps(2)} & \<open>\<equiv>\<close> & @{thm (rhs) Pos.simps(2)}\\
       
  1207   @{thm (lhs) Pos.simps(3)} & \<open>\<equiv>\<close> & @{thm (rhs) Pos.simps(3)}\\
       
  1208   @{thm (lhs) Pos.simps(4)} & \<open>\<equiv>\<close> & @{thm (rhs) Pos.simps(4)}\\
       
  1209   @{thm (lhs) Pos.simps(5)[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]}
       
  1210   & \<open>\<equiv>\<close> 
       
  1211   & @{thm (rhs) Pos.simps(5)[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]}\\
       
  1212   @{thm (lhs) Pos_stars} & \<open>\<equiv>\<close> & @{thm (rhs) Pos_stars}\\
       
  1213   \end{tabular}
       
  1214   \end{center}
       
  1215 
       
  1216   \noindent 
       
  1217   where \<open>len\<close> in the last clause stands for the length of a list. Clearly
       
  1218   for every position inside a value there exists a subvalue at that position.
       
  1219  
       
  1220 
       
  1221   To help understanding the ordering of Okui and Suzuki, consider again 
       
  1222   the earlier value
       
  1223   \<open>v\<close> and compare it with the following \<open>w\<close>:
       
  1224 
       
  1225   \begin{center}
       
  1226   \begin{tabular}{l}
       
  1227   @{term "v == Stars [Seq (Char x) (Char y), Char z]"}\\
       
  1228   @{term "w == Stars [Char x, Char y, Char z]"}  
       
  1229   \end{tabular}
       
  1230   \end{center}
       
  1231 
       
  1232   \noindent Both values match the string \<open>xyz\<close>, which means that if
       
  1233   we flatten these values at their respective root position, we obtain
       
  1234   \<open>xyz\<close>. However, at position \<open>[0]\<close>, \<open>v\<close> matches
       
  1235   \<open>xy\<close> whereas \<open>w\<close> matches only the shorter \<open>x\<close>. So
       
  1236   according to the Longest Match Rule, we should prefer \<open>v\<close>,
       
  1237   rather than \<open>w\<close> as POSIX value for string \<open>xyz\<close> (and
       
  1238   corresponding regular expression). In order to
       
  1239   formalise this idea, Okui and Suzuki introduce a measure for
       
  1240   subvalues at position \<open>p\<close>, called the \emph{norm} of \<open>v\<close>
       
  1241   at position \<open>p\<close>. We can define this measure in Isabelle as an
       
  1242   integer as follows
       
  1243   
       
  1244   \begin{center}
       
  1245   @{thm pflat_len_def}
       
  1246   \end{center}
       
  1247 
       
  1248   \noindent where we take the length of the flattened value at
       
  1249   position \<open>p\<close>, provided the position is inside \<open>v\<close>; if
       
  1250   not, then the norm is \<open>-1\<close>. The default for outside
       
  1251   positions is crucial for the POSIX requirement of preferring a
       
  1252   \<open>Left\<close>-value over a \<open>Right\<close>-value (if they can match the
       
  1253   same string---see the Priority Rule from the Introduction). For this
       
  1254   consider
       
  1255 
       
  1256   \begin{center}
       
  1257   @{term "v == Left (Char x)"} \qquad and \qquad @{term "w == Right (Char x)"}
       
  1258   \end{center}
       
  1259 
       
  1260   \noindent Both values match \<open>x\<close>. At position \<open>[0]\<close>
       
  1261   the norm of @{term v} is \<open>1\<close> (the subvalue matches \<open>x\<close>),
       
  1262   but the norm of \<open>w\<close> is \<open>-1\<close> (the position is outside
       
  1263   \<open>w\<close> according to how we defined the `inside' positions of
       
  1264   \<open>Left\<close>- and \<open>Right\<close>-values).  Of course at position
       
  1265   \<open>[1]\<close>, the norms @{term "pflat_len v [1]"} and @{term
       
  1266   "pflat_len w [1]"} are reversed, but the point is that subvalues
       
  1267   will be analysed according to lexicographically ordered
       
  1268   positions. According to this ordering, the position \<open>[0]\<close>
       
  1269   takes precedence over \<open>[1]\<close> and thus also \<open>v\<close> will be 
       
  1270   preferred over \<open>w\<close>.  The lexicographic ordering of positions, written
       
  1271   @{term "DUMMY \<sqsubset>lex DUMMY"}, can be conveniently formalised
       
  1272   by three inference rules
       
  1273 
       
  1274   \begin{center}
       
  1275   \begin{tabular}{ccc}
       
  1276   @{thm [mode=Axiom] lex_list.intros(1)}\hspace{1cm} &
       
  1277   @{thm [mode=Rule] lex_list.intros(3)[where ?p1.0="p\<^sub>1" and ?p2.0="p\<^sub>2" and
       
  1278                                             ?ps1.0="ps\<^sub>1" and ?ps2.0="ps\<^sub>2"]}\hspace{1cm} &
       
  1279   @{thm [mode=Rule] lex_list.intros(2)[where ?ps1.0="ps\<^sub>1" and ?ps2.0="ps\<^sub>2"]}
       
  1280   \end{tabular}
       
  1281   \end{center}
       
  1282 
       
  1283   With the norm and lexicographic order in place,
       
  1284   we can state the key definition of Okui and Suzuki
       
  1285   \cite{OkuiSuzuki2010}: a value @{term "v\<^sub>1"} is \emph{smaller at position \<open>p\<close>} than
       
  1286   @{term "v\<^sub>2"}, written @{term "v\<^sub>1 \<sqsubset>val p v\<^sub>2"}, 
       
  1287   if and only if  $(i)$ the norm at position \<open>p\<close> is
       
  1288   greater in @{term "v\<^sub>1"} (that is the string @{term "flat (at v\<^sub>1 p)"} is longer 
       
  1289   than @{term "flat (at v\<^sub>2 p)"}) and $(ii)$ for all subvalues at 
       
  1290   positions that are inside @{term "v\<^sub>1"} or @{term "v\<^sub>2"} and that are
       
  1291   lexicographically smaller than \<open>p\<close>, we have the same norm, namely
       
  1292 
       
  1293  \begin{center}
       
  1294  \begin{tabular}{c}
       
  1295  @{thm (lhs) PosOrd_def[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]} 
       
  1296  \<open>\<equiv>\<close> 
       
  1297  $\begin{cases}
       
  1298  (i) & @{term "pflat_len v\<^sub>1 p > pflat_len v\<^sub>2 p"}   \quad\text{and}\smallskip \\
       
  1299  (ii) & @{term "(\<forall>q \<in> Pos v\<^sub>1 \<union> Pos v\<^sub>2. q \<sqsubset>lex p --> pflat_len v\<^sub>1 q = pflat_len v\<^sub>2 q)"}
       
  1300  \end{cases}$
       
  1301  \end{tabular}
       
  1302  \end{center}
       
  1303 
       
  1304  \noindent The position \<open>p\<close> in this definition acts as the
       
  1305   \emph{first distinct position} of \<open>v\<^sub>1\<close> and \<open>v\<^sub>2\<close>, where both values match strings of different length
       
  1306   \cite{OkuiSuzuki2010}.  Since at \<open>p\<close> the values \<open>v\<^sub>1\<close> and \<open>v\<^sub>2\<close> match different strings, the
       
  1307   ordering is irreflexive. Derived from the definition above
       
  1308   are the following two orderings:
       
  1309   
       
  1310   \begin{center}
       
  1311   \begin{tabular}{l}
       
  1312   @{thm PosOrd_ex_def[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]}\\
       
  1313   @{thm PosOrd_ex_eq_def[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]}
       
  1314   \end{tabular}
       
  1315   \end{center}
       
  1316 
       
  1317  While we encountered a number of obstacles for establishing properties like
       
  1318  transitivity for the ordering of Sulzmann and Lu (and which we failed
       
  1319  to overcome), it is relatively straightforward to establish this
       
  1320  property for the orderings
       
  1321  @{term "DUMMY :\<sqsubset>val DUMMY"} and @{term "DUMMY :\<sqsubseteq>val DUMMY"}  
       
  1322  by Okui and Suzuki.
       
  1323 
       
  1324  \begin{lemma}[Transitivity]\label{transitivity}
       
  1325  @{thm [mode=IfThen] PosOrd_trans[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2" and ?v3.0="v\<^sub>3"]} 
       
  1326  \end{lemma}
       
  1327 
       
  1328  \begin{proof} From the assumption we obtain two positions \<open>p\<close>
       
  1329  and \<open>q\<close>, where the values \<open>v\<^sub>1\<close> and \<open>v\<^sub>2\<close> (respectively \<open>v\<^sub>2\<close> and \<open>v\<^sub>3\<close>) are `distinct'.  Since \<open>\<prec>\<^bsub>lex\<^esub>\<close> is trichotomous, we need to consider
       
  1330  three cases, namely @{term "p = q"}, @{term "p \<sqsubset>lex q"} and
       
  1331  @{term "q \<sqsubset>lex p"}. Let us look at the first case.  Clearly
       
  1332  @{term "pflat_len v\<^sub>2 p < pflat_len v\<^sub>1 p"} and @{term
       
  1333  "pflat_len v\<^sub>3 p < pflat_len v\<^sub>2 p"} imply @{term
       
  1334  "pflat_len v\<^sub>3 p < pflat_len v\<^sub>1 p"}.  It remains to show
       
  1335  that for a @{term "p' \<in> Pos v\<^sub>1 \<union> Pos v\<^sub>3"}
       
  1336  with @{term "p' \<sqsubset>lex p"} that @{term "pflat_len v\<^sub>1
       
  1337  p' = pflat_len v\<^sub>3 p'"} holds.  Suppose @{term "p' \<in> Pos
       
  1338  v\<^sub>1"}, then we can infer from the first assumption that @{term
       
  1339  "pflat_len v\<^sub>1 p' = pflat_len v\<^sub>2 p'"}.  But this means
       
  1340  that @{term "p'"} must be in @{term "Pos v\<^sub>2"} too (the norm
       
  1341  cannot be \<open>-1\<close> given @{term "p' \<in> Pos v\<^sub>1"}).  
       
  1342  Hence we can use the second assumption and
       
  1343  infer @{term "pflat_len v\<^sub>2 p' = pflat_len v\<^sub>3 p'"},
       
  1344  which concludes this case with @{term "v\<^sub>1 :\<sqsubset>val
       
  1345  v\<^sub>3"}.  The reasoning in the other cases is similar.\qed
       
  1346  \end{proof}
       
  1347 
       
  1348  \noindent 
       
  1349  The proof for $\preccurlyeq$ is similar and omitted.
       
  1350  It is also straightforward to show that \<open>\<prec>\<close> and
       
  1351  $\preccurlyeq$ are partial orders.  Okui and Suzuki furthermore show that they
       
  1352  are linear orderings for lexical values \cite{OkuiSuzuki2010} of a given
       
  1353  regular expression and given string, but we have not formalised this in Isabelle. It is
       
  1354  not essential for our results. What we are going to show below is
       
  1355  that for a given \<open>r\<close> and \<open>s\<close>, the orderings have a unique
       
  1356  minimal element on the set @{term "LV r s"}, which is the POSIX value
       
  1357  we defined in the previous section. We start with two properties that
       
  1358  show how the length of a flattened value relates to the \<open>\<prec>\<close>-ordering.
       
  1359 
       
  1360  \begin{proposition}\mbox{}\smallskip\\\label{ordlen}
       
  1361  \begin{tabular}{@ {}ll}
       
  1362  (1) &
       
  1363  @{thm [mode=IfThen] PosOrd_shorterE[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]}\\
       
  1364  (2) &
       
  1365  @{thm [mode=IfThen] PosOrd_shorterI[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]} 
       
  1366  \end{tabular} 
       
  1367  \end{proposition}
       
  1368  
       
  1369  \noindent Both properties follow from the definition of the ordering. Note that
       
 1370  \textit{(2)} entails that if the underlying string of a value, say @{term "v\<^sub>2"}, 
        
 1371  is a strict prefix of the flattened string of another value, say @{term "v\<^sub>1"}, then
        
 1372  @{term "v\<^sub>1"} must be smaller than @{term "v\<^sub>2"}. For our proofs it
       
  1373  will be useful to have the following properties---in each case the underlying strings 
       
  1374  of the compared values are the same: 
       
  1375 
       
  1376   \begin{proposition}\mbox{}\smallskip\\\label{ordintros}
       
  1377   \begin{tabular}{ll}
       
  1378   \textit{(1)} & 
       
  1379   @{thm [mode=IfThen] PosOrd_Left_Right[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]}\\
       
  1380   \textit{(2)} & If
       
  1381   @{thm (prem 1) PosOrd_Left_eq[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]} \;then\;
       
  1382   @{thm (lhs) PosOrd_Left_eq[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]} \;iff\;
       
  1383   @{thm (rhs) PosOrd_Left_eq[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]}\\
       
  1384   \textit{(3)} & If
       
  1385   @{thm (prem 1) PosOrd_Right_eq[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]} \;then\;
       
  1386   @{thm (lhs) PosOrd_Right_eq[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]} \;iff\;
       
  1387   @{thm (rhs) PosOrd_Right_eq[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]}\\
       
  1388   \textit{(4)} & If
       
  1389   @{thm (prem 1) PosOrd_Seq_eq[where ?v2.0="v\<^sub>2" and ?w2.0="w\<^sub>2"]} \;then\;
       
  1390   @{thm (lhs) PosOrd_Seq_eq[where ?v2.0="v\<^sub>2" and ?w2.0="w\<^sub>2"]} \;iff\;
       
  1391   @{thm (rhs) PosOrd_Seq_eq[where ?v2.0="v\<^sub>2" and ?w2.0="w\<^sub>2"]}\\
       
  1392   \textit{(5)} & If
       
  1393   @{thm (prem 2) PosOrd_SeqI1[simplified, where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2" and
       
  1394                                     ?w1.0="w\<^sub>1" and ?w2.0="w\<^sub>2"]} \;and\;
       
  1395   @{thm (prem 1) PosOrd_SeqI1[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2" and
       
  1396                                     ?w1.0="w\<^sub>1" and ?w2.0="w\<^sub>2"]} \;then\;
       
  1397   @{thm (concl) PosOrd_SeqI1[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2" and
       
  1398                                    ?w1.0="w\<^sub>1" and ?w2.0="w\<^sub>2"]}\\
       
  1399   \textit{(6)} & If
       
  1400   @{thm (prem 1) PosOrd_Stars_append_eq[where ?vs1.0="vs\<^sub>1" and ?vs2.0="vs\<^sub>2"]} \;then\;
       
  1401   @{thm (lhs) PosOrd_Stars_append_eq[where ?vs1.0="vs\<^sub>1" and ?vs2.0="vs\<^sub>2"]} \;iff\;
       
  1402   @{thm (rhs) PosOrd_Stars_append_eq[where ?vs1.0="vs\<^sub>1" and ?vs2.0="vs\<^sub>2"]}\\  
       
  1403   
       
  1404   \textit{(7)} & If
       
  1405   @{thm (prem 2) PosOrd_StarsI[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2" and
       
  1406                             ?vs1.0="vs\<^sub>1" and ?vs2.0="vs\<^sub>2"]} \;and\;
       
  1407   @{thm (prem 1) PosOrd_StarsI[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2" and
       
  1408                             ?vs1.0="vs\<^sub>1" and ?vs2.0="vs\<^sub>2"]} \;then\;
       
  1409    @{thm (concl) PosOrd_StarsI[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2" and
       
  1410                             ?vs1.0="vs\<^sub>1" and ?vs2.0="vs\<^sub>2"]}\\
       
  1411   \end{tabular} 
       
  1412   \end{proposition}
       
  1413 
       
  1414   \noindent One might prefer that statements \textit{(4)} and \textit{(5)} 
       
  1415   (respectively \textit{(6)} and \textit{(7)})
       
  1416   are combined into a single \textit{iff}-statement (like the ones for \<open>Left\<close> and \<open>Right\<close>). Unfortunately this cannot be done easily: such
       
  1417   a single statement would require an additional assumption about the
       
  1418   two values @{term "Seq v\<^sub>1 v\<^sub>2"} and @{term "Seq w\<^sub>1 w\<^sub>2"}
       
  1419   being inhabited by the same regular expression. The
       
  1420   complexity of the proofs involved seems to not justify such a
       
  1421   `cleaner' single statement. The statements given are just the properties that
       
  1422   allow us to establish our theorems without any difficulty. The proofs 
       
  1423   for Proposition~\ref{ordintros} are routine.
       
  1424  
       
  1425 
       
  1426   Next we establish how Okui and Suzuki's orderings relate to our
       
  1427   definition of POSIX values.  Given a \<open>POSIX\<close> value \<open>v\<^sub>1\<close>
       
  1428   for \<open>r\<close> and \<open>s\<close>, then any other lexical value \<open>v\<^sub>2\<close> in @{term "LV r s"} is greater than or equal to \<open>v\<^sub>1\<close>, namely:
       
  1429 
       
  1430 
       
  1431   \begin{theorem}\label{orderone}
       
  1432   @{thm [mode=IfThen] Posix_PosOrd[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]}
       
  1433   \end{theorem}
       
  1434 
       
  1435   \begin{proof} By induction on our POSIX rules. By
       
  1436   Theorem~\ref{posixdeterm} and the definition of @{const LV}, it is clear
       
  1437   that \<open>v\<^sub>1\<close> and \<open>v\<^sub>2\<close> have the same
       
  1438   underlying string @{term s}.  The three base cases are
       
  1439   straightforward: for example for @{term "v\<^sub>1 = Void"}, we have
       
  1440   that @{term "v\<^sub>2 \<in> LV ONE []"} must also be of the form
       
  1441   \mbox{@{term "v\<^sub>2 = Void"}}. Therefore we have @{term
       
  1442   "v\<^sub>1 :\<sqsubseteq>val v\<^sub>2"}.  The inductive cases for
       
  1443   \<open>r\<close> being of the form @{term "ALT r\<^sub>1 r\<^sub>2"} and
       
  1444   @{term "SEQ r\<^sub>1 r\<^sub>2"} are as follows:
       
  1445 
       
  1446 
       
  1447   \begin{itemize} 
       
  1448 
       
  1449   \item[$\bullet$] Case \<open>P+L\<close> with @{term "s \<in> (ALT r\<^sub>1 r\<^sub>2)
       
  1450   \<rightarrow> (Left w\<^sub>1)"}: In this case the value 
       
  1451   @{term "v\<^sub>2"} is either of the
       
  1452   form @{term "Left w\<^sub>2"} or @{term "Right w\<^sub>2"}. In the
       
  1453   latter case we can immediately conclude with \mbox{@{term "v\<^sub>1
       
  1454   :\<sqsubseteq>val v\<^sub>2"}} since a \<open>Left\<close>-value with the
       
  1455   same underlying string \<open>s\<close> is always smaller than a
       
  1456   \<open>Right\<close>-value by Proposition~\ref{ordintros}\textit{(1)}.  
       
  1457   In the former case we have @{term "w\<^sub>2
       
  1458   \<in> LV r\<^sub>1 s"} and can use the induction hypothesis to infer
       
  1459   @{term "w\<^sub>1 :\<sqsubseteq>val w\<^sub>2"}. Because @{term
       
  1460   "w\<^sub>1"} and @{term "w\<^sub>2"} have the same underlying string
       
  1461   \<open>s\<close>, we can conclude with @{term "Left w\<^sub>1
       
  1462   :\<sqsubseteq>val Left w\<^sub>2"} using
       
  1463   Proposition~\ref{ordintros}\textit{(2)}.\smallskip
       
  1464 
       
  1465   \item[$\bullet$] Case \<open>P+R\<close> with @{term "s \<in> (ALT r\<^sub>1 r\<^sub>2)
       
  1466   \<rightarrow> (Right w\<^sub>1)"}: This case is similar to the previous
       
  1467   case, except that we additionally know @{term "s \<notin> L
       
  1468   r\<^sub>1"}. This is needed when @{term "v\<^sub>2"} is of the form
       
  1469   \mbox{@{term "Left w\<^sub>2"}}. Since \mbox{@{term "flat v\<^sub>2 = flat
       
  1470   w\<^sub>2"} \<open>= s\<close>} and @{term "\<Turnstile> w\<^sub>2 :
       
  1471   r\<^sub>1"}, we can derive a contradiction with \mbox{@{term "s \<notin> L
       
  1472   r\<^sub>1"}} using
       
  1473   Proposition~\ref{inhabs}. So also in this case \mbox{@{term "v\<^sub>1
       
  1474   :\<sqsubseteq>val v\<^sub>2"}}.\smallskip
       
  1475 
       
  1476   \item[$\bullet$] Case \<open>PS\<close> with @{term "(s\<^sub>1 @
       
  1477   s\<^sub>2) \<in> (SEQ r\<^sub>1 r\<^sub>2) \<rightarrow> (Seq
       
  1478   w\<^sub>1 w\<^sub>2)"}: We can assume @{term "v\<^sub>2 = Seq
       
  1479   (u\<^sub>1) (u\<^sub>2)"} with @{term "\<Turnstile> u\<^sub>1 :
       
  1480   r\<^sub>1"} and \mbox{@{term "\<Turnstile> u\<^sub>2 :
       
  1481   r\<^sub>2"}}. We have @{term "s\<^sub>1 @ s\<^sub>2 = (flat
       
  1482   u\<^sub>1) @ (flat u\<^sub>2)"}.  By the side-condition of the
       
  1483   \<open>PS\<close>-rule we know that either @{term "s\<^sub>1 = flat
       
  1484   u\<^sub>1"} or that @{term "flat u\<^sub>1"} is a strict prefix of
       
  1485   @{term "s\<^sub>1"}. In the latter case we can infer @{term
       
  1486   "w\<^sub>1 :\<sqsubset>val u\<^sub>1"} by
       
  1487   Proposition~\ref{ordlen}\textit{(2)} and from this @{term "v\<^sub>1
       
  1488   :\<sqsubseteq>val v\<^sub>2"} by Proposition~\ref{ordintros}\textit{(5)}
       
  1489   (as noted above @{term "v\<^sub>1"} and @{term "v\<^sub>2"} must have the
       
  1490   same underlying string).
       
  1491   In the former case we know
       
  1492   @{term "u\<^sub>1 \<in> LV r\<^sub>1 s\<^sub>1"} and @{term
       
  1493   "u\<^sub>2 \<in> LV r\<^sub>2 s\<^sub>2"}. With this we can use the
       
  1494   induction hypotheses to infer @{term "w\<^sub>1 :\<sqsubseteq>val
       
  1495   u\<^sub>1"} and @{term "w\<^sub>2 :\<sqsubseteq>val u\<^sub>2"}. By
       
  1496   Proposition~\ref{ordintros}\textit{(4,5)} we can again infer 
       
  1497   @{term "v\<^sub>1 :\<sqsubseteq>val
       
  1498   v\<^sub>2"}.
       
  1499 
       
  1500   \end{itemize}
       
  1501 
       
  1502   \noindent The case for \<open>P\<star>\<close> is similar to the \<open>PS\<close>-case and omitted.\qed
       
  1503   \end{proof}
       
  1504 
       
  1505   \noindent This theorem shows that our \<open>POSIX\<close> value for a
       
  1506   regular expression \<open>r\<close> and string @{term s} is in fact a
       
  1507   minimal element of the values in \<open>LV r s\<close>. By
       
  1508   Proposition~\ref{ordlen}\textit{(2)} we also know that any value in
       
  1509   \<open>LV r s'\<close>, with @{term "s'"} being a strict prefix, cannot be
       
  1510   smaller than \<open>v\<^sub>1\<close>. The next theorem shows the
       
  1511   opposite---namely any minimal element in @{term "LV r s"} must be a
       
  1512   \<open>POSIX\<close> value. This can be established by induction on \<open>r\<close>, but the proof can be drastically simplified by using the fact
       
  1513   from the previous section about the existence of a \<open>POSIX\<close> value
       
  1514   whenever a string @{term "s \<in> L r"}.
       
  1515 
       
  1516 
       
  1517   \begin{theorem}
       
  1518   @{thm [mode=IfThen] PosOrd_Posix[where ?v1.0="v\<^sub>1"]} 
       
  1519   \end{theorem}
       
  1520 
       
  1521   \begin{proof} 
       
  1522   If @{thm (prem 1) PosOrd_Posix[where ?v1.0="v\<^sub>1"]} then 
       
  1523   @{term "s \<in> L r"} by Proposition~\ref{inhabs}. Hence by Theorem~\ref{lexercorrect}(2) 
       
  1524   there exists a
       
  1525   \<open>POSIX\<close> value @{term "v\<^sub>P"} with @{term "s \<in> r \<rightarrow> v\<^sub>P"}
       
  1526   and by Lemma~\ref{LVposix} we also have \mbox{@{term "v\<^sub>P \<in> LV r s"}}.
       
  1527   By Theorem~\ref{orderone} we therefore have 
       
  1528   @{term "v\<^sub>P :\<sqsubseteq>val v\<^sub>1"}. If @{term "v\<^sub>P = v\<^sub>1"} then
       
  1529   we are done. Otherwise we have @{term "v\<^sub>P :\<sqsubset>val v\<^sub>1"}, which 
       
  1530   however contradicts the second assumption about @{term "v\<^sub>1"} being the smallest
       
  1531   element in @{term "LV r s"}. So we are done in this case too.\qed
       
  1532   \end{proof}
       
  1533 
       
  1534   \noindent
       
  1535   From this we can also show 
       
  1536   that if @{term "LV r s"} is non-empty (or equivalently @{term "s \<in> L r"}) then 
       
  1537   it has a unique minimal element:
       
  1538 
       
  1539   \begin{corollary}
       
  1540   @{thm [mode=IfThen] Least_existence1}
       
  1541   \end{corollary}
       
  1542 
       
  1543 
       
  1544 
       
  1545   \noindent To sum up, we have shown that the (unique) minimal elements 
       
  1546   of the ordering by Okui and Suzuki are exactly the \<open>POSIX\<close>
       
  1547   values we defined inductively in Section~\ref{posixsec}. This provides
       
  1548   an independent confirmation that our ternary relation formalises the
       
  1549   informal POSIX rules. 
       
  1550 
       
  1551 \<close>
       
  1552 
       
  1553 section \<open>Bitcoded Lexing\<close>
       
  1554 
       
  1555 
       
  1556 
       
  1557 
       
  1558 text \<open>
       
  1559 
       
  1560 Incremental calculation of the value. To simplify the proof we first define the function
       
  1561 @{const flex} which calculates the ``iterated'' injection function. With this we can 
       
  1562 rewrite the lexer as
       
  1563 
       
  1564 \begin{center}
       
  1565 @{thm lexer_flex}
       
  1566 \end{center}
       
  1567 
       
  1568 
       
  1569 \<close>
       
  1570 
       
  1571 section \<open>Optimisations\<close>
       
  1572 
       
  1573 text \<open>
       
  1574 
       
  1575   Derivatives as calculated by \Brz's method are usually more complex
       
  1576   regular expressions than the initial one; the result is that the
       
  1577   derivative-based matching and lexing algorithms are often abysmally slow.
       
  1578   However, various optimisations are possible, such as the simplifications
       
  1579   of @{term "ALT ZERO r"}, @{term "ALT r ZERO"}, @{term "SEQ ONE r"} and
       
  1580   @{term "SEQ r ONE"} to @{term r}. These simplifications can speed up the
       
  1581   algorithms considerably, as noted in \cite{Sulzmann2014}. One of the
       
  1582   advantages of having a simple specification and correctness proof is that
       
  1583   the latter can be refined to prove the correctness of such simplification
       
  1584   steps. While the simplification of regular expressions according to 
       
  1585   rules like
       
  1586 
       
  1587   \begin{equation}\label{Simpl}
       
  1588   \begin{array}{lcllcllcllcl}
       
  1589   @{term "ALT ZERO r"} & \<open>\<Rightarrow>\<close> & @{term r} \hspace{8mm}%\\
       
  1590   @{term "ALT r ZERO"} & \<open>\<Rightarrow>\<close> & @{term r} \hspace{8mm}%\\
       
  1591   @{term "SEQ ONE r"}  & \<open>\<Rightarrow>\<close> & @{term r} \hspace{8mm}%\\
       
  1592   @{term "SEQ r ONE"}  & \<open>\<Rightarrow>\<close> & @{term r}
       
  1593   \end{array}
       
  1594   \end{equation}
       
  1595 
       
  1596   \noindent is well understood, there is an obstacle with the POSIX value
       
  1597   calculation algorithm by Sulzmann and Lu: if we build a derivative regular
       
  1598   expression and then simplify it, we will calculate a POSIX value for this
       
  1599   simplified derivative regular expression, \emph{not} for the original (unsimplified)
       
  1600   derivative regular expression. Sulzmann and Lu \cite{Sulzmann2014} overcome this obstacle by
       
  1601   not just calculating a simplified regular expression, but also calculating
       
  1602   a \emph{rectification function} that ``repairs'' the incorrect value.
       
  1603   
       
  1604   The rectification functions can be (slightly clumsily) implemented in
       
  1605   Isabelle/HOL as follows using some auxiliary functions:
       
  1606 
       
  1607   \begin{center}
       
  1608   \begin{tabular}{lcl}
       
  1609   @{thm (lhs) F_RIGHT.simps(1)} & $\dn$ & \<open>Right (f v)\<close>\\
       
  1610   @{thm (lhs) F_LEFT.simps(1)} & $\dn$ & \<open>Left (f v)\<close>\\
       
  1611   
       
  1612   @{thm (lhs) F_ALT.simps(1)} & $\dn$ & \<open>Right (f\<^sub>2 v)\<close>\\
       
  1613   @{thm (lhs) F_ALT.simps(2)} & $\dn$ & \<open>Left (f\<^sub>1 v)\<close>\\
       
  1614   
       
  1615   @{thm (lhs) F_SEQ1.simps(1)} & $\dn$ & \<open>Seq (f\<^sub>1 ()) (f\<^sub>2 v)\<close>\\
       
  1616   @{thm (lhs) F_SEQ2.simps(1)} & $\dn$ & \<open>Seq (f\<^sub>1 v) (f\<^sub>2 ())\<close>\\
       
  1617   @{thm (lhs) F_SEQ.simps(1)} & $\dn$ & \<open>Seq (f\<^sub>1 v\<^sub>1) (f\<^sub>2 v\<^sub>2)\<close>\medskip\\
       
  1618   %\end{tabular}
       
  1619   %
       
  1620   %\begin{tabular}{lcl}
       
  1621   @{term "simp_ALT (ZERO, DUMMY) (r\<^sub>2, f\<^sub>2)"} & $\dn$ & @{term "(r\<^sub>2, F_RIGHT f\<^sub>2)"}\\
       
  1622   @{term "simp_ALT (r\<^sub>1, f\<^sub>1) (ZERO, DUMMY)"} & $\dn$ & @{term "(r\<^sub>1, F_LEFT f\<^sub>1)"}\\
       
  1623   @{term "simp_ALT (r\<^sub>1, f\<^sub>1) (r\<^sub>2, f\<^sub>2)"} & $\dn$ & @{term "(ALT r\<^sub>1 r\<^sub>2, F_ALT f\<^sub>1 f\<^sub>2)"}\\
       
  1624   @{term "simp_SEQ (ONE, f\<^sub>1) (r\<^sub>2, f\<^sub>2)"} & $\dn$ & @{term "(r\<^sub>2, F_SEQ1 f\<^sub>1 f\<^sub>2)"}\\
       
  1625   @{term "simp_SEQ (r\<^sub>1, f\<^sub>1) (ONE, f\<^sub>2)"} & $\dn$ & @{term "(r\<^sub>1, F_SEQ2 f\<^sub>1 f\<^sub>2)"}\\
       
  1626   @{term "simp_SEQ (r\<^sub>1, f\<^sub>1) (r\<^sub>2, f\<^sub>2)"} & $\dn$ & @{term "(SEQ r\<^sub>1 r\<^sub>2, F_SEQ f\<^sub>1 f\<^sub>2)"}\\
       
  1627   \end{tabular}
       
  1628   \end{center}
       
  1629 
       
  1630   \noindent
       
  1631   The functions \<open>simp\<^bsub>Alt\<^esub>\<close> and \<open>simp\<^bsub>Seq\<^esub>\<close> encode the simplification rules
       
  1632   in \eqref{Simpl} and compose the rectification functions (simplifications can occur
       
  1633   deep inside the regular expression). The main simplification function is then 
       
  1634 
       
  1635   \begin{center}
       
  1636   \begin{tabular}{lcl}
       
  1637   @{term "simp (ALT r\<^sub>1 r\<^sub>2)"} & $\dn$ & @{term "simp_ALT (simp r\<^sub>1) (simp r\<^sub>2)"}\\
       
  1638   @{term "simp (SEQ r\<^sub>1 r\<^sub>2)"} & $\dn$ & @{term "simp_SEQ (simp r\<^sub>1) (simp r\<^sub>2)"}\\
       
  1639   @{term "simp r"} & $\dn$ & @{term "(r, id)"}\\
       
  1640   \end{tabular}
       
  1641   \end{center} 
       
  1642 
       
  1643   \noindent where @{term "id"} stands for the identity function. The
       
  1644   function @{const simp} returns a simplified regular expression and a corresponding
       
  1645   rectification function. Note that we do not simplify under stars: this
       
  1646   seems to slow down the algorithm, rather than speed it up. The optimised
       
  1647   lexer is then given by the clauses:
       
  1648   
       
  1649   \begin{center}
       
  1650   \begin{tabular}{lcl}
       
  1651   @{thm (lhs) slexer.simps(1)} & $\dn$ & @{thm (rhs) slexer.simps(1)}\\
       
  1652   @{thm (lhs) slexer.simps(2)} & $\dn$ & 
       
  1653                          \<open>let (r\<^sub>s, f\<^sub>r) = simp (r \<close>$\backslash$\<open> c) in\<close>\\
       
  1654                      & & \<open>case\<close> @{term "slexer r\<^sub>s s"} \<open>of\<close>\\
       
  1655                      & & \phantom{$|$} @{term "None"}  \<open>\<Rightarrow>\<close> @{term None}\\
       
  1656                      & & $|$ @{term "Some v"} \<open>\<Rightarrow>\<close> \<open>Some (inj r c (f\<^sub>r v))\<close>                          
       
  1657   \end{tabular}
       
  1658   \end{center}
       
  1659 
       
  1660   \noindent
       
  1661   In the second clause we first calculate the derivative @{term "der c r"}
       
  1662   and then simplify the result.
       
  1663 
       
  1664 text \<open>
       
  1665 
       
  1666 Incremental calculation of the value. To simplify the proof we first define the function
       
  1667 @{const flex} which calculates the ``iterated'' injection function. With this we can 
       
  1668 rewrite the lexer as
       
  1669 
       
  1670 \begin{center}
       
  1671 @{thm lexer_flex}
       
  1672 \end{center}
       
  1673 
       
  1674 \begin{center}
       
  1675   \begin{tabular}{lcl}
       
  1676   @{thm (lhs) code.simps(1)} & $\dn$ & @{thm (rhs) code.simps(1)}\\
       
  1677   @{thm (lhs) code.simps(2)} & $\dn$ & @{thm (rhs) code.simps(2)}\\
       
  1678   @{thm (lhs) code.simps(3)} & $\dn$ & @{thm (rhs) code.simps(3)}\\
       
  1679   @{thm (lhs) code.simps(4)} & $\dn$ & @{thm (rhs) code.simps(4)}\\
       
  1680   @{thm (lhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]} & $\dn$ & @{thm (rhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]}\\
       
  1681   @{thm (lhs) code.simps(6)} & $\dn$ & @{thm (rhs) code.simps(6)}\\
       
  1682   @{thm (lhs) code.simps(7)} & $\dn$ & @{thm (rhs) code.simps(7)}
       
  1683 \end{tabular}
       
  1684 \end{center}
       
  1685 
       
  1686 \begin{center}
       
  1687 \begin{tabular}{lcl}
       
  1688   @{term areg} & $::=$ & @{term "AZERO"}\\
       
  1689                & $\mid$ & @{term "AONE bs"}\\
       
  1690                & $\mid$ & @{term "ACHAR bs c"}\\
       
  1691                & $\mid$ & @{term "AALT bs r1 r2"}\\
       
  1692                & $\mid$ & @{term "ASEQ bs r\<^sub>1 r\<^sub>2"}\\
       
  1693                & $\mid$ & @{term "ASTAR bs r"}
       
  1694 \end{tabular}
       
  1695 \end{center}
       
  1696 
       
  1697 
       
  1698 \begin{center}
       
  1699   \begin{tabular}{lcl}
       
  1700   @{thm (lhs) intern.simps(1)} & $\dn$ & @{thm (rhs) intern.simps(1)}\\
       
  1701   @{thm (lhs) intern.simps(2)} & $\dn$ & @{thm (rhs) intern.simps(2)}\\
       
  1702   @{thm (lhs) intern.simps(3)} & $\dn$ & @{thm (rhs) intern.simps(3)}\\
       
  1703   @{thm (lhs) intern.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) intern.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
       
  1704   @{thm (lhs) intern.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) intern.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
       
  1705   @{thm (lhs) intern.simps(6)} & $\dn$ & @{thm (rhs) intern.simps(6)}\\
       
  1706 \end{tabular}
       
  1707 \end{center}
       
  1708 
       
  1709 \begin{center}
       
  1710   \begin{tabular}{lcl}
       
  1711   @{thm (lhs) erase.simps(1)} & $\dn$ & @{thm (rhs) erase.simps(1)}\\
       
  1712   @{thm (lhs) erase.simps(2)[of bs]} & $\dn$ & @{thm (rhs) erase.simps(2)[of bs]}\\
       
  1713   @{thm (lhs) erase.simps(3)[of bs]} & $\dn$ & @{thm (rhs) erase.simps(3)[of bs]}\\
       
  1714   @{thm (lhs) erase.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) erase.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  1715   @{thm (lhs) erase.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) erase.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  1716   @{thm (lhs) erase.simps(6)[of bs]} & $\dn$ & @{thm (rhs) erase.simps(6)[of bs]}\\
       
  1717 \end{tabular}
       
  1718 \end{center}
       
  1719 
       
  1720 Some simple facts about the function @{const erase}:
       
  1721 
       
  1722 \begin{lemma}\mbox{}\\
       
  1723 @{thm erase_bder}\\
       
  1724 @{thm erase_intern}
       
  1725 \end{lemma}
       
  1726 
       
  1727 \begin{center}
       
  1728   \begin{tabular}{lcl}
       
  1729   @{thm (lhs) bnullable.simps(1)} & $\dn$ & @{thm (rhs) bnullable.simps(1)}\\
       
  1730   @{thm (lhs) bnullable.simps(2)} & $\dn$ & @{thm (rhs) bnullable.simps(2)}\\
       
  1731   @{thm (lhs) bnullable.simps(3)} & $\dn$ & @{thm (rhs) bnullable.simps(3)}\\
       
  1732   @{thm (lhs) bnullable.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bnullable.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  1733   @{thm (lhs) bnullable.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bnullable.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  1734   @{thm (lhs) bnullable.simps(6)} & $\dn$ & @{thm (rhs) bnullable.simps(6)}\medskip\\
       
  1735 
       
  1736 %  \end{tabular}
       
  1737 %  \end{center}
       
  1738 
       
  1739 %  \begin{center}
       
  1740 %  \begin{tabular}{lcl}
       
  1741 
       
  1742   @{thm (lhs) bder.simps(1)} & $\dn$ & @{thm (rhs) bder.simps(1)}\\
       
  1743   @{thm (lhs) bder.simps(2)} & $\dn$ & @{thm (rhs) bder.simps(2)}\\
       
  1744   @{thm (lhs) bder.simps(3)} & $\dn$ & @{thm (rhs) bder.simps(3)}\\
       
  1745   @{thm (lhs) bder.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bder.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  1746   @{thm (lhs) bder.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bder.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  1747   @{thm (lhs) bder.simps(6)} & $\dn$ & @{thm (rhs) bder.simps(6)}
       
  1748   \end{tabular}
       
  1749   \end{center}
       
  1750 
       
  1751 
       
  1752 \begin{center}
       
  1753   \begin{tabular}{lcl}
       
  1754   @{thm (lhs) bmkeps.simps(1)} & $\dn$ & @{thm (rhs) bmkeps.simps(1)}\\
       
  1755   @{thm (lhs) bmkeps.simps(2)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bmkeps.simps(2)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  1756   @{thm (lhs) bmkeps.simps(3)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bmkeps.simps(3)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  1757   @{thm (lhs) bmkeps.simps(4)} & $\dn$ & @{thm (rhs) bmkeps.simps(4)}\medskip\\
       
  1758 \end{tabular}
       
  1759 \end{center}
       
  1760 
       
  1761 
       
  1762 @{thm [mode=IfThen] bder_retrieve}
       
  1763 
       
  1764 \noindent The fact above is proved by induction on \<open>r\<close>.
       
  1765 
       
  1766 \begin{theorem}[Main Lemma]\mbox{}\\
       
  1767 @{thm [mode=IfThen] MAIN_decode}
       
  1768 \end{theorem}
       
  1769 
       
  1770 \noindent
       
  1771 Definition of the bitcoded lexer
       
  1772 
       
  1773 @{thm blexer_def}
       
  1774 
       
  1775 
       
  1776 \begin{theorem}
       
  1777 @{thm blexer_correctness}
       
  1778 \end{theorem}
       
  1779 
       
  1780 \<close>
       
  1781 
       
       
  1946 fy the result. This gives us a simplified derivative
       
  1947   \<open>r\<^sub>s\<close> and a rectification function \<open>f\<^sub>r\<close>. The lexer
       
  1948   is then recursively called with the simplified derivative, but before
       
  1949   we inject the character @{term c} into the value @{term v}, we need to rectify
       
  1950   @{term v} (that is construct @{term "f\<^sub>r v"}). Before we can establish the correctness
       
  1951   of @{term "slexer"}, we need to show that simplification preserves the language
       
  1952   and simplification preserves our POSIX relation once the value is rectified
       
  1953   (recall @{const "simp"} generates a (regular expression, rectification function) pair):
       
  1954 
       
  1955   \begin{lemma}\mbox{}\smallskip\\\label{slexeraux}
       
  1956   \begin{tabular}{ll}
       
  1957   (1) & @{thm L_fst_simp[symmetric]}\\
       
  1958   (2) & @{thm[mode=IfThen] Posix_simp}
       
  1959   \end{tabular}
       
  1960   \end{lemma}
       
  1961 
       
  1962   \begin{proof} Both are by induction on \<open>r\<close>. There is no
       
  1963   interesting case for the first statement. For the second statement,
       
  1964   of interest are the @{term "r = ALT r\<^sub>1 r\<^sub>2"} and @{term "r = SEQ r\<^sub>1
       
  1965   r\<^sub>2"} cases. In each case we have to analyse four subcases whether
       
  1966   @{term "fst (simp r\<^sub>1)"} and @{term "fst (simp r\<^sub>2)"} equals @{const
       
  1967   ZERO} (respectively @{const ONE}). For example for @{term "r = ALT
       
  1968   r\<^sub>1 r\<^sub>2"}, consider the subcase @{term "fst (simp r\<^sub>1) = ZERO"} and
       
  1969   @{term "fst (simp r\<^sub>2) \<noteq> ZERO"}. By assumption we know @{term "s \<in>
       
  1970   fst (simp (ALT r\<^sub>1 r\<^sub>2)) \<rightarrow> v"}. From this we can infer @{term "s \<in> fst (simp r\<^sub>2) \<rightarrow> v"}
       
  1971   and by IH also (*) @{term "s \<in> r\<^sub>2 \<rightarrow> (snd (simp r\<^sub>2) v)"}. Given @{term "fst (simp r\<^sub>1) = ZERO"}
       
  1972   we know @{term "L (fst (simp r\<^sub>1)) = {}"}. By the first statement
       
  1973   @{term "L r\<^sub>1"} is the empty set, meaning (**) @{term "s \<notin> L r\<^sub>1"}.
       
  1974   Taking (*) and (**) together gives by the \mbox{\<open>P+R\<close>}-rule 
       
  1975   @{term "s \<in> ALT r\<^sub>1 r\<^sub>2 \<rightarrow> Right (snd (simp r\<^sub>2) v)"}. In turn this
       
  1976   gives @{term "s \<in> ALT r\<^sub>1 r\<^sub>2 \<rightarrow> snd (simp (ALT r\<^sub>1 r\<^sub>2)) v"} as we need to show.
       
  1977   The other cases are similar.\qed
       
  1978   \end{proof}
       
  1979 
       
  1980   \noindent We can now prove relatively straightforwardly that the
       
  1981   optimised lexer produces the expected result:
       
  1982 
       
  1983   \begin{theorem}
       
  1984   @{thm slexer_correctness}
       
  1985   \end{theorem}
       
  1986 
       
  1987   \begin{proof} By induction on @{term s} generalising over @{term
       
  1988   r}. The case @{term "[]"} is trivial. For the cons-case suppose the
       
  1989   string is of the form @{term "c # s"}. By induction hypothesis we
       
  1990   know @{term "slexer r s = lexer r s"} holds for all @{term r} (in
       
  1991   particular for @{term "r"} being the derivative @{term "der c
       
  1992   r"}). Let @{term "r\<^sub>s"} be the simplified derivative regular expression, that is @{term
       
  1993   "fst (simp (der c r))"}, and @{term "f\<^sub>r"} be the rectification
       
  1994   function, that is @{term "snd (simp (der c r))"}.  We distinguish the cases
       
  1995   whether (*) @{term "s \<in> L (der c r)"} or not. In the first case we
       
  1996   have by Theorem~\ref{lexercorrect}(2) a value @{term "v"} so that @{term
       
  1997   "lexer (der c r) s = Some v"} and @{term "s \<in> der c r \<rightarrow> v"} hold.
       
  1998   By Lemma~\ref{slexeraux}(1) we can also infer from~(*) that @{term "s
       
  1999   \<in> L r\<^sub>s"} holds.  Hence we know by Theorem~\ref{lexercorrect}(2) that
       
  2000   there exists a @{term "v'"} with @{term "lexer r\<^sub>s s = Some v'"} and
       
  2001   @{term "s \<in> r\<^sub>s \<rightarrow> v'"}. From the latter we know by
       
  2002   Lemma~\ref{slexeraux}(2) that @{term "s \<in> der c r \<rightarrow> (f\<^sub>r v')"} holds.
       
  2003   By the uniqueness of the POSIX relation (Theorem~\ref{posixdeterm}) we
       
  2004   can infer that @{term v} is equal to @{term "f\<^sub>r v'"}---that is the 
       
  2005   rectification function applied to @{term "v'"}
       
  2006   produces the original @{term "v"}.  Now the case follows by the
       
  2007   definitions of @{const lexer} and @{const slexer}.
       
  2008 
       
  2009   In the second case where @{term "s \<notin> L (der c r)"} we have that
       
  2010   @{term "lexer (der c r) s = None"} by Theorem~\ref{lexercorrect}(1).  We
       
  2011   also know by Lemma~\ref{slexeraux}(1) that @{term "s \<notin> L r\<^sub>s"}. Hence
       
  2012   @{term "lexer r\<^sub>s s = None"} by Theorem~\ref{lexercorrect}(1) and
       
  2013   by IH then also @{term "slexer r\<^sub>s s = None"}. With this we can
       
  2014   conclude in this case too.\qed   
       
  2015 
       
  2016   \end{proof} 
       
  2017 
       
  2018 \<close>
       
  2019 
       
  2020 
       
  2021 section \<open>Bitcoded Lexing\<close>
       
  2022 
       
  2023 text \<open>
       
  2024 
       
  2025   \begin{lemma}
       
  2026   @{thm [mode=IfThen] bder_retrieve}
       
  2027   \end{lemma}
       
  2028 
       
  2029   \begin{proof}
       
  2030   By induction on the definition of @{term "erase r"}. The cases for rule 1) and 2) are
       
  2031   straightforward as @{term "der c ZERO"} and @{term "der c ONE"} are both equal to 
       
  2032   @{term ZERO}. This means @{term "\<Turnstile> v : ZERO"} cannot hold. Similarly in case of rule 3)
       
  2033   where @{term r} is of the form @{term "ACHAR d"} with @{term "c = d"}. Then by assumption
       
  2034   we know @{term "\<Turnstile> v : ONE"}, which implies @{term "v = Void"}. The equation follows by 
       
  2035   simplification of left- and right-hand side. In case @{term "c \<noteq> d"} we have again
       
  2036   @{term "\<Turnstile> v : ZERO"}, which cannot hold. 
       
  2037 
       
  2038   For rule 4a) we have again @{term "\<Turnstile> v : ZERO"}. The property holds by IH for rule 4b).
       
  2039   The induction hypothesis is 
       
  2040   \[
       
  2041   @{term "retrieve (bder c r) v = retrieve r (injval (erase r) c v)"}
       
  2042   \]
       
  2043   which is what left- and right-hand side simplify to.  The slightly more interesting case
       
  2044   is for 4c). By assumption  we have 
       
  2045   @{term "\<Turnstile> v : ALT (der c (erase r\<^sub>1)) (der c (erase (AALTs bs (r\<^sub>2 # rs))))"}. This means we 
       
  2046   have either (*) @{term "\<Turnstile> v1 : der c (erase r\<^sub>1)"} with @{term "v = Left v1"} or
       
  2047   (**) @{term "\<Turnstile> v2 : der c (erase (AALTs bs (r\<^sub>2 # rs)))"} with @{term "v = Right v2"}.
       
  2048   The former case is straightforward by simplification. The second case is \ldots TBD.
       
  2049 
       
  2050   Rule 5) TBD.
       
  2051 
       
  2052   Finally for rule 6) the reasoning is as follows: By assumption we have
       
  2053   @{term "\<Turnstile> v : SEQ (der c (erase r)) (STAR (erase r))"}. This means we also have
       
  2054   @{term "v = Seq v1 v2"}, @{term "\<Turnstile> v1 : der c (erase r)"}  and @{term "v2 = Stars vs"}.
       
  2055   We want to prove
       
  2056   \begin{align}
       
  2057   & @{term "retrieve (ASEQ bs (fuse [Z] (bder c r)) (ASTAR [] r)) v"}\\
       
  2058   &= @{term "retrieve (ASTAR bs r) (injval (STAR (erase r)) c v)"}
       
  2059   \end{align}
       
  2060   The right-hand side @{term inj}-expression is equal to 
       
  2061   @{term "Stars (injval (erase r) c v1 # vs)"}, which means the @{term  retrieve}-expression
       
  2062   simplifies to 
       
  2063   \[
       
  2064   @{term "bs @ [Z] @ retrieve r (injval (erase r) c v1) @ retrieve (ASTAR [] r) (Stars vs)"}
       
  2065   \]
       
  2066   The left-hand side (3) above simplifies to 
       
  2067   \[
       
  2068   @{term "bs @ retrieve (fuse [Z] (bder c r)) v1 @ retrieve (ASTAR [] r) (Stars vs)"} 
       
  2069   \]
       
  2070   We can move out the @{term "fuse [Z]"} and then use the IH to show that left-hand side
       
  2071   and right-hand side are equal. This completes the proof. 
       
  2072   \end{proof}   
       
  2073 
       
  2074    
       
  2075 
       
  2076   \bibliographystyle{plain}
       
  2077   \bibliography{root}
       
  2078 
       
  2079 \<close>
       
  2080 (*<*)
       
  2081 end
       
  2082 (*>*)
       
  2083 
       
  2084 
       
  2085 
       
  2086 (*
       
  2087 
       
  2088 \begin{center}
       
  2089   \begin{tabular}{c}
       
  2090     @{thm[mode=Rule] aggressive.intros(1)[of "bs" "bs1" "rs" "r"]}
       
  2091   \end{tabular}
       
  2092 \end{center}
       
  2093 
       
  2094 
       
  2095 
       
  2096 \begin{center}
       
  2097   \begin{tabular}{lcl}
       
  2098   @{thm (lhs) code.simps(1)} & $\dn$ & @{thm (rhs) code.simps(1)}\\
       
  2099   @{thm (lhs) code.simps(2)} & $\dn$ & @{thm (rhs) code.simps(2)}\\
       
  2100   @{thm (lhs) code.simps(3)} & $\dn$ & @{thm (rhs) code.simps(3)}\\
       
  2101   @{thm (lhs) code.simps(4)} & $\dn$ & @{thm (rhs) code.simps(4)}\\
       
  2102   @{thm (lhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]} & $\dn$ & @{thm (rhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]}\\
       
  2103   @{thm (lhs) code.simps(6)} & $\dn$ & @{thm (rhs) code.simps(6)}\\
       
  2104   @{thm (lhs) code.simps(7)} & $\dn$ & @{thm (rhs) code.simps(7)}
       
  2105 \end{tabular}
       
  2106 \end{center}
       
  2107 
       
  2108 \begin{center}
       
  2109 \begin{tabular}{lcl}
       
  2110   @{term areg} & $::=$ & @{term "AZERO"}\\
       
  2111                & $\mid$ & @{term "AONE bs"}\\
       
  2112                & $\mid$ & @{term "ACHAR bs c"}\\
       
  2113                & $\mid$ & @{term "AALT bs r1 r2"}\\
       
  2114                & $\mid$ & @{term "ASEQ bs r\<^sub>1 r\<^sub>2"}\\
       
  2115                & $\mid$ & @{term "ASTAR bs r"}
       
  2116 \end{tabular}
       
  2117 \end{center}
       
  2118 
       
  2119 
       
  2120 \begin{center}
       
  2121   \begin{tabular}{lcl}
       
  2122   @{thm (lhs) intern.simps(1)} & $\dn$ & @{thm (rhs) intern.simps(1)}\\
       
  2123   @{thm (lhs) intern.simps(2)} & $\dn$ & @{thm (rhs) intern.simps(2)}\\
       
  2124   @{thm (lhs) intern.simps(3)} & $\dn$ & @{thm (rhs) intern.simps(3)}\\
       
  2125   @{thm (lhs) intern.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) intern.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
       
  2126   @{thm (lhs) intern.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) intern.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
       
  2127   @{thm (lhs) intern.simps(6)} & $\dn$ & @{thm (rhs) intern.simps(6)}\\
       
  2128 \end{tabular}
       
  2129 \end{center}
       
  2130 
       
  2131 \begin{center}
       
  2132   \begin{tabular}{lcl}
       
  2133   @{thm (lhs) erase.simps(1)} & $\dn$ & @{thm (rhs) erase.simps(1)}\\
       
  2134   @{thm (lhs) erase.simps(2)[of bs]} & $\dn$ & @{thm (rhs) erase.simps(2)[of bs]}\\
       
  2135   @{thm (lhs) erase.simps(3)[of bs]} & $\dn$ & @{thm (rhs) erase.simps(3)[of bs]}\\
       
  2136   @{thm (lhs) erase.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) erase.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  2137   @{thm (lhs) erase.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) erase.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  2138   @{thm (lhs) erase.simps(6)[of bs]} & $\dn$ & @{thm (rhs) erase.simps(6)[of bs]}\\
       
  2139 \end{tabular}
       
  2140 \end{center}
       
  2141 
       
  2142 Some simple facts about erase
       
  2143 
       
  2144 \begin{lemma}\mbox{}\\
       
  2145 @{thm erase_bder}\\
       
  2146 @{thm erase_intern}
       
  2147 \end{lemma}
       
  2148 
       
  2149 \begin{center}
       
  2150   \begin{tabular}{lcl}
       
  2151   @{thm (lhs) bnullable.simps(1)} & $\dn$ & @{thm (rhs) bnullable.simps(1)}\\
       
  2152   @{thm (lhs) bnullable.simps(2)} & $\dn$ & @{thm (rhs) bnullable.simps(2)}\\
       
  2153   @{thm (lhs) bnullable.simps(3)} & $\dn$ & @{thm (rhs) bnullable.simps(3)}\\
       
  2154   @{thm (lhs) bnullable.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bnullable.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  2155   @{thm (lhs) bnullable.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bnullable.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  2156   @{thm (lhs) bnullable.simps(6)} & $\dn$ & @{thm (rhs) bnullable.simps(6)}\medskip\\
       
  2157 
       
  2158 %  \end{tabular}
       
  2159 %  \end{center}
       
  2160 
       
  2161 %  \begin{center}
       
  2162 %  \begin{tabular}{lcl}
       
  2163 
       
  2164   @{thm (lhs) bder.simps(1)} & $\dn$ & @{thm (rhs) bder.simps(1)}\\
       
  2165   @{thm (lhs) bder.simps(2)} & $\dn$ & @{thm (rhs) bder.simps(2)}\\
       
  2166   @{thm (lhs) bder.simps(3)} & $\dn$ & @{thm (rhs) bder.simps(3)}\\
       
  2167   @{thm (lhs) bder.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bder.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  2168   @{thm (lhs) bder.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bder.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  2169   @{thm (lhs) bder.simps(6)} & $\dn$ & @{thm (rhs) bder.simps(6)}
       
  2170   \end{tabular}
       
  2171   \end{center}
       
  2172 
       
  2173 
       
  2174 \begin{center}
       
  2175   \begin{tabular}{lcl}
       
  2176   @{thm (lhs) bmkeps.simps(1)} & $\dn$ & @{thm (rhs) bmkeps.simps(1)}\\
       
  2177   @{thm (lhs) bmkeps.simps(2)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bmkeps.simps(2)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  2178   @{thm (lhs) bmkeps.simps(3)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bmkeps.simps(3)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  2179   @{thm (lhs) bmkeps.simps(4)} & $\dn$ & @{thm (rhs) bmkeps.simps(4)}\medskip\\
       
  2180 \end{tabular}
       
  2181 \end{center}
       
  2182 
       
  2183 
       
  2184 @{thm [mode=IfThen] bder_retrieve}
       
  2185 
       
  2186 By induction on \<open>r\<close>
       
  2187 
       
  2188 \begin{theorem}[Main Lemma]\mbox{}\\
       
  2189 @{thm [mode=IfThen] MAIN_decode}
       
  2190 \end{theorem}
       
  2191 
       
  2192 \noindent
       
  2193 Definition of the bitcoded lexer
       
  2194 
       
  2195 @{thm blexer_def}
       
  2196 
       
  2197 
       
  2198 \begin{theorem}
       
  2199 @{thm blexer_correctness}
       
  2200 \end{theorem}
       
  2201 
       
  2202 
       
  2203 
       
  2204 
       
  2205 
       
  2206 \begin{center}
       
  2207   \begin{tabular}{lcl}
       
  2208   @{thm (lhs) code.simps(1)} & $\dn$ & @{thm (rhs) code.simps(1)}\\
       
  2209   @{thm (lhs) code.simps(2)} & $\dn$ & @{thm (rhs) code.simps(2)}\\
       
  2210   @{thm (lhs) code.simps(3)} & $\dn$ & @{thm (rhs) code.simps(3)}\\
       
  2211   @{thm (lhs) code.simps(4)} & $\dn$ & @{thm (rhs) code.simps(4)}\\
       
  2212   @{thm (lhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]} & $\dn$ & @{thm (rhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]}\\
       
  2213   @{thm (lhs) code.simps(6)} & $\dn$ & @{thm (rhs) code.simps(6)}\\
       
  2214   @{thm (lhs) code.simps(7)} & $\dn$ & @{thm (rhs) code.simps(7)}
       
  2215 \end{tabular}
       
  2216 \end{center}
       
  2217 
       
  2218 \begin{center}
       
  2219 \begin{tabular}{lcl}
       
  2220   @{term areg} & $::=$ & @{term "AZERO"}\\
       
  2221                & $\mid$ & @{term "AONE bs"}\\
       
  2222                & $\mid$ & @{term "ACHAR bs c"}\\
       
  2223                & $\mid$ & @{term "AALT bs r1 r2"}\\
       
  2224                & $\mid$ & @{term "ASEQ bs r\<^sub>1 r\<^sub>2"}\\
       
  2225                & $\mid$ & @{term "ASTAR bs r"}
       
  2226 \end{tabular}
       
  2227 \end{center}
       
  2228 
       
  2229 
       
  2230 \begin{center}
       
  2231   \begin{tabular}{lcl}
       
  2232   @{thm (lhs) intern.simps(1)} & $\dn$ & @{thm (rhs) intern.simps(1)}\\
       
  2233   @{thm (lhs) intern.simps(2)} & $\dn$ & @{thm (rhs) intern.simps(2)}\\
       
  2234   @{thm (lhs) intern.simps(3)} & $\dn$ & @{thm (rhs) intern.simps(3)}\\
       
  2235   @{thm (lhs) intern.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) intern.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
       
  2236   @{thm (lhs) intern.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) intern.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
       
  2237   @{thm (lhs) intern.simps(6)} & $\dn$ & @{thm (rhs) intern.simps(6)}\\
       
  2238 \end{tabular}
       
  2239 \end{center}
       
  2240 
       
  2241 \begin{center}
       
  2242   \begin{tabular}{lcl}
       
  2243   @{thm (lhs) erase.simps(1)} & $\dn$ & @{thm (rhs) erase.simps(1)}\\
       
  2244   @{thm (lhs) erase.simps(2)[of bs]} & $\dn$ & @{thm (rhs) erase.simps(2)[of bs]}\\
       
  2245   @{thm (lhs) erase.simps(3)[of bs]} & $\dn$ & @{thm (rhs) erase.simps(3)[of bs]}\\
       
  2246   @{thm (lhs) erase.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) erase.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  2247   @{thm (lhs) erase.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) erase.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  2248   @{thm (lhs) erase.simps(6)[of bs]} & $\dn$ & @{thm (rhs) erase.simps(6)[of bs]}\\
       
  2249 \end{tabular}
       
  2250 \end{center}
       
  2251 
       
  2252 Some simple facts about erase
       
  2253 
       
  2254 \begin{lemma}\mbox{}\\
       
  2255 @{thm erase_bder}\\
       
  2256 @{thm erase_intern}
       
  2257 \end{lemma}
       
  2258 
       
  2259 \begin{center}
       
  2260   \begin{tabular}{lcl}
       
  2261   @{thm (lhs) bnullable.simps(1)} & $\dn$ & @{thm (rhs) bnullable.simps(1)}\\
       
  2262   @{thm (lhs) bnullable.simps(2)} & $\dn$ & @{thm (rhs) bnullable.simps(2)}\\
       
  2263   @{thm (lhs) bnullable.simps(3)} & $\dn$ & @{thm (rhs) bnullable.simps(3)}\\
       
  2264   @{thm (lhs) bnullable.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bnullable.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  2265   @{thm (lhs) bnullable.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bnullable.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  2266   @{thm (lhs) bnullable.simps(6)} & $\dn$ & @{thm (rhs) bnullable.simps(6)}\medskip\\
       
  2267 
       
  2268 %  \end{tabular}
       
  2269 %  \end{center}
       
  2270 
       
  2271 %  \begin{center}
       
  2272 %  \begin{tabular}{lcl}
       
  2273 
       
  2274   @{thm (lhs) bder.simps(1)} & $\dn$ & @{thm (rhs) bder.simps(1)}\\
       
  2275   @{thm (lhs) bder.simps(2)} & $\dn$ & @{thm (rhs) bder.simps(2)}\\
       
  2276   @{thm (lhs) bder.simps(3)} & $\dn$ & @{thm (rhs) bder.simps(3)}\\
       
  2277   @{thm (lhs) bder.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bder.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  2278   @{thm (lhs) bder.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bder.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  2279   @{thm (lhs) bder.simps(6)} & $\dn$ & @{thm (rhs) bder.simps(6)}
       
  2280   \end{tabular}
       
  2281   \end{center}
       
  2282 
       
  2283 
       
  2284 \begin{center}
       
  2285   \begin{tabular}{lcl}
       
  2286   @{thm (lhs) bmkeps.simps(1)} & $\dn$ & @{thm (rhs) bmkeps.simps(1)}\\
       
  2287   @{thm (lhs) bmkeps.simps(2)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bmkeps.simps(2)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  2288   @{thm (lhs) bmkeps.simps(3)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bmkeps.simps(3)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
       
  2289   @{thm (lhs) bmkeps.simps(4)} & $\dn$ & @{thm (rhs) bmkeps.simps(4)}\medskip\\
       
  2290 \end{tabular}
       
  2291 \end{center}
       
  2292 
       
  2293 
       
  2294 @{thm [mode=IfThen] bder_retrieve}
       
  2295 
       
  2296 By induction on \<open>r\<close>
       
  2297 
       
  2298 \begin{theorem}[Main Lemma]\mbox{}\\
       
  2299 @{thm [mode=IfThen] MAIN_decode}
       
  2300 \end{theorem}
       
  2301 
       
  2302 \noindent
       
  2303 Definition of the bitcoded lexer
       
  2304 
       
  2305 @{thm blexer_def}
       
  2306 
       
  2307 
       
  2308 \begin{theorem}
       
  2309 @{thm blexer_correctness}
       
  2310 \end{theorem}
       
  2311 
       
  2312 \<close>
       
  2313 \<close>*)