thys/Journal/Paper.thy
changeset 218 16af5b8bd285
child 265 d36be1e356c0
equal deleted inserted replaced
217:47179a172c54 218:16af5b8bd285
       
     1 (*<*)
       
     2 theory Paper
       
     3 imports 
       
     4    "../Lexer"
       
     5    "../Simplifying" 
       
     6    "../Sulzmann" 
       
     7    "~~/src/HOL/Library/LaTeXsugar"
       
     8 begin
       
     9 
       
    10 declare [[show_question_marks = false]]
       
    11 
       
    12 abbreviation 
       
    13  "der_syn r c \<equiv> der c r"
       
    14 
       
    15 abbreviation 
       
    16  "ders_syn r s \<equiv> ders s r"
       
    17 
       
    18 notation (latex output)
       
    19   If  ("(\<^raw:\textrm{>if\<^raw:}> (_)/ \<^raw:\textrm{>then\<^raw:}> (_)/ \<^raw:\textrm{>else\<^raw:}> (_))" 10) and
       
    20   Cons ("_\<^raw:\mbox{$\,$}>::\<^raw:\mbox{$\,$}>_" [75,73] 73) and  
       
    21 
       
    22   ZERO ("\<^bold>0" 78) and 
       
    23   ONE ("\<^bold>1" 78) and 
       
    24   CHAR ("_" [1000] 80) and
       
    25   ALT ("_ + _" [77,77] 78) and
       
    26   SEQ ("_ \<cdot> _" [77,77] 78) and
       
    27   STAR ("_\<^sup>\<star>" [1000] 78) and
       
    28   
       
    29   val.Void ("'(')" 1000) and
       
    30   val.Char ("Char _" [1000] 78) and
       
    31   val.Left ("Left _" [79] 78) and
       
    32   val.Right ("Right _" [1000] 78) and
       
    33   val.Seq ("Seq _ _" [79,79] 78) and
       
    34   val.Stars ("Stars _" [79] 78) and
       
    35 
       
    36   L ("L'(_')" [10] 78) and
       
    37   der_syn ("_\\_" [79, 1000] 76) and  
       
    38   ders_syn ("_\\_" [79, 1000] 76) and
       
    39   flat ("|_|" [75] 74) and
       
    40   Sequ ("_ @ _" [78,77] 63) and
       
    41   injval ("inj _ _ _" [79,77,79] 76) and 
       
    42   mkeps ("mkeps _" [79] 76) and 
       
    43   length ("len _" [73] 73) and
       
    44  
       
    45   Prf ("_ : _" [75,75] 75) and  
       
    46   Posix ("'(_, _') \<rightarrow> _" [63,75,75] 75) and
       
    47  
       
    48   lexer ("lexer _ _" [78,78] 77) and 
       
    49   F_RIGHT ("F\<^bsub>Right\<^esub> _") and
       
    50   F_LEFT ("F\<^bsub>Left\<^esub> _") and  
       
    51   F_ALT ("F\<^bsub>Alt\<^esub> _ _") and
       
    52   F_SEQ1 ("F\<^bsub>Seq1\<^esub> _ _") and
       
    53   F_SEQ2 ("F\<^bsub>Seq2\<^esub> _ _") and
       
    54   F_SEQ ("F\<^bsub>Seq\<^esub> _ _") and
       
    55   simp_SEQ ("simp\<^bsub>Seq\<^esub> _ _" [1000, 1000] 1) and
       
    56   simp_ALT ("simp\<^bsub>Alt\<^esub> _ _" [1000, 1000] 1) and
       
    57   slexer ("lexer\<^sup>+" 1000) and
       
    58 
       
    59   ValOrd ("_ >\<^bsub>_\<^esub> _" [77,77,77] 77) and
       
    60   ValOrdEq ("_ \<ge>\<^bsub>_\<^esub> _" [77,77,77] 77)
       
    61 
       
    62 definition 
       
    63   "match r s \<equiv> nullable (ders s r)"
       
    64 
       
    65 (*
       
    66 comments not implemented
       
    67 
       
    68 p9. The condtion "not exists s3 s4..." appears often enough (in particular in
       
    69 the proof of Lemma 3) to warrant a definition.
       
    70 
       
    71 *)
       
    72 
       
    73 (*>*)
       
    74 
       
    75 section {* Introduction *}
       
    76 
       
    77 
       
    78 text {*
       
    79 
       
    80 Brzozowski \cite{Brzozowski1964} introduced the notion of the {\em
       
    81 derivative} @{term "der c r"} of a regular expression @{text r} w.r.t.\ a
       
    82 character~@{text c}, and showed that it gave a simple solution to the
       
    83 problem of matching a string @{term s} with a regular expression @{term r}:
       
    84 if the derivative of @{term r} w.r.t.\ (in succession) all the characters of
       
    85 the string matches the empty string, then @{term r} matches @{term s} (and
       
    86 {\em vice versa}). The derivative has the property (which may almost be
       
    87 regarded as its specification) that, for every string @{term s} and regular
       
    88 expression @{term r} and character @{term c}, one has @{term "cs \<in> L(r)"} if
       
    89 and only if \mbox{@{term "s \<in> L(der c r)"}}. The beauty of Brzozowski's
       
    90 derivatives is that they are neatly expressible in any functional language,
       
    91 and easily definable and reasoned about in theorem provers---the definitions
       
    92 just consist of inductive datatypes and simple recursive functions. A
       
    93 mechanised correctness proof of Brzozowski's matcher in for example HOL4
       
    94 has been mentioned by Owens and Slind~\cite{Owens2008}. Another one in Isabelle/HOL is part
       
    95 of the work by Krauss and Nipkow \cite{Krauss2011}. And another one in Coq is given
       
    96 by Coquand and Siles \cite{Coquand2012}.
       
    97 
       
    98 If a regular expression matches a string, then in general there is more than
       
    99 one way of how the string is matched. There are two commonly used
       
   100 disambiguation strategies to generate a unique answer: one is called GREEDY
       
   101 matching \cite{Frisch2004} and the other is POSIX
       
   102 matching~\cite{Kuklewicz,Sulzmann2014,Vansummeren2006}. For example consider
       
   103 the string @{term xy} and the regular expression \mbox{@{term "STAR (ALT
       
   104 (ALT x y) xy)"}}. Either the string can be matched in two `iterations' by
       
   105 the single letter-regular expressions @{term x} and @{term y}, or directly
       
   106 in one iteration by @{term xy}. The first case corresponds to GREEDY
       
   107 matching, which first matches with the left-most symbol and only matches the
       
   108 next symbol in case of a mismatch (this is greedy in the sense of preferring
       
   109 instant gratification to delayed repletion). The second case is POSIX
       
   110 matching, which prefers the longest match.
       
   111 
       
   112 In the context of lexing, where an input string needs to be split up into a
       
   113 sequence of tokens, POSIX is the more natural disambiguation strategy for
       
   114 what programmers consider basic syntactic building blocks in their programs.
       
   115 These building blocks are often specified by some regular expressions, say
       
   116 @{text "r\<^bsub>key\<^esub>"} and @{text "r\<^bsub>id\<^esub>"} for recognising keywords and
       
   117 identifiers, respectively. There are two underlying (informal) rules behind
       
   118 tokenising a string in a POSIX fashion according to a collection of regular
       
   119 expressions:
       
   120 
       
   121 \begin{itemize} 
       
   122 \item[$\bullet$] \emph{The Longest Match Rule} (or \emph{``maximal munch rule''}):
       
   123 The longest initial substring matched by any regular expression is taken as
       
   124 next token.\smallskip
       
   125 
       
   126 \item[$\bullet$] \emph{Priority Rule:}
       
   127 For a particular longest initial substring, the first regular expression
       
   128 that can match determines the token.
       
   129 \end{itemize}
       
   130 
       
   131 \noindent Consider for example a regular expression @{text "r\<^bsub>key\<^esub>"} for recognising keywords
       
   132 such as @{text "if"}, @{text "then"} and so on; and @{text "r\<^bsub>id\<^esub>"}
       
   133 recognising identifiers (say, a single character followed by
       
   134 characters or numbers).  Then we can form the regular expression
       
   135 @{text "(r\<^bsub>key\<^esub> + r\<^bsub>id\<^esub>)\<^sup>\<star>"} and use POSIX matching to tokenise strings,
       
   136 say @{text "iffoo"} and @{text "if"}.  For @{text "iffoo"} we obtain
       
   137 by the Longest Match Rule a single identifier token, not a keyword
       
   138 followed by an identifier. For @{text "if"} we obtain by the Priority
       
   139 Rule a keyword token, not an identifier token---even if @{text "r\<^bsub>id\<^esub>"}
       
   140 matches also.
       
   141 
       
   142 One limitation of Brzozowski's matcher is that it only generates a
       
   143 YES/NO answer for whether a string is being matched by a regular
       
   144 expression.  Sulzmann and Lu~\cite{Sulzmann2014} extended this matcher
       
   145 to allow generation not just of a YES/NO answer but of an actual
       
   146 matching, called a [lexical] {\em value}. They give a simple algorithm
       
   147 to calculate a value that appears to be the value associated with
       
   148 POSIX matching.  The challenge then is to specify that value, in an
       
   149 algorithm-independent fashion, and to show that Sulzmann and Lu's
       
   150 derivative-based algorithm does indeed calculate a value that is
       
   151 correct according to the specification.
       
   152 
       
   153 The answer given by Sulzmann and Lu \cite{Sulzmann2014} is to define a
       
   154 relation (called an ``order relation'') on the set of values of @{term
       
   155 r}, and to show that (once a string to be matched is chosen) there is
       
   156 a maximum element and that it is computed by their derivative-based
       
   157 algorithm. This proof idea is inspired by work of Frisch and Cardelli
       
   158 \cite{Frisch2004} on a GREEDY regular expression matching
       
   159 algorithm. However, we were not able to establish transitivity and
       
   160 totality for the ``order relation'' by Sulzmann and Lu. In Section
       
   161 \ref{argu} we identify some inherent problems with their approach (of
       
   162 which some of the proofs are not published in \cite{Sulzmann2014});
       
   163 perhaps more importantly, we give a simple inductive (and
       
   164 algorithm-independent) definition of what we call being a {\em POSIX
       
   165 value} for a regular expression @{term r} and a string @{term s}; we
       
   166 show that the algorithm computes such a value and that such a value is
       
   167 unique. Our proofs are both done by hand and checked in Isabelle/HOL.  The
       
   168 experience of doing our proofs has been that this mechanical checking
       
   169 was absolutely essential: this subject area has hidden snares. This
       
   170 was also noted by Kuklewicz \cite{Kuklewicz} who found that nearly all
       
   171 POSIX matching implementations are ``buggy'' \cite[Page
       
   172 203]{Sulzmann2014} and by Grathwohl et al \cite[Page 36]{CrashCourse2014}
       
   173 who wrote:
       
   174 
       
   175 \begin{quote}
       
   176 \it{}``The POSIX strategy is more complicated than the greedy because of 
       
   177 the dependence on information about the length of matched strings in the 
       
   178 various subexpressions.''
       
   179 \end{quote}
       
   180 
       
   181 %\footnote{The relation @{text "\<ge>\<^bsub>r\<^esub>"} defined by Sulzmann and Lu \cite{Sulzmann2014} 
       
   182 %is a relation on the
       
   183 %values for the regular expression @{term r}; but it only holds between
       
   184 %@{term "v\<^sub>1"} and @{term "v\<^sub>2"} in cases where @{term "v\<^sub>1"} and @{term "v\<^sub>2"} have
       
   185 %the same flattening (underlying string). So a counterexample to totality is
       
   186 %given by taking two values @{term "v\<^sub>1"} and @{term "v\<^sub>2"} for @{term r} that
       
   187 %have different flattenings (see Section~\ref{posixsec}). A different
       
   188 %relation @{text "\<ge>\<^bsub>r,s\<^esub>"} on the set of values for @{term r}
       
   189 %with flattening @{term s} is definable by the same approach, and is indeed
       
   190 %total; but that is not what Proposition 1 of \cite{Sulzmann2014} does.}
       
   191 
       
   192 
       
   193 \noindent {\bf Contributions:} We have implemented in Isabelle/HOL the
       
   194 derivative-based regular expression matching algorithm of
       
   195 Sulzmann and Lu \cite{Sulzmann2014}. We have proved the correctness of this
       
   196 algorithm according to our specification of what a POSIX value is (inspired
       
   197 by work of Vansummeren \cite{Vansummeren2006}). Sulzmann
       
   198 and Lu sketch in \cite{Sulzmann2014} an informal correctness proof: but to
       
   199 us it contains unfillable gaps.\footnote{An extended version of
       
   200 \cite{Sulzmann2014} is available at the website of its first author; this
       
   201 extended version already includes remarks in the appendix that their
       
   202 informal proof contains gaps, and possible fixes are not fully worked out.}
       
   203 Our specification of a POSIX value consists of a simple inductive definition
       
   204 that given a string and a regular expression uniquely determines this value.
       
   205 Derivatives as calculated by Brzozowski's method are usually more complex
       
   206 regular expressions than the initial one; various optimisations are
       
   207 possible. We prove the correctness when simplifications of @{term "ALT ZERO
       
   208 r"}, @{term "ALT r ZERO"}, @{term "SEQ ONE r"} and @{term "SEQ r ONE"} to
       
   209 @{term r} are applied.
       
   210 
       
   211 *}
       
   212 
       
   213 section {* Preliminaries *}
       
   214 
       
   215 text {* \noindent Strings in Isabelle/HOL are lists of characters with the
       
   216 empty string being represented by the empty list, written @{term "[]"}, and
       
   217 list-cons being written as @{term "DUMMY # DUMMY"}. Often we use the usual
       
   218 bracket notation for lists also for strings; for example a string consisting
       
   219 of just a single character @{term c} is written @{term "[c]"}. By using the
       
   220 type @{type char} for characters we have a supply of finitely many
       
   221 characters roughly corresponding to the ASCII character set. Regular
       
   222 expressions are defined as usual as the elements of the following inductive
       
   223 datatype:
       
   224 
       
   225   \begin{center}
       
   226   @{text "r :="}
       
   227   @{const "ZERO"} $\mid$
       
   228   @{const "ONE"} $\mid$
       
   229   @{term "CHAR c"} $\mid$
       
   230   @{term "ALT r\<^sub>1 r\<^sub>2"} $\mid$
       
   231   @{term "SEQ r\<^sub>1 r\<^sub>2"} $\mid$
       
   232   @{term "STAR r"} 
       
   233   \end{center}
       
   234 
       
   235   \noindent where @{const ZERO} stands for the regular expression that does
       
   236   not match any string, @{const ONE} for the regular expression that matches
       
   237   only the empty string and @{term c} for matching a character literal. The
       
   238   language of a regular expression is also defined as usual by the
       
   239   recursive function @{term L} with the six clauses:
       
   240 
       
   241   \begin{center}
       
   242   \begin{tabular}{l@ {\hspace{3mm}}rcl}
       
   243   (1) & @{thm (lhs) L.simps(1)} & $\dn$ & @{thm (rhs) L.simps(1)}\\
       
   244   (2) & @{thm (lhs) L.simps(2)} & $\dn$ & @{thm (rhs) L.simps(2)}\\
       
   245   (3) & @{thm (lhs) L.simps(3)} & $\dn$ & @{thm (rhs) L.simps(3)}\\
       
   246   \end{tabular}
       
   247   \hspace{14mm}
       
   248   \begin{tabular}{l@ {\hspace{3mm}}rcl}
       
   249   (4) & @{thm (lhs) L.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) L.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
       
   250   (5) & @{thm (lhs) L.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) L.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
       
   251   (6) & @{thm (lhs) L.simps(6)} & $\dn$ & @{thm (rhs) L.simps(6)}\\
       
   252   \end{tabular}
       
   253   \end{center}
       
   254   
       
   255   \noindent In clause (4) we use the operation @{term "DUMMY ;;
       
   256   DUMMY"} for the concatenation of two languages (it is also list-append for
       
   257   strings). We use the star-notation for regular expressions and for
       
   258   languages (in the last clause above). The star for languages is defined
       
   259   inductively by two clauses: @{text "(i)"} the empty string being in
       
   260   the star of a language and @{text "(ii)"} if @{term "s\<^sub>1"} is in a
       
   261   language and @{term "s\<^sub>2"} in the star of this language, then also @{term
       
   262   "s\<^sub>1 @ s\<^sub>2"} is in the star of this language. It will also be convenient
       
   263   to use the following notion of a \emph{semantic derivative} (or \emph{left
       
   264   quotient}) of a language defined as
       
   265   %
       
   266   \begin{center}
       
   267   @{thm Der_def}\;.
       
   268   \end{center}
       
   269  
       
   270   \noindent
       
   271   For semantic derivatives we have the following equations (for example
       
   272   mechanically proved in \cite{Krauss2011}):
       
   273   %
       
   274   \begin{equation}\label{SemDer}
       
   275   \begin{array}{lcl}
       
   276   @{thm (lhs) Der_null}  & \dn & @{thm (rhs) Der_null}\\
       
   277   @{thm (lhs) Der_empty}  & \dn & @{thm (rhs) Der_empty}\\
       
   278   @{thm (lhs) Der_char}  & \dn & @{thm (rhs) Der_char}\\
       
   279   @{thm (lhs) Der_union}  & \dn & @{thm (rhs) Der_union}\\
       
   280   @{thm (lhs) Der_Sequ}  & \dn & @{thm (rhs) Der_Sequ}\\
       
   281   @{thm (lhs) Der_star}  & \dn & @{thm (rhs) Der_star}
       
   282   \end{array}
       
   283   \end{equation}
       
   284 
       
   285 
       
   286   \noindent \emph{\Brz's derivatives} of regular expressions
       
   287   \cite{Brzozowski1964} can be easily defined by two recursive functions:
       
   288   the first is from regular expressions to booleans (implementing a test
       
   289   when a regular expression can match the empty string), and the second
       
   290   takes a regular expression and a character to a (derivative) regular
       
   291   expression:
       
   292 
       
   293   \begin{center}
       
   294   \begin{tabular}{lcl}
       
   295   @{thm (lhs) nullable.simps(1)} & $\dn$ & @{thm (rhs) nullable.simps(1)}\\
       
   296   @{thm (lhs) nullable.simps(2)} & $\dn$ & @{thm (rhs) nullable.simps(2)}\\
       
   297   @{thm (lhs) nullable.simps(3)} & $\dn$ & @{thm (rhs) nullable.simps(3)}\\
       
   298   @{thm (lhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
       
   299   @{thm (lhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
       
   300   @{thm (lhs) nullable.simps(6)} & $\dn$ & @{thm (rhs) nullable.simps(6)}\medskip\\
       
   301   
       
   302   %\end{tabular}
       
   303   %\end{center}
       
   304 
       
   305   %\begin{center}
       
   306   %\begin{tabular}{lcl}
       
   307   
       
   308   @{thm (lhs) der.simps(1)} & $\dn$ & @{thm (rhs) der.simps(1)}\\
       
   309   @{thm (lhs) der.simps(2)} & $\dn$ & @{thm (rhs) der.simps(2)}\\
       
   310   @{thm (lhs) der.simps(3)} & $\dn$ & @{thm (rhs) der.simps(3)}\\
       
   311   @{thm (lhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]}\\
       
   312   @{thm (lhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]}\\
       
   313   @{thm (lhs) der.simps(6)} & $\dn$ & @{thm (rhs) der.simps(6)}
       
   314   \end{tabular}
       
   315   \end{center}
       
   316  
       
   317   \noindent
       
   318   We may extend this definition to give derivatives w.r.t.~strings:
       
   319 
       
   320   \begin{center}
       
   321   \begin{tabular}{lcl}
       
   322   @{thm (lhs) ders.simps(1)} & $\dn$ & @{thm (rhs) ders.simps(1)}\\
       
   323   @{thm (lhs) ders.simps(2)} & $\dn$ & @{thm (rhs) ders.simps(2)}\\
       
   324   \end{tabular}
       
   325   \end{center}
       
   326 
       
   327   \noindent Given the equations in \eqref{SemDer}, it is a relatively easy
       
   328   exercise in mechanical reasoning to establish that
       
   329 
       
   330   \begin{proposition}\label{derprop}\mbox{}\\ 
       
   331   \begin{tabular}{ll}
       
   332   @{text "(1)"} & @{thm (lhs) nullable_correctness} if and only if
       
   333   @{thm (rhs) nullable_correctness}, and \\ 
       
   334   @{text "(2)"} & @{thm[mode=IfThen] der_correctness}.
       
   335   \end{tabular}
       
   336   \end{proposition}
       
   337 
       
   338   \noindent With this in place it is also very routine to prove that the
       
   339   regular expression matcher defined as
       
   340   %
       
   341   \begin{center}
       
   342   @{thm match_def}
       
   343   \end{center}
       
   344 
       
   345   \noindent gives a positive answer if and only if @{term "s \<in> L r"}.
       
   346   Consequently, this regular expression matching algorithm satisfies the
       
   347   usual specification for regular expression matching. While the matcher
       
   348   above calculates a provably correct YES/NO answer for whether a regular
       
   349   expression matches a string or not, the novel idea of Sulzmann and Lu
       
   350   \cite{Sulzmann2014} is to append another phase to this algorithm in order
       
   351   to calculate a [lexical] value. We will explain the details next.
       
   352 
       
   353 *}
       
   354 
       
   355 section {* POSIX Regular Expression Matching\label{posixsec} *}
       
   356 
       
   357 text {* 
       
   358 
       
   359   The clever idea by Sulzmann and Lu \cite{Sulzmann2014} is to define
       
   360   values for encoding \emph{how} a regular expression matches a string
       
   361   and then define a function on values that mirrors (but inverts) the
       
   362   construction of the derivative on regular expressions. \emph{Values}
       
   363   are defined as the inductive datatype
       
   364 
       
   365   \begin{center}
       
   366   @{text "v :="}
       
   367   @{const "Void"} $\mid$
       
   368   @{term "val.Char c"} $\mid$
       
   369   @{term "Left v"} $\mid$
       
   370   @{term "Right v"} $\mid$
       
   371   @{term "Seq v\<^sub>1 v\<^sub>2"} $\mid$ 
       
   372   @{term "Stars vs"} 
       
   373   \end{center}  
       
   374 
       
   375   \noindent where we use @{term vs} to stand for a list of
       
   376   values. (This is similar to the approach taken by Frisch and
       
   377   Cardelli for GREEDY matching \cite{Frisch2004}, and Sulzmann and Lu
       
   378   for POSIX matching \cite{Sulzmann2014}). The string underlying a
       
   379   value can be calculated by the @{const flat} function, written
       
   380   @{term "flat DUMMY"} and defined as:
       
   381 
       
   382   \begin{center}
       
   383   \begin{tabular}[t]{lcl}
       
   384   @{thm (lhs) flat.simps(1)} & $\dn$ & @{thm (rhs) flat.simps(1)}\\
       
   385   @{thm (lhs) flat.simps(2)} & $\dn$ & @{thm (rhs) flat.simps(2)}\\
       
   386   @{thm (lhs) flat.simps(3)} & $\dn$ & @{thm (rhs) flat.simps(3)}\\
       
   387   @{thm (lhs) flat.simps(4)} & $\dn$ & @{thm (rhs) flat.simps(4)}
       
   388   \end{tabular}\hspace{14mm}
       
   389   \begin{tabular}[t]{lcl}
       
   390   @{thm (lhs) flat.simps(5)[of "v\<^sub>1" "v\<^sub>2"]} & $\dn$ & @{thm (rhs) flat.simps(5)[of "v\<^sub>1" "v\<^sub>2"]}\\
       
   391   @{thm (lhs) flat.simps(6)} & $\dn$ & @{thm (rhs) flat.simps(6)}\\
       
   392   @{thm (lhs) flat.simps(7)} & $\dn$ & @{thm (rhs) flat.simps(7)}\\
       
   393   \end{tabular}
       
   394   \end{center}
       
   395 
       
   396   \noindent Sulzmann and Lu also define inductively an inhabitation relation
       
   397   that associates values to regular expressions:
       
   398 
       
   399   \begin{center}
       
   400   \begin{tabular}{c}
       
   401   \\[-8mm]
       
   402   @{thm[mode=Axiom] Prf.intros(4)} \qquad
       
   403   @{thm[mode=Axiom] Prf.intros(5)[of "c"]}\\[4mm]
       
   404   @{thm[mode=Rule] Prf.intros(2)[of "v\<^sub>1" "r\<^sub>1" "r\<^sub>2"]} \qquad 
       
   405   @{thm[mode=Rule] Prf.intros(3)[of "v\<^sub>2" "r\<^sub>1" "r\<^sub>2"]}\\[4mm]
       
   406   @{thm[mode=Rule] Prf.intros(1)[of "v\<^sub>1" "r\<^sub>1" "v\<^sub>2" "r\<^sub>2"]}\\[4mm]
       
   407   @{thm[mode=Axiom] Prf.intros(6)[of "r"]} \qquad  
       
   408   @{thm[mode=Rule] Prf.intros(7)[of "v" "r" "vs"]}
       
   409   \end{tabular}
       
   410   \end{center}
       
   411 
       
   412   \noindent Note that no values are associated with the regular expression
       
   413   @{term ZERO}, and that the only value associated with the regular
       
   414   expression @{term ONE} is @{term Void}, pronounced (if one must) as @{text
       
   415   "Void"}. It is routine to establish how values ``inhabiting'' a regular
       
   416   expression correspond to the language of a regular expression, namely
       
   417 
       
   418   \begin{proposition}
       
   419   @{thm L_flat_Prf}
       
   420   \end{proposition}
       
   421 
       
   422   In general there is more than one value associated with a regular
       
   423   expression. In case of POSIX matching the problem is to calculate the
       
   424   unique value that satisfies the (informal) POSIX rules from the
       
   425   Introduction. Graphically the POSIX value calculation algorithm by
       
   426   Sulzmann and Lu can be illustrated by the picture in Figure~\ref{Sulz}
       
   427   where the path from the left to the right involving @{term derivatives}/@{const
       
   428   nullable} is the first phase of the algorithm (calculating successive
       
   429   \Brz's derivatives) and @{const mkeps}/@{text inj}, the path from right to
       
   430   left, the second phase. This picture shows the steps required when a
       
   431   regular expression, say @{text "r\<^sub>1"}, matches the string @{term
       
   432   "[a,b,c]"}. We first build the three derivatives (according to @{term a},
       
   433   @{term b} and @{term c}). We then use @{const nullable} to find out
       
   434   whether the resulting derivative regular expression @{term "r\<^sub>4"}
       
   435   can match the empty string. If yes, we call the function @{const mkeps}
       
   436   that produces a value @{term "v\<^sub>4"} for how @{term "r\<^sub>4"} can
       
   437   match the empty string (taking into account the POSIX constraints in case
       
   438   there are several ways). This function is defined by the clauses:
       
   439 
       
   440 \begin{figure}[t]
       
   441 \begin{center}
       
   442 \begin{tikzpicture}[scale=2,node distance=1.3cm,
       
   443                     every node/.style={minimum size=6mm}]
       
   444 \node (r1)  {@{term "r\<^sub>1"}};
       
   445 \node (r2) [right=of r1]{@{term "r\<^sub>2"}};
       
   446 \draw[->,line width=1mm](r1)--(r2) node[above,midway] {@{term "der a DUMMY"}};
       
   447 \node (r3) [right=of r2]{@{term "r\<^sub>3"}};
       
   448 \draw[->,line width=1mm](r2)--(r3) node[above,midway] {@{term "der b DUMMY"}};
       
   449 \node (r4) [right=of r3]{@{term "r\<^sub>4"}};
       
   450 \draw[->,line width=1mm](r3)--(r4) node[above,midway] {@{term "der c DUMMY"}};
       
   451 \draw (r4) node[anchor=west] {\;\raisebox{3mm}{@{term nullable}}};
       
   452 \node (v4) [below=of r4]{@{term "v\<^sub>4"}};
       
   453 \draw[->,line width=1mm](r4) -- (v4);
       
   454 \node (v3) [left=of v4] {@{term "v\<^sub>3"}};
       
   455 \draw[->,line width=1mm](v4)--(v3) node[below,midway] {@{text "inj r\<^sub>3 c"}};
       
   456 \node (v2) [left=of v3]{@{term "v\<^sub>2"}};
       
   457 \draw[->,line width=1mm](v3)--(v2) node[below,midway] {@{text "inj r\<^sub>2 b"}};
       
   458 \node (v1) [left=of v2] {@{term "v\<^sub>1"}};
       
   459 \draw[->,line width=1mm](v2)--(v1) node[below,midway] {@{text "inj r\<^sub>1 a"}};
       
   460 \draw (r4) node[anchor=north west] {\;\raisebox{-8mm}{@{term "mkeps"}}};
       
   461 \end{tikzpicture}
       
   462 \end{center}
       
   463 \mbox{}\\[-13mm]
       
   464 
       
   465 \caption{The two phases of the algorithm by Sulzmann \& Lu \cite{Sulzmann2014},
       
   466 matching the string @{term "[a,b,c]"}. The first phase (the arrows from 
       
   467 left to right) is \Brz's matcher building successive derivatives. If the 
       
   468 last regular expression is @{term nullable}, then the functions of the 
       
   469 second phase are called (the top-down and right-to-left arrows): first 
       
   470 @{term mkeps} calculates a value @{term "v\<^sub>4"} witnessing
       
   471 how the empty string has been recognised by @{term "r\<^sub>4"}. After
       
   472 that the function @{term inj} ``injects back'' the characters of the string into
       
   473 the values.
       
   474 \label{Sulz}}
       
   475 \end{figure} 
       
   476 
       
   477   \begin{center}
       
   478   \begin{tabular}{lcl}
       
   479   @{thm (lhs) mkeps.simps(1)} & $\dn$ & @{thm (rhs) mkeps.simps(1)}\\
       
   480   @{thm (lhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]}\\
       
   481   @{thm (lhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]}\\
       
   482   @{thm (lhs) mkeps.simps(4)} & $\dn$ & @{thm (rhs) mkeps.simps(4)}\\
       
   483   \end{tabular}
       
   484   \end{center}
       
   485 
       
   486   \noindent Note that this function needs only to be partially defined,
       
   487   namely only for regular expressions that are nullable. In case @{const
       
   488   nullable} fails, the string @{term "[a,b,c]"} cannot be matched by @{term
       
   489   "r\<^sub>1"} and the null value @{term "None"} is returned. Note also how this function
       
   490   makes some subtle choices leading to a POSIX value: for example if an
       
   491   alternative regular expression, say @{term "ALT r\<^sub>1 r\<^sub>2"}, can
       
   492   match the empty string and furthermore @{term "r\<^sub>1"} can match the
       
   493   empty string, then we return a @{text Left}-value. The @{text
       
   494   Right}-value will only be returned if @{term "r\<^sub>1"} cannot match the empty
       
   495   string.
       
   496 
       
   497   The most interesting idea from Sulzmann and Lu \cite{Sulzmann2014} is
       
   498   the construction of a value for how @{term "r\<^sub>1"} can match the
       
   499   string @{term "[a,b,c]"} from the value how the last derivative, @{term
       
   500   "r\<^sub>4"} in Fig.~\ref{Sulz}, can match the empty string. Sulzmann and
       
   501   Lu achieve this by stepwise ``injecting back'' the characters into the
       
   502   values thus inverting the operation of building derivatives, but on the level
       
   503   of values. The corresponding function, called @{term inj}, takes three
       
   504   arguments, a regular expression, a character and a value. For example in
       
   505   the first (or right-most) @{term inj}-step in Fig.~\ref{Sulz} the regular
       
   506   expression @{term "r\<^sub>3"}, the character @{term c} from the last
       
   507   derivative step and @{term "v\<^sub>4"}, which is the value corresponding
       
   508   to the derivative regular expression @{term "r\<^sub>4"}. The result is
       
   509   the new value @{term "v\<^sub>3"}. The final result of the algorithm is
       
   510   the value @{term "v\<^sub>1"}. The @{term inj} function is defined by recursion on regular
       
   511   expressions and by analysing the shape of values (corresponding to 
       
   512   the derivative regular expressions).
       
   513   %
       
   514   \begin{center}
       
   515   \begin{tabular}{l@ {\hspace{5mm}}lcl}
       
   516   (1) & @{thm (lhs) injval.simps(1)} & $\dn$ & @{thm (rhs) injval.simps(1)}\\
       
   517   (2) & @{thm (lhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]} & $\dn$ & 
       
   518       @{thm (rhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]}\\
       
   519   (3) & @{thm (lhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$ & 
       
   520       @{thm (rhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\
       
   521   (4) & @{thm (lhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$ 
       
   522       & @{thm (rhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\
       
   523   (5) & @{thm (lhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$ 
       
   524       & @{thm (rhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\
       
   525   (6) & @{thm (lhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$ 
       
   526       & @{thm (rhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\
       
   527   (7) & @{thm (lhs) injval.simps(7)[of "r" "c" "v" "vs"]} & $\dn$ 
       
   528       & @{thm (rhs) injval.simps(7)[of "r" "c" "v" "vs"]}\\
       
   529   \end{tabular}
       
   530   \end{center}
       
   531 
       
   532   \noindent To better understand what is going on in this definition it
       
   533   might be instructive to look first at the three sequence cases (clauses
       
   534   (4)--(6)). In each case we need to construct an ``injected value'' for
       
   535   @{term "SEQ r\<^sub>1 r\<^sub>2"}. This must be a value of the form @{term
       
   536   "Seq DUMMY DUMMY"}\,. Recall the clause of the @{text derivative}-function
       
   537   for sequence regular expressions:
       
   538 
       
   539   \begin{center}
       
   540   @{thm (lhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]} $\dn$ @{thm (rhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]}
       
   541   \end{center}
       
   542 
       
   543   \noindent Consider first the @{text "else"}-branch where the derivative is @{term
       
   544   "SEQ (der c r\<^sub>1) r\<^sub>2"}. The corresponding value must therefore
       
   545   be of the form @{term "Seq v\<^sub>1 v\<^sub>2"}, which matches the left-hand
       
   546   side in clause~(4) of @{term inj}. In the @{text "if"}-branch the derivative is an
       
   547   alternative, namely @{term "ALT (SEQ (der c r\<^sub>1) r\<^sub>2) (der c
       
   548   r\<^sub>2)"}. This means we either have to consider a @{text Left}- or
       
   549   @{text Right}-value. In case of the @{text Left}-value we know further it
       
   550   must be a value for a sequence regular expression. Therefore the pattern
       
   551   we match in the clause (5) is @{term "Left (Seq v\<^sub>1 v\<^sub>2)"},
       
   552   while in (6) it is just @{term "Right v\<^sub>2"}. One more interesting
       
   553   point is in the right-hand side of clause (6): since in this case the
       
   554   regular expression @{text "r\<^sub>1"} does not ``contribute'' to
       
   555   matching the string, that means it only matches the empty string, we need to
       
   556   call @{const mkeps} in order to construct a value for how @{term "r\<^sub>1"}
       
   557   can match this empty string. A similar argument applies for why we can
       
   558   expect in the left-hand side of clause (7) that the value is of the form
       
   559   @{term "Seq v (Stars vs)"}---the derivative of a star is @{term "SEQ (der c r)
       
   560   (STAR r)"}. Finally, the reason for why we can ignore the second argument
       
   561   in clause (1) of @{term inj} is that it will only ever be called in cases
       
   562   where @{term "c=d"}, but the usual linearity restrictions in patterns do
       
   563   not allow us to build this constraint explicitly into our function
       
   564   definition.\footnote{Sulzmann and Lu state this clause as @{thm (lhs)
       
   565   injval.simps(1)[of "c" "c"]} $\dn$ @{thm (rhs) injval.simps(1)[of "c"]},
       
   566   but our deviation is harmless.}
       
   567 
       
   568   The idea of the @{term inj}-function to ``inject'' a character, say
       
   569   @{term c}, into a value can be made precise by the first part of the
       
   570   following lemma, which shows that the underlying string of an injected
       
   571   value has a prepended character @{term c}; the second part shows that the
       
   572   underlying string of an @{const mkeps}-value is always the empty string
       
   573   (given the regular expression is nullable since otherwise @{text mkeps}
       
   574   might not be defined).
       
   575 
       
   576   \begin{lemma}\mbox{}\smallskip\\\label{Prf_injval_flat}
       
   577   \begin{tabular}{ll}
       
   578   (1) & @{thm[mode=IfThen] Prf_injval_flat}\\
       
   579   (2) & @{thm[mode=IfThen] mkeps_flat}
       
   580   \end{tabular}
       
   581   \end{lemma}
       
   582 
       
   583   \begin{proof}
       
   584   Both properties are by routine inductions: the first one can, for example,
       
   585   be proved by induction over the definition of @{term derivatives}; the second by
       
   586   an induction on @{term r}. There are no interesting cases.\qed
       
   587   \end{proof}
       
   588 
       
   589   Having defined the @{const mkeps} and @{text inj} function we can extend
       
   590   \Brz's matcher so that a [lexical] value is constructed (assuming the
       
   591   regular expression matches the string). The clauses of the Sulzmann and Lu lexer are
       
   592 
       
   593   \begin{center}
       
   594   \begin{tabular}{lcl}
       
   595   @{thm (lhs) lexer.simps(1)} & $\dn$ & @{thm (rhs) lexer.simps(1)}\\
       
   596   @{thm (lhs) lexer.simps(2)} & $\dn$ & @{text "case"} @{term "lexer (der c r) s"} @{text of}\\
       
   597                      & & \phantom{$|$} @{term "None"}  @{text "\<Rightarrow>"} @{term None}\\
       
   598                      & & $|$ @{term "Some v"} @{text "\<Rightarrow>"} @{term "Some (injval r c v)"}                          
       
   599   \end{tabular}
       
   600   \end{center}
       
   601 
       
   602   \noindent If the regular expression does not match the string, @{const None} is
       
   603   returned. If the regular expression \emph{does}
       
   604   match the string, then @{const Some} value is returned. One important
       
   605   virtue of this algorithm is that it can be implemented with ease in any
       
   606   functional programming language and also in Isabelle/HOL. In the remaining
       
   607   part of this section we prove that this algorithm is correct.
       
   608 
       
   609   The well-known idea of POSIX matching is informally defined by the longest
       
   610   match and priority rule (see Introduction); as correctly argued in \cite{Sulzmann2014}, this
       
   611   needs formal specification. Sulzmann and Lu define an ``ordering
       
   612   relation'' between values and argue
       
   613   that there is a maximum value, as given by the derivative-based algorithm.
       
   614   In contrast, we shall introduce a simple inductive definition that
       
   615   specifies directly what a \emph{POSIX value} is, incorporating the
       
   616   POSIX-specific choices into the side-conditions of our rules. Our
       
   617   definition is inspired by the matching relation given by Vansummeren
       
   618   \cite{Vansummeren2006}. The relation we define is ternary and written as
       
   619   \mbox{@{term "s \<in> r \<rightarrow> v"}}, relating strings, regular expressions and
       
   620   values.
       
   621   %
       
   622   \begin{center}
       
   623   \begin{tabular}{c}
       
   624   @{thm[mode=Axiom] Posix.intros(1)}@{text "P"}@{term "ONE"} \qquad
       
   625   @{thm[mode=Axiom] Posix.intros(2)}@{text "P"}@{term "c"}\medskip\\
       
   626   @{thm[mode=Rule] Posix.intros(3)[of "s" "r\<^sub>1" "v" "r\<^sub>2"]}@{text "P+L"}\qquad
       
   627   @{thm[mode=Rule] Posix.intros(4)[of "s" "r\<^sub>2" "v" "r\<^sub>1"]}@{text "P+R"}\medskip\\
       
   628   $\mprset{flushleft}
       
   629    \inferrule
       
   630    {@{thm (prem 1) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \qquad
       
   631     @{thm (prem 2) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \\\\
       
   632     @{thm (prem 3) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}
       
   633    {@{thm (concl) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}$@{text "PS"}\\
       
   634   @{thm[mode=Axiom] Posix.intros(7)}@{text "P[]"}\medskip\\
       
   635   $\mprset{flushleft}
       
   636    \inferrule
       
   637    {@{thm (prem 1) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
       
   638     @{thm (prem 2) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
       
   639     @{thm (prem 3) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \\\\
       
   640     @{thm (prem 4) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}
       
   641    {@{thm (concl) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}$@{text "P\<star>"}
       
   642   \end{tabular}
       
   643   \end{center}
       
   644 
       
   645    \noindent
       
   646    We can prove that given a string @{term s} and regular expression @{term
       
   647    r}, the POSIX value @{term v} is uniquely determined by @{term "s \<in> r \<rightarrow> v"}.
       
   648 
       
   649   \begin{theorem}\mbox{}\smallskip\\\label{posixdeterm}
       
   650   \begin{tabular}{ll}
       
   651   @{text "(1)"} & If @{thm (prem 1) Posix1(1)} then @{thm (concl)
       
   652   Posix1(1)} and @{thm (concl) Posix1(2)}.\\
       
   653   @{text "(2)"} & @{thm[mode=IfThen] Posix_determ(1)[of _ _ "v" "v'"]}
       
   654   \end{tabular}
       
   655   \end{theorem}
       
   656 
       
   657   \begin{proof} Both by induction on the definition of @{term "s \<in> r \<rightarrow> v"}. 
       
   658   The second parts follows by a case analysis of @{term "s \<in> r \<rightarrow> v'"} and
       
   659   the first part.\qed
       
   660   \end{proof}
       
   661 
       
   662   \noindent
       
   663   We claim that our @{term "s \<in> r \<rightarrow> v"} relation captures the idea behind the two
       
   664   informal POSIX rules shown in the Introduction: Consider for example the
       
   665   rules @{text "P+L"} and @{text "P+R"} where the POSIX value for a string
       
   666   and an alternative regular expression, that is @{term "(s, ALT r\<^sub>1 r\<^sub>2)"},
       
   667   is specified---it is always a @{text "Left"}-value, \emph{except} when the
       
   668   string to be matched is not in the language of @{term "r\<^sub>1"}; only then it
       
   669   is a @{text Right}-value (see the side-condition in @{text "P+R"}).
       
   670   Interesting is also the rule for sequence regular expressions (@{text
       
   671   "PS"}). The first two premises state that @{term "v\<^sub>1"} and @{term "v\<^sub>2"}
       
   672   are the POSIX values for @{term "(s\<^sub>1, r\<^sub>1)"} and @{term "(s\<^sub>2, r\<^sub>2)"}
       
   673   respectively. Consider now the third premise and note that the POSIX value
       
   674   of this rule should match the string \mbox{@{term "s\<^sub>1 @ s\<^sub>2"}}. According to the
       
   675   longest match rule, we want that the @{term "s\<^sub>1"} is the longest initial
       
   676   split of \mbox{@{term "s\<^sub>1 @ s\<^sub>2"}} such that @{term "s\<^sub>2"} is still recognised
       
   677   by @{term "r\<^sub>2"}. Let us assume, contrary to the third premise, that there
       
   678   \emph{exist} an @{term "s\<^sub>3"} and @{term "s\<^sub>4"} such that @{term "s\<^sub>2"}
       
   679   can be split up into a non-empty string @{term "s\<^sub>3"} and a possibly empty
       
   680   string @{term "s\<^sub>4"}. Moreover the longer string @{term "s\<^sub>1 @ s\<^sub>3"} can be
       
   681   matched by @{text "r\<^sub>1"} and the shorter @{term "s\<^sub>4"} can still be
       
   682   matched by @{term "r\<^sub>2"}. In this case @{term "s\<^sub>1"} would \emph{not} be the
       
   683   longest initial split of \mbox{@{term "s\<^sub>1 @ s\<^sub>2"}} and therefore @{term "Seq v\<^sub>1
       
   684   v\<^sub>2"} cannot be a POSIX value for @{term "(s\<^sub>1 @ s\<^sub>2, SEQ r\<^sub>1 r\<^sub>2)"}. 
       
   685   The main point is that our side-condition ensures the longest 
       
   686   match rule is satisfied.
       
   687 
       
   688   A similar condition is imposed on the POSIX value in the @{text
       
   689   "P\<star>"}-rule. Also there we want that @{term "s\<^sub>1"} is the longest initial
       
   690   split of @{term "s\<^sub>1 @ s\<^sub>2"} and furthermore the corresponding value
       
   691   @{term v} cannot be flattened to the empty string. In effect, we require
       
   692   that in each ``iteration'' of the star, some non-empty substring needs to
       
   693   be ``chipped'' away; only in case of the empty string we accept @{term
       
   694   "Stars []"} as the POSIX value.
       
   695 
       
   696   Next is the lemma that shows the function @{term "mkeps"} calculates
       
   697   the POSIX value for the empty string and a nullable regular expression.
       
   698 
       
   699   \begin{lemma}\label{lemmkeps}
       
   700   @{thm[mode=IfThen] Posix_mkeps}
       
   701   \end{lemma}
       
   702 
       
   703   \begin{proof}
       
   704   By routine induction on @{term r}.\qed 
       
   705   \end{proof}
       
   706 
       
   707   \noindent
       
   708   The central lemma for our POSIX relation is that the @{text inj}-function
       
   709   preserves POSIX values.
       
   710 
       
   711   \begin{lemma}\label{Posix2}
       
   712   @{thm[mode=IfThen] Posix_injval}
       
   713   \end{lemma}
       
   714 
       
   715   \begin{proof}
       
   716   By induction on @{text r}. We explain two cases.
       
   717 
       
   718   \begin{itemize}
       
   719   \item[$\bullet$] Case @{term "r = ALT r\<^sub>1 r\<^sub>2"}. There are
       
   720   two subcases, namely @{text "(a)"} \mbox{@{term "v = Left v'"}} and @{term
       
   721   "s \<in> der c r\<^sub>1 \<rightarrow> v'"}; and @{text "(b)"} @{term "v = Right v'"}, @{term
       
   722   "s \<notin> L (der c r\<^sub>1)"} and @{term "s \<in> der c r\<^sub>2 \<rightarrow> v'"}. In @{text "(a)"} we
       
   723   know @{term "s \<in> der c r\<^sub>1 \<rightarrow> v'"}, from which we can infer @{term "(c # s)
       
   724   \<in> r\<^sub>1 \<rightarrow> injval r\<^sub>1 c v'"} by induction hypothesis and hence @{term "(c #
       
   725   s) \<in> ALT r\<^sub>1 r\<^sub>2 \<rightarrow> injval (ALT r\<^sub>1 r\<^sub>2) c (Left v')"} as needed. Similarly
       
   726   in subcase @{text "(b)"} where, however, in addition we have to use
       
   727   Prop.~\ref{derprop}(2) in order to infer @{term "c # s \<notin> L r\<^sub>1"} from @{term
       
   728   "s \<notin> L (der c r\<^sub>1)"}.
       
   729 
       
   730   \item[$\bullet$] Case @{term "r = SEQ r\<^sub>1 r\<^sub>2"}. There are three subcases:
       
   731   
       
   732   \begin{quote}
       
   733   \begin{description}
       
   734   \item[@{text "(a)"}] @{term "v = Left (Seq v\<^sub>1 v\<^sub>2)"} and @{term "nullable r\<^sub>1"} 
       
   735   \item[@{text "(b)"}] @{term "v = Right v\<^sub>1"} and @{term "nullable r\<^sub>1"} 
       
   736   \item[@{text "(c)"}] @{term "v = Seq v\<^sub>1 v\<^sub>2"} and @{term "\<not> nullable r\<^sub>1"} 
       
   737   \end{description}
       
   738   \end{quote}
       
   739 
       
   740   \noindent For @{text "(a)"} we know @{term "s\<^sub>1 \<in> der c r\<^sub>1 \<rightarrow> v\<^sub>1"} and
       
   741   @{term "s\<^sub>2 \<in> r\<^sub>2 \<rightarrow> v\<^sub>2"} as well as
       
   742   %
       
   743   \[@{term "\<not> (\<exists>s\<^sub>3 s\<^sub>4. s\<^sub>3 \<noteq> [] \<and> s\<^sub>3 @ s\<^sub>4 = s\<^sub>2 \<and> s\<^sub>1 @ s\<^sub>3 \<in> L (der c r\<^sub>1) \<and> s\<^sub>4 \<in> L r\<^sub>2)"}\]
       
   744 
       
   745   \noindent From the latter we can infer by Prop.~\ref{derprop}(2):
       
   746   %
       
   747   \[@{term "\<not> (\<exists>s\<^sub>3 s\<^sub>4. s\<^sub>3 \<noteq> [] \<and> s\<^sub>3 @ s\<^sub>4 = s\<^sub>2 \<and> (c # s\<^sub>1) @ s\<^sub>3 \<in> L r\<^sub>1 \<and> s\<^sub>4 \<in> L r\<^sub>2)"}\]
       
   748 
       
   749   \noindent We can use the induction hypothesis for @{text "r\<^sub>1"} to obtain
       
   750   @{term "(c # s\<^sub>1) \<in> r\<^sub>1 \<rightarrow> injval r\<^sub>1 c v\<^sub>1"}. Putting this all together allows us to infer
       
   751   @{term "((c # s\<^sub>1) @ s\<^sub>2) \<in> SEQ r\<^sub>1 r\<^sub>2 \<rightarrow> Seq (injval r\<^sub>1 c v\<^sub>1) v\<^sub>2"}. The case @{text "(c)"}
       
   752   is similar.
       
   753 
       
   754   For @{text "(b)"} we know @{term "s \<in> der c r\<^sub>2 \<rightarrow> v\<^sub>1"} and 
       
   755   @{term "s\<^sub>1 @ s\<^sub>2 \<notin> L (SEQ (der c r\<^sub>1) r\<^sub>2)"}. From the former
       
   756   we have @{term "(c # s) \<in> r\<^sub>2 \<rightarrow> (injval r\<^sub>2 c v\<^sub>1)"} by induction hypothesis
       
   757   for @{term "r\<^sub>2"}. From the latter we can infer
       
   758   %
       
   759   \[@{term "\<not> (\<exists>s\<^sub>3 s\<^sub>4. s\<^sub>3 \<noteq> [] \<and> s\<^sub>3 @ s\<^sub>4 = c # s \<and> s\<^sub>3 \<in> L r\<^sub>1 \<and> s\<^sub>4 \<in> L r\<^sub>2)"}\]
       
   760 
       
   761   \noindent By Lem.~\ref{lemmkeps} we know @{term "[] \<in> r\<^sub>1 \<rightarrow> (mkeps r\<^sub>1)"}
       
   762   holds. Putting this all together, we can conclude with @{term "(c #
       
   763   s) \<in> SEQ r\<^sub>1 r\<^sub>2 \<rightarrow> Seq (mkeps r\<^sub>1) (injval r\<^sub>2 c v\<^sub>1)"}, as required.
       
   764 
       
   765   Finally suppose @{term "r = STAR r\<^sub>1"}. This case is very similar to the
       
   766   sequence case, except that we need to also ensure that @{term "flat (injval r\<^sub>1
       
   767   c v\<^sub>1) \<noteq> []"}. This follows from @{term "(c # s\<^sub>1)
       
   768   \<in> r\<^sub>1 \<rightarrow> injval r\<^sub>1 c v\<^sub>1"}  (which in turn follows from @{term "s\<^sub>1 \<in> der c
       
   769   r\<^sub>1 \<rightarrow> v\<^sub>1"} and the induction hypothesis).\qed
       
   770   \end{itemize}
       
   771   \end{proof}
       
   772 
       
   773   \noindent
       
   774   With Lem.~\ref{Posix2} in place, it is completely routine to establish
       
   775   that the Sulzmann and Lu lexer satisfies our specification (returning
       
   776   the null value @{term "None"} iff the string is not in the language of the regular expression,
       
   777   and returning a unique POSIX value iff the string \emph{is} in the language):
       
   778 
       
   779   \begin{theorem}\mbox{}\smallskip\\\label{lexercorrect}
       
   780   \begin{tabular}{ll}
       
   781   (1) & @{thm (lhs) lexer_correct_None} if and only if @{thm (rhs) lexer_correct_None}\\
       
   782   (2) & @{thm (lhs) lexer_correct_Some} if and only if @{thm (rhs) lexer_correct_Some}\\
       
   783   \end{tabular}
       
   784   \end{theorem}
       
   785 
       
   786   \begin{proof}
       
   787   By induction on @{term s} using Lem.~\ref{lemmkeps} and \ref{Posix2}.\qed  
       
   788   \end{proof}
       
   789 
       
   790   \noindent In (2) we further know by Thm.~\ref{posixdeterm} that the
       
   791   value returned by the lexer must be unique.   A simple corollary 
       
   792   of our two theorems is:
       
   793 
       
   794   \begin{corollary}\mbox{}\smallskip\\\label{lexercorrectcor}
       
   795   \begin{tabular}{ll}
       
   796   (1) & @{thm (lhs) lexer_correctness(2)} if and only if @{thm (rhs) lexer_correctness(2)}\\ 
       
   797   (2) & @{thm (lhs) lexer_correctness(1)} if and only if @{thm (rhs) lexer_correctness(1)}\\
       
   798   \end{tabular}
       
   799   \end{corollary}
       
   800 
       
   801   \noindent
       
   802   This concludes our
       
   803   correctness proof. Note that we have not changed the algorithm of
       
   804   Sulzmann and Lu,\footnote{All deviations we introduced are
       
   805   harmless.} but introduced our own specification for what a correct
       
   806   result---a POSIX value---should be. A strong point in favour of
       
   807   Sulzmann and Lu's algorithm is that it can be extended in various
       
   808   ways.
       
   809 
       
   810 *}
       
   811 
       
   812 section {* Extensions and Optimisations*}
       
   813 
       
   814 text {*
       
   815 
       
   816   If we are interested in tokenising a string, then we need to not just
       
   817   split up the string into tokens, but also ``classify'' the tokens (for
       
   818   example whether it is a keyword or an identifier). This can be done with
       
   819   only minor modifications to the algorithm by introducing \emph{record
       
   820   regular expressions} and \emph{record values} (for example
       
   821   \cite{Sulzmann2014b}):
       
   822 
       
   823   \begin{center}  
       
   824   @{text "r :="}
       
   825   @{text "..."} $\mid$
       
   826   @{text "(l : r)"} \qquad\qquad
       
   827   @{text "v :="}
       
   828   @{text "..."} $\mid$
       
   829   @{text "(l : v)"}
       
   830   \end{center}
       
   831   
       
   832   \noindent where @{text l} is a label, say a string, @{text r} a regular
       
   833   expression and @{text v} a value. All functions can be smoothly extended
       
   834   to these regular expressions and values. For example \mbox{@{text "(l :
       
   835   r)"}} is nullable iff @{term r} is, and so on. The purpose of the record
       
   836   regular expression is to mark certain parts of a regular expression and
       
   837   then record in the calculated value which parts of the string were matched
       
   838   by this part. The label can then serve as classification for the tokens.
       
   839   For this recall the regular expression @{text "(r\<^bsub>key\<^esub> + r\<^bsub>id\<^esub>)\<^sup>\<star>"} for
       
   840   keywords and identifiers from the Introduction. With the record regular
       
   841   expression we can form \mbox{@{text "((key : r\<^bsub>key\<^esub>) + (id : r\<^bsub>id\<^esub>))\<^sup>\<star>"}}
       
   842   and then traverse the calculated value and only collect the underlying
       
   843   strings in record values. With this we obtain finite sequences of pairs of
       
   844   labels and strings, for example
       
   845 
       
   846   \[@{text "(l\<^sub>1 : s\<^sub>1), ..., (l\<^sub>n : s\<^sub>n)"}\]
       
   847   
       
   848   \noindent from which tokens with classifications (keyword-token,
       
   849   identifier-token and so on) can be extracted.
       
   850 
       
   851   Derivatives as calculated by \Brz's method are usually more complex
       
   852   regular expressions than the initial one; the result is that the
       
   853   derivative-based matching and lexing algorithms are often abysmally slow.
       
   854   However, various optimisations are possible, such as the simplifications
       
   855   of @{term "ALT ZERO r"}, @{term "ALT r ZERO"}, @{term "SEQ ONE r"} and
       
   856   @{term "SEQ r ONE"} to @{term r}. These simplifications can speed up the
       
   857   algorithms considerably, as noted in \cite{Sulzmann2014}. One of the
       
   858   advantages of having a simple specification and correctness proof is that
       
   859   the latter can be refined to prove the correctness of such simplification
       
   860   steps. While the simplification of regular expressions according to 
       
   861   rules like
       
   862 
       
   863   \begin{equation}\label{Simpl}
       
   864   \begin{array}{lcllcllcllcl}
       
   865   @{term "ALT ZERO r"} & @{text "\<Rightarrow>"} & @{term r} \hspace{8mm}%\\
       
   866   @{term "ALT r ZERO"} & @{text "\<Rightarrow>"} & @{term r} \hspace{8mm}%\\
       
   867   @{term "SEQ ONE r"}  & @{text "\<Rightarrow>"} & @{term r} \hspace{8mm}%\\
       
   868   @{term "SEQ r ONE"}  & @{text "\<Rightarrow>"} & @{term r}
       
   869   \end{array}
       
   870   \end{equation}
       
   871 
       
   872   \noindent is well understood, there is an obstacle with the POSIX value
       
   873   calculation algorithm by Sulzmann and Lu: if we build a derivative regular
       
   874   expression and then simplify it, we will calculate a POSIX value for this
       
   875   simplified derivative regular expression, \emph{not} for the original (unsimplified)
       
   876   derivative regular expression. Sulzmann and Lu \cite{Sulzmann2014} overcome this obstacle by
       
   877   not just calculating a simplified regular expression, but also calculating
       
   878   a \emph{rectification function} that ``repairs'' the incorrect value.
       
   879   
       
   880   The rectification functions can be (slightly clumsily) implemented  in
       
   881   Isabelle/HOL as follows using some auxiliary functions:
       
   882 
       
   883   \begin{center}
       
   884   \begin{tabular}{lcl}
       
   885   @{thm (lhs) F_RIGHT.simps(1)} & $\dn$ & @{text "Right (f v)"}\\
       
   886   @{thm (lhs) F_LEFT.simps(1)} & $\dn$ & @{text "Left (f v)"}\\
       
   887   
       
   888   @{thm (lhs) F_ALT.simps(1)} & $\dn$ & @{text "Right (f\<^sub>2 v)"}\\
       
   889   @{thm (lhs) F_ALT.simps(2)} & $\dn$ & @{text "Left (f\<^sub>1 v)"}\\
       
   890   
       
   891   @{thm (lhs) F_SEQ1.simps(1)} & $\dn$ & @{text "Seq (f\<^sub>1 ()) (f\<^sub>2 v)"}\\
       
   892   @{thm (lhs) F_SEQ2.simps(1)} & $\dn$ & @{text "Seq (f\<^sub>1 v) (f\<^sub>2 ())"}\\
       
   893   @{thm (lhs) F_SEQ.simps(1)} & $\dn$ & @{text "Seq (f\<^sub>1 v\<^sub>1) (f\<^sub>2 v\<^sub>2)"}\medskip\\
       
   894   %\end{tabular}
       
   895   %
       
   896   %\begin{tabular}{lcl}
       
   897   @{term "simp_ALT (ZERO, DUMMY) (r\<^sub>2, f\<^sub>2)"} & $\dn$ & @{term "(r\<^sub>2, F_RIGHT f\<^sub>2)"}\\
       
   898   @{term "simp_ALT (r\<^sub>1, f\<^sub>1) (ZERO, DUMMY)"} & $\dn$ & @{term "(r\<^sub>1, F_LEFT f\<^sub>1)"}\\
       
   899   @{term "simp_ALT (r\<^sub>1, f\<^sub>1) (r\<^sub>2, f\<^sub>2)"} & $\dn$ & @{term "(ALT r\<^sub>1 r\<^sub>2, F_ALT f\<^sub>1 f\<^sub>2)"}\\
       
   900   @{term "simp_SEQ (ONE, f\<^sub>1) (r\<^sub>2, f\<^sub>2)"} & $\dn$ & @{term "(r\<^sub>2, F_SEQ1 f\<^sub>1 f\<^sub>2)"}\\
       
   901   @{term "simp_SEQ (r\<^sub>1, f\<^sub>1) (ONE, f\<^sub>2)"} & $\dn$ & @{term "(r\<^sub>1, F_SEQ2 f\<^sub>1 f\<^sub>2)"}\\
       
   902   @{term "simp_SEQ (r\<^sub>1, f\<^sub>1) (r\<^sub>2, f\<^sub>2)"} & $\dn$ & @{term "(SEQ r\<^sub>1 r\<^sub>2, F_SEQ f\<^sub>1 f\<^sub>2)"}\\
       
   903   \end{tabular}
       
   904   \end{center}
       
   905 
       
   906   \noindent
       
   907   The functions @{text "simp\<^bsub>Alt\<^esub>"} and @{text "simp\<^bsub>Seq\<^esub>"} encode the simplification rules
       
   908   in \eqref{Simpl} and compose the rectification functions (simplifications can occur
       
   909   deep inside the regular expression). The main simplification function is then 
       
   910 
       
   911   \begin{center}
       
   912   \begin{tabular}{lcl}
       
   913   @{term "simp (ALT r\<^sub>1 r\<^sub>2)"} & $\dn$ & @{term "simp_ALT (simp r\<^sub>1) (simp r\<^sub>2)"}\\
       
   914   @{term "simp (SEQ r\<^sub>1 r\<^sub>2)"} & $\dn$ & @{term "simp_SEQ (simp r\<^sub>1) (simp r\<^sub>2)"}\\
       
   915   @{term "simp r"} & $\dn$ & @{term "(r, id)"}\\
       
   916   \end{tabular}
       
   917   \end{center} 
       
   918 
       
   919   \noindent where @{term "id"} stands for the identity function. The
       
   920   function @{const simp} returns a simplified regular expression and a corresponding
       
   921   rectification function. Note that we do not simplify under stars: this
       
   922   seems to slow down the algorithm, rather than speed it up. The optimised
       
   923   lexer is then given by the clauses:
       
   924   
       
   925   \begin{center}
       
   926   \begin{tabular}{lcl}
       
   927   @{thm (lhs) slexer.simps(1)} & $\dn$ & @{thm (rhs) slexer.simps(1)}\\
       
   928   @{thm (lhs) slexer.simps(2)} & $\dn$ & 
       
   929                          @{text "let (r\<^sub>s, f\<^sub>r) = simp (r "}$\backslash$@{text " c) in"}\\
       
   930                      & & @{text "case"} @{term "slexer r\<^sub>s s"} @{text of}\\
       
   931                      & & \phantom{$|$} @{term "None"}  @{text "\<Rightarrow>"} @{term None}\\
       
   932                      & & $|$ @{term "Some v"} @{text "\<Rightarrow>"} @{text "Some (inj r c (f\<^sub>r v))"}                          
       
   933   \end{tabular}
       
   934   \end{center}
       
   935 
       
   936   \noindent
       
   937   In the second clause we first calculate the derivative @{term "der c r"}
       
   938   and then simplify the result. This gives us a simplified derivative
       
   939   @{text "r\<^sub>s"} and a rectification function @{text "f\<^sub>r"}. The lexer
       
   940   is then recursively called with the simplified derivative, but before
       
   941   we inject the character @{term c} into the value @{term v}, we need to rectify
       
   942   @{term v} (that is construct @{term "f\<^sub>r v"}). Before we can establish the correctness
       
   943   of @{term "slexer"}, we need to show that simplification preserves the language
       
   944   and simplification preserves our POSIX relation once the value is rectified
       
   945   (recall @{const "simp"} generates a (regular expression, rectification function) pair):
       
   946 
       
   947   \begin{lemma}\mbox{}\smallskip\\\label{slexeraux}
       
   948   \begin{tabular}{ll}
       
   949   (1) & @{thm L_fst_simp[symmetric]}\\
       
   950   (2) & @{thm[mode=IfThen] Posix_simp}
       
   951   \end{tabular}
       
   952   \end{lemma}
       
   953 
       
   954   \begin{proof} Both are by induction on @{text r}. There is no
       
   955   interesting case for the first statement. For the second statement,
       
   956   of interest are the @{term "r = ALT r\<^sub>1 r\<^sub>2"} and @{term "r = SEQ r\<^sub>1
       
   957   r\<^sub>2"} cases. In each case we have to analyse four subcases whether
       
   958   @{term "fst (simp r\<^sub>1)"} and @{term "fst (simp r\<^sub>2)"} equals @{const
       
   959   ZERO} (respectively @{const ONE}). For example for @{term "r = ALT
       
   960   r\<^sub>1 r\<^sub>2"}, consider the subcase @{term "fst (simp r\<^sub>1) = ZERO"} and
       
   961   @{term "fst (simp r\<^sub>2) \<noteq> ZERO"}. By assumption we know @{term "s \<in>
       
   962   fst (simp (ALT r\<^sub>1 r\<^sub>2)) \<rightarrow> v"}. From this we can infer @{term "s \<in> fst (simp r\<^sub>2) \<rightarrow> v"}
       
   963   and by IH also (*) @{term "s \<in> r\<^sub>2 \<rightarrow> (snd (simp r\<^sub>2) v)"}. Given @{term "fst (simp r\<^sub>1) = ZERO"}
       
   964   we know @{term "L (fst (simp r\<^sub>1)) = {}"}. By the first statement
       
   965   @{term "L r\<^sub>1"} is the empty set, meaning (**) @{term "s \<notin> L r\<^sub>1"}.
       
   966   Taking (*) and (**) together gives by the \mbox{@{text "P+R"}}-rule 
       
   967   @{term "s \<in> ALT r\<^sub>1 r\<^sub>2 \<rightarrow> Right (snd (simp r\<^sub>2) v)"}. In turn this
       
   968   gives @{term "s \<in> ALT r\<^sub>1 r\<^sub>2 \<rightarrow> snd (simp (ALT r\<^sub>1 r\<^sub>2)) v"} as we need to show.
       
   969   The other cases are similar.\qed
       
   970   \end{proof}
       
   971 
       
   972   \noindent We can now prove relatively straightforwardly that the
       
   973   optimised lexer produces the expected result:
       
   974 
       
   975   \begin{theorem}
       
   976   @{thm slexer_correctness}
       
   977   \end{theorem}
       
   978 
       
   979   \begin{proof} By induction on @{term s} generalising over @{term
       
   980   r}. The case @{term "[]"} is trivial. For the cons-case suppose the
       
   981   string is of the form @{term "c # s"}. By induction hypothesis we
       
   982   know @{term "slexer r s = lexer r s"} holds for all @{term r} (in
       
   983   particular for @{term "r"} being the derivative @{term "der c
       
   984   r"}). Let @{term "r\<^sub>s"} be the simplified derivative regular expression, that is @{term
       
   985   "fst (simp (der c r))"}, and @{term "f\<^sub>r"} be the rectification
       
   986   function, that is @{term "snd (simp (der c r))"}.  We distinguish the cases
       
   987   whether (*) @{term "s \<in> L (der c r)"} or not. In the first case we
       
   988   have by Thm.~\ref{lexercorrect}(2) a value @{term "v"} so that @{term
       
   989   "lexer (der c r) s = Some v"} and @{term "s \<in> der c r \<rightarrow> v"} hold.
       
   990   By Lem.~\ref{slexeraux}(1) we can also infer from~(*) that @{term "s
       
   991   \<in> L r\<^sub>s"} holds.  Hence we know by Thm.~\ref{lexercorrect}(2) that
       
   992   there exists a @{term "v'"} with @{term "lexer r\<^sub>s s = Some v'"} and
       
   993   @{term "s \<in> r\<^sub>s \<rightarrow> v'"}. From the latter we know by
       
   994   Lem.~\ref{slexeraux}(2) that @{term "s \<in> der c r \<rightarrow> (f\<^sub>r v')"} holds.
       
   995   By the uniqueness of the POSIX relation (Thm.~\ref{posixdeterm}) we
       
   996   can infer that @{term v} is equal to @{term "f\<^sub>r v'"}---that is the 
       
   997   rectification function applied to @{term "v'"}
       
   998   produces the original @{term "v"}.  Now the case follows by the
       
   999   definitions of @{const lexer} and @{const slexer}.
       
  1000 
       
  1001   In the second case where @{term "s \<notin> L (der c r)"} we have that
       
  1002   @{term "lexer (der c r) s = None"} by Thm.~\ref{lexercorrect}(1).  We
       
  1003   also know by Lem.~\ref{slexeraux}(1) that @{term "s \<notin> L r\<^sub>s"}. Hence
       
  1004   @{term "lexer r\<^sub>s s = None"} by Thm.~\ref{lexercorrect}(1) and
       
  1005   by IH then also @{term "slexer r\<^sub>s s = None"}. With this we can
       
  1006   conclude in this case too.\qed   
       
  1007 
       
  1008   \end{proof} 
       
  1009 *}
       
  1010 
       
  1011 section {* The Correctness Argument by Sulzmann and Lu\label{argu} *}
       
  1012 
       
  1013 text {*
       
  1014 %  \newcommand{\greedy}{\succcurlyeq_{gr}}
       
  1015  \newcommand{\posix}{>}
       
  1016 
       
  1017   An extended version of \cite{Sulzmann2014} is available at the website of
       
  1018   its first author; this includes some ``proofs'', claimed in
       
  1019   \cite{Sulzmann2014} to be ``rigorous''. Since these are evidently not in
       
  1020   final form, we make no comment thereon, preferring to give general reasons
       
  1021   for our belief that the approach of \cite{Sulzmann2014} is problematic.
       
  1022   Their central definition is an ``ordering relation'' defined by the
       
  1023   rules (slightly adapted to fit our notation):
       
  1024 
       
  1025 \begin{center}  
       
  1026 \begin{tabular}{@ {}c@ {\hspace{4mm}}c@ {}}	
       
  1027 @{thm[mode=Rule] C2[of "v\<^sub>1" "r\<^sub>1" "v\<^sub>1\<iota>" "v\<^sub>2" "r\<^sub>2" "v\<^sub>2\<iota>"]}\,(C2) &
       
  1028 @{thm[mode=Rule] C1[of "v\<^sub>2" "r\<^sub>2" "v\<^sub>2\<iota>" "v\<^sub>1" "r\<^sub>1"]}\,(C1)\smallskip\\
       
  1029 
       
  1030 @{thm[mode=Rule] A1[of "v\<^sub>1" "v\<^sub>2" "r\<^sub>1" "r\<^sub>2"]}\,(A1) &
       
  1031 @{thm[mode=Rule] A2[of "v\<^sub>2" "v\<^sub>1" "r\<^sub>1" "r\<^sub>2"]}\,(A2)\smallskip\\
       
  1032 
       
  1033 @{thm[mode=Rule] A3[of "v\<^sub>1" "r\<^sub>2" "v\<^sub>2" "r\<^sub>1"]}\,(A3) &
       
  1034 @{thm[mode=Rule] A4[of "v\<^sub>1" "r\<^sub>1" "v\<^sub>2" "r\<^sub>2"]}\,(A4)\smallskip\\
       
  1035 
       
  1036 @{thm[mode=Rule] K1[of "v" "vs" "r"]}\,(K1) &
       
  1037 @{thm[mode=Rule] K2[of "v" "vs" "r"]}\,(K2)\smallskip\\
       
  1038 
       
  1039 @{thm[mode=Rule] K3[of "v\<^sub>1" "r" "v\<^sub>2" "vs\<^sub>1" "vs\<^sub>2"]}\,(K3) &
       
  1040 @{thm[mode=Rule] K4[of "vs\<^sub>1" "r" "vs\<^sub>2" "v"]}\,(K4)
       
  1041 \end{tabular}
       
  1042 \end{center}
       
  1043 
       
  1044   \noindent The idea behind the rules (A1) and (A2), for example, is that a
       
  1045   @{text Left}-value is bigger than a @{text Right}-value, if the underlying
       
  1046   string of the @{text Left}-value is longer or of equal length to the
       
  1047   underlying string of the @{text Right}-value. The order is reversed,
       
  1048   however, if the @{text Right}-value can match a longer string than a
       
  1049   @{text Left}-value. In this way the POSIX value is supposed to be the
       
  1050   biggest value for a given string and regular expression.
       
  1051 
       
  1052   Sulzmann and Lu explicitly refer to the paper \cite{Frisch2004} by Frisch
       
  1053   and Cardelli from where they have taken the idea for their correctness
       
  1054   proof. Frisch and Cardelli introduced a similar ordering for GREEDY
       
  1055   matching and they showed that their GREEDY matching algorithm always
       
  1056   produces a maximal element according to this ordering (from all possible
       
  1057   solutions). The only difference between their GREEDY ordering and the
       
  1058   ``ordering'' by Sulzmann and Lu is that GREEDY always prefers a @{text
       
  1059   Left}-value over a @{text Right}-value, no matter what the underlying
       
  1060   string is. This seems to be only a very minor difference, but it has
       
  1061   drastic consequences in terms of what properties both orderings enjoy.
       
  1062   What is interesting for our purposes is that the properties reflexivity,
       
  1063   totality and transitivity for this GREEDY ordering can be proved
       
  1064   relatively easily by induction.
       
  1065 
       
  1066   These properties of GREEDY, however, do not transfer to the POSIX
       
  1067   ``ordering'' by Sulzmann and Lu, which they define as @{text "v\<^sub>1 \<ge>\<^sub>r v\<^sub>2"}. 
       
  1068   To start with, @{text "v\<^sub>1 \<ge>\<^sub>r v\<^sub>2"} is
       
  1069   not defined inductively, but as $($@{term "v\<^sub>1 = v\<^sub>2"}$)$ $\vee$ $($@{term "(v\<^sub>1 >r
       
  1070   v\<^sub>2) \<and> (flat v\<^sub>1 = flat (v\<^sub>2::val))"}$)$. This means that @{term "v\<^sub>1
       
  1071   >(r::rexp) (v\<^sub>2::val)"} does not necessarily imply @{term "v\<^sub>1 \<ge>(r::rexp)
       
  1072   (v\<^sub>2::val)"}. Moreover, transitivity does not hold in the ``usual''
       
  1073   formulation, for example:
       
  1074 
       
  1075   \begin{falsehood}
       
  1076   Suppose @{term "\<turnstile> v\<^sub>1 : r"}, @{term "\<turnstile> v\<^sub>2 : r"} and @{term "\<turnstile> v\<^sub>3 : r"}.
       
  1077   If @{term "v\<^sub>1 >(r::rexp) (v\<^sub>2::val)"} and @{term "v\<^sub>2 >(r::rexp) (v\<^sub>3::val)"}
       
  1078   then @{term "v\<^sub>1 >(r::rexp) (v\<^sub>3::val)"}.
       
  1079   \end{falsehood}
       
  1080 
       
  1081   \noindent If formulated in this way, then there are various counter
       
  1082   examples: For example let @{term r} be @{text "a + ((a + a)\<cdot>(a + \<zero>))"}
       
  1083   then the @{term "v\<^sub>1"}, @{term "v\<^sub>2"} and @{term "v\<^sub>3"} below are values
       
  1084   of @{term r}:
       
  1085 
       
  1086   \begin{center}
       
  1087   \begin{tabular}{lcl}
       
  1088   @{term "v\<^sub>1"} & $=$ & @{term "Left(Char a)"}\\
       
  1089   @{term "v\<^sub>2"} & $=$ & @{term "Right(Seq (Left(Char a)) (Right Void))"}\\
       
  1090   @{term "v\<^sub>3"} & $=$ & @{term "Right(Seq (Right(Char a)) (Left(Char a)))"}
       
  1091   \end{tabular}
       
  1092   \end{center}
       
  1093 
       
  1094   \noindent Moreover @{term "v\<^sub>1 >(r::rexp) v\<^sub>2"} and @{term "v\<^sub>2 >(r::rexp)
       
  1095   v\<^sub>3"}, but \emph{not} @{term "v\<^sub>1 >(r::rexp) v\<^sub>3"}! The reason is that
       
  1096   although @{term "v\<^sub>3"} is a @{text "Right"}-value, it can match a longer
       
  1097   string, namely @{term "flat v\<^sub>3 = [a,a]"}, while @{term "flat v\<^sub>1"} (and
       
  1098   @{term "flat v\<^sub>2"}) matches only @{term "[a]"}. So transitivity in this
       
  1099   formulation does not hold---in this example actually @{term "v\<^sub>3
       
  1100   >(r::rexp) v\<^sub>1"}!
       
  1101 
       
  1102   Sulzmann and Lu ``fix'' this problem by weakening the transitivity
       
  1103   property. They require in addition that the underlying strings are of the
       
  1104   same length. This excludes the counter example above and any
       
  1105   counter-example we were able to find (by hand and by machine). Thus the
       
  1106   transitivity lemma should be formulated as:
       
  1107 
       
  1108   \begin{conject}
       
  1109   Suppose @{term "\<turnstile> v\<^sub>1 : r"}, @{term "\<turnstile> v\<^sub>2 : r"} and @{term "\<turnstile> v\<^sub>3 : r"},
       
  1110   and also @{text "|v\<^sub>1| = |v\<^sub>2| = |v\<^sub>3|"}.\\
       
  1111   If @{term "v\<^sub>1 >(r::rexp) (v\<^sub>2::val)"} and @{term "v\<^sub>2 >(r::rexp) (v\<^sub>3::val)"}
       
  1112   then @{term "v\<^sub>1 >(r::rexp) (v\<^sub>3::val)"}.
       
  1113   \end{conject}
       
  1114 
       
  1115   \noindent While we agree with Sulzmann and Lu that this property
       
  1116   probably(!) holds, proving it seems not so straightforward: although one
       
  1117   begins with the assumption that the values have the same flattening, this
       
  1118   cannot be maintained as one descends into the induction. This is a problem
       
  1119   that occurs in a number of places in the proofs by Sulzmann and Lu.
       
  1120 
       
  1121   Although they do not give an explicit proof of the transitivity property,
       
  1122   they give a closely related property about the existence of maximal
       
  1123   elements. They state that this can be verified by an induction on @{term r}. We
       
  1124   disagree with this as we shall show next in case of transitivity. The case
       
  1125   where the reasoning breaks down is the sequence case, say @{term "SEQ r\<^sub>1 r\<^sub>2"}.
       
  1126   The induction hypotheses in this case are
       
  1127 
       
  1128 \begin{center}
       
  1129 \begin{tabular}{@ {}c@ {\hspace{10mm}}c@ {}}
       
  1130 \begin{tabular}{@ {}l@ {\hspace{-7mm}}l@ {}}
       
  1131 IH @{term "r\<^sub>1"}:\\
       
  1132 @{text "\<forall> v\<^sub>1, v\<^sub>2, v\<^sub>3."} \\
       
  1133   & @{term "\<turnstile> v\<^sub>1 : r\<^sub>1"}\;@{text "\<and>"}
       
  1134     @{term "\<turnstile> v\<^sub>2 : r\<^sub>1"}\;@{text "\<and>"}
       
  1135     @{term "\<turnstile> v\<^sub>3 : r\<^sub>1"}\\
       
  1136   & @{text "\<and>"} @{text "|v\<^sub>1| = |v\<^sub>2| = |v\<^sub>3|"}\\
       
  1137   & @{text "\<and>"} @{term "v\<^sub>1 >(r\<^sub>1::rexp) v\<^sub>2 \<and> v\<^sub>2 >(r\<^sub>1::rexp) v\<^sub>3"}\medskip\\
       
  1138   & $\Rightarrow$ @{term "v\<^sub>1 >(r\<^sub>1::rexp) v\<^sub>3"}
       
  1139 \end{tabular} &
       
  1140 \begin{tabular}{@ {}l@ {\hspace{-7mm}}l@ {}}
       
  1141 IH @{term "r\<^sub>2"}:\\
       
  1142 @{text "\<forall> v\<^sub>1, v\<^sub>2, v\<^sub>3."}\\ 
       
  1143   & @{term "\<turnstile> v\<^sub>1 : r\<^sub>2"}\;@{text "\<and>"}
       
  1144     @{term "\<turnstile> v\<^sub>2 : r\<^sub>2"}\;@{text "\<and>"}
       
  1145     @{term "\<turnstile> v\<^sub>3 : r\<^sub>2"}\\
       
  1146   & @{text "\<and>"} @{text "|v\<^sub>1| = |v\<^sub>2| = |v\<^sub>3|"}\\
       
  1147   & @{text "\<and>"} @{term "v\<^sub>1 >(r\<^sub>2::rexp) v\<^sub>2 \<and> v\<^sub>2 >(r\<^sub>2::rexp) v\<^sub>3"}\medskip\\
       
  1148   & $\Rightarrow$ @{term "v\<^sub>1 >(r\<^sub>2::rexp) v\<^sub>3"}
       
  1149 \end{tabular}
       
  1150 \end{tabular}
       
  1151 \end{center} 
       
  1152 
       
  1153   \noindent We can assume that
       
  1154   %
       
  1155   \begin{equation}
       
  1156   @{term "(Seq (v\<^sub>1\<^sub>l) (v\<^sub>1\<^sub>r)) >(SEQ r\<^sub>1 r\<^sub>2) (Seq (v\<^sub>2\<^sub>l) (v\<^sub>2\<^sub>r))"}
       
  1157   \qquad\textrm{and}\qquad
       
  1158   @{term "(Seq (v\<^sub>2\<^sub>l) (v\<^sub>2\<^sub>r)) >(SEQ r\<^sub>1 r\<^sub>2) (Seq (v\<^sub>3\<^sub>l) (v\<^sub>3\<^sub>r))"}
       
  1159   \label{assms}
       
  1160   \end{equation}
       
  1161 
       
  1162   \noindent hold, and furthermore that the values have equal length, namely:
       
  1163   %
       
  1164   \begin{equation}
       
  1165   @{term "flat (Seq (v\<^sub>1\<^sub>l) (v\<^sub>1\<^sub>r)) = flat (Seq (v\<^sub>2\<^sub>l) (v\<^sub>2\<^sub>r))"}
       
  1166   \qquad\textrm{and}\qquad
       
  1167   @{term "flat (Seq (v\<^sub>2\<^sub>l) (v\<^sub>2\<^sub>r)) = flat (Seq (v\<^sub>3\<^sub>l) (v\<^sub>3\<^sub>r))"}
       
  1168   \label{lens}
       
  1169   \end{equation} 
       
  1170 
       
  1171   \noindent We need to show that @{term "(Seq (v\<^sub>1\<^sub>l) (v\<^sub>1\<^sub>r)) >(SEQ r\<^sub>1 r\<^sub>2)
       
  1172   (Seq (v\<^sub>3\<^sub>l) (v\<^sub>3\<^sub>r))"} holds. We can proceed by analysing how the
       
  1173   assumptions in \eqref{assms} have arisen. There are four cases. Let us
       
  1174   assume we are in the case where we know
       
  1175 
       
  1176   \[
       
  1177   @{term "v\<^sub>1\<^sub>l >(r\<^sub>1::rexp) v\<^sub>2\<^sub>l"}
       
  1178   \qquad\textrm{and}\qquad
       
  1179   @{term "v\<^sub>2\<^sub>l >(r\<^sub>1::rexp) v\<^sub>3\<^sub>l"}
       
  1180   \]
       
  1181 
       
  1182   \noindent and also know the corresponding inhabitation judgements. This is
       
  1183   exactly a case where we would like to apply the induction hypothesis
       
  1184   IH~$r_1$. But we cannot! We still need to show that @{term "flat (v\<^sub>1\<^sub>l) =
       
  1185   flat(v\<^sub>2\<^sub>l)"} and @{term "flat(v\<^sub>2\<^sub>l) = flat(v\<^sub>3\<^sub>l)"}. We know from
       
  1186   \eqref{lens} that the lengths of the sequence values are equal, but from
       
  1187   this we cannot infer anything about the lengths of the component values.
       
  1188   Indeed in general they will be unequal, that is
       
  1189 
       
  1190   \[
       
  1191   @{term "flat(v\<^sub>1\<^sub>l) \<noteq> flat(v\<^sub>2\<^sub>l)"}  
       
  1192   \qquad\textrm{and}\qquad
       
  1193   @{term "flat(v\<^sub>1\<^sub>r) \<noteq> flat(v\<^sub>2\<^sub>r)"}
       
  1194   \]
       
  1195 
       
  1196   \noindent but still \eqref{lens} will hold. Now we are stuck, since the IH
       
  1197   does not apply. As said, this problem where the induction hypothesis does
       
  1198   not apply arises in several places in the proof of Sulzmann and Lu, not
       
  1199   just for proving transitivity.
       
  1200 
       
  1201 *}
       
  1202 
       
  1203 section {* Conclusion *}
       
  1204 
       
  1205 text {*
       
  1206 
       
  1207   We have implemented the POSIX value calculation algorithm introduced by
       
  1208   Sulzmann and Lu
       
  1209   \cite{Sulzmann2014}. Our implementation is nearly identical to the
       
  1210   original and all modifications we introduced are harmless (like our char-clause for
       
  1211   @{text inj}). We have proved this algorithm to be correct, but correct
       
  1212   according to our own specification of what POSIX values are. Our
       
  1213   specification (inspired from work by Vansummeren \cite{Vansummeren2006}) appears to be
       
  1214   much simpler than in \cite{Sulzmann2014} and our proofs are nearly always
       
  1215   straightforward. We have attempted to formalise the original proof
       
  1216   by Sulzmann and Lu \cite{Sulzmann2014}, but we believe it contains
       
  1217   unfillable gaps. In the online version of \cite{Sulzmann2014}, the authors
       
  1218   already acknowledge some small problems, but our experience suggests
       
  1219   that there are more serious problems. 
       
  1220   
       
  1221   Having proved the correctness of the POSIX lexing algorithm in
       
  1222   \cite{Sulzmann2014}, which lessons have we learned? Well, this is a
       
  1223   perfect example for the importance of the \emph{right} definitions. We
       
  1224   have (on and off) explored mechanisations as soon as first versions
       
  1225   of \cite{Sulzmann2014} appeared, but have made little progress with
       
  1226   turning the relatively detailed proof sketch in \cite{Sulzmann2014} into a
       
  1227   formalisable proof. Having seen \cite{Vansummeren2006} and adapted the
       
  1228   POSIX definition given there for the algorithm by Sulzmann and Lu made all
       
  1229   the difference: the proofs, as said, are nearly straightforward. The
       
  1230   question remains whether the original proof idea of \cite{Sulzmann2014},
       
  1231   potentially using our result as a stepping stone, can be made to work?
       
  1232   Alas, we really do not know despite considerable effort.
       
  1233 
       
  1234 
       
  1235   Closely related to our work is an automata-based lexer formalised by
       
  1236   Nipkow \cite{Nipkow98}. This lexer also splits up strings into longest
       
  1237   initial substrings, but Nipkow's algorithm is not completely
       
  1238   computational. The algorithm by Sulzmann and Lu, in contrast, can be
       
  1239   implemented with ease in any functional language. A bespoke lexer for the
       
  1240   Imp-language is formalised in Coq as part of the Software Foundations book
       
  1241   by Pierce et al \cite{Pierce2015}. The disadvantage of such bespoke lexers is that they
       
  1242   do not generalise easily to more advanced features.
       
  1243   Our formalisation is available from the Archive of Formal Proofs \cite{aduAFP16}
       
  1244   under \url{http://www.isa-afp.org/entries/Posix-Lexing.shtml}.\medskip
       
  1245 
       
  1246   \noindent
       
  1247   {\bf Acknowledgements:}
       
  1248   We are very grateful to Martin Sulzmann for his comments on our work and 
       
  1249   moreover for patiently explaining to us the details in \cite{Sulzmann2014}. We
       
  1250   also received very helpful comments from James Cheney and anonymous referees.
       
  1251   %  \small
       
  1252   \bibliographystyle{plain}
       
  1253   \bibliography{root}
       
  1254 
       
  1255 *}
       
  1256 
       
  1257 
       
  1258 (*<*)
       
  1259 end
       
  1260 (*>*)