(*<*)
theory Paper
imports
"../Lexer"
"../Simplifying"
"../Positions"
"../Sulzmann"
"~~/src/HOL/Library/LaTeXsugar"
begin
lemma Suc_0_fold:
"Suc 0 = 1"
by simp
declare [[show_question_marks = false]]
syntax (latex output)
"_Collect" :: "pttrn => bool => 'a set" ("(1{_ \<^latex>\<open>\\mbox{\\boldmath$\\mid$}\<close> _})")
"_CollectIn" :: "pttrn => 'a set => bool => 'a set" ("(1{_ \<in> _ |e _})")
syntax
"_Not_Ex" :: "idts \<Rightarrow> bool \<Rightarrow> bool" ("(3\<nexists>_.a./ _)" [0, 10] 10)
"_Not_Ex1" :: "pttrn \<Rightarrow> bool \<Rightarrow> bool" ("(3\<nexists>!_.a./ _)" [0, 10] 10)
abbreviation
"der_syn r c \<equiv> der c r"
abbreviation
"ders_syn r s \<equiv> ders s r"
abbreviation
"bder_syn r c \<equiv> bder c r"
abbreviation
"bders_syn r s \<equiv> bders r s"
abbreviation
"nprec v1 v2 \<equiv> \<not>(v1 :\<sqsubset>val v2)"
notation (latex output)
If ("(\<^latex>\<open>\\textrm{\<close>if\<^latex>\<open>}\<close> (_)/ \<^latex>\<open>\\textrm{\<close>then\<^latex>\<open>}\<close> (_)/ \<^latex>\<open>\\textrm{\<close>else\<^latex>\<open>}\<close> (_))" 10) and
Cons ("_\<^latex>\<open>\\mbox{$\\,$}\<close>::\<^latex>\<open>\\mbox{$\\,$}\<close>_" [75,73] 73) and
ZERO ("\<^bold>0" 81) and
ONE ("\<^bold>1" 81) and
CHAR ("_" [1000] 80) and
ALT ("_ + _" [77,77] 78) and
SEQ ("_ \<cdot> _" [77,77] 78) and
STAR ("_\<^sup>\<star>" [79] 78) and
val.Void ("Empty" 78) and
val.Char ("Char _" [1000] 78) and
val.Left ("Left _" [79] 78) and
val.Right ("Right _" [1000] 78) and
val.Seq ("Seq _ _" [79,79] 78) and
val.Stars ("Stars _" [79] 78) and
L ("L'(_')" [10] 78) and
LV ("LV _ _" [80,73] 78) and
der_syn ("_\\_" [79, 1000] 76) and
ders_syn ("_\\_" [79, 1000] 76) and
flat ("|_|" [75] 74) and
flats ("|_|" [72] 74) and
Sequ ("_ @ _" [78,77] 63) and
injval ("inj _ _ _" [79,77,79] 76) and
mkeps ("mkeps _" [79] 76) and
length ("len _" [73] 73) and
intlen ("len _" [73] 73) and
set ("_" [73] 73) and
Prf ("_ : _" [75,75] 75) and
Posix ("'(_, _') \<rightarrow> _" [63,75,75] 75) and
lexer ("lexer _ _" [78,78] 77) and
F_RIGHT ("F\<^bsub>Right\<^esub> _") and
F_LEFT ("F\<^bsub>Left\<^esub> _") and
F_ALT ("F\<^bsub>Alt\<^esub> _ _") and
F_SEQ1 ("F\<^bsub>Seq1\<^esub> _ _") and
F_SEQ2 ("F\<^bsub>Seq2\<^esub> _ _") and
F_SEQ ("F\<^bsub>Seq\<^esub> _ _") and
simp_SEQ ("simp\<^bsub>Seq\<^esub> _ _" [1000, 1000] 1) and
simp_ALT ("simp\<^bsub>Alt\<^esub> _ _" [1000, 1000] 1) and
slexer ("lexer\<^sup>+" 1000) and
at ("_\<^latex>\<open>\\mbox{$\\downharpoonleft$}\<close>\<^bsub>_\<^esub>") and
lex_list ("_ \<prec>\<^bsub>lex\<^esub> _") and
PosOrd ("_ \<prec>\<^bsub>_\<^esub> _" [77,77,77] 77) and
PosOrd_ex ("_ \<prec> _" [77,77] 77) and
PosOrd_ex_eq ("_ \<^latex>\<open>\\mbox{$\\preccurlyeq$}\<close> _" [77,77] 77) and
pflat_len ("\<parallel>_\<parallel>\<^bsub>_\<^esub>") and
nprec ("_ \<^latex>\<open>\\mbox{$\\not\\prec$}\<close> _" [77,77] 77) and
bder_syn ("_\<^latex>\<open>\\mbox{$\\bbslash$}\<close>_" [79, 1000] 76) and
bders_syn ("_\<^latex>\<open>\\mbox{$\\bbslash$}\<close>_" [79, 1000] 76) and
intern ("_\<^latex>\<open>\\mbox{$^\\uparrow$}\<close>" [900] 80) and
erase ("_\<^latex>\<open>\\mbox{$^\\downarrow$}\<close>" [1000] 74) and
bnullable ("nullable\<^latex>\<open>\\mbox{$_b$}\<close> _" [1000] 80) and
bmkeps ("mkeps\<^latex>\<open>\\mbox{$_b$}\<close> _" [1000] 80) and
blexer ("lexer\<^latex>\<open>\\mbox{$_b$}\<close> _ _" [77, 77] 80) and
code ("code _" [79] 74) and
DUMMY ("\<^latex>\<open>\\underline{\\hspace{2mm}}\<close>")
definition
"match r s \<equiv> nullable (ders s r)"
lemma LV_STAR_ONE_empty:
shows "LV (STAR ONE) [] = {Stars []}"
by(auto simp add: LV_def elim: Prf.cases intro: Prf.intros)
(*
comments not implemented
p9. The condition "not exists s3 s4..." appears often enough (in particular in
the proof of Lemma 3) to warrant a definition.
*)
(*>*)
section {* Introduction *}
text {*
Brzozowski \cite{Brzozowski1964} introduced the notion of the {\em
derivative} @{term "der c r"} of a regular expression @{text r} w.r.t.\
a character~@{text c}, and showed that it gave a simple solution to the
problem of matching a string @{term s} with a regular expression @{term
r}: if the derivative of @{term r} w.r.t.\ (in succession) all the
characters of the string matches the empty string, then @{term r}
matches @{term s} (and {\em vice versa}). The derivative has the
property (which may almost be regarded as its specification) that, for
every string @{term s} and regular expression @{term r} and character
@{term c}, one has @{term "cs \<in> L(r)"} if and only if \mbox{@{term "s \<in> L(der c r)"}}.
The beauty of Brzozowski's derivatives is that
they are neatly expressible in any functional language, and easily
definable and reasoned about in theorem provers---the definitions just
consist of inductive datatypes and simple recursive functions. A
mechanised correctness proof of Brzozowski's matcher in for example HOL4
has been mentioned by Owens and Slind~\cite{Owens2008}. Another one in
Isabelle/HOL is part of the work by Krauss and Nipkow \cite{Krauss2011}.
And another one in Coq is given by Coquand and Siles \cite{Coquand2012}.
If a regular expression matches a string, then in general there is more
than one way of how the string is matched. There are two commonly used
disambiguation strategies to generate a unique answer: one is called
GREEDY matching \cite{Frisch2004} and the other is POSIX
matching~\cite{POSIX,Kuklewicz,OkuiSuzuki2010,Sulzmann2014,Vansummeren2006}.
For example consider the string @{term xy} and the regular expression
\mbox{@{term "STAR (ALT (ALT x y) xy)"}}. Either the string can be
matched in two `iterations' by the single letter-regular expressions
@{term x} and @{term y}, or directly in one iteration by @{term xy}. The
first case corresponds to GREEDY matching, which first matches with the
left-most symbol and only matches the next symbol in case of a mismatch
(this is greedy in the sense of preferring instant gratification to
delayed repletion). The second case is POSIX matching, which prefers the
longest match.
In the context of lexing, where an input string needs to be split up
into a sequence of tokens, POSIX is the more natural disambiguation
strategy for what programmers consider basic syntactic building blocks
in their programs. These building blocks are often specified by some
regular expressions, say @{text "r\<^bsub>key\<^esub>"} and @{text
"r\<^bsub>id\<^esub>"} for recognising keywords and identifiers,
respectively. There are a few underlying (informal) rules behind
tokenising a string in a POSIX \cite{POSIX} fashion:
\begin{itemize}
\item[$\bullet$] \emph{The Longest Match Rule} (or \emph{``{M}aximal {M}unch {R}ule''}):
The longest initial substring matched by any regular expression is taken as
next token.\smallskip
\item[$\bullet$] \emph{Priority Rule:}
For a particular longest initial substring, the first (leftmost) regular expression
that can match determines the token.\smallskip
\item[$\bullet$] \emph{Star Rule:} A subexpression repeated by ${}^\star$ shall
not match an empty string unless this is the only match for the repetition.\smallskip
\item[$\bullet$] \emph{Empty String Rule:} An empty string shall be considered to
be longer than no match at all.
\end{itemize}
\noindent Consider for example a regular expression @{text
"r\<^bsub>key\<^esub>"} for recognising keywords such as @{text "if"},
@{text "then"} and so on; and @{text "r\<^bsub>id\<^esub>"}
recognising identifiers (say, a single character followed by
characters or numbers). Then we can form the regular expression
@{text "(r\<^bsub>key\<^esub> + r\<^bsub>id\<^esub>)\<^sup>\<star>"}
and use POSIX matching to tokenise strings, say @{text "iffoo"} and
@{text "if"}. For @{text "iffoo"} we obtain by the Longest Match Rule
a single identifier token, not a keyword followed by an
identifier. For @{text "if"} we obtain by the Priority Rule a keyword
token, not an identifier token---even if @{text "r\<^bsub>id\<^esub>"}
matches also. By the Star Rule we know @{text "(r\<^bsub>key\<^esub> +
r\<^bsub>id\<^esub>)\<^sup>\<star>"} matches @{text "iffoo"},
respectively @{text "if"}, in exactly one `iteration' of the star. The
Empty String Rule is for cases where, for example, the regular expression
@{text "(a\<^sup>\<star>)\<^sup>\<star>"} matches against the
string @{text "bc"}. Then the longest initial matched substring is the
empty string, which is matched by both the whole regular expression
and the parenthesised subexpression.
One limitation of Brzozowski's matcher is that it only generates a
YES/NO answer for whether a string is being matched by a regular
expression. Sulzmann and Lu~\cite{Sulzmann2014} extended this matcher
to allow generation not just of a YES/NO answer but of an actual
matching, called a [lexical] {\em value}. Assuming a regular
expression matches a string, values encode the information of
\emph{how} the string is matched by the regular expression---that is,
which part of the string is matched by which part of the regular
expression. For this consider again the string @{text "xy"} and
the regular expression \mbox{@{text "(x + (y + xy))\<^sup>\<star>"}}
(this time fully parenthesised). We can view this regular expression
as tree and if the string @{text xy} is matched by two Star
`iterations', then the @{text x} is matched by the left-most
alternative in this tree and the @{text y} by the right-left alternative. This
suggests to record this matching as
\begin{center}
@{term "Stars [Left(Char x), Right(Left(Char y))]"}
\end{center}
\noindent where @{const Stars}, @{text Left}, @{text Right} and @{text
Char} are constructors for values. @{text Stars} records how many
iterations were used; @{text Left}, respectively @{text Right}, which
alternative is used. This `tree view' leads naturally to the idea that
regular expressions act as types and values as inhabiting those types
(see, for example, \cite{HosoyaVouillonPierce2005}). The value for
matching @{text "xy"} in a single `iteration', i.e.~the POSIX value,
would look as follows
\begin{center}
@{term "Stars [Seq (Char x) (Char y)]"}
\end{center}
\noindent where @{const Stars} has only a single-element list for the
single iteration and @{const Seq} indicates that @{term xy} is matched
by a sequence regular expression.
%, which we will in what follows
%write more formally as @{term "SEQ x y"}.
Sulzmann and Lu give a simple algorithm to calculate a value that
appears to be the value associated with POSIX matching. The challenge
then is to specify that value, in an algorithm-independent fashion,
and to show that Sulzmann and Lu's derivative-based algorithm does
indeed calculate a value that is correct according to the
specification. The answer given by Sulzmann and Lu
\cite{Sulzmann2014} is to define a relation (called an ``order
relation'') on the set of values of @{term r}, and to show that (once
a string to be matched is chosen) there is a maximum element and that
it is computed by their derivative-based algorithm. This proof idea is
inspired by work of Frisch and Cardelli \cite{Frisch2004} on a GREEDY
regular expression matching algorithm. However, we were not able to
establish transitivity and totality for the ``order relation'' by
Sulzmann and Lu. There are some inherent problems with their approach
(of which some of the proofs are not published in
\cite{Sulzmann2014}); perhaps more importantly, we give in this paper
a simple inductive (and algorithm-independent) definition of what we
call being a {\em POSIX value} for a regular expression @{term r} and
a string @{term s}; we show that the algorithm by Sulzmann and Lu
computes such a value and that such a value is unique. Our proofs are
both done by hand and checked in Isabelle/HOL. The experience of
doing our proofs has been that this mechanical checking was absolutely
essential: this subject area has hidden snares. This was also noted by
Kuklewicz \cite{Kuklewicz} who found that nearly all POSIX matching
implementations are ``buggy'' \cite[Page 203]{Sulzmann2014} and by
Grathwohl et al \cite[Page 36]{CrashCourse2014} who wrote:
\begin{quote}
\it{}``The POSIX strategy is more complicated than the greedy because of
the dependence on information about the length of matched strings in the
various subexpressions.''
\end{quote}
\noindent {\bf Contributions:} We have implemented in Isabelle/HOL the
derivative-based regular expression matching algorithm of
Sulzmann and Lu \cite{Sulzmann2014}. We have proved the correctness of this
algorithm according to our specification of what a POSIX value is (inspired
by work of Vansummeren \cite{Vansummeren2006}). Sulzmann
and Lu sketch in \cite{Sulzmann2014} an informal correctness proof: but to
us it contains unfillable gaps.\footnote{An extended version of
\cite{Sulzmann2014} is available at the website of its first author; this
extended version already includes remarks in the appendix that their
informal proof contains gaps, and possible fixes are not fully worked out.}
Our specification of a POSIX value consists of a simple inductive definition
that given a string and a regular expression uniquely determines this value.
We also show that our definition is equivalent to an ordering
of values based on positions by Okui and Suzuki \cite{OkuiSuzuki2010}.
%Derivatives as calculated by Brzozowski's method are usually more complex
%regular expressions than the initial one; various optimisations are
%possible. We prove the correctness when simplifications of @{term "ALT ZERO r"},
%@{term "ALT r ZERO"}, @{term "SEQ ONE r"} and @{term "SEQ r ONE"} to
%@{term r} are applied.
We extend our results to ??? Bitcoded version??
*}
section {* Preliminaries *}
text {* \noindent Strings in Isabelle/HOL are lists of characters with
the empty string being represented by the empty list, written @{term
"[]"}, and list-cons being written as @{term "DUMMY # DUMMY"}. Often
we use the usual bracket notation for lists also for strings; for
example a string consisting of just a single character @{term c} is
written @{term "[c]"}. We use the usual definitions for
\emph{prefixes} and \emph{strict prefixes} of strings. By using the
type @{type char} for characters we have a supply of finitely many
characters roughly corresponding to the ASCII character set. Regular
expressions are defined as usual as the elements of the following
inductive datatype:
\begin{center}
@{text "r :="}
@{const "ZERO"} $\mid$
@{const "ONE"} $\mid$
@{term "CHAR c"} $\mid$
@{term "ALT r\<^sub>1 r\<^sub>2"} $\mid$
@{term "SEQ r\<^sub>1 r\<^sub>2"} $\mid$
@{term "STAR r"}
\end{center}
\noindent where @{const ZERO} stands for the regular expression that does
not match any string, @{const ONE} for the regular expression that matches
only the empty string and @{term c} for matching a character literal. The
language of a regular expression is also defined as usual by the
recursive function @{term L} with the six clauses:
\begin{center}
\begin{tabular}{l@ {\hspace{4mm}}rcl}
\textit{(1)} & @{thm (lhs) L.simps(1)} & $\dn$ & @{thm (rhs) L.simps(1)}\\
\textit{(2)} & @{thm (lhs) L.simps(2)} & $\dn$ & @{thm (rhs) L.simps(2)}\\
\textit{(3)} & @{thm (lhs) L.simps(3)} & $\dn$ & @{thm (rhs) L.simps(3)}\\
\textit{(4)} & @{thm (lhs) L.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ &
@{thm (rhs) L.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
\textit{(5)} & @{thm (lhs) L.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ &
@{thm (rhs) L.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
\textit{(6)} & @{thm (lhs) L.simps(6)} & $\dn$ & @{thm (rhs) L.simps(6)}\\
\end{tabular}
\end{center}
\noindent In clause \textit{(4)} we use the operation @{term "DUMMY ;;
DUMMY"} for the concatenation of two languages (it is also list-append for
strings). We use the star-notation for regular expressions and for
languages (in the last clause above). The star for languages is defined
inductively by two clauses: @{text "(i)"} the empty string being in
the star of a language and @{text "(ii)"} if @{term "s\<^sub>1"} is in a
language and @{term "s\<^sub>2"} in the star of this language, then also @{term
"s\<^sub>1 @ s\<^sub>2"} is in the star of this language. It will also be convenient
to use the following notion of a \emph{semantic derivative} (or \emph{left
quotient}) of a language defined as
%
\begin{center}
@{thm Der_def}\;.
\end{center}
\noindent
For semantic derivatives we have the following equations (for example
mechanically proved in \cite{Krauss2011}):
%
\begin{equation}\label{SemDer}
\begin{array}{lcl}
@{thm (lhs) Der_null} & \dn & @{thm (rhs) Der_null}\\
@{thm (lhs) Der_empty} & \dn & @{thm (rhs) Der_empty}\\
@{thm (lhs) Der_char} & \dn & @{thm (rhs) Der_char}\\
@{thm (lhs) Der_union} & \dn & @{thm (rhs) Der_union}\\
@{thm (lhs) Der_Sequ} & \dn & @{thm (rhs) Der_Sequ}\\
@{thm (lhs) Der_star} & \dn & @{thm (rhs) Der_star}
\end{array}
\end{equation}
\noindent \emph{\Brz's derivatives} of regular expressions
\cite{Brzozowski1964} can be easily defined by two recursive functions:
the first is from regular expressions to booleans (implementing a test
when a regular expression can match the empty string), and the second
takes a regular expression and a character to a (derivative) regular
expression:
\begin{center}
\begin{tabular}{lcl}
@{thm (lhs) nullable.simps(1)} & $\dn$ & @{thm (rhs) nullable.simps(1)}\\
@{thm (lhs) nullable.simps(2)} & $\dn$ & @{thm (rhs) nullable.simps(2)}\\
@{thm (lhs) nullable.simps(3)} & $\dn$ & @{thm (rhs) nullable.simps(3)}\\
@{thm (lhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
@{thm (lhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
@{thm (lhs) nullable.simps(6)} & $\dn$ & @{thm (rhs) nullable.simps(6)}\medskip\\
% \end{tabular}
% \end{center}
% \begin{center}
% \begin{tabular}{lcl}
@{thm (lhs) der.simps(1)} & $\dn$ & @{thm (rhs) der.simps(1)}\\
@{thm (lhs) der.simps(2)} & $\dn$ & @{thm (rhs) der.simps(2)}\\
@{thm (lhs) der.simps(3)} & $\dn$ & @{thm (rhs) der.simps(3)}\\
@{thm (lhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]}\\
@{thm (lhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]}\\
@{thm (lhs) der.simps(6)} & $\dn$ & @{thm (rhs) der.simps(6)}
\end{tabular}
\end{center}
\noindent
We may extend this definition to give derivatives w.r.t.~strings:
\begin{center}
\begin{tabular}{lcl}
@{thm (lhs) ders.simps(1)} & $\dn$ & @{thm (rhs) ders.simps(1)}\\
@{thm (lhs) ders.simps(2)} & $\dn$ & @{thm (rhs) ders.simps(2)}\\
\end{tabular}
\end{center}
\noindent Given the equations in \eqref{SemDer}, it is a relatively easy
exercise in mechanical reasoning to establish that
\begin{proposition}\label{derprop}\mbox{}\\
\begin{tabular}{ll}
\textit{(1)} & @{thm (lhs) nullable_correctness} if and only if
@{thm (rhs) nullable_correctness}, and \\
\textit{(2)} & @{thm[mode=IfThen] der_correctness}.
\end{tabular}
\end{proposition}
\noindent With this in place it is also very routine to prove that the
regular expression matcher defined as
%
\begin{center}
@{thm match_def}
\end{center}
\noindent gives a positive answer if and only if @{term "s \<in> L r"}.
Consequently, this regular expression matching algorithm satisfies the
usual specification for regular expression matching. While the matcher
above calculates a provably correct YES/NO answer for whether a regular
expression matches a string or not, the novel idea of Sulzmann and Lu
\cite{Sulzmann2014} is to append another phase to this algorithm in order
to calculate a [lexical] value. We will explain the details next.
*}
section {* POSIX Regular Expression Matching\label{posixsec} *}
text {*
There have been many previous works that use values for encoding
\emph{how} a regular expression matches a string.
The clever idea by Sulzmann and Lu \cite{Sulzmann2014} is to
define a function on values that mirrors (but inverts) the
construction of the derivative on regular expressions. \emph{Values}
are defined as the inductive datatype
\begin{center}
@{text "v :="}
@{const "Void"} $\mid$
@{term "val.Char c"} $\mid$
@{term "Left v"} $\mid$
@{term "Right v"} $\mid$
@{term "Seq v\<^sub>1 v\<^sub>2"} $\mid$
@{term "Stars vs"}
\end{center}
\noindent where we use @{term vs} to stand for a list of
values. (This is similar to the approach taken by Frisch and
Cardelli for GREEDY matching \cite{Frisch2004}, and Sulzmann and Lu
for POSIX matching \cite{Sulzmann2014}). The string underlying a
value can be calculated by the @{const flat} function, written
@{term "flat DUMMY"} and defined as:
\begin{center}
\begin{tabular}[t]{lcl}
@{thm (lhs) flat.simps(1)} & $\dn$ & @{thm (rhs) flat.simps(1)}\\
@{thm (lhs) flat.simps(2)} & $\dn$ & @{thm (rhs) flat.simps(2)}\\
@{thm (lhs) flat.simps(3)} & $\dn$ & @{thm (rhs) flat.simps(3)}\\
@{thm (lhs) flat.simps(4)} & $\dn$ & @{thm (rhs) flat.simps(4)}
\end{tabular}\hspace{14mm}
\begin{tabular}[t]{lcl}
@{thm (lhs) flat.simps(5)[of "v\<^sub>1" "v\<^sub>2"]} & $\dn$ & @{thm (rhs) flat.simps(5)[of "v\<^sub>1" "v\<^sub>2"]}\\
@{thm (lhs) flat.simps(6)} & $\dn$ & @{thm (rhs) flat.simps(6)}\\
@{thm (lhs) flat.simps(7)} & $\dn$ & @{thm (rhs) flat.simps(7)}\\
\end{tabular}
\end{center}
\noindent We will sometimes refer to the underlying string of a
value as \emph{flattened value}. We will also overload our notation and
use @{term "flats vs"} for flattening a list of values and concatenating
the resulting strings.
Sulzmann and Lu define
inductively an \emph{inhabitation relation} that associates values to
regular expressions. We define this relation as
follows:\footnote{Note that the rule for @{term Stars} differs from
our earlier paper \cite{AusafDyckhoffUrban2016}. There we used the
original definition by Sulzmann and Lu which does not require that
the values @{term "v \<in> set vs"} flatten to a non-empty
string. The reason for introducing the more restricted version of
lexical values is convenience later on when reasoning about an
ordering relation for values.}
\begin{center}
\begin{tabular}{c@ {\hspace{12mm}}c}\label{prfintros}
\\[-8mm]
@{thm[mode=Axiom] Prf.intros(4)} &
@{thm[mode=Axiom] Prf.intros(5)[of "c"]}\\[4mm]
@{thm[mode=Rule] Prf.intros(2)[of "v\<^sub>1" "r\<^sub>1" "r\<^sub>2"]} &
@{thm[mode=Rule] Prf.intros(3)[of "v\<^sub>2" "r\<^sub>1" "r\<^sub>2"]}\\[4mm]
@{thm[mode=Rule] Prf.intros(1)[of "v\<^sub>1" "r\<^sub>1" "v\<^sub>2" "r\<^sub>2"]} &
@{thm[mode=Rule] Prf.intros(6)[of "vs"]}
\end{tabular}
\end{center}
\noindent where in the clause for @{const "Stars"} we use the
notation @{term "v \<in> set vs"} for indicating that @{text v} is a
member in the list @{text vs}. We require in this rule that every
value in @{term vs} flattens to a non-empty string. The idea is that
@{term "Stars"}-values satisfy the informal Star Rule (see Introduction)
where the $^\star$ does not match the empty string unless this is
the only match for the repetition. Note also that no values are
associated with the regular expression @{term ZERO}, and that the
only value associated with the regular expression @{term ONE} is
@{term Void}. It is routine to establish how values ``inhabiting''
a regular expression correspond to the language of a regular
expression, namely
\begin{proposition}\label{inhabs}
@{thm L_flat_Prf}
\end{proposition}
\noindent
Given a regular expression @{text r} and a string @{text s}, we define the
set of all \emph{Lexical Values} inhabited by @{text r} with the underlying string
being @{text s}:\footnote{Okui and Suzuki refer to our lexical values
as \emph{canonical values} in \cite{OkuiSuzuki2010}. The notion of \emph{non-problematic
values} by Cardelli and Frisch \cite{Frisch2004} is related, but not identical
to our lexical values.}
\begin{center}
@{thm LV_def}
\end{center}
\noindent The main property of @{term "LV r s"} is that it is alway finite.
\begin{proposition}
@{thm LV_finite}
\end{proposition}
\noindent This finiteness property does not hold in general if we
remove the side-condition about @{term "flat v \<noteq> []"} in the
@{term Stars}-rule above. For example using Sulzmann and Lu's
less restrictive definition, @{term "LV (STAR ONE) []"} would contain
infinitely many values, but according to our more restricted
definition only a single value, namely @{thm LV_STAR_ONE_empty}.
If a regular expression @{text r} matches a string @{text s}, then
generally the set @{term "LV r s"} is not just a singleton set. In
case of POSIX matching the problem is to calculate the unique lexical value
that satisfies the (informal) POSIX rules from the Introduction.
Graphically the POSIX value calculation algorithm by Sulzmann and Lu
can be illustrated by the picture in Figure~\ref{Sulz} where the
path from the left to the right involving @{term
derivatives}/@{const nullable} is the first phase of the algorithm
(calculating successive \Brz's derivatives) and @{const
mkeps}/@{text inj}, the path from right to left, the second
phase. This picture shows the steps required when a regular
expression, say @{text "r\<^sub>1"}, matches the string @{term
"[a,b,c]"}. We first build the three derivatives (according to
@{term a}, @{term b} and @{term c}). We then use @{const nullable}
to find out whether the resulting derivative regular expression
@{term "r\<^sub>4"} can match the empty string. If yes, we call the
function @{const mkeps} that produces a value @{term "v\<^sub>4"}
for how @{term "r\<^sub>4"} can match the empty string (taking into
account the POSIX constraints in case there are several ways). This
function is defined by the clauses:
\begin{figure}[t]
\begin{center}
\begin{tikzpicture}[scale=2,node distance=1.3cm,
every node/.style={minimum size=6mm}]
\node (r1) {@{term "r\<^sub>1"}};
\node (r2) [right=of r1]{@{term "r\<^sub>2"}};
\draw[->,line width=1mm](r1)--(r2) node[above,midway] {@{term "der a DUMMY"}};
\node (r3) [right=of r2]{@{term "r\<^sub>3"}};
\draw[->,line width=1mm](r2)--(r3) node[above,midway] {@{term "der b DUMMY"}};
\node (r4) [right=of r3]{@{term "r\<^sub>4"}};
\draw[->,line width=1mm](r3)--(r4) node[above,midway] {@{term "der c DUMMY"}};
\draw (r4) node[anchor=west] {\;\raisebox{3mm}{@{term nullable}}};
\node (v4) [below=of r4]{@{term "v\<^sub>4"}};
\draw[->,line width=1mm](r4) -- (v4);
\node (v3) [left=of v4] {@{term "v\<^sub>3"}};
\draw[->,line width=1mm](v4)--(v3) node[below,midway] {@{text "inj r\<^sub>3 c"}};
\node (v2) [left=of v3]{@{term "v\<^sub>2"}};
\draw[->,line width=1mm](v3)--(v2) node[below,midway] {@{text "inj r\<^sub>2 b"}};
\node (v1) [left=of v2] {@{term "v\<^sub>1"}};
\draw[->,line width=1mm](v2)--(v1) node[below,midway] {@{text "inj r\<^sub>1 a"}};
\draw (r4) node[anchor=north west] {\;\raisebox{-8mm}{@{term "mkeps"}}};
\end{tikzpicture}
\end{center}
\mbox{}\\[-13mm]
\caption{The two phases of the algorithm by Sulzmann \& Lu \cite{Sulzmann2014},
matching the string @{term "[a,b,c]"}. The first phase (the arrows from
left to right) is \Brz's matcher building successive derivatives. If the
last regular expression is @{term nullable}, then the functions of the
second phase are called (the top-down and right-to-left arrows): first
@{term mkeps} calculates a value @{term "v\<^sub>4"} witnessing
how the empty string has been recognised by @{term "r\<^sub>4"}. After
that the function @{term inj} ``injects back'' the characters of the string into
the values.
\label{Sulz}}
\end{figure}
\begin{center}
\begin{tabular}{lcl}
@{thm (lhs) mkeps.simps(1)} & $\dn$ & @{thm (rhs) mkeps.simps(1)}\\
@{thm (lhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]}\\
@{thm (lhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]}\\
@{thm (lhs) mkeps.simps(4)} & $\dn$ & @{thm (rhs) mkeps.simps(4)}\\
\end{tabular}
\end{center}
\noindent Note that this function needs only to be partially defined,
namely only for regular expressions that are nullable. In case @{const
nullable} fails, the string @{term "[a,b,c]"} cannot be matched by @{term
"r\<^sub>1"} and the null value @{term "None"} is returned. Note also how this function
makes some subtle choices leading to a POSIX value: for example if an
alternative regular expression, say @{term "ALT r\<^sub>1 r\<^sub>2"}, can
match the empty string and furthermore @{term "r\<^sub>1"} can match the
empty string, then we return a @{text Left}-value. The @{text
Right}-value will only be returned if @{term "r\<^sub>1"} cannot match the empty
string.
The most interesting idea from Sulzmann and Lu \cite{Sulzmann2014} is
the construction of a value for how @{term "r\<^sub>1"} can match the
string @{term "[a,b,c]"} from the value how the last derivative, @{term
"r\<^sub>4"} in Fig.~\ref{Sulz}, can match the empty string. Sulzmann and
Lu achieve this by stepwise ``injecting back'' the characters into the
values thus inverting the operation of building derivatives, but on the level
of values. The corresponding function, called @{term inj}, takes three
arguments, a regular expression, a character and a value. For example in
the first (or right-most) @{term inj}-step in Fig.~\ref{Sulz} the regular
expression @{term "r\<^sub>3"}, the character @{term c} from the last
derivative step and @{term "v\<^sub>4"}, which is the value corresponding
to the derivative regular expression @{term "r\<^sub>4"}. The result is
the new value @{term "v\<^sub>3"}. The final result of the algorithm is
the value @{term "v\<^sub>1"}. The @{term inj} function is defined by recursion on regular
expressions and by analysing the shape of values (corresponding to
the derivative regular expressions).
%
\begin{center}
\begin{tabular}{l@ {\hspace{5mm}}lcl}
\textit{(1)} & @{thm (lhs) injval.simps(1)} & $\dn$ & @{thm (rhs) injval.simps(1)}\\
\textit{(2)} & @{thm (lhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]} & $\dn$ &
@{thm (rhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]}\\
\textit{(3)} & @{thm (lhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$ &
@{thm (rhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\
\textit{(4)} & @{thm (lhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$
& @{thm (rhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\
\textit{(5)} & @{thm (lhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$
& @{thm (rhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\
\textit{(6)} & @{thm (lhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$
& @{thm (rhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\
\textit{(7)} & @{thm (lhs) injval.simps(7)[of "r" "c" "v" "vs"]} & $\dn$
& @{thm (rhs) injval.simps(7)[of "r" "c" "v" "vs"]}\\
\end{tabular}
\end{center}
\noindent To better understand what is going on in this definition it
might be instructive to look first at the three sequence cases (clauses
\textit{(4)} -- \textit{(6)}). In each case we need to construct an ``injected value'' for
@{term "SEQ r\<^sub>1 r\<^sub>2"}. This must be a value of the form @{term
"Seq DUMMY DUMMY"}\,. Recall the clause of the @{text derivative}-function
for sequence regular expressions:
\begin{center}
@{thm (lhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]} $\dn$ @{thm (rhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]}
\end{center}
\noindent Consider first the @{text "else"}-branch where the derivative is @{term
"SEQ (der c r\<^sub>1) r\<^sub>2"}. The corresponding value must therefore
be of the form @{term "Seq v\<^sub>1 v\<^sub>2"}, which matches the left-hand
side in clause~\textit{(4)} of @{term inj}. In the @{text "if"}-branch the derivative is an
alternative, namely @{term "ALT (SEQ (der c r\<^sub>1) r\<^sub>2) (der c
r\<^sub>2)"}. This means we either have to consider a @{text Left}- or
@{text Right}-value. In case of the @{text Left}-value we know further it
must be a value for a sequence regular expression. Therefore the pattern
we match in the clause \textit{(5)} is @{term "Left (Seq v\<^sub>1 v\<^sub>2)"},
while in \textit{(6)} it is just @{term "Right v\<^sub>2"}. One more interesting
point is in the right-hand side of clause \textit{(6)}: since in this case the
regular expression @{text "r\<^sub>1"} does not ``contribute'' to
matching the string, that means it only matches the empty string, we need to
call @{const mkeps} in order to construct a value for how @{term "r\<^sub>1"}
can match this empty string. A similar argument applies for why we can
expect in the left-hand side of clause \textit{(7)} that the value is of the form
@{term "Seq v (Stars vs)"}---the derivative of a star is @{term "SEQ (der c r)
(STAR r)"}. Finally, the reason for why we can ignore the second argument
in clause \textit{(1)} of @{term inj} is that it will only ever be called in cases
where @{term "c=d"}, but the usual linearity restrictions in patterns do
not allow us to build this constraint explicitly into our function
definition.\footnote{Sulzmann and Lu state this clause as @{thm (lhs)
injval.simps(1)[of "c" "c"]} $\dn$ @{thm (rhs) injval.simps(1)[of "c"]},
but our deviation is harmless.}
The idea of the @{term inj}-function to ``inject'' a character, say
@{term c}, into a value can be made precise by the first part of the
following lemma, which shows that the underlying string of an injected
value has a prepended character @{term c}; the second part shows that
the underlying string of an @{const mkeps}-value is always the empty
string (given the regular expression is nullable since otherwise
@{text mkeps} might not be defined).
\begin{lemma}\mbox{}\smallskip\\\label{Prf_injval_flat}
\begin{tabular}{ll}
(1) & @{thm[mode=IfThen] Prf_injval_flat}\\
(2) & @{thm[mode=IfThen] mkeps_flat}
\end{tabular}
\end{lemma}
\begin{proof}
Both properties are by routine inductions: the first one can, for example,
be proved by induction over the definition of @{term derivatives}; the second by
an induction on @{term r}. There are no interesting cases.\qed
\end{proof}
Having defined the @{const mkeps} and @{text inj} function we can extend
\Brz's matcher so that a value is constructed (assuming the
regular expression matches the string). The clauses of the Sulzmann and Lu lexer are
\begin{center}
\begin{tabular}{lcl}
@{thm (lhs) lexer.simps(1)} & $\dn$ & @{thm (rhs) lexer.simps(1)}\\
@{thm (lhs) lexer.simps(2)} & $\dn$ & @{text "case"} @{term "lexer (der c r) s"} @{text of}\\
& & \phantom{$|$} @{term "None"} @{text "\<Rightarrow>"} @{term None}\\
& & $|$ @{term "Some v"} @{text "\<Rightarrow>"} @{term "Some (injval r c v)"}
\end{tabular}
\end{center}
\noindent If the regular expression does not match the string, @{const None} is
returned. If the regular expression \emph{does}
match the string, then @{const Some} value is returned. One important
virtue of this algorithm is that it can be implemented with ease in any
functional programming language and also in Isabelle/HOL. In the remaining
part of this section we prove that this algorithm is correct.
The well-known idea of POSIX matching is informally defined by some
rules such as the Longest Match and Priority Rules (see
Introduction); as correctly argued in \cite{Sulzmann2014}, this
needs formal specification. Sulzmann and Lu define an ``ordering
relation'' between values and argue that there is a maximum value,
as given by the derivative-based algorithm. In contrast, we shall
introduce a simple inductive definition that specifies directly what
a \emph{POSIX value} is, incorporating the POSIX-specific choices
into the side-conditions of our rules. Our definition is inspired by
the matching relation given by Vansummeren~\cite{Vansummeren2006}.
The relation we define is ternary and
written as \mbox{@{term "s \<in> r \<rightarrow> v"}}, relating
strings, regular expressions and values; the inductive rules are given in
Figure~\ref{POSIXrules}.
We can prove that given a string @{term s} and regular expression @{term
r}, the POSIX value @{term v} is uniquely determined by @{term "s \<in> r \<rightarrow> v"}.
%
\begin{figure}[t]
\begin{center}
\begin{tabular}{c}
@{thm[mode=Axiom] Posix.intros(1)}@{text "P"}@{term "ONE"} \qquad
@{thm[mode=Axiom] Posix.intros(2)}@{text "P"}@{term "c"}\medskip\\
@{thm[mode=Rule] Posix.intros(3)[of "s" "r\<^sub>1" "v" "r\<^sub>2"]}@{text "P+L"}\qquad
@{thm[mode=Rule] Posix.intros(4)[of "s" "r\<^sub>2" "v" "r\<^sub>1"]}@{text "P+R"}\medskip\\
$\mprset{flushleft}
\inferrule
{@{thm (prem 1) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \qquad
@{thm (prem 2) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]} \\\\
@{thm (prem 3) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}
{@{thm (concl) Posix.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}$@{text "PS"}\\
@{thm[mode=Axiom] Posix.intros(7)}@{text "P[]"}\medskip\\
$\mprset{flushleft}
\inferrule
{@{thm (prem 1) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
@{thm (prem 2) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \qquad
@{thm (prem 3) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]} \\\\
@{thm (prem 4) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}
{@{thm (concl) Posix.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}}$@{text "P\<star>"}
\end{tabular}
\end{center}
\caption{Our inductive definition of POSIX values.}\label{POSIXrules}
\end{figure}
\begin{theorem}\mbox{}\smallskip\\\label{posixdeterm}
\begin{tabular}{ll}
(1) & If @{thm (prem 1) Posix1(1)} then @{thm (concl)
Posix1(1)} and @{thm (concl) Posix1(2)}.\\
(2) & @{thm[mode=IfThen] Posix_determ(1)[of _ _ "v" "v'"]}
\end{tabular}
\end{theorem}
\begin{proof} Both by induction on the definition of @{term "s \<in> r \<rightarrow> v"}.
The second parts follows by a case analysis of @{term "s \<in> r \<rightarrow> v'"} and
the first part.\qed
\end{proof}
\noindent
We claim that our @{term "s \<in> r \<rightarrow> v"} relation captures the idea behind the four
informal POSIX rules shown in the Introduction: Consider for example the
rules @{text "P+L"} and @{text "P+R"} where the POSIX value for a string
and an alternative regular expression, that is @{term "(s, ALT r\<^sub>1 r\<^sub>2)"},
is specified---it is always a @{text "Left"}-value, \emph{except} when the
string to be matched is not in the language of @{term "r\<^sub>1"}; only then it
is a @{text Right}-value (see the side-condition in @{text "P+R"}).
Interesting is also the rule for sequence regular expressions (@{text
"PS"}). The first two premises state that @{term "v\<^sub>1"} and @{term "v\<^sub>2"}
are the POSIX values for @{term "(s\<^sub>1, r\<^sub>1)"} and @{term "(s\<^sub>2, r\<^sub>2)"}
respectively. Consider now the third premise and note that the POSIX value
of this rule should match the string \mbox{@{term "s\<^sub>1 @ s\<^sub>2"}}. According to the
Longest Match Rule, we want that the @{term "s\<^sub>1"} is the longest initial
split of \mbox{@{term "s\<^sub>1 @ s\<^sub>2"}} such that @{term "s\<^sub>2"} is still recognised
by @{term "r\<^sub>2"}. Let us assume, contrary to the third premise, that there
\emph{exist} an @{term "s\<^sub>3"} and @{term "s\<^sub>4"} such that @{term "s\<^sub>2"}
can be split up into a non-empty string @{term "s\<^sub>3"} and a possibly empty
string @{term "s\<^sub>4"}. Moreover the longer string @{term "s\<^sub>1 @ s\<^sub>3"} can be
matched by @{text "r\<^sub>1"} and the shorter @{term "s\<^sub>4"} can still be
matched by @{term "r\<^sub>2"}. In this case @{term "s\<^sub>1"} would \emph{not} be the
longest initial split of \mbox{@{term "s\<^sub>1 @ s\<^sub>2"}} and therefore @{term "Seq v\<^sub>1
v\<^sub>2"} cannot be a POSIX value for @{term "(s\<^sub>1 @ s\<^sub>2, SEQ r\<^sub>1 r\<^sub>2)"}.
The main point is that our side-condition ensures the Longest
Match Rule is satisfied.
A similar condition is imposed on the POSIX value in the @{text
"P\<star>"}-rule. Also there we want that @{term "s\<^sub>1"} is the longest initial
split of @{term "s\<^sub>1 @ s\<^sub>2"} and furthermore the corresponding value
@{term v} cannot be flattened to the empty string. In effect, we require
that in each ``iteration'' of the star, some non-empty substring needs to
be ``chipped'' away; only in case of the empty string we accept @{term
"Stars []"} as the POSIX value. Indeed we can show that our POSIX values
are lexical values which exclude those @{text Stars} that contain subvalues
that flatten to the empty string.
\begin{lemma}\label{LVposix}
@{thm [mode=IfThen] Posix_LV}
\end{lemma}
\begin{proof}
By routine induction on @{thm (prem 1) Posix_LV}.\qed
\end{proof}
\noindent
Next is the lemma that shows the function @{term "mkeps"} calculates
the POSIX value for the empty string and a nullable regular expression.
\begin{lemma}\label{lemmkeps}
@{thm[mode=IfThen] Posix_mkeps}
\end{lemma}
\begin{proof}
By routine induction on @{term r}.\qed
\end{proof}
\noindent
The central lemma for our POSIX relation is that the @{text inj}-function
preserves POSIX values.
\begin{lemma}\label{Posix2}
@{thm[mode=IfThen] Posix_injval}
\end{lemma}
\begin{proof}
By induction on @{text r}. We explain two cases.
\begin{itemize}
\item[$\bullet$] Case @{term "r = ALT r\<^sub>1 r\<^sub>2"}. There are
two subcases, namely @{text "(a)"} \mbox{@{term "v = Left v'"}} and @{term
"s \<in> der c r\<^sub>1 \<rightarrow> v'"}; and @{text "(b)"} @{term "v = Right v'"}, @{term
"s \<notin> L (der c r\<^sub>1)"} and @{term "s \<in> der c r\<^sub>2 \<rightarrow> v'"}. In @{text "(a)"} we
know @{term "s \<in> der c r\<^sub>1 \<rightarrow> v'"}, from which we can infer @{term "(c # s)
\<in> r\<^sub>1 \<rightarrow> injval r\<^sub>1 c v'"} by induction hypothesis and hence @{term "(c #
s) \<in> ALT r\<^sub>1 r\<^sub>2 \<rightarrow> injval (ALT r\<^sub>1 r\<^sub>2) c (Left v')"} as needed. Similarly
in subcase @{text "(b)"} where, however, in addition we have to use
Proposition~\ref{derprop}(2) in order to infer @{term "c # s \<notin> L r\<^sub>1"} from @{term
"s \<notin> L (der c r\<^sub>1)"}.\smallskip
\item[$\bullet$] Case @{term "r = SEQ r\<^sub>1 r\<^sub>2"}. There are three subcases:
\begin{quote}
\begin{description}
\item[@{text "(a)"}] @{term "v = Left (Seq v\<^sub>1 v\<^sub>2)"} and @{term "nullable r\<^sub>1"}
\item[@{text "(b)"}] @{term "v = Right v\<^sub>1"} and @{term "nullable r\<^sub>1"}
\item[@{text "(c)"}] @{term "v = Seq v\<^sub>1 v\<^sub>2"} and @{term "\<not> nullable r\<^sub>1"}
\end{description}
\end{quote}
\noindent For @{text "(a)"} we know @{term "s\<^sub>1 \<in> der c r\<^sub>1 \<rightarrow> v\<^sub>1"} and
@{term "s\<^sub>2 \<in> r\<^sub>2 \<rightarrow> v\<^sub>2"} as well as
%
\[@{term "\<not> (\<exists>s\<^sub>3 s\<^sub>4. s\<^sub>3 \<noteq> [] \<and> s\<^sub>3 @ s\<^sub>4 = s\<^sub>2 \<and> s\<^sub>1 @ s\<^sub>3 \<in> L (der c r\<^sub>1) \<and> s\<^sub>4 \<in> L r\<^sub>2)"}\]
\noindent From the latter we can infer by Proposition~\ref{derprop}(2):
%
\[@{term "\<not> (\<exists>s\<^sub>3 s\<^sub>4. s\<^sub>3 \<noteq> [] \<and> s\<^sub>3 @ s\<^sub>4 = s\<^sub>2 \<and> (c # s\<^sub>1) @ s\<^sub>3 \<in> L r\<^sub>1 \<and> s\<^sub>4 \<in> L r\<^sub>2)"}\]
\noindent We can use the induction hypothesis for @{text "r\<^sub>1"} to obtain
@{term "(c # s\<^sub>1) \<in> r\<^sub>1 \<rightarrow> injval r\<^sub>1 c v\<^sub>1"}. Putting this all together allows us to infer
@{term "((c # s\<^sub>1) @ s\<^sub>2) \<in> SEQ r\<^sub>1 r\<^sub>2 \<rightarrow> Seq (injval r\<^sub>1 c v\<^sub>1) v\<^sub>2"}. The case @{text "(c)"}
is similar.
For @{text "(b)"} we know @{term "s \<in> der c r\<^sub>2 \<rightarrow> v\<^sub>1"} and
@{term "s\<^sub>1 @ s\<^sub>2 \<notin> L (SEQ (der c r\<^sub>1) r\<^sub>2)"}. From the former
we have @{term "(c # s) \<in> r\<^sub>2 \<rightarrow> (injval r\<^sub>2 c v\<^sub>1)"} by induction hypothesis
for @{term "r\<^sub>2"}. From the latter we can infer
%
\[@{term "\<not> (\<exists>s\<^sub>3 s\<^sub>4. s\<^sub>3 \<noteq> [] \<and> s\<^sub>3 @ s\<^sub>4 = c # s \<and> s\<^sub>3 \<in> L r\<^sub>1 \<and> s\<^sub>4 \<in> L r\<^sub>2)"}\]
\noindent By Lemma~\ref{lemmkeps} we know @{term "[] \<in> r\<^sub>1 \<rightarrow> (mkeps r\<^sub>1)"}
holds. Putting this all together, we can conclude with @{term "(c #
s) \<in> SEQ r\<^sub>1 r\<^sub>2 \<rightarrow> Seq (mkeps r\<^sub>1) (injval r\<^sub>2 c v\<^sub>1)"}, as required.
Finally suppose @{term "r = STAR r\<^sub>1"}. This case is very similar to the
sequence case, except that we need to also ensure that @{term "flat (injval r\<^sub>1
c v\<^sub>1) \<noteq> []"}. This follows from @{term "(c # s\<^sub>1)
\<in> r\<^sub>1 \<rightarrow> injval r\<^sub>1 c v\<^sub>1"} (which in turn follows from @{term "s\<^sub>1 \<in> der c
r\<^sub>1 \<rightarrow> v\<^sub>1"} and the induction hypothesis).\qed
\end{itemize}
\end{proof}
\noindent
With Lemma~\ref{Posix2} in place, it is completely routine to establish
that the Sulzmann and Lu lexer satisfies our specification (returning
the null value @{term "None"} iff the string is not in the language of the regular expression,
and returning a unique POSIX value iff the string \emph{is} in the language):
\begin{theorem}\mbox{}\smallskip\\\label{lexercorrect}
\begin{tabular}{ll}
(1) & @{thm (lhs) lexer_correct_None} if and only if @{thm (rhs) lexer_correct_None}\\
(2) & @{thm (lhs) lexer_correct_Some} if and only if @{thm (rhs) lexer_correct_Some}\\
\end{tabular}
\end{theorem}
\begin{proof}
By induction on @{term s} using Lemma~\ref{lemmkeps} and \ref{Posix2}.\qed
\end{proof}
\noindent In \textit{(2)} we further know by Theorem~\ref{posixdeterm} that the
value returned by the lexer must be unique. A simple corollary
of our two theorems is:
\begin{corollary}\mbox{}\smallskip\\\label{lexercorrectcor}
\begin{tabular}{ll}
(1) & @{thm (lhs) lexer_correctness(2)} if and only if @{thm (rhs) lexer_correctness(2)}\\
(2) & @{thm (lhs) lexer_correctness(1)} if and only if @{thm (rhs) lexer_correctness(1)}\\
\end{tabular}
\end{corollary}
\noindent This concludes our correctness proof. Note that we have
not changed the algorithm of Sulzmann and Lu,\footnote{All
deviations we introduced are harmless.} but introduced our own
specification for what a correct result---a POSIX value---should be.
In the next section we show that our specification coincides with
another one given by Okui and Suzuki using a different technique.
*}
section {* Ordering of Values according to Okui and Suzuki*}
text {*
While in the previous section we have defined POSIX values directly
in terms of a ternary relation (see inference rules in Figure~\ref{POSIXrules}),
Sulzmann and Lu took a different approach in \cite{Sulzmann2014}:
they introduced an ordering for values and identified POSIX values
as the maximal elements. An extended version of \cite{Sulzmann2014}
is available at the website of its first author; this includes more
details of their proofs, but which are evidently not in final form
yet. Unfortunately, we were not able to verify claims that their
ordering has properties such as being transitive or having maximal
elements.
Okui and Suzuki \cite{OkuiSuzuki2010,OkuiSuzukiTech} described
another ordering of values, which they use to establish the
correctness of their automata-based algorithm for POSIX matching.
Their ordering resembles some aspects of the one given by Sulzmann
and Lu, but overall is quite different. To begin with, Okui and
Suzuki identify POSIX values as minimal, rather than maximal,
elements in their ordering. A more substantial difference is that
the ordering by Okui and Suzuki uses \emph{positions} in order to
identify and compare subvalues. Positions are lists of natural
numbers. This allows them to quite naturally formalise the Longest
Match and Priority rules of the informal POSIX standard. Consider
for example the value @{term v}
\begin{center}
@{term "v == Stars [Seq (Char x) (Char y), Char z]"}
\end{center}
\noindent
At position @{text "[0,1]"} of this value is the
subvalue @{text "Char y"} and at position @{text "[1]"} the
subvalue @{term "Char z"}. At the `root' position, or empty list
@{term "[]"}, is the whole value @{term v}. Positions such as @{text
"[0,1,0]"} or @{text "[2]"} are outside of @{text
v}. If it exists, the subvalue of @{term v} at a position @{text
p}, written @{term "at v p"}, can be recursively defined by
\begin{center}
\begin{tabular}{r@ {\hspace{0mm}}lcl}
@{term v} & @{text "\<downharpoonleft>\<^bsub>[]\<^esub>"} & @{text "\<equiv>"}& @{thm (rhs) at.simps(1)}\\
@{term "Left v"} & @{text "\<downharpoonleft>\<^bsub>0::ps\<^esub>"} & @{text "\<equiv>"}& @{thm (rhs) at.simps(2)}\\
@{term "Right v"} & @{text "\<downharpoonleft>\<^bsub>1::ps\<^esub>"} & @{text "\<equiv>"} &
@{thm (rhs) at.simps(3)[simplified Suc_0_fold]}\\
@{term "Seq v\<^sub>1 v\<^sub>2"} & @{text "\<downharpoonleft>\<^bsub>0::ps\<^esub>"} & @{text "\<equiv>"} &
@{thm (rhs) at.simps(4)[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]} \\
@{term "Seq v\<^sub>1 v\<^sub>2"} & @{text "\<downharpoonleft>\<^bsub>1::ps\<^esub>"}
& @{text "\<equiv>"} &
@{thm (rhs) at.simps(5)[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2", simplified Suc_0_fold]} \\
@{term "Stars vs"} & @{text "\<downharpoonleft>\<^bsub>n::ps\<^esub>"} & @{text "\<equiv>"}& @{thm (rhs) at.simps(6)}\\
\end{tabular}
\end{center}
\noindent In the last clause we use Isabelle's notation @{term "vs ! n"} for the
@{text n}th element in a list. The set of positions inside a value @{text v},
written @{term "Pos v"}, is given by
\begin{center}
\begin{tabular}{lcl}
@{thm (lhs) Pos.simps(1)} & @{text "\<equiv>"} & @{thm (rhs) Pos.simps(1)}\\
@{thm (lhs) Pos.simps(2)} & @{text "\<equiv>"} & @{thm (rhs) Pos.simps(2)}\\
@{thm (lhs) Pos.simps(3)} & @{text "\<equiv>"} & @{thm (rhs) Pos.simps(3)}\\
@{thm (lhs) Pos.simps(4)} & @{text "\<equiv>"} & @{thm (rhs) Pos.simps(4)}\\
@{thm (lhs) Pos.simps(5)[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]}
& @{text "\<equiv>"}
& @{thm (rhs) Pos.simps(5)[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]}\\
@{thm (lhs) Pos_stars} & @{text "\<equiv>"} & @{thm (rhs) Pos_stars}\\
\end{tabular}
\end{center}
\noindent
whereby @{text len} in the last clause stands for the length of a list. Clearly
for every position inside a value there exists a subvalue at that position.
To help understanding the ordering of Okui and Suzuki, consider again
the earlier value
@{text v} and compare it with the following @{text w}:
\begin{center}
\begin{tabular}{l}
@{term "v == Stars [Seq (Char x) (Char y), Char z]"}\\
@{term "w == Stars [Char x, Char y, Char z]"}
\end{tabular}
\end{center}
\noindent Both values match the string @{text "xyz"}, that means if
we flatten these values at their respective root position, we obtain
@{text "xyz"}. However, at position @{text "[0]"}, @{text v} matches
@{text xy} whereas @{text w} matches only the shorter @{text x}. So
according to the Longest Match Rule, we should prefer @{text v},
rather than @{text w} as POSIX value for string @{text xyz} (and
corresponding regular expression). In order to
formalise this idea, Okui and Suzuki introduce a measure for
subvalues at position @{text p}, called the \emph{norm} of @{text v}
at position @{text p}. We can define this measure in Isabelle as an
integer as follows
\begin{center}
@{thm pflat_len_def}
\end{center}
\noindent where we take the length of the flattened value at
position @{text p}, provided the position is inside @{text v}; if
not, then the norm is @{text "-1"}. The default for outside
positions is crucial for the POSIX requirement of preferring a
@{text Left}-value over a @{text Right}-value (if they can match the
same string---see the Priority Rule from the Introduction). For this
consider
\begin{center}
@{term "v == Left (Char x)"} \qquad and \qquad @{term "w == Right (Char x)"}
\end{center}
\noindent Both values match @{text x}. At position @{text "[0]"}
the norm of @{term v} is @{text 1} (the subvalue matches @{text x}),
but the norm of @{text w} is @{text "-1"} (the position is outside
@{text w} according to how we defined the `inside' positions of
@{text Left}- and @{text Right}-values). Of course at position
@{text "[1]"}, the norms @{term "pflat_len v [1]"} and @{term
"pflat_len w [1]"} are reversed, but the point is that subvalues
will be analysed according to lexicographically ordered
positions. According to this ordering, the position @{text "[0]"}
takes precedence over @{text "[1]"} and thus also @{text v} will be
preferred over @{text w}. The lexicographic ordering of positions, written
@{term "DUMMY \<sqsubset>lex DUMMY"}, can be conveniently formalised
by three inference rules
\begin{center}
\begin{tabular}{ccc}
@{thm [mode=Axiom] lex_list.intros(1)}\hspace{1cm} &
@{thm [mode=Rule] lex_list.intros(3)[where ?p1.0="p\<^sub>1" and ?p2.0="p\<^sub>2" and
?ps1.0="ps\<^sub>1" and ?ps2.0="ps\<^sub>2"]}\hspace{1cm} &
@{thm [mode=Rule] lex_list.intros(2)[where ?ps1.0="ps\<^sub>1" and ?ps2.0="ps\<^sub>2"]}
\end{tabular}
\end{center}
With the norm and lexicographic order in place,
we can state the key definition of Okui and Suzuki
\cite{OkuiSuzuki2010}: a value @{term "v\<^sub>1"} is \emph{smaller at position @{text p}} than
@{term "v\<^sub>2"}, written @{term "v\<^sub>1 \<sqsubset>val p v\<^sub>2"},
if and only if $(i)$ the norm at position @{text p} is
greater in @{term "v\<^sub>1"} (that is the string @{term "flat (at v\<^sub>1 p)"} is longer
than @{term "flat (at v\<^sub>2 p)"}) and $(ii)$ all subvalues at
positions that are inside @{term "v\<^sub>1"} or @{term "v\<^sub>2"} and that are
lexicographically smaller than @{text p}, we have the same norm, namely
\begin{center}
\begin{tabular}{c}
@{thm (lhs) PosOrd_def[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]}
@{text "\<equiv>"}
$\begin{cases}
(i) & @{term "pflat_len v\<^sub>1 p > pflat_len v\<^sub>2 p"} \quad\text{and}\smallskip \\
(ii) & @{term "(\<forall>q \<in> Pos v\<^sub>1 \<union> Pos v\<^sub>2. q \<sqsubset>lex p --> pflat_len v\<^sub>1 q = pflat_len v\<^sub>2 q)"}
\end{cases}$
\end{tabular}
\end{center}
\noindent The position @{text p} in this definition acts as the
\emph{first distinct position} of @{text "v\<^sub>1"} and @{text
"v\<^sub>2"}, where both values match strings of different length
\cite{OkuiSuzuki2010}. Since at @{text p} the values @{text
"v\<^sub>1"} and @{text "v\<^sub>2"} match different strings, the
ordering is irreflexive. Derived from the definition above
are the following two orderings:
\begin{center}
\begin{tabular}{l}
@{thm PosOrd_ex_def[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]}\\
@{thm PosOrd_ex_eq_def[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]}
\end{tabular}
\end{center}
While we encountered a number of obstacles for establishing properties like
transitivity for the ordering of Sulzmann and Lu (and which we failed
to overcome), it is relatively straightforward to establish this
property for the orderings
@{term "DUMMY :\<sqsubset>val DUMMY"} and @{term "DUMMY :\<sqsubseteq>val DUMMY"}
by Okui and Suzuki.
\begin{lemma}[Transitivity]\label{transitivity}
@{thm [mode=IfThen] PosOrd_trans[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2" and ?v3.0="v\<^sub>3"]}
\end{lemma}
\begin{proof} From the assumption we obtain two positions @{text p}
and @{text q}, where the values @{text "v\<^sub>1"} and @{text
"v\<^sub>2"} (respectively @{text "v\<^sub>2"} and @{text
"v\<^sub>3"}) are `distinct'. Since @{text
"\<prec>\<^bsub>lex\<^esub>"} is trichotomous, we need to consider
three cases, namely @{term "p = q"}, @{term "p \<sqsubset>lex q"} and
@{term "q \<sqsubset>lex p"}. Let us look at the first case. Clearly
@{term "pflat_len v\<^sub>2 p < pflat_len v\<^sub>1 p"} and @{term
"pflat_len v\<^sub>3 p < pflat_len v\<^sub>2 p"} imply @{term
"pflat_len v\<^sub>3 p < pflat_len v\<^sub>1 p"}. It remains to show
that for a @{term "p' \<in> Pos v\<^sub>1 \<union> Pos v\<^sub>3"}
with @{term "p' \<sqsubset>lex p"} that @{term "pflat_len v\<^sub>1
p' = pflat_len v\<^sub>3 p'"} holds. Suppose @{term "p' \<in> Pos
v\<^sub>1"}, then we can infer from the first assumption that @{term
"pflat_len v\<^sub>1 p' = pflat_len v\<^sub>2 p'"}. But this means
that @{term "p'"} must be in @{term "Pos v\<^sub>2"} too (the norm
cannot be @{text "-1"} given @{term "p' \<in> Pos v\<^sub>1"}).
Hence we can use the second assumption and
infer @{term "pflat_len v\<^sub>2 p' = pflat_len v\<^sub>3 p'"},
which concludes this case with @{term "v\<^sub>1 :\<sqsubset>val
v\<^sub>3"}. The reasoning in the other cases is similar.\qed
\end{proof}
\noindent
The proof for $\preccurlyeq$ is similar and omitted.
It is also straightforward to show that @{text "\<prec>"} and
$\preccurlyeq$ are partial orders. Okui and Suzuki furthermore show that they
are linear orderings for lexical values \cite{OkuiSuzuki2010} of a given
regular expression and given string, but we have not formalised this in Isabelle. It is
not essential for our results. What we are going to show below is
that for a given @{text r} and @{text s}, the orderings have a unique
minimal element on the set @{term "LV r s"}, which is the POSIX value
we defined in the previous section. We start with two properties that
show how the length of a flattened value relates to the @{text "\<prec>"}-ordering.
\begin{proposition}\mbox{}\smallskip\\\label{ordlen}
\begin{tabular}{@ {}ll}
(1) &
@{thm [mode=IfThen] PosOrd_shorterE[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]}\\
(2) &
@{thm [mode=IfThen] PosOrd_shorterI[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]}
\end{tabular}
\end{proposition}
\noindent Both properties follow from the definition of the ordering. Note that
\textit{(2)} entails that a value, say @{term "v\<^sub>2"}, whose underlying
string is a strict prefix of another flattened value, say @{term "v\<^sub>1"}, then
@{term "v\<^sub>1"} must be smaller than @{term "v\<^sub>2"}. For our proofs it
will be useful to have the following properties---in each case the underlying strings
of the compared values are the same:
\begin{proposition}\mbox{}\smallskip\\\label{ordintros}
\begin{tabular}{ll}
\textit{(1)} &
@{thm [mode=IfThen] PosOrd_Left_Right[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]}\\
\textit{(2)} & If
@{thm (prem 1) PosOrd_Left_eq[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]} \;then\;
@{thm (lhs) PosOrd_Left_eq[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]} \;iff\;
@{thm (rhs) PosOrd_Left_eq[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]}\\
\textit{(3)} & If
@{thm (prem 1) PosOrd_Right_eq[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]} \;then\;
@{thm (lhs) PosOrd_Right_eq[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]} \;iff\;
@{thm (rhs) PosOrd_Right_eq[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]}\\
\textit{(4)} & If
@{thm (prem 1) PosOrd_Seq_eq[where ?v2.0="v\<^sub>2" and ?w2.0="w\<^sub>2"]} \;then\;
@{thm (lhs) PosOrd_Seq_eq[where ?v2.0="v\<^sub>2" and ?w2.0="w\<^sub>2"]} \;iff\;
@{thm (rhs) PosOrd_Seq_eq[where ?v2.0="v\<^sub>2" and ?w2.0="w\<^sub>2"]}\\
\textit{(5)} & If
@{thm (prem 2) PosOrd_SeqI1[simplified, where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2" and
?w1.0="w\<^sub>1" and ?w2.0="w\<^sub>2"]} \;and\;
@{thm (prem 1) PosOrd_SeqI1[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2" and
?w1.0="w\<^sub>1" and ?w2.0="w\<^sub>2"]} \;then\;
@{thm (concl) PosOrd_SeqI1[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2" and
?w1.0="w\<^sub>1" and ?w2.0="w\<^sub>2"]}\\
\textit{(6)} & If
@{thm (prem 1) PosOrd_Stars_append_eq[where ?vs1.0="vs\<^sub>1" and ?vs2.0="vs\<^sub>2"]} \;then\;
@{thm (lhs) PosOrd_Stars_append_eq[where ?vs1.0="vs\<^sub>1" and ?vs2.0="vs\<^sub>2"]} \;iff\;
@{thm (rhs) PosOrd_Stars_append_eq[where ?vs1.0="vs\<^sub>1" and ?vs2.0="vs\<^sub>2"]}\\
\textit{(7)} & If
@{thm (prem 2) PosOrd_StarsI[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2" and
?vs1.0="vs\<^sub>1" and ?vs2.0="vs\<^sub>2"]} \;and\;
@{thm (prem 1) PosOrd_StarsI[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2" and
?vs1.0="vs\<^sub>1" and ?vs2.0="vs\<^sub>2"]} \;then\;
@{thm (concl) PosOrd_StarsI[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2" and
?vs1.0="vs\<^sub>1" and ?vs2.0="vs\<^sub>2"]}\\
\end{tabular}
\end{proposition}
\noindent One might prefer that statements \textit{(4)} and \textit{(5)}
(respectively \textit{(6)} and \textit{(7)})
are combined into a single \textit{iff}-statement (like the ones for @{text
Left} and @{text Right}). Unfortunately this cannot be done easily: such
a single statement would require an additional assumption about the
two values @{term "Seq v\<^sub>1 v\<^sub>2"} and @{term "Seq w\<^sub>1 w\<^sub>2"}
being inhabited by the same regular expression. The
complexity of the proofs involved seems to not justify such a
`cleaner' single statement. The statements given are just the properties that
allow us to establish our theorems without any difficulty. The proofs
for Proposition~\ref{ordintros} are routine.
Next we establish how Okui and Suzuki's orderings relate to our
definition of POSIX values. Given a @{text POSIX} value @{text "v\<^sub>1"}
for @{text r} and @{text s}, then any other lexical value @{text
"v\<^sub>2"} in @{term "LV r s"} is greater or equal than @{text
"v\<^sub>1"}, namely:
\begin{theorem}\label{orderone}
@{thm [mode=IfThen] Posix_PosOrd[where ?v1.0="v\<^sub>1" and ?v2.0="v\<^sub>2"]}
\end{theorem}
\begin{proof} By induction on our POSIX rules. By
Theorem~\ref{posixdeterm} and the definition of @{const LV}, it is clear
that @{text "v\<^sub>1"} and @{text "v\<^sub>2"} have the same
underlying string @{term s}. The three base cases are
straightforward: for example for @{term "v\<^sub>1 = Void"}, we have
that @{term "v\<^sub>2 \<in> LV ONE []"} must also be of the form
\mbox{@{term "v\<^sub>2 = Void"}}. Therefore we have @{term
"v\<^sub>1 :\<sqsubseteq>val v\<^sub>2"}. The inductive cases for
@{text r} being of the form @{term "ALT r\<^sub>1 r\<^sub>2"} and
@{term "SEQ r\<^sub>1 r\<^sub>2"} are as follows:
\begin{itemize}
\item[$\bullet$] Case @{text "P+L"} with @{term "s \<in> (ALT r\<^sub>1 r\<^sub>2)
\<rightarrow> (Left w\<^sub>1)"}: In this case the value
@{term "v\<^sub>2"} is either of the
form @{term "Left w\<^sub>2"} or @{term "Right w\<^sub>2"}. In the
latter case we can immediately conclude with \mbox{@{term "v\<^sub>1
:\<sqsubseteq>val v\<^sub>2"}} since a @{text Left}-value with the
same underlying string @{text s} is always smaller than a
@{text Right}-value by Proposition~\ref{ordintros}\textit{(1)}.
In the former case we have @{term "w\<^sub>2
\<in> LV r\<^sub>1 s"} and can use the induction hypothesis to infer
@{term "w\<^sub>1 :\<sqsubseteq>val w\<^sub>2"}. Because @{term
"w\<^sub>1"} and @{term "w\<^sub>2"} have the same underlying string
@{text s}, we can conclude with @{term "Left w\<^sub>1
:\<sqsubseteq>val Left w\<^sub>2"} using
Proposition~\ref{ordintros}\textit{(2)}.\smallskip
\item[$\bullet$] Case @{text "P+R"} with @{term "s \<in> (ALT r\<^sub>1 r\<^sub>2)
\<rightarrow> (Right w\<^sub>1)"}: This case similar to the previous
case, except that we additionally know @{term "s \<notin> L
r\<^sub>1"}. This is needed when @{term "v\<^sub>2"} is of the form
\mbox{@{term "Left w\<^sub>2"}}. Since \mbox{@{term "flat v\<^sub>2 = flat
w\<^sub>2"} @{text "= s"}} and @{term "\<Turnstile> w\<^sub>2 :
r\<^sub>1"}, we can derive a contradiction for \mbox{@{term "s \<notin> L
r\<^sub>1"}} using
Proposition~\ref{inhabs}. So also in this case \mbox{@{term "v\<^sub>1
:\<sqsubseteq>val v\<^sub>2"}}.\smallskip
\item[$\bullet$] Case @{text "PS"} with @{term "(s\<^sub>1 @
s\<^sub>2) \<in> (SEQ r\<^sub>1 r\<^sub>2) \<rightarrow> (Seq
w\<^sub>1 w\<^sub>2)"}: We can assume @{term "v\<^sub>2 = Seq
(u\<^sub>1) (u\<^sub>2)"} with @{term "\<Turnstile> u\<^sub>1 :
r\<^sub>1"} and \mbox{@{term "\<Turnstile> u\<^sub>2 :
r\<^sub>2"}}. We have @{term "s\<^sub>1 @ s\<^sub>2 = (flat
u\<^sub>1) @ (flat u\<^sub>2)"}. By the side-condition of the
@{text PS}-rule we know that either @{term "s\<^sub>1 = flat
u\<^sub>1"} or that @{term "flat u\<^sub>1"} is a strict prefix of
@{term "s\<^sub>1"}. In the latter case we can infer @{term
"w\<^sub>1 :\<sqsubset>val u\<^sub>1"} by
Proposition~\ref{ordlen}\textit{(2)} and from this @{term "v\<^sub>1
:\<sqsubseteq>val v\<^sub>2"} by Proposition~\ref{ordintros}\textit{(5)}
(as noted above @{term "v\<^sub>1"} and @{term "v\<^sub>2"} must have the
same underlying string).
In the former case we know
@{term "u\<^sub>1 \<in> LV r\<^sub>1 s\<^sub>1"} and @{term
"u\<^sub>2 \<in> LV r\<^sub>2 s\<^sub>2"}. With this we can use the
induction hypotheses to infer @{term "w\<^sub>1 :\<sqsubseteq>val
u\<^sub>1"} and @{term "w\<^sub>2 :\<sqsubseteq>val u\<^sub>2"}. By
Proposition~\ref{ordintros}\textit{(4,5)} we can again infer
@{term "v\<^sub>1 :\<sqsubseteq>val
v\<^sub>2"}.
\end{itemize}
\noindent The case for @{text "P\<star>"} is similar to the @{text PS}-case and omitted.\qed
\end{proof}
\noindent This theorem shows that our @{text POSIX} value for a
regular expression @{text r} and string @{term s} is in fact a
minimal element of the values in @{text "LV r s"}. By
Proposition~\ref{ordlen}\textit{(2)} we also know that any value in
@{text "LV r s'"}, with @{term "s'"} being a strict prefix, cannot be
smaller than @{text "v\<^sub>1"}. The next theorem shows the
opposite---namely any minimal element in @{term "LV r s"} must be a
@{text POSIX} value. This can be established by induction on @{text
r}, but the proof can be drastically simplified by using the fact
from the previous section about the existence of a @{text POSIX} value
whenever a string @{term "s \<in> L r"}.
\begin{theorem}
@{thm [mode=IfThen] PosOrd_Posix[where ?v1.0="v\<^sub>1"]}
\end{theorem}
\begin{proof}
If @{thm (prem 1) PosOrd_Posix[where ?v1.0="v\<^sub>1"]} then
@{term "s \<in> L r"} by Proposition~\ref{inhabs}. Hence by Theorem~\ref{lexercorrect}(2)
there exists a
@{text POSIX} value @{term "v\<^sub>P"} with @{term "s \<in> r \<rightarrow> v\<^sub>P"}
and by Lemma~\ref{LVposix} we also have \mbox{@{term "v\<^sub>P \<in> LV r s"}}.
By Theorem~\ref{orderone} we therefore have
@{term "v\<^sub>P :\<sqsubseteq>val v\<^sub>1"}. If @{term "v\<^sub>P = v\<^sub>1"} then
we are done. Otherwise we have @{term "v\<^sub>P :\<sqsubset>val v\<^sub>1"}, which
however contradicts the second assumption about @{term "v\<^sub>1"} being the smallest
element in @{term "LV r s"}. So we are done in this case too.\qed
\end{proof}
\noindent
From this we can also show
that if @{term "LV r s"} is non-empty (or equivalently @{term "s \<in> L r"}) then
it has a unique minimal element:
\begin{corollary}
@{thm [mode=IfThen] Least_existence1}
\end{corollary}
\noindent To sum up, we have shown that the (unique) minimal elements
of the ordering by Okui and Suzuki are exactly the @{text POSIX}
values we defined inductively in Section~\ref{posixsec}. This provides
an independent confirmation that our ternary relation formalise the
informal POSIX rules.
*}
section {* Bitcoded Lexing *}
text {*
Incremental calculation of the value. To simplify the proof we first define the function
@{const flex} which calculates the ``iterated'' injection function. With this we can
rewrite the lexer as
\begin{center}
@{thm lexer_flex}
\end{center}
\begin{center}
\begin{tabular}{lcl}
@{thm (lhs) code.simps(1)} & $\dn$ & @{thm (rhs) code.simps(1)}\\
@{thm (lhs) code.simps(2)} & $\dn$ & @{thm (rhs) code.simps(2)}\\
@{thm (lhs) code.simps(3)} & $\dn$ & @{thm (rhs) code.simps(3)}\\
@{thm (lhs) code.simps(4)} & $\dn$ & @{thm (rhs) code.simps(4)}\\
@{thm (lhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]} & $\dn$ & @{thm (rhs) code.simps(5)[of "v\<^sub>1" "v\<^sub>2"]}\\
@{thm (lhs) code.simps(6)} & $\dn$ & @{thm (rhs) code.simps(6)}\\
@{thm (lhs) code.simps(7)} & $\dn$ & @{thm (rhs) code.simps(7)}
\end{tabular}
\end{center}
\begin{center}
\begin{tabular}{lcl}
@{term areg} & $::=$ & @{term "AZERO"}\\
& $\mid$ & @{term "AONE bs"}\\
& $\mid$ & @{term "ACHAR bs c"}\\
& $\mid$ & @{term "AALT bs r\<^sub>1 r\<^sub>2"}\\
& $\mid$ & @{term "ASEQ bs r\<^sub>1 r\<^sub>2"}\\
& $\mid$ & @{term "ASTAR bs r"}
\end{tabular}
\end{center}
\begin{center}
\begin{tabular}{lcl}
@{thm (lhs) intern.simps(1)} & $\dn$ & @{thm (rhs) intern.simps(1)}\\
@{thm (lhs) intern.simps(2)} & $\dn$ & @{thm (rhs) intern.simps(2)}\\
@{thm (lhs) intern.simps(3)} & $\dn$ & @{thm (rhs) intern.simps(3)}\\
@{thm (lhs) intern.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) intern.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
@{thm (lhs) intern.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) intern.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
@{thm (lhs) intern.simps(6)} & $\dn$ & @{thm (rhs) intern.simps(6)}\\
\end{tabular}
\end{center}
\begin{center}
\begin{tabular}{lcl}
@{thm (lhs) erase.simps(1)} & $\dn$ & @{thm (rhs) erase.simps(1)}\\
@{thm (lhs) erase.simps(2)[of bs]} & $\dn$ & @{thm (rhs) erase.simps(2)[of bs]}\\
@{thm (lhs) erase.simps(3)[of bs]} & $\dn$ & @{thm (rhs) erase.simps(3)[of bs]}\\
@{thm (lhs) erase.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) erase.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
@{thm (lhs) erase.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) erase.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
@{thm (lhs) erase.simps(6)[of bs]} & $\dn$ & @{thm (rhs) erase.simps(6)[of bs]}\\
\end{tabular}
\end{center}
Some simple facts about erase
\begin{lemma}\mbox{}\\
@{thm erase_bder}\\
@{thm erase_intern}
\end{lemma}
\begin{center}
\begin{tabular}{lcl}
@{thm (lhs) bnullable.simps(1)} & $\dn$ & @{thm (rhs) bnullable.simps(1)}\\
@{thm (lhs) bnullable.simps(2)} & $\dn$ & @{thm (rhs) bnullable.simps(2)}\\
@{thm (lhs) bnullable.simps(3)} & $\dn$ & @{thm (rhs) bnullable.simps(3)}\\
@{thm (lhs) bnullable.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bnullable.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
@{thm (lhs) bnullable.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bnullable.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
@{thm (lhs) bnullable.simps(6)} & $\dn$ & @{thm (rhs) bnullable.simps(6)}\medskip\\
% \end{tabular}
% \end{center}
% \begin{center}
% \begin{tabular}{lcl}
@{thm (lhs) bder.simps(1)} & $\dn$ & @{thm (rhs) bder.simps(1)}\\
@{thm (lhs) bder.simps(2)} & $\dn$ & @{thm (rhs) bder.simps(2)}\\
@{thm (lhs) bder.simps(3)} & $\dn$ & @{thm (rhs) bder.simps(3)}\\
@{thm (lhs) bder.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bder.simps(4)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
@{thm (lhs) bder.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bder.simps(5)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
@{thm (lhs) bder.simps(6)} & $\dn$ & @{thm (rhs) bder.simps(6)}
\end{tabular}
\end{center}
\begin{center}
\begin{tabular}{lcl}
@{thm (lhs) bmkeps.simps(1)} & $\dn$ & @{thm (rhs) bmkeps.simps(1)}\\
@{thm (lhs) bmkeps.simps(2)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bmkeps.simps(2)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
@{thm (lhs) bmkeps.simps(3)[of bs "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) bmkeps.simps(3)[of bs "r\<^sub>1" "r\<^sub>2"]}\\
@{thm (lhs) bmkeps.simps(4)} & $\dn$ & @{thm (rhs) bmkeps.simps(4)}\medskip\\
\end{tabular}
\end{center}
@{thm [mode=IfThen] bder_retrieve}
By induction on @{text r}
\begin{theorem}[Main Lemma]\mbox{}\\
@{thm [mode=IfThen] MAIN_decode}
\end{theorem}
\noindent
Definition of the bitcoded lexer
@{thm blexer_def}
\begin{theorem}
@{thm blexer_correctness}
\end{theorem}
*}
section {* Optimisations *}
text {*
Derivatives as calculated by \Brz's method are usually more complex
regular expressions than the initial one; the result is that the
derivative-based matching and lexing algorithms are often abysmally slow.
However, various optimisations are possible, such as the simplifications
of @{term "ALT ZERO r"}, @{term "ALT r ZERO"}, @{term "SEQ ONE r"} and
@{term "SEQ r ONE"} to @{term r}. These simplifications can speed up the
algorithms considerably, as noted in \cite{Sulzmann2014}. One of the
advantages of having a simple specification and correctness proof is that
the latter can be refined to prove the correctness of such simplification
steps. While the simplification of regular expressions according to
rules like
\begin{equation}\label{Simpl}
\begin{array}{lcllcllcllcl}
@{term "ALT ZERO r"} & @{text "\<Rightarrow>"} & @{term r} \hspace{8mm}%\\
@{term "ALT r ZERO"} & @{text "\<Rightarrow>"} & @{term r} \hspace{8mm}%\\
@{term "SEQ ONE r"} & @{text "\<Rightarrow>"} & @{term r} \hspace{8mm}%\\
@{term "SEQ r ONE"} & @{text "\<Rightarrow>"} & @{term r}
\end{array}
\end{equation}
\noindent is well understood, there is an obstacle with the POSIX value
calculation algorithm by Sulzmann and Lu: if we build a derivative regular
expression and then simplify it, we will calculate a POSIX value for this
simplified derivative regular expression, \emph{not} for the original (unsimplified)
derivative regular expression. Sulzmann and Lu \cite{Sulzmann2014} overcome this obstacle by
not just calculating a simplified regular expression, but also calculating
a \emph{rectification function} that ``repairs'' the incorrect value.
The rectification functions can be (slightly clumsily) implemented in
Isabelle/HOL as follows using some auxiliary functions:
\begin{center}
\begin{tabular}{lcl}
@{thm (lhs) F_RIGHT.simps(1)} & $\dn$ & @{text "Right (f v)"}\\
@{thm (lhs) F_LEFT.simps(1)} & $\dn$ & @{text "Left (f v)"}\\
@{thm (lhs) F_ALT.simps(1)} & $\dn$ & @{text "Right (f\<^sub>2 v)"}\\
@{thm (lhs) F_ALT.simps(2)} & $\dn$ & @{text "Left (f\<^sub>1 v)"}\\
@{thm (lhs) F_SEQ1.simps(1)} & $\dn$ & @{text "Seq (f\<^sub>1 ()) (f\<^sub>2 v)"}\\
@{thm (lhs) F_SEQ2.simps(1)} & $\dn$ & @{text "Seq (f\<^sub>1 v) (f\<^sub>2 ())"}\\
@{thm (lhs) F_SEQ.simps(1)} & $\dn$ & @{text "Seq (f\<^sub>1 v\<^sub>1) (f\<^sub>2 v\<^sub>2)"}\medskip\\
%\end{tabular}
%
%\begin{tabular}{lcl}
@{term "simp_ALT (ZERO, DUMMY) (r\<^sub>2, f\<^sub>2)"} & $\dn$ & @{term "(r\<^sub>2, F_RIGHT f\<^sub>2)"}\\
@{term "simp_ALT (r\<^sub>1, f\<^sub>1) (ZERO, DUMMY)"} & $\dn$ & @{term "(r\<^sub>1, F_LEFT f\<^sub>1)"}\\
@{term "simp_ALT (r\<^sub>1, f\<^sub>1) (r\<^sub>2, f\<^sub>2)"} & $\dn$ & @{term "(ALT r\<^sub>1 r\<^sub>2, F_ALT f\<^sub>1 f\<^sub>2)"}\\
@{term "simp_SEQ (ONE, f\<^sub>1) (r\<^sub>2, f\<^sub>2)"} & $\dn$ & @{term "(r\<^sub>2, F_SEQ1 f\<^sub>1 f\<^sub>2)"}\\
@{term "simp_SEQ (r\<^sub>1, f\<^sub>1) (ONE, f\<^sub>2)"} & $\dn$ & @{term "(r\<^sub>1, F_SEQ2 f\<^sub>1 f\<^sub>2)"}\\
@{term "simp_SEQ (r\<^sub>1, f\<^sub>1) (r\<^sub>2, f\<^sub>2)"} & $\dn$ & @{term "(SEQ r\<^sub>1 r\<^sub>2, F_SEQ f\<^sub>1 f\<^sub>2)"}\\
\end{tabular}
\end{center}
\noindent
The functions @{text "simp\<^bsub>Alt\<^esub>"} and @{text "simp\<^bsub>Seq\<^esub>"} encode the simplification rules
in \eqref{Simpl} and compose the rectification functions (simplifications can occur
deep inside the regular expression). The main simplification function is then
\begin{center}
\begin{tabular}{lcl}
@{term "simp (ALT r\<^sub>1 r\<^sub>2)"} & $\dn$ & @{term "simp_ALT (simp r\<^sub>1) (simp r\<^sub>2)"}\\
@{term "simp (SEQ r\<^sub>1 r\<^sub>2)"} & $\dn$ & @{term "simp_SEQ (simp r\<^sub>1) (simp r\<^sub>2)"}\\
@{term "simp r"} & $\dn$ & @{term "(r, id)"}\\
\end{tabular}
\end{center}
\noindent where @{term "id"} stands for the identity function. The
function @{const simp} returns a simplified regular expression and a corresponding
rectification function. Note that we do not simplify under stars: this
seems to slow down the algorithm, rather than speed it up. The optimised
lexer is then given by the clauses:
\begin{center}
\begin{tabular}{lcl}
@{thm (lhs) slexer.simps(1)} & $\dn$ & @{thm (rhs) slexer.simps(1)}\\
@{thm (lhs) slexer.simps(2)} & $\dn$ &
@{text "let (r\<^sub>s, f\<^sub>r) = simp (r "}$\backslash$@{text " c) in"}\\
& & @{text "case"} @{term "slexer r\<^sub>s s"} @{text of}\\
& & \phantom{$|$} @{term "None"} @{text "\<Rightarrow>"} @{term None}\\
& & $|$ @{term "Some v"} @{text "\<Rightarrow>"} @{text "Some (inj r c (f\<^sub>r v))"}
\end{tabular}
\end{center}
\noindent
In the second clause we first calculate the derivative @{term "der c r"}
and then simplify the result. This gives us a simplified derivative
@{text "r\<^sub>s"} and a rectification function @{text "f\<^sub>r"}. The lexer
is then recursively called with the simplified derivative, but before
we inject the character @{term c} into the value @{term v}, we need to rectify
@{term v} (that is construct @{term "f\<^sub>r v"}). Before we can establish the correctness
of @{term "slexer"}, we need to show that simplification preserves the language
and simplification preserves our POSIX relation once the value is rectified
(recall @{const "simp"} generates a (regular expression, rectification function) pair):
\begin{lemma}\mbox{}\smallskip\\\label{slexeraux}
\begin{tabular}{ll}
(1) & @{thm L_fst_simp[symmetric]}\\
(2) & @{thm[mode=IfThen] Posix_simp}
\end{tabular}
\end{lemma}
\begin{proof} Both are by induction on @{text r}. There is no
interesting case for the first statement. For the second statement,
of interest are the @{term "r = ALT r\<^sub>1 r\<^sub>2"} and @{term "r = SEQ r\<^sub>1
r\<^sub>2"} cases. In each case we have to analyse four subcases whether
@{term "fst (simp r\<^sub>1)"} and @{term "fst (simp r\<^sub>2)"} equals @{const
ZERO} (respectively @{const ONE}). For example for @{term "r = ALT
r\<^sub>1 r\<^sub>2"}, consider the subcase @{term "fst (simp r\<^sub>1) = ZERO"} and
@{term "fst (simp r\<^sub>2) \<noteq> ZERO"}. By assumption we know @{term "s \<in>
fst (simp (ALT r\<^sub>1 r\<^sub>2)) \<rightarrow> v"}. From this we can infer @{term "s \<in> fst (simp r\<^sub>2) \<rightarrow> v"}
and by IH also (*) @{term "s \<in> r\<^sub>2 \<rightarrow> (snd (simp r\<^sub>2) v)"}. Given @{term "fst (simp r\<^sub>1) = ZERO"}
we know @{term "L (fst (simp r\<^sub>1)) = {}"}. By the first statement
@{term "L r\<^sub>1"} is the empty set, meaning (**) @{term "s \<notin> L r\<^sub>1"}.
Taking (*) and (**) together gives by the \mbox{@{text "P+R"}}-rule
@{term "s \<in> ALT r\<^sub>1 r\<^sub>2 \<rightarrow> Right (snd (simp r\<^sub>2) v)"}. In turn this
gives @{term "s \<in> ALT r\<^sub>1 r\<^sub>2 \<rightarrow> snd (simp (ALT r\<^sub>1 r\<^sub>2)) v"} as we need to show.
The other cases are similar.\qed
\end{proof}
\noindent We can now prove relatively straightforwardly that the
optimised lexer produces the expected result:
\begin{theorem}
@{thm slexer_correctness}
\end{theorem}
\begin{proof} By induction on @{term s} generalising over @{term
r}. The case @{term "[]"} is trivial. For the cons-case suppose the
string is of the form @{term "c # s"}. By induction hypothesis we
know @{term "slexer r s = lexer r s"} holds for all @{term r} (in
particular for @{term "r"} being the derivative @{term "der c
r"}). Let @{term "r\<^sub>s"} be the simplified derivative regular expression, that is @{term
"fst (simp (der c r))"}, and @{term "f\<^sub>r"} be the rectification
function, that is @{term "snd (simp (der c r))"}. We distinguish the cases
whether (*) @{term "s \<in> L (der c r)"} or not. In the first case we
have by Theorem~\ref{lexercorrect}(2) a value @{term "v"} so that @{term
"lexer (der c r) s = Some v"} and @{term "s \<in> der c r \<rightarrow> v"} hold.
By Lemma~\ref{slexeraux}(1) we can also infer from~(*) that @{term "s
\<in> L r\<^sub>s"} holds. Hence we know by Theorem~\ref{lexercorrect}(2) that
there exists a @{term "v'"} with @{term "lexer r\<^sub>s s = Some v'"} and
@{term "s \<in> r\<^sub>s \<rightarrow> v'"}. From the latter we know by
Lemma~\ref{slexeraux}(2) that @{term "s \<in> der c r \<rightarrow> (f\<^sub>r v')"} holds.
By the uniqueness of the POSIX relation (Theorem~\ref{posixdeterm}) we
can infer that @{term v} is equal to @{term "f\<^sub>r v'"}---that is the
rectification function applied to @{term "v'"}
produces the original @{term "v"}. Now the case follows by the
definitions of @{const lexer} and @{const slexer}.
In the second case where @{term "s \<notin> L (der c r)"} we have that
@{term "lexer (der c r) s = None"} by Theorem~\ref{lexercorrect}(1). We
also know by Lemma~\ref{slexeraux}(1) that @{term "s \<notin> L r\<^sub>s"}. Hence
@{term "lexer r\<^sub>s s = None"} by Theorem~\ref{lexercorrect}(1) and
by IH then also @{term "slexer r\<^sub>s s = None"}. With this we can
conclude in this case too.\qed
\end{proof}
*}
(*<*)
end
(*>*)