lexing: thys/Paper/Paper.thy@6adda4a667b1


(*<*)
theory Paper
imports "../ReStar" "~~/src/HOL/Library/LaTeXsugar"
begin

declare [[show_question_marks = false]]

notation (latex output)
   If  ("(\<^raw:\textrm{>if\<^raw:}> (_)/ \<^raw:\textrm{>then\<^raw:}> (_)/ \<^raw:\textrm{>else\<^raw:}> (_))" 10) and
  Cons ("_\<^raw:\mbox{$\,$}>::\<^raw:\mbox{$\,$}>_" [78,77] 73) and
  ZERO ("\<^raw:\textrm{0}>" 78) and 
  ONE ("\<^raw:\textrm{1}>" 78) and 
  CHAR ("_" [1000] 10) and
  ALT ("_ + _" [1000,1000] 78) and
  SEQ ("_ \<cdot> _" [1000,1000] 78) and
  STAR ("_\<^sup>\<star>" [1000] 78) and
  val.Char ("Char _" [1000] 78) and
  val.Left ("Left _" [1000] 78) and
  val.Right ("Right _" [1000] 78) and
  L ("L _" [1000] 0) and
  flat ("|_|" [70] 73) and
  Sequ ("_ @ _" [78,77] 63) and
  injval ("inj _ _ _" [1000,77,1000] 77) and 
  projval ("proj _ _ _" [1000,77,1000] 77) and 
  length ("len _" [78] 73) 
  (* and ValOrd ("_ \<succeq>\<^bsub>_\<^esub> _" [78,77,77] 73) *)
(*>*)

section {* Introduction *}

text {*
  

Sulzmann and Lu \cite{Sulzmann2014}

there are two commonly used
disambiguation strategies to create a unique matching tree:
one is called \emph{greedy} matching \cite{Frisch2004} and the
other is \emph{POSIX} matching~\cite{Kuklewicz,Sulzmann2014}.
For the latter there are two rough rules:  

\begin{itemize}
\item The Longest Match Rule (or ``maximal munch rule''):\smallskip\\ The
      longest initial substring matched by any regular
      expression is taken as next token.

\item Rule Priority:\smallskip\\ For a particular longest initial
      substring, the first regular expression that can match
      determines the token.
\end{itemize}

\noindent In the context of lexing, POSIX is the more
interesting disambiguation strategy as it produces longest
matches, which is necessary for tokenising programs. For
example the string \textit{iffoo} should not match the keyword
\textit{if} and the rest, but as one string \textit{iffoo},
which might be a variable name in a program. As another
example consider the string $xy$ and the regular expression
\mbox{$(x + y + xy)^*$}. Either the input string can be
matched in two `iterations' by the single letter-regular
expressions $x$ and $y$, or directly in one iteration by $xy$.
The first case corresponds to greedy matching, which first
matches with the left-most symbol and only matches the next
symbol in case of a mismatch. The second case is POSIX
matching, which prefers the longest match. In case more than
one (longest) matches exist, only then it prefers the
left-most match. While POSIX matching seems natural, it turns
out to be much more subtle than greedy matching in terms of
implementations and in terms of proving properties about it.
If POSIX matching is implemented using automata, then one has
to follow transitions (according to the input string) until
one finds an accepting state, record this state and look for
further transition which might lead to another accepting state
that represents a longer input initial substring to be
matched. Only if none can be found, the last accepting state
is returned.


Sulzmann and Lu's paper \cite{Sulzmann2014} targets POSIX
regular expression matching. They write that it is known to be
to be a non-trivial problem and nearly all POSIX matching
implementations are ``buggy'' \cite[Page 203]{Sulzmann2014}.
For this they cite a study by Kuklewicz \cite{Kuklewicz}. My
current work is about formalising the proofs in the paper by
Sulzmann and Lu. Specifically, they propose in this paper a
POSIX matching algorithm and give some details of a
correctness proof for this algorithm inside the paper and some
more details in an appendix. This correctness proof is
unformalised, meaning it is just a ``pencil-and-paper'' proof,
not done in a theorem prover. Though, the paper and presumably
the proof have been peer-reviewed. Unfortunately their proof
does not give enough details such that it can be
straightforwardly implemented in a theorem prover, say
Isabelle. In fact, the purported proof they outline does not
work in central places. We discovered this when filling in
many gaps and attempting to formalise the proof in Isabelle. 



{\bf Contributions:}

*}

section {* Preliminaries *}

text {* \noindent Strings in Isabelle/HOL are lists of characters with
  the empty string being represented by the empty list, written @{term
  "[]"}, and list-cons being written as @{term "DUMMY # DUMMY"}. By
  using the type char we have a supply of finitely many characters
  roughly corresponding to the ASCII character set.
  Regular exprtessions

  \begin{center}
  @{text "r :="}
  @{const "ZERO"} $\mid$
  @{const "ONE"} $\mid$
  @{term "CHAR c"} $\mid$
  @{term "ALT r\<^sub>1 r\<^sub>2"} $\mid$
  @{term "SEQ r\<^sub>1 r\<^sub>2"} $\mid$
  @{term "STAR r"} 
  \end{center}

*}

section {* POSIX Regular Expression Matching *}

section {* The Argument by Sulzmmann and Lu *}

section {* Conclusion *}

text {*

  Nipkow lexer from 2000

*}


text {*




  \noindent
  Values

  \begin{center}
  @{text "v :="}
  @{const "Void"} $\mid$
  @{term "val.Char c"} $\mid$
  @{term "Left v"} $\mid$
  @{term "Right v"} $\mid$
  @{term "Seq v\<^sub>1 v\<^sub>2"} $\mid$ 
  @{term "Stars vs"} 
  \end{center}  

  \noindent
  The language of a regular expression

  \begin{center}
  \begin{tabular}{lcl}
  @{thm (lhs) L.simps(1)} & $\dn$ & @{thm (rhs) L.simps(1)}\\
  @{thm (lhs) L.simps(2)} & $\dn$ & @{thm (rhs) L.simps(2)}\\
  @{thm (lhs) L.simps(3)} & $\dn$ & @{thm (rhs) L.simps(3)}\\
  @{thm (lhs) L.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) L.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
  @{thm (lhs) L.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) L.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
  @{thm (lhs) L.simps(6)} & $\dn$ & @{thm (rhs) L.simps(6)}\\
  \end{tabular}
  \end{center}

  \noindent
  The nullable function

  \begin{center}
  \begin{tabular}{lcl}
  @{thm (lhs) nullable.simps(1)} & $\dn$ & @{thm (rhs) nullable.simps(1)}\\
  @{thm (lhs) nullable.simps(2)} & $\dn$ & @{thm (rhs) nullable.simps(2)}\\
  @{thm (lhs) nullable.simps(3)} & $\dn$ & @{thm (rhs) nullable.simps(3)}\\
  @{thm (lhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(4)[of "r\<^sub>1" "r\<^sub>2"]}\\
  @{thm (lhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) nullable.simps(5)[of "r\<^sub>1" "r\<^sub>2"]}\\
  @{thm (lhs) nullable.simps(6)} & $\dn$ & @{thm (rhs) nullable.simps(6)}\\
  \end{tabular}
  \end{center}

  \noindent
  The derivative function for characters and strings

  \begin{center}
  \begin{tabular}{lcp{7.5cm}}
  @{thm (lhs) der.simps(1)} & $\dn$ & @{thm (rhs) der.simps(1)}\\
  @{thm (lhs) der.simps(2)} & $\dn$ & @{thm (rhs) der.simps(2)}\\
  @{thm (lhs) der.simps(3)} & $\dn$ & @{thm (rhs) der.simps(3)}\\
  @{thm (lhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) der.simps(4)[of c "r\<^sub>1" "r\<^sub>2"]}\\
  @{thm (lhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) der.simps(5)[of c "r\<^sub>1" "r\<^sub>2"]}\\
  @{thm (lhs) der.simps(6)} & $\dn$ & @{thm (rhs) der.simps(6)}\medskip\\

  @{thm (lhs) ders.simps(1)} & $\dn$ & @{thm (rhs) ders.simps(1)}\\
  @{thm (lhs) ders.simps(2)} & $\dn$ & @{thm (rhs) ders.simps(2)}\\
  \end{tabular}
  \end{center}

  \noindent
  The @{const flat} function for values

  \begin{center}
  \begin{tabular}{lcl}
  @{thm (lhs) flat.simps(1)} & $\dn$ & @{thm (rhs) flat.simps(1)}\\
  @{thm (lhs) flat.simps(2)} & $\dn$ & @{thm (rhs) flat.simps(2)}\\
  @{thm (lhs) flat.simps(3)} & $\dn$ & @{thm (rhs) flat.simps(3)}\\
  @{thm (lhs) flat.simps(4)} & $\dn$ & @{thm (rhs) flat.simps(4)}\\
  @{thm (lhs) flat.simps(5)[of "v\<^sub>1" "v\<^sub>2"]} & $\dn$ & @{thm (rhs) flat.simps(5)[of "v\<^sub>1" "v\<^sub>2"]}\\
  @{thm (lhs) flat.simps(6)} & $\dn$ & @{thm (rhs) flat.simps(6)}\\
  @{thm (lhs) flat.simps(7)} & $\dn$ & @{thm (rhs) flat.simps(7)}\\
  \end{tabular}
  \end{center}

  \noindent
  The @{const mkeps} function

  \begin{center}
  \begin{tabular}{lcl}
  @{thm (lhs) mkeps.simps(1)} & $\dn$ & @{thm (rhs) mkeps.simps(1)}\\
  @{thm (lhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(2)[of "r\<^sub>1" "r\<^sub>2"]}\\
  @{thm (lhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]} & $\dn$ & @{thm (rhs) mkeps.simps(3)[of "r\<^sub>1" "r\<^sub>2"]}\\
  @{thm (lhs) mkeps.simps(4)} & $\dn$ & @{thm (rhs) mkeps.simps(4)}\\
  \end{tabular}
  \end{center}

  \noindent
  The @{text inj} function

  \begin{center}
  \begin{tabular}{lcl}
  @{thm (lhs) injval.simps(1)} & $\dn$ & @{thm (rhs) injval.simps(1)}\\
  @{thm (lhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]} & $\dn$ & 
      @{thm (rhs) injval.simps(2)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1"]}\\
  @{thm (lhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$ & 
      @{thm (rhs) injval.simps(3)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\
  @{thm (lhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$ 
      & @{thm (rhs) injval.simps(4)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\
  @{thm (lhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]} & $\dn$ 
      & @{thm (rhs) injval.simps(5)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>1" "v\<^sub>2"]}\\
  @{thm (lhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]} & $\dn$ 
      & @{thm (rhs) injval.simps(6)[of "r\<^sub>1" "r\<^sub>2" "c" "v\<^sub>2"]}\\
  @{thm (lhs) injval.simps(7)[of "r" "c" "v" "vs"]} & $\dn$ 
      & @{thm (rhs) injval.simps(7)[of "r" "c" "v" "vs"]}\\
  \end{tabular}
  \end{center}

  \noindent
  The inhabitation relation:

  \begin{center}
  \begin{tabular}{c}
  @{thm[mode=Rule] Prf.intros(1)[of "v\<^sub>1" "r\<^sub>1" "v\<^sub>2" "r\<^sub>2"]}\medskip\\ 
  @{thm[mode=Rule] Prf.intros(2)[of "v\<^sub>1" "r\<^sub>1" "r\<^sub>2"]} \qquad 
  @{thm[mode=Rule] Prf.intros(3)[of "v\<^sub>2" "r\<^sub>1" "r\<^sub>2"]}\medskip\\
  @{thm[mode=Axiom] Prf.intros(4)} \qquad 
  @{thm[mode=Axiom] Prf.intros(5)[of "c"]}\medskip\\
  @{thm[mode=Axiom] Prf.intros(6)[of "r"]} \qquad 
  @{thm[mode=Rule] Prf.intros(7)[of "v" "r" "vs"]}\medskip\\
  \end{tabular}
  \end{center}

  \noindent
  We have also introduced a slightly restricted version of this relation
  where the last rule is restricted so that @{term "flat v \<noteq> []"}.
  This relation for \emph{non-problematic} is written @{term "\<Turnstile> v : r"}.
  \bigskip


  \noindent
  Our Posix relation @{term "s \<in> r \<rightarrow> v"}

  \begin{center}
  \begin{tabular}{c}
  @{thm[mode=Axiom] PMatch.intros(1)} \qquad
  @{thm[mode=Axiom] PMatch.intros(2)}\medskip\\
  @{thm[mode=Rule] PMatch.intros(3)[of "s" "r\<^sub>1" "v" "r\<^sub>2"]}\qquad
  @{thm[mode=Rule] PMatch.intros(4)[of "s" "r\<^sub>2" "v" "r\<^sub>1"]}\medskip\\
  \multicolumn{1}{p{5cm}}{@{thm[mode=Rule] PMatch.intros(5)[of "s\<^sub>1" "r\<^sub>1" "v\<^sub>1" "s\<^sub>2" "r\<^sub>2" "v\<^sub>2"]}}\medskip\\
  @{thm[mode=Rule] PMatch.intros(6)[of "s\<^sub>1" "r" "v" "s\<^sub>2" "vs"]}\medskip\\
  @{thm[mode=Axiom] PMatch.intros(7)}\medskip\\
  \end{tabular}
  \end{center}

  \noindent
  Our version of Sulzmann's ordering relation

  \begin{center}
  \begin{tabular}{c}
  @{thm[mode=Rule] ValOrd.intros(2)[of "v\<^sub>1" "r\<^sub>1" "v\<^sub>1'" "v\<^sub>2" "r\<^sub>2" "v\<^sub>2'"]} \qquad
  @{thm[mode=Rule] ValOrd.intros(1)[of "v\<^sub>2" "r\<^sub>2" "v\<^sub>2'" "v\<^sub>1" "r\<^sub>1"]}\medskip\\
  @{thm[mode=Rule] ValOrd.intros(3)[of "v\<^sub>1" "v\<^sub>2" "r\<^sub>1" "r\<^sub>2"]} \qquad
  @{thm[mode=Rule] ValOrd.intros(4)[of "v\<^sub>2" "v\<^sub>1" "r\<^sub>1" "r\<^sub>2"]}\medskip\\ 
  @{thm[mode=Rule] ValOrd.intros(5)[of "v\<^sub>2" "r\<^sub>2" "v\<^sub>2'" "r\<^sub>1"]} \qquad
  @{thm[mode=Rule] ValOrd.intros(6)[of "v\<^sub>1" "r\<^sub>1" "v\<^sub>1'"  "r\<^sub>2"]} \medskip\\
  @{thm[mode=Axiom] ValOrd.intros(7)}\qquad
  @{thm[mode=Axiom] ValOrd.intros(8)[of "c"]}\medskip\\
  @{thm[mode=Rule] ValOrd.intros(9)[of "v" "vs" "r"]}\qquad
  @{thm[mode=Rule] ValOrd.intros(10)[of "v" "vs" "r"]}\medskip\\
  @{thm[mode=Rule] ValOrd.intros(11)[of "v\<^sub>1" "r" "v\<^sub>2" "vs\<^sub>1" "vs\<^sub>2"]}\medskip\\
  @{thm[mode=Rule] ValOrd.intros(12)[of "vs\<^sub>1" "r" "vs\<^sub>2" "v"]}\qquad
  @{thm[mode=Axiom] ValOrd.intros(13)[of "r"]}\medskip\\
  \end{tabular}
  \end{center}

  \noindent
  A prefix of a string s

  \begin{center}
  \begin{tabular}{c}
  @{thm prefix_def[of "s\<^sub>1" "s\<^sub>2"]}
  \end{tabular}
  \end{center}

  \noindent
  Values and non-problematic values

  \begin{center}
  \begin{tabular}{c}
  @{thm Values_def}\medskip\\
  \end{tabular}
  \end{center}


  \noindent
  The point is that for a given @{text s} and @{text r} there are only finitely many
  non-problematic values.
*} 

text {* 
  \noindent
  Some lemmas we have proved:\bigskip
  
  @{thm L_flat_Prf}

  @{thm L_flat_NPrf}

  @{thm[mode=IfThen] mkeps_nullable}

  @{thm[mode=IfThen] mkeps_flat}

  @{thm[mode=IfThen] Prf_injval}

  @{thm[mode=IfThen] Prf_injval_flat}
  
  @{thm[mode=IfThen] PMatch_mkeps}
  
  @{thm[mode=IfThen] PMatch1(2)}

  @{thm[mode=IfThen] PMatch1N}

  @{thm[mode=IfThen] PMatch_determ(1)[of "s" "r" "v\<^sub>1" "v\<^sub>2"]}

  \medskip
  \noindent
  This is the main theorem that lets us prove that the algorithm is correct according to
  @{term "s \<in> r \<rightarrow> v"}:

  @{thm[mode=IfThen] PMatch2}

  \mbox{}\bigskip
  
  \noindent {\bf Proof} The proof is by induction on the definition of
  @{const der}. Other inductions would go through as well. The
  interesting case is for @{term "SEQ r\<^sub>1 r\<^sub>2"}. First we analyse the
  case where @{term "nullable r\<^sub>1"}. We have by induction hypothesis

  \[
  \begin{array}{l}
  (IH1)\quad @{text "\<forall>s v."} \text{\;if\;} @{term "s \<in> der c r\<^sub>1 \<rightarrow> v"} 
  \text{\;then\;} @{term "(c # s) \<in> r\<^sub>1 \<rightarrow> injval r\<^sub>1 c v"}\\
  (IH2)\quad @{text "\<forall>s v."} \text{\;if\;} @{term "s \<in> der c r\<^sub>2 \<rightarrow> v"} 
  \text{\;then\;} @{term "(c # s) \<in> r\<^sub>2 \<rightarrow> injval r\<^sub>2 c v"}
  \end{array}
  \]
  
  \noindent
  and have 

  \[
  @{term "s \<in> ALT (SEQ (der c r\<^sub>1) r\<^sub>2) (der c r\<^sub>2) \<rightarrow> v"}
  \]
  
  \noindent
  There are two cases what @{term v} can be: (1) @{term "Left v'"} and (2) @{term "Right v'"}.

  \begin{itemize}
  \item[(1)] We know @{term "s \<in> SEQ (der c r\<^sub>1) r\<^sub>2 \<rightarrow> v'"} holds, from which we
  can infer that there are @{text "s\<^sub>1"}, @{term "s\<^sub>2"}, @{text "v\<^sub>1"}, @{term "v\<^sub>2"}
  with

  \[
  @{term "s\<^sub>1 \<in> der c r\<^sub>1 \<rightarrow> v\<^sub>1"} \qquad\text{and}\qquad @{term "s\<^sub>2 \<in> r\<^sub>2 \<rightarrow> v\<^sub>2"}
  \]

  and also

  \[
  @{term "\<not> (\<exists>s\<^sub>3 s\<^sub>4. s\<^sub>3 \<noteq> [] \<and> s\<^sub>3 @ s\<^sub>4 = s\<^sub>2 \<and> s\<^sub>1 @ s\<^sub>3 \<in> L (der c r\<^sub>1) \<and> s\<^sub>4 \<in> L r\<^sub>2)"}
  \]

  \noindent
  and have to prove
  
  \[
  @{term "((c # s\<^sub>1) @ s\<^sub>2) \<in> SEQ r\<^sub>1 r\<^sub>2 \<rightarrow> Seq (injval r\<^sub>1 c v\<^sub>1) v\<^sub>2"}
  \]

  \noindent
  The two requirements @{term "(c # s\<^sub>1) \<in> r\<^sub>1 \<rightarrow> injval r\<^sub>1 c v\<^sub>1"} and 
  @{term "s\<^sub>2 \<in> r\<^sub>2 \<rightarrow> v\<^sub>2"} can be proved by the induction hypothese (IH1) and the
  fact above.

  \noindent
  This leaves to prove
  
  \[
  @{term "\<not> (\<exists>s\<^sub>3 s\<^sub>4. s\<^sub>3 \<noteq> [] \<and> s\<^sub>3 @ s\<^sub>4 = s\<^sub>2 \<and> (c # s\<^sub>1) @ s\<^sub>3 \<in> L r\<^sub>1 \<and> s\<^sub>4 \<in> L r\<^sub>2)"}
  \]
  
  \noindent
  which holds because @{term "(c # s\<^sub>1) @ s\<^sub>3 \<in> L r\<^sub>1 "} implies @{term "s\<^sub>1 @ s\<^sub>3 \<in> L (der c r\<^sub>1) "}

  \item[(2)] This case is similar.
  \end{itemize}

  \noindent 
  The final case is that @{term " \<not> nullable r\<^sub>1"} holds. This case again similar
  to the cases above.
*}


text {*
  %\noindent
  %{\bf Acknowledgements:}
  %We are grateful for the comments we received from anonymous
  %referees.

  \bibliographystyle{plain}
  \bibliography{root}

  \section{Roy's Rules}

   \newcommand{\abs}[1]{\mid\!\! #1\!\! \mid}
   %%\newcommand{\mts}{\textit{``''}
   \newcommand{\tl}{\ \triangleleft\ }
   $$\inferrule[]{Void \tl \epsilon}{}
            \quad\quad
     \inferrule[]{Char\ c \tl Lit\ c}{}
   $$
   $$\inferrule
       {v_1 \tl r_1}
       {Left\ v_1 \tl r_1 + r_2}
   \quad\quad
     \inferrule[]
       { v_2 \tl r_2 \\ \abs{v_2}\ \not\in\ L(r_1)}
       {Right\ v_2 \tl r_1 + r_2}
   $$
   $$
   \inferrule
       {v_1 \tl r_1\\
        v_2 \tl r_2\\
        s \in\  L(r_1\backslash\! \abs{v_1}) \ \land\
        \abs{v_2}\!\backslash s\ \epsilon\ L(r_2)
        \ \Rightarrow\ s = []
       }
       {(v_1, v_2) \tl r_1 \cdot r_2}
   $$
   $$\inferrule
         { v \tl r \\ vs \tl r^* \\ \abs{v}\ \not=\ []} 
         { (v :: vs) \tl r^* }
   \quad\quad
       \inferrule{}
         { []  \tl r^* }       
   $$

*}


(*<*)
end
(*>*)
author	Christian Urban <christian dot urban at kcl dot ac dot uk>
	Sun, 28 Feb 2016 14:01:12 +0000
changeset 107	6adda4a667b1
parent 105	80218dddbb15
child 108	73f7dc60c285
permissions	-rw-r--r--