thys/Paper/Paper.thy
changeset 137 4178b7e71809
parent 136 fa0d8aa5d7de
child 138 a87b8a09ffe8
equal deleted inserted replaced
136:fa0d8aa5d7de 137:4178b7e71809
   552   an induction on @{term r}. There are no interesting cases.\qed
   552   an induction on @{term r}. There are no interesting cases.\qed
   553   \end{proof}
   553   \end{proof}
   554 
   554 
   555   Having defined the @{const mkeps} and @{text inj} functions, we can extend
   555   Having defined the @{const mkeps} and @{text inj} functions, we can extend
   556   \Brz's matcher so that a lexical value is constructed (assuming the
   556   \Brz's matcher so that a lexical value is constructed (assuming the
   557   regular expression matches the string). The clauses of the lexer are
   557   regular expression matches the string). The clauses of the Sulzmann and Lu lexer are
   558 
   558 
   559   \begin{center}
   559   \begin{center}
   560   \begin{tabular}{lcl}
   560   \begin{tabular}{lcl}
   561   @{thm (lhs) matcher.simps(1)} & $\dn$ & @{thm (rhs) matcher.simps(1)}\\
   561   @{thm (lhs) matcher.simps(1)} & $\dn$ & @{thm (rhs) matcher.simps(1)}\\
   562   @{thm (lhs) matcher.simps(2)} & $\dn$ & @{text "case"} @{term "matcher (der c r) s"} @{text of}\\
   562   @{thm (lhs) matcher.simps(2)} & $\dn$ & @{text "case"} @{term "matcher (der c r) s"} @{text of}\\
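
  To make these clauses concrete, the following is a minimal executable sketch in
  Scala of the matcher/lexer described in this section. The datatype, constructor
  and function names are ours, chosen for illustration; the definitions follow the
  standard derivative-based presentation, not the Isabelle formalisation verbatim.

  \begin{verbatim}
  // regular expressions and (lexical) values
  sealed trait Rexp
  case object ZERO extends Rexp
  case object ONE extends Rexp
  case class CHAR(c: Char) extends Rexp
  case class ALT(r1: Rexp, r2: Rexp) extends Rexp
  case class SEQ(r1: Rexp, r2: Rexp) extends Rexp
  case class STAR(r: Rexp) extends Rexp

  sealed trait Val
  case object Void extends Val
  case class Chr(c: Char) extends Val
  case class Sequ(v1: Val, v2: Val) extends Val
  case class Lft(v: Val) extends Val
  case class Rgt(v: Val) extends Val
  case class Stars(vs: List[Val]) extends Val

  def nullable(r: Rexp): Boolean = r match {
    case ZERO => false
    case ONE => true
    case CHAR(_) => false
    case ALT(r1, r2) => nullable(r1) || nullable(r2)
    case SEQ(r1, r2) => nullable(r1) && nullable(r2)
    case STAR(_) => true
  }

  def der(c: Char, r: Rexp): Rexp = r match {
    case ZERO => ZERO
    case ONE => ZERO
    case CHAR(d) => if (c == d) ONE else ZERO
    case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))
    case SEQ(r1, r2) =>
      if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
      else SEQ(der(c, r1), r2)
    case STAR(r1) => SEQ(der(c, r1), STAR(r1))
  }

  // mkeps is partial: it is only called on nullable regular expressions
  def mkeps(r: Rexp): Val = r match {
    case ONE => Void
    case ALT(r1, r2) => if (nullable(r1)) Lft(mkeps(r1)) else Rgt(mkeps(r2))
    case SEQ(r1, r2) => Sequ(mkeps(r1), mkeps(r2))
    case STAR(_) => Stars(Nil)
  }

  // inj undoes one derivative step on the level of values
  def inj(r: Rexp, c: Char, v: Val): Val = (r, v) match {
    case (CHAR(d), Void) => Chr(d)
    case (ALT(r1, _), Lft(v1)) => Lft(inj(r1, c, v1))
    case (ALT(_, r2), Rgt(v2)) => Rgt(inj(r2, c, v2))
    case (SEQ(r1, _), Sequ(v1, v2)) => Sequ(inj(r1, c, v1), v2)
    case (SEQ(r1, _), Lft(Sequ(v1, v2))) => Sequ(inj(r1, c, v1), v2)
    case (SEQ(r1, r2), Rgt(v2)) => Sequ(mkeps(r1), inj(r2, c, v2))
    case (STAR(r1), Sequ(v1, Stars(vs))) => Stars(inj(r1, c, v1) :: vs)
  }

  // the lexer, following the clauses shown above
  def lex(r: Rexp, s: List[Char]): Option[Val] = s match {
    case Nil => if (nullable(r)) Some(mkeps(r)) else None
    case c :: cs => lex(der(c, r), cs).map(v => inj(r, c, v))
  }
  \end{verbatim}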
   636 
   636 
   637   A similar condition is imposed on the POSIX value in the @{text
   637   A similar condition is imposed on the POSIX value in the @{text
   638   "P\<star>"}-rule. Also there we want that @{term "s\<^sub>1"} is the longest initial
   638   "P\<star>"}-rule. Also there we want that @{term "s\<^sub>1"} is the longest initial
   639   split of @{term "s\<^sub>1 @ s\<^sub>2"} and furthermore the corresponding value
   639   split of @{term "s\<^sub>1 @ s\<^sub>2"} and furthermore the corresponding value
   640   @{term v} cannot be flattened to the empty string. In effect, we require
   640   @{term v} cannot be flattened to the empty string. In effect, we require
   641   that in each ``iteration'' of the star, some non-empty substring need to
   641   that in each ``iteration'' of the star, some non-empty substring needs to
   642   be ``chipped'' away; only in the case of the empty string do we accept @{term
   642   be ``chipped'' away; only in the case of the empty string do we accept @{term
   643   "Stars []"} as the POSIX value.
   643   "Stars []"} as the POSIX value.
   644 
   644 
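  In rule form, the @{text "P\<star>"}-rule sketched above can be rendered roughly as
  follows (our informal reconstruction from the description; the formalised rule
  may differ in details):

  \[
  \begin{array}{c}
  s_1 \in r \rightarrow v \qquad s_2 \in r^\star \rightarrow \textit{Stars}\,vs
  \qquad \textit{flat}\,v \neq []\\
  \neg(\exists s_3\,s_4.\; s_3 \neq [] \,\wedge\, s_3\,@\,s_4 = s_2 \,\wedge\,
  s_1\,@\,s_3 \in L(r) \,\wedge\, s_4 \in L(r^\star))\\
  \hline
  (s_1\,@\,s_2) \in r^\star \rightarrow \textit{Stars}\,(v{::}vs)
  \end{array}
  \]
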
   645    We can prove that given a string @{term s} and regular expression @{term
   645    We can prove that given a string @{term s} and regular expression @{term
   646    r}, the POSIX value @{term v} is uniquely determined by @{term "s \<in> r \<rightarrow>
   646    r}, the POSIX value @{term v} is uniquely determined by @{term "s \<in> r \<rightarrow>
   707   \noindent From the latter we can infer by Prop.~\ref{derprop}(2):
   707   \noindent From the latter we can infer by Prop.~\ref{derprop}(2):
   708 
   708 
   709   \[@{term "\<not> (\<exists>s\<^sub>3 s\<^sub>4. s\<^sub>3 \<noteq> [] \<and> s\<^sub>3 @ s\<^sub>4 = s\<^sub>2 \<and> (c # s\<^sub>1) @ s\<^sub>3 \<in> L r\<^sub>1 \<and> s\<^sub>4 \<in> L r\<^sub>2)"}\]
   709   \[@{term "\<not> (\<exists>s\<^sub>3 s\<^sub>4. s\<^sub>3 \<noteq> [] \<and> s\<^sub>3 @ s\<^sub>4 = s\<^sub>2 \<and> (c # s\<^sub>1) @ s\<^sub>3 \<in> L r\<^sub>1 \<and> s\<^sub>4 \<in> L r\<^sub>2)"}\]
   710 
   710 
   711   \noindent We can use the induction hypothesis for @{text "r\<^sub>1"} to obtain
   711   \noindent We can use the induction hypothesis for @{text "r\<^sub>1"} to obtain
   712   @{term "(c # s\<^sub>1) \<in> r\<^sub>1 \<rightarrow> injval r\<^sub>1 c v\<^sub>1"}. This allows us to infer
   712   @{term "(c # s\<^sub>1) \<in> r\<^sub>1 \<rightarrow> injval r\<^sub>1 c v\<^sub>1"}. Taken together, this allows us to infer
   713   @{term "((c # s\<^sub>1) @ s\<^sub>2) \<in> SEQ r\<^sub>1 r\<^sub>2 \<rightarrow> Seq (injval r\<^sub>1 c v\<^sub>1) v\<^sub>2"}. The case @{text "(c)"}
   713   @{term "((c # s\<^sub>1) @ s\<^sub>2) \<in> SEQ r\<^sub>1 r\<^sub>2 \<rightarrow> Seq (injval r\<^sub>1 c v\<^sub>1) v\<^sub>2"}. The case @{text "(c)"}
   714   is similar.
   714   is similar.
   715 
   715 
   716   For @{text "(b)"} we know @{term "s \<in> der c r\<^sub>2 \<rightarrow> v\<^sub>1"} and 
   716   For @{text "(b)"} we know @{term "s \<in> der c r\<^sub>2 \<rightarrow> v\<^sub>1"} and 
   717   @{term "s\<^sub>1 @ s\<^sub>2 \<notin> L (SEQ (der c r\<^sub>1) r\<^sub>2)"}. From the former
   717   @{term "s\<^sub>1 @ s\<^sub>2 \<notin> L (SEQ (der c r\<^sub>1) r\<^sub>2)"}. From the former
   720 
   720 
   721   \[@{term "\<not> (\<exists>s\<^sub>3 s\<^sub>4. s\<^sub>3 \<noteq> [] \<and> s\<^sub>3 @ s\<^sub>4 = c # s \<and> s\<^sub>3 \<in> L r\<^sub>1 \<and> s\<^sub>4 \<in> L r\<^sub>2)"}\]
   721   \[@{term "\<not> (\<exists>s\<^sub>3 s\<^sub>4. s\<^sub>3 \<noteq> [] \<and> s\<^sub>3 @ s\<^sub>4 = c # s \<and> s\<^sub>3 \<in> L r\<^sub>1 \<and> s\<^sub>4 \<in> L r\<^sub>2)"}\]
   722 
   722 
   723   \noindent By Lem.~\ref{lemmkeps} we know @{term "[] \<in> r\<^sub>1 \<rightarrow> (mkeps r\<^sub>1)"}
   723   \noindent By Lem.~\ref{lemmkeps} we know @{term "[] \<in> r\<^sub>1 \<rightarrow> (mkeps r\<^sub>1)"}
   724   holds. Putting this all together, we can conclude with @{term "(c #
   724   holds. Putting this all together, we can conclude with @{term "(c #
   725   s) \<in> SEQ r\<^sub>1 r\<^sub>2 \<rightarrow> Seq (mkeps r\<^sub>1) (injval r\<^sub>2 c v\<^sub>1)"}.
   725   s) \<in> SEQ r\<^sub>1 r\<^sub>2 \<rightarrow> Seq (mkeps r\<^sub>1) (injval r\<^sub>2 c v\<^sub>1)"}, as required.
   726 
   726 
   727   Finally suppose @{term "r = STAR r\<^sub>1"}. This case is very similar to the
   727   Finally suppose @{term "r = STAR r\<^sub>1"}. This case is very similar to the
   728   sequence case, except that we need to ensure that @{term "flat (injval r\<^sub>1
   728   sequence case, except that we need to ensure that @{term "flat (injval r\<^sub>1
   729   c v\<^sub>1) \<noteq> []"}. This follows from @{term "(c # s\<^sub>1)
   729   c v\<^sub>1) \<noteq> []"}. This follows from @{term "(c # s\<^sub>1)
   730   \<in> r' \<rightarrow> injval r\<^sub>1 c v\<^sub>1"}  (which in turn follows from @{term "s\<^sub>1 \<in> der c
   730   \<in> r\<^sub>1 \<rightarrow> injval r\<^sub>1 c v\<^sub>1"}  (which in turn follows from @{term "s\<^sub>1 \<in> der c
   731   r\<^sub>1 \<rightarrow> v\<^sub>1"} and the induction hypothesis).\qed
   731   r\<^sub>1 \<rightarrow> v\<^sub>1"} and the induction hypothesis).\qed
   732   \end{proof}
   732   \end{proof}
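
  As a concrete instance of the sequence cases in this proof, the Scala sketch
  given earlier can be run on a two-letter example (a hypothetical test of ours,
  not taken from the formalisation). The first assertion shows the value computed
  after the derivative w.r.t.~@{text a}, where the @{text Void} component is
  contributed by @{const mkeps}; injecting @{text a} back then yields the POSIX
  value for the complete string.

  \begin{verbatim}
  // reusing Rexp, Val, der, mkeps, inj and lex from the earlier sketch
  val r = SEQ(CHAR('a'), CHAR('b'))

  // lexing the suffix "b" against the derivative of r w.r.t. 'a':
  // the right alternative is taken and mkeps supplies the Void
  assert(lex(der('a', r), List('b')) == Some(Sequ(Void, Chr('b'))))

  // injecting 'a' back gives the POSIX value for the whole string "ab"
  assert(lex(r, "ab".toList) == Some(Sequ(Chr('a'), Chr('b'))))
  \end{verbatim}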
   733 
   733 
   734   \noindent
   734   \noindent
   735   With Lem.~\ref{PMatch2} in place, it is completely routine to establish
   735   With Lem.~\ref{PMatch2} in place, it is completely routine to establish
   747   \begin{proof}
   747   \begin{proof}
   748   By induction on @{term s} using Lem.~\ref{lemmkeps} and \ref{PMatch2}.\qed  
   748   By induction on @{term s} using Lem.~\ref{lemmkeps} and \ref{PMatch2}.\qed  
   749   \end{proof}
   749   \end{proof}
   750 
   750 
   751   \noindent This concludes our correctness proof. Note that we have not
   751   \noindent This concludes our correctness proof. Note that we have not
   752   changed the algorithm by Sulzmann and Lu, but introduced our own
   752   changed the algorithm by Sulzmann and Lu,\footnote{Any deviation we introduced is harmless.} 
       
   753   but introduced our own
   753   specification for what a correct result---a POSIX value---should be.
   754   specification for what a correct result---a POSIX value---should be.
   754   A strong point in favour of Sulzmann and Lu's algorithm is that it
   755   A strong point in favour of Sulzmann and Lu's algorithm is that it
   755   can be extended in various ways.
   756   can be extended in various ways.
   756 
   757 
   757 *}
   758 *}
   758 
   759 
   759   section {* Extensions and Optimisations *}
   760   section {* Extensions and Optimisations *}
   760 
   761 
   761 text {*
   762 text {*
   762 
   763 
   763   If we are interested in tokenising string, then we need to not just
   764   If we are interested in tokenising strings, then we need to not just
   764   split up the string into tokens, but also ``classify'' the tokens (for
   765   split up the string into tokens, but also ``classify'' the tokens (for
   765   example whether it is a keyword or an identifier). This can be
   766   example whether it is a keyword or an identifier). This can be
   766   done with only minor modifications by introducing \emph{record regular
   767   done with only minor modifications to the algorithm by introducing \emph{record regular
   767   expressions} and \emph{record values} (for example \cite{Sulzmann2014b}):
   768   expressions} and \emph{record values} (for example \cite{Sulzmann2014b}):
   768 
   769 
   769   \begin{center}  
   770   \begin{center}  
   770   @{text "r :="}
   771   @{text "r :="}
   771   @{text "..."} $\mid$
   772   @{text "..."} $\mid$
   775   @{text "(l : v)"}
   776   @{text "(l : v)"}
   776   \end{center}
   777   \end{center}
   777   
   778   
   778   \noindent where @{text l} is a label, say a string, @{text r} a regular
   779   \noindent where @{text l} is a label, say a string, @{text r} a regular
   779   expression and @{text v} a value. All functions can be smoothly extended
   780   expression and @{text v} a value. All functions can be smoothly extended
   780   to these regular expressions and values. For example @{text "(l : r)"} is
   781   to these regular expressions and values. For example \mbox{@{text "(l : r)"}} is
   781   nullable iff @{term r} is, and so on. The purpose of the record regular
   782   nullable iff @{term r} is, and so on. The purpose of the record regular
   782   expression is to mark certain parts of a regular expression and then
   783   expression is to mark certain parts of a regular expression and then
   783   record in the calculated value which parts of the string were matched by
   784   record in the calculated value which parts of the string were matched by
   784   this part. The label can then serve for classifying tokens. Recall the
   785   this part. The label can then serve as a classification of the tokens. For this, recall the
   785   regular expression @{text "(r\<^bsub>key\<^esub> + r\<^bsub>id\<^esub>)\<^sup>\<star>"} for keywords and
   786   regular expression @{text "(r\<^bsub>key\<^esub> + r\<^bsub>id\<^esub>)\<^sup>\<star>"} for keywords and
   786   identifiers from the Introduction. With record regular expressions we can
   787   identifiers from the Introduction. With record regular expressions we can
   787   form @{text "((key : r\<^bsub>key\<^esub>) + (id : r\<^bsub>id\<^esub>))\<^sup>\<star>"} and then traverse the
   788   form \mbox{@{text "((key : r\<^bsub>key\<^esub>) + (id : r\<^bsub>id\<^esub>))\<^sup>\<star>"}} and then traverse the
   788   calculated value and only collect the underlying strings in record values.
   789   calculated value and only collect the underlying strings in record values.
   789   With this we obtain finite sequences of pairs of labels and strings, for 
   790   With this we obtain finite sequences of pairs of labels and strings, for 
   790   example
   791   example
   791 
   792 
   792   \[@{text "(l\<^sub>1 : s\<^sub>1), ..., (l\<^sub>n : s\<^sub>n)"}\]
   793   \[@{text "(l\<^sub>1 : s\<^sub>1), ..., (l\<^sub>n : s\<^sub>n)"}\]
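
  One possible way of realising this extension, again on the level of the Scala
  sketch from earlier (constructor and function names are ours), is to add one
  constructor to each datatype, one clause to each lexer function, and a simple
  traversal that collects the labelled substrings from the calculated value:

  \begin{verbatim}
  // record regular expressions and record values
  case class REC(l: String, r: Rexp) extends Rexp
  case class Rec(l: String, v: Val) extends Val

  // the additional clauses of the lexer functions, one per function:
  //   nullable(REC(l, r))  = nullable(r)
  //   der(c, REC(l, r))    = der(c, r)          (labels play no role)
  //   mkeps(REC(l, r))     = Rec(l, mkeps(r))
  //   inj(REC(l, r), c, v) = Rec(l, inj(r, c, v))

  // flattening a value back into the string it represents
  def flat(v: Val): String = v match {
    case Void         => ""
    case Chr(c)       => c.toString
    case Sequ(v1, v2) => flat(v1) + flat(v2)
    case Lft(v1)      => flat(v1)
    case Rgt(v1)      => flat(v1)
    case Stars(vs)    => vs.map(flat).mkString
    case Rec(_, v1)   => flat(v1)
  }

  // collecting the label/string pairs recorded in a value
  def env(v: Val): List[(String, String)] = v match {
    case Rec(l, v1)   => (l, flat(v1)) :: env(v1)
    case Sequ(v1, v2) => env(v1) ++ env(v2)
    case Lft(v1)      => env(v1)
    case Rgt(v1)      => env(v1)
    case Stars(vs)    => vs.flatMap(env)
    case _            => Nil
  }
  \end{verbatim}

  For the keyword/identifier expression above one would then lex with
  \mbox{@{text "((key : r\<^bsub>key\<^esub>) + (id : r\<^bsub>id\<^esub>))\<^sup>\<star>"}} and apply @{text env} to the
  resulting value in order to obtain the sequence of pairs of labels and strings.
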
   797   Derivatives as calculated by \Brz's method are usually more complex
   798   Derivatives as calculated by \Brz's method are usually more complex
   798   regular expressions than the initial one; the result is that the matching
   799   regular expressions than the initial one; the result is that the matching
   799   and lexing algorithms are often abysmally slow. However, various
   800   and lexing algorithms are often abysmally slow. However, various
   800   optimisations are possible, such as the simplifications of @{term "ALT
   801   optimisations are possible, such as the simplifications of @{term "ALT
   801   ZERO r"}, @{term "ALT r ZERO"}, @{term "SEQ ONE r"} and @{term "SEQ r
   802   ZERO r"}, @{term "ALT r ZERO"}, @{term "SEQ ONE r"} and @{term "SEQ r
   802   ONE"} to @{term r}. One of the advantages of having a simple specification
   803   ONE"} to @{term r}. These simplifications can speed up the algorithms considerably,
   803   and correctness proof is that the latter can be refined to allow for such
   804   as noted in \cite{Sulzmann2014}.
   804   optimisations and simple correctness proof.
   805   One of the advantages of having a simple specification
       
   806   and correctness proof is that the latter can be refined to prove the 
       
   807   correctness of such simplification steps.
   805 
   808 
   806   While the simplification of regular expressions according to 
   809   While the simplification of regular expressions according to 
   807   rules like
   810   rules like
   808 
   811 
   809   \begin{center}
   812   \begin{center}
   816   \end{center}
   819   \end{center}
   817 
   820 
   818   \noindent is well understood, there is an obstacle with the POSIX value
   821   \noindent is well understood, there is an obstacle with the POSIX value
   819   calculation algorithm by Sulzmann and Lu: if we build a derivative regular
   822   calculation algorithm by Sulzmann and Lu: if we build a derivative regular
   820   expression and then simplify it, we will calculate a POSIX value for this
   823   expression and then simplify it, we will calculate a POSIX value for this
   821   simplified regular expression, \emph{not} for the original (unsimplified)
   824   simplified derivative regular expression, \emph{not} for the original (unsimplified)
   822   derivative regular expression. Sulzmann and Lu overcome this obstacle by
   825   derivative regular expression. Sulzmann and Lu overcome this obstacle by
   823   not just calculating a simplified regular expression, but also calculating
   826   not just calculating a simplified regular expression, but also calculating
   824   a \emph{rectification function} that ``repairs'' the incorrect value.
   827   a \emph{rectification function} that ``repairs'' the incorrect value.
   825   
   828   
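  One way of implementing such a combination of simplification and rectification,
  again as a sketch on top of the earlier Scala definitions rather than Sulzmann
  and Lu's own code, is to let the simplification function return the simplified
  regular expression together with a function that maps values of the simplified
  expression back to values of the original one:

  \begin{verbatim}
  // simplification with rectification (only ALT and SEQ are simplified)
  def simp(r: Rexp): (Rexp, Val => Val) = r match {
    case ALT(r1, r2) =>
      val (s1, f1) = simp(r1)
      val (s2, f2) = simp(r2)
      (s1, s2) match {
        case (ZERO, _) => (s2, (v: Val) => Rgt(f2(v)))
        case (_, ZERO) => (s1, (v: Val) => Lft(f1(v)))
        case _ => (ALT(s1, s2), (v: Val) => v match {
          case Lft(v1) => Lft(f1(v1))
          case Rgt(v2) => Rgt(f2(v2))
        })
      }
    case SEQ(r1, r2) =>
      val (s1, f1) = simp(r1)
      val (s2, f2) = simp(r2)
      (s1, s2) match {
        case (ONE, _) => (s2, (v: Val) => Sequ(f1(Void), f2(v)))
        case (_, ONE) => (s1, (v: Val) => Sequ(f1(v), f2(Void)))
        case _ => (SEQ(s1, s2), (v: Val) => v match {
          case Sequ(v1, v2) => Sequ(f1(v1), f2(v2))
        })
      }
    case _ => (r, (v: Val) => v)
  }

  // a lexer that simplifies each derivative and rectifies the value
  // before injecting the character back
  def lexSimp(r: Rexp, s: List[Char]): Option[Val] = s match {
    case Nil => if (nullable(r)) Some(mkeps(r)) else None
    case c :: cs =>
      val (rs, rect) = simp(der(c, r))
      lexSimp(rs, cs).map(v => inj(r, c, rect(v)))
  }
  \end{verbatim}
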
   826   The rectification functions can be (slightly clumsily) implemented in
   829   The rectification functions can be (slightly clumsily) implemented in
  1122   by Sulzmann and Lu \cite{Sulzmann2014}, but we believe it contains
  1125   by Sulzmann and Lu \cite{Sulzmann2014}, but we believe it contains
  1123   unfillable gaps. In the online version of \cite{Sulzmann2014}, the authors
  1126   unfillable gaps. In the online version of \cite{Sulzmann2014}, the authors
  1124   already acknowledge some small problems, but our experience suggests
  1127   already acknowledge some small problems, but our experience suggests
  1125   there are more serious problems. 
  1128   there are more serious problems. 
  1126   
  1129   
  1127   Closesly related to our work is an automata-based lexer formalised by
  1130   Closely related to our work is an automata-based lexer formalised by
  1128   Nipkow \cite{Nipkow98}. This lexer also splits up strings into longest
  1131   Nipkow \cite{Nipkow98}. This lexer also splits up strings into longest
  1129   initial substrings, but Nipkow's algorithm is not completely
  1132   initial substrings, but Nipkow's algorithm is not completely
  1130   computational. The algorithm by Sulzmann and Lu, in contrast, can be
  1133   computational. The algorithm by Sulzmann and Lu, in contrast, can be
  1131   implemented easily in functional languages. A bespoke lexer for the
  1134   implemented easily in functional languages. A bespoke lexer for the
  1132   Imp-Language is formalised in Coq as part of the Software Foundations book
  1135   Imp-Language is formalised in Coq as part of the Software Foundations book