thys/Paper/Paper.thy
changeset 147 71f4ecc08849
parent 146 da81ffac4b10
child 148 702ed601349b
equal deleted inserted replaced
146:da81ffac4b10 147:71f4ecc08849
   705   \noindent From the latter we can infer by Prop.~\ref{derprop}(2):
   705   \noindent From the latter we can infer by Prop.~\ref{derprop}(2):
   706 
   706 
   707   \[@{term "\<not> (\<exists>s\<^sub>3 s\<^sub>4. s\<^sub>3 \<noteq> [] \<and> s\<^sub>3 @ s\<^sub>4 = s\<^sub>2 \<and> (c # s\<^sub>1) @ s\<^sub>3 \<in> L r\<^sub>1 \<and> s\<^sub>4 \<in> L r\<^sub>2)"}\]
   707   \[@{term "\<not> (\<exists>s\<^sub>3 s\<^sub>4. s\<^sub>3 \<noteq> [] \<and> s\<^sub>3 @ s\<^sub>4 = s\<^sub>2 \<and> (c # s\<^sub>1) @ s\<^sub>3 \<in> L r\<^sub>1 \<and> s\<^sub>4 \<in> L r\<^sub>2)"}\]
   708 
   708 
   709   \noindent We can use the induction hypothesis for @{text "r\<^sub>1"} to obtain
   709   \noindent We can use the induction hypothesis for @{text "r\<^sub>1"} to obtain
   710   @{term "(c # s\<^sub>1) \<in> r\<^sub>1 \<rightarrow> injval r\<^sub>1 c v\<^sub>1"}. This all put together allows us to infer
   710   @{term "(c # s\<^sub>1) \<in> r\<^sub>1 \<rightarrow> injval r\<^sub>1 c v\<^sub>1"}. Putting this all together allows us to infer
   711   @{term "((c # s\<^sub>1) @ s\<^sub>2) \<in> SEQ r\<^sub>1 r\<^sub>2 \<rightarrow> Seq (injval r\<^sub>1 c v\<^sub>1) v\<^sub>2"}. The case @{text "(c)"}
   711   @{term "((c # s\<^sub>1) @ s\<^sub>2) \<in> SEQ r\<^sub>1 r\<^sub>2 \<rightarrow> Seq (injval r\<^sub>1 c v\<^sub>1) v\<^sub>2"}. The case @{text "(c)"}
   712   is similar.
   712   is similar.
   713 
   713 
   714   For @{text "(b)"} we know @{term "s \<in> der c r\<^sub>2 \<rightarrow> v\<^sub>1"} and 
   714   For @{text "(b)"} we know @{term "s \<in> der c r\<^sub>2 \<rightarrow> v\<^sub>1"} and 
   715   @{term "s\<^sub>1 @ s\<^sub>2 \<notin> L (SEQ (der c r\<^sub>1) r\<^sub>2)"}. From the former
   715   @{term "s\<^sub>1 @ s\<^sub>2 \<notin> L (SEQ (der c r\<^sub>1) r\<^sub>2)"}. From the former
   721   \noindent By Lem.~\ref{lemmkeps} we know @{term "[] \<in> r\<^sub>1 \<rightarrow> (mkeps r\<^sub>1)"}
   721   \noindent By Lem.~\ref{lemmkeps} we know @{term "[] \<in> r\<^sub>1 \<rightarrow> (mkeps r\<^sub>1)"}
   722   holds. Putting this all together, we can conclude with @{term "(c #
   722   holds. Putting this all together, we can conclude with @{term "(c #
   723   s) \<in> SEQ r\<^sub>1 r\<^sub>2 \<rightarrow> Seq (mkeps r\<^sub>1) (injval r\<^sub>2 c v\<^sub>1)"}, as required.
   723   s) \<in> SEQ r\<^sub>1 r\<^sub>2 \<rightarrow> Seq (mkeps r\<^sub>1) (injval r\<^sub>2 c v\<^sub>1)"}, as required.
   724 
   724 
   725   Finally suppose @{term "r = STAR r\<^sub>1"}. This case is very similar to the
   725   Finally suppose @{term "r = STAR r\<^sub>1"}. This case is very similar to the
   726   sequence case, except that we need to ensure that @{term "flat (injval r\<^sub>1
   726   sequence case, except that we need to also ensure that @{term "flat (injval r\<^sub>1
   727   c v\<^sub>1) \<noteq> []"}. This follows from @{term "(c # s\<^sub>1)
   727   c v\<^sub>1) \<noteq> []"}. This follows from @{term "(c # s\<^sub>1)
   728   \<in> r\<^sub>1 \<rightarrow> injval r\<^sub>1 c v\<^sub>1"}  (which in turn follows from @{term "s\<^sub>1 \<in> der c
   728   \<in> r\<^sub>1 \<rightarrow> injval r\<^sub>1 c v\<^sub>1"}  (which in turn follows from @{term "s\<^sub>1 \<in> der c
   729   r\<^sub>1 \<rightarrow> v\<^sub>1"} and the induction hypothesis).\qed
   729   r\<^sub>1 \<rightarrow> v\<^sub>1"} and the induction hypothesis).\qed
   730   \end{proof}
   730   \end{proof}
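
  To see how mkeps and injval interact operationally in the cases above, here
  is a small Haskell sketch of a derivative-based lexer in the style discussed
  in this paper. It is an illustration only: all datatype and function names
  are chosen for this sketch, and mkeps and inj are left partial, just like
  their specifications (they are only meant to be applied to nullable regular
  expressions, respectively to values of the corresponding derivative).

    -- illustrative sketch; all names are chosen for this example only
    data Rexp = Zero | One | Chr Char
              | Seq Rexp Rexp | Alt Rexp Rexp | Star Rexp

    data Val  = Empty | Chv Char
              | SeqV Val Val | Lf Val | Rg Val | Stars [Val]

    nullable :: Rexp -> Bool
    nullable Zero        = False
    nullable One         = True
    nullable (Chr _)     = False
    nullable (Seq r1 r2) = nullable r1 && nullable r2
    nullable (Alt r1 r2) = nullable r1 || nullable r2
    nullable (Star _)    = True

    der :: Char -> Rexp -> Rexp
    der _ Zero        = Zero
    der _ One         = Zero
    der c (Chr d)     = if c == d then One else Zero
    der c (Alt r1 r2) = Alt (der c r1) (der c r2)
    der c (Seq r1 r2)
      | nullable r1   = Alt (Seq (der c r1) r2) (der c r2)
      | otherwise     = Seq (der c r1) r2
    der c (Star r)    = Seq (der c r) (Star r)

    -- mkeps: the value for the empty string (nullable regexes only)
    mkeps :: Rexp -> Val
    mkeps One         = Empty
    mkeps (Seq r1 r2) = SeqV (mkeps r1) (mkeps r2)
    mkeps (Alt r1 r2) = if nullable r1 then Lf (mkeps r1) else Rg (mkeps r2)
    mkeps (Star _)    = Stars []

    -- inj turns a value for (der c r) into a value for r; the three Seq
    -- clauses mirror the sub-cases of the sequence case in the proof: the
    -- first two re-attach c inside the first component, the last one uses
    -- mkeps for the first component, as in case (b) above
    inj :: Rexp -> Char -> Val -> Val
    inj (Chr d)     _ Empty                = Chv d
    inj (Alt r1 _)  c (Lf v1)              = Lf (inj r1 c v1)
    inj (Alt _ r2)  c (Rg v2)              = Rg (inj r2 c v2)
    inj (Seq r1 _)  c (SeqV v1 v2)         = SeqV (inj r1 c v1) v2
    inj (Seq r1 _)  c (Lf (SeqV v1 v2))    = SeqV (inj r1 c v1) v2
    inj (Seq r1 r2) c (Rg v2)              = SeqV (mkeps r1) (inj r2 c v2)
    inj (Star r)    c (SeqV v1 (Stars vs)) = Stars (inj r c v1 : vs)

    -- the lexer: take the derivative, lex the rest, inject c back
    lexer :: Rexp -> String -> Maybe Val
    lexer r []     = if nullable r then Just (mkeps r) else Nothing
    lexer r (c:cs) = fmap (inj r c) (lexer (der c r) cs)

  For instance, lexer r (c:cs) first builds der c r, lexes cs against it, and
  then inj re-attaches c to the resulting value, which is exactly the step the
  sequence and star cases above reason about.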
   731 
   731 
   745   \begin{proof}
   745   \begin{proof}
   746   By induction on @{term s} using Lem.~\ref{lemmkeps} and \ref{Posix2}.\qed  
   746   By induction on @{term s} using Lem.~\ref{lemmkeps} and \ref{Posix2}.\qed  
   747   \end{proof}
   747   \end{proof}
   748 
   748 
   749   \noindent This concludes our correctness proof. Note that we have not
   749   \noindent This concludes our correctness proof. Note that we have not
   750   changed the algorithm by Sulzmann and Lu,\footnote{Any deviation we introduced is harmless.} 
   750   changed the algorithm of Sulzmann and Lu,\footnote{Any deviation we
   751   but introduced our own
   751   introduced is harmless.} but introduced our own specification for what a
   752   specification for what a correct result---a POSIX value---should be.
   752   correct result---a POSIX value---should be. A strong point in favour of
   753   A strong point in favour of Sulzmann and Lu's algorithm is that it
   753   Sulzmann and Lu's algorithm is that it can be extended in various ways.
   754   can be extended in various ways.
       
   755 
   754 
   756 *}
   755 *}
   757 
   756 
   758 section {* Extensions and Optimisations *}
   757 section {* Extensions and Optimisations *}
   759 
   758 
   760 text {*
   759 text {*
   761 
   760 
   762   If we are interested in tokenising strings, then we need to not just
   761   If we are interested in tokenising a string, then we need to not just
   763   split up the string into tokens, but also ``classify'' the tokens (for
   762   split up the string into tokens, but also ``classify'' the tokens (for
   764   example whether a token is a keyword or an identifier). This can be
   763   example whether a token is a keyword or an identifier). This can be done with
   765   done with only minor modifications to the algorithm by introducing \emph{record regular
   764   only minor modifications to the algorithm by introducing \emph{record
   766   expressions} and \emph{record values} (for example \cite{Sulzmann2014b}):
   765   regular expressions} and \emph{record values} (for example
       
   766   \cite{Sulzmann2014b}):
   767 
   767 
   768   \begin{center}  
   768   \begin{center}  
   769   @{text "r :="}
   769   @{text "r :="}
   770   @{text "..."} $\mid$
   770   @{text "..."} $\mid$
   771   @{text "(l : r)"} \qquad\qquad
   771   @{text "(l : r)"} \qquad\qquad
   774   @{text "(l : v)"}
   774   @{text "(l : v)"}
   775   \end{center}
   775   \end{center}
   776   
   776   
   777   \noindent where @{text l} is a label, say a string, @{text r} a regular
   777   \noindent where @{text l} is a label, say a string, @{text r} a regular
   778   expression and @{text v} a value. All functions can be smoothly extended
   778   expression and @{text v} a value. All functions can be smoothly extended
   779   to these regular expressions and values. For example \mbox{@{text "(l : r)"}} is
   779   to these regular expressions and values. For example \mbox{@{text "(l :
   780   nullable iff @{term r} is, and so on. The purpose of the record regular
   780   r)"}} is nullable iff @{term r} is, and so on. The purpose of the record
   781   expression is to mark certain parts of a regular expression and then
   781   regular expression is to mark certain parts of a regular expression and
   782   record in the calculated value which parts of the string were matched by
   782   then record in the calculated value which parts of the string were matched
   783   these parts. The label can then serve as a classification of the tokens. For this recall the
   783   by these parts. The label can then serve as a classification for the tokens.
   784   regular expression @{text "(r\<^bsub>key\<^esub> + r\<^bsub>id\<^esub>)\<^sup>\<star>"} for keywords and
   784   For this recall the regular expression @{text "(r\<^bsub>key\<^esub> + r\<^bsub>id\<^esub>)\<^sup>\<star>"} for
   785   identifiers from the Introduction. With record regular expressions we can
   785   keywords and identifiers from the Introduction. With record regular
   786   form \mbox{@{text "((key : r\<^bsub>key\<^esub>) + (id : r\<^bsub>id\<^esub>))\<^sup>\<star>"}} and then traverse the
   786   expressions we can form \mbox{@{text "((key : r\<^bsub>key\<^esub>) + (id : r\<^bsub>id\<^esub>))\<^sup>\<star>"}}
   787   calculated value and only collect the underlying strings in record values.
   787   and then traverse the calculated value and only collect the underlying
   788   With this we obtain finite sequences of pairs of labels and strings, for 
   788   strings in record values. With this we obtain finite sequences of pairs of
   789   example
   789   labels and strings, for example
   790 
   790 
   791   \[@{text "(l\<^sub>1 : s\<^sub>1), ..., (l\<^sub>n : s\<^sub>n)"}\]
   791   \[@{text "(l\<^sub>1 : s\<^sub>1), ..., (l\<^sub>n : s\<^sub>n)"}\]
   792   
   792   
   793   \noindent from which tokens with classifications (keyword-token,
   793   \noindent from which tokens with classifications (keyword-token,
   794   identifier-token and so on) can be extracted.
   794   identifier-token and so on) can be extracted.
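
  A possible realisation of this extension, in the style of the Haskell sketch
  given earlier (again all names are ours): the datatypes for regular
  expressions and values each gain one constructor for records, and a
  traversal collects the labelled substrings. The derivative, mkeps and
  injection functions would each need one additional clause for records,
  which we omit here.

    type Label = String

    data Rexp = Zero | One | Chr Char
              | Seq Rexp Rexp | Alt Rexp Rexp | Star Rexp
              | Rec Label Rexp                  -- the record regex (l : r)

    data Val  = Empty | Chv Char
              | SeqV Val Val | Lf Val | Rg Val | Stars [Val]
              | RecV Label Val                  -- the record value (l : v)

    -- flat: the underlying string of a value, ignoring labels
    flat :: Val -> String
    flat Empty        = ""
    flat (Chv c)      = [c]
    flat (SeqV v1 v2) = flat v1 ++ flat v2
    flat (Lf v)       = flat v
    flat (Rg v)       = flat v
    flat (Stars vs)   = concatMap flat vs
    flat (RecV _ v)   = flat v

    -- env: collect the pairs (l1 : s1), ..., (ln : sn) from a value
    env :: Val -> [(Label, String)]
    env Empty        = []
    env (Chv _)      = []
    env (SeqV v1 v2) = env v1 ++ env v2
    env (Lf v)       = env v
    env (Rg v)       = env v
    env (Stars vs)   = concatMap env vs
    env (RecV l v)   = (l, flat v) : env v

  Applying env to the value computed for the keyword/identifier expression
  above then yields exactly such label-string pairs.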
   795 
   795 
   796   Derivatives as calculated by \Brz's method are usually more complex
   796   Derivatives as calculated by \Brz's method are usually more complex
   797   regular expressions than the initial one; the result is that the matching
   797   regular expressions than the initial one; the result is that the
   798   and lexing algorithms are often abysmally slow. However, various
   798   derivative-based matching and lexing algorithms are often abysmally slow.
   799   optimisations are possible, such as the simplifications of @{term "ALT
   799   However, various optimisations are possible, such as the simplifications
   800   ZERO r"}, @{term "ALT r ZERO"}, @{term "SEQ ONE r"} and @{term "SEQ r
   800   of @{term "ALT ZERO r"}, @{term "ALT r ZERO"}, @{term "SEQ ONE r"} and
   801   ONE"} to @{term r}. These simplifications can speed up the algorithms considerably,
   801   @{term "SEQ r ONE"} to @{term r}. These simplifications can speed up the
   802   as noted in \cite{Sulzmann2014}.
   802   algorithms considerably, as noted in \cite{Sulzmann2014}. One of the
   803   One of the advantages of having a simple specification
   803   advantages of having a simple specification and correctness proof is that
   804   and correctness proof is that the latter can be refined to prove the 
   804   the latter can be refined to prove the correctness of such simplification
   805   correctness of such simplification steps.
   805   steps.
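
  The following is a minimal Haskell sketch of these four simplification
  rules, applied bottom-up to a regular expression (again the names are ours);
  the rectification functions that repair the computed values, and the point
  at which simplification is interleaved with the derivative steps, are the
  subject of the definitions below.

    data Rexp = Zero | One | Chr Char
              | Seq Rexp Rexp | Alt Rexp Rexp | Star Rexp

    simp :: Rexp -> Rexp
    simp (Alt r1 r2) = case (simp r1, simp r2) of
                         (Zero, r2') -> r2'          -- 0 + r  ~>  r
                         (r1', Zero) -> r1'          -- r + 0  ~>  r
                         (r1', r2')  -> Alt r1' r2'
    simp (Seq r1 r2) = case (simp r1, simp r2) of
                         (One, r2')  -> r2'          -- 1 . r  ~>  r
                         (r1', One)  -> r1'          -- r . 1  ~>  r
                         (r1', r2')  -> Seq r1' r2'
    simp r           = r   -- in particular, nothing is simplified under Star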
   806 
   806 
   807   While the simplification of regular expressions according to 
   807   While the simplification of regular expressions according to 
   808   rules like
   808   rules like
   809 
   809 
   810   \begin{center}
   810   \begin{center}
   858   @{term "simp (SEQ r\<^sub>1 r\<^sub>2)"} & $\dn$ & @{term "simp_SEQ (simp r\<^sub>1) (simp r\<^sub>2)"}\\
   858   @{term "simp (SEQ r\<^sub>1 r\<^sub>2)"} & $\dn$ & @{term "simp_SEQ (simp r\<^sub>1) (simp r\<^sub>2)"}\\
   859   @{term "simp r"} & $\dn$ & @{term "(r, id)"}\\
   859   @{term "simp r"} & $\dn$ & @{term "(r, id)"}\\
   860   \end{tabular}
   860   \end{tabular}
   861   \end{center} 
   861   \end{center} 
   862 
   862 
   863   \noindent where @{term "id"} stands for the identity function. 
   863   \noindent where @{term "id"} stands for the identity function. This
   864   This function returns a simplified regular expression and a rectification 
   864   function returns a simplified regular expression and a corresponding
   865   function. Note that
   865   rectification function. Note that we do not simplify under stars: this
   866   we do not simplify under stars: this seems to slow down the algorithm,
   866   seems to slow down the algorithm, rather than speed it up. The optimised
   867   rather than speed it up. The optimised lexer is then given by the clauses:
   867   lexer is then given by the clauses:
   868   
   868   
   869   \begin{center}
   869   \begin{center}
   870   \begin{tabular}{lcl}
   870   \begin{tabular}{lcl}
   871   @{thm (lhs) slexer.simps(1)} & $\dn$ & @{thm (rhs) slexer.simps(1)}\\
   871   @{thm (lhs) slexer.simps(1)} & $\dn$ & @{thm (rhs) slexer.simps(1)}\\
   872   @{thm (lhs) slexer.simps(2)} & $\dn$ & 
   872   @{thm (lhs) slexer.simps(2)} & $\dn$ &