705 \noindent From the latter we can infer by Prop.~\ref{derprop}(2): |
705 \noindent From the latter we can infer by Prop.~\ref{derprop}(2): |
706 |
706 |
707 \[@{term "\<not> (\<exists>s\<^sub>3 s\<^sub>4. s\<^sub>3 \<noteq> [] \<and> s\<^sub>3 @ s\<^sub>4 = s\<^sub>2 \<and> (c # s\<^sub>1) @ s\<^sub>3 \<in> L r\<^sub>1 \<and> s\<^sub>4 \<in> L r\<^sub>2)"}\] |
707 \[@{term "\<not> (\<exists>s\<^sub>3 s\<^sub>4. s\<^sub>3 \<noteq> [] \<and> s\<^sub>3 @ s\<^sub>4 = s\<^sub>2 \<and> (c # s\<^sub>1) @ s\<^sub>3 \<in> L r\<^sub>1 \<and> s\<^sub>4 \<in> L r\<^sub>2)"}\] |
708 |
708 |
709 \noindent We can use the induction hypothesis for @{text "r\<^sub>1"} to obtain |
709 \noindent We can use the induction hypothesis for @{text "r\<^sub>1"} to obtain |
710 @{term "(c # s\<^sub>1) \<in> r\<^sub>1 \<rightarrow> injval r\<^sub>1 c v\<^sub>1"}. This all put together allows us to infer |
710 @{term "(c # s\<^sub>1) \<in> r\<^sub>1 \<rightarrow> injval r\<^sub>1 c v\<^sub>1"}. Putting this all together allows us to infer |
711 @{term "((c # s\<^sub>1) @ s\<^sub>2) \<in> SEQ r\<^sub>1 r\<^sub>2 \<rightarrow> Seq (injval r\<^sub>1 c v\<^sub>1) v\<^sub>2"}. The case @{text "(c)"} |
711 @{term "((c # s\<^sub>1) @ s\<^sub>2) \<in> SEQ r\<^sub>1 r\<^sub>2 \<rightarrow> Seq (injval r\<^sub>1 c v\<^sub>1) v\<^sub>2"}. The case @{text "(c)"} |
712 is similarly. |
712 is similarly. |
713 |
713 |
714 For @{text "(b)"} we know @{term "s \<in> der c r\<^sub>2 \<rightarrow> v\<^sub>1"} and |
714 For @{text "(b)"} we know @{term "s \<in> der c r\<^sub>2 \<rightarrow> v\<^sub>1"} and |
715 @{term "s\<^sub>1 @ s\<^sub>2 \<notin> L (SEQ (der c r\<^sub>1) r\<^sub>2)"}. From the former |
715 @{term "s\<^sub>1 @ s\<^sub>2 \<notin> L (SEQ (der c r\<^sub>1) r\<^sub>2)"}. From the former |
721 \noindent By Lem.~\ref{lemmkeps} we know @{term "[] \<in> r\<^sub>1 \<rightarrow> (mkeps r\<^sub>1)"} |
721 \noindent By Lem.~\ref{lemmkeps} we know @{term "[] \<in> r\<^sub>1 \<rightarrow> (mkeps r\<^sub>1)"} |
722 holds. Putting this all together, we can conclude with @{term "(c # |
722 holds. Putting this all together, we can conclude with @{term "(c # |
723 s) \<in> SEQ r\<^sub>1 r\<^sub>2 \<rightarrow> Seq (mkeps r\<^sub>1) (injval r\<^sub>2 c v\<^sub>1)"}, as required. |
723 s) \<in> SEQ r\<^sub>1 r\<^sub>2 \<rightarrow> Seq (mkeps r\<^sub>1) (injval r\<^sub>2 c v\<^sub>1)"}, as required. |
724 |
724 |
725 Finally suppose @{term "r = STAR r\<^sub>1"}. This case is very similar to the |
725 Finally suppose @{term "r = STAR r\<^sub>1"}. This case is very similar to the |
726 sequence case, except that we need to ensure that @{term "flat (injval r\<^sub>1 |
726 sequence case, except that we need to also ensure that @{term "flat (injval r\<^sub>1 |
727 c v\<^sub>1) \<noteq> []"}. This follows from @{term "(c # s\<^sub>1) |
727 c v\<^sub>1) \<noteq> []"}. This follows from @{term "(c # s\<^sub>1) |
728 \<in> r\<^sub>1 \<rightarrow> injval r\<^sub>1 c v\<^sub>1"} (which in turn follows from @{term "s\<^sub>1 \<in> der c |
728 \<in> r\<^sub>1 \<rightarrow> injval r\<^sub>1 c v\<^sub>1"} (which in turn follows from @{term "s\<^sub>1 \<in> der c |
729 r\<^sub>1 \<rightarrow> v\<^sub>1"} and the induction hypothesis).\qed |
729 r\<^sub>1 \<rightarrow> v\<^sub>1"} and the induction hypothesis).\qed |
730 \end{proof} |
730 \end{proof} |
731 |
731 |
745 \begin{proof} |
745 \begin{proof} |
746 By induction on @{term s} using Lem.~\ref{lemmkeps} and \ref{Posix2}.\qed |
746 By induction on @{term s} using Lem.~\ref{lemmkeps} and \ref{Posix2}.\qed |
747 \end{proof} |
747 \end{proof} |
748 |
748 |
749 \noindent This concludes our correctness proof. Note that we have not |
749 \noindent This concludes our correctness proof. Note that we have not |
750 changed the algorithm by Sulzmann and Lu,\footnote{Any deviation we introduced is harmless.} |
750 changed the algorithm of Sulzmann and Lu,\footnote{Any deviation we |
751 but introduced our own |
751 introduced is harmless.} but introduced our own specification for what a |
752 specification for what a correct result---a POSIX value---should be. |
752 correct result---a POSIX value---should be. A strong point in favour of |
753 A strong point in favour of Sulzmann and Lu's algorithm is that it |
753 Sulzmann and Lu's algorithm is that it can be extended in various ways. |
754 can be extended in various ways. |
|
755 |
754 |
756 *} |
755 *} |
757 |
756 |
758 section {* Extensions and Optimisations*} |
757 section {* Extensions and Optimisations*} |
759 |
758 |
760 text {* |
759 text {* |
761 |
760 |
762 If we are interested in tokenising strings, then we need to not just |
761 If we are interested in tokenising a string, then we need to not just |
763 split up the string into tokens, but also ``classify'' the tokens (for |
762 split up the string into tokens, but also ``classify'' the tokens (for |
764 example whether it is a keyword or an identifier). This can be |
763 example whether it is a keyword or an identifier). This can be done with |
765 done with only minor modifications to the algorithm by introducing \emph{record regular |
764 only minor modifications to the algorithm by introducing \emph{record |
766 expressions} and \emph{record values} (for example \cite{Sulzmann2014b}): |
765 regular expressions} and \emph{record values} (for example |
|
766 \cite{Sulzmann2014b}): |
767 |
767 |
768 \begin{center} |
768 \begin{center} |
769 @{text "r :="} |
769 @{text "r :="} |
770 @{text "..."} $\mid$ |
770 @{text "..."} $\mid$ |
771 @{text "(l : r)"} \qquad\qquad |
771 @{text "(l : r)"} \qquad\qquad |
774 @{text "(l : v)"} |
774 @{text "(l : v)"} |
775 \end{center} |
775 \end{center} |
776 |
776 |
777 \noindent where @{text l} is a label, say a string, @{text r} a regular |
777 \noindent where @{text l} is a label, say a string, @{text r} a regular |
778 expression and @{text v} a value. All functions can be smoothly extended |
778 expression and @{text v} a value. All functions can be smoothly extended |
779 to these regular expressions and values. For example \mbox{@{text "(l : r)"}} is |
779 to these regular expressions and values. For example \mbox{@{text "(l : |
780 nullable iff @{term r} is, and so on. The purpose of the record regular |
780 r)"}} is nullable iff @{term r} is, and so on. The purpose of the record |
781 expression is to mark certain parts of a regular expression and then |
781 regular expression is to mark certain parts of a regular expression and |
782 record in the calculated value which parts of the string were matched by |
782 then record in the calculated value which parts of the string were matched |
783 this part. The label can then serve as classification of the tokens. For this recall the |
783 by this part. The label can then serve as classification for the tokens. |
784 regular expression @{text "(r\<^bsub>key\<^esub> + r\<^bsub>id\<^esub>)\<^sup>\<star>"} for keywords and |
784 For this recall the regular expression @{text "(r\<^bsub>key\<^esub> + r\<^bsub>id\<^esub>)\<^sup>\<star>"} for |
785 identifiers from the Introduction. With record regular expression we can |
785 keywords and identifiers from the Introduction. With record regular |
786 form \mbox{@{text "((key : r\<^bsub>key\<^esub>) + (id : r\<^bsub>id\<^esub>))\<^sup>\<star>"}} and then traverse the |
786 expression we can form \mbox{@{text "((key : r\<^bsub>key\<^esub>) + (id : r\<^bsub>id\<^esub>))\<^sup>\<star>"}} |
787 calculated value and only collect the underlying strings in record values. |
787 and then traverse the calculated value and only collect the underlying |
788 With this we obtain finite sequences of pairs of labels and strings, for |
788 strings in record values. With this we obtain finite sequences of pairs of |
789 example |
789 labels and strings, for example |
790 |
790 |
791 \[@{text "(l\<^sub>1 : s\<^sub>1), ..., (l\<^sub>n : s\<^sub>n)"}\] |
791 \[@{text "(l\<^sub>1 : s\<^sub>1), ..., (l\<^sub>n : s\<^sub>n)"}\] |
792 |
792 |
793 \noindent from which tokens with classifications (keyword-token, |
793 \noindent from which tokens with classifications (keyword-token, |
794 identifier-token and so on) can be extracted. |
794 identifier-token and so on) can be extracted. |
795 |
795 |
796 Derivatives as calculated by \Brz's method are usually more complex |
796 Derivatives as calculated by \Brz's method are usually more complex |
797 regular expressions than the initial one; the result is that the matching |
797 regular expressions than the initial one; the result is that the |
798 and lexing algorithms are often abysmally slow. However, various |
798 deivative-based matching and lexing algorithms are often abysmally slow. |
799 optimisations are possible, such as the simplifications of @{term "ALT |
799 However, various optimisations are possible, such as the simplifications |
800 ZERO r"}, @{term "ALT r ZERO"}, @{term "SEQ ONE r"} and @{term "SEQ r |
800 of @{term "ALT ZERO r"}, @{term "ALT r ZERO"}, @{term "SEQ ONE r"} and |
801 ONE"} to @{term r}. These simplifications can speed up the algorithms considerably, |
801 @{term "SEQ r ONE"} to @{term r}. These simplifications can speed up the |
802 as noted in \cite{Sulzmann2014}. |
802 algorithms considerably, as noted in \cite{Sulzmann2014}. One of the |
803 One of the advantages of having a simple specification |
803 advantages of having a simple specification and correctness proof is that |
804 and correctness proof is that the latter can be refined to prove the |
804 the latter can be refined to prove the correctness of such simplification |
805 correctness of such simplification steps. |
805 steps. |
806 |
806 |
807 While the simplification of regular expressions according to |
807 While the simplification of regular expressions according to |
808 rules like |
808 rules like |
809 |
809 |
810 \begin{center} |
810 \begin{center} |
858 @{term "simp (SEQ r\<^sub>1 r\<^sub>2)"} & $\dn$ & @{term "simp_SEQ (simp r\<^sub>1) (simp r\<^sub>2)"}\\ |
858 @{term "simp (SEQ r\<^sub>1 r\<^sub>2)"} & $\dn$ & @{term "simp_SEQ (simp r\<^sub>1) (simp r\<^sub>2)"}\\ |
859 @{term "simp r"} & $\dn$ & @{term "(r, id)"}\\ |
859 @{term "simp r"} & $\dn$ & @{term "(r, id)"}\\ |
860 \end{tabular} |
860 \end{tabular} |
861 \end{center} |
861 \end{center} |
862 |
862 |
863 \noindent where @{term "id"} stands for the identity function. |
863 \noindent where @{term "id"} stands for the identity function. This |
864 This function returns a simplified regular expression and a rectification |
864 function returns a simplified regular expression and a corresponding |
865 function. Note that |
865 rectification function. Note that we do not simplify under stars: this |
866 we do not simplify under stars: this seems to slow down the algorithm, |
866 seems to slow down the algorithm, rather than speed up. The optimised |
867 rather than speed up. The optimised lexer is then given by the clauses: |
867 lexer is then given by the clauses: |
868 |
868 |
869 \begin{center} |
869 \begin{center} |
870 \begin{tabular}{lcl} |
870 \begin{tabular}{lcl} |
871 @{thm (lhs) slexer.simps(1)} & $\dn$ & @{thm (rhs) slexer.simps(1)}\\ |
871 @{thm (lhs) slexer.simps(1)} & $\dn$ & @{thm (rhs) slexer.simps(1)}\\ |
872 @{thm (lhs) slexer.simps(2)} & $\dn$ & |
872 @{thm (lhs) slexer.simps(2)} & $\dn$ & |