thys/Paper/Paper.thy
changeset 138 a87b8a09ffe8
parent 137 4178b7e71809
child 139 f28cf86a1e7f
equal deleted inserted replaced
137:4178b7e71809 138:a87b8a09ffe8
    50   F_SEQ1 ("F\<^bsub>Seq1\<^esub> _ _") and
    50   F_SEQ1 ("F\<^bsub>Seq1\<^esub> _ _") and
    51   F_SEQ2 ("F\<^bsub>Seq2\<^esub> _ _") and
    51   F_SEQ2 ("F\<^bsub>Seq2\<^esub> _ _") and
    52   F_SEQ ("F\<^bsub>Seq\<^esub> _ _") and
    52   F_SEQ ("F\<^bsub>Seq\<^esub> _ _") and
    53   simp_SEQ ("simp\<^bsub>Seq\<^esub> _ _" [1000, 1000] 1) and
    53   simp_SEQ ("simp\<^bsub>Seq\<^esub> _ _" [1000, 1000] 1) and
    54   simp_ALT ("simp\<^bsub>Alt\<^esub> _ _" [1000, 1000] 1) and
    54   simp_ALT ("simp\<^bsub>Alt\<^esub> _ _" [1000, 1000] 1) and
    55   matcher3 ("lexer\<^sup>+ _ _")
    55   matcher3 ("lexer\<^sup>+ _ _" [78,78] 77)
    56 
    56 
    57 definition 
    57 definition 
    58   "match r s \<equiv> nullable (ders s r)"
    58   "match r s \<equiv> nullable (ders s r)"
    59 
    59 
    60 (*>*)
    60 (*>*)
   849   @{term "simp_SEQ (r\<^sub>1, f\<^sub>1) (r\<^sub>2, f\<^sub>2)"} & $\dn$ & @{term "(SEQ r\<^sub>1 r\<^sub>2, F_SEQ f\<^sub>1 f\<^sub>2)"}\\
   849   @{term "simp_SEQ (r\<^sub>1, f\<^sub>1) (r\<^sub>2, f\<^sub>2)"} & $\dn$ & @{term "(SEQ r\<^sub>1 r\<^sub>2, F_SEQ f\<^sub>1 f\<^sub>2)"}\\
   850   \end{tabular}
   850   \end{tabular}
   851   \end{center}
   851   \end{center}
   852 
   852 
   853   \noindent
   853   \noindent
   854   The main simplification function is then 
   854   The functions @{text "simp\<^bsub>ALT\<^esub>"} and @{text "simp\<^bsub>SEQ\<^esub>"} encode the simplification rules
       
   855   and compose the rectification functions. The main simplification function is then 
   855 
   856 
   856   \begin{center}
   857   \begin{center}
   857   \begin{tabular}{lcl}
   858   \begin{tabular}{lcl}
   858   @{term "simp (ALT r\<^sub>1 r\<^sub>2)"} & $\dn$ & @{term "simp_ALT (simp r\<^sub>1) (simp r\<^sub>2)"}\\
   859   @{term "simp (ALT r\<^sub>1 r\<^sub>2)"} & $\dn$ & @{term "simp_ALT (simp r\<^sub>1) (simp r\<^sub>2)"}\\
   859   @{term "simp (SEQ r\<^sub>1 r\<^sub>2)"} & $\dn$ & @{term "simp_SEQ (simp r\<^sub>1) (simp r\<^sub>2)"}\\
   860   @{term "simp (SEQ r\<^sub>1 r\<^sub>2)"} & $\dn$ & @{term "simp_SEQ (simp r\<^sub>1) (simp r\<^sub>2)"}\\
   860   @{term "simp r"} & $\dn$ & @{term "(r, id)"}\\
   861   @{term "simp r"} & $\dn$ & @{term "(r, id)"}\\
   861   \end{tabular}
   862   \end{tabular}
   862   \end{center} 
   863   \end{center} 
   863 
   864 
   864   \noindent where @{term "id"} stands for the identity function. Note that
   865   \noindent where @{term "id"} stands for the identity function. 
       
   866   This function returns a simplified regular expression and a rectification 
       
   867   function. Note that
   865   we do not simplify under stars: this seems to slow down the algorithm,
   868   we do not simplify under stars: this seems to slow down the algorithm,
   866   rather than speed up. The optimised lexer is then given by the clauses:
   869   rather than speed up. The optimised lexer is then given by the clauses:
   867   
   870   
   868   \begin{center}
   871   \begin{center}
   869   \begin{tabular}{lcl}
   872   \begin{tabular}{lcl}
   877   \end{center}
   880   \end{center}
   878 
   881 
   879   \noindent
   882   \noindent
   880   In the second clause we first calculate the derivative @{text "r \\ c"}
   883   In the second clause we first calculate the derivative @{text "r \\ c"}
   881   and then simplify the result. This gives us a simplified derivative
   884   and then simplify the result. This gives us a simplified derivative
   882   @{text "r\<^sub>s"} and a rectification function @{text "f\<^sub>r"}. The matcher
   885   @{text "r\<^sub>s"} and a rectification function @{text "f\<^sub>r"}. The lexer
   883   is recursively called with the simplified derivative, but before
   886   is then recursively called with the simplified derivative, but before
   884   we inject the character @{term c} into value, we need to rectify
   887   we inject the character @{term c} into value, we need to rectify
   885   it (@{term "f\<^sub>r v"}). We can prove that
   888   it (that is construct @{term "f\<^sub>r v"}). We can prove that
   886 
   889 
   887   \begin{lemma}
   890   \begin{lemma}
   888   @{term "matcher3 r s = matcher r s"}
   891   @{term "matcher3 r s = matcher r s"}
   889   \end{lemma}
   892   \end{lemma}
   890 
   893 
   891   \noindent
   894   \noindent
   892   holds but refer the reader to our mechanisation for details.
   895   holds but for lack of space refer the reader to our mechanisation for details.
   893 
   896 
   894 *}
   897 *}
   895 
   898 
   896 section {* The Correctness Argument by Sulzmmann and Lu *}
   899 section {* The Correctness Argument by Sulzmmann and Lu *}
   897 
   900 
   912 $\infer{v_{1} \posix_{r_{1}} v'_{1}} 
   915 $\infer{v_{1} \posix_{r_{1}} v'_{1}} 
   913        {Seq\,v_{1}\,v_{2} \posix_{r_{1}r_{2}} Seq\,v'_{1}\,v'_{2}}(C2)$
   916        {Seq\,v_{1}\,v_{2} \posix_{r_{1}r_{2}} Seq\,v'_{1}\,v'_{2}}(C2)$
   914 &
   917 &
   915 $\infer{v_{2} \posix_{r_{2}} v'_{2}} 
   918 $\infer{v_{2} \posix_{r_{2}} v'_{2}} 
   916        {Seq\,v_{1}\,v_{2} \posix_{r_{1}r_{2}} Seq\,v_{1}\,v'_{2}}(C1)$
   919        {Seq\,v_{1}\,v_{2} \posix_{r_{1}r_{2}} Seq\,v_{1}\,v'_{2}}(C1)$
   917 \medskip\\
   920 \smallskip\\
   918 		
   921 		
   919 		
   922 		
   920 $\infer{ len |v_{2}| > len |v_{1}|} 
   923 $\infer{ len |v_{2}| > len |v_{1}|} 
   921        {Right \; v_{2} \posix_{r_{1}+r_{2}} Left \; v_{1}}(A1)$ 
   924        {Right \; v_{2} \posix_{r_{1}+r_{2}} Left \; v_{1}}(A1)$ 
   922 &
   925 &
   923 $\infer{ len |v_{1}| \geq len |v_{2}|} 
   926 $\infer{ len |v_{1}| \geq len |v_{2}|} 
   924        {Left \; v_{1} \posix_{r_{1}+r_{2}} Right \; v_{2}} (A2)$ 
   927        {Left \; v_{1} \posix_{r_{1}+r_{2}} Right \; v_{2}} (A2)$ 
   925 \medskip\\	
   928 \smallskip\\	
   926 
   929 
   927 $\infer{ v_{2} \posix_{r_{2}} v'_{2}} 
   930 $\infer{ v_{2} \posix_{r_{2}} v'_{2}} 
   928        {Right \; v_{2} \posix_{r_{1}+r_{2}} Right \; v'_{2}}(A3)$ & 
   931        {Right \; v_{2} \posix_{r_{1}+r_{2}} Right \; v'_{2}}(A3)$ & 
   929 
   932 
   930 $\infer{ v_{1} \posix_{r_{1}} v'_{1}} 
   933 $\infer{ v_{1} \posix_{r_{1}} v'_{1}} 
   931        {Left \; v_{1} \posix_{r_{1}+r_{2}} Left \; v'_{1}}(A4)$ 
   934        {Left \; v_{1} \posix_{r_{1}+r_{2}} Left \; v'_{1}}(A4)$ 
   932 \medskip\\
   935 \smallskip\\
   933 
   936 
   934  $\infer{|v :: vs| = []} {Stars\,[] \posix_{r^{\star}} Stars\,(v :: vs)}(K1)$ & 
   937  $\infer{|v :: vs| = []} {Stars\,[] \posix_{r^{\star}} Stars\,(v :: vs)}(K1)$ & 
   935  $\infer{|v :: vs| \neq []} {Stars\,(v :: vs) \posix_{r^{\star}} Stars\,[]}(K2)$
   938  $\infer{|v :: vs| \neq []} {Stars\,(v :: vs) \posix_{r^{\star}} Stars\,[]}(K2)$
   936 \medskip\\
   939 \smallskip\\
   937 	
   940 	
   938 $\infer{ v_{1} \posix_{r} v_{2}} 
   941 $\infer{ v_{1} \posix_{r} v_{2}} 
   939        {Stars\,(v_{1} :: vs_{1}) \posix_{r^{\star}} Stars\,(v_{2} :: vs_{2})}(K3)$ & 
   942        {Stars\,(v_{1} :: vs_{1}) \posix_{r^{\star}} Stars\,(v_{2} :: vs_{2})}(K3)$ & 
   940 $\infer{ Stars\,vs_{1} \posix_{r^{\star}} Stars\,vs_{2}} 
   943 $\infer{ Stars\,vs_{1} \posix_{r^{\star}} Stars\,vs_{2}} 
   941        {Stars\,(v :: vs_{1}) \posix_{r^{\star}} Stars\,(v :: vs_{2})}(K4)$	
   944        {Stars\,(v :: vs_{1}) \posix_{r^{\star}} Stars\,(v :: vs_{2})}(K4)$	
  1114 
  1117 
  1115 text {*
  1118 text {*
  1116 
  1119 
  1117   We have implemented the POSIX value calculation algorithm introduced in
  1120   We have implemented the POSIX value calculation algorithm introduced in
  1118   \cite{Sulzmann2014}. Our implementation is nearly identical to the
  1121   \cite{Sulzmann2014}. Our implementation is nearly identical to the
  1119   original and all modifications are harmless (like our char-clause for
  1122   original and all modifications we introduced are harmless (like our char-clause for
  1120   @{text inj}). We have proved this algorithm to be correct, but correct
  1123   @{text inj}). We have proved this algorithm to be correct, but correct
  1121   according to our own specification of what POSIX values are. Our
  1124   according to our own specification of what POSIX values are. Our
  1122   specification (inspired from work in \cite{Vansummeren2006}) appears to be
  1125   specification (inspired from work in \cite{Vansummeren2006}) appears to be
  1123   much simpler than in \cite{Sulzmann2014} and our proofs are nearly always
  1126   much simpler than in \cite{Sulzmann2014} and our proofs are nearly always
  1124   straightforward. We have attempted to formalise the original proof
  1127   straightforward. We have attempted to formalise the original proof
  1125   by Sulzmann and Lu \cite{Sulzmann2014}, but we believe it contains
  1128   by Sulzmann and Lu \cite{Sulzmann2014}, but we believe it contains
  1126   unfillable gaps. In the online version of \cite{Sulzmann2014}, the authors
  1129   unfillable gaps. In the online version of \cite{Sulzmann2014}, the authors
  1127   already acknowledge some small problems, but our experience suggests
  1130   already acknowledge some small problems, but our experience suggests
  1128   there are more serious problems. 
  1131   that there are more serious problems. 
  1129   
  1132   
  1130   Closely related to our work is an automata-based lexer formalised by
  1133   Closely related to our work is an automata-based lexer formalised by
  1131   Nipkow \cite{Nipkow98}. This lexer also splits up strings into longest
  1134   Nipkow \cite{Nipkow98}. This lexer also splits up strings into longest
  1132   initial substrings, but Nipkow's algorithm is not completely
  1135   initial substrings, but Nipkow's algorithm is not completely
  1133   computational. The algorithm by Sulzmann and Lu, in contrast, can be
  1136   computational. The algorithm by Sulzmann and Lu, in contrast, can be
  1134   implemented easily in functional languages. A bespoke lexer for the
  1137   implemented with easy in any functional language. A bespoke lexer for the
  1135   Imp-Language is formalised in Coq as part of the Software Foundations book
  1138   Imp-language is formalised in Coq as part of the Software Foundations book
  1136   \cite{Pierce2015}. The disadvantage of such bespoke lexers is that they
  1139   \cite{Pierce2015}. The disadvantage of such bespoke lexers is that they
  1137   do not generalise easily to more advanced features.
  1140   do not generalise easily to more advanced features.
  1138   Our formalisation is available from
  1141   Our formalisation is available from
  1139   \url{http://www.inf.kcl.ac.uk/staff/urbanc/lex}.
  1142   \url{http://www.inf.kcl.ac.uk/staff/urbanc/lex}.
  1140 
  1143