polished proposal
authorurbanc
Tue, 06 Sep 2011 02:35:10 +0000
changeset 242 093e45c44d91
parent 241 68d48522ea9a
child 243 86a4182c73e7
polished proposal
Journal/Paper.thy
Journal/document/root.bib
csupp.pdf
csupp.tex
--- a/Journal/Paper.thy	Mon Sep 05 20:59:50 2011 +0000
+++ b/Journal/Paper.thy	Tue Sep 06 02:35:10 2011 +0000
@@ -29,6 +29,9 @@
   "_Collect p P"      <= "{p|xs. P}"
   "_CollectIn p A P"  <= "{p : A. P}"
 
+syntax (latex output)
+  "_UNION_le"   :: "'a \<Rightarrow> 'a => 'b set => 'b set"       ("(3\<Union>(00\<^bsub>_ \<le> _\<^esub>)/ _)" [0, 0, 10] 10)
+
 abbreviation "ZERO \<equiv> Zero"
 abbreviation "ONE \<equiv> One"
 abbreviation "ATOM \<equiv> Atom"
@@ -2463,9 +2466,10 @@
   
   \noindent
   {\bf Acknowledgements:}
-  We are grateful for the comments we received from Larry
-  Paulson, Tobias Nipkow made us aware of the properties in Lemma~\ref{subseqreg}
-  and Tjark Weber helped us with their proofs.
+  We are grateful for the comments we received from Larry Paulson.  Tobias
+  Nipkow made us aware of the properties in Lemma~\ref{subseqreg} and Tjark
+  Weber helped us with their proofs.
+
 
 *}
 
--- a/Journal/document/root.bib	Mon Sep 05 20:59:50 2011 +0000
+++ b/Journal/document/root.bib	Tue Sep 06 02:35:10 2011 +0000
@@ -157,7 +157,7 @@
  title = {{D}erivatives of {R}egular {E}xpressions},
  journal = {Journal of the ACM},
  volume = {11},
- issue = {4},
+ number = {4},
  year = {1964},
  pages = {481--494},
  publisher = {ACM}
@@ -252,134 +252,94 @@
   volume    = {6898}
 }
 
-@Article{Okhotin/04a,
-  author =	"A. Okhotin",
-  title =	"Boolean {G}rammars",
-  journal =	"Inf.~Comput.",
+
+
+@Article{Okhotin04,
+  author =	"A.~Okhotin",
+  title =	"{B}oolean {G}rammars",
+  journal =	"Information and Computation",
   pages =	"19--48",
   year = 	"2004",
   number =	"1",
-  volume =	"194",
-  keywords =	"context-free grammar, intersection, complement,
-		 language equation, parsing, conjunctive grammar,
-		 trellis automaton, cellular automaton",
-  abstract =	"A new generalization of context-free grammars is
-		 introduced: Boolean grammars allow the use of all
-		 set-theoretic operations as an integral part of the
-		 formalism of rules. Rigorous semantics for these
-		 grammars is defined by language equations in a way that
-		 allows to generalize some techniques from the theory of
-		 context-free grammars, including Chomsky normal form,
-		 Cocke-Kasami-Younger cubic-time recognition algorithm
-		 and some limited extension of the notion of a parse
-		 tree, which together allow to conjecture practical
-		 applicability of the new concept. (Copyright 2004
-		 ScienceDirect)",
-  editor =	"Albert R. Meyer",
-  publisher =	"Elsevier B.V.",
-  address =	"Orlando-Amsterdam-Tokyo-Singapore",
-  URL =  	"http://dx.doi.org/10.1016/j.ic.2004.03.006",
-  cdate =	"2004-12-21",
-  mdate =	"2005-08-18",
+  volume =	"194"
 }
 
-@Article{journals/iandc/KountouriotisNR09,
-  title =	"{W}ell-founded {S}emantics for Boolean {G}rammars",
-  author =	"V. Kountouriotis and C. Nomikos and P.
-		 Rondogiannis",
-  journal =	"Inf. Comput",
-  year = 	"2009",
-  number =	"9",
-  volume =	"207",
-  bibdate =	"2009-09-08",
-  bibsource =	"DBLP,
-		 http://dblp.uni-trier.de/db/journals/iandc/iandc207.html#KountouriotisNR09",
-  pages =	"945--967",
-  URL =  	"http://dx.doi.org/10.1016/j.ic.2009.05.002",
+@Article{KountouriotisNR09,
+  title =	"{W}ell-founded {S}emantics for {B}oolean {G}rammars",
+  author =	"V.~Kountouriotis and C.~Nomikos and P.~Rondogiannis",
+  journal =	"Information and Computation",
+  year = 	2009,
+  number =	9,
+  volume =	207,
+  pages     =   {945--967}
+}
+
+
+@article{Leroy09,
+  author = {X.~Leroy},
+  title = {{F}ormal {V}erification of a {R}ealistic {C}ompiler},
+  journal = {Communications of the ACM},
+  year = 2009,
+  volume = 52,
+  number = 7,
+  pages = {107--115}
 }
 
-@Article{journals/corr/abs-1010-5023,
-  title =	"Yacc is {D}ead",
-  author =	"M. Might and D. Darais",
-  journal =	"CoRR",
-  year = 	"2010",
-  volume =	"abs/1010.5023",
-  note = 	"informal publication",
-  bibdate =	"2010-11-03",
-  bibsource =	"DBLP,
-		 http://dblp.uni-trier.de/db/journals/corr/corr1010.html#abs-1010-5023",
-  URL =  	"http://arxiv.org/abs/1010.5023",
+
+@Unpublished{Might11,
+  title =	"{Y}acc is {D}ead",
+  author =	"M.~Might and D.~Darais",
+  note  = "To appear in {\it Proc.~of the 16th ACM International Conference on 
+           Functional Programming (ICFP)}",
+  year = 2011
 }
 
-@InProceedings{Ford04a,
-  author =	"B. Ford",
-  title =	"Parsing {E}xpression {G}rammars: {A} {R}ecognition-based
+@InProceedings{Ford04,
+  author =	"B.~Ford",
+  title =	"{P}arsing {E}xpression {G}rammars: {A} {R}ecognition-{B}ased
 		 {S}yntactic {F}oundation",
-  booktitle =	"POPL '04: Proceedings of the 31st ACM SIGPLAN-SIGACT
-		 symposium on Principles of programming languages",
-  year = 	"2004",
-  ISBN = 	"1-58113-729-X",
-  pages =	"111--122",
-  location =	"Venice, Italy",
-  doi =  	"10.1145/964001.964011",
-  URL =  	"http://pdos.csail.mit.edu/~baford/packrat/popl04/peg-popl04.pdf",
-  publisher =	"ACM",
-  address =	"New York, NY, USA",
-  keywords =	"peg parsing cclit",
-  doi-url =	"http://dx.doi.org/10.1145/964001.964011",
+  booktitle =	"Proc.~of the 31st ACM Symposium on Principles of Programming Languages (POPL)",
+  year = 	2004,
+  pages =	"111--122"
 }
 
-@InProceedings{Ford02b,
-  author =	"B. Ford",
-  title =	"{Packrat Parsing: a Practical Linear-Time Algorithm
-		 with Backtracking}",
-  booktitle =	"ICFP '02: Proceedings of the seventh ACM SIGPLAN
-		 international conference on Functional programming",
+@InProceedings{Ford02,
+  author =	"B.~Ford",
+  title =	"{P}ackrat {P}arsing: : {S}imple, {P}owerful, {L}azy, {L}inear {T}ime,
+                 ({F}unctional {P}earl)",
+  booktitle =	"Proc.~of the 7th ACM International Conference on Functional Programming (ICFP)",
   year = 	"2002",
-  school =	"Massachusetts Institute of Technology",
-  URL =  	"http://pdos.csail.mit.edu/~baford/packrat/thesis/
-		 http://pdos.csail.mit.edu/~baford/packrat/thesis/thesis.pdf",
-  keywords =	"packrat parsing cclit",
+  pages  = {36--47}
+
 }
 
-@InProceedings{conf/pepm/WarthDM08,
-  title =	"{Packrat Parsers Can Support Left Recursion}",
-  author =	"A. Warth and J. R. Douglass and T. D.
-		 Millstein",
-  bibdate =	"2008-04-04",
-  bibsource =	"DBLP,
-		 http://dblp.uni-trier.de/db/conf/pepm/pepm2008.html#WarthDM08",
-  booktitle =	"PEPM",
-  booktitle =	"Proceedings of the 2008 {ACM} {SIGPLAN} Symposium on
+@InProceedings{WarthDM08,
+  title =	"{{P}ackrat {P}arsers {C}an {S}upport {L}eft {R}ecursion}",
+  author =	"A.~Warth and J.~R.~Douglass and T.~D.~Millstein",
+  booktitle =	"Proc. of the {ACM} Symposium on
 		 Partial Evaluation and Semantics-based Program
-		 Manipulation, {PEPM} 2008, San Francisco, California,
-		 {USA}, January 7-8, 2008",
-  publisher =	"ACM",
+		 Manipulation (PEPM)",
   year = 	"2008",
-  editor =	"Robert Gl{\"u}ck and Oege de Moor",
-  ISBN = 	"978-1-59593-977-7",
-  pages =	"103--110",
-  URL =  	"http://doi.acm.org/10.1145/1328408.1328424",
+  pages =	"103--110"
 }
 
 @Article{Earley70,
   author =	"J. Earley",
   title =	"{An Efficient Context-Free Parsing Algorithm}",
-  journal =	"Communications of the ACM (CACM)",
+  journal =	"Communications of the ACM",
   volume =	"13",
   number =	"2",
-  month =	feb,
-  year = 	"1970",
-  keywords =	"parallel parsing, syntax analysis, parsing,
-		 context-free grammar, compilers, computational
-		 complexity, CACM,",
+  pages =       {94--102},
+  year = 	"1970"
 }
 
 @Article{AycHor02,
-  author =	"Aycock and Horspool",
-  title =	"{Practical Earley Parsing}",
-  journal =	"COMPJ: The Computer Journal",
+  author =	"J.~Aycock and R.~N.~Horspool",
+  title =	"{{P}ractical {E}arley {P}arsing}",
+  journal =	"The Computer Journal",
   volume =	"45",
+  number =      "6",
+  pages =       "620--630",
   year = 	"2002",
 }
 
Binary file csupp.pdf has changed
--- a/csupp.tex	Mon Sep 05 20:59:50 2011 +0000
+++ b/csupp.tex	Tue Sep 06 02:35:10 2011 +0000
@@ -34,86 +34,89 @@
 
 \begin{multicols}{2}
 \section*{Background}
+
 \noindent
-Parsing is the act of transforming plain text into some
-structure that can be analyzed by computers for further processing.
-One might think that parsing has been studied to death, and after
-\emph{yacc} and \emph{lex} no new results can be obtained in this area.
-However recent developments and novel approaches make it increasingly clear,
-that this is not true anymore.
+Parsers transform plain text into some abstract structure that can be analyzed by
+computers for further processing.  One might think that parsers have been
+studied to death, and after \emph{yacc} and \emph{lex} no new results can be
+obtained in this area.  However recent developments and novel approaches make
+it increasingly clear, that this is not true anymore~\cite{Might11}. And
+there is a practical need for new results: for example the future HTML 5 
+Standard abandons a well-defined grammar specification, in favour of a bespoke
+parser given as pseudo code.
 
-We propose to on parsers from a certification point of view. Increasingly,
-parsers are part of certified compilers, like \mbox{\emph{CompCert}}, which
-are guaranteed to be correct and bug-free. Such certified compilers are
-crucial in areas where software just cannot fail. However, so far the parsers
-of these compilers have been left out of the certification.  This is because
-parsing algorithms are often ad hoc and their semantics is not clearly
-specified. Unfortunately, this means parsers can harbour errors that
-potentially invalidate the whole certification and correctness of the
-compiler. In this project, we like to change that with the help of theorem
-provers.
+This work targets parsers from a certification point of view. Increasingly,
+parsers are part of certified compilers, like
+\mbox{\emph{CompCert}}~\cite{Leroy09}, which are guaranteed to be
+bug-free. Such certified compilers are crucial in areas where software just
+cannot fail. However, so far the parsers of these compilers have been left out
+of the certification.  This is because parsing algorithms are often ad hoc and
+their semantics is not clearly specified. This means, unfortunately, parsers
+can harbour errors that potentially invalidate the whole certification and
+correctness of the compiler. In this project, we aim to change this situation
+with the help of the theorem prover Isabelle/HOL.
 
-Only in the last few years, theorem provers have become good enough for
+Only in the last few years, theorem provers have become powerful enough for
 establishing the correctness of some standard lexing and parsing
 algorithms. For this, the algorithms still need to be formulated in way so
 that it is easy to reason about them. In our earlier work about lexing and
 regular languages, we showed that this precludes well-known algorithms based
-automata. However we showed also that regular languages can be formulated and
+automata~\cite{WuZhangUrban11}. However we showed also that regular languages can be formulated and
 reasoned about entirely in terms regular expressions, which can be easily
 represented in theorem provers. This work uses the device of derivatives of
 regular expressions. We like to extend this device to parsers and grammars.
 The aim is to come up with elegant and practical useful parsing algorithms
-whose correctness can be certified in a theorem prover.
+whose correctness can be certified in Isabelle/HOL.
 
 
 \section*{Proposed Work}
 
-A recent development in parsing is Parsing Expression Grammars (PEG), which
-are an extension of the weel-known Context Free Grammars
-(CFG)~\cite{Ford04a}. The extension introduces new regular operators, such as
-negation and conjunction, on the right-hand sides of grammar rules, as well as
-priority orderings. With these extensions, PEG parsing becomes much
-more powerful. For example disambiguation, formerly expressed by semantic
+A recent development in the field of parsing are Parsing Expression Grammars
+(PEGs)~\cite{Ford04}, which
+are an extension of the well-known Context Free Grammars
+(CFGs). This extension introduces new regular operators, such as
+negation and conjunction, on the right-hand side of grammar rules, as well as
+priority orderings for rules. With these extensions, PEG parsing becomes much
+more powerful and more useful in practise. For example disambiguation, formerly expressed by semantic
 filters, can now be expressed directly using grammar rules. 
 
-However, there is serious disadvantage of PEG for applications: is does not
-support grammrs involving left recursion~\cite{Ford02b}. Although a new PEG
-parsing algorithm has been proposed that can deal with left
-recursion~\cite{conf/pepm/WarthDM08}, there is no correctness proof, not even
-in ``paper-and-pencil'' form. One aim of this research is to solve this sorry
-state-of-affairs by either certifying this algorithm or inventing a new
-one. For this we will first formalize a fixed point semantics of PEG, based on
-which an efficient, certified parsing algorithm can be given given. For this
-we take as starting point the paper~\cite{Ford04a}, which does not treat
-left-recursion, but gives an operational semantics for PEG parsing. For the
-semantics, it seems plausible that we can adapt work on Boolean
-Grammars~\cite{Okhotin/04a}, which are similar to PEGs, and for which the
-paper ~\cite{journals/iandc/KountouriotisNR09} gives a semantics to negation
-operators, but not to Kleene's star operation.
+However, there is a serious limitation of PEGs, which affects potential
+applications: grammars involving left recursion are not
+allowed~\cite{Ford02}. Although one PEG parsing algorithm has already been
+proposed that can deal with left recursion~\cite{WarthDM08}, there
+is no correctness proof for it, not even one with ``pencil and paper''. One aim of this
+research is to solve this sorry state-of-affairs by either certifying this
+algorithm or inventing a new one. For this we will first devise a
+fixed-point semantics of PEGs, against which we can certify a parser. For this
+semantics we take as starting point the paper~\cite{Ford04}, which does not
+treat left-recursion, but gives an operational semantics for PEG
+parsing. There are also good indications that we can adapt work on Boolean
+Grammars~\cite{Okhotin04}, which are similar to PEGs, and for which the
+paper~\cite{KountouriotisNR09} gives a fixed-point semantics 
+to negation operators, but not to the Kleene star.
 
-For the parsing algorithm, we might also be able to draw inspiration from
-parsers based on Cocke-Younger-Kasami (CYK)
-algorithms~\cite{journals/iandc/KountouriotisNR09} and
-Early~\cite{Earley70,AycHor02} parsers. The defect CYK algorithms is that the
-original grammar specification needs to be transformed into a normal
-form. This transformation may lead to grammar explosion and inefficient
-parsing. We will investigate whether this transformation can be avoided.
-Early style parsers, which have recently been certified by Ridge [???], 
-need to be extended to PEG parsing in order to be helpful for us.
+For the parsing algorithm, we might be able to draw inspiration from parsers
+based on the classic Cocke-Younger-Kasami (CYK)
+algorithms~\cite{KountouriotisNR09} and
+Early~\cite{AycHor02, Earley70} parsers. The defect of CYK algorithms, however,
+is that the grammar specifications given by the user need to be transformed
+into a normal form. This transformation may potentially lead to rule explosion
+and hence inefficient parsing. We will investigate whether this transformation
+can be avoided.  The problem with Early-style parsers is that they need to be 
+extended to PEG parsing in order to be helpful for us.
+
 
 Finally, we want to investigate whether derivatives of regular expressions
-~\cite{Brzozowski64,Almeidaetal10,OwensReppyTuron09,journals/corr/abs-1010-5023}
-can be extended to parsing. Lexing based on derivatives gives rise to very
-elegant regular expression matchers that can be certified in a theorem prover 
-with ease.  We will study whether the idea of taking a derivative of a regular
-expression can be extended to rules in grammars. The problem that needs to be
-overcome again arises from possible left recursion in parsing. 
-
-    
-\newpage
+~\cite{Brzozowski64,Might11,OwensReppyTuron09}
+can be generalised to PEG parsing. In earlier work, we showed that lexing based on
+derivatives gives rise to very elegant regular expression matchers that can be
+certified in a theorem prover with ease.  We will study whether the idea of
+taking a derivative of a regular expression can be extended to rules in
+grammars. The problem that needs to be overcome is again how to deal with possible
+left-recursion in grammar rules.
 
 
-\small
+
 \bibliography{Journal/document/root}
 \bibliographystyle{abbrv}
 \end{multicols}