csupp.tex
author Christian Urban <christian dot urban at kcl dot ac dot uk>
Fri, 27 Sep 2013 09:20:58 +0100
changeset 389 796de251332c
parent 258 1abf8586ee6b
permissions -rwxr-xr-x
added paper by Tobias
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
225
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
     1
\documentclass{article}
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
     2
\usepackage{a4wide,ot1patch}
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
     3
\usepackage[latin1]{inputenc}
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
     4
\usepackage{multicol}
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
     5
\usepackage{charter}
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
     6
\usepackage{amsmath,amssymb,amsthm}
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
     7
\usepackage{fancyheadings}
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
     8
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
     9
\addtolength{\oddsidemargin}{-6mm}
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
    10
\addtolength{\evensidemargin}{-6mm}
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
    11
\addtolength{\textwidth}{11mm}
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
    12
\addtolength{\columnsep}{3mm}
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
    13
\addtolength{\textheight}{8mm}
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
    14
\addtolength{\topmargin}{-7.5mm}
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
    15
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
    16
\pagestyle{fancyplain}
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
    17
\lhead[\fancyplain{}{A}]{\fancyplain{}{}}
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
    18
\rhead[\fancyplain{}{C}]{\fancyplain{}{}}
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
    19
\renewcommand{\headrulewidth}{0pt}
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
    20
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
    21
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
    22
\begin{document}
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
    23
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
    24
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
    25
\begin{center}
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
    26
\begin{tabular}{c}
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
    27
\\[-5mm]
243
86a4182c73e7 more tuning on the proposal
urbanc
parents: 242
diff changeset
    28
\LARGE\bf Novel Certified Parsers\\[-10mm]
229
2087fc59f2a1 One passage added.
zhang
parents: 228
diff changeset
    29
\mbox{}
225
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
    30
\end{tabular}
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
    31
\end{center}
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
    32
\thispagestyle{empty}
228
87a8dc29d7ae latest changes
urbanc
parents: 227
diff changeset
    33
\mbox{}\\[-5mm]
225
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
    34
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
    35
\begin{multicols}{2}
228
87a8dc29d7ae latest changes
urbanc
parents: 227
diff changeset
    36
\section*{Background}
242
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
    37
225
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
    38
\noindent
242
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
    39
Parsers transform plain text into some abstract structure that can be analyzed by
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
    40
computers for further processing.  One might think that parsers have been
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
    41
studied to death, and after \emph{yacc} and \emph{lex} no new results can be
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
    42
obtained in this area.  However recent developments and novel approaches make
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
    43
it increasingly clear, that this is not true anymore~\cite{Might11}. And
244
urbanc
parents: 243
diff changeset
    44
there is a real practical need for new results: for example the future HTML5 
242
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
    45
Standard abandons a well-defined grammar specification, in favour of a bespoke
258
1abf8586ee6b added slides for a talk in St Andrews
urbanc
parents: 244
diff changeset
    46
parser given as pseudo code. Proving any property about this parser is nearly 
1abf8586ee6b added slides for a talk in St Andrews
urbanc
parents: 244
diff changeset
    47
impossible.
225
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
    48
242
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
    49
This work targets parsers from a certification point of view. Increasingly,
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
    50
parsers are part of certified compilers, like
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
    51
\mbox{\emph{CompCert}}~\cite{Leroy09}, which are guaranteed to be
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
    52
bug-free. Such certified compilers are crucial in areas where software just
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
    53
cannot fail. However, so far the parsers of these compilers have been left out
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
    54
of the certification.  This is because parsing algorithms are often ad hoc and
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
    55
their semantics is not clearly specified. This means, unfortunately, parsers
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
    56
can harbour errors that potentially invalidate the whole certification and
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
    57
correctness of the compiler. In this project, we aim to change this situation
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
    58
with the help of the theorem prover Isabelle/HOL.
225
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
    59
242
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
    60
Only in the last few years, theorem provers have become powerful enough for
236
2d4f1334b5ca shortened
urbanc
parents: 235
diff changeset
    61
establishing the correctness of some standard lexing and parsing
243
86a4182c73e7 more tuning on the proposal
urbanc
parents: 242
diff changeset
    62
algorithms. For this, the algorithms still need to be formulated in a way so
236
2d4f1334b5ca shortened
urbanc
parents: 235
diff changeset
    63
that it is easy to reason about them. In our earlier work about lexing and
2d4f1334b5ca shortened
urbanc
parents: 235
diff changeset
    64
regular languages, we showed that this precludes well-known algorithms based
242
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
    65
automata~\cite{WuZhangUrban11}. However we showed also that regular languages can be formulated and
236
2d4f1334b5ca shortened
urbanc
parents: 235
diff changeset
    66
reasoned about entirely in terms regular expressions, which can be easily
2d4f1334b5ca shortened
urbanc
parents: 235
diff changeset
    67
represented in theorem provers. This work uses the device of derivatives of
2d4f1334b5ca shortened
urbanc
parents: 235
diff changeset
    68
regular expressions. We like to extend this device to parsers and grammars.
234
eeadb4e51d74 some changes
urbanc
parents: 232
diff changeset
    69
The aim is to come up with elegant and practical useful parsing algorithms
242
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
    70
whose correctness can be certified in Isabelle/HOL.
236
2d4f1334b5ca shortened
urbanc
parents: 235
diff changeset
    71
228
87a8dc29d7ae latest changes
urbanc
parents: 227
diff changeset
    72
229
2087fc59f2a1 One passage added.
zhang
parents: 228
diff changeset
    73
\section*{Proposed Work}
2087fc59f2a1 One passage added.
zhang
parents: 228
diff changeset
    74
242
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
    75
A recent development in the field of parsing are Parsing Expression Grammars
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
    76
(PEGs)~\cite{Ford04}, which
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
    77
are an extension of the well-known Context Free Grammars
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
    78
(CFGs). This extension introduces new regular operators, such as
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
    79
negation and conjunction, on the right-hand side of grammar rules, as well as
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
    80
priority orderings for rules. With these extensions, PEG parsing becomes much
258
1abf8586ee6b added slides for a talk in St Andrews
urbanc
parents: 244
diff changeset
    81
more powerful and more useful in practice. For example disambiguation, formerly expressed by semantic
236
2d4f1334b5ca shortened
urbanc
parents: 235
diff changeset
    82
filters, can now be expressed directly using grammar rules. 
230
6bb8ad9093e6 More modification by Xingyuan.
zhang
parents: 229
diff changeset
    83
242
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
    84
However, there is a serious limitation of PEGs, which affects potential
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
    85
applications: grammars involving left recursion are not
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
    86
allowed~\cite{Ford02}. Although one PEG parsing algorithm has already been
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
    87
proposed that can deal with left recursion~\cite{WarthDM08}, there
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
    88
is no correctness proof for it, not even one with ``pencil and paper''. One aim of this
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
    89
research is to solve this sorry state-of-affairs by either certifying this
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
    90
algorithm or inventing a new one. For this we will first devise a
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
    91
fixed-point semantics of PEGs, against which we can certify a parser. For this
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
    92
semantics we take as starting point the paper~\cite{Ford04}, which does not
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
    93
treat left-recursion, but gives an operational semantics for PEG
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
    94
parsing. There are also good indications that we can adapt work on Boolean
243
86a4182c73e7 more tuning on the proposal
urbanc
parents: 242
diff changeset
    95
Grammars~\cite{Okhotin04}, which are similar to PEGs and for which the
242
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
    96
paper~\cite{KountouriotisNR09} gives a fixed-point semantics 
258
1abf8586ee6b added slides for a talk in St Andrews
urbanc
parents: 244
diff changeset
    97
for negation operators, but not to the Kleene star.
234
eeadb4e51d74 some changes
urbanc
parents: 232
diff changeset
    98
258
1abf8586ee6b added slides for a talk in St Andrews
urbanc
parents: 244
diff changeset
    99
For our parsing algorithm, we might be able to build upon
243
86a4182c73e7 more tuning on the proposal
urbanc
parents: 242
diff changeset
   100
the classic Cocke-Younger-Kasami (CYK)
242
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
   101
algorithms~\cite{KountouriotisNR09} and
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
   102
Early~\cite{AycHor02, Earley70} parsers. The defect of CYK algorithms, however,
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
   103
is that the grammar specifications given by the user need to be transformed
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
   104
into a normal form. This transformation may potentially lead to rule explosion
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
   105
and hence inefficient parsing. We will investigate whether this transformation
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
   106
can be avoided.  The problem with Early-style parsers is that they need to be 
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
   107
extended to PEG parsing in order to be helpful for us.
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
   108
234
eeadb4e51d74 some changes
urbanc
parents: 232
diff changeset
   109
236
2d4f1334b5ca shortened
urbanc
parents: 235
diff changeset
   110
Finally, we want to investigate whether derivatives of regular expressions
242
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
   111
~\cite{Brzozowski64,Might11,OwensReppyTuron09}
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
   112
can be generalised to PEG parsing. In earlier work, we showed that lexing based on
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
   113
derivatives gives rise to very elegant regular expression matchers that can be
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
   114
certified in a theorem prover with ease.  We will study whether the idea of
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
   115
taking a derivative of a regular expression can be extended to rules in
243
86a4182c73e7 more tuning on the proposal
urbanc
parents: 242
diff changeset
   116
grammars. The problem that needs to be addressed is again how to deal with 
86a4182c73e7 more tuning on the proposal
urbanc
parents: 242
diff changeset
   117
left-recursive grammar rules.
229
2087fc59f2a1 One passage added.
zhang
parents: 228
diff changeset
   118
225
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
   119
242
093e45c44d91 polished proposal
urbanc
parents: 236
diff changeset
   120
232
114064363ef0 Proposal paragraphs by Xingyuan completed (with references added).
zhang
parents: 230
diff changeset
   121
\bibliography{Journal/document/root}
114064363ef0 Proposal paragraphs by Xingyuan completed (with references added).
zhang
parents: 230
diff changeset
   122
\bibliographystyle{abbrv}
225
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
   123
\end{multicols}
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
   124
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
   125
%  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
   126
%  \noindent {\bf Objectives:} The overall goals of the project are as follows:
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
   127
%  \begin{itemize}
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
   128
%  \item To solve the POPLmark challenge.
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
   129
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
   130
%  \item To complete and greatly improve the existing implementation of the
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
   131
%    nominal datatype package.
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
   132
%  \item To explore the strengths of this package by proving the
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
   133
%    safety of SML.
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
   134
%  \item To provide a basis for extracting programs from safety proofs.
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
   135
229
2087fc59f2a1 One passage added.
zhang
parents: 228
diff changeset
   136
%  \item To make the nominal datatype package usable for teaching
225
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
   137
%    students about the lambda-calculus and the theory of programming
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
   138
%    languages. \smallskip
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
   139
%  \end{itemize}
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
   140
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
   141
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
   142
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
   143
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
   144
\end{document}
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
   145
229
2087fc59f2a1 One passage added.
zhang
parents: 228
diff changeset
   146
%%% Local Variables:
225
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
   147
%%% mode: latex
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
   148
%%% TeX-master: t
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
   149
%%% TeX-command-default: "PdfLaTeX"
bc3ffe0dd1d8 added a start for a proposal
urbanc
parents:
diff changeset
   150
%%% TeX-view-style: (("." "kpdf %s.pdf"))
229
2087fc59f2a1 One passage added.
zhang
parents: 228
diff changeset
   151
%%% End: