thys3/document/root.tex
author Christian Urban <christian.urban@kcl.ac.uk>
Sun, 01 May 2022 23:16:44 +0100
changeset 499 6a100d32314c
parent 496 f493a20feeb3
child 569 5af61c89f51e
permissions -rw-r--r--
updated the paper

\documentclass[runningheads]{lipics-v2021}
\usepackage{times}
\usepackage{isabelle}
\usepackage{isabellesym}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathpartir}
\usepackage{tikz}
\usepackage{pgf}
\usetikzlibrary{positioning}
%\usepackage{pdfsetup}
\usepackage{stmaryrd}
%\usepackage{url}
%\usepackage{color}
%\usepackage[safe]{tipa}
%\usepackage[sc]{mathpazo}
%\usepackage{fontspec}
%\setmainfont[Ligatures=TeX]{Palatino Linotype}




\urlstyle{rm}
\isabellestyle{it}
\renewcommand{\isastyleminor}{\it}% 
\renewcommand{\isastyle}{\normalsize\it}%


\def\dn{\,\stackrel{\mbox{\scriptsize def}}{=}\,}
\renewcommand{\isasymequiv}{$\dn$}
\renewcommand{\isasymemptyset}{$\varnothing$}
\renewcommand{\isacharunderscore}{\mbox{$\_\!\_$}}
\renewcommand{\isasymiota}{\makebox[0mm]{${}^{\prime}$}}
\renewcommand{\isasymin}{\ensuremath{\,\in\,}}

\addtolength{\oddsidemargin}{-1.5mm}
\addtolength{\evensidemargin}{-1.5mm}
\addtolength{\textwidth}{3.4mm}
\addtolength{\textheight}{1.4mm}

\def\lexer{\mathit{lexer}}
\def\mkeps{\mathit{mkeps}}
\def\inj{\mathit{inj}}
\def\Empty{\mathit{Empty}}
\def\Left{\mathit{Left}}
\def\Right{\mathit{Right}}
\def\Stars{\mathit{Stars}}
\def\Char{\mathit{Char}}
\def\Seq{\mathit{Seq}}
\def\Der{\mathit{Der}}
\def\nullable{\mathit{nullable}}
\def\Z{\mathit{Z}}
\def\S{\mathit{S}}
\newcommand{\ZERO}{\mbox{\bf 0}}
\newcommand{\ONE}{\mbox{\bf 1}}
\def\rs{\mathit{rs}}

\def\Brz{Brzozowski}
\def\der{\backslash}
\newtheorem{falsehood}{Falsehood}
\newtheorem{conject}{Conjecture}

\bibliographystyle{plainurl}

\title{{POSIX} {L}exing with {B}itcoded {D}erivatives}
\titlerunning{POSIX Lexing with Bitcoded Derivatives}
\author{Chengsong Tan}{King's College London}{chengsong.tan@kcl.ac.uk}{}{}
\author{Christian Urban}{King's College London}{christian.urban@kcl.ac.uk}{}{}
\authorrunning{C.~Tan and C.~Urban}
\keywords{POSIX matching and lexing, derivatives of regular expressions, Isabelle/HOL}
\category{}
\ccsdesc[100]{Design and analysis of algorithms}
\ccsdesc[100]{Formal languages and automata theory}
\Copyright{\mbox{}}
\renewcommand{\DOIPrefix}{}
\nolinenumbers


\begin{document}
\maketitle

\begin{abstract}
  Sulzmann and Lu describe a lexing algorithm that calculates
  Brzozowski derivatives using bitcodes annotated to regular
  expressions.  Their algorithm generates POSIX values which encode
  the information of \emph{how} a regular expression matches a
  string---that is, which part of the string is matched by which part
  of the regular expression. This information is needed in the
  context of lexing in order to extract and to classify tokens.
  The purpose of the bitcodes is to generate POSIX values incrementally while
  derivatives are calculated. They also help with designing
  an ``aggressive'' simplification function that keeps the size of
  derivatives finitely bounded. Without simplification the size of some derivatives can grow
  arbitrarily big, resulting in an extremely slow lexing algorithm.  In this
  paper we describe a variant of Sulzmann and Lu's algorithm: Our
  variant is a recursive functional program, whereas Sulzmann
  and Lu's version involves a fixpoint construction. We \textit{(i)}
  prove in Isabelle/HOL that our algorithm is correct and generates
  unique POSIX values; we also \textit{(ii)} establish finite
  bounds for the size of the derivatives.

  %The size can be seen as a
  %proxy measure for the efficiency of the lexing algorithm: because of
  %the polynomial bound our algorithm does not suffer from
  %the exponential blowup in earlier works.
  
  % Brzozowski introduced the notion of derivatives for regular
  % expressions. They can be used for a very simple regular expression
  % matching algorithm.  Sulzmann and Lu cleverly extended this
  % algorithm in order to deal with POSIX matching, which is the
  % underlying disambiguation strategy for regular expressions needed
  % in lexers.  Their algorithm generates POSIX values which encode
  % the information of \emph{how} a regular expression matches a
  % string---that is, which part of the string is matched by which
  % part of the regular expression.  In this paper we give our
  % inductive definition of what a POSIX value is and show $(i)$ that
  % such a value is unique (for given regular expression and string
  % being matched) and $(ii)$ that Sulzmann and Lu's algorithm always
  % generates such a value (provided that the regular expression
  % matches the string). We show that $(iii)$ our inductive definition
  % of a POSIX value is equivalent to an alternative definition by
  % Okui and Suzuki which identifies POSIX values as least elements
  % according to an ordering of values.  We also prove the correctness
  % of Sulzmann's bitcoded version of the POSIX matching algorithm and
  % extend the results to additional constructors for regular
  % expressions.  \smallskip
\end{abstract}



\input{session}



\end{document}

%%% Local Variables:
%%% mode: latex
%%% TeX-master: t
%%% End: