%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Masters/Doctoral Thesis
% LaTeX Template
% Version 2.5 (27/8/17)
%
% This template was downloaded from:
% http://www.LaTeXTemplates.com
%
% Version 2.x major modifications by:
% Vel (vel@latextemplates.com)
%
% This template is based on a template by:
% Steve Gunn (http://users.ecs.soton.ac.uk/srg/softwaretools/document/templates/)
% Sunil Patel (http://www.sunilpatel.co.uk/thesis-template/)
%
% Template license:
% CC BY-NC-SA 3.0 (http://creativecommons.org/licenses/by-nc-sa/3.0/)
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%----------------------------------------------------------------------------------------
% PACKAGES AND OTHER DOCUMENT CONFIGURATIONS
%----------------------------------------------------------------------------------------
\documentclass[
11pt, % The default document font size, options: 10pt, 11pt, 12pt
%oneside, % Two side (alternating margins) for binding by default, uncomment to switch to one side
english, % ngerman for German
singlespacing, % Single line spacing, alternatives: onehalfspacing or doublespacing
%draft, % Uncomment to enable draft mode (no pictures, no links, overfull hboxes indicated)
%nolistspacing, % If the document is onehalfspacing or doublespacing, uncomment this to set spacing in lists to single
%liststotoc, % Uncomment to add the list of figures/tables/etc to the table of contents
%toctotoc, % Uncomment to add the main table of contents to the table of contents
%parskip, % Uncomment to add space between paragraphs
%nohyperref, % Uncomment to not load the hyperref package
headsepline, % Uncomment to get a line under the header
%chapterinoneline, % Uncomment to place the chapter title next to the number on one line
consistentlayout, % Uncomment to change the layout of the declaration, abstract and acknowledgements pages to match the default layout
]{MastersDoctoralThesis} % The class file specifying the document structure
\usepackage[utf8]{inputenc} % Required for inputting international characters
\usepackage[T1]{fontenc} % Output font encoding for international characters
%\usepackage{fdsymbol} % Loads unicode-math
\usepackage{cancel}
\usepackage{fontawesome5}
\usepackage{bbding,pifont,dingbat}
\usepackage{listings}
\usepackage{xcolor}
\usepackage{beramono}
\usepackage{verbatim}
\usepackage{float}
\usepackage{mathpazo} % Use the Palatino font by default
\usepackage{hyperref}
\usepackage{lipsum}
\usepackage[backend=bibtex]{biblatex} % Use the bibtex backend; no citation style is selected here, so biblatex's default applies (the authoryear/natbib options are commented out further down)
\usepackage[usestackEOL]{stackengine}
\usepackage{scalerel}
\usepackage{graphicx}
%style=authoryear, natbib=true
\usepackage{stmaryrd}
\usepackage{caption}
\usepackage{afterpage}
\addbibresource{example.bib} % The filename of the bibliography
\usepackage[autostyle=true]{csquotes} % Required to generate language-dependent quotes in the bibliography
%My Newly added Libraries in addition to template
\usepackage{graphic}
\usepackage{data}
%\usepackage{algorithm}
\usepackage{amsmath}
\makeatletter
% Extensible arrows: each command takes an optional argument (#1) and a
% mandatory argument (#2) and hands both to \ext@arrow together with a fill
% macro that draws the arrow shaft.
% NOTE(review): \ext@arrow and \leftrightarrowfill@ are not defined in this
% file; presumably they come from amsmath (loaded just above), and the four
% digits are the same kind of padding parameters amsmath uses for its own
% \xleftarrow/\xrightarrow -- confirm against the amsmath source.
% Solid double-headed arrow.
\newcommand{\xleftrightarrow}[2][]{\ext@arrow 3359\leftrightarrowfill@{#1}{#2}}
% Variants that use the \...fill@@ helpers defined below instead of the
% standard fill macros.
\newcommand{\xdashrightarrow}[2][]{\ext@arrow 0359\rightarrowfill@@{#1}{#2}}
\newcommand{\xdashleftarrow}[2][]{\ext@arrow 3095\leftarrowfill@@{#1}{#2}}
\newcommand{\xdashleftrightarrow}[2][]{\ext@arrow 3359\leftrightarrowfill@@{#1}{#2}}
% Fill helpers: each passes a left end piece, a middle piece (\relbar) and a
% right end piece to \arrowfill@@; \relax is used where there is no arrowhead.
\def\rightarrowfill@@{\arrowfill@@\relax\relbar\rightarrow}
\def\leftarrowfill@@{\arrowfill@@\leftarrow\relbar\relax}
\def\leftrightarrowfill@@{\arrowfill@@\leftarrow\relbar\rightarrow}
% \arrowfill@@{#1 left}{#2 middle}{#3 right}{#4}: in math mode with all math
% spacing set to 0mu, typesets #1, then fills the remaining width with boxed
% copies of #2 via \xleaders ... \hfill, then typesets #3. #4 is emitted at
% the start of each math group (presumably the math style supplied by the
% caller -- TODO confirm).
\def\arrowfill@@#1#2#3#4{%
$\m@th\thickmuskip0mu\medmuskip\thickmuskip\thinmuskip\thickmuskip
\relax#4#1
\xleaders\hbox{$#4#2$}\hfill
#3$%
}
\makeatother
% Check mark drawn as a small filled TikZ polygon (scale 0.4).
% \def performs no "already defined" check, so this silently replaces any
% \checkmark provided by a package loaded earlier. The body is only expanded
% when \checkmark is used, so it is fine that tikz is loaded further down.
% NOTE(review): amssymb is loaded after this line and may provide its own
% \checkmark -- confirm which definition is actually in effect in the document.
\def\checkmark{\tikz\fill[scale=0.4](0,.35) -- (.25,0) -- (1,.7) -- (.25,.15) -- cycle;}
\usepackage{amsthm}
\usepackage{amssymb}
\usepackage{cleveref}
%\usepackage{mathtools}
\usepackage[noend]{algpseudocode}
\usepackage{enumitem}
\usepackage{nccmath}
\usepackage{tikz-cd}
\usepackage{tikz}
\usetikzlibrary{automata, positioning, calc}
\usetikzlibrary{arrows}
\usetikzlibrary{shapes.misc}
\usetikzlibrary{fit,
shapes.geometric,
patterns,
backgrounds,
graphs}
\usetikzlibrary{babel}
\usepackage{mathpartir}
\usepackage{stackrel}
% Custom caption type "mytype" whose captions are labelled "Illustration";
% the second optional argument (the list heading) is left empty.
\DeclareCaptionType{mytype}[Illustration][]
% Wrapper environment: \caption used inside it is typeset as a "mytype"
% caption. The begin-code must not contain a literal space, otherwise a stray
% space token is inserted whenever the environment opens in horizontal mode.
\newenvironment{envForCaption}{\captionsetup{type=mytype}}{}
% Shared pgfplots style "myplotstyle": frameless small-font legend placed at
% north east, bold centred axis labels, bold tick labels and thick plot lines.
\pgfplotsset{
  myplotstyle/.style={
    % Legend
    legend style={draw=none, font=\small},
    legend cell align=left,
    legend pos=north east,
    % Axis labels (\boldmath so that math in the labels is bold as well)
    ylabel style={align=center, font=\bfseries\boldmath},
    xlabel style={align=center, font=\bfseries\boldmath},
    % Tick labels
    x tick label style={font=\bfseries\boldmath},
    y tick label style={font=\bfseries\boldmath},
    scaled ticks=true,
    % Plot lines
    every axis plot/.append style={thick},
  },
}
% Colours used by the Scala listing style below.
\definecolor{dkgreen}{rgb}{0,0.6,0}
\definecolor{gray}{rgb}{0.5,0.5,0.5}
\definecolor{mauve}{rgb}{0.58,0,0.82}
% listings style for Scala code; select it with style=myScalastyle.
% The list previously set frame=tb and later frame=single; only the later
% setting took effect, so the dead frame=tb entry has been dropped.
\lstdefinestyle{myScalastyle}{
  language=scala,
  % Vertical space around each listing
  aboveskip=3mm,
  belowskip=3mm,
  showstringspaces=false,
  columns=flexible,
  basicstyle={\small\ttfamily},
  % Line numbers are switched off; numberstyle only matters if they are enabled
  numbers=none,
  numberstyle=\tiny\color{gray},
  % Syntax colouring
  keywordstyle=\color{blue},
  commentstyle=\color{dkgreen},
  stringstyle=\color{mauve},
  % Full box around the listing
  frame=single,
  % Wrap long lines, breaking only at whitespace
  breaklines=true,
  breakatwhitespace=true,
  tabsize=3,
}
%----------------------------------------------------------------------------------------
% MARGIN SETTINGS
%----------------------------------------------------------------------------------------
% Page layout: A4 with a narrower inner than outer margin plus a binding offset.
\geometry{
  paper=a4paper,      % Paper size; use letterpaper for US letter
  inner=2.5cm,        % Margin on the binding side
  outer=3.8cm,        % Margin on the outside edge
  bindingoffset=.5cm, % Extra space reserved for the binding
  top=1.5cm,          % Margin above the type block
  bottom=1.5cm,       % Margin below the type block
  %showframe,         % Uncomment to draw the type block for layout debugging
}
%----------------------------------------------------------------------------------------
% THESIS INFORMATION
%----------------------------------------------------------------------------------------
\thesistitle{POSIX Regular Expression Matching and Lexing} % Your thesis title, this is used in the title and abstract, print it elsewhere with \ttitle
\supervisor{Dr. Christian \textsc{Urban}} % Your supervisor's name, this is used in the title page, print it elsewhere with \supname
\examiner{} % Your examiner's name, this is not currently used anywhere in the template, print it elsewhere with \examname
\degree{Doctor of Philosophy} % Your degree name, this is used in the title page and abstract, print it elsewhere with \degreename
\author{Chengsong \textsc{Tan}} % Your name, this is used in the title page and abstract, print it elsewhere with \authorname
\addresses{} % Your address, this is not currently used anywhere in the template, print it elsewhere with \addressname
\subject{Computer Science} % Your subject area, this is not currently used anywhere in the template, print it elsewhere with \subjectname
\keywords{} % Keywords for your thesis, this is not currently used anywhere in the template, print it elsewhere with \keywordnames
\university{\href{https://www.kcl.ac.uk}{King's College London}} % Your university's name and URL, this is used in the title page and abstract, print it elsewhere with \univname
\department{\href{https://www.kcl.ac.uk/informatics}{Department of Informatics}} % Your department's name and URL, this is used in the title page and abstract, print it elsewhere with \deptname
\group{\href{https://www.kcl.ac.uk/research/ssy}{Software Systems Group}} % Your research group's name and URL, this is used in the title page, print it elsewhere with \groupname
\faculty{\href{http://faculty.university.com}{Chengsong Tan}} % Your faculty's name and URL, this is used in the title page and abstract, print it elsewhere with \facname
% Once the document starts, copy the thesis metadata into the PDF properties.
\AtBeginDocument{
\hypersetup{
  pdftitle=\ttitle,           % PDF title    <- thesis title
  pdfauthor=\authorname,      % PDF author   <- author name
  pdfkeywords=\keywordnames,  % PDF keywords <- thesis keywords
}
}
\begin{document}
\frontmatter % Use roman page numbering style (i, ii, iii, iv...) for the pre-content pages
\pagestyle{plain} % Default to the plain heading style until the thesis style is called for the body content
%----------------------------------------------------------------------------------------
% TITLE PAGE
%----------------------------------------------------------------------------------------
\begin{titlepage}
\begin{center}
\vspace*{.06\textheight}
{\scshape\LARGE \univname\par}\vspace{1.5cm} % University name
\textsc{\Large Doctoral Thesis}\\[0.5cm] % Thesis type
\HRule \\[0.4cm] % Horizontal line
{\huge \bfseries \ttitle\par}\vspace{0.4cm} % Thesis title
\HRule \\[1.5cm] % Horizontal line
\begin{minipage}[t]{0.4\textwidth}
\begin{flushleft} \large
\emph{Author:}\\
\href{https://kclpure.kcl.ac.uk/portal/en/persons/chengsong-tan(a63b381b-04bc-4cd7-beea-beb3e96cb153).html}{\authorname} % Author name - remove the \href bracket to remove the link
\end{flushleft}
\end{minipage}
\begin{minipage}[t]{0.4\textwidth}
\begin{flushright} \large
\emph{Supervisor:} \\
\href{https://www.kcl.ac.uk/people/christian-urban}{\supname} % Supervisor name - remove the \href bracket to remove the link
\end{flushright}
\end{minipage}\\[3cm]
\vfill
\large \textit{A thesis submitted in fulfillment of the requirements\\ for the degree of \degreename}\\[0.3cm] % University requirement text
\textit{in the}\\[0.4cm]
\groupname\\\deptname\\[2cm] % Research group name and department name
\vfill
{\large \today}\\[4cm] % Date
%\includegraphics{Logo} % University/department logo - uncomment to place it
\vfill
\end{center}
\end{titlepage}
%----------------------------------------------------------------------------------------
% DECLARATION PAGE
%----------------------------------------------------------------------------------------
\begin{declaration}
\addchaptertocentry{\authorshipname} % Add the declaration to the table of contents
\noindent I, \authorname, declare that this thesis titled, \enquote{\ttitle} and the work presented in it are my own. I confirm that:
\begin{itemize}
\item This work was done wholly while in candidature for a research degree at this University.
\item Where any part of this thesis has previously been submitted for a degree or any other qualification at this University or any other institution, this has been clearly stated.
\item Where I have consulted the published work of others, this is always clearly attributed.
\item Where I have quoted from the work of others, the source is always given. With the exception of such quotations, this thesis is entirely my own work.
\item I have acknowledged all main sources of help.
\item Where the thesis is based on work done by myself jointly with others, I have made clear exactly what was done by others and what I have contributed myself.\\
\end{itemize}
\noindent Signed:\\
\rule[0.5em]{25em}{0.5pt} % This prints a line for the signature
\noindent Date:\\
\rule[0.5em]{25em}{0.5pt} % This prints a line to write the date
\end{declaration}
\cleardoublepage
%----------------------------------------------------------------------------------------
% QUOTATION PAGE
%----------------------------------------------------------------------------------------
%\vspace*{0.2\textheight}
%
%\noindent\enquote{\itshape Thanks to my solid academic training, today I can write hundreds of words on virtually any topic without possessing a shred of information, which is how I got a good job in journalism.}\bigbreak
%
%\hfill Dave Barry
%----------------------------------------------------------------------------------------
% ABSTRACT PAGE
%----------------------------------------------------------------------------------------
%\begin{abstract}
%\end{abstract}
\begin{abstract}
\addchaptertocentry{\abstractname} % Add the abstract to the table of contents
%\addchap{Abstract}
\textbf{Problem: not like an abstract, more like a summary}
\textbf{Goal for new abstract: more high-level and abstract, tell the problem and solution in a concise way.}
New abstract:\\
%Modern computer systems rely on lexing for essential applications such as
%compilers, IDEs, file systems, and network intrusion detection systems.
%These applications require correctness with respect to
%the POSIX standard and high performance.
%%While existing implementations of lexers often achieve high performance,
%Existing implementations had drawbacks such as bugs and catastrophic backtracking,
%preventing them from solving the problem once
%and for all.
%To address these drawbacks,
%this thesis offers an algorithm with formally proven correctness and internal data structures' size bound.
%These mechanised proofs ensure that our algorithm is fast and correct in \textbf{all} cases.
%Our proofs use term-rewriting relations to establish invariants during derivatives and simplifications,
%which is extensible and friendly to theorem provers.
POSIX is the most widely used disambiguation strategy for regular expression matching. There are some difficulties associated with the POSIX strategy and, according to tests conducted by Kuklewicz, many regular expression matchers implementing this strategy produce incorrect results. This thesis is concerned with a POSIX regular expression matching algorithm introduced by Sulzmann and Lu. This algorithm uses bitcoded regular expressions and is based on the idea of Brzozowski derivatives. The algorithm generates POSIX values which encode the information of how a regular expression matches a string---that is, which part of the string is matched by which part of the regular expression. This information is needed in the context of lexing in order to extract and to classify tokens.
While a formalised correctness proof for Sulzmann and Lu's algorithm already exists, this proof does not include any of the crucial simplification rules. These simplification rules are, however, necessary in order to have an acceptable runtime for this algorithm. Our version of the simplification rules includes a number of fixes and improvements: one problem we fix has to do with their use of the nub function that does not remove non-trivial duplicates. We improve the simplification rules by formulating them as a simple recursive function and also by simplifying more instances of regular expressions. As a result we can establish a bound on the size of derivatives. Our proofs are formalised in Isabelle/HOL.
Old abstract:
This thesis is about regular expressions and derivatives. It combines functional algorithms and their formal verification in the Isabelle/HOL theorem prover.
Classic results say that regular expression matching should be
linear with respect to the input. The size of the regular expression
is often treated as a constant factor.
However, with some regular expressions and inputs, existing implementations
often suffer from non-linear or even exponential running time,
giving rise to ReDoS (regular expression denial-of-service) attacks.
To avoid these attacks, regular expression matchers and lexers with
formalised correctness and running time related
properties are of interest because the guarantees apply to all inputs, not just a finite
number of empirical test cases.
Sulzmann and Lu describe a lexing algorithm that calculates Brzozowski derivatives using bitcodes annotated to regular expressions. Their algorithm generates POSIX values which encode the information of how a regular expression matches a string—that is, which part of the string is matched by which part of the regular expression. This information is needed in the context of lexing in order to extract and to classify tokens. The purpose of the bitcodes is to generate POSIX values incrementally while derivatives are calculated. They also help with designing an “aggressive” simplification function that keeps the size of derivatives finitely bounded.
Our simplification function is more aggressive than the one by Sulzmann and Lu.
We also fix a problem in Sulzmann and Lu's simplification to do with their use of
the $\textit{nub}$ function which does not remove non-trivial duplicates.
Without simplification the size of some derivatives can grow arbitrarily big resulting in an extremely slow lexing algorithm.
In this thesis we describe a variant of Sulzmann and Lu’s algorithm: Our variant is a recursive functional program, whereas Sulzmann and Lu’s version involves a fixpoint construction. We (i) prove in Isabelle/HOL that our algorithm is correct and generates unique POSIX values; we also (ii) establish a finite bound for the size of the derivatives for every input string; we also
(iii) give a program and a conjecture that the finite
bound can be improved to be cubic if stronger simplification rules are applied.
\end{abstract}
%----------------------------------------------------------------------------------------
% ACKNOWLEDGEMENTS
%----------------------------------------------------------------------------------------
\begin{acknowledgements}
\addchaptertocentry{\acknowledgementname} % Add the acknowledgements to the table of contents
I would like to express my deepest thanks to my supervisor Doctor Christian Urban,
who has always been extremely supportive throughout my PhD, in all sorts of ways.
Supervision-wise, Christian always
thinks in terms of the best interests of the student, for which I am eternally grateful.
I would also like to thank Doctor Ning Zhang, who has always been very gentle and caring to me,
quick to lend a
helping hand at difficult times.
I want to thank Doctor Kathrin Stark, my SIGPLAN mentor, for offering brilliant advice
at the late stage of my PhD. My transition from a PhD student to a postdoc researcher
could not have been so smooth without Kathrin's mentoring.
%I want to thank Jeanna Wheeler, my UMO mentor, for helping me regulate my mental health
%and productivity, by being always encouraging
%and compassionate in her sessions.
I want to thank Jeanna Wheeler for helping me stay sane during my PhD and the COVID period, when an encouraging and compassionate person was very much appreciated.
I want to thank my father Haiyan Tan and my mother Yunan Cheng,
whom I have not seen face to face for three years,
for their unconditional love.
I really miss you.
I want to thank my friends Yuying Chen, Kai Zeng, Rui Luo, Jingyi Liu, Qingtian Ye, and many others,
who have always been very patient and compassionate, giving clever advice when I turned to
them for help.
\end{acknowledgements}
%----------------------------------------------------------------------------------------
% LIST OF CONTENTS/FIGURES/TABLES PAGES
%----------------------------------------------------------------------------------------
\tableofcontents % Prints the main table of contents
%\listoffigures % Prints the list of figures
%\listoftables % Prints the list of tables
%----------------------------------------------------------------------------------------
% ABBREVIATIONS
%----------------------------------------------------------------------------------------
% Theorem-like environments used in the chapters (not abbreviations, despite
% the section banner above). Each is declared without a shared-counter or
% numbered-within argument, so every environment has its own counter.
% NOTE(review): these declarations sit in the document body, after
% \tableofcontents, rather than in the preamble -- confirm this is intended.
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{definition}{Definition}
\newtheorem{conjecture}{Conjecture}
\newtheorem{corollary}{Corollary}
\newtheorem{property}{Property}
\newtheorem{proposition}{Proposition}
%proof
%\newcommand\sflat[1][]{\textit{sflat} \, #1}
%\newcommand{\ASEQ}[3]{\textit{ASEQ}_{#1} \, #2 \, #3}
%\newcommand{\bderssimp}[2]{#1 \backslash_{bsimp} #2}
%\newcommand{\rderssimp}[2]{#1 \backslash_{rsimp} #2}
%\newcommand{\sflataux}[1]{\lbracket #1 \rbracket}
%\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%
%\newcommand{\ZERO}{\mbox{\bf 0}}
%\newcommand{\ONE}{\mbox{\bf 1}}
%\newcommand{\AALTS}[2]{\XOR {\scriptstyle #1}\, #2}
%
%\def\lexer{\mathit{lexer}}
%\def\mkeps{\mathit{mkeps}}
%
%\def\AZERO{\textit{AZERO}}
%\def\AONE{\textit{AONE}}
%\def\ACHAR{\textit{ACHAR}}
%
%
%\def\ALTS{\textit{ALTS}}
%\def\ASTAR{\textit{ASTAR}}
%\def\DFA{\textit{DFA}}
%\def\bmkeps{\textit{bmkeps}}
%\def\retrieve{\textit{retrieve}}
%\def\blexer{\textit{blexer}}
%\def\flex{\textit{flex}}
%\def\inj{\mathit{inj}}
%\def\Empty{\mathit{Empty}}
%\def\Left{\mathit{Left}}
%\def\Right{\mathit{Right}}
%\def\Stars{\mathit{Stars}}
%\def\Char{\mathit{Char}}
%\def\Seq{\mathit{Seq}}
%\def\Der{\mathit{Der}}
%\def\nullable{\mathit{nullable}}
%\def\Z{\mathit{Z}}
%\def\S{\mathit{S}}
%\def\rup{r^\uparrow}
%%\def\bderssimp{\mathit{bders}\_\mathit{simp}}
%\def\distinctWith{\textit{distinctWith}}
%
%\def\rexp{\mathbf{rexp}}
%\def\simp{\mathit{simp}}
%\def\simpALTs{\mathit{simp}\_\mathit{ALTs}}
%\def\map{\mathit{map}}
%\def\distinct{\mathit{distinct}}
%\def\blexersimp{\mathit{blexer}\_\mathit{simp}}
%\def\map{\textit{map}}
%\def\vsuf{\textit{vsuf}}
%\def\sflataux{\textit{sflat}\_\textit{aux}}
%\def\rrexp{\textit{rrexp}}
%\def\rsize{\textit{rsize}}
%\def\asize{\textit{asize}}
%\def\rerase{\textit{rerase}}
%\def\erase{\textit{erase}}
%\def\STAR{\textit{STAR}}
%\def\flts{\textit{flts}}
%
%----------------------------------------------------------------------------------------
% SYMBOLS
%----------------------------------------------------------------------------------------
%\begin{symbols}{lll} % Include a list of Symbols (a three column table)
%$a$ & distance & \si{\meter} \\
%$P$ & power & \si{\watt} (\si{\joule\per\second}) \\
%Symbol & Name & Unit \\
%\addlinespace % Gap to separate the Roman symbols from the Greek
%$\omega$ & angular frequency & \si{\radian} \\
%\end{symbols}
%----------------------------------------------------------------------------------------
% DEDICATION
%----------------------------------------------------------------------------------------
\dedicatory{For/Dedicated to/To my\ldots}
%----------------------------------------------------------------------------------------
% THESIS CONTENT - CHAPTERS
%----------------------------------------------------------------------------------------
\mainmatter % Begin numeric (1,2,3...) page numbering
\pagestyle{thesis} % Return the page headers back to the "thesis" style
% Include the chapters of the thesis as separate files from the Chapters folder
% Uncomment the lines as you write the chapters
\include{Chapters/Introduction}
\include{Chapters/Inj}
\include{Chapters/Bitcoded1}
\include{Chapters/Bitcoded2}
\include{Chapters/Finite}
\include{Chapters/Cubic}
\include{Chapters/RelatedWork}
\include{Chapters/Future}
%----------------------------------------------------------------------------------------
% THESIS CONTENT - APPENDICES
%----------------------------------------------------------------------------------------
%\appendix % Cue to tell LaTeX that the following "chapters" are Appendices
% Include the appendices of the thesis as separate files from the Appendices folder
% Uncomment the lines as you write the Appendices
%\include{Appendices/AppendixA}
%\include{Appendices/AppendixB}
%\include{Appendices/AppendixC}
%----------------------------------------------------------------------------------------
% BIBLIOGRAPHY
%----------------------------------------------------------------------------------------
\printbibliography[heading=bibintoc]
%----------------------------------------------------------------------------------------
\end{document}