% Chapter 1
\chapter{Introduction} % Main chapter title
\label{Introduction} % For referencing the chapter elsewhere, use \ref{Introduction}
%----------------------------------------------------------------------------------------
% Define some commands to keep the formatting separated from the content
\newcommand{\keyword}[1]{\textbf{#1}}
\newcommand{\tabhead}[1]{\textbf{#1}}
\newcommand{\code}[1]{\texttt{#1}}
\newcommand{\file}[1]{\texttt{\bfseries#1}}
\newcommand{\option}[1]{\texttt{\itshape#1}}
%boxes
\newcommand*{\mybox}[1]{\framebox{\strut #1}}
%\newcommand{\sflataux}[1]{\textit{sflat}\_\textit{aux} \, #1}
\newcommand\sflat[1]{\llparenthesis #1 \rrparenthesis }
\newcommand{\ASEQ}[3]{\textit{ASEQ}_{#1} \, #2 \, #3}
\newcommand{\bderssimp}[2]{#1 \backslash_{bsimps} #2}
\newcommand{\rderssimp}[2]{#1 \backslash_{rsimps} #2}
\def\derssimp{\textit{ders}\_\textit{simp}}
\def\rders{\textit{rders}}
\newcommand{\bders}[2]{#1 \backslash #2}
\newcommand{\bsimp}[1]{\textit{bsimp}(#1)}
\def\bsimps{\textit{bsimp}}
\newcommand{\rsimp}[1]{\textit{rsimp}\; #1}
\newcommand{\sflataux}[1]{\llparenthesis #1 \rrparenthesis'}
\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%
\newcommand{\denote}{\stackrel{\mbox{\scriptsize denote}}{=}}%
\newcommand{\ZERO}{\mbox{\bf 0}}
\newcommand{\ONE}{\mbox{\bf 1}}
\newcommand{\AALTS}[2]{\oplus {\scriptstyle #1}\, #2}
\newcommand{\rdistinct}[2]{\textit{rdistinct} \;\; #1 \;\; #2}
\def\rdistincts{\textit{rdistinct}}
\def\rDistinct{\textit{rdistinct}}
\newcommand\hflat[1]{\llparenthesis #1 \rrparenthesis_*}
\newcommand\hflataux[1]{\llparenthesis #1 \rrparenthesis_*'}
\newcommand\createdByStar[1]{\textit{createdByStar}(#1)}
\def\cbn{\textit{createdByNtimes}}
\def\hpa{\textit{highestPowerAux}}
\def\hpower{\textit{highestPower}}
\def\ntset{\textit{ntset}}
\def\optermsimp{\textit{optermsimp}}
\def\optermOsimp{\textit{optermOsimp}}
\def\optermosimp{\textit{optermosimp}}
\def\opterm{\textit{opterm}}
\def\nString{\textit{nonemptyString}}
\newcommand\myequiv{\mathrel{\stackrel{\makebox[0pt]{\mbox{\normalfont\tiny equiv}}}{=}}}
\def\SEQ{\textit{SEQ}}
\def\SEQs{\textit{SEQs}}
\def\case{\textit{case}}
\def\sequal{\stackrel{\mbox{\scriptsize rsimp}}{=}}
\def\rsimpalts{\textit{rsimp}_{ALTS}}
\def\good{\textit{good}}
\def\btrue{\textit{true}}
\def\bfalse{\textit{false}}
\def\bnullable{\textit{bnullable}}
\def\bnullables{\textit{bnullables}}
\def\Some{\textit{Some}}
\def\None{\textit{None}}
\def\code{\textit{code}}
\def\decode{\textit{decode}}
\def\internalise{\textit{internalise}}
\def\lexer{\mathit{lexer}}
\def\mkeps{\textit{mkeps}}
\newcommand{\rder}[2]{#2 \backslash_r #1}
\def\rerases{\textit{rerase}}
\def\nonnested{\textit{nonnested}}
\def\AZERO{\textit{AZERO}}
\def\sizeNregex{\textit{sizeNregex}}
\def\AONE{\textit{AONE}}
\def\ACHAR{\textit{ACHAR}}
\def\simpsulz{\textit{simp}_{Sulz}}
\def\scfrewrites{\stackrel{*}{\rightsquigarrow_{scf}}}
\def\frewrite{\rightsquigarrow_f}
\def\hrewrite{\rightsquigarrow_h}
\def\grewrite{\rightsquigarrow_g}
\def\frewrites{\stackrel{*}{\rightsquigarrow_f}}
\def\hrewrites{\stackrel{*}{\rightsquigarrow_h}}
\def\grewrites{\stackrel{*}{\rightsquigarrow_g}}
\def\fuse{\textit{fuse}}
\def\bder{\textit{bder}}
\def\der{\textit{der}}
\def\POSIX{\textit{POSIX}}
\def\ALTS{\textit{ALTS}}
\def\ASTAR{\textit{ASTAR}}
\def\DFA{\textit{DFA}}
\def\NFA{\textit{NFA}}
\def\bmkeps{\textit{bmkeps}}
\def\bmkepss{\textit{bmkepss}}
\def\retrieve{\textit{retrieve}}
\def\blexer{\textit{blexer}}
\def\flex{\textit{flex}}
\def\inj{\textit{inj}}
\def\Empty{\textit{Empty}}
\def\Left{\textit{Left}}
\def\Right{\textit{Right}}
\def\Stars{\textit{Stars}}
\def\Char{\textit{Char}}
\def\Seq{\textit{Seq}}
\def\Alt{\textit{Alt}}
\def\Der{\textit{Der}}
\def\Ders{\textit{Ders}}
\def\nullable{\mathit{nullable}}
\def\Z{\mathit{Z}}
\def\S{\mathit{S}}
\def\rup{r^\uparrow}
%\def\bderssimp{\mathit{bders}\_\mathit{simp}}
\def\distinctWith{\textit{distinctWith}}
\def\lf{\textit{lf}}
\def\PD{\textit{PD}}
\def\suffix{\textit{Suffix}}
\def\distinctBy{\textit{distinctBy}}
\def\starupdate{\textit{starUpdate}}
\def\starupdates{\textit{starUpdates}}
\def\nupdate{\textit{nupdate}}
\def\nupdates{\textit{nupdates}}
\def\size{\mathit{size}}
\def\rexp{\mathbf{rexp}}
\def\simp{\mathit{simp}}
\def\simpALTs{\mathit{simp}\_\mathit{ALTs}}
\def\map{\mathit{map}}
\def\distinct{\mathit{distinct}}
\def\blexersimp{\mathit{blexer}\_\mathit{simp}}
\def\blexerStrong{\textit{blexerStrong}}
\def\bsimpStrong{\textit{bsimpStrong}}
\def\bdersStrongs{\textit{bdersStrong}}
\newcommand{\bdersStrong}[2]{#1 \backslash_{bsimpStrongs} #2}
\def\map{\textit{map}}
\def\rrexp{\textit{rrexp}}
\newcommand\rnullable[1]{\textit{rnullable} \; #1 }
\newcommand\rsize[1]{\llbracket #1 \rrbracket_r}
\newcommand\asize[1]{\llbracket #1 \rrbracket}
\newcommand\rerase[1]{ (#1)_{\downarrow_r}}
\newcommand\ChristianComment[1]{\textcolor{blue}{#1}\\}
\def\rflts{\textit{rflts}}
\def\rrewrite{\textit{rrewrite}}
\def\bsimpalts{\textit{bsimp}_{ALTS}}
\def\bsimpaseq{\textit{bsimp}_{ASEQ}}
\def\rsimlalts{\textit{rsimp}_{ALTs}}
\def\rsimpseq{\textit{rsimp}_{SEQ}}
\def\erase{\textit{erase}}
\def\STAR{\textit{STAR}}
\def\flts{\textit{flts}}
\def\zeroable{\textit{zeroable}}
\def\nub{\textit{nub}}
\def\filter{\textit{filter}}
%\def\not{\textit{not}}
\def\RZERO{\mathbf{0}_r }
\def\RONE{\mathbf{1}_r}
\newcommand\RCHAR[1]{\mathbf{#1}_r}
\newcommand\RSEQ[2]{#1 \cdot #2}
\newcommand\RALTS[1]{\sum #1}
\newcommand\RSTAR[1]{#1^*}
\newcommand\vsuf[2]{\textit{Suffix} \;#1\;#2}
\lstdefinestyle{myScalastyle}{
language=scala,
aboveskip=3mm,
belowskip=3mm,
showstringspaces=false,
columns=flexible,
basicstyle={\small\ttfamily},
numbers=none,
numberstyle=\tiny\color{gray},
keywordstyle=\color{blue},
commentstyle=\color{dkgreen},
stringstyle=\color{mauve},
frame=single,
breaklines=true,
breakatwhitespace=true,
tabsize=3,
}
%----------------------------------------------------------------------------------------
%This part is about regular expressions, Brzozowski derivatives,
%and a bit-coded lexing algorithm with proven correctness and time bounds.
%TODO: look up snort rules to use here--give readers idea of what regexes look like
Regular expressions, since their inception in the 1940s,
have been subject to extensive study and implementation.
Their primary application lies in text processing--finding
matches and identifying patterns in a string.
%It is often used to match strings that comprises of numerous fields,
%where certain fields may recur or be omitted.
For example, a simple regular expression that tries
to recognise email addresses is
\begin{center}
\verb|[a-z0-9._]+@[a-z0-9.-]+\.[a-z]{2,6}|
\end{center}
%Using this, regular expression matchers and lexers are able to extract
%the domain names by the use of \verb|[a-zA-Z0-9.-]+|.
The character classes in this expression extract specific parts of an email address:
the local part is matched by \verb|[a-z0-9._]+| before the ``@'' sign,
the domain by \verb|[a-z0-9.-]+| after it, and \verb|[a-z]{2,6}|
matches the top-level domain.
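To make this concrete, here is a small Scala snippet (our own illustration, not
part of the formalisation) that uses the expression above, with capture groups
added, to split an address into these three parts; the address
\texttt{jane.doe@example.org} is made up for this example.
\begin{lstlisting}[style=myScalastyle]
object EmailExample {
  // the expression from above, with capture groups added for extraction
  val email = raw"([a-z0-9._]+)@([a-z0-9.-]+)\.([a-z]{2,6})".r

  def main(args: Array[String]): Unit =
    "jane.doe@example.org" match {
      case email(local, domain, tld) =>
        // prints: local=jane.doe, domain=example, tld=org
        println(s"local=$local, domain=$domain, tld=$tld")
      case _ =>
        println("not a valid email address")
    }
}
\end{lstlisting}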
%Consequently, they are an indispensible components in text processing tools
%of software systems such as compilers, IDEs, and firewalls.
The study of regular expressions is ongoing due to an
issue known as catastrophic backtracking.
This phenomenon refers to scenarios in which the regular expression
matching or lexing engine exhibits a disproportionately long
runtime despite the simplicity of the input and expression.
One cause of catastrophic backtracking lies in the
ambiguities of lexing.
In the process of matching a multi-character string with
a regular expression that encompasses several sub-expressions,
different positions can be designated to mark
the boundaries of corresponding substrings of the sub-expressions.
For instance, in matching the string $aaa$ with the
regular expression $(a+aa)^*$, the divide between
the initial match and the subsequent iteration could either be
set between the first and second characters ($a | aa$) or between the second and third characters ($aa | a$).
As both the length of the input string and the structural complexity of the regular expression increase,
the number of potential delimiter combinations can grow exponentially,
leading to exponential runtime for algorithms that explore these possibilities naively.
Catastrophic backtracking facilitates a category of computationally inexpensive attacks known as Regular Expression Denial of Service (ReDoS) attacks. Here, an attacker merely dispatches a small attack string to a server, provoking high-complexity behaviours in the server's regular expression engine. Such attacks, whether intentional or accidental, have led to the failure of real-world systems (additional details can be found in the practical overview section in chapter \ref{Introduction}).
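To give a flavour of such an attack string, the following Scala snippet (our own
illustration, not an actual attack payload, and the timing depends on the
particular engine) pairs the expression $(a+aa)^*$ from above with a character
that never occurs in the input; on a backtracking engine such as Java's
\texttt{java.util.regex}, this forces an enormous number of ways of splitting
the $a$'s to be explored before failure is reported.
\begin{lstlisting}[style=myScalastyle]
import java.util.regex.Pattern

object ReDoSExample {
  def main(args: Array[String]): Unit = {
    val pattern = Pattern.compile("(a|aa)*b")
    val input   = "a" * 45   // no trailing 'b', so the match must fail
    // On a typical backtracking engine this call can take a very long time,
    // because essentially every partition of the 45 a's into blocks of one
    // or two characters is tried before failure is reported.
    println(pattern.matcher(input).matches())
  }
}
\end{lstlisting}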
Various disambiguation strategies are
employed to select sub-matches, notably including Greedy and POSIX strategies. POSIX, the strategy most widely adopted in practice, invariably opts for the longest possible sub-match. Kuklewicz \cite{KuklewiczHaskell}, for instance, offers a descriptive definition of the POSIX rule in section 1, last paragraph:
%Regular expressions
%have been extensively studied and
%implemented since their invention in 1940s.
%It is primarily used in lexing, where an unstructured input string is broken
%down into a tree of tokens.
%That tree's construction is guided by the shape of the regular expression.
%This is particularly useful in expressing the shape of a string
%with many fields, where certain fields might repeat or be omitted.
%Regular expression matchers and Lexers allow us to
%identify and delimit different subsections of a string and potentially
%extract information from inputs, making them
%an indispensible component in modern software systems' text processing tasks
%such as compilers, IDEs, and firewalls.
%Research on them is far from over due to the well-known issue called catastrophic-backtracking,
%which means the regular expression matching or lexing engine takes an unproportional time to run
%despite the input and the expression being relatively simple.
%
%Catastrophic backtracking stems from the ambiguities of lexing:
%when matching a multiple-character string with a regular
%exression that includes serveral sub-expressions, there might be different positions to set
%the border between sub-expressions' corresponding sub-strings.
%For example, matching the string $aaa$ against the regular expression
%$(a+aa)^*$, the border between the initial match and the second iteration
%could be between the first and second character ($a | aa$)
%or between the second and third character ($aa | a$).
%As the size of the input string and the structural complexity
%of the regular expression increase,
%the number of different combinations of delimiters can grow exponentially, and
%algorithms that explore these possibilities unwisely will also see an exponential complexity.
%
%Catastrophic backtracking allow a class of computationally inexpensive attacks called
%Regular expression Denial of Service attacks (ReDoS), in which the hacker
%simply sends out a small attack string to a server,
%triggering high-complexity behaviours in its regular expression engine.
%These attacks, be it deliberate or not, have caused real-world systems to go down (see more
%details of this in the practical overview section in chapter \ref{Introduction}).
%There are different disambiguation strategies to select sub-matches, most notably Greedy and POSIX.
%The widely adopted strategy in practice is POSIX, which always go for the longest sub-match possible.
%There have been prose definitions like the following
%by Kuklewicz \cite{KuklewiczHaskell}:
%described the POSIX rule as (section 1, last paragraph):
\begin{quote}
\begin{itemize}
\item
regular expressions (REs) take the leftmost starting match, and the longest match starting there\\
\item
earlier subpatterns have leftmost-longest priority over later subpatterns\\
\item
higher-level subpatterns have leftmost-longest priority over their component subpatterns\\
\item
$\ldots$
%REs have right associative concatenation which can be changed with parenthesis\\
%\item
%parenthesized subexpressions return the match from their last usage\\
%\item
%text of component subexpressions must be contained in the text of the
%higher-level subexpressions\\
%\item
%if "p" and "q" can never match the same text then "p|q" and "q|p" are equivalent, up to trivial renumbering of captured subexpressions\\
%\item
%if "p" in "p*" is used to capture non-empty text then additional repetitions of "p" will not capture an empty string\\
\end{itemize}
\end{quote}
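As a small illustration of the longest-match rule (our own example, reusing the
expression from above, not one taken from Kuklewicz): when matching $aaa$
against $(a+aa)^*$, each iteration of the star must match as much as possible,
so the first iteration consumes $aa$ and the second the remaining $a$:
\begin{center}
$\underbrace{\;aa\;}_{\textit{1st iteration}} \;\mid\; \underbrace{\;a\;}_{\textit{2nd iteration}}$
\end{center}
A Greedy matcher, in contrast, prefers the left alternative $a$ in every
iteration and therefore produces the split $a \mid a \mid a$.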
However, Kuklewicz noted that various lexers claiming to be POSIX
are rarely correct according to this standard.
There are numerous occasions where programmers have realised how subtle and
difficult it is to implement correctly; one example is the following remark by the author of
Go's regexp library\footnote{\url{https://pkg.go.dev/regexp\#pkg-overview}}:
\begin{quote}\it
``The POSIX rule is computationally prohibitive
and not even well-defined.''
\end{quote}
Being able to formally define and capture the idea of the
POSIX rules, and to prove
the correctness of regular expression matching/lexing
algorithms against this POSIX semantics,
is therefore valuable.
Formal proofs are
machine-checked programs
that give computer scientists certainty
about the correctness of their algorithms.
This is done by
recursively checking that every fact in a proof script
is either an axiom or a fact derived from
known axioms or previously verified facts.
%The problem of regular expressions lexing and catastrophic backtracking are a suitable problem for such
%methods as their implementation and complexity analysis tend to be error-prone.
Formal proofs provide an unprecedented level of assurance
that an algorithm will perform as expected on all inputs.
The software systems that help people interactively build and check
such proofs are called theorem provers or proof assistants.
Many theorem provers have been developed, for example Mizar,
Isabelle/HOL, HOL Light, HOL4,
Coq, Agda, Idris and Lean.
Isabelle/HOL is a theorem prover with a simple type theory
and powerful proof automation such as sledgehammer.
We chose Isabelle/HOL for its automation
and for the ease with which regular expressions and
regular languages can be expressed.
%Some of those employ
%dependent types like Mizar, Coq, Agda, Lean and Idris.
%Some support a constructivism approach, such as Coq.
Formal proofs of regular expression matching and lexing
complement the efforts in
industry, which tend to focus on overall speed
with techniques like parallelisation (FPGA paper), or tackle
the problem of catastrophic backtracking
in an ad-hoc manner (cloudflare and stackexchange article).
There have been many interesting developments in the theorem-proving community
on formalising regular expressions and lexing.
One flavour is to build an automaton from the regular expression, and determine
acceptance in terms of the resulting
state after executing the input string on that automaton.
Automata formalisations are in general harder and more cumbersome to deal
with for theorem provers than working directly on regular expressions.
One example of this is the formalisation by Nipkow \cite{Nipkow1998}.
As a core idea, states are named by lists of booleans, so that after
composing sub-automata the renaming needed to keep the states
distinct is recursive and simple.
Even so, otherwise obvious lemmas required
``a painful amount of detail'' in the formalisation.
Sometimes automata are represented as graphs.
But graphs are not inductive datatypes:
having to perform induction on the number of nodes
in a graph makes formal reasoning unintuitive and convoluted,
resulting in large formalisations \cite{Lammich2012}.
When combining two graphs, one also needs to make sure that the nodes of
both graphs are distinct, which almost always involves
renaming the nodes.
A theorem-prover which provides dependent types such as Coq
can alleviate the issue of representing graph nodes
\cite{Doczkal2013}. There the representation of nodes is made
easier by the use of $\textit{FinType}$.
Another representation of automata uses matrices,
but induction over them is not straightforward either.
There are more clever representations, for example Paulson's,
which uses hereditarily finite sets \cite{Paulson2015}
and handles the problem of combining automata more gracefully.
%but we believe that such clever tricks are not very obvious for
%the John-average-Isabelle-user.
The approach that operates directly on regular expressions circumvents the
conversion between a regular expression and an automaton, thereby avoiding these representation
problems altogether, even though regular expressions may be seen as a variant of a
non-deterministic finite automaton (ref Laurikari tagged NFA paper).
To match a string, a sequence of algebraic transformations called
(Brzozowski) $\textit{derivatives}$ (ref Brzozowski) is carried out on the regular expression.
Each derivative takes a character and a regular expression,
and returns a new regular expression whose language is closely related to
the original regular expression's language:
strings prefixed with that input character will have their head removed
and strings not prefixed
with that character will be eliminated.
After taking derivatives with respect to all characters, the string is
exhausted; the algorithm then checks whether the empty string is in the final
regular expression's language.
If so, a match exists and the string is in the language of the input regular expression.
Again this process can be seen as the simulation of an NFA running on a string,
but the recursive nature of the datatypes and functions involved makes
derivatives a perfect object of study for theorem provers.
That is why there have been numerous formalisations of regular expressions
and Brzozowski derivatives in the functional programming and
theorem proving communities (a large list of refs to derivatives formalisation publications).
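To fix notation, the following Scala program is a minimal sketch of this
derivative-based matcher (the precise, formalised definitions are given in
chapter \ref{Inj}; the sketch here is simplified and unverified):
\begin{lstlisting}[style=myScalastyle]
abstract class Rexp
case object ZERO extends Rexp                 // matches nothing
case object ONE extends Rexp                  // matches only the empty string
case class CHAR(c: Char) extends Rexp
case class ALT(r1: Rexp, r2: Rexp) extends Rexp
case class SEQ(r1: Rexp, r2: Rexp) extends Rexp
case class STAR(r: Rexp) extends Rexp

// does r accept the empty string?
def nullable(r: Rexp): Boolean = r match {
  case ZERO        => false
  case ONE         => true
  case CHAR(_)     => false
  case ALT(r1, r2) => nullable(r1) || nullable(r2)
  case SEQ(r1, r2) => nullable(r1) && nullable(r2)
  case STAR(_)     => true
}

// Brzozowski derivative of r with respect to the character c
def der(c: Char, r: Rexp): Rexp = r match {
  case ZERO        => ZERO
  case ONE         => ZERO
  case CHAR(d)     => if (c == d) ONE else ZERO
  case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))
  case SEQ(r1, r2) =>
    if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
    else SEQ(der(c, r1), r2)
  case STAR(r1)    => SEQ(der(c, r1), STAR(r1))
}

// derivatives w.r.t. every character of s, followed by a nullability test
def matcher(r: Rexp, s: String): Boolean =
  nullable(s.foldLeft(r)((r1, c) => der(c, r1)))
\end{lstlisting}
For example, \texttt{matcher(STAR(ALT(CHAR('a'), SEQ(CHAR('a'), CHAR('a')))), "aaa")}
evaluates to \texttt{true}, mirroring the $(a+aa)^*$ example above.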
Ribeiro and Du Bois \cite{RibeiroAgda2017} have
formalised the notion of bit-coded regular expressions
and proved their relation to simple regular expressions in
the dependently-typed proof assistant Agda.
They also proved the soundness and completeness of a matching algorithm
based on the bit-coded regular expressions. Their algorithm is a decision procedure
that gives a yes/no answer and does not produce
lexical values.
%X also formalised derivatives and regular expressions, producing "parse trees".
%(Some person who's a big name in formal methods)
The variant of the problem we are looking at centres on
an algorithm (which we call $\blexer$) developed by Sulzmann and Lu \cite{Sulzmann2014}.
The reason we chose to look at $\blexer$ and its simplifications
is that it allows a lexical value to be generated
by an elegant and subtle procedure based on Brzozowski derivatives.
The procedure is made of recursive functions and inductive datatypes, just like derivatives,
allowing intuitive and concise formal reasoning with theorem provers.
Most importantly, $\blexer$ opens up a path to an optimised version,
which we call $\blexersimp$, that improves
performance with simplifications that aggressively change the structure of regular expressions.
This is in contrast to most derivative-based methods,
which rely on the structure of regular expressions being preserved
so that the induction in their correctness proofs goes through.
For example, Egolf et al.~\cite{Verbatim} have developed a verified lexer
based on derivatives, but as soon as they introduced
optimisations such as memoization, they reverted to constructing
DFAs first.
Edelmann \cite{Edelmann2020} explored similar optimisations in his
work on verified LL(1) parsing, with additional enhancements based on data structures such as
zippers.
%Sulzmann and Lu have worked out an algorithm
%that is especially suited for verification
%which utilized the fact
%that whenever ambiguity occurs one may duplicate a sub-expression and use
%different copies to describe different matching choices.
The idea behind the correctness of $\blexer$ is simple: during a derivative step,
multiple matches might be possible, in which case an alternative is created whose
children each correspond to a
different match. At the end of
the lexing process one always picks the leftmost alternative, which is guaranteed
to be a POSIX value.
This works because sub-regular expressions with longer submatches
are consistently kept
to the left of other copies within an alternative
(recall that POSIX values are, roughly, the values with the longest initial
submatch).
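As an illustration of this ordering, consider the sequence case of the
derivative (we assume here the standard definition, which is given precisely in
chapter \ref{Inj}):
\begin{center}
$(r_1 \cdot r_2) \backslash c \;\dn\; ((r_1 \backslash c) \cdot r_2) \,+\, (r_2 \backslash c)
\qquad \textit{provided $r_1$ is nullable}$
\end{center}
The left alternative continues the match inside $r_1$ and therefore corresponds
to a longer submatch for $r_1$; the right alternative gives up on $r_1$
(matching it with the empty string) and starts matching $r_2$.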
The idea behind the optimised version of $\blexer$, which we call $\blexersimp$,
is that since we only ever take the leftmost copy, all re-occurring copies can be
eliminated without losing the POSIX property; this can even be done for
children of alternatives at different levels by merging them together, as sketched below.
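The following Scala sketch conveys this idea of simplification on plain regular
expressions, without the bitcodes (it reuses the \texttt{Rexp} datatype and the
functions \texttt{der} and \texttt{nullable} from the earlier sketch; the actual
$\blexersimp$ of this thesis operates on bit-coded regular expressions and is
defined formally in chapter \ref{Bitcoded2}):
\begin{lstlisting}[style=myScalastyle]
// collect the children of (possibly nested) alternatives, dropping ZEROs
def flts(r: Rexp): List[Rexp] = r match {
  case ZERO        => Nil
  case ALT(r1, r2) => flts(r1) ++ flts(r2)
  case r1          => List(r1)
}

def simp(r: Rexp): Rexp = r match {
  case ALT(r1, r2) =>
    // simplify the children, flatten nested alternatives, and keep only
    // the leftmost occurrence of any duplicate child
    (flts(simp(r1)) ++ flts(simp(r2))).distinct match {
      case Nil => ZERO
      case rs  => rs.reduceRight(ALT(_, _))
    }
  case SEQ(r1, r2) => (simp(r1), simp(r2)) match {
    case (ZERO, _)  => ZERO
    case (_, ZERO)  => ZERO
    case (ONE, r2s) => r2s
    case (r1s, ONE) => r1s
    case (r1s, r2s) => SEQ(r1s, r2s)
  }
  case r1 => r1   // nothing is simplified under a star in this sketch
}

// the simplified matcher applies simp after every derivative step
def matcherSimp(r: Rexp, s: String): Boolean =
  nullable(s.foldLeft(r)((r1, c) => simp(der(c, r1))))
\end{lstlisting}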
Proving $\blexersimp$ correct requires a different
proof strategy from the one used by Ausaf \cite{FahadThesis}.
We introduce a rewriting relation, defined as an
inductive predicate, which captures
an invariant strong enough to ensure correctness
and which commutes with the derivative operation.
This predicate allows a simple
induction on the input string to go through.
%This idea has been repeatedly used in different variants of lexing
%algorithms in their paper, one of which involves bit-codes. The bit-coded
%derivative-based algorithm even allows relatively aggressive
%%simplification rules which cause
%structural changes that render the induction used in the correctness
%proofs unusable.
%More details will be given in \ref{Bitcoded2} including the
%new correctness proof which involves a new inductive predicate which allows
%rule induction to go through.
%first character is removed
%state of the automaton after matching that character
%where nodes are represented as
%a sub-expression (for example tagged NFA
%Working on regular expressions
%Because of these problems with automata, we prefer regular expressions
%and derivatives rather than an automata (or graph-based) approach which explicitly converts between
%the regular expression and a different data structure.
%
%
%The key idea
%Regular expressions are widely used in computer science:
%be it in text-editors \parencite{atomEditor} with syntax highlighting and auto-completion;
%command-line tools like $\mathit{grep}$ that facilitate easy
%text-processing \cite{grep}; network intrusion
%detection systems that inspect suspicious traffic; or compiler
%front ends.
%Given their usefulness and ubiquity, one would assume that
%modern regular expression matching implementations
%are mature and fully studied.
%Indeed, in many popular programming languages' regular expression engines,
%one can supply a regular expression and a string, and in most cases get
%get the matching information in a very short time.
%Those engines can sometimes be blindingly fast--some
%network intrusion detection systems
%use regular expression engines that are able to process
%hundreds of megabytes or even gigabytes of data per second \parencite{Turo_ov__2020}.
%However, those engines can sometimes also exhibit a surprising security vulnerability
%under a certain class of inputs.
%However, , this is not the case for $\mathbf{all}$ inputs.
%TODO: get source for SNORT/BRO's regex matching engine/speed
%----------------------------------------------------------------------------------------
\section{Contribution}
In this thesis,
we propose a solution to catastrophic
backtracking and error-prone matchers: a formally verified
regular expression lexing algorithm
that is both fast
and correct.
In particular, the main problem we solved on top of previous work was
to come up with a formally verified algorithm, called $\blexersimp$, based
on Brzozowski derivatives. It calculates a POSIX
lexical value from a string and a regular expression. The algorithm is originally
due to Sulzmann and Lu \cite{Sulzmann2014}, but we made the key observation that its $\textit{nub}$
function does not simplify intermediate results where it needs to, and improved the
algorithm accordingly.
We have proven that the internal data structures of $\blexersimp$ do not grow beyond a constant bound $N_r$
that depends only on the input regular expression $r$, thanks to the aggressive simplifications of $\blexersimp$:
\begin{theorem}
$|\blexersimp \; r \; s | \leq N_r$
\end{theorem}
The simplifications applied in each step of $\blexersimp$
keep the derivatives small, but present a
challenge when
establishing a correctness theorem of the following form:
\begin{theorem}
If the POSIX lexical value of matching regular expression $r$ with string $s$ is $v$,
then $\blexersimp\; r \; s = \Some \; v$. Otherwise
$\blexersimp \; r \; s = \None$.
\end{theorem}
The result is a regular expression lexing algorithm that comes with
\begin{itemize}
\item
an improved version of Sulzmann and Lu's bit-coded algorithm using
derivatives with simplifications,
accompanied by
a correctness proof with respect to the POSIX specification
given by Ausaf et al.\ \cite{AusafDyckhoffUrban2016},
\item
a complexity-related property of that algorithm stating that its
internal data structures
remain below a finite bound,
\item
and an extension to
bounded repetition constructs, with the correctness and finiteness properties
maintained.
\end{itemize}
\noindent
With a formal finiteness bound in place,
we can greatly reduce the attack surface of servers in terms of ReDoS attacks.
The Isabelle/HOL code for our formalisation can be
found at
\begin{center}
\url{https://github.com/hellotommmy/posix}
\end{center}
Further improvements to the algorithm with an even stronger version of
simplification are possible. We conjecture that the resulting size of derivatives
can then be bounded by a cubic function of the size of the regular expression.
We will give the relevant code in Scala,
but do not give a formal proof in Isabelle/HOL;
this is left as future work.
\section{Structure of the thesis}
Before discussing the formal proof of $\blexersimp$'s
correctness, which is the main contribution of this thesis,
we need to introduce two formal proofs
due to Ausaf et al.
In chapter \ref{Inj} we introduce the concepts
and notations we
use for describing regular expressions and derivatives,
and the first version of Sulzmann and Lu's lexing algorithm without bitcodes, including
its correctness proof.
We give their second lexing algorithm, with bitcodes, in chapter \ref{Bitcoded1},
together with the correctness proof by Ausaf and Urban.
Then, in chapter \ref{Bitcoded2}, we illustrate
how Sulzmann and Lu's
simplifications fail to simplify correctly, and we introduce our own version of the
algorithm with correct simplifications,
together with its correctness proof.
In chapter \ref{Finite} we give the second guarantee
of our bitcoded algorithm, namely a finite bound on the size of any
regular expression's derivatives.
We also show how the
algorithm can be extended to include bounded repetitions.
In chapter \ref{Cubic} we discuss stronger simplification rules which
improve the finite bound to a cubic bound. %and the NOT regular expression.
Chapter \ref{RelatedWork} discusses work related to this thesis.
Chapter \ref{Future} concludes and mentions avenues of future research.
%----------------------------------------------------------------------------------------
%----------------------------------------------------------------------------------------
%----------------------------------------------------------------------------------------
%----------------------------------------------------------------------------------------