
% Chapter 1

\chapter{Introduction} % Main chapter title

\label{Introduction} % For referencing the chapter elsewhere, use \ref{Chapter1} 

%----------------------------------------------------------------------------------------

% Define some commands to keep the formatting separated from the content 
\newcommand{\keyword}[1]{\textbf{#1}}
\newcommand{\tabhead}[1]{\textbf{#1}}
\newcommand{\code}[1]{\texttt{#1}}
\newcommand{\file}[1]{\texttt{\bfseries#1}}
\newcommand{\option}[1]{\texttt{\itshape#1}}

%boxes
\newcommand*{\mybox}[1]{\framebox{\strut #1}}

%\newcommand{\sflataux}[1]{\textit{sflat}\_\textit{aux} \, #1}
\newcommand\sflat[1]{\llparenthesis #1 \rrparenthesis }
\newcommand{\ASEQ}[3]{\textit{ASEQ}_{#1} \, #2 \, #3}
\newcommand{\bderssimp}[2]{#1 \backslash_{bsimps} #2}
\newcommand{\rderssimp}[2]{#1 \backslash_{rsimps} #2}
\def\derssimp{\textit{ders}\_\textit{simp}}
\def\rders{\textit{rders}}
\newcommand{\bders}[2]{#1 \backslash #2}
\newcommand{\bsimp}[1]{\textit{bsimp}(#1)}
\def\bsimps{\textit{bsimp}}
\newcommand{\rsimp}[1]{\textit{rsimp}\; #1}
\newcommand{\sflataux}[1]{\llparenthesis #1 \rrparenthesis'}
\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%
\newcommand{\denote}{\stackrel{\mbox{\scriptsize denote}}{=}}%
\newcommand{\ZERO}{\mbox{\bf 0}}
\newcommand{\ONE}{\mbox{\bf 1}}
\newcommand{\AALTS}[2]{\oplus {\scriptstyle #1}\, #2}
\newcommand{\rdistinct}[2]{\textit{rdistinct} \;\; #1 \;\; #2}
\def\rdistincts{\textit{rdistinct}}
\def\rDistinct{\textit{rdistinct}}
\newcommand\hflat[1]{\llparenthesis  #1 \rrparenthesis_*}
\newcommand\hflataux[1]{\llparenthesis #1 \rrparenthesis_*'}
\newcommand\createdByStar[1]{\textit{createdByStar}(#1)}
\def\cbn{\textit{createdByNtimes}}
\def\hpa{\textit{highestPowerAux}}
\def\hpower{\textit{highestPower}}
\def\ntset{\textit{ntset}}
\def\optermsimp{\textit{optermsimp}}
\def\optermOsimp{\textit{optermOsimp}}
\def\optermosimp{\textit{optermosimp}}
\def\opterm{\textit{opterm}}
\def\nString{\textit{nonemptyString}}

\newcommand\myequiv{\mathrel{\stackrel{\makebox[0pt]{\mbox{\normalfont\tiny equiv}}}{=}}}

\def\SEQ{\textit{SEQ}}
\def\SEQs{\textit{SEQs}}
\def\case{\textit{case}}
\def\sequal{\stackrel{\mbox{\scriptsize rsimp}}{=}}
\def\rsimpalts{\textit{rsimp}_{ALTS}}
\def\good{\textit{good}}
\def\btrue{\textit{true}}
\def\bfalse{\textit{false}}
\def\bnullable{\textit{bnullable}}
\def\bnullables{\textit{bnullables}}
\def\Some{\textit{Some}}
\def\None{\textit{None}}
\def\code{\textit{code}}
\def\decode{\textit{decode}}
\def\internalise{\textit{internalise}}
\def\lexer{\mathit{lexer}}
\def\mkeps{\textit{mkeps}}
\newcommand{\rder}[2]{#2 \backslash_r #1}

\def\rerases{\textit{rerase}}

\def\nonnested{\textit{nonnested}}
\def\AZERO{\textit{AZERO}}
\def\sizeNregex{\textit{sizeNregex}}
\def\AONE{\textit{AONE}}
\def\ACHAR{\textit{ACHAR}}

\def\simpsulz{\textit{simp}_{Sulz}}

\def\scfrewrites{\stackrel{*}{\rightsquigarrow_{scf}}}
\def\frewrite{\rightsquigarrow_f}
\def\hrewrite{\rightsquigarrow_h}
\def\grewrite{\rightsquigarrow_g}
\def\frewrites{\stackrel{*}{\rightsquigarrow_f}}
\def\hrewrites{\stackrel{*}{\rightsquigarrow_h}}
\def\grewrites{\stackrel{*}{\rightsquigarrow_g}}
\def\fuse{\textit{fuse}}
\def\bder{\textit{bder}}
\def\der{\textit{der}}
\def\POSIX{\textit{POSIX}}
\def\ALTS{\textit{ALTS}}
\def\ASTAR{\textit{ASTAR}}
\def\DFA{\textit{DFA}}
\def\NFA{\textit{NFA}}
\def\bmkeps{\textit{bmkeps}}
\def\bmkepss{\textit{bmkepss}}
\def\retrieve{\textit{retrieve}}
\def\blexer{\textit{blexer}}
\def\flex{\textit{flex}}
\def\inj{\textit{inj}}
\def\Empty{\textit{Empty}}
\def\Left{\textit{Left}}
\def\Right{\textit{Right}}
\def\Stars{\textit{Stars}}
\def\Char{\textit{Char}}
\def\Seq{\textit{Seq}}
\def\Alt{\textit{Alt}}
\def\Der{\textit{Der}}
\def\Ders{\textit{Ders}}
\def\nullable{\mathit{nullable}}
\def\Z{\mathit{Z}}
\def\S{\mathit{S}}
\def\rup{r^\uparrow}
%\def\bderssimp{\mathit{bders}\_\mathit{simp}}
\def\distinctWith{\textit{distinctWith}}
\def\lf{\textit{lf}}
\def\PD{\textit{PD}}
\def\suffix{\textit{Suffix}}
\def\distinctBy{\textit{distinctBy}}
\def\starupdate{\textit{starUpdate}}
\def\starupdates{\textit{starUpdates}}
\def\nupdate{\textit{nupdate}}
\def\nupdates{\textit{nupdates}}


\def\size{\mathit{size}}
\def\rexp{\mathbf{rexp}}
\def\simp{\mathit{simp}}
\def\simpALTs{\mathit{simp}\_\mathit{ALTs}}
\def\map{\mathit{map}}
\def\distinct{\mathit{distinct}}
\def\blexersimp{\mathit{blexer}\_\mathit{simp}}
\def\blexerStrong{\textit{blexerStrong}}
\def\bsimpStrong{\textit{bsimpStrong}}
\def\bdersStrongs{\textit{bdersStrong}}
\newcommand{\bdersStrong}[2]{#1 \backslash_{bsimpStrongs} #2}

\def\map{\textit{map}}
\def\rrexp{\textit{rrexp}}
\newcommand\rnullable[1]{\textit{rnullable} \; #1 }
\newcommand\rsize[1]{\llbracket #1 \rrbracket_r}
\newcommand\asize[1]{\llbracket #1 \rrbracket}
\newcommand\rerase[1]{ (#1)_{\downarrow_r}}

\newcommand\ChristianComment[1]{\textcolor{blue}{#1}\\}


\def\rflts{\textit{rflts}}
\def\rrewrite{\textit{rrewrite}}
\def\bsimpalts{\textit{bsimp}_{ALTS}}
\def\bsimpaseq{\textit{bsimp}_{ASEQ}}
\def\rsimlalts{\textit{rsimp}_{ALTs}}
\def\rsimpseq{\textit{rsimp}_{SEQ}}

\def\erase{\textit{erase}}
\def\STAR{\textit{STAR}}
\def\flts{\textit{flts}}


\def\zeroable{\textit{zeroable}}
\def\nub{\textit{nub}}
\def\filter{\textit{filter}}
%\def\not{\textit{not}}



\def\RZERO{\mathbf{0}_r }
\def\RONE{\mathbf{1}_r}
\newcommand\RCHAR[1]{\mathbf{#1}_r}
\newcommand\RSEQ[2]{#1 \cdot #2}
\newcommand\RALTS[1]{\sum #1}
\newcommand\RSTAR[1]{#1^*}
\newcommand\vsuf[2]{\textit{Suffix} \;#1\;#2}




\lstdefinestyle{myScalastyle}{
  frame=tb,
  language=scala,
  aboveskip=3mm,
  belowskip=3mm,
  showstringspaces=false,
  columns=flexible,
  basicstyle={\small\ttfamily},
  numbers=none,
  numberstyle=\tiny\color{gray},
  keywordstyle=\color{blue},
  commentstyle=\color{dkgreen},
  stringstyle=\color{mauve},
  frame=single,
  breaklines=true,
  breakatwhitespace=true,
  tabsize=3,
}


%----------------------------------------------------------------------------------------
%This part is about regular expressions, Brzozowski derivatives,
%and a bit-coded lexing algorithm with proven correctness and time bounds.
% \marginpar{Totally rewritten introduction}
% Formal proofs, 


%TODO: look up snort rules to use here--give readers idea of what regexes look like

\marginpar{rephrasing following Christian comment}
Regular expressions, since their inception in the 1950s
\cite{RegularExpressions}, 
have been subject to extensive study and implementation. 
Their primary application lies in text processing--finding
matches and identifying patterns in a string.
%It is often used to match strings that comprises of numerous fields, 
%where certain fields may recur or be omitted. 
For example, a simple regular expression that tries 
to recognise email addresses is
\marginpar{rephrased from "the regex for recognising" to "a simple regex that tries to match email"}
\begin{center}
\verb|[a-z0-9._]+@[a-z0-9.-]+\.[a-z]{2,6}|
%$[a-z0-9._]^+@[a-z0-9.-]^+\.[a-z]{2,6}$.
\end{center}
\marginpar{Simplified example, but the distinction between . and escaped . is correct
and therefore left unchanged. Also verbatim package does not straightforwardly support superscripts so + kept as they are.}
%Using this, regular expression matchers and lexers are able to extract 
%the domain names by the use of \verb|[a-zA-Z0-9.-]+|. 
\marginpar{Rewrote explanation for the expression.}
This expression assumes that all letters in the email address have been converted to lower-case.
The local part is recognised by the first 
bracketed expression $[a-z0-9.\_]^+$,
where the
operator ``+'' (to be viewed as a superscript) 
requires a non-empty string
made up of alpha-numeric characters, dots and underscores.
After the ``@'' sign
comes the sub-expression that recognises the domain of the email,
where $[a-z]^{\{2, 6\}}$ specifically
matches the top-level domain, such as ``org'', ``com'' or ``uk''.
The counters $2, 6$ in the superscript
specify that a top-level domain
must have between two and six characters.
This regular expression does not capture all possible email addresses 
(e.g.~those whose local part contains ``-'' cannot be recognised), but patterns
of a similar shape, using roughly the same syntax, appear
everywhere in daily computing,
% Indeed regular expressions are used everywhere 
% in a computer user's daily life--for example searching for a string
% in a text document using ctrl-F.
% When such regex searching 
% a useful and simple way to handle email-processing when the task does
% not require absolute precision. 
%The same can be said about
%applications like .
%Consequently, they are an indispensible components in text processing tools 
%of software systems such as compilers, IDEs, and firewalls.
% The picture gets more blurry when we start to ask questions like 
% why 
%TODO: Insert corresponding bibliography
%There are many more scenarios in which regular exp% %use cases for regular expressions in computer science,
for example in compilers \cite{LEX}, networking \cite{Snort1999}, 
software engineering (IDEs) \cite{Atom}
and operating systems \cite{GREP}, where the correctness
of matching can be crucially important.

Implementations of regular expression matching in the wild, however, 
are surprisingly unreliable.
%are not always reliable as %people think
%they
%should be. %Given the importance of the correct functioning
% Indeed they have been heavily relied on and
% extensively implemented and studied in modern computer systems--whenever you do
% a 
%TODO: double check this is true
An example is the regular expression $(a^*)^*b$ together with strings
of the form $aa\ldots a$. One would expect any regex engine to
determine in no time that there is no match.
However, if this pair 
is tried out in a
regex engine like that of Java 8, the runtime quickly grows beyond 100 seconds
with just a few dozen characters in the string.
Such behaviour enables regular expression Denial-of-Service (ReDoS) attacks
with minimal resources--just a pair of ``evil'' 
regular expression and string.
The outage of the web service provider Cloudflare \cite{Cloudflare} 
in 2019 \cite{CloudflareOutage} is a recent example
where such an issue caused serious negative impact.
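
The blow-up described above is easy to observe for oneself. The following
small Scala snippet is a sketch we add for illustration; it calls the JVM's
standard \texttt{java.util.regex} engine (the backtracking matcher used by
Java) and measures the time of the failing match on increasingly long strings
of $a$'s:

\begin{lstlisting}[style=myScalastyle]
import java.util.regex.Pattern

@main def timeEvilRegex(): Unit = {
  val evil = Pattern.compile("(a*)*b")
  for (n <- 5 to 30 by 5) {
    val input = "a" * n              // no 'b', so the overall match must fail
    val start = System.nanoTime()
    evil.matcher(input).matches()    // the backtracking search happens here
    val secs  = (System.nanoTime() - start) / 1e9
    println(s"n = $n: $secs seconds")
    // the last iterations may take a very long time on a backtracking engine
  }
}
\end{lstlisting}

\noindent
On a typical machine the reported times grow exponentially with $n$, which is
exactly the behaviour described above.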


The reason why these regex matching engines become so slow
is that they use backtracking, essentially a depth-first search (DFS) of the 
search space of possible matches. They employ backtracking algorithms in order to 
support back-references, a mechanism that allows one to express
languages which repeat an arbitrary string, 
such as 
$\{ww \mid w\in \Sigma^*\}$. Such constructs make matching
NP-complete, for which no non-backtracking algorithms are known.
The regex engines of more modern programming languages such as
Rust and Go do promise linear-time behaviour 
with respect to the input string,
but they do not support back-references, and often 
impose ad-hoc restrictions on the input patterns.
More importantly, these promises are purely empirical,
making them prone to future ReDoS attacks and other kinds of errors.

The other way in which industrial regex engines are unreliable
is that they do not always produce the desired output. 
Kuklewicz found during his testing
of practical regex engines that almost all of them are incorrect with respect
to the POSIX standard, the disambiguation strategy most
widely adopted in computer science. Roughly speaking, POSIX lexing means to always
go for the longest initial submatch possible; 
for instance, a single iteration
$aa$ is preferred over two iterations $a,a$ 
when matching $(a|aa)^*$ against the string $aa$.
The POSIX strategy as summarised by Kuklewicz goes
as follows:
\marginpar{\em Deleted some peripheral specifications.}
\begin{quote}
	\begin{itemize}
		\item
regular expressions (REs) take the leftmost starting match, and the longest match starting there\\
\item
earlier subpatterns have leftmost-longest priority over later subpatterns\\
\item
higher-level subpatterns have leftmost-longest priority over their component subpatterns\\
\item
	$\ldots$
%REs have right associative concatenation which can be changed with parenthesis\\
%\item
%parenthesized subexpressions return the match from their last usage\\
%\item
%text of component subexpressions must be contained in the text of the 
%higher-level subexpressions\\
%\item
%if "p" and "q" can never match the same text then "p|q" and "q|p" are equivalent, up to trivial renumbering of captured subexpressions\\
%\item
%if "p" in "p*" is used to capture non-empty text then additional repetitions of "p" will not capture an empty string\\
\end{itemize}
\end{quote}
Kuklewicz noted that the various lexers that come with POSIX operating systems
are rarely correct according to this standard.
A test case that exposes such bugs 
is the regular expression $(aba|ab|a)^*$ together with the string $ababa$.
A correct POSIX match splits the string into $ab, aba$, involving
two repetitions of the Kleene star $(aba|ab|a)^*$.
Most regex engines instead return two (partial) matches
$aba$ and $a$\footnote{Try it out here: \url{https://regex101.com/r/c5hga5/1}, last accessed 22-Aug-2023}.
Programmers have repeatedly noted the subtlety and
difficulty of implementing POSIX correctly; one such comment comes from the author of Go's regexp library:%
\footnote{\url{https://pkg.go.dev/regexp\#pkg-overview}, last accessed 22-Aug-2023}
\begin{quote}\it
``
The POSIX rule is computationally prohibitive
and not even well-defined.
''
\end{quote}
% Being able to formally define and capture the idea of 
% POSIX rules and prove 
% the correctness of regular expression matching/lexing 
% algorithms against the POSIX semantics definitions
% is valuable.
All this points towards a formal treatment of 
POSIX lexing in order to clear up inaccuracies and errors
in the understanding and implementation of regular expressions. The work presented
in this thesis uses formal proofs to establish the correctness 
and runtime properties
of POSIX regular expression lexing.

Formal proofs, or mechanised proofs, are
computer-checked proof scripts
that certify the correctness of facts
with respect to a set of axioms and definitions.
% This is done by 
% recursively checking that every fact in a proof script 
% is either an axiom or a fact that is derived from
% known axioms or verified facts.
%The problem of regular expressions lexing and catastrophic backtracking are a suitable problem for such
%methods as their implementation and complexity analysis tend to be error-prone.
They provide an unprecedented level of assurance
that an algorithm will perform as expected under all inputs.
We believe such proofs can help eliminate catastrophic backtracking
once and for all.
The software systems that help people interactively build and check
formal proofs are called proof assistants.
% Many  theorem-provers have been developed, such as Mizar,
% Isabelle/HOL, HOL-Light, HOL4,
% Coq, Agda, Idris, Lean and so on.
Isabelle/HOL \cite{Isabelle} is a widely used proof assistant based on a simple type theory,
with powerful proof automation such as Sledgehammer.
We chose Isabelle/HOL for this automation
and for the ease and simplicity with which regular expressions and 
regular languages can be expressed in it.
%Some of those employ
%dependent types like Mizar, Coq, Agda, Lean and Idris.
%Some support a constructivism approach, such as Coq.


% Formal proofs on regular expression matching and lexing
% complements the efforts in
% industry which tend to focus on overall speed
% with techniques like parallelization (FPGA paper), tackling 
% the problem of catastrophic backtracking 
% in an ad-hoc manner (cloudflare and stackexchange article).
The algorithm we work on is based on Brzozowski derivatives.
Brzozowski invented the notion of ``derivatives'' on 
regular expressions \cite{Brzozowski1964}, and the idea is
that we can reason about what regular expressions can match
by taking derivatives of them. 
A derivative operation takes a regular expression $r$ and a character
$c$,
and returns a new regular expression $r\backslash c$.
The derivative is taken with respect to $c$:
it transforms $r$ in such a way that the resulting
$r\backslash c$ contains all strings in the language of $r$ that
start with $c$, but with the head character $c$ removed.
For example, for $r$ equal to $(aba | ab | a)^*$
as discussed earlier, its derivative with respect to character $a$ is
\[
  r\backslash a = (ba | b| \ONE)(aba | ab | a)^*.
\]
% By applying derivatives repeatedly one can c
Taking derivatives repeatedly with respect to the sequence
of characters in a string $s$, one obtains a regular expression
that contains the information of how $r$ matched $s$.

Brzozowski derivatives are purely algebraic operations
that can be implemented as a recursive function
that does a pattern match on the structure of the regular expression.
For example, the derivatives of character regular expressions,
Kleene star and bounded repetitions can be described by the following
code equations:
% For example some clauses
\begin{center}
	\begin{tabular}{lcl}
    $d \backslash c$     & $\dn$ & 
		$\mathit{if} \;c = d\;\mathit{then}\;\ONE\;\mathit{else}\;\ZERO$\\
% $(r_1 + r_2)\backslash c$     & $\dn$ & $r_1 \backslash c \,+\, r_2 \backslash c$\\
% $(r_1 \cdot r_2)\backslash c$ & $\dn$ & $\mathit{if} \, [] \in L(r_1)$\\
% 	&   & $\mathit{then}\;(r_1\backslash c) \cdot r_2 \,+\, r_2\backslash c$\\
% 	&   & $\mathit{else}\;(r_1\backslash c) \cdot r_2$\\
	$(r^*)\backslash c$           & $\dn$ & $(r\backslash c) \cdot r^*$\\
		$r^{\{n\}} \backslash c$     & $\dn$ & $(r \backslash c) \cdot
		r^{\{n-1\}} \quad (\text{if} \; n > 0)$\\
	\end{tabular}
\end{center}
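To give a taste of how directly these equations translate into code, below is
a minimal Scala sketch of a regular expression datatype together with the
derivative function (the names are ours and chosen for illustration only;
the verified definitions in later chapters are stated in Isabelle/HOL and
include the bit-coded versions):

\begin{lstlisting}[style=myScalastyle]
// a simple regular expression datatype
abstract class Rexp
case object ZERO extends Rexp                    // matches no string
case object ONE extends Rexp                     // matches only the empty string
case class CHAR(c: Char) extends Rexp
case class ALT(r1: Rexp, r2: Rexp) extends Rexp
case class SEQ(r1: Rexp, r2: Rexp) extends Rexp
case class STAR(r: Rexp) extends Rexp
case class NTIMES(r: Rexp, n: Int) extends Rexp  // bounded repetition r{n}

// does r match the empty string?
def nullable(r: Rexp): Boolean = r match {
  case ZERO => false
  case ONE => true
  case CHAR(_) => false
  case ALT(r1, r2) => nullable(r1) || nullable(r2)
  case SEQ(r1, r2) => nullable(r1) && nullable(r2)
  case STAR(_) => true
  case NTIMES(r1, n) => n == 0 || nullable(r1)
}

// the derivative of r with respect to the character c
def der(c: Char, r: Rexp): Rexp = r match {
  case ZERO => ZERO
  case ONE => ZERO
  case CHAR(d) => if (c == d) ONE else ZERO
  case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))
  case SEQ(r1, r2) =>
    if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
    else SEQ(der(c, r1), r2)
  case STAR(r1) => SEQ(der(c, r1), STAR(r1))
  case NTIMES(r1, n) =>
    if (n == 0) ZERO else SEQ(der(c, r1), NTIMES(r1, n - 1))
}

// derivative with respect to a whole string, character by character
def ders(s: List[Char], r: Rexp): Rexp =
  s.foldLeft(r)((r1, c) => der(c, r1))
\end{lstlisting}

\noindent
The clauses for alternatives and sequences are the standard ones and are
spelled out formally in chapter \ref{Inj}.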
In particular, we build on and improve the work
by Ausaf et al.~\cite{Ausaf,AusafDyckhoffUrban2016} 
and Sulzmann and Lu \cite{Sulzmann2014}.


The variant of the problem we are looking at centres around
an algorithm (which we call $\blexer$) developed by Sulzmann and Lu \cite{Sulzmann2014}.
The reason we chose to look at $\blexer$ and its simplified versions 
is that it generates a lexical value
by an elegant and subtle procedure based on Brzozowski derivatives.
The procedure is made of recursive functions and inductive datatypes, just like derivatives themselves, 
allowing intuitive and concise formal reasoning with theorem provers.
Most importantly, $\blexer$ opens up a path to an optimised version,
$\blexersimp$, which improves
performance by simplifications that aggressively change the structure of regular expressions,
whereas most derivative-based methods
rely on the structure being maintained so that the induction
goes through.
For example, Egolf et al.~\cite{Verbatim} have developed a verified lexer
based on derivatives, but as soon as they introduced
optimisations such as memoization, they reverted to constructing
DFAs first.
Edelmann \cite{Edelmann2020} explored similar optimisations in his
work on verified LL(1) parsing, with additional enhancements based on data structures such as
zippers.

%Sulzmann and Lu have worked out an algorithm
%that is especially suited for verification
%which utilized the fact
%that whenever ambiguity occurs one may duplicate a sub-expression and use
%different copies to describe different matching choices.
The idea behind the correctness of $\blexer$ is simple: during a derivative step,
multiple matches might be possible, in which case an alternative with multiple children,
each corresponding to a 
different match, is created. At the end of
the lexing process one always picks the leftmost alternative, which is guaranteed 
to be a POSIX value.
This works because sub-expressions with longer submatches are consistently kept
to the left of other copies in an alternative
(remember that POSIX values are roughly the values with the longest initial
submatch).
The idea behind the optimised version of $\blexer$, which we call $\blexersimp$,
is that since we only ever take the leftmost copy, all duplicate copies can be
eliminated without losing the POSIX property; this can be done even for
children of alternatives at different levels by merging them together.
Proving $\blexersimp$ correct, however, requires a different
proof strategy from the one used by Ausaf \cite{FahadThesis}.
We introduce a rewriting relation, defined as an
inductive predicate, which captures 
an invariant strong enough to ensure correctness
and which commutes with the derivative operation. 
This predicate allows a simple
induction on the input string to go through.
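
The following Scala sketch illustrates the simplification idea just described
(it reuses the $\textit{Rexp}$ datatype from the earlier sketch; the names
$\textit{flts}$ and $\textit{distinctRs}$ are ours, and the verified
simplifications in chapter \ref{Bitcoded2} operate on bit-annotated regular
expressions and are more involved). It works on the list of children of an
alternative: nested alternatives are flattened into a single list, and later
duplicates are removed so that only the leftmost copy survives:

\begin{lstlisting}[style=myScalastyle]
// flatten nested alternatives and drop 0s among the children of an alternative
def flts(rs: List[Rexp]): List[Rexp] = rs.flatMap {
  case ZERO => Nil                        // 0 can never match, drop it
  case ALT(r1, r2) => flts(List(r1, r2))  // pull children of nested alternatives up
  case r => List(r)
}

// keep only the leftmost occurrence of each child, drop later duplicates
def distinctRs(rs: List[Rexp], seen: Set[Rexp] = Set()): List[Rexp] =
  rs match {
    case Nil => Nil
    case r :: rest =>
      if (seen(r)) distinctRs(rest, seen)
      else r :: distinctRs(rest, seen + r)
  }
\end{lstlisting}

\noindent
Because only the leftmost copy is ever used to build the final value,
removing the later duplicates does not change the POSIX result, but it keeps
the derivatives from growing unboundedly.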

%This idea has been repeatedly used in different variants of lexing
%algorithms in their paper, one of which involves bit-codes. The bit-coded
%derivative-based algorithm even allows relatively aggressive 
%%simplification rules which cause
%structural changes that render the induction used in the correctness
%proofs unusable.
%More details will be given in \ref{Bitcoded2} including the
%new correctness proof which involves a new inductive predicate which allows 
%rule induction to go through.


\marginpar{20230821 Addition: high-level idea in context}
To summarise, we expect modern regex matching and lexing engines
to be reliable, fast and correct, and to support rich syntactic constructs
like bounded repetitions and back-references.
Backtracking regex engines exhibit exponential 
worst-case behaviour (catastrophic backtracking)
because they employ a depth-first search on the search tree of possible matches,
and complexity analyses of them also take worst-case exponential 
time (ref Minamide and Birmingham work).
%Expand notes: JFLEX generates DFAs, cannot support backrefs
%Expand notes: Java 8, python supports pcre, but is exponential
%Expand notes: more modern prog langs like GO, Rust claims linear, and forbids large counters and backrefs
%The work described in this thesis is part of an ongoing
%project which aims to build a formally verified lexer satisfying all these
To ensure the correctness and predictable performance of lexical analysis, 
we propose to
build, in a bottom-up manner using Brzozowski derivatives, a formally verified lexer 
that satisfies both the correctness and the non-backtracking requirement.
We build on the line of work by Ausaf et al.
A derivative-based matching algorithm avoids backtracking by explicitly
representing all possible matches on the same level of the search tree 
as regular expression terms,
in a breadth-first manner.
The efficiency of such algorithms relies on limiting the size
of the derivatives during the computation, for example by
eliminating redundant 
terms that lead to identical matches. 


The end goal is a lexer that comes with formal guarantees
on its computational complexity and whose actual speed is competitive with
unverified regex engines.
We report in the next section the outcome of this project
so far and its contribution with respect to other work in
lexical analysis and theorem proving.


Derivatives of regular expressions are popular in the theorem-proving community
because their algebraic nature is amenable to formal reasoning.
As a testament to this, there exists a sizable number of formalisations of Brzozowski derivatives and
regular expressions in different theorem provers.
%Expand notes: verified lexers: proof-of-concept: Ruberio, Egolf, fast but no formal proof in a proof assistant: Magnus, 



% The study of regular expressions is ongoing due to an
% issue known as catastrophic backtracking. 
% This phenomenon refers to scenarios in which the regular expression 


% One cause of catastrophic backtracking lies within the 
% ambiguities of lexing.\marginpar{rephrased "the origin of catastrophic ...} 
% In the process of matching a multi-character string with 
% a regular expression that encompasses several sub-expressions, 
% different positions can be designated to mark 
% the boundaries of corresponding substrings of the sub-expressions. 
% For instance, in matching the string $aaa$ with the 
% regular expression $(a+aa)^*$, the divide between 
% the initial match and the subsequent iteration could either be 
% set between the first and second characters ($a | aa$) or 
% between the second and third characters ($aa | a$). 
% As both the length of the input string and the structural complexity 
% of the regular expression increase, the number of potential delimiter 
% combinations can grow exponentially, leading to a corresponding increase 
% in complexity for algorithms that do not optimally navigate these possibilities.

% Catastrophic backtracking facilitates a category of computationally inexpensive attacks known as Regular Expression Denial of Service (ReDoS) attacks. Here, an attacker merely dispatches a small attack string to a server, provoking high-complexity behaviours in the server's regular expression engine. Such attacks, whether intentional or accidental, have led to the failure of real-world systems (additional details can be found in the practical overview section in chapter \ref{Introduction}).

% Various disambiguation strategies are 
% employed to select sub-matches, notably including Greedy and POSIX strategies. POSIX, the strategy most widely adopted in practice, invariably opts for the longest possible sub-match. Kuklewicz \cite{KuklewiczHaskell}, for instance, offers a descriptive definition of the POSIX rule in section 1, last paragraph:


%Regular expressions 
%have been extensively studied and
%implemented since their invention in 1940s.
%It is primarily used in lexing, where an unstructured input string is broken
%down into a tree of tokens.
%That tree's construction is guided by the shape of the regular expression.
%This is particularly useful in expressing the shape of a string
%with many fields, where certain fields might repeat or be omitted.
%Regular expression matchers and Lexers allow us to 
%identify and delimit different subsections of a string and potentially 
%extract information from inputs, making them
%an indispensible component in modern software systems' text processing tasks
%such as compilers, IDEs, and firewalls.
%Research on them is far from over due to the well-known issue called catastrophic-backtracking,
%which means the regular expression matching or lexing engine takes an unproportional time to run 
%despite the input and the expression being relatively simple.
%
%Catastrophic backtracking stems from the ambiguities of lexing: 
%when matching a multiple-character string with a regular 
%exression that includes serveral sub-expressions, there might be different positions to set 
%the border between sub-expressions' corresponding sub-strings.
%For example, matching the string $aaa$ against the regular expression
%$(a+aa)^*$, the border between the initial match and the second iteration
%could be between the first and second character ($a | aa$) 
%or between the second and third character ($aa | a$).
%As the size of the input string and the structural complexity 
%of the regular expression increase,
%the number of different combinations of delimiters can grow exponentially, and
%algorithms that explore these possibilities unwisely will also see an exponential complexity.
%
%Catastrophic backtracking allow a class of computationally inexpensive attacks called
%Regular expression Denial of Service attacks (ReDoS), in which the hacker 
%simply sends out a small attack string to a server, 
%triggering high-complexity behaviours in its regular expression engine. 
%These attacks, be it deliberate or not, have caused real-world systems to go down (see more 
%details of this in the practical overview section in chapter \ref{Introduction}).
%There are different disambiguation strategies to select sub-matches, most notably Greedy and POSIX.
%The widely adopted strategy in practice is POSIX, which always go for the longest sub-match possible.
%There have been prose definitions like the following
%by Kuklewicz \cite{KuklewiczHaskell}: 
%described the POSIX rule as (section 1, last paragraph):





%first character is removed 
%state of the automaton after matching that character 
%where nodes are represented as 
%a sub-expression (for example tagged NFA
%Working on regular expressions 
%Because of these problems with automata, we prefer regular expressions
%and derivatives rather than an automata (or graph-based) approach which explicitly converts between
%the regular expression and a different data structure.
%
%
%The key idea 



%Regular expressions are widely used in computer science: 
%be it in text-editors \parencite{atomEditor} with syntax highlighting and auto-completion;
%command-line tools like $\mathit{grep}$ that facilitate easy 
%text-processing \cite{grep}; network intrusion
%detection systems that inspect suspicious traffic; or compiler
%front ends.
%Given their usefulness and ubiquity, one would assume that
%modern regular expression matching implementations
%are mature and fully studied.
%Indeed, in many popular programming languages' regular expression engines, 
%one can supply a regular expression and a string, and in most cases get
%get the matching information in a very short time.
%Those engines can sometimes be blindingly fast--some 
%network intrusion detection systems
%use regular expression engines that are able to process 
%hundreds of megabytes or even gigabytes of data per second \parencite{Turo_ov__2020}.
%However, those engines can sometimes also exhibit a surprising security vulnerability
%under a certain class of inputs.
%However, , this is not the case for $\mathbf{all}$ inputs.
%TODO: get source for SNORT/BRO's regex matching engine/speed


%----------------------------------------------------------------------------------------
\section{Contribution}
%{\color{red} \rule{\linewidth}{0.5mm}}
%\textbf{issue with this part: way too short, not enough details of what I have done.}
 %{\color{red} \rule{\linewidth}{0.5mm}}
\marginpar{\em Gerog: No sufficient context on related work making contribution hard to digest.}


\marginpar{introducing formalisations on regex matching}
There have been many formalisations of regular expressions and lexing
in the theorem-proving community.
One flavour is to build an automaton from the regular expression, and to determine
acceptance in terms of the resulting 
state after executing the input string on that automaton.
Automata formalisations are in general harder and more cumbersome to deal
with in theorem provers than working directly on regular expressions.
One such example is by Nipkow \cite{Nipkow1998}.
%They 
%made everything recursive (for example the next state function),
As a core idea, he
used a list of booleans to name each state, so that 
after composing sub-automata, renaming the states to keep
them distinct remains recursive and simple.
Even so, obvious lemmas required
``a painful amount of detail'' in his formalisation.
Sometimes automata are represented as graphs. 
But graphs are not inductive datatypes.
Having to base the induction principle on the number of nodes
in a graph makes formal reasoning non-intuitive and convoluted,
resulting in large formalisations \cite{Lammich2012}. 
When combining two graphs, one also needs to make sure that the nodes in
both graphs are distinct, which almost always involves
renaming the nodes.
A theorem prover which provides dependent types, such as Coq, 
can alleviate the issue of representing graph nodes
\cite{Doczkal2013}. There the representation of nodes is made
easier by the use of $\textit{FinType}$.
Another representation for automata is matrices,
but induction over them is not straightforward either.
There are also more clever representations, for example one by Paulson 
using hereditarily finite sets \cite{Paulson2015},
where the problem of combining graphs can be handled better.
%but we believe that such clever tricks are not very obvious for 
%the John-average-Isabelle-user.

The alternative approach, which operates directly on regular expressions, circumvents the problem of
converting between a regular expression and an automaton, thereby avoiding the representation
problems altogether, even though regular expressions may be seen as a variant of
non-deterministic finite automata (ref Laurikari tagged NFA paper).
To match a string, a sequence of algebraic transformations called 
(Brzozowski) $\textit{derivatives}$ (ref Brzozowski) is carried out on the regular expression.
Each derivative takes a character and a regular expression, 
and returns a new regular expression whose language is closely related to 
the language of the original regular expression:
strings prefixed with the input character have their head removed,
and strings not prefixed
with that character are eliminated. 
Once derivatives have been taken with respect to all characters, the string is
exhausted. Then an algorithm checks whether the empty string is in the final 
regular expression's language.
If so, a match exists and the string is in the language of the input regular expression.
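
Reusing the $\textit{Rexp}$ datatype and the functions $\nullable$ and
$\textit{ders}$ from the sketch in the introduction, the whole matching
procedure just described can be phrased as a one-liner (again only a sketch,
not the bit-coded lexer verified in this thesis):

\begin{lstlisting}[style=myScalastyle]
// a string matches r iff the derivative w.r.t. all its characters is nullable
def matches(r: Rexp, s: String): Boolean =
  nullable(ders(s.toList, r))

// for example, matches(SEQ(CHAR('a'), STAR(CHAR('b'))), "abb") yields true
\end{lstlisting}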

Again, this process can be seen as the simulation of an NFA running on a string,
but the recursive nature of the datatypes and functions involved makes
derivatives a perfect object of study for theorem provers.
That is why there have been numerous formalisations of regular expressions
and Brzozowski derivatives in the functional programming and 
theorem-proving communities (a large list of refs to derivatives formalisation publications).
Ribeiro and Du Bois \cite{RibeiroAgda2017} have 
formalised the notion of bit-coded regular expressions
and proved their relations with simple regular expressions in
the dependently-typed proof assistant Agda.
They also proved the soundness and completeness of a matching algorithm
based on the bit-coded regular expressions. Their algorithm, however, is a decision procedure
that only gives a Yes/No answer and does not produce
lexical values.
%X also formalised derivatives and regular expressions, producing "parse trees".
%(Some person who's a big name in formal methods)



%In this thesis,
%we propose a solution to catastrophic
%backtracking and error-prone matchers: a formally verified
%regular expression lexing algorithm
%that is both fast
%and correct. 
%%{\color{red} \rule{\linewidth}{0.5mm}}
%\HandRight Added content:
%Package \verb`pifont`: \ding{43}
%Package \verb`utfsym`: \usym{261E} 
%Package \verb`dingbat`: \leftpointright 
%Package \verb`dingbat`: \rightpointright 
We have made two main contributions in this thesis:
proving that the lexer $\blexersimp$ is both i) correct and ii) fast.
It is correct w.r.t.~the formalisation of POSIX lexing by Ausaf et al.~\cite{AusafDyckhoffUrban2016}.
It is fast compared with unoptimised implementations such as Sulzmann and Lu's original
development, in the sense that, by our metric, its internal data structures do not grow unboundedly.
\subsection{Theorem Proving with Derivatives}

Ausaf et al.~\cite{AusafDyckhoffUrban2016}
were the first to
give a reasonably simple formalised POSIX
specification in Isabelle/HOL, and also to prove 
that their specification coincides with an earlier (unformalised) 
POSIX specification given by Okui and Suzuki \cite{Okui10}.
They then formally proved the correctness of
a lexing algorithm by Sulzmann and Lu \cite{Sulzmann2014}
with regard to that specification.
They also found that the informal POSIX
specification by Sulzmann and Lu needed to be substantially 
revised for the correctness proof to go through;
according to Ausaf et al., Sulzmann and Lu's original specification and proof were unfixable.



\subsection{Complexity Results}
Our formalisation of complexity is unique among similar works in the sense that,
to our knowledge,
there are no other certified 
lexing/parsing algorithms with comparable size-bound theorems for their internal data structures.
Common practice is either an empirical analysis of the complexity of the algorithm
in question (\cite{Verbatim}, \cite{Verbatimpp}), or reliance 
on prior (unformalised) complexity analyses of well-known algorithms (\cite{ValiantParsing}),
both of which are prone to complexity bugs.

%TODO: add citation
%, for example in the Verbatim work \cite{Verbatim}

%While formalised proofs have not included 
%Issues known as "performance bugs" can
Whilst formalised complexity theorems 
have not yet appeared in other certified lexers/parsers,
they do appear in the broader theorem-proving literature:
\emph{time credits} have been formalised for separation logic in Coq 
\cite{atkey2010amortised}
and used to characterise the runtime complexity of union-find.
%where integer values are recorded %from the beginning of an execution
%as part of the program state
%and decremented in each step. 
The idea is that the total number of instructions executed 
is equal to the time credits consumed
during the execution of an algorithm.
%each time a basic step is executed. 
%The way to use a time credit Time credit is an integer
%is assigned to a program prior to execution, and each step in the program consumes
%one credit.
Arma{\"e}l et al. \cite{bigOImperative} have extended the framework to allow expressing time
credits using big-O notations,
so one can prove both the functional correctness and asymptotic complexity
of higher-order imperative algorithms.
Formal complexity proofs also exist for some other
algorithms in Coq \cite{subtypingFormalComplexity}.

Big-O notation has also been formalised in Isabelle
\cite{bigOIsabelle}.
\marginpar{more work on formalising complexity}
Our work focuses on the space complexity of the algorithm, measured by our notion of the size of 
a regular expression.
Although it is not a direct asymptotic time-complexity proof,
our result is an important stepping stone towards one.



Brzozowski showed that there are only finitely many dissimilar derivatives, 
where similarity is defined in terms of the ACI equations. 
This allowed him to use derivatives as a basis for DFAs, where each state is 
labelled with a derivative. 
However, Brzozowski did not show anything about 
the size of the derivatives.
Antimirov showed that there can only be finitely 
many partial derivatives for a regular expression and any set of 
strings; the number is in fact the 
``alphabetical width'' of the regular expression plus 1. 
From this result one can relatively easily establish 
that the size of the partial derivatives is 
no bigger than $(\textit{size} \; r)^3$ for every string.
Unfortunately this result does not seem to carry over to our 
setting because partial derivatives have the simplification 
\begin{equation}\label{eq:headSplitRule}
	(r_1 + r_2) \cdot r_3 \rightarrow  (r_1 \cdot r_3) + (r_2 \cdot r_3)
\end{equation}
built in. We cannot have this because otherwise we would 
lose the POSIX property. For instance, the lexing result of
regular expression
\[
	(a+ab)\cdot(bc+c)
\]
with respect to string $abc$ using our lexer with the simplification rule \ref{eq:headSplitRule}
would be 
\[
	\Left (\Seq \; (\Char \; a), \Seq (\Char \; b) \; (\Char \; c)  )
\]
instead of the correct POSIX value
\[
	\Seq \; (\Right \; (\Seq \; (\Char \; a) \; (\Char \; b)) ) \; (\Char \;)
\]
Our result about the finite bound also does not say anything about the number of derivatives. 
In fact there are infinitely many derivatives in general, 
because in the annotated regular expression for STAR we record the 
number of iterations. What our result shows is that the size of 
the derivatives is bounded, not their number.
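
To make concrete where rule \ref{eq:headSplitRule} is built in, here is a
sketch of Antimirov's partial derivative function in Scala (reusing the
$\textit{Rexp}$ datatype and $\nullable$ from the earlier sketches; this
function is not part of our verified development). In the sequence clause,
every partial derivative of $r_1$ is paired with $r_2$ separately, so an
alternative in front of a sequence is split apart automatically:

\begin{lstlisting}[style=myScalastyle]
// partial derivatives return a set of regular expressions rather than one
def pder(c: Char, r: Rexp): Set[Rexp] = r match {
  case ZERO => Set()
  case ONE => Set()
  case CHAR(d) => if (c == d) Set(ONE) else Set()
  case ALT(r1, r2) => pder(c, r1) ++ pder(c, r2)
  case SEQ(r1, r2) =>
    // sequencing each element of pder(c, r1) with r2 separately is exactly
    // the head-splitting of (r1 + r2) . r3 into r1 . r3 + r2 . r3
    val left = pder(c, r1).map(p => SEQ(p, r2))
    if (nullable(r1)) left ++ pder(c, r2) else left
  case STAR(r1) => pder(c, r1).map(p => SEQ(p, STAR(r1)))
  case NTIMES(r1, n) =>  // handled analogously (not in Antimirov's original)
    if (n == 0) Set() else pder(c, r1).map(p => SEQ(p, NTIMES(r1, n - 1)))
}
\end{lstlisting}

\noindent
Because the result is a set, the sequence regular expression is taken apart
eagerly, which is precisely the transformation we must avoid in order to
preserve the POSIX property.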
\marginpar{new changes up to this point.}


In particular, the main problem we solved on top of previous work was
coming up with a formally verified algorithm, called $\blexersimp$, based
on Brzozowski derivatives. It calculates a POSIX
lexical value from a string and a regular expression. This algorithm was originally developed
by Sulzmann and Lu \cite{Sulzmann2014}, but we made the key observation that its $\textit{nub}$
function does not simplify intermediate results where it needs to, and we improved the
algorithm accordingly.
We have proven that, thanks to its aggressive simplifications, the internal data structures of $\blexersimp$
do not grow beyond a bound $N_r$ which depends only on the input regular expression $r$:
\begin{theorem}
$|\blexersimp \; r \; s | \leq N_r$
\end{theorem}
The simplifications applied in each step of $\blexersimp$ 
keep the derivatives small, but present a 
challenge in establishing a correctness theorem of the following form:
\begin{theorem}
	If the POSIX lexical value of matching regular expression $r$ with string $s$ is $v$, 
	then $\blexersimp\; r \; s = \Some \; v$. Otherwise 
	$\blexersimp \; r \; s = \None$.
\end{theorem}




The result is %a regular expression lexing algorithm that comes with 
\begin{itemize}
\item
an improved version of Sulzmann and Lu's bit-coded algorithm using 
derivatives with simplifications, 
accompanied by
a correctness proof with respect to the POSIX specification 
given by Ausaf et al.~\cite{AusafDyckhoffUrban2016}, 
\item 
a complexity-related property for that algorithm stating that its 
internal data structures
remain below a finite bound,
\item
and an extension to
bounded repetition constructs, with the correctness and finiteness properties
maintained.
\end{itemize}
\noindent
With a formal finiteness bound in place,
we can greatly reduce the attack surface of servers in terms of ReDoS attacks.
The Isabelle/HOL code for our formalisation can be 
found at
\begin{center}
\url{https://github.com/hellotommmy/posix}
\end{center}
Further improvements to the algorithm are possible with an even stronger version of 
simplification. We conjecture that the resulting size of the derivatives
can then be bounded cubically w.r.t.~the size of the regular expression.
We give the relevant code in Scala, 
but do not give a formal proof of this bound in Isabelle/HOL.
This is still future work.


\section{Structure of the thesis}
Before discussing the formal proof of $\blexersimp$'s 
correctness, which is the main contribution of this thesis,
we need to introduce two formal proofs due
to Ausaf et al. 
In chapter \ref{Inj} we introduce the concepts
and notations we 
use for describing regular expressions and derivatives,
and the first version of Sulzmann and Lu's lexing algorithm without bitcodes (including 
its correctness proof).
We give their second lexing algorithm, with bitcodes, in chapter \ref{Bitcoded1},
together with the correctness proof by Ausaf and Urban.
Then we illustrate in chapter \ref{Bitcoded2}
how Sulzmann and Lu's
simplifications fail to simplify correctly. We therefore introduce our own version of the
algorithm with correct simplifications, together with 
its correctness proof.  
In chapter \ref{Finite} we give the second guarantee
of our bitcoded algorithm, namely a finite bound on the size of any 
regular expression's derivatives. 
We also show how one can extend the
algorithm to include bounded repetitions. 
In chapter \ref{Cubic} we discuss stronger simplification rules which 
improve the finite bound to a cubic bound. %and the NOT regular expression.
Chapter \ref{RelatedWork} surveys work related to this thesis.
Chapter \ref{Future} concludes and mentions avenues for future research.
 




%----------------------------------------------------------------------------------------


%----------------------------------------------------------------------------------------

%----------------------------------------------------------------------------------------

%----------------------------------------------------------------------------------------