% Chapter 1

\chapter{Introduction} % Main chapter title

\label{Introduction} % For referencing the chapter elsewhere, use \ref{Chapter1}

%----------------------------------------------------------------------------------------

% Define some commands to keep the formatting separated from the content
\newcommand{\keyword}[1]{\textbf{#1}}
\newcommand{\tabhead}[1]{\textbf{#1}}
\newcommand{\code}[1]{\texttt{#1}}
\newcommand{\file}[1]{\texttt{\bfseries#1}}
\newcommand{\option}[1]{\texttt{\itshape#1}}

%boxes
\newcommand*{\mybox}[1]{\framebox{\strut #1}}

%\newcommand{\sflataux}[1]{\textit{sflat}\_\textit{aux} \, #1}
\newcommand\sflat[1]{\llparenthesis #1 \rrparenthesis }
\newcommand{\ASEQ}[3]{\textit{ASEQ}_{#1} \, #2 \, #3}
\newcommand{\bderssimp}[2]{#1 \backslash_{bsimps} #2}
\newcommand{\rderssimp}[2]{#1 \backslash_{rsimps} #2}
\def\derssimp{\textit{ders}\_\textit{simp}}
\def\rders{\textit{rders}}
\newcommand{\bders}[2]{#1 \backslash #2}
\newcommand{\bsimp}[1]{\textit{bsimp}(#1)}
\def\bsimps{\textit{bsimp}}
\newcommand{\rsimp}[1]{\textit{rsimp}\; #1}
\newcommand{\sflataux}[1]{\llparenthesis #1 \rrparenthesis'}
\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%
\newcommand{\denote}{\stackrel{\mbox{\scriptsize denote}}{=}}%
\newcommand{\ZERO}{\mbox{\bf 0}}
\newcommand{\ONE}{\mbox{\bf 1}}
\newcommand{\AALTS}[2]{\oplus {\scriptstyle #1}\, #2}
\newcommand{\rdistinct}[2]{\textit{rdistinct} \;\; #1 \;\; #2}
\def\rdistincts{\textit{rdistinct}}
\def\rDistinct{\textit{rdistinct}}
\newcommand\hflat[1]{\llparenthesis #1 \rrparenthesis_*}
\newcommand\hflataux[1]{\llparenthesis #1 \rrparenthesis_*'}
\newcommand\createdByStar[1]{\textit{createdByStar}(#1)}
\def\cbn{\textit{createdByNtimes}}
\def\hpa{\textit{highestPowerAux}}
\def\hpower{\textit{highestPower}}
\def\ntset{\textit{ntset}}
\def\optermsimp{\textit{optermsimp}}
\def\optermOsimp{\textit{optermOsimp}}
\def\optermosimp{\textit{optermosimp}}
\def\opterm{\textit{opterm}}
\def\nString{\textit{nonemptyString}}

\newcommand\myequiv{\mathrel{\stackrel{\makebox[0pt]{\mbox{\normalfont\tiny equiv}}}{=}}}

\def\SEQ{\textit{SEQ}}
\def\SEQs{\textit{SEQs}}
\def\case{\textit{case}}
\def\sequal{\stackrel{\mbox{\scriptsize rsimp}}{=}}
\def\rsimpalts{\textit{rsimp}_{ALTS}}
\def\good{\textit{good}}
\def\btrue{\textit{true}}
\def\bfalse{\textit{false}}
\def\bnullable{\textit{bnullable}}
\def\bnullables{\textit{bnullables}}
\def\Some{\textit{Some}}
\def\None{\textit{None}}
\def\code{\textit{code}}
\def\decode{\textit{decode}}
\def\internalise{\textit{internalise}}
\def\lexer{\mathit{lexer}}
\def\mkeps{\textit{mkeps}}
\newcommand{\rder}[2]{#2 \backslash_r #1}

\def\rerases{\textit{rerase}}

\def\nonnested{\textit{nonnested}}
\def\AZERO{\textit{AZERO}}
\def\sizeNregex{\textit{sizeNregex}}
\def\AONE{\textit{AONE}}
\def\ACHAR{\textit{ACHAR}}

\def\simpsulz{\textit{simp}_{Sulz}}

\def\scfrewrites{\stackrel{*}{\rightsquigarrow_{scf}}}
\def\frewrite{\rightsquigarrow_f}
\def\hrewrite{\rightsquigarrow_h}
\def\grewrite{\rightsquigarrow_g}
\def\frewrites{\stackrel{*}{\rightsquigarrow_f}}
\def\hrewrites{\stackrel{*}{\rightsquigarrow_h}}
\def\grewrites{\stackrel{*}{\rightsquigarrow_g}}
\def\fuse{\textit{fuse}}
\def\bder{\textit{bder}}
\def\der{\textit{der}}
\def\POSIX{\textit{POSIX}}
\def\ALTS{\textit{ALTS}}
\def\ASTAR{\textit{ASTAR}}
\def\DFA{\textit{DFA}}
\def\NFA{\textit{NFA}}
\def\bmkeps{\textit{bmkeps}}
\def\bmkepss{\textit{bmkepss}}
\def\retrieve{\textit{retrieve}}
\def\blexer{\textit{blexer}}
\def\flex{\textit{flex}}
\def\inj{\textit{inj}}
\def\Empty{\textit{Empty}}
\def\Left{\textit{Left}}
\def\Right{\textit{Right}}
\def\Stars{\textit{Stars}}
\def\Char{\textit{Char}}
\def\Seq{\textit{Seq}}
\def\Der{\textit{Der}}
\def\Ders{\textit{Ders}}
\def\nullable{\mathit{nullable}}
\def\Z{\mathit{Z}}
\def\S{\mathit{S}}
\def\rup{r^\uparrow}
%\def\bderssimp{\mathit{bders}\_\mathit{simp}}
\def\distinctWith{\textit{distinctWith}}
\def\lf{\textit{lf}}
\def\PD{\textit{PD}}
\def\suffix{\textit{Suffix}}
\def\distinctBy{\textit{distinctBy}}
\def\starupdate{\textit{starUpdate}}
\def\starupdates{\textit{starUpdates}}
\def\nupdate{\textit{nupdate}}
\def\nupdates{\textit{nupdates}}

\def\size{\mathit{size}}
\def\rexp{\mathbf{rexp}}
\def\simp{\mathit{simp}}
\def\simpALTs{\mathit{simp}\_\mathit{ALTs}}
\def\map{\mathit{map}}
\def\distinct{\mathit{distinct}}
\def\blexersimp{\mathit{blexer}\_\mathit{simp}}
\def\blexerStrong{\textit{blexerStrong}}
\def\bsimpStrong{\textit{bsimpStrong}}
\def\bdersStrongs{\textit{bdersStrong}}
\newcommand{\bdersStrong}[2]{#1 \backslash_{bsimpStrongs} #2}

\def\map{\textit{map}}
\def\rrexp{\textit{rrexp}}
\newcommand\rnullable[1]{\textit{rnullable} \; #1 }
\newcommand\rsize[1]{\llbracket #1 \rrbracket_r}
\newcommand\asize[1]{\llbracket #1 \rrbracket}
\newcommand\rerase[1]{ (#1)_{\downarrow_r}}

\newcommand\ChristianComment[1]{\textcolor{blue}{#1}\\}

\def\rflts{\textit{rflts}}
\def\rrewrite{\textit{rrewrite}}
\def\bsimpalts{\textit{bsimp}_{ALTS}}
\def\bsimpaseq{\textit{bsimp}_{ASEQ}}
\def\rsimlalts{\textit{rsimp}_{ALTs}}
\def\rsimpseq{\textit{rsimp}_{SEQ}}

\def\erase{\textit{erase}}
\def\STAR{\textit{STAR}}
\def\flts{\textit{flts}}

\def\zeroable{\textit{zeroable}}
\def\nub{\textit{nub}}
\def\filter{\textit{filter}}
%\def\not{\textit{not}}

\def\RZERO{\mathbf{0}_r }
\def\RONE{\mathbf{1}_r}
\newcommand\RCHAR[1]{\mathbf{#1}_r}
\newcommand\RSEQ[2]{#1 \cdot #2}
\newcommand\RALTS[1]{\sum #1}
\newcommand\RSTAR[1]{#1^*}
\newcommand\vsuf[2]{\textit{Suffix} \;#1\;#2}

\lstdefinestyle{myScalastyle}{
  language=scala,
  aboveskip=3mm,
  belowskip=3mm,
  showstringspaces=false,
  columns=flexible,
  basicstyle={\small\ttfamily},
  numbers=none,
  numberstyle=\tiny\color{gray},
  keywordstyle=\color{blue},
  commentstyle=\color{dkgreen},
  stringstyle=\color{mauve},
  frame=single,
  breaklines=true,
  breakatwhitespace=true,
  tabsize=3,
}

%----------------------------------------------------------------------------------------
%This part is about regular expressions, Brzozowski derivatives,
%and a bit-coded lexing algorithm with proven correctness and time bounds.

%TODO: look up snort rules to use here--give readers idea of what regexes look like

Regular expressions, since their inception in the 1940s,
have been the subject of extensive study and implementation.
Their primary application lies in lexing,
the process whereby an unstructured input string, guided by
regular expressions, is disassembled into a tree of tokens.
This is often used to match strings that comprise numerous fields,
where certain fields may recur or be omitted.
For example, the regular expression for recognising email addresses is
\begin{center}
\verb|^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$|.
\end{center}
Using this, regular expression matchers and lexers are able to extract
the domain names by the use of \verb|[a-zA-Z0-9.-]+|.
Consequently, regular expressions are an indispensable component of text processing tools
in software systems such as compilers, IDEs, and firewalls.
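For illustration, the following is a small informal example of ours in
Scala (using Scala's built-in \textit{Regex} support, not any algorithm
developed in this thesis); the capture groups delimit the user and domain
fields of an email address.
\begin{lstlisting}[style=myScalastyle]
// Informal illustration: extracting fields with capture groups.
val Email = """([a-zA-Z0-9._%+-]+)@([a-zA-Z0-9.-]+\.[a-zA-Z]{2,})""".r

"alice@mail.example.com" match {
  case Email(user, domain) => println(domain)  // prints: mail.example.com
  case _                   => println("not an email address")
}
\end{lstlisting}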

The study of regular expressions is ongoing due to an
issue known as catastrophic backtracking.
This phenomenon refers to scenarios in which the regular expression
matching or lexing engine exhibits a disproportionately long
runtime despite the simplicity of the input and expression.

The origin of catastrophic backtracking is rooted in the
ambiguities of lexing.
In the process of matching a multi-character string with
a regular expression that encompasses several sub-expressions,
different positions can be designated to mark
the boundaries of corresponding substrings of the sub-expressions.
For instance, in matching the string $aaa$ with the
regular expression $(a+aa)^*$, the divide between
the initial match and the subsequent iteration could either be
set between the first and second characters ($a | aa$)
or between the second and third characters ($aa | a$).
As both the length of the input string and the structural complexity
of the regular expression increase, the number of potential delimiter
combinations can grow exponentially, leading to a corresponding increase
in complexity for algorithms that do not optimally navigate these
possibilities.
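The exponential growth is easy to observe with the backtracking engines
used in mainstream languages. The snippet below is a small informal
experiment of ours in Scala, whose \textit{Regex} is backed by the JVM's
backtracking \textit{java.util.regex} engine; it is not part of the formal
development. The pattern \texttt{(a|aa)*c} is the practical-syntax
analogue of $(a+aa)^*$ above, extended with a trailing $c$ that can never
match, so the engine tries every way of splitting the $a$'s before giving up.
\begin{lstlisting}[style=myScalastyle]
// Informal demonstration of catastrophic backtracking on the JVM's
// backtracking regex engine. Runtime grows exponentially with n.
object Backtracking {
  def main(args: Array[String]): Unit = {
    val evil = "(a|aa)*c".r
    for (n <- 28 to 44 by 4) {
      val input = "a" * n       // n a's, and no final 'c'
      val t0 = System.nanoTime()
      evil.findFirstIn(input)   // fails, but only after trying all splits
      val ms = (System.nanoTime() - t0) / 1e6
      println(f"n = $n%2d: $ms%.1f ms")
    }
  }
}
\end{lstlisting}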

Catastrophic backtracking facilitates a category of computationally inexpensive attacks known as Regular Expression Denial of Service (ReDoS) attacks. Here, an attacker merely dispatches a small attack string to a server, provoking high-complexity behaviours in the server's regular expression engine. Such attacks, whether intentional or accidental, have led to the failure of real-world systems (additional details can be found in the practical overview section in chapter \ref{Introduction}).

Various disambiguation strategies are employed to select sub-matches, most notably the Greedy and POSIX strategies. POSIX, the strategy most widely adopted in practice, invariably opts for the longest possible sub-match. Kuklewicz \cite{KuklewiczHaskell}, for instance, offers a descriptive definition of the POSIX rule (section 1, last paragraph):
\begin{quote}
\begin{itemize}
\item
regular expressions (REs) take the leftmost starting match, and the longest match starting there\\
\item
earlier subpatterns have leftmost-longest priority over later subpatterns\\
\item
higher-level subpatterns have leftmost-longest priority over their component subpatterns\\
\item
REs have right associative concatenation which can be changed with parenthesis\\
\item
parenthesized subexpressions return the match from their last usage\\
\item
text of component subexpressions must be contained in the text of the
higher-level subexpressions\\
\item
if "p" and "q" can never match the same text then "p|q" and "q|p" are equivalent, up to trivial renumbering of captured subexpressions\\
\item
if "p" in "p*" is used to capture non-empty text then additional repetitions of "p" will not capture an empty string\\
\end{itemize}
\end{quote}
However, the author noted that various lexers claiming to be POSIX
are rarely correct according to this standard.
There have been numerous occasions where programmers realised the
subtlety and difficulty of implementing POSIX matching correctly; one such
remark comes from the author of Go's regexp library%
\footnote{\url{https://pkg.go.dev/regexp\#pkg-overview}}:
\begin{quote}\it
``The POSIX rule is computationally prohibitive
and not even well-defined.''
\end{quote}
Being able to formally define and capture the idea of
the POSIX rules and to prove
the correctness of regular expression matching/lexing
algorithms against the POSIX semantics
is therefore valuable.


Formal proofs are
machine-checked programs
%such as the ones written in Isabelle/HOL, is a powerful means
that allow computer scientists to be certain
about the correctness of their algorithms.
This is done by
recursively checking that every fact in a proof script
is either an axiom or a fact that is derived from
known axioms or verified facts.
%The problem of regular expressions lexing and catastrophic backtracking are a suitable problem for such
%methods as their implementation and complexity analysis tend to be error-prone.
Formal proofs provide an unprecedented level of assurance
that an algorithm will perform as expected on all inputs.
The software systems that help people interactively build and check
such proofs are called theorem provers or proof assistants.
Many theorem provers have been developed, such as Mizar,
Isabelle/HOL, HOL-Light, HOL4,
Coq, Agda, Idris, Lean and so on.
Isabelle/HOL is a theorem prover with a simple type theory
and powerful automated proof tools such as Sledgehammer.
We chose Isabelle/HOL for its powerful automation
and for the ease and simplicity with which regular expressions
and regular languages can be expressed in it.
%Some of those employ
%dependent types like Mizar, Coq, Agda, Lean and Idris.
%Some support a constructivism approach, such as Coq.

Formal proofs about regular expression matching and lexing
complement the efforts in
industry, which tend to focus on overall speed
with techniques like parallelization (FPGA paper) and tackle
the problem of catastrophic backtracking
in an ad-hoc manner (cloudflare and stackexchange article).

There have been many interesting developments in the theorem-proving community
concerning the formalisation of regular expressions and lexing.
One flavour is to build an automaton from the regular expression, and to determine
acceptance in terms of the resulting
state after executing the input string on that automaton.
Automata formalisations are in general harder and more cumbersome to deal
with for theorem provers than working directly on regular expressions.
One such example is by Nipkow \cite{Nipkow1998}.
%They
%made everything recursive (for example the next state function),
As a core idea, he
used a list of booleans to name each state so that
after composing sub-automata together, renaming the states to maintain
their distinctness is recursive and simple.
The result was that even obvious lemmas required
``a painful amount of detail'' in the formalisation.
Sometimes automata are represented as graphs.
But graphs are not inductive datatypes.
Having to base the induction principle on the number of nodes
in a graph makes formal reasoning non-intuitive and convoluted,
resulting in large formalisations \cite{Lammich2012}.
When combining two graphs, one also needs to make sure that the nodes in
both graphs are distinct, which almost always involves
renaming the nodes.
A theorem prover which provides dependent types, such as Coq,
can alleviate the issue of representing graph nodes
\cite{Doczkal2013}. There the representation of nodes is made
easier by the use of $\textit{FinType}$.
Another representation for automata is matrices.
But the induction for them is not straightforward either.
There are some more clever representations, for example one by Paulson
using hereditarily finite sets \cite{Paulson2015},
where the problem of combining graphs can be handled better.
%but we believe that such clever tricks are not very obvious for
%the John-average-Isabelle-user.

The approach that operates directly on regular expressions circumvents the problem of
converting between a regular expression and an automaton, thereby avoiding representation
problems altogether, even though regular expressions may be seen as a variant of a
non-deterministic finite automaton (ref Laurikari tagged NFA paper).
To match a string, a sequence of algebraic transformations called
(Brzozowski) $\textit{derivatives}$ (ref Brzozowski) is carried out on that regular expression.
Each derivative takes a character and a regular expression,
and returns a new regular expression whose language is closely related to
the original regular expression's language:
strings prefixed with that input character will have their head removed,
and strings not prefixed
with that character will be eliminated.
After taking derivatives with respect to all the characters, the string is
exhausted. Then an algorithm checks whether the empty string is in the final
regular expression's language.
If so, a match exists and the string is in the language of the input regular expression.
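As a concrete illustration, such a matcher fits in a few lines of Scala.
This is an informal sketch only: the datatype and function names below are
our own choices for illustration, while the definitions used in this thesis
are given formally in Isabelle/HOL in later chapters.
\begin{lstlisting}[style=myScalastyle]
// An informal sketch of Brzozowski-derivative matching.
sealed trait Rexp
case object ZERO extends Rexp                   // matches nothing
case object ONE extends Rexp                    // matches only the empty string
case class CHAR(c: Char) extends Rexp
case class ALT(r1: Rexp, r2: Rexp) extends Rexp
case class SEQ(r1: Rexp, r2: Rexp) extends Rexp
case class STAR(r: Rexp) extends Rexp

// Can r match the empty string?
def nullable(r: Rexp): Boolean = r match {
  case ZERO        => false
  case ONE         => true
  case CHAR(_)     => false
  case ALT(r1, r2) => nullable(r1) || nullable(r2)
  case SEQ(r1, r2) => nullable(r1) && nullable(r2)
  case STAR(_)     => true
}

// The derivative of r with respect to the character c.
def der(c: Char, r: Rexp): Rexp = r match {
  case ZERO        => ZERO
  case ONE         => ZERO
  case CHAR(d)     => if (c == d) ONE else ZERO
  case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))
  case SEQ(r1, r2) =>
    if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
    else SEQ(der(c, r1), r2)
  case STAR(r1)    => SEQ(der(c, r1), STAR(r1))
}

// Take derivatives until the string is exhausted, then test nullability.
def matcher(r: Rexp, s: String): Boolean =
  nullable(s.foldLeft(r)((r1, c) => der(c, r1)))
\end{lstlisting}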

Again this process can be seen as the simulation of an NFA running on a string,
but the recursive nature of the datatypes and functions involved makes
derivatives a perfect object of study for theorem provers.
That is why there have been numerous formalisations of regular expressions
and Brzozowski derivatives in the functional programming and
theorem proving communities (a large list of refs to derivatives formalisation publications).
Ribeiro and Du Bois \cite{RibeiroAgda2017} have
formalised the notion of bit-coded regular expressions
and proved their relation with simple regular expressions in
the dependently-typed proof assistant Agda.
They also proved the soundness and completeness of a matching algorithm
based on the bit-coded regular expressions. Their algorithm is a decision procedure
that gives a yes/no answer, which does not produce
lexical values.
%X also formalised derivatives and regular expressions, producing "parse trees".
%(Some person who's a big name in formal methods)


The variant of the problem we are looking at centers around
an algorithm (which we call $\blexer$) developed by Sulzmann and Lu \cite{Sulzmann2014}.
The reason we chose to look at $\blexer$ and its simplifications
is that it allows a lexical tree to be generated
by an elegant and subtle procedure based on Brzozowski derivatives.
The procedure is made of recursive functions and inductive datatypes just like derivatives,
allowing intuitive and concise formal reasoning with theorem provers.
Most importantly, $\blexer$ opens up a path to an optimized version,
$\blexersimp$, which improves
performance with simplifications that aggressively change the structure of regular expressions,
whereas most other derivative-based methods
rely on the structure of regular expressions being maintained so that induction can
go through.
For example, Egolf et al. \cite{Verbatim} have developed a verified lexer
with derivatives, but as soon as they started introducing
optimizations such as memoization, they reverted to constructing
DFAs first.
Edelmann \cite{Edelmann2020} explored similar optimizations in his
work on verified LL(1) parsing, with additional enhancements based on data structures such as
zippers.

%Sulzmann and Lu have worked out an algorithm
%that is especially suited for verification
%which utilized the fact
%that whenever ambiguity occurs one may duplicate a sub-expression and use
%different copies to describe different matching choices.
The idea behind the correctness of $\blexer$ is simple: during a derivative,
multiple matches might be possible, in which case an alternative with multiple children,
each corresponding to a
different match, is created. At the end of
the lexing process one always picks the leftmost alternative, which is guaranteed
to be a POSIX value.
This is done by consistently keeping sub-regular expressions in an alternative
with longer submatches
to the left of other copies
(recall that POSIX values are roughly the values with the longest initial
submatch).
The idea behind the optimized version of $\blexer$, which we call $\blexersimp$,
is that since we only take the leftmost copy, all re-occurring copies can be
eliminated without losing the POSIX property, and this can be done even with
children of alternatives at different levels, by merging them together.
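To give a flavour of this duplicate-removal step, below is an informal
Scala sketch on plain (unannotated) regular expressions, reusing the
\textit{Rexp} datatype from the sketch earlier in this chapter; the
simplification developed in this thesis operates on bit-coded regular
expressions and is defined formally in later chapters.
\begin{lstlisting}[style=myScalastyle]
// Flatten nested alternatives into a single list of children,
// dropping ZERO children, which can never contribute a match.
def flts(r: Rexp): List[Rexp] = r match {
  case ALT(r1, r2) => flts(r1) ::: flts(r2)
  case ZERO        => Nil
  case r1          => List(r1)
}

// Keep only the leftmost occurrence of each child: since POSIX lexing
// always prefers the leftmost successful alternative, duplicates to
// the right can never be chosen and are safe to delete.
def simpALTs(r: Rexp): Rexp =
  flts(r).distinct match {
    case Nil      => ZERO
    case List(r1) => r1
    case r1 :: rs => rs.foldLeft(r1)(ALT(_, _))
  }
\end{lstlisting}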
Proving $\blexersimp$ correct requires a different
proof strategy from the one used by Ausaf \cite{FahadThesis}.
We invent a rewriting relation as an
inductive predicate which captures
an invariant strong enough to ensure correctness
and which commutes with the derivative operation.
This predicate allows a simple
induction on the input string to go through.

%This idea has been repeatedly used in different variants of lexing
%algorithms in their paper, one of which involves bit-codes. The bit-coded
%derivative-based algorithm even allows relatively aggressive
%%simplification rules which cause
%structural changes that render the induction used in the correctness
%proofs unusable.
%More details will be given in \ref{Bitcoded2} including the
%new correctness proof which involves a new inductive predicate which allows
%rule induction to go through.


%first character is removed
%state of the automaton after matching that character
%where nodes are represented as
%a sub-expression (for example tagged NFA
%Working on regular expressions
%Because of these problems with automata, we prefer regular expressions
%and derivatives rather than an automata (or graph-based) approach which explicitly converts between
%the regular expression and a different data structure.
%
%
%The key idea
%Regular expressions are widely used in computer science:
%be it in text-editors \parencite{atomEditor} with syntax highlighting and auto-completion;
%command-line tools like $\mathit{grep}$ that facilitate easy
%text-processing \cite{grep}; network intrusion
%detection systems that inspect suspicious traffic; or compiler
%front ends.
%Given their usefulness and ubiquity, one would assume that
%modern regular expression matching implementations
%are mature and fully studied.
%Indeed, in many popular programming languages' regular expression engines,
%one can supply a regular expression and a string, and in most cases get
%get the matching information in a very short time.
%Those engines can sometimes be blindingly fast--some
%network intrusion detection systems
%use regular expression engines that are able to process
%hundreds of megabytes or even gigabytes of data per second \parencite{Turo_ov__2020}.
%However, those engines can sometimes also exhibit a surprising security vulnerability
%under a certain class of inputs.
%However, , this is not the case for $\mathbf{all}$ inputs.
%TODO: get source for SNORT/BRO's regex matching engine/speed


%----------------------------------------------------------------------------------------
\section{Contribution}

In this thesis,
we propose a solution to catastrophic
backtracking and error-prone matchers: a formally verified
regular expression lexing algorithm
that is both fast
and correct.

In particular, the main problem we solved on top of previous work was
coming up with a formally verified algorithm called $\blexersimp$ based
on Brzozowski derivatives. It calculates a POSIX
lexical value from a string and a regular expression. This algorithm was originally
developed by Sulzmann and Lu \cite{Sulzmann2014}, but we made the key observation that its $\textit{nub}$
function does not simplify intermediate results where it needs to, and we improved the
algorithm accordingly.
We have proven that our $\blexersimp$'s internal data structure does not grow beyond a constant $N_r$
depending on the input regular expression $r$, thanks to the aggressive simplifications of $\blexersimp$:
\begin{theorem}
$|\blexersimp \; r \; s | \leq N_r$
\end{theorem}
The simplifications applied in each step of $\blexersimp$
keep the derivatives small, but present a
challenge in
establishing a correctness theorem of the following form:
%TODO: change this to "theorem to prove"
\begin{theorem}
If the POSIX lexical value of matching regular expression $r$ with string $s$ is $v$,
then $\blexersimp\; r \; s = \Some \; v$. Otherwise
$\blexersimp \; r \; s = \None$.
\end{theorem}


The result is %a regular expression lexing algorithm that comes with
\begin{itemize}
\item
an improved version of Sulzmann and Lu's bit-coded algorithm using
derivatives with simplifications,
accompanied by
a proven correctness theorem with respect to the POSIX specification
given by Ausaf et al. \cite{AusafDyckhoffUrban2016},
\item
a complexity-related property for that algorithm stating that the
internal data structure will
remain below a finite bound,
\item
and an extension to
bounded repetition constructs with the correctness and finiteness properties
maintained.
\end{itemize}
\noindent
With a formal finiteness bound in place,
we can greatly reduce the attack surface of servers against ReDoS attacks.
The Isabelle/HOL code for our formalisation can be
found at
\begin{center}
\url{https://github.com/hellotommmy/posix}
\end{center}
Further improvements to the algorithm with an even stronger version of
simplification can be made. We conjecture that the resulting size of derivatives
can be bounded cubically with respect to the size of the regular expression.
We will give relevant code in Scala,
but do not give a formal proof of this in Isabelle/HOL;
that remains future work.


\section{Structure of the thesis}
In chapter \ref{Inj} we introduce the concepts
and notations we
use for describing regular expressions and derivatives,
and the first version of Sulzmann and Lu's lexing algorithm without bitcodes (including
its correctness proof).
We give their second lexing algorithm with bitcodes in chapter \ref{Bitcoded1},
together with the correctness proof by Ausaf and Urban.
Then we illustrate in chapter \ref{Bitcoded2}
how Sulzmann and Lu's
simplifications fail to simplify correctly. We therefore introduce our own version of the
algorithm with correct simplifications and
its correctness proof.
In chapter \ref{Finite} we give the second guarantee
of our bitcoded algorithm, that is, a finite bound on the size of any
regular expression's derivatives.
We also show how one can extend the
algorithm to include bounded repetitions.
In chapter \ref{Cubic} we discuss stronger simplification rules which
improve the finite bound to a cubic bound. %and the NOT regular expression.
Chapter \ref{RelatedWork} surveys work related to this thesis.
Chapter \ref{Future} concludes and mentions avenues of future research.





%----------------------------------------------------------------------------------------


%----------------------------------------------------------------------------------------

%----------------------------------------------------------------------------------------

%----------------------------------------------------------------------------------------
