author | Chengsong |
Wed, 23 Aug 2023 03:02:31 +0100 | |
changeset 668 | 3831621d7b14 |
parent 666 | 6da4516ea87d |
permissions | -rwxr-xr-x |
532 | 1 |
% Chapter 1 |
2 |
||
3 |
\chapter{Introduction} % Main chapter title |
|
4 |
||
5 |
\label{Introduction} % For referencing the chapter elsewhere, use \ref{Chapter1} |
|
6 |
||
7 |
%---------------------------------------------------------------------------------------- |
|
8 |
||
9 |
% Define some commands to keep the formatting separated from the content |
|
10 |
\newcommand{\keyword}[1]{\textbf{#1}} |
|
11 |
\newcommand{\tabhead}[1]{\textbf{#1}} |
|
12 |
\newcommand{\code}[1]{\texttt{#1}} |
|
13 |
\newcommand{\file}[1]{\texttt{\bfseries#1}} |
|
14 |
\newcommand{\option}[1]{\texttt{\itshape#1}} |
|
15 |
||
16 |
%boxes |
|
17 |
\newcommand*{\mybox}[1]{\framebox{\strut #1}} |
|
18 |
||
19 |
%\newcommand{\sflataux}[1]{\textit{sflat}\_\textit{aux} \, #1} |
|
20 |
\newcommand\sflat[1]{\llparenthesis #1 \rrparenthesis } |
|
21 |
\newcommand{\ASEQ}[3]{\textit{ASEQ}_{#1} \, #2 \, #3} |
|
543 | 22 |
\newcommand{\bderssimp}[2]{#1 \backslash_{bsimps} #2} |
596 | 23 |
\newcommand{\rderssimp}[2]{#1 \backslash_{rsimps} #2} |
564 | 24 |
\def\derssimp{\textit{ders}\_\textit{simp}} |
557 | 25 |
\def\rders{\textit{rders}} |
532 | 26 |
\newcommand{\bders}[2]{#1 \backslash #2} |
27 |
\newcommand{\bsimp}[1]{\textit{bsimp}(#1)} |
|
591 | 28 |
\def\bsimps{\textit{bsimp}} |
554 | 29 |
\newcommand{\rsimp}[1]{\textit{rsimp}\; #1} |
532 | 30 |
\newcommand{\sflataux}[1]{\llparenthesis #1 \rrparenthesis'} |
31 |
\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}% |
|
32 |
\newcommand{\denote}{\stackrel{\mbox{\scriptsize denote}}{=}}% |
|
33 |
\newcommand{\ZERO}{\mbox{\bf 0}} |
|
34 |
\newcommand{\ONE}{\mbox{\bf 1}} |
|
35 |
\newcommand{\AALTS}[2]{\oplus {\scriptstyle #1}\, #2} |
|
555 | 36 |
\newcommand{\rdistinct}[2]{\textit{rdistinct} \;\; #1 \;\; #2} |
594 | 37 |
\def\rdistincts{\textit{rdistinct}} |
556 | 38 |
\def\rDistinct{\textit{rdistinct}} |
532 | 39 |
\newcommand\hflat[1]{\llparenthesis #1 \rrparenthesis_*} |
40 |
\newcommand\hflataux[1]{\llparenthesis #1 \rrparenthesis_*'} |
|
41 |
\newcommand\createdByStar[1]{\textit{createdByStar}(#1)} |
|
620 | 42 |
\def\cbn{\textit{createdByNtimes}} |
43 |
\def\hpa{\textit{highestPowerAux}} |
|
44 |
\def\hpower{\textit{highestPower}} |
|
45 |
\def\ntset{\textit{ntset}} |
|
46 |
\def\optermsimp{\textit{optermsimp}} |
|
47 |
\def\optermOsimp{\textit{optermOsimp}} |
|
48 |
\def\optermosimp{\textit{optermosimp}} |
|
49 |
\def\opterm{\textit{opterm}} |
|
50 |
\def\nString{\textit{nonemptyString}} |
|
532 | 51 |
|
52 |
\newcommand\myequiv{\mathrel{\stackrel{\makebox[0pt]{\mbox{\normalfont\tiny equiv}}}{=}}} |
|
53 |
||
600 | 54 |
\def\SEQ{\textit{SEQ}} |
55 |
\def\SEQs{\textit{SEQs}} |
|
564 | 56 |
\def\case{\textit{case}} |
554 | 57 |
\def\sequal{\stackrel{\mbox{\scriptsize rsimp}}{=}} |
58 |
\def\rsimpalts{\textit{rsimp}_{ALTS}} |
|
59 |
\def\good{\textit{good}} |
|
60 |
\def\btrue{\textit{true}} |
|
61 |
\def\bfalse{\textit{false}} |
|
542 | 62 |
\def\bnullable{\textit{bnullable}} |
543 | 63 |
\def\bnullables{\textit{bnullables}} |
538 | 64 |
\def\Some{\textit{Some}} |
65 |
\def\None{\textit{None}} |
|
537 | 66 |
\def\code{\textit{code}} |
532 | 67 |
\def\decode{\textit{decode}} |
68 |
\def\internalise{\textit{internalise}} |
|
69 |
\def\lexer{\mathit{lexer}} |
|
70 |
\def\mkeps{\textit{mkeps}} |
|
557 | 71 |
\newcommand{\rder}[2]{#2 \backslash_r #1} |
532 | 72 |
|
585 | 73 |
\def\rerases{\textit{rerase}} |
74 |
||
554 | 75 |
\def\nonnested{\textit{nonnested}} |
532 | 76 |
\def\AZERO{\textit{AZERO}} |
558 | 77 |
\def\sizeNregex{\textit{sizeNregex}} |
532 | 78 |
\def\AONE{\textit{AONE}} |
79 |
\def\ACHAR{\textit{ACHAR}} |
|
80 |
||
585 | 81 |
\def\simpsulz{\textit{simp}_{Sulz}} |
82 |
||
557 | 83 |
\def\scfrewrites{\stackrel{*}{\rightsquigarrow_{scf}}} |
555 | 84 |
\def\frewrite{\rightsquigarrow_f} |
85 |
\def\hrewrite{\rightsquigarrow_h} |
|
86 |
\def\grewrite{\rightsquigarrow_g} |
|
87 |
\def\frewrites{\stackrel{*}{\rightsquigarrow_f}} |
|
88 |
\def\hrewrites{\stackrel{*}{\rightsquigarrow_h}} |
|
89 |
\def\grewrites{\stackrel{*}{\rightsquigarrow_g}} |
|
538 | 90 |
\def\fuse{\textit{fuse}} |
91 |
\def\bder{\textit{bder}} |
|
542 | 92 |
\def\der{\textit{der}} |
532 | 93 |
\def\POSIX{\textit{POSIX}} |
94 |
\def\ALTS{\textit{ALTS}} |
|
95 |
\def\ASTAR{\textit{ASTAR}} |
|
96 |
\def\DFA{\textit{DFA}} |
|
538 | 97 |
\def\NFA{\textit{NFA}} |
532 | 98 |
\def\bmkeps{\textit{bmkeps}} |
543 | 99 |
\def\bmkepss{\textit{bmkepss}} |
532 | 100 |
\def\retrieve{\textit{retrieve}} |
101 |
\def\blexer{\textit{blexer}} |
|
102 |
\def\flex{\textit{flex}} |
|
573 | 103 |
\def\inj{\textit{inj}} |
564 | 104 |
\def\Empty{\textit{Empty}} |
567 | 105 |
\def\Left{\textit{Left}} |
106 |
\def\Right{\textit{Right}} |
|
573 | 107 |
\def\Stars{\textit{Stars}} |
108 |
\def\Char{\textit{Char}} |
|
109 |
\def\Seq{\textit{Seq}} |
|
653
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
110 |
\def\Alt{\textit{Alt}} |
532 | 111 |
\def\Der{\textit{Der}} |
112 |
\def\Ders{\textit{Ders}} |
|
113 |
\def\nullable{\mathit{nullable}} |
|
114 |
\def\Z{\mathit{Z}} |
|
115 |
\def\S{\mathit{S}} |
|
116 |
\def\rup{r^\uparrow} |
|
117 |
%\def\bderssimp{\mathit{bders}\_\mathit{simp}} |
|
118 |
\def\distinctWith{\textit{distinctWith}} |
|
119 |
\def\lf{\textit{lf}} |
|
120 |
\def\PD{\textit{PD}} |
|
121 |
\def\suffix{\textit{Suffix}} |
|
543 | 122 |
\def\distinctBy{\textit{distinctBy}} |
558 | 123 |
\def\starupdate{\textit{starUpdate}} |
124 |
\def\starupdates{\textit{starUpdates}} |
|
620 | 125 |
\def\nupdate{\textit{nupdate}} |
126 |
\def\nupdates{\textit{nupdates}} |
|
558 | 127 |
|
532 | 128 |
|
129 |
\def\size{\mathit{size}} |
|
130 |
\def\rexp{\mathbf{rexp}} |
|
131 |
\def\simp{\mathit{simp}} |
|
132 |
\def\simpALTs{\mathit{simp}\_\mathit{ALTs}} |
|
133 |
\def\map{\mathit{map}} |
|
134 |
\def\distinct{\mathit{distinct}} |
|
135 |
\def\blexersimp{\mathit{blexer}\_\mathit{simp}} |
|
590 | 136 |
\def\blexerStrong{\textit{blexerStrong}} |
137 |
\def\bsimpStrong{\textit{bsimpStrong}} |
|
591 | 138 |
\def\bdersStrongs{\textit{bdersStrong}} |
590 | 139 |
\newcommand{\bdersStrong}[2]{#1 \backslash_{bsimpStrongs} #2} |
140 |
||
532 | 141 |
\def\map{\textit{map}} |
142 |
\def\rrexp{\textit{rrexp}} |
|
554 | 143 |
\newcommand\rnullable[1]{\textit{rnullable} \; #1 } |
532 | 144 |
\newcommand\rsize[1]{\llbracket #1 \rrbracket_r} |
145 |
\newcommand\asize[1]{\llbracket #1 \rrbracket} |
|
543 | 146 |
\newcommand\rerase[1]{ (#1)_{\downarrow_r}} |
147 |
||
538 | 148 |
\newcommand\ChristianComment[1]{\textcolor{blue}{#1}\\} |
532 | 149 |
|
543 | 150 |
|
151 |
\def\rflts{\textit{rflts}} |
|
152 |
\def\rrewrite{\textit{rrewrite}} |
|
153 |
\def\bsimpalts{\textit{bsimp}_{ALTS}} |
|
596 | 154 |
\def\bsimpaseq{\textit{bsimp}_{ASEQ}} |
155 |
\def\rsimlalts{\textit{rsimp}_{ALTs}} |
|
156 |
\def\rsimpseq{\textit{rsimp}_{SEQ}} |
|
543 | 157 |
|
532 | 158 |
\def\erase{\textit{erase}} |
159 |
\def\STAR{\textit{STAR}} |
|
160 |
\def\flts{\textit{flts}} |
|
161 |
||
162 |
||
579 | 163 |
\def\zeroable{\textit{zeroable}} |
164 |
\def\nub{\textit{nub}} |
|
165 |
\def\filter{\textit{filter}} |
|
601 | 166 |
%\def\not{\textit{not}} |
579 | 167 |
|
168 |
||
169 |
||
532 | 170 |
\def\RZERO{\mathbf{0}_r } |
171 |
\def\RONE{\mathbf{1}_r} |
|
172 |
\newcommand\RCHAR[1]{\mathbf{#1}_r} |
|
173 |
\newcommand\RSEQ[2]{#1 \cdot #2} |
|
558 | 174 |
\newcommand\RALTS[1]{\sum #1} |
532 | 175 |
\newcommand\RSTAR[1]{#1^*} |
558 | 176 |
\newcommand\vsuf[2]{\textit{Suffix} \;#1\;#2} |
532 | 177 |
|
538 | 178 |
|
179 |
||
590 | 180 |
|
181 |
\lstdefinestyle{myScalastyle}{ |
|
182 |
frame=tb, |
|
183 |
language=scala, |
|
184 |
aboveskip=3mm, |
|
185 |
belowskip=3mm, |
|
186 |
showstringspaces=false, |
|
187 |
columns=flexible, |
|
188 |
basicstyle={\small\ttfamily}, |
|
189 |
numbers=none, |
|
190 |
numberstyle=\tiny\color{gray}, |
|
191 |
keywordstyle=\color{blue}, |
|
192 |
commentstyle=\color{dkgreen}, |
|
193 |
stringstyle=\color{mauve}, |
|
194 |
frame=single, |
|
195 |
breaklines=true, |
|
196 |
breakatwhitespace=true, |
|
197 |
tabsize=3, |
|
538 | 198 |
} |
199 |
||
590 | 200 |
|
532 | 201 |
%---------------------------------------------------------------------------------------- |
202 |
%This part is about regular expressions, Brzozowski derivatives, |
|
203 |
%and a bit-coded lexing algorithm with proven correctness and time bounds. |
|
668
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
204 |
% \marginpar{Totally rewritten introduction} |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
205 |
% Formal proofs, |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
206 |
|
532 | 207 |
|
208 |
%TODO: look up snort rules to use here--give readers idea of what regexes look like |
|
209 |
||
668
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
210 |
\marginpar{rephrasing following Christian comment} |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
211 |
Regular expressions, since their inception in the 1950s |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
212 |
\cite{RegularExpressions}, |
648 | 213 |
have been subject to extensive study and implementation. |
653
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
214 |
Their primary application lies in text processing--finding |
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
215 |
matches and identifying patterns in a string. |
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
216 |
%It is often used to match strings that comprises of numerous fields, |
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
217 |
%where certain fields may recur or be omitted. |
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
218 |
For example, a simple regular expression that tries |
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
219 |
to recognise email addresses is |
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
220 |
\marginpar{rephrased from "the regex for recognising" to "a simple regex that tries to match email"} |
648 | 221 |
\begin{center} |
668
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
222 |
\verb|[a-z0-9._]+@[a-z0-9.-]+\.[a-z]{2,6}| |
653
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
223 |
%$[a-z0-9._]^+@[a-z0-9.-]^+\.[a-z]{2,6}$. |
648 | 224 |
\end{center} |
653
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
225 |
\marginpar{Simplified example, but the distinction between . and escaped . is correct |
654 | 226 |
and therefore left unchanged. Also verbatim package does not straightforwardly support superscripts so + kept as they are.} |
653
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
227 |
%Using this, regular expression matchers and lexers are able to extract |
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
228 |
%the domain names by the use of \verb|[a-zA-Z0-9.-]+|. |
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
229 |
\marginpar{Rewrote explanation for the expression.} |
668
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
230 |
This expression assumes all letters in the email have been converted into lower-case. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
231 |
The local-part is recognised by the first |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
232 |
bracketed expression $[a-z0-9.\_]^+$ |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
233 |
where the |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
234 |
operator ``+'' (should be viewed as a superscript) |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
235 |
means that it must be some non-empty string |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
236 |
made of alpha-numeric characters. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
237 |
After the ``@'' sign |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
238 |
is the sub-expression that recognises the domain of that email, |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
239 |
where $[a-z]^{\{2, 6\}}$ specifically |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
240 |
matches the top-level domain, such as ``org'', ``com'', ``uk'' and etc. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
241 |
The counters in the superscript |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
242 |
$2, 6$ specifies that all top-level domains |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
243 |
should have between two to six characters. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
244 |
This regular expression does not represent all possible email addresses |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
245 |
(e.g. those involving ``-'' cannot be recognised), but patterns |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
246 |
of similar shape and using roughly the same set of syntax are used |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
247 |
everywhere in our daily life, |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
248 |
% Indeed regular expressions are used everywhere |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
249 |
% in a computer user's daily life--for example searching for a string |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
250 |
% in a text document using ctrl-F. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
251 |
% When such regex searching |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
252 |
% a useful and simple way to handle email-processing when the task does |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
253 |
% not require absolute precision. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
254 |
%The same can be said about |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
255 |
%applications like . |
653
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
256 |
%Consequently, they are an indispensible components in text processing tools |
bc5571c38d1f
more updates in section 4.2 and incorporating Christian comments
Chengsong
parents:
652
diff
changeset
|
257 |
%of software systems such as compilers, IDEs, and firewalls. |
668
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
258 |
% The picture gets more blurry when we start to ask questions like |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
259 |
% why |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
260 |
%TODO: Insert corresponding bibliography |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
261 |
%There are many more scenarios in which regular exp% %use cases for regular expressions in computer science, |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
262 |
for example in compilers \cite{LEX}, networking \cite{Snort1999}, |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
263 |
software engineering (IDEs) \cite{Atom} |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
264 |
and operating systems \cite{GREP}, where the correctness |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
265 |
of matching can be crucially important. |
601 | 266 |
|
668
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
267 |
Implementations of regular expression matching out in the wild, however, |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
268 |
are surprisingly unreliable. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
269 |
%are not always reliable as %people think |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
270 |
%they |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
271 |
%should be. %Given the importance of the correct functioning |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
272 |
% Indeed they have been heavily relied on and |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
273 |
% extensively implemented and studied in modern computer systems--whenever you do |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
274 |
% a |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
275 |
%TODO: double check this is true |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
276 |
An example is the regular expresion $(a^*)^*b$ and the string |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
277 |
$aa\ldots a$. The expectation is that any regex engine should |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
278 |
be able to solve this (return a no match) in no time. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
279 |
However, if this |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
280 |
%regular expression and string pair |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
281 |
is tried out in an |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
282 |
regex engine like that of Java 8, the runtime would quickly grow beyond 100 seconds |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
283 |
with just dozens of characters |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
284 |
%in the string. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
285 |
Such behaviour can result in Denial-of-Service (ReDoS) attacks |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
286 |
with minimal resources--just the pair of ``evil'' |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
287 |
regular expression and string. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
288 |
The outage of the web service provider Cloudflare \cite{Cloudflare} |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
289 |
in 2019 \cite{CloudflareOutage} is a recent example |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
290 |
where such issues caused serious negative impact. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
291 |
|
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
292 |
|
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
293 |
The reason why these regex matching engines get so slow |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
294 |
is because they use backtracking or a depth-first-search (DFS) on the |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
295 |
search space of possible matches. They employ backtracking algorithms to |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
296 |
support back-references, a mechanism allowing |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
297 |
expressing languages which repeating an arbitrary long string |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
298 |
such as |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
299 |
$\{ww| w\in \Sigma*\}$. Such a constructs makes matching |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
300 |
NP-complete, for which no known non-backtracking algorithms exist. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
301 |
More modern programming languages' regex engines such as |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
302 |
Rust and GO's do promise linear-time behaviours |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
303 |
with respect to input string, |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
304 |
but they do not support back-references, and often |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
305 |
impose ad-hoc restrictions on the input patterns. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
306 |
More importantly, these promises are purely empirical, |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
307 |
making them prone to future ReDoS attacks and other types of errors. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
308 |
|
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
309 |
The other unreliability of industry regex engines |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
310 |
is that they do not produce the desired output. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
311 |
Kuklewicz have found out during his testing |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
312 |
of practical regex engines that almost all of them are incorrect with respect |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
313 |
to the POSIX standard, a disambiguation strategy adopted most |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
314 |
widely in computer science. Roughly speaking POSIX lexing means to always |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
315 |
go for the longest initial submatch possible, |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
316 |
for instance a single iteration |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
317 |
$aa$ is preferred over two iterations $a,a$ |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
318 |
when matching $(a|aa)^*$ with the string $aa$. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
319 |
The POSIX strategy as summarised by Kuklewicz goes |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
320 |
as follows: |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
321 |
\marginpar{\em Deleted some peripheral specifications.} |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
322 |
\begin{quote} |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
323 |
\begin{itemize} |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
324 |
\item |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
325 |
regular expressions (REs) take the leftmost starting match, and the longest match starting there |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
326 |
earlier subpatterns have leftmost-longest priority over later subpatterns\\ |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
327 |
\item |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
328 |
higher-level subpatterns have leftmost-longest priority over their component subpatterns\\ |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
329 |
\item |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
330 |
$\ldots$ |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
331 |
%REs have right associative concatenation which can be changed with parenthesis\\ |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
332 |
%\item |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
333 |
%parenthesized subexpressions return the match from their last usage\\ |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
334 |
%\item |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
335 |
%text of component subexpressions must be contained in the text of the |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
336 |
%higher-level subexpressions\\ |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
337 |
%\item |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
338 |
%if "p" and "q" can never match the same text then "p|q" and "q|p" are equivalent, up to trivial renumbering of captured subexpressions\\ |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
339 |
%\item |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
340 |
%if "p" in "p*" is used to capture non-empty text then additional repetitions of "p" will not capture an empty string\\ |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
341 |
\end{itemize} |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
342 |
\end{quote} |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
343 |
The author noted that various lexers that come with POSIX operating systems |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
344 |
are rarely correct according to this standard. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
345 |
A test case that exposes the bugs |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
346 |
is the regular expression $(aba|ab|a)^*$ and string $ababa$. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
347 |
A correct match would split the string into $ab, aba$, involving |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
348 |
two repetitions of the Kleene star $(aba|ab|a)^*$. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
349 |
Most regex engines would instead return two (partial) matches |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
350 |
$aba$ and $a$ \footnote{Try it out here: \url{https://regex101.com/r/c5hga5/1}, last accessed 22-Aug-2023}. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
351 |
There are numerous occasions where programmers realised the subtlety and |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
352 |
difficulty to implement POSIX correctly, one such quote from Go's regexp library author: |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
353 |
\footnote{\url{https://pkg.go.dev/regexp\#pkg-overview}, last accessed 22-Aug-2023} |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
354 |
\begin{quote}\it |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
355 |
`` |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
356 |
The POSIX rule is computationally prohibitive |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
357 |
and not even well-defined. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
358 |
'' |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
359 |
\end{quote} |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
360 |
% Being able to formally define and capture the idea of |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
361 |
% POSIX rules and prove |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
362 |
% the correctness of regular expression matching/lexing |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
363 |
% algorithms against the POSIX semantics definitions |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
364 |
% is valuable. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
365 |
These all point towards a formal treatment of |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
366 |
POSIX lexing to clear up inaccuracies and errors |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
367 |
in understanding and implementation of regex. The work presented |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
368 |
in this thesis uses formal proofs to ensure the correctness |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
369 |
and runtime properties |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
370 |
of POSIX regular expression lexing. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
371 |
|
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
372 |
Formal proofs or mechanised proofs are |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
373 |
computer checked programs |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
374 |
%such as the ones written in Isabelle/HOL, is a powerful means |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
375 |
that certify the correctness of facts |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
376 |
with respect to a set of axioms and definitions. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
377 |
% This is done by |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
378 |
% recursively checking that every fact in a proof script |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
379 |
% is either an axiom or a fact that is derived from |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
380 |
% known axioms or verified facts. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
381 |
%The problem of regular expressions lexing and catastrophic backtracking are a suitable problem for such |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
382 |
%methods as their implementation and complexity analysis tend to be error-prone. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
383 |
They provide an unprecendented level of asssurance |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
384 |
that an algorithm will perform as expected under all inputs. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
385 |
We believe such proofs can help eliminate catastrophic backtracking |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
386 |
once and for all. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
387 |
The software systems that help people interactively build and check |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
388 |
formal proofs are called proof assitants. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
389 |
% Many theorem-provers have been developed, such as Mizar, |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
390 |
% Isabelle/HOL, HOL-Light, HOL4, |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
391 |
% Coq, Agda, Idris, Lean and so on. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
392 |
Isabelle/HOL \cite{Isabelle} is a widely-used proof assistant with a simple type theory |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
393 |
and powerful automated proof generators like sledgehammer. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
394 |
We chose to use Isabelle/HOL for its powerful automation |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
395 |
and ease and simplicity in expressing regular expressions and |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
396 |
regular languages. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
397 |
%Some of those employ |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
398 |
%dependent types like Mizar, Coq, Agda, Lean and Idris. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
399 |
%Some support a constructivism approach, such as Coq. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
400 |
|
648 | 401 |
|
668
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
402 |
% Formal proofs on regular expression matching and lexing |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
403 |
% complements the efforts in |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
404 |
% industry which tend to focus on overall speed |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
405 |
% with techniques like parallelization (FPGA paper), tackling |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
406 |
% the problem of catastrophic backtracking |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
407 |
% in an ad-hoc manner (cloudflare and stackexchange article). |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
408 |
The algorithm we work on is based on Brzozowski derivatives. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
409 |
Brzozowski invented the notion of ``derivatives'' on |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
410 |
regular expressions \cite{Brzozowski1964}, and the idea is |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
411 |
that we can reason about what regular expressions can match |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
412 |
by taking derivatives of them. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
413 |
A derivative operation takes a regular expression $r$ and a character |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
414 |
$c$, |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
415 |
and returns a new regular expression $r\backslash c$. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
416 |
The derivative is taken with respect to $c$: |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
417 |
it transforms $r$ in such a way that the resulting derivative |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
418 |
$r\backslash c$ contains all strings in the language of $r$ that |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
419 |
starts with $c$, but now with the head character $c$ thrown away. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
420 |
For example, for $r$ equal to $(aba | ab | a)^*$ |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
421 |
as discussed earlier, its derivative with respect to character $a$ is |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
422 |
\[ |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
423 |
r\backslash a = (ba | b| \ONE)(aba | ab | a)^*. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
424 |
\] |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
425 |
% By applying derivatives repeatedly one can c |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
426 |
Taking derivatives repeatedly with respect to the sequence |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
427 |
of characters in the string $s$, one obtain a regular expression |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
428 |
containing the information of how $r$ matched $s$. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
429 |
|
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
430 |
Brzozowski derivatives are purely algebraic operations |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
431 |
that can be implemented as a recursive function |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
432 |
that does a pattern match on the structure of the regular expression. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
433 |
For example, the derivatives of character regular expressions, |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
434 |
Kleene star and bounded repetitions can be described by the following |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
435 |
code equations: |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
436 |
% For example some clauses |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
437 |
\begin{center} |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
438 |
\begin{tabular}{lcl} |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
439 |
$d \backslash c$ & $\dn$ & |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
440 |
$\mathit{if} \;c = d\;\mathit{then}\;\ONE\;\mathit{else}\;\ZERO$\\ |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
441 |
% $(r_1 + r_2)\backslash c$ & $\dn$ & $r_1 \backslash c \,+\, r_2 \backslash c$\\ |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
442 |
% $(r_1 \cdot r_2)\backslash c$ & $\dn$ & $\mathit{if} \, [] \in L(r_1)$\\ |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
443 |
% & & $\mathit{then}\;(r_1\backslash c) \cdot r_2 \,+\, r_2\backslash c$\\ |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
444 |
% & & $\mathit{else}\;(r_1\backslash c) \cdot r_2$\\ |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
445 |
$(r^*)\backslash c$ & $\dn$ & $(r\backslash c) \cdot r^*$\\ |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
446 |
$r^{\{n\}} \backslash c$ & $\dn$ & $r \backslash c \cdot |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
447 |
r^{\{n-1\}} (\text{when} \; n > 0)$\\ |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
448 |
\end{tabular} |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
449 |
\end{center} |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
450 |
(end of ready work, rest construction site) |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
451 |
In particular, we are working on an improvement of the work |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
452 |
by Ausaf et al. \cite{Ausaf} \cite{AusafDyckhoffUrban2016} |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
453 |
and Sulzmann and Lu \cite{Sulzmann2014}. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
454 |
|
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
455 |
|
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
456 |
The variant of the problem we are looking at centers around |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
457 |
an algorithm (which we call $\blexer$) developed by Sulzmann and Lu \ref{Sulzmann2014}. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
458 |
The reason we chose to look at $\blexer$ and its simplifications |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
459 |
is because it allows a lexical tree to be generated |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
460 |
by some elegant and subtle procedure based on Brzozowski derivatives. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
461 |
The procedures are made of recursive functions and inductive datatypes just like derivatives, |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
462 |
allowing intuitive and concise formal reasoning with theorem provers. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
463 |
Most importantly, $\blexer$ opens up a path to an optimized version |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
464 |
of $\blexersimp$ possibilities to improve |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
465 |
performance with simplifications that aggressively change the structure of regular expressions. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
466 |
While most derivative-based methods |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
467 |
rely on structures to be maintained to allow induction to |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
468 |
go through. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
469 |
For example, Egolf et al. \ref{Verbatim} have developed a verified lexer |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
470 |
with derivatives, but as soon as they started introducing |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
471 |
optimizations such as memoization, they reverted to constructing |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
472 |
DFAs first. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
473 |
Edelmann \ref{Edelmann2020} explored similar optimizations in his |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
474 |
work on verified LL(1) parsing, with additional enhancements with data structures like |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
475 |
zippers. |
648 | 476 |
|
668
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
477 |
%Sulzmann and Lu have worked out an algorithm |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
478 |
%that is especially suited for verification |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
479 |
%which utilized the fact |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
480 |
%that whenever ambiguity occurs one may duplicate a sub-expression and use |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
481 |
%different copies to describe different matching choices. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
482 |
The idea behind the correctness of $\blexer$ is simple: during a derivative, |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
483 |
multiple matches might be possible, where an alternative with multiple children |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
484 |
each corresponding to a |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
485 |
different match is created. In the end of |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
486 |
a lexing process one always picks up the leftmost alternative, which is guarnateed |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
487 |
to be a POSIX value. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
488 |
This is done by consistently keeping sub-regular expressions in an alternative |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
489 |
with longer submatches |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
490 |
to the left of other copies ( |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
491 |
Remember that POSIX values are roughly the values with the longest inital |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
492 |
submatch). |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
493 |
The idea behind the optimized version of $\blexer$, which we call $\blexersimp$, |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
494 |
is that since we only take the leftmost copy, then all re-occurring copies can be |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
495 |
eliminated without losing the POSIX property, and this can be done with |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
496 |
children of alternatives at different levels by merging them together. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
497 |
Proving $\blexersimp$ requires a different |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
498 |
proof strategy compared to that by Ausaf \cite{FahadThesis}. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
499 |
We invent a rewriting relation as an |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
500 |
inductive predicate which captures |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
501 |
a strong enough invariance that ensures correctness, |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
502 |
which commutes with the derivative operation. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
503 |
This predicate allows a simple |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
504 |
induction on the input string to go through. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
505 |
|
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
506 |
%This idea has been repeatedly used in different variants of lexing |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
507 |
%algorithms in their paper, one of which involves bit-codes. The bit-coded |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
508 |
%derivative-based algorithm even allows relatively aggressive |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
509 |
%%simplification rules which cause |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
510 |
%structural changes that render the induction used in the correctness |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
511 |
%proofs unusable. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
512 |
%More details will be given in \ref{Bitcoded2} including the |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
513 |
%new correctness proof which involves a new inductive predicate which allows |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
514 |
%rule induction to go through. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
515 |
|
648 | 516 |
|
668
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
517 |
\marginpar{20230821 Addition: high-level idea in context} |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
518 |
To summarise, we expect modern regex matching and lexing engines |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
519 |
to be reliabe, fast and correct, and support rich syntax constructs |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
520 |
like bounded repetitions and back references. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
521 |
Backtracking regex engines have exhibited exponential |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
522 |
worst-case behaviours (catastrophic backtracking) |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
523 |
for employing a depth-first-search on the search tree of possible matches, |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
524 |
and complexity analysis about them also takes worst-case exponential |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
525 |
time (ref Minamide and Birmingham work). |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
526 |
%Expand notes: JFLEX generates DFAs, cannot support backrefs |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
527 |
%Expand notes: Java 8, python supports pcre, but is exponential |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
528 |
%Expand notes: more modern prog langs like GO, Rust claims linear, and forbids large counters and backrefs |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
529 |
%The work described in this thesis is part of an ongoing |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
530 |
%project which aims to build a formally verified lexer satisfying all these |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
531 |
To ensure the correctness and predictable behaviour of lexical analysis, |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
532 |
we propose to |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
533 |
build a formally verified lexer that satisfy correctness and non-backtracking |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
534 |
requirements in a bottom-up manner using Brzozowski derivatives. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
535 |
We build on the line of work by Ausaf et al. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
536 |
A derivative-based matching algorithm avoids backtracking, by explicitly |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
537 |
representing possible matches on the same level of a search tree |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
538 |
as regular expression terms |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
539 |
in a breadth-first manner. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
540 |
Efficiency of such algorithms rely on limiting the size |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
541 |
of the derivatives during the computation, for example by |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
542 |
eliminating redundant |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
543 |
terms that lead to identical matches. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
544 |
|
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
545 |
|
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
546 |
The end goal would be a lexer that comes with additional formal guarantees |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
547 |
on computational complexity and actual speed competent with other |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
548 |
unverified regex engines. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
549 |
We will report in the next section the outcome of this project |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
550 |
so far and the contribution with respect to other works in |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
551 |
lexical analysis and theorem proving. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
552 |
|
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
553 |
|
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
554 |
Derivatives on regular expressions are popular in the theorem proving community |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
555 |
because their algebraic features are amenable to formal reasoning. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
556 |
As a testimony for this, there exists a sizable number of formalisations of Brzozowski derivatives and |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
557 |
regular expressions in different theorem proving languages. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
558 |
%Expand notes: verified lexers: proof-of-concept: Ruberio, Egolf, fast but no formal proof in a proof assistant: Magnus, |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
559 |
|
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
560 |
|
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
561 |
|
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
562 |
% The study of regular expressions is ongoing due to an |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
563 |
% issue known as catastrophic backtracking. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
564 |
% This phenomenon refers to scenarios in which the regular expression |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
565 |
|
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
566 |
|
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
567 |
% One cause of catastrophic backtracking lies within the |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
568 |
% ambiguities of lexing.\marginpar{rephrased "the origin of catastrophic ...} |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
569 |
% In the process of matching a multi-character string with |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
570 |
% a regular expression that encompasses several sub-expressions, |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
571 |
% different positions can be designated to mark |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
572 |
% the boundaries of corresponding substrings of the sub-expressions. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
573 |
% For instance, in matching the string $aaa$ with the |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
574 |
% regular expression $(a+aa)^*$, the divide between |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
575 |
% the initial match and the subsequent iteration could either be |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
576 |
% set between the first and second characters ($a | aa$) or |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
577 |
% between the second and third characters ($aa | a$). |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
578 |
% As both the length of the input string and the structural complexity |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
579 |
% of the regular expression increase, the number of potential delimiter |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
580 |
% combinations can grow exponentially, leading to a corresponding increase |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
581 |
% in complexity for algorithms that do not optimally navigate these possibilities. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
582 |
|
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
583 |
% Catastrophic backtracking facilitates a category of computationally inexpensive attacks known as Regular Expression Denial of Service (ReDoS) attacks. Here, an attacker merely dispatches a small attack string to a server, provoking high-complexity behaviours in the server's regular expression engine. Such attacks, whether intentional or accidental, have led to the failure of real-world systems (additional details can be found in the practical overview section in chapter \ref{Introduction}). |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
584 |
|
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
585 |
% Various disambiguation strategies are |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
586 |
% employed to select sub-matches, notably including Greedy and POSIX strategies. POSIX, the strategy most widely adopted in practice, invariably opts for the longest possible sub-match. Kuklewicz \cite{KuklewiczHaskell}, for instance, offers a descriptive definition of the POSIX rule in section 1, last paragraph: |
601 | 587 |
|
588 |
||
648 | 589 |
%Regular expressions |
590 |
%have been extensively studied and |
|
591 |
%implemented since their invention in 1940s. |
|
592 |
%It is primarily used in lexing, where an unstructured input string is broken |
|
593 |
%down into a tree of tokens. |
|
594 |
%That tree's construction is guided by the shape of the regular expression. |
|
595 |
%This is particularly useful in expressing the shape of a string |
|
596 |
%with many fields, where certain fields might repeat or be omitted. |
|
597 |
%Regular expression matchers and Lexers allow us to |
|
598 |
%identify and delimit different subsections of a string and potentially |
|
599 |
%extract information from inputs, making them |
|
600 |
%an indispensible component in modern software systems' text processing tasks |
|
601 |
%such as compilers, IDEs, and firewalls. |
|
602 |
%Research on them is far from over due to the well-known issue called catastrophic-backtracking, |
|
603 |
%which means the regular expression matching or lexing engine takes an unproportional time to run |
|
604 |
%despite the input and the expression being relatively simple. |
|
605 |
% |
|
606 |
%Catastrophic backtracking stems from the ambiguities of lexing: |
|
607 |
%when matching a multiple-character string with a regular |
|
608 |
%exression that includes serveral sub-expressions, there might be different positions to set |
|
609 |
%the border between sub-expressions' corresponding sub-strings. |
|
610 |
%For example, matching the string $aaa$ against the regular expression |
|
611 |
%$(a+aa)^*$, the border between the initial match and the second iteration |
|
612 |
%could be between the first and second character ($a | aa$) |
|
613 |
%or between the second and third character ($aa | a$). |
|
614 |
%As the size of the input string and the structural complexity |
|
615 |
%of the regular expression increase, |
|
616 |
%the number of different combinations of delimiters can grow exponentially, and |
|
617 |
%algorithms that explore these possibilities unwisely will also see an exponential complexity. |
|
618 |
% |
|
619 |
%Catastrophic backtracking allow a class of computationally inexpensive attacks called |
|
620 |
%Regular expression Denial of Service attacks (ReDoS), in which the hacker |
|
621 |
%simply sends out a small attack string to a server, |
|
622 |
%triggering high-complexity behaviours in its regular expression engine. |
|
623 |
%These attacks, be it deliberate or not, have caused real-world systems to go down (see more |
|
624 |
%details of this in the practical overview section in chapter \ref{Introduction}). |
|
625 |
%There are different disambiguation strategies to select sub-matches, most notably Greedy and POSIX. |
|
626 |
%The widely adopted strategy in practice is POSIX, which always go for the longest sub-match possible. |
|
627 |
%There have been prose definitions like the following |
|
628 |
%by Kuklewicz \cite{KuklewiczHaskell}: |
|
629 |
%described the POSIX rule as (section 1, last paragraph): |
|
668
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
630 |
|
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
631 |
|
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
632 |
|
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
633 |
|
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
634 |
|
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
635 |
%first character is removed |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
636 |
%state of the automaton after matching that character |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
637 |
%where nodes are represented as |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
638 |
%a sub-expression (for example tagged NFA |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
639 |
%Working on regular expressions |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
640 |
%Because of these problems with automata, we prefer regular expressions |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
641 |
%and derivatives rather than an automata (or graph-based) approach which explicitly converts between |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
642 |
%the regular expression and a different data structure. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
643 |
% |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
644 |
% |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
645 |
%The key idea |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
646 |
|
649 | 647 |
|
646 | 648 |
|
668
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
649 |
%Regular expressions are widely used in computer science: |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
650 |
%be it in text-editors \parencite{atomEditor} with syntax highlighting and auto-completion; |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
651 |
%command-line tools like $\mathit{grep}$ that facilitate easy |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
652 |
%text-processing \cite{grep}; network intrusion |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
653 |
%detection systems that inspect suspicious traffic; or compiler |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
654 |
%front ends. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
655 |
%Given their usefulness and ubiquity, one would assume that |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
656 |
%modern regular expression matching implementations |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
657 |
%are mature and fully studied. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
658 |
%Indeed, in many popular programming languages' regular expression engines, |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
659 |
%one can supply a regular expression and a string, and in most cases get |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
660 |
%get the matching information in a very short time. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
661 |
%Those engines can sometimes be blindingly fast--some |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
662 |
%network intrusion detection systems |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
663 |
%use regular expression engines that are able to process |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
664 |
%hundreds of megabytes or even gigabytes of data per second \parencite{Turo_ov__2020}. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
665 |
%However, those engines can sometimes also exhibit a surprising security vulnerability |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
666 |
%under a certain class of inputs. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
667 |
%However, , this is not the case for $\mathbf{all}$ inputs. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
668 |
%TODO: get source for SNORT/BRO's regex matching engine/speed |
646 | 669 |
|
670 |
||
668
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
671 |
%---------------------------------------------------------------------------------------- |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
672 |
\section{Contribution} |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
673 |
%{\color{red} \rule{\linewidth}{0.5mm}} |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
674 |
%\textbf{issue with this part: way too short, not enough details of what I have done.} |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
675 |
%{\color{red} \rule{\linewidth}{0.5mm}} |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
676 |
\marginpar{\em Gerog: No sufficient context on related work making contribution hard to digest.} |
649 | 677 |
|
668
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
678 |
|
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
679 |
\marginpar{introducing formalisations on regex matching} |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
680 |
There have been many formalisations in the theorem-proving community |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
681 |
about regular expressions and lexing. |
649 | 682 |
One flavour is to build from the regular expression an automaton, and determine |
683 |
acceptance in terms of the resulting |
|
684 |
state after executing the input string on that automaton. |
|
685 |
Automata formalisations are in general harder and more cumbersome to deal |
|
686 |
with for theorem provers than working directly on regular expressions. |
|
687 |
One such example is by Nipkow \cite{Nipkow1998}. |
|
688 |
%They |
|
689 |
%made everything recursive (for example the next state function), |
|
690 |
As a core idea, they |
|
691 |
used a list of booleans to name each state so that |
|
692 |
after composing sub-automata together, renaming the states to maintain |
|
693 |
the distinctness of each state is recursive and simple. |
|
694 |
The result was the obvious lemmas incorporating |
|
695 |
``a painful amount of detail'' in their formalisation. |
|
696 |
Sometimes the automata are represented as graphs. |
|
697 |
But graphs are not inductive datatypes. |
|
698 |
Having to set the induction principle on the number of nodes |
|
699 |
in a graph makes formal reasoning non-intuitive and convoluted, |
|
700 |
resulting in large formalisations \cite{Lammich2012}. |
|
701 |
When combining two graphs, one also needs to make sure that the nodes in |
|
702 |
both graphs are distinct, which almost always involve |
|
703 |
renaming of the nodes. |
|
704 |
A theorem-prover which provides dependent types such as Coq |
|
705 |
can alleviate the issue of representing graph nodes |
|
706 |
\cite{Doczkal2013}. There the representation of nodes is made |
|
707 |
easier by the use of $\textit{FinType}$. |
|
708 |
Another representation for automata are matrices. |
|
709 |
But the induction for them is not as straightforward either. |
|
710 |
There are some more clever representations, for example one by Paulson |
|
711 |
using hereditarily finite sets \cite{Paulson2015}. |
|
712 |
There the problem with combining graphs can be solved better. |
|
713 |
%but we believe that such clever tricks are not very obvious for |
|
714 |
%the John-average-Isabelle-user. |
|
715 |
||
716 |
The approach that operates directly on regular expressions circumvents the problem of |
|
717 |
conversion between a regular expression and an automaton, thereby avoiding representation |
|
718 |
problems altogether, despite that regular expressions may be seen as a variant of a |
|
719 |
non-deterministic finite automaton (ref Laurikari tagged NFA paper). |
|
720 |
To matching a string, a sequence of algebraic transformations called |
|
721 |
(Brzozowski) $\textit{derivatives}$ (ref Brzozowski) is carried out on that regular expression. |
|
722 |
Each derivative takes a character and a regular expression, |
|
723 |
and returns a new regular expression whose language is closely related to |
|
724 |
the original regular expression's language: |
|
725 |
strings prefixed with that input character will have their head removed |
|
726 |
and strings not prefixed |
|
727 |
with that character will be eliminated. |
|
728 |
After taking derivatives with respect to all the characters the string is |
|
729 |
exhausted. Then an algorithm checks whether the empty string is in that final |
|
730 |
regular expression's language. |
|
731 |
If so, a match exists and the string is in the language of the input regular expression. |
|
732 |
||
733 |
Again this process can be seen as the simulation of an NFA running on a string, |
|
734 |
but the recursive nature of the datatypes and functions involved makes |
|
735 |
derivatives a perfect object of study for theorem provers. |
|
736 |
That is why there has been numerous formalisations of regular expressions |
|
737 |
and Brzozowski derivatives in the functional programming and |
|
738 |
theorem proving community (a large list of refs to derivatives formalisation publications). |
|
739 |
Ribeiro and Du Bois \cite{RibeiroAgda2017} have |
|
740 |
formalised the notion of bit-coded regular expressions |
|
741 |
and proved their relations with simple regular expressions in |
|
742 |
the dependently-typed proof assistant Agda. |
|
743 |
They also proved the soundness and completeness of a matching algorithm |
|
744 |
based on the bit-coded regular expressions. Their algorithm is a decision procedure |
|
745 |
that gives a Yes/No answer, which does not produce |
|
746 |
lexical values. |
|
747 |
%X also formalised derivatives and regular expressions, producing "parse trees". |
|
748 |
%(Some person who's a big name in formal methods) |
|
749 |
||
750 |
||
532 | 751 |
|
664 | 752 |
%In this thesis, |
753 |
%we propose a solution to catastrophic |
|
754 |
%backtracking and error-prone matchers: a formally verified |
|
755 |
%regular expression lexing algorithm |
|
756 |
%that is both fast |
|
757 |
%and correct. |
|
758 |
%%{\color{red} \rule{\linewidth}{0.5mm}} |
|
759 |
%\HandRight Added content: |
|
646 | 760 |
%Package \verb`pifont`: \ding{43} |
761 |
%Package \verb`utfsym`: \usym{261E} |
|
762 |
%Package \verb`dingbat`: \leftpointright |
|
763 |
%Package \verb`dingbat`: \rightpointright |
|
664 | 764 |
We have made mainly two contributions in this thesis: %the |
765 |
%lexer we developed based on Brzozowski derivatives and |
|
766 |
%Sulzmanna and Lu's developments called |
|
665
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
767 |
proving the lexer $\blexersimp$ is both i) correctness and ii)fast. |
668
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
768 |
It is correct w.r.t a formalisation of POSIX lexing by Ausaf et al.\cite{AusafDyckhoffUrban2016}. |
664 | 769 |
It is fast compared with un-optimised implementations like Sulzmann and Lu's orignal |
770 |
development by our metric of internal data structures not growing unbounded. |
|
668
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
771 |
\subsection{Theorem Proving with Derivatives} |
664 | 772 |
|
668
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
773 |
Ribeiro and Du Bois \cite{RibeiroAgda2017} have |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
774 |
formalised the notion of bit-coded regular expressions |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
775 |
and proved their relations with simple regular expressions in |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
776 |
the dependently-typed proof assistant Agda. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
777 |
They also proved the soundness and completeness of a matching algorithm |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
778 |
based on the bit-coded regular expressions. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
779 |
Ausaf et al. \cite{AusafDyckhoffUrban2016} |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
780 |
are the first to |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
781 |
give a quite simple formalised POSIX |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
782 |
specification in Isabelle/HOL, and also prove |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
783 |
that their specification coincides with an earlier (unformalised) |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
784 |
POSIX specification given by Okui and Suzuki \cite{Okui10}. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
785 |
They then formally proved the correctness of |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
786 |
a lexing algorithm by Sulzmann and Lu \cite{Sulzmann2014} |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
787 |
with regards to that specification. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
788 |
They also found that the informal POSIX |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
789 |
specification by Sulzmann and Lu needs to be substantially |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
790 |
revised in order for the correctness proof to go through. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
791 |
Their original specification and proof were unfixable |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
792 |
according to Ausaf et al. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
793 |
|
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
794 |
|
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
795 |
|
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
796 |
\subsection{Complexity Results} |
664 | 797 |
Our formalisation of complexity is unique among similar works in the sense that |
798 |
%is about the size of internal data structures. |
|
799 |
to our knowledge %we don't know of a |
|
665
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
800 |
there are not other certified |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
801 |
lexing/parsing algorithms with similar data structure size bound theorems. |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
802 |
Common practices involve making empirical analysis of the complexity of the algorithm |
668
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
803 |
in question (\cite{Verbatim}, \cite{Verbatimpp}), or relying |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
804 |
on prior (unformalised) complexity analysis of well-known algorithms (\cite{ValiantParsing}), |
664 | 805 |
making them prone to complexity bugs. |
668
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
806 |
|
664 | 807 |
%TODO: add citation |
668
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
808 |
%, for example in the Verbatim work \cite{Verbatim} |
664 | 809 |
|
810 |
%While formalised proofs have not included |
|
811 |
%Issues known as "performance bugs" can |
|
812 |
Whilst formalised complexity theorems |
|
813 |
have not yet appeared in other certified lexers/parsers, |
|
814 |
%while this work is done, |
|
815 |
they do find themselves in the broader theorem-proving literature: |
|
816 |
\emph{time credits} have been formalised for separation logic in Coq |
|
668
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
817 |
\cite{atkey2010amortised} |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
818 |
%not used in |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
819 |
to characterise the runtime complexity of union-find. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
820 |
%where integer values are recorded %from the beginning of an execution |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
821 |
%as part of the program state |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
822 |
%and decremented in each step. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
823 |
The idea is that the total number of instructions executed |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
824 |
is equal to the time credits consumed |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
825 |
during the execution of an algorithm. |
664 | 826 |
%each time a basic step is executed. |
827 |
%The way to use a time credit Time credit is an integer |
|
828 |
%is assigned to a program prior to execution, and each step in the program consumes |
|
829 |
%one credit. |
|
668
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
830 |
Arma{\"e}l et al. \cite{bigOImperative} have extended the framework to allow expressing time |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
831 |
credits using big-O notations, |
664 | 832 |
so one can prove both the functional correctness and asymptotic complexity |
668
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
833 |
of higher-order imperative algorithms. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
834 |
A number of formal proofs also exist for some other |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
835 |
algorithms in Coq \cite{subtypingFormalComplexity}. |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
836 |
|
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
837 |
The big-O notation have also been formalised in Isabelle |
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
838 |
\cite{bigOIsabelle} |
664 | 839 |
%for example the complexity of |
840 |
%the Quicksort algorithm |
|
841 |
%is $\Theta n\ln n$ |
|
665
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
842 |
\marginpar{more work on formalising complexity}. |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
843 |
%Our next step is to leverage these frameworks |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
844 |
%It is a precursor to our |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
845 |
Our work focuses on the space complexity of the algorithm under our notion of the size of |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
846 |
a regular expression. |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
847 |
Despite not being a direct asymptotic time complexity proof, |
668
3831621d7b14
added technical Overview section, almost done introduction
Chengsong
parents:
666
diff
changeset
|
848 |
our result is an important stepping stone towards one. |
666 | 849 |
|
664 | 850 |
|
665
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
851 |
|
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
852 |
Brzozowski showed that there are finitely many similar deriviatives, |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
853 |
where similarity is defined in terms of ACI equations. |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
854 |
This allows him to use derivatives as a basis for DFAs where each state is |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
855 |
labelled with a derivative. |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
856 |
However, Brzozowski did not show anything about |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
857 |
the size of the derivatives. |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
858 |
Antimirov showed that there can only be finitely |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
859 |
many partial derivatives for a regular expression and any set of |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
860 |
strings. He showed that the number is actually the |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
861 |
``alphabetical width'' plus 1. |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
862 |
From this result one can relatively easily establish |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
863 |
that the size of the partial derivatives is |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
864 |
no bigger than $(\textit{size} \; r)^3$ for every string. |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
865 |
Unfortunately this result does not seem to carry over to our |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
866 |
setting because partial derivatives have the simplification |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
867 |
\begin{equation}\label{eq:headSplitRule} |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
868 |
(r_1 + r_2) \cdot r_3 \rightarrow (r_1 \cdot r_3) + (r_2 \cdot r_3) |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
869 |
\end{equation} |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
870 |
built in. We cannot have this because otherwise we would |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
871 |
lose the POSIX property. For instance, the lexing result of |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
872 |
regular expression |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
873 |
\[ |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
874 |
(a+ab)\cdot(bc+c) |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
875 |
\] |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
876 |
with respect to string $abc$ using our lexer with the simplification rule \ref{eq:headSplitRule} |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
877 |
would be |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
878 |
\[ |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
879 |
\Left (\Seq \; (\Char \; a), \Seq (\Char \; b) \; (\Char \; c) ) |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
880 |
\] |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
881 |
instead of the correct POSIX value |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
882 |
\[ |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
883 |
\Seq \; (\Right \; (\Seq \; (\Char \; a) \; (\Char \; b)) ) \; (\Char \;) |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
884 |
\] |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
885 |
Our result about the finite bound also does not say anything about the number of derivatives. |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
886 |
In fact there are infinitely many derivatives in general |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
887 |
because in the annotated regular expression for STAR we record the |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
888 |
number of iterations. What our result shows that the size of |
3bedbdce3a3b
a bit more intro, incorporating Christian chat messages
Chengsong
parents:
664
diff
changeset
|
889 |
the derivatives is bounded, not the number. |
664 | 890 |
\marginpar{new changes up to this point.} |
891 |
||
646 | 892 |
|
893 |
In particular, the main problem we solved on top of previous work was |
|
894 |
coming up with a formally verified algorithm called $\blexersimp$ based |
|
895 |
on Brzozowski derivatives. It calculates a POSIX |
|
896 |
lexical value from a string and a regular expression. This algorithm was originally |
|
897 |
by Sulzmann and Lu \cite{Sulzmann2014}, but we made the key observation that its $\textit{nub}$ |
|
898 |
function does not really simplify intermediate results where it needs to and improved the |
|
899 |
algorithm accordingly. |
|
900 |
We have proven our $\blexersimp$'s internal data structure does not grow beyond a constant $N_r$ |
|
901 |
depending on the input regular expression $r$, thanks to the aggressive simplifications of $\blexersimp$: |
|
902 |
\begin{theorem} |
|
903 |
$|\blexersimp \; r \; s | \leq N_r$ |
|
904 |
\end{theorem} |
|
905 |
The simplifications applied in each step of $\blexersimp$ |
|
906 |
||
907 |
\begin{center} |
|
908 |
$\blexersimp |
|
909 |
$ |
|
910 |
\end{center} |
|
911 |
keeps the derivatives small, but presents a |
|
912 |
challenge |
|
913 |
||
914 |
||
915 |
establishing a correctness theorem of the below form: |
|
916 |
%TODO: change this to "theorem to prove" |
|
917 |
\begin{theorem} |
|
918 |
If the POSIX lexical value of matching regular expression $r$ with string $s$ is $v$, |
|
919 |
then $\blexersimp\; r \; s = \Some \; v$. Otherwise |
|
920 |
$\blexersimp \; r \; s = \None$. |
|
921 |
\end{theorem} |
|
922 |
||
923 |
||
924 |
||
925 |
||
638 | 926 |
The result is %a regular expression lexing algorithm that comes with |
538 | 927 |
\begin{itemize} |
928 |
\item |
|
609 | 929 |
an improved version of Sulzmann and Lu's bit-coded algorithm using |
930 |
derivatives with simplifications, |
|
931 |
accompanied by |
|
932 |
a proven correctness theorem according to POSIX specification |
|
933 |
given by Ausaf et al. \cite{AusafDyckhoffUrban2016}, |
|
934 |
\item |
|
935 |
a complexity-related property for that algorithm saying that the |
|
936 |
internal data structure will |
|
638 | 937 |
remain below a finite bound, |
538 | 938 |
\item |
638 | 939 |
and an extension to |
940 |
the bounded repetition constructs with the correctness and finiteness property |
|
609 | 941 |
maintained. |
622 | 942 |
\end{itemize} |
943 |
\noindent |
|
646 | 944 |
{\color{red} \rule{\linewidth}{0.5mm}} |
609 | 945 |
With a formal finiteness bound in place, |
946 |
we can greatly reduce the attack surface of servers in terms of ReDoS attacks. |
|
631 | 947 |
The Isabelle/HOL code for our formalisation can be |
948 |
found at |
|
949 |
\begin{center} |
|
950 |
\url{https://github.com/hellotommmy/posix} |
|
951 |
\end{center} |
|
609 | 952 |
Further improvements to the algorithm with an even stronger version of |
631 | 953 |
simplification can be made. We conjecture that the resulting size of derivatives |
954 |
can be bounded by a cubic bound w.r.t. the size of the regular expression. |
|
638 | 955 |
We will give relevant code in Scala, |
956 |
but do not give a formal proof for that in Isabelle/HOL. |
|
957 |
This is still future work. |
|
532 | 958 |
|
959 |
||
960 |
\section{Structure of the thesis} |
|
652 | 961 |
\marginpar{\em This is a marginal note.} |
962 |
Before talking about the formal proof of $\blexersimp$'s |
|
963 |
correctness, which is the main contribution of this thesis, |
|
964 |
we need to introduce two formal proofs which belong |
|
965 |
to Ausafe et al. |
|
622 | 966 |
In chapter \ref{Inj} we will introduce the concepts |
532 | 967 |
and notations we |
622 | 968 |
use for describing regular expressions and derivatives, |
638 | 969 |
and the first version of Sulzmann and Lu's lexing algorithm without bitcodes (including |
622 | 970 |
its correctness proof). |
971 |
We will give their second lexing algorithm with bitcodes in \ref{Bitcoded1} |
|
972 |
together with the correctness proof by Ausaf and Urban. |
|
973 |
Then we illustrate in chapter \ref{Bitcoded2} |
|
974 |
how Sulzmann and Lu's |
|
638 | 975 |
simplifications fail to simplify correctly. We therefore introduce our own version of the |
976 |
algorithm with correct simplifications and |
|
977 |
their correctness proof. |
|
622 | 978 |
In chapter \ref{Finite} we give the second guarantee |
532 | 979 |
of our bitcoded algorithm, that is a finite bound on the size of any |
631 | 980 |
regular expression's derivatives. |
981 |
We also show how one can extend the |
|
982 |
algorithm to include bounded repetitions. |
|
622 | 983 |
In chapter \ref{Cubic} we discuss stronger simplification rules which |
638 | 984 |
improve the finite bound to a cubic bound. %and the NOT regular expression. |
637 | 985 |
Chapter \ref{RelatedWork} introduces relevant work for this thesis. |
638 | 986 |
Chapter \ref{Future} concludes and mentions avenues of future research. |
532 | 987 |
|
988 |
||
989 |
||
990 |
||
991 |
||
992 |
%---------------------------------------------------------------------------------------- |
|
993 |
||
994 |
||
995 |
%---------------------------------------------------------------------------------------- |
|
996 |
||
997 |
%---------------------------------------------------------------------------------------- |
|
998 |
||
999 |
%---------------------------------------------------------------------------------------- |
|
1000 |
||
1001 |