532
|
1 |
% Chapter 1
|
|
2 |
|
|
3 |
\chapter{Introduction} % Main chapter title
|
|
4 |
|
|
5 |
\label{Introduction} % For referencing the chapter elsewhere, use \ref{Introduction}
|
|
6 |
|
|
7 |
%----------------------------------------------------------------------------------------
|
|
8 |
|
|
9 |
% Define some commands to keep the formatting separated from the content
|
|
10 |
\newcommand{\keyword}[1]{\textbf{#1}}
|
|
11 |
\newcommand{\tabhead}[1]{\textbf{#1}}
|
|
12 |
\newcommand{\code}[1]{\texttt{#1}}
|
|
13 |
\newcommand{\file}[1]{\texttt{\bfseries#1}}
|
|
14 |
\newcommand{\option}[1]{\texttt{\itshape#1}}
|
|
15 |
|
|
16 |
%boxes
|
|
17 |
\newcommand*{\mybox}[1]{\framebox{\strut #1}}
|
|
18 |
|
|
19 |
%\newcommand{\sflataux}[1]{\textit{sflat}\_\textit{aux} \, #1}
|
|
20 |
\newcommand\sflat[1]{\llparenthesis #1 \rrparenthesis }
|
|
21 |
\newcommand{\ASEQ}[3]{\textit{ASEQ}_{#1} \, #2 \, #3}
|
543
|
22 |
% \bderssimp{a}{b} typesets  a \_{bsimps} b  (backslash operator with a "bsimps" subscript).
% The subscript is wrapped in \mathit so the multi-letter name is set as one word
% rather than as a product of single-letter math variables.
\newcommand{\bderssimp}[2]{#1 \backslash_{\mathit{bsimps}} #2}
|
596
|
23 |
% \rderssimp{a}{b} typesets  a \_{rsimps} b  (backslash operator with an "rsimps" subscript).
% The subscript is wrapped in \mathit so the multi-letter name is set as one word
% rather than as a product of single-letter math variables.
\newcommand{\rderssimp}[2]{#1 \backslash_{\mathit{rsimps}} #2}
|
564
|
24 |
\def\derssimp{\textit{ders}\_\textit{simp}}
|
557
|
25 |
\def\rders{\textit{rders}}
|
532
|
26 |
\newcommand{\bders}[2]{#1 \backslash #2}
|
|
27 |
\newcommand{\bsimp}[1]{\textit{bsimp}(#1)}
|
591
|
28 |
\def\bsimps{\textit{bsimp}}
|
554
|
29 |
\newcommand{\rsimp}[1]{\textit{rsimp}\; #1}
|
532
|
30 |
\newcommand{\sflataux}[1]{\llparenthesis #1 \rrparenthesis'}
|
|
31 |
\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%
|
|
32 |
\newcommand{\denote}{\stackrel{\mbox{\scriptsize denote}}{=}}%
|
|
33 |
% Bold 0, used as a regular-expression constant in the grammar of this chapter.
% \normalfont\bfseries is what the obsolete two-letter switch \bf expands to in text mode,
% so the output is unchanged while the deprecated command is avoided.
\newcommand{\ZERO}{\mbox{\normalfont\bfseries 0}}
|
|
34 |
% Bold 1, used as a regular-expression constant in the grammar of this chapter.
% \normalfont\bfseries is what the obsolete two-letter switch \bf expands to in text mode,
% so the output is unchanged while the deprecated command is avoided.
\newcommand{\ONE}{\mbox{\normalfont\bfseries 1}}
|
|
35 |
\newcommand{\AALTS}[2]{\oplus {\scriptstyle #1}\, #2}
|
555
|
36 |
\newcommand{\rdistinct}[2]{\textit{rdistinct} \;\; #1 \;\; #2}
|
594
|
37 |
\def\rdistincts{\textit{rdistinct}}
|
556
|
38 |
\def\rDistinct{\textit{rdistinct}}
|
532
|
39 |
\newcommand\hflat[1]{\llparenthesis #1 \rrparenthesis_*}
|
|
40 |
\newcommand\hflataux[1]{\llparenthesis #1 \rrparenthesis_*'}
|
|
41 |
\newcommand\createdByStar[1]{\textit{createdByStar}(#1)}
|
|
42 |
|
|
43 |
\newcommand\myequiv{\mathrel{\stackrel{\makebox[0pt]{\mbox{\normalfont\tiny equiv}}}{=}}}
|
|
44 |
|
600
|
45 |
\def\SEQ{\textit{SEQ}}
|
|
46 |
\def\SEQs{\textit{SEQs}}
|
564
|
47 |
\def\case{\textit{case}}
|
554
|
48 |
\def\sequal{\stackrel{\mbox{\scriptsize rsimp}}{=}}
|
|
49 |
% Name "rsimp" with subscript ALTS; the subscript is set with \mathit so that the
% four letters read as one identifier instead of a product of variables.
\def\rsimpalts{\textit{rsimp}_{\mathit{ALTS}}}
|
|
50 |
\def\good{\textit{good}}
|
|
51 |
\def\btrue{\textit{true}}
|
|
52 |
\def\bfalse{\textit{false}}
|
542
|
53 |
\def\bnullable{\textit{bnullable}}
|
543
|
54 |
\def\bnullables{\textit{bnullables}}
|
538
|
55 |
\def\Some{\textit{Some}}
|
|
56 |
\def\None{\textit{None}}
|
537
|
57 |
% NOTE: this \def silently replaces the one-argument \code{...} (\texttt) that was
% declared with \newcommand earlier in this file; from here on \code takes no
% argument and prints the italic word "code".
\def\code{\textit{code}}
|
532
|
58 |
\def\decode{\textit{decode}}
|
|
59 |
\def\internalise{\textit{internalise}}
|
|
60 |
\def\lexer{\mathit{lexer}}
|
|
61 |
\def\mkeps{\textit{mkeps}}
|
557
|
62 |
\newcommand{\rder}[2]{#2 \backslash_r #1}
|
532
|
63 |
|
585
|
64 |
\def\rerases{\textit{rerase}}
|
|
65 |
|
554
|
66 |
\def\nonnested{\textit{nonnested}}
|
532
|
67 |
\def\AZERO{\textit{AZERO}}
|
558
|
68 |
\def\sizeNregex{\textit{sizeNregex}}
|
532
|
69 |
\def\AONE{\textit{AONE}}
|
|
70 |
\def\ACHAR{\textit{ACHAR}}
|
|
71 |
|
585
|
72 |
% Name "simp" with subscript Sulz; the subscript is set with \mathit so that the
% letters read as one identifier instead of a product of variables.
\def\simpsulz{\textit{simp}_{\mathit{Sulz}}}
|
|
73 |
|
557
|
74 |
\def\scfrewrites{\stackrel{*}{\rightsquigarrow_{scf}}}
|
555
|
75 |
\def\frewrite{\rightsquigarrow_f}
|
|
76 |
\def\hrewrite{\rightsquigarrow_h}
|
|
77 |
\def\grewrite{\rightsquigarrow_g}
|
|
78 |
\def\frewrites{\stackrel{*}{\rightsquigarrow_f}}
|
|
79 |
\def\hrewrites{\stackrel{*}{\rightsquigarrow_h}}
|
|
80 |
\def\grewrites{\stackrel{*}{\rightsquigarrow_g}}
|
538
|
81 |
\def\fuse{\textit{fuse}}
|
|
82 |
\def\bder{\textit{bder}}
|
542
|
83 |
\def\der{\textit{der}}
|
532
|
84 |
\def\POSIX{\textit{POSIX}}
|
|
85 |
\def\ALTS{\textit{ALTS}}
|
|
86 |
\def\ASTAR{\textit{ASTAR}}
|
|
87 |
\def\DFA{\textit{DFA}}
|
538
|
88 |
\def\NFA{\textit{NFA}}
|
532
|
89 |
\def\bmkeps{\textit{bmkeps}}
|
543
|
90 |
\def\bmkepss{\textit{bmkepss}}
|
532
|
91 |
\def\retrieve{\textit{retrieve}}
|
|
92 |
\def\blexer{\textit{blexer}}
|
|
93 |
\def\flex{\textit{flex}}
|
573
|
94 |
\def\inj{\textit{inj}}
|
564
|
95 |
\def\Empty{\textit{Empty}}
|
567
|
96 |
\def\Left{\textit{Left}}
|
|
97 |
\def\Right{\textit{Right}}
|
573
|
98 |
\def\Stars{\textit{Stars}}
|
|
99 |
\def\Char{\textit{Char}}
|
|
100 |
\def\Seq{\textit{Seq}}
|
532
|
101 |
\def\Der{\textit{Der}}
|
|
102 |
\def\Ders{\textit{Ders}}
|
|
103 |
\def\nullable{\mathit{nullable}}
|
|
104 |
\def\Z{\mathit{Z}}
|
|
105 |
% NOTE(review): \S is also LaTeX's built-in section-sign command; this \def replaces
% it with an italic "S" for the rest of the document -- confirm that is intended.
\def\S{\mathit{S}}
|
|
106 |
\def\rup{r^\uparrow}
|
|
107 |
%\def\bderssimp{\mathit{bders}\_\mathit{simp}}
|
|
108 |
\def\distinctWith{\textit{distinctWith}}
|
|
109 |
\def\lf{\textit{lf}}
|
|
110 |
\def\PD{\textit{PD}}
|
|
111 |
\def\suffix{\textit{Suffix}}
|
543
|
112 |
\def\distinctBy{\textit{distinctBy}}
|
558
|
113 |
\def\starupdate{\textit{starUpdate}}
|
|
114 |
\def\starupdates{\textit{starUpdates}}
|
|
115 |
|
532
|
116 |
|
|
117 |
\def\size{\mathit{size}}
|
|
118 |
\def\rexp{\mathbf{rexp}}
|
|
119 |
\def\simp{\mathit{simp}}
|
|
120 |
\def\simpALTs{\mathit{simp}\_\mathit{ALTs}}
|
|
121 |
\def\map{\mathit{map}}
|
|
122 |
\def\distinct{\mathit{distinct}}
|
|
123 |
\def\blexersimp{\mathit{blexer}\_\mathit{simp}}
|
590
|
124 |
\def\blexerStrong{\textit{blexerStrong}}
|
|
125 |
\def\bsimpStrong{\textit{bsimpStrong}}
|
591
|
126 |
\def\bdersStrongs{\textit{bdersStrong}}
|
590
|
127 |
% \bdersStrong{a}{b} typesets  a \_{bsimpStrongs} b  (backslash operator with a
% "bsimpStrongs" subscript). The subscript is wrapped in \mathit so the multi-letter
% name is set as one word rather than as a product of single-letter math variables.
\newcommand{\bdersStrong}[2]{#1 \backslash_{\mathit{bsimpStrongs}} #2}
|
|
128 |
|
532
|
129 |
% NOTE: second definition of \map in this file; it replaces the earlier
% \mathit{map} version, so \map is typeset with \textit from here on.
\def\map{\textit{map}}
|
|
130 |
\def\rrexp{\textit{rrexp}}
|
554
|
131 |
\newcommand\rnullable[1]{\textit{rnullable} \; #1 }
|
532
|
132 |
\newcommand\rsize[1]{\llbracket #1 \rrbracket_r}
|
|
133 |
\newcommand\asize[1]{\llbracket #1 \rrbracket}
|
543
|
134 |
\newcommand\rerase[1]{ (#1)_{\downarrow_r}}
|
|
135 |
|
538
|
136 |
\newcommand\ChristianComment[1]{\textcolor{blue}{#1}\\}
|
532
|
137 |
|
543
|
138 |
|
|
139 |
\def\rflts{\textit{rflts}}
|
|
140 |
\def\rrewrite{\textit{rrewrite}}
|
|
141 |
% Name "bsimp" with subscript ALTS; the subscript is set with \mathit so that the
% four letters read as one identifier instead of a product of variables.
\def\bsimpalts{\textit{bsimp}_{\mathit{ALTS}}}
|
596
|
142 |
% Name "bsimp" with subscript ASEQ; the subscript is set with \mathit so that the
% four letters read as one identifier instead of a product of variables.
\def\bsimpaseq{\textit{bsimp}_{\mathit{ASEQ}}}
|
|
143 |
% Name "rsimp" with subscript ALTs; the subscript is set with \mathit so that the
% letters read as one identifier instead of a product of variables.
% NOTE(review): the macro name "rsimlalts" looks like a typo for "rsimpalts", which is
% already defined in this file with subscript ALTS -- confirm usage before renaming.
\def\rsimlalts{\textit{rsimp}_{\mathit{ALTs}}}
|
|
144 |
% Name "rsimp" with subscript SEQ; the subscript is set with \mathit so that the
% three letters read as one identifier instead of a product of variables.
\def\rsimpseq{\textit{rsimp}_{\mathit{SEQ}}}
|
543
|
145 |
|
532
|
146 |
\def\erase{\textit{erase}}
|
|
147 |
\def\STAR{\textit{STAR}}
|
|
148 |
\def\flts{\textit{flts}}
|
|
149 |
|
|
150 |
|
579
|
151 |
\def\zeroable{\textit{zeroable}}
|
|
152 |
\def\nub{\textit{nub}}
|
|
153 |
\def\filter{\textit{filter}}
|
601
|
154 |
%\def\not{\textit{not}}
|
579
|
155 |
|
|
156 |
|
|
157 |
|
532
|
158 |
\def\RZERO{\mathbf{0}_r }
|
|
159 |
\def\RONE{\mathbf{1}_r}
|
|
160 |
\newcommand\RCHAR[1]{\mathbf{#1}_r}
|
|
161 |
\newcommand\RSEQ[2]{#1 \cdot #2}
|
558
|
162 |
\newcommand\RALTS[1]{\sum #1}
|
532
|
163 |
\newcommand\RSTAR[1]{#1^*}
|
558
|
164 |
\newcommand\vsuf[2]{\textit{Suffix} \;#1\;#2}
|
532
|
165 |
|
538
|
166 |
|
|
167 |
|
590
|
168 |
|
|
169 |
\lstdefinestyle{myScalastyle}{
|
|
170 |
frame=tb, % NOTE: has no lasting effect -- frame=single is set again further down in this style
|
|
171 |
language=scala,
|
|
172 |
aboveskip=3mm,
|
|
173 |
belowskip=3mm,
|
|
174 |
showstringspaces=false,
|
|
175 |
columns=flexible,
|
|
176 |
basicstyle={\small\ttfamily},
|
|
177 |
numbers=none,
|
|
178 |
numberstyle=\tiny\color{gray},
|
|
179 |
keywordstyle=\color{blue},
|
|
180 |
commentstyle=\color{dkgreen},
|
|
181 |
stringstyle=\color{mauve},
|
|
182 |
frame=single,
|
|
183 |
breaklines=true,
|
|
184 |
breakatwhitespace=true,
|
|
185 |
tabsize=3,
|
538
|
186 |
}
|
|
187 |
|
590
|
188 |
|
532
|
189 |
%----------------------------------------------------------------------------------------
|
|
190 |
%This part is about regular expressions, Brzozowski derivatives,
|
|
191 |
%and a bit-coded lexing algorithm with proven correctness and time bounds.
|
|
192 |
|
|
193 |
%TODO: look up snort rules to use here--give readers idea of what regexes look like
|
|
194 |
|
601
|
195 |
|
|
196 |
|
|
197 |
|
|
198 |
|
|
199 |
|
|
200 |
Regular expressions are widely used in computer science:
|
|
201 |
be it in text-editors \parencite{atomEditor} with syntax highlighting and auto-completion;
|
|
202 |
command-line tools like $\mathit{grep}$ that facilitate easy
|
|
203 |
text-processing; network intrusion
|
|
204 |
detection systems that reject suspicious traffic; or compiler
|
|
205 |
front ends---the majority of the solutions to these tasks
|
|
206 |
involve lexing with regular
|
|
207 |
expressions.
|
|
208 |
Given their usefulness and ubiquity, one would imagine that
|
|
209 |
modern regular expression matching implementations
|
|
210 |
are mature and fully studied.
|
602
|
211 |
Indeed, in a popular programming language's regex engine,
|
|
212 |
supplying it with regular expressions and strings,
|
|
213 |
in most cases one can
|
|
214 |
get the matching information in a very short time.
|
|
215 |
Those matchers can be blindingly fast---some
|
|
216 |
network intrusion detection systems
|
601
|
217 |
use regex engines that are able to process
|
|
218 |
megabytes or even gigabytes of data per second \parencite{Turo_ov__2020}.
|
602
|
219 |
However, those matchers can exhibit a surprising security vulnerability
|
|
220 |
under a certain class of inputs.
|
|
221 |
%However, , this is not the case for $\mathbf{all}$ inputs.
|
601
|
222 |
%TODO: get source for SNORT/BRO's regex matching engine/speed
|
|
223 |
|
603
|
224 |
|
|
225 |
Take $(a^*)^*\,b$ and ask whether
|
|
226 |
strings of the form $aa\ldots a$ match this regular
|
|
227 |
expression. Obviously this is not the case---the expected $b$ in the last
|
|
228 |
position is missing. One would expect that modern regular expression
|
|
229 |
matching engines can find this out very quickly. Alas, if one tries
|
|
230 |
this example in JavaScript, Python or Java 8, even with strings of a small
|
|
231 |
length, say around 30 $a$'s,
|
|
232 |
the decision takes an absurdly long time to finish (see Figure~\ref{fig:aStarStarb}).
|
|
233 |
This is clearly exponential behaviour, and
|
|
234 |
is triggered by some relatively simple regex patterns.
|
|
235 |
Java 9 and newer
|
|
236 |
versions improve this behaviour, but are still slow compared
|
|
237 |
with the approach we are going to use.
|
|
238 |
|
|
239 |
|
|
240 |
|
|
241 |
|
|
242 |
This superlinear blowup in regular expression engines
|
|
243 |
has repeatedly caused so much grief in real life that it
|
|
244 |
has been given a name---``catastrophic backtracking''.
|
|
245 |
For example, on 20 July 2016 one evil
|
|
246 |
regular expression brought the webpage
|
|
247 |
\href{http://stackexchange.com}{Stack Exchange} to its
|
|
248 |
knees.\footnote{\url{https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016} (Last accessed in 2019)}
|
|
249 |
In this instance, a regular expression intended to just trim white
|
|
250 |
spaces from the beginning and the end of a line actually consumed
|
|
251 |
massive amounts of CPU resources---causing web servers to grind to a
|
|
252 |
halt. In this example, the time needed to process
|
|
253 |
the string was $O(n^2)$ with respect to the string length. This
|
|
254 |
quadratic overhead was enough for the homepage of Stack Exchange to
|
|
255 |
respond so slowly that the load balancer assumed a $\mathit{DoS}$
|
|
256 |
attack and therefore stopped the servers from responding to any
|
|
257 |
requests. This made the whole site become unavailable.
|
|
258 |
|
601
|
259 |
\begin{figure}[p]
|
532
|
260 |
\begin{tabular}{@{}c@{\hspace{0mm}}c@{\hspace{0mm}}c@{}}
|
|
261 |
\begin{tikzpicture}
|
|
262 |
\begin{axis}[
|
|
263 |
xlabel={$n$},
|
|
264 |
x label style={at={(1.05,-0.05)}},
|
|
265 |
ylabel={time in secs},
|
|
266 |
enlargelimits=false,
|
|
267 |
xtick={0,5,...,30},
|
|
268 |
xmax=33,
|
|
269 |
ymax=35,
|
|
270 |
ytick={0,5,...,30},
|
|
271 |
scaled ticks=false,
|
|
272 |
axis lines=left,
|
|
273 |
width=5cm,
|
|
274 |
height=4cm,
|
|
275 |
legend entries={JavaScript},
|
|
276 |
legend pos=north west,
|
|
277 |
legend cell align=left]
|
|
278 |
\addplot[red,mark=*, mark options={fill=white}] table {re-js.data};
|
|
279 |
\end{axis}
|
|
280 |
\end{tikzpicture}
|
|
281 |
&
|
|
282 |
\begin{tikzpicture}
|
|
283 |
\begin{axis}[
|
|
284 |
xlabel={$n$},
|
|
285 |
x label style={at={(1.05,-0.05)}},
|
|
286 |
%ylabel={time in secs},
|
|
287 |
enlargelimits=false,
|
|
288 |
xtick={0,5,...,30},
|
|
289 |
xmax=33,
|
|
290 |
ymax=35,
|
|
291 |
ytick={0,5,...,30},
|
|
292 |
scaled ticks=false,
|
|
293 |
axis lines=left,
|
|
294 |
width=5cm,
|
|
295 |
height=4cm,
|
|
296 |
legend entries={Python},
|
|
297 |
legend pos=north west,
|
|
298 |
legend cell align=left]
|
|
299 |
\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
|
|
300 |
\end{axis}
|
|
301 |
\end{tikzpicture}
|
|
302 |
&
|
|
303 |
\begin{tikzpicture}
|
|
304 |
\begin{axis}[
|
|
305 |
xlabel={$n$},
|
|
306 |
x label style={at={(1.05,-0.05)}},
|
|
307 |
%ylabel={time in secs},
|
|
308 |
enlargelimits=false,
|
|
309 |
xtick={0,5,...,30},
|
|
310 |
xmax=33,
|
|
311 |
ymax=35,
|
|
312 |
ytick={0,5,...,30},
|
|
313 |
scaled ticks=false,
|
|
314 |
axis lines=left,
|
|
315 |
width=5cm,
|
|
316 |
height=4cm,
|
|
317 |
legend entries={Java 8},
|
|
318 |
legend pos=north west,
|
|
319 |
legend cell align=left]
|
|
320 |
\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
|
|
321 |
\end{axis}
|
|
322 |
\end{tikzpicture}\\
|
601
|
323 |
\begin{tikzpicture}
|
|
324 |
\begin{axis}[
|
|
325 |
xlabel={$n$},
|
|
326 |
x label style={at={(1.05,-0.05)}},
|
|
327 |
ylabel={time in secs},
|
|
328 |
enlargelimits=false,
|
|
329 |
xtick={0,5,...,30},
|
|
330 |
xmax=33,
|
|
331 |
ymax=35,
|
|
332 |
ytick={0,5,...,30},
|
|
333 |
scaled ticks=false,
|
|
334 |
axis lines=left,
|
|
335 |
width=5cm,
|
|
336 |
height=4cm,
|
|
337 |
legend entries={Dart},
|
|
338 |
legend pos=north west,
|
|
339 |
legend cell align=left]
|
|
340 |
\addplot[green,mark=*, mark options={fill=white}] table {re-dart.data};
|
|
341 |
\end{axis}
|
|
342 |
\end{tikzpicture}
|
|
343 |
&
|
|
344 |
\begin{tikzpicture}
|
|
345 |
\begin{axis}[
|
|
346 |
xlabel={$n$},
|
|
347 |
x label style={at={(1.05,-0.05)}},
|
|
348 |
%ylabel={time in secs},
|
|
349 |
enlargelimits=false,
|
|
350 |
xtick={0,5,...,30},
|
|
351 |
xmax=33,
|
|
352 |
ymax=35,
|
|
353 |
ytick={0,5,...,30},
|
|
354 |
scaled ticks=false,
|
|
355 |
axis lines=left,
|
|
356 |
width=5cm,
|
|
357 |
height=4cm,
|
|
358 |
legend entries={Swift},
|
|
359 |
legend pos=north west,
|
|
360 |
legend cell align=left]
|
|
361 |
\addplot[purple,mark=*, mark options={fill=white}] table {re-swift.data};
|
|
362 |
\end{axis}
|
|
363 |
\end{tikzpicture}
|
|
364 |
& \\
|
|
365 |
\multicolumn{3}{c}{Graphs}
|
532
|
366 |
\end{tabular}
|
601
|
367 |
\caption{Graphs showing runtime for matching $(a^*)^*\,b$ with strings
|
|
368 |
of the form $\protect\underbrace{aa\ldots a}_{n}$ in various existing regular expression libraries.
|
|
369 |
The reason for their superlinear behaviour is that they do a depth-first-search.
|
|
370 |
If the string does not match, the engine starts to explore all possibilities.
|
|
371 |
}\label{fig:aStarStarb}
|
|
372 |
\end{figure}\afterpage{\clearpage}
|
538
|
373 |
|
532
|
374 |
A more recent example is a global outage of all Cloudflare servers on 2 July
|
|
375 |
2019. A poorly written regular expression exhibited exponential
|
|
376 |
behaviour and exhausted CPUs that serve HTTP traffic. Although the outage
|
|
377 |
had several causes, at the heart was a regular expression that
|
|
378 |
was used to monitor network
|
538
|
379 |
traffic.\footnote{\url{https://blog.cloudflare.com/details-of-the-cloudflare-outage-on-july-2-2019/} (Last accessed in 2022)}
|
532
|
380 |
These problems with regular expressions
|
|
381 |
are not isolated events that happen
|
|
382 |
very occasionally, but actually widespread.
|
|
383 |
They occur so often that they get a
|
|
384 |
name---Regular-Expression-Denial-Of-Service (ReDoS)
|
|
385 |
attack.
|
538
|
386 |
\citeauthor{Davis18} detected more
|
532
|
387 |
than 1000 super-linear (SL) regular expressions
|
|
388 |
in Node.js, Python core libraries, and npm and pypi.
|
|
389 |
They therefore concluded that evil regular expressions
|
538
|
390 |
are ``more than a parlour trick'', but a problem that
|
532
|
391 |
requires
|
|
392 |
more research attention.
|
|
393 |
|
603
|
394 |
This work aims to address this issue
|
|
395 |
with the help of formal proofs.
|
|
396 |
We offer a lexing algorithm based
|
|
397 |
on Brzozowski derivatives with certified correctness (in
|
|
398 |
Isabelle/HOL)
|
|
399 |
and a finiteness property.
|
604
|
400 |
Such properties guarantee the absence of
|
603
|
401 |
catastrophic backtracking in most cases.
|
604
|
402 |
We will give more details in the next sections
|
|
403 |
on (i) why the slow cases in Figure~\ref{fig:aStarStarb}
|
|
404 |
can occur
|
|
405 |
and (ii) why we choose our
|
|
406 |
approach (Brzozowski derivatives and formal proofs).
|
602
|
407 |
|
603
|
408 |
|
605
|
409 |
\section{Regex, and the Problems with Regex Matchers}
|
601
|
410 |
Regular expressions and regular expression matchers
|
|
411 |
have of course been studied for many, many years.
|
605
|
412 |
Theoretical results in automata theory say
|
604
|
413 |
that basic regular expression matching should be linear
|
605
|
414 |
w.r.t.\ the input.
|
|
415 |
This assumes that the regular expression
|
|
416 |
$r$ was pre-processed and turned into a
|
|
417 |
deterministic finite automaton (DFA) before matching,
|
|
418 |
a step that could itself be exponential~\cite{Sakarovitch2009}.
|
604
|
419 |
By basic we mean textbook definitions such as the one
|
|
420 |
below, involving only characters, alternatives,
|
|
421 |
sequences, and Kleene stars:
|
|
422 |
\[
|
|
423 |
r ::= \ZERO \mid \ONE \mid c \mid r_1 + r_2 \mid r_1 \cdot r_2 \mid r^*
|
|
424 |
\]
|
|
425 |
Modern regular expression matchers used by programmers,
|
|
426 |
however,
|
|
427 |
support richer constructs such as bounded repetitions
|
|
428 |
and back-references.
|
605
|
429 |
To differentiate, people use the word \emph{regex} to refer
|
|
430 |
to those expressions with richer constructs while reserving the
|
|
431 |
term \emph{regular expression}
|
|
432 |
for the more traditional meaning in formal language theory.
|
|
433 |
We follow this convention
|
|
434 |
in this thesis.
|
|
435 |
In the future, we aim to support all the popular features of regexes,
|
604
|
436 |
but for this work we mainly look at regular expressions.
|
|
437 |
|
605
|
438 |
|
|
439 |
|
|
440 |
%Most modern regex libraries
|
|
441 |
%the so-called PCRE standard (Perl Compatible Regular Expressions)
|
|
442 |
%has the back-references
|
604
|
443 |
Regexes come with a lot of constructs
|
605
|
444 |
beyond the basic ones
|
|
445 |
that make it more convenient for
|
604
|
446 |
programmers to write regular expressions.
|
605
|
447 |
Depending on the types of these constructs
|
|
448 |
the task of matching and lexing with them
|
|
449 |
will increase in complexity to different degrees.
|
604
|
450 |
Some of those constructs are syntactic sugars that are
|
|
451 |
simply shorthand notations
|
605
|
452 |
that save the programmers a few keystrokes.
|
|
453 |
These will not cause trouble for regex libraries.
|
|
454 |
|
|
455 |
\noindent
|
|
456 |
For example, the
|
604
|
457 |
non-binary alternative involving three or more choices:
|
|
458 |
\[
|
605
|
459 |
(a | b | c) \stackrel{\text{means}}{=} ((a + b)+ c)
|
604
|
460 |
\]
|
605
|
461 |
the range operator $-$ used to express the alternative
|
|
462 |
of all characters between its operands in a concise way:
|
604
|
463 |
\[
|
605
|
464 |
[0-9]\stackrel{\text{means}}{=} (0 | 1 | \ldots | 9 ) \; \text{(all number digits)}
|
604
|
465 |
\]
|
605
|
466 |
and the
|
|
467 |
wildcard character $.$ used to refer to any single character:
|
|
468 |
\[
|
|
469 |
. \stackrel{\text{means}}{=} [0-9a-zA-Z+-()*\&\ldots]
|
|
470 |
\]
|
604
|
471 |
|
605
|
472 |
\noindent
|
|
473 |
\subsection{Bounded Repetitions}
|
|
474 |
Some of those constructs do make the expressions much
|
|
475 |
more compact.
|
|
476 |
For example, the bounded regular expressions
|
|
477 |
(where $n$ and $m$ are constant natural numbers)
|
|
478 |
$r^{\{n\}}$, $r^{\{\ldots m\}}$, $r^{\{n\ldots \}}$ and $r^{\{n\ldots m\}}$,
|
|
479 |
defined as
|
|
480 |
\begin{center}
|
|
481 |
\begin{tabular}{lcl}
|
|
482 |
$L \; r^{\{n\}}$ & $\dn$ & $(L \; r)^n$\\
|
|
483 |
$L \; r^{\{\ldots m\}}$ & $\dn$ & $\bigcup_{0 \leq i \leq m}. (L \; r)^i$\\
|
|
484 |
$L \; r^{\{n\ldots \}}$ & $\dn$ & $\bigcup_{n \leq i}. (L \; r)^i$\\
|
|
485 |
$L \; r^{\{n \ldots m\}}$ & $\dn$ & $\bigcup_{n \leq i \leq m}. (L \; r)^i$
|
|
486 |
\end{tabular}
|
|
487 |
\end{center}
|
|
488 |
are exponentially smaller compared with
|
|
489 |
their unfolded form: for example $r^{\{n\}}$
|
|
490 |
as opposed to
|
|
491 |
\[
|
|
492 |
\underbrace{r\ldots r}_{\text{$n$ copies of $r$}}.
|
|
493 |
\]
|
|
494 |
%Therefore, a naive algorithm that simply unfolds
|
|
495 |
%them into their desugared forms
|
|
496 |
%will suffer from at least an exponential runtime increase.
|
603
|
497 |
|
|
498 |
The problem here is that tools based on the classic notion of
|
|
499 |
automata need to expand $r^{\{n\}}$ into $n$ connected
|
|
500 |
copies of the automaton for $r$. This leads to very inefficient matching
|
|
501 |
algorithms or algorithms that consume large amounts of memory.
|
605
|
502 |
Implementations using $\DFA$s will
|
|
503 |
either become excruciatingly slow
|
|
504 |
(for example Verbatim++~\cite{Verbatimpp}) or get
|
|
505 |
out of memory errors (for example $\mathit{LEX}$ and
|
|
506 |
$\mathit{JFLEX}$\footnote{which are lexer generators
|
|
507 |
in C and Java that generate $\mathit{DFA}$-based
|
|
508 |
lexers. The user provides a set of regular expressions
|
|
509 |
and configurations to them, and then
|
|
510 |
gets an output program encoding a minimized $\mathit{DFA}$
|
|
511 |
that can be compiled and run.
|
|
512 |
When given the above countdown regular expression,
|
|
513 |
a small $n$ (a few dozen) would result in a
|
|
514 |
determinised automaton
|
|
515 |
with millions of states.}) under large counters.
|
603
|
516 |
A classic example is the regular expression $(a+b)^* a (a+b)^{\{n\}}$
|
605
|
517 |
where the minimal DFA requires at least $2^{n+1}$ states.
|
|
518 |
For example, when $n$ is equal to 2,
|
604
|
519 |
an $\mathit{NFA}$ describing it would look like:
|
|
520 |
\begin{center}
|
|
521 |
\begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto]
|
|
522 |
\node[state,initial] (q_0) {$q_0$};
|
|
523 |
\node[state, red] (q_1) [right=of q_0] {$q_1$};
|
|
524 |
\node[state, red] (q_2) [right=of q_1] {$q_2$};
|
|
525 |
\node[state, accepting, red](q_3) [right=of q_2] {$q_3$};
|
|
526 |
\path[->]
|
|
527 |
(q_0) edge node {a} (q_1)
|
|
528 |
edge [loop below] node {a,b} ()
|
|
529 |
(q_1) edge node {a,b} (q_2)
|
|
530 |
(q_2) edge node {a,b} (q_3);
|
|
531 |
\end{tikzpicture}
|
|
532 |
\end{center}
|
605
|
533 |
which requires at least $2^3$ states
|
|
534 |
for its subset construction.\footnote{The
|
|
535 |
red states are ``countdown states'' which count down
|
604
|
536 |
the number of characters needed in addition to the current
|
|
537 |
string to make a successful match.
|
|
538 |
For example, state $q_1$ indicates a match that has
|
|
539 |
gone past the $(a|b)^*$ part of $(a|b)^*a(a|b)^{\{2\}}$,
|
|
540 |
and just consumed the ``delimiter'' $a$ in the middle, and
|
|
541 |
needs to match 2 more iterations of $(a|b)$ to complete.
|
|
542 |
State $q_2$ on the other hand, can be viewed as a state
|
|
543 |
after $q_1$ has consumed 1 character, and just waits
|
|
544 |
for 1 more character to complete.
|
|
545 |
$q_3$ is the last state, requiring 0 more characters, and is accepting.
|
|
546 |
Depending on the suffix of the
|
|
547 |
input string up to the current read location,
|
|
548 |
the states $q_1$, $q_2$ and $q_3$
|
|
549 |
may or may
|
|
550 |
not be active, independently of each other.
|
|
551 |
A $\mathit{DFA}$ for such an $\mathit{NFA}$ would
|
|
552 |
contain at least $2^3$ non-equivalent states that cannot be merged,
|
|
553 |
because the subset construction during determinisation will generate
|
|
554 |
all the elements in the power set $\mathit{Pow}\{q_1, q_2, q_3\}$.
|
|
555 |
Generalizing this to regular expressions with larger
|
|
556 |
bounded repetition numbers, we have that
|
|
557 |
regexes shaped like $r^*ar^{\{n\}}$ when converted to $\mathit{DFA}$s
|
605
|
558 |
would require at least $2^{n+1}$ states, if $r$ itself contains
|
604
|
559 |
more than 1 string.
|
|
560 |
This is to represent all different
|
605
|
561 |
scenarios in which ``countdown'' states are active.}
|
603
|
562 |
|
605
|
563 |
One of the most recent works in the context of lexing
|
|
564 |
%with this issue
|
|
565 |
is the Verbatim lexer by Egolf, Lasser and Fisher~\cite{Verbatim}.
|
|
566 |
This is relevant work, and we will compare it later on
|
|
567 |
with the derivative-based matcher we are going to present.
|
|
568 |
There is also some newer work called
|
|
569 |
Verbatim++~\cite{Verbatimpp}, which does not use derivatives,
|
|
570 |
but deterministic finite automata instead.
|
|
571 |
%An example that gives problem to automaton approaches would be
|
|
572 |
%the regular expression $(a|b)^*a(a|b)^{\{n\}}$.
|
|
573 |
%It requires at least $2^{n+1}$ states to represent
|
|
574 |
%as a DFA.
|
538
|
575 |
|
|
576 |
|
605
|
577 |
Bounded repetitions are very important because they
|
|
578 |
tend to occur a lot in practical use.
|
|
579 |
They occur, for example, in the regex library RegExLib,
|
607
|
580 |
the rules library of Snort \cite{Snort1999}\footnote{
|
|
581 |
Snort is a network intrusion detection (NID) tool
|
|
582 |
for monitoring network traffic.},
|
|
583 |
as well as in XML Schema definitions (XSDs).
|
605
|
584 |
According to Bj\"{o}rklund et al.~\cite{xml2015},
|
|
585 |
more than half of the
|
|
586 |
XSDs they found have bounded regular expressions in them.
|
|
587 |
Often the counters are quite large, the largest up to ten million.
|
|
588 |
An example XSD they gave
|
|
589 |
was:
|
606
|
590 |
%\begin{verbatim}
|
|
591 |
%<sequence minOccurs="0" maxOccurs="65535">
|
|
592 |
% <element name="TimeIncr" type="mpeg7:MediaIncrDurationType"/>
|
|
593 |
% <element name="MotionParams" type="float" minOccurs="2" maxOccurs="12"/>
|
|
594 |
%</sequence>
|
|
595 |
%\end{verbatim}
|
605
|
596 |
This can be seen as the expression
|
|
597 |
$(ab^{\{2\ldots 12\}})^{\{0 \ldots 65535\}}$, where $a$ and $b$ are themselves
|
|
598 |
regular expressions
|
|
599 |
satisfying certain constraints (such as
|
|
600 |
satisfying the floating point number format).
|
538
|
601 |
|
605
|
602 |
It is therefore quite unsatisfying that
|
|
603 |
some regular expressions matching libraries
|
|
604 |
impose ad hoc limits
|
|
605 |
for bounded regular expressions:
|
|
606 |
For example, in the regular expression matching library in the Go
|
|
607 |
language the regular expression $a^{\{1001\}}$ is not permitted, because no counter
|
|
608 |
can be above 1000, and in the built-in Rust regular expression library
|
|
609 |
expressions such as $a^{\{1000\}\{100\}\{5\}}$ give an error message
|
606
|
610 |
for being too big.
|
|
611 |
As Becchi and Crowley~\cite{Becchi08} have pointed out,
|
|
612 |
the reason for these restrictions
|
|
613 |
is that they simulate a non-deterministic finite
|
|
614 |
automaton (NFA) with a breadth-first search.
|
|
615 |
This way the number of active states could
|
|
616 |
be equal to the counter number.
|
|
617 |
When the counters are large,
|
|
618 |
the memory requirement could become
|
|
619 |
infeasible, and the pattern needs to be rejected straight away.
|
|
620 |
\begin{figure}[H]
|
|
621 |
\begin{center}
|
|
622 |
\begin{tikzpicture} [node distance = 2cm, on grid, auto]
|
|
623 |
|
|
624 |
\node (q0) [state, initial] {$0$};
|
|
625 |
\node (q1) [state, right = of q0] {$1$};
|
|
626 |
\node (q2) [state, right = of q1] {$2$};
|
|
627 |
\node (qdots) [right = of q2] {$\ldots$};
|
|
628 |
\node (qn) [state, right = of qdots] {$n$};
|
|
629 |
\node (qn1) [state, right = of qn] {$n+1$};
|
|
630 |
\node (qn2) [state, right = of qn1] {$n+2$};
|
|
631 |
\node (qn3) [state, accepting, right = of qn2] {$n+3$};
|
|
632 |
|
|
633 |
\path [-stealth, thick]
|
|
634 |
(q0) edge [loop above] node {a} ()
|
|
635 |
(q0) edge node {a} (q1)
|
|
636 |
(q1) edge node {.} (q2)
|
|
637 |
(q2) edge node {.} (qdots)
|
|
638 |
(qdots) edge node {.} (qn)
|
|
639 |
(qn) edge node {.} (qn1)
|
|
640 |
(qn1) edge node {b} (qn2)
|
|
641 |
(qn2) edge node {$c$} (qn3);
|
|
642 |
\end{tikzpicture}
|
|
643 |
%\begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto]
|
|
644 |
% \node[state,initial] (q_0) {$0$};
|
|
645 |
% \node[state, ] (q_1) [right=of q_0] {$1$};
|
|
646 |
% \node[state, ] (q_2) [right=of q_1] {$2$};
|
|
647 |
% \node[state,
|
|
648 |
% \node[state, accepting, ](q_3) [right=of q_2] {$3$};
|
|
649 |
% \path[->]
|
|
650 |
% (q_0) edge node {a} (q_1)
|
|
651 |
% edge [loop below] node {a,b} ()
|
|
652 |
% (q_1) edge node {a,b} (q_2)
|
|
653 |
% (q_2) edge node {a,b} (q_3);
|
|
654 |
%\end{tikzpicture}
|
|
655 |
\end{center}
|
|
656 |
\caption{The example given by Becchi and Crowley
|
|
657 |
showing that NFA simulation can consume large
|
|
658 |
amounts of memory: $.^*a.^{\{n\}}bc$ matching
|
|
659 |
strings of the form $aaa\ldots aaaabc$.
|
|
660 |
When traversing in a breadth-first manner,
|
|
661 |
all states from 0 till $n+1$ will become active.}
|
|
662 |
\end{figure}
|
|
663 |
%Languages like $\mathit{Go}$ and $\mathit{Rust}$ use this
|
|
664 |
%type of $\mathit{NFA}$ simulation and guarantees a linear runtime
|
|
665 |
%in terms of input string length.
|
|
666 |
%TODO:try out these lexers
|
|
667 |
These problems can of course be solved in matching algorithms where
|
605
|
668 |
automata go beyond the classic notion and for instance include explicit
|
|
669 |
counters \cite{Turo_ov__2020}.
|
606
|
670 |
These solutions can be quite effective,
|
|
671 |
with the ability to process
|
|
672 |
gigabytes of string input per second
|
|
673 |
even with large counters \cite{Becchi08}.
|
|
674 |
But formally reasoning about these automata can be challenging
|
|
675 |
and unintuitive.
|
|
676 |
Therefore, correctness and runtime claims made about these solutions need to be
|
|
677 |
taken with a grain of salt.
|
605
|
678 |
|
|
679 |
In the work reported in~\cite{CSL2022} and here,
|
|
680 |
we add better support using derivatives
|
|
681 |
for bounded regular expressions $r^{\{n\}}$.
|
|
682 |
The results
|
|
683 |
extend straightforwardly to
|
|
684 |
repetitions with an interval such as
|
|
685 |
$r^{\{n\ldots m\}}$.
|
|
686 |
The merit of Brzozowski derivatives
|
|
687 |
on this problem is that
|
|
688 |
they can be naturally extended to support bounded repetitions.
|
|
689 |
Moreover these extensions are still made up of only
|
|
690 |
inductive datatypes and recursive functions,
|
|
691 |
making them convenient to reason about in theorem provers.
|
|
692 |
%The point here is that Brzozowski derivatives and the algorithms by Sulzmann and Lu can be
|
|
693 |
%straightforwardly extended to deal with bounded regular expressions
|
|
694 |
%and moreover the resulting code still consists of only simple
|
|
695 |
%recursive functions and inductive datatypes.
|
|
696 |
Finally, bounded regular expressions do not destroy our finite
|
|
697 |
boundedness property, which we shall prove later on.
|
538
|
698 |
|
|
699 |
|
606
|
700 |
|
|
701 |
|
|
702 |
|
605
|
703 |
\subsection{Back-References}
|
606
|
704 |
The other way to simulate an $\mathit{NFA}$ for matching is choosing
|
|
705 |
a single transition each time, keeping all the other options in
|
|
706 |
a queue or stack, and backtracking if that choice eventually
|
|
707 |
fails. This method, often called a ``depth-first-search'',
|
|
708 |
is efficient in a lot of cases, but could end up
|
|
709 |
with exponential run time.
|
|
710 |
The backtracking method is employed in regex libraries
|
|
711 |
that support \emph{back-references}, for example
|
|
712 |
in Java and Python.
|
605
|
713 |
%\section{Back-references and The Terminology Regex}
|
538
|
714 |
|
605
|
715 |
%When one constructs an $\NFA$ out of a regular expression
|
|
716 |
%there is often very little to be done in the first phase, one simply
|
|
717 |
%construct the $\NFA$ states based on the structure of the input regular expression.
|
538
|
718 |
|
605
|
719 |
%In the lexing phase, one can simulate the $\mathit{NFA}$ running in two ways:
|
|
720 |
%one by keeping track of all active states after consuming
|
|
721 |
%a character, and update that set of states iteratively.
|
|
722 |
%This can be viewed as a breadth-first-search of the $\mathit{NFA}$
|
|
723 |
%for a path terminating
|
|
724 |
%at an accepting state.
|
606
|
725 |
|
|
726 |
|
|
727 |
|
|
728 |
Given a regular expression like this (the sequence
|
532
|
729 |
operator is omitted for brevity):
|
|
730 |
\begin{center}
|
606
|
731 |
$r_1r_2r_3r_4$
|
532
|
732 |
\end{center}
|
606
|
733 |
one could label sub-expressions of interest
|
532
|
734 |
by parenthesizing them and giving
|
|
735 |
them a number by the order in which their opening parentheses appear.
|
|
736 |
One possible way of parenthesizing and labelling is given below:
|
|
737 |
\begin{center}
|
|
738 |
$\underset{1}{(}r_1\underset{2}{(}r_2\underset{3}{(}r_3)\underset{4}{(}r_4)))$
|
|
739 |
\end{center}
|
606
|
740 |
The sub-expressions
|
|
741 |
$r_1r_2r_3r_4$, $r_2r_3r_4$, $r_3$ and $r_4$ are labelled
|
|
742 |
by 1 to 4, and can be ``referred back'' by their respective numbers.
|
|
743 |
%These sub-expressions are called "capturing groups".
|
|
744 |
To do so, we use the syntax $\backslash i$
|
|
745 |
to denote that we want the sub-string
|
|
746 |
of the input just matched by the $i$-th
|
|
747 |
sub-expression to appear again,
|
|
748 |
exactly the same as it first appeared:
|
532
|
749 |
\begin{center}
|
|
750 |
$\ldots\underset{\text{i-th lparen}}{(}{r_i})\ldots
|
|
751 |
\underset{s_i \text{ which just matched} \;r_i}{\backslash i}$
|
|
752 |
\end{center}
|
606
|
753 |
%The backslash and number $i$ are the
|
|
754 |
%so-called "back-references".
|
|
755 |
%Let $e$ be an expression made of regular expressions
|
|
756 |
%and back-references. $e$ contains the expression $e_i$
|
|
757 |
%as its $i$-th capturing group.
|
|
758 |
%The semantics of back-reference can be recursively
|
|
759 |
%written as:
|
|
760 |
%\begin{center}
|
|
761 |
% \begin{tabular}{c}
|
|
762 |
% $L ( e \cdot \backslash i) = \{s @ s_i \mid s \in L (e)\quad s_i \in L(r_i)$\\
|
|
763 |
% $s_i\; \text{match of ($e$, $s$)'s $i$-th capturing group string}\}$
|
|
764 |
% \end{tabular}
|
|
765 |
%\end{center}
|
|
766 |
A concrete example
|
|
767 |
for back-references would be
|
532
|
768 |
\begin{center}
|
607
|
769 |
$(.^*)\backslash 1$,
|
532
|
770 |
\end{center}
|
606
|
771 |
which would match
|
|
772 |
strings that can be split into two identical halves,
|
607
|
773 |
for example $\mathit{foofoo}$, $\mathit{ww}$, etc.
|
|
774 |
Note that this is different from
|
|
775 |
repeating the sub-expression verbatim like
|
|
776 |
\begin{center}
|
|
777 |
$(.^*)(.^*)$,
|
|
778 |
\end{center}
|
|
779 |
which does not impose any restrictions on what strings the second
|
|
780 |
sub-expression $.^*$
|
|
781 |
might match.
|
|
782 |
Another example of back-references would be
|
|
783 |
\begin{center}
|
|
784 |
$(.)(.)\backslash 2\backslash 1$
|
|
785 |
\end{center}
|
|
786 |
which expresses four-character palindromes
|
|
787 |
like $abba$, $x??x$ etc.
|
|
788 |
|
|
789 |
Back-references are a regex construct
|
|
790 |
that programmers found quite useful.
|
|
791 |
According to Becchi and Crowley~\cite{Becchi08},
|
|
792 |
6\% of Snort rules (up until 2008) include the use of them.
|
|
793 |
The most common use of back-references
|
|
794 |
would be expressing well-formed html files,
|
|
795 |
where back-references would be handy in expressing
|
|
796 |
a pair of opening and closing tags like
|
|
797 |
\begin{center}
|
|
798 |
$\langle html \rangle \ldots \langle / html \rangle$
|
|
799 |
\end{center}
|
|
800 |
A regex describing such a format
|
|
801 |
could be
|
|
802 |
\begin{center}
|
|
803 |
$\langle (.^+) \rangle \ldots \langle / \backslash 1 \rangle$
|
|
804 |
\end{center}
|
|
805 |
Despite being useful, the syntax and expressive power of regexes
|
|
806 |
go beyond the regular language hierarchy
|
|
807 |
with back-references.
|
|
808 |
In fact, they allow the regex construct to express
|
532
|
809 |
languages that cannot be contained in context-free
|
|
810 |
languages either.
|
|
811 |
For example, the back-reference expression $(a^*)b\backslash 1 b \backslash 1$
|
|
812 |
expresses the language $\{a^n b a^n b a^n\mid n \in \mathbb{N}\}$,
|
|
813 |
which cannot be expressed by context-free grammars~\parencite{campeanu2003formal}.
|
|
814 |
Such a language is contained in the context-sensitive hierarchy
|
|
815 |
of formal languages.
|
|
816 |
Solving the back-reference expressions matching problem
|
607
|
817 |
is NP-complete~\parencite{alfred2014algorithms}.
|
|
818 |
A non-backtracking,
|
532
|
819 |
efficient solution is not known to exist.
|
607
|
820 |
Regex libraries supporting back-references such as PCRE therefore have to
|
|
821 |
revert to a depth-first search algorithm that backtracks.
|
|
822 |
The unreasonable part with them is that even in the case of
|
|
823 |
regexes not involving back-references, there is still
|
|
824 |
a (non-negligible) chance they might backtrack super-linearly.
|
538
|
825 |
|
607
|
826 |
\subsection{Summary of the Catastrophic Backtracking Problem}
|
532
|
827 |
Summing these up, we can categorise existing
|
607
|
828 |
practical regex libraries into two kinds:
|
|
829 |
(i) The ones with linear
|
|
830 |
time guarantees like Go and Rust. The cost with them is that
|
|
831 |
they impose restrictions
|
532
|
832 |
on the user input (not allowing back-references,
|
607
|
833 |
bounded repetitions cannot exceed a counter limit etc.).
|
|
834 |
(ii) Those
|
|
835 |
that allow large bounded regular expressions and back-references
|
|
836 |
at the expense of using a backtracking algorithm.
|
|
837 |
They could grind to a halt
|
|
838 |
on some very simple cases, posing a vulnerability of
|
|
839 |
a ReDoS attack.
|
|
840 |
|
|
841 |
|
|
842 |
We would like to have regex engines that can
|
|
843 |
deal with the regular part (e.g.
|
|
844 |
bounded repetitions) of regexes more
|
|
845 |
efficiently.
|
|
846 |
Also we want to make sure that they do it correctly.
|
|
847 |
It turns out that such an aim is not so easy to achieve.
|
532
|
848 |
%TODO: give examples such as RE2 GOLANG 1000 restriction, rust no repetitions
|
|
849 |
% For example, the Rust regex engine claims to be linear,
|
|
850 |
% but does not support lookarounds and back-references.
|
|
851 |
% The GoLang regex library does not support over 1000 repetitions.
|
|
852 |
% Java and Python both support back-references, but shows
|
|
853 |
%catastrophic backtracking behaviours on inputs without back-references(
|
|
854 |
%when the language is still regular).
|
|
855 |
%TODO: test performance of Rust on (((((a*a*)b*)b){20})*)c baabaabababaabaaaaaaaaababaaaababababaaaabaaabaaaaaabaabaabababaababaaaaaaaaababaaaababababaaaaaaaaaaaaac
|
|
856 |
%TODO: verify the fact Rust does not allow 1000+ reps
|
|
857 |
|
|
858 |
|
605
|
859 |
|
|
860 |
|
|
861 |
%The time cost of regex matching algorithms in general
|
|
862 |
%involve two different phases, and different things can go differently wrong on
|
|
863 |
%these phases.
|
|
864 |
%$\DFA$s usually have problems in the first (construction) phase
|
|
865 |
%, whereas $\NFA$s usually run into trouble
|
|
866 |
%on the second phase.
|
|
867 |
|
|
868 |
|
|
869 |
\section{Error-prone POSIX Implementations}
|
607
|
870 |
When there are multiple ways of matching a string
|
|
871 |
with a regular expression, a matcher needs to
|
|
872 |
disambiguate.
|
|
873 |
The standard for which particular match to pick
|
|
874 |
is called the disambiguation strategy.
|
|
875 |
The more intuitive strategy is called POSIX,
|
|
876 |
which always chooses the longest initial match.
|
|
877 |
An alternative strategy would be greedy matching,
|
|
878 |
which always ends a sub-match as early as possible.
|
|
879 |
The POSIX standard is widely adopted in many operating systems.
|
|
880 |
However, many implementations (including the C libraries
|
|
881 |
used by Linux and OS X distributions) contain bugs
|
|
882 |
or do not meet the specification they claim to adhere to.
|
|
883 |
In some cases, they either fail to generate a lexing
|
|
884 |
result when there exists a match,
|
605
|
885 |
or give results that are inconsistent with the $\POSIX$ standard.
|
607
|
886 |
A concrete example would be the regex given by \cite{fowler2003}
|
605
|
887 |
\begin{center}
|
607
|
888 |
$(aba + ab + a)^* \text{ and the string } ababa$
|
605
|
889 |
\end{center}
|
|
890 |
The correct $\POSIX$ match for the above would be
|
|
891 |
with the entire string $ababa$,
|
|
892 |
split into two Kleene star iterations, $[ab] [aba]$ at positions
|
|
893 |
$[0, 2), [2, 5)$
|
|
894 |
respectively.
|
|
895 |
But trying this out in regex101~\parencite{regex101}
|
|
896 |
with different language engines would yield
|
|
897 |
the same two fragmented matches: $[aba]$ at $[0, 3)$
|
|
898 |
and $a$ at $[4, 5)$.
|
607
|
899 |
Fowler \cite{fowler2003} and Kuklewicz \cite{KuklewiczHaskell}
|
|
900 |
commented that most regex libraries are not
|
605
|
901 |
correctly implementing the POSIX (maximum-munch)
|
|
902 |
rule of regular expression matching.
|
|
903 |
As Grathwohl~\parencite{grathwohl2014crash} wrote,
|
|
904 |
\begin{quote}
|
607
|
905 |
``The POSIX strategy is more complicated than the
|
605
|
906 |
greedy because of the dependence on information about
|
607
|
907 |
the length of matched strings in the various subexpressions.''
|
605
|
908 |
\end{quote}
|
|
909 |
%\noindent
|
607
|
910 |
The implementation complexity of POSIX rules also comes from
|
|
911 |
the specification being not very clear.
|
|
912 |
There are many informal summaries of this disambiguation
|
|
913 |
strategy, which are often quite long and delicate.
|
|
914 |
For example Kuklewicz described the POSIX rule as
|
|
915 |
\begin{quote}
|
|
916 |
``
|
|
917 |
\begin{itemize}
|
|
918 |
\item
|
|
919 |
regular expressions (REs) take the leftmost starting match, and the longest match starting there
|
|
920 |
\item
earlier subpatterns have leftmost-longest priority over later subpatterns\\
|
|
921 |
\item
|
|
922 |
higher-level subpatterns have leftmost-longest priority over their component subpatterns\\
|
|
923 |
\item
|
|
924 |
REs have right associative concatenation which can be changed with parenthesis\\
|
|
925 |
\item
|
|
926 |
parenthesized subexpressions return the match from their last usage\\
|
|
927 |
\item
|
|
928 |
text of component subexpressions must be contained in the text of the
|
|
929 |
higher-level subexpressions\\
|
|
930 |
\item
|
|
931 |
if ``p'' and ``q'' can never match the same text then ``p\textbar{}q'' and ``q\textbar{}p'' are equivalent, up to trivial renumbering of captured subexpressions\\
|
|
932 |
\item
|
|
933 |
if ``p'' in ``p*'' is used to capture non-empty text then additional repetitions of ``p'' will not capture an empty string''
|
|
934 |
\end{itemize}
|
|
935 |
\end{quote}
|
|
936 |
The text above
|
|
937 |
is trying to capture something very precise,
|
|
938 |
and is crying out for formalising.
|
605
|
939 |
|
|
940 |
|
607
|
941 |
%\subsection{Different Phases of a Matching/Lexing Algorithm}
|
|
942 |
%
|
|
943 |
%
|
|
944 |
%Most lexing algorithms can be roughly divided into
|
|
945 |
%two phases during its run.
|
|
946 |
%The first phase is the "construction" phase,
|
|
947 |
%in which the algorithm builds some
|
|
948 |
%suitable data structure from the input regex $r$, so that
|
|
949 |
%it can be easily operated on later.
|
|
950 |
%We denote
|
|
951 |
%the time cost for such a phase by $P_1(r)$.
|
|
952 |
%The second phase is the lexing phase, when the input string
|
|
953 |
%$s$ is read and the data structure
|
|
954 |
%representing that regex $r$ is being operated on.
|
|
955 |
%We represent the time
|
|
956 |
%it takes by $P_2(r, s)$.\\
|
|
957 |
%For $\mathit{DFA}$,
|
|
958 |
%we have $P_2(r, s) = O( |s| )$,
|
|
959 |
%because we take at most $|s|$ steps,
|
|
960 |
%and each step takes
|
|
961 |
%at most one transition--
|
|
962 |
%a deterministic-finite-automata
|
|
963 |
%by definition has at most one state active and at most one
|
|
964 |
%transition upon receiving an input symbol.
|
|
965 |
%But unfortunately in the worst case
|
|
966 |
%$P_1(r) = O(exp^{|r|})$. An example will be given later.
|
|
967 |
%For $\mathit{NFA}$s, we have $P_1(r) = O(|r|)$ if we do not unfold
|
|
968 |
%expressions like $r^n$ into
|
|
969 |
%\[
|
|
970 |
% \underbrace{r \cdots r}_{\text{n copies of r}}.
|
|
971 |
%\]
|
|
972 |
%The $P_2(r, s)$ is bounded by $|r|\cdot|s|$, if we do not backtrack.
|
|
973 |
%On the other hand, if backtracking is used, the worst-case time bound bloats
|
|
974 |
%to $|r| * 2^{|s|}$.
|
|
975 |
%%on the input
|
|
976 |
%%And when calculating the time complexity of the matching algorithm,
|
|
977 |
%%we are assuming that each input reading step requires constant time.
|
|
978 |
%%which translates to that the number of
|
|
979 |
%%states active and transitions taken each time is bounded by a
|
|
980 |
%%constant $C$.
|
|
981 |
%%But modern regex libraries in popular language engines
|
|
982 |
%% often want to support much richer constructs than just
|
|
983 |
%% sequences and Kleene stars,
|
|
984 |
%%such as negation, intersection,
|
|
985 |
%%bounded repetitions and back-references.
|
|
986 |
%%And de-sugaring these "extended" regular expressions
|
|
987 |
%%into basic ones might bloat the size exponentially.
|
|
988 |
%%TODO: more reference for exponential size blowup on desugaring.
|
|
989 |
%
|
|
990 |
%\subsection{Why $\mathit{DFA}s$ can be slow in the first phase}
|
|
991 |
%
|
|
992 |
%
|
|
993 |
%The good things about $\mathit{DFA}$s is that once
|
|
994 |
%generated, they are fast and stable, unlike
|
|
995 |
%backtracking algorithms.
|
|
996 |
%However, they do not scale well with bounded repetitions.
|
|
997 |
%
|
|
998 |
%\subsubsection{Problems with Bounded Repetitions}
|
|
999 |
%
|
|
1000 |
%
|
605
|
1001 |
|
607
|
1002 |
To summarise, we need regex libraries that are both fast
|
|
1003 |
and correct.
|
|
1004 |
And that correctness needs to be built on a precise
|
|
1005 |
model of what POSIX disambiguation is.
|
|
1006 |
We propose a solution that addresses both problems
|
|
1007 |
based on the work of Brzozowski, Sulzmann and Lu, and Ausaf and Urban.
|
|
1008 |
The end result is a regular expression lexing algorithm that comes with
|
538
|
1009 |
\begin{itemize}
|
|
1010 |
\item
|
607
|
1011 |
a proven correctness theorem according to POSIX specification
|
|
1012 |
given by Ausaf and Urban \cite{AusafDyckhoffUrban2016},
|
538
|
1013 |
\item
|
607
|
1014 |
a proven property saying that the algorithm's internal data structure will
|
|
1015 |
remain finite,
|
538
|
1016 |
\item
|
607
|
1017 |
and an extension to
|
|
1018 |
the bounded repetitions construct with the correctness and finiteness property
|
|
1019 |
maintained.
|
538
|
1020 |
\end{itemize}
|
532
|
1021 |
|
538
|
1022 |
\section{Our Solution---Formal Specification of POSIX and Brzozowski Derivatives}
|
|
1023 |
We propose Brzozowski derivatives on regular expressions as
|
|
1024 |
a solution to this.
|
|
1025 |
In the last fifteen or so years, Brzozowski's derivatives of regular
|
|
1026 |
expressions have sparked quite a bit of interest in the functional
|
|
1027 |
programming and theorem prover communities.
|
532
|
1028 |
|
538
|
1029 |
\subsection{Motivation}
|
|
1030 |
|
|
1031 |
Derivatives give a simple solution
|
|
1032 |
to the problem of matching a string $s$ with a regular
|
|
1033 |
expression $r$: if the derivative of $r$ w.r.t.\ (in
|
|
1034 |
succession) all the characters of the string matches the empty string,
|
|
1035 |
then $r$ matches $s$ (and {\em vice versa}).
|
532
|
1036 |
|
538
|
1037 |
The beauty of
|
532
|
1038 |
Brzozowski's derivatives \parencite{Brzozowski1964} is that they are neatly
|
|
1039 |
expressible in any functional language, and easily definable and
|
|
1040 |
reasoned about in theorem provers---the definitions just consist of
|
|
1041 |
inductive datatypes and simple recursive functions.
|
|
1042 |
An algorithm based on them by
|
|
1043 |
Sulzmann and Lu \parencite{Sulzmann2014} allows easy extension
|
|
1044 |
to include extended regular expressions and
|
|
1045 |
simplification of internal data structures
|
|
1046 |
eliminating the exponential behaviours.
|
|
1047 |
|
|
1048 |
However, two difficulties with derivative-based matchers exist:
|
538
|
1049 |
\subsubsection{Problems with Current Brzozowski Matchers}
|
532
|
1050 |
First, Brzozowski's original matcher only generates a yes/no answer
|
|
1051 |
for whether a regular expression matches a string or not. This is too
|
|
1052 |
little information in the context of lexing where separate tokens must
|
|
1053 |
be identified and also classified (for example as keywords
|
|
1054 |
or identifiers). Sulzmann and Lu~\cite{Sulzmann2014} overcome this
|
|
1055 |
difficulty by cleverly extending Brzozowski's matching
|
|
1056 |
algorithm. Their extended version generates additional information on
|
|
1057 |
\emph{how} a regular expression matches a string following the POSIX
|
|
1058 |
rules for regular expression matching. They achieve this by adding a
|
|
1059 |
second ``phase'' to Brzozowski's algorithm involving an injection
|
538
|
1060 |
function. In our own earlier work, we provided the formal
|
532
|
1061 |
specification of what POSIX matching means and proved in Isabelle/HOL
|
|
1062 |
the correctness
|
|
1063 |
of Sulzmann and Lu's extended algorithm accordingly
|
|
1064 |
\cite{AusafDyckhoffUrban2016}.
|
|
1065 |
|
|
1066 |
The second difficulty is that Brzozowski's derivatives can
|
|
1067 |
grow to arbitrarily big sizes. For example if we start with the
|
|
1068 |
regular expression $(a+aa)^*$ and take
|
|
1069 |
successive derivatives according to the character $a$, we end up with
|
|
1070 |
a sequence of ever-growing derivatives like
|
|
1071 |
|
|
1072 |
\def\ll{\stackrel{\_\backslash{} a}{\longrightarrow}}
|
|
1073 |
\begin{center}
|
|
1074 |
\begin{tabular}{rll}
|
|
1075 |
$(a + aa)^*$ & $\ll$ & $(\ONE + \ONE{}a) \cdot (a + aa)^*$\\
|
|
1076 |
& $\ll$ & $(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* \;+\; (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
|
|
1077 |
& $\ll$ & $(\ZERO + \ZERO{}a + \ZERO) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^* \;+\; $\\
|
|
1078 |
& & $\qquad(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
|
|
1079 |
& $\ll$ & \ldots \hspace{15mm}(regular expressions of sizes 98, 169, 283, 468, 767, \ldots)
|
|
1080 |
\end{tabular}
|
|
1081 |
\end{center}
|
|
1082 |
|
|
1083 |
\noindent where after around 35 steps we run out of memory on a
|
|
1084 |
typical computer (we shall define shortly the precise details of our
|
|
1085 |
regular expressions and the derivative operation). Clearly, the
|
|
1086 |
notation involving $\ZERO$s and $\ONE$s already suggests
|
|
1087 |
simplification rules that can be applied to regular
|
|
1088 |
expressions, for example $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r
|
|
1089 |
\Rightarrow r$, $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow
|
|
1090 |
r$. While such simple-minded simplifications have been proved in our
|
|
1091 |
earlier work to preserve the correctness of Sulzmann and Lu's
|
|
1092 |
algorithm \cite{AusafDyckhoffUrban2016}, they unfortunately do
|
|
1093 |
\emph{not} help with limiting the growth of the derivatives shown
|
|
1094 |
above: the growth is slowed, but the derivatives can still grow rather
|
|
1095 |
quickly beyond any finite bound.
|
|
1096 |
|
|
1097 |
|
|
1098 |
Sulzmann and Lu overcome this ``growth problem'' in a second algorithm
|
538
|
1099 |
\cite{Sulzmann2014} where they introduce bit-coded
|
532
|
1100 |
regular expressions. In this version, POSIX values are
|
538
|
1101 |
represented as bit sequences and such sequences are incrementally generated
|
532
|
1102 |
when derivatives are calculated. The compact representation
|
538
|
1103 |
of bit sequences and regular expressions allows them to define a more
|
532
|
1104 |
``aggressive'' simplification method that keeps the size of the
|
|
1105 |
derivatives finite no matter what the length of the string is.
|
|
1106 |
They make some informal claims about the correctness and linear behaviour
|
|
1107 |
of this version, but do not provide any supporting proof arguments, not
|
538
|
1108 |
even ``pencil-and-paper'' arguments. They write about their bit-coded
|
532
|
1109 |
\emph{incremental parsing method} (that is the algorithm to be formalised
|
538
|
1110 |
in this dissertation)
|
532
|
1111 |
|
|
1112 |
|
|
1113 |
|
|
1114 |
\begin{quote}\it
|
|
1115 |
``Correctness Claim: We further claim that the incremental parsing
|
|
1116 |
method [..] in combination with the simplification steps [..]
|
|
1117 |
yields POSIX parse trees. We have tested this claim
|
|
1118 |
extensively [..] but yet
|
|
1119 |
have to work out all proof details.'' \cite[Page 14]{Sulzmann2014}
|
|
1120 |
\end{quote}
|
|
1121 |
|
|
1122 |
Ausaf and Urban were able to back this correctness claim with
|
|
1123 |
a formal proof.
|
|
1124 |
|
|
1125 |
But as they stated,
|
|
1126 |
\begin{quote}\it
|
|
1127 |
The next step would be to implement a more aggressive simplification procedure on annotated regular expressions and then prove the corresponding algorithm generates the same values as blexer. Alas due to time constraints we are unable to do so here.
|
|
1128 |
\end{quote}
|
|
1129 |
|
|
1130 |
This thesis implements the aggressive simplifications envisioned
|
|
1131 |
by Ausaf and Urban,
|
|
1132 |
and gives a formal proof of the correctness with those simplifications.
|
|
1133 |
|
|
1134 |
|
|
1135 |
%----------------------------------------------------------------------------------------
|
|
1136 |
\section{Contribution}
|
|
1137 |
|
|
1138 |
|
|
1139 |
|
|
1140 |
This work addresses the vulnerability of super-linear and
|
|
1141 |
buggy regex implementations by the combination
|
|
1142 |
of Brzozowski's derivatives and interactive theorem proving.
|
|
1143 |
We give an
|
|
1144 |
improved version of Sulzmann and Lu's bit-coded algorithm using
|
|
1145 |
derivatives, which comes with a formal guarantee in terms of correctness and
|
|
1146 |
running time as an Isabelle/HOL proof.
|
538
|
1147 |
Further improvements to the algorithm with an even stronger version of
|
|
1148 |
simplification are made.
|
|
1149 |
We have not yet come up with one, but believe that it leads to a
|
|
1150 |
formalised proof with a time bound linear in the input size and
|
532
|
1151 |
cubic in the regular expression size using a technique by
|
538
|
1152 |
Antimirov~\cite{Antimirov}.
|
532
|
1153 |
|
|
1154 |
|
538
|
1155 |
The main contribution of this thesis is
|
|
1156 |
\begin{itemize}
|
|
1157 |
\item
|
|
1158 |
a proven correct lexing algorithm
|
|
1159 |
\item
|
|
1160 |
with formalized finite bounds on internal data structures' sizes.
|
|
1161 |
\end{itemize}
|
|
1162 |
|
532
|
1163 |
To the best of our knowledge, no lexing libraries using Brzozowski derivatives
|
|
1164 |
have a provable time guarantee,
|
|
1165 |
and claims about running time are usually speculative and backed by thin empirical
|
|
1166 |
evidence.
|
|
1167 |
%TODO: give references
|
|
1168 |
For example, Sulzmann and Lu had proposed an algorithm in which they
|
|
1169 |
claim a linear running time.
|
|
1170 |
But that was falsified by our experiments and the running time
|
|
1171 |
is actually $\Omega(2^n)$ in the worst case.
|
|
1172 |
A similar claim about a theoretical runtime of $O(n^2)$ is made for the Verbatim
|
|
1173 |
%TODO: give references
|
|
1174 |
lexer, which calculates POSIX matches and is based on derivatives.
|
|
1175 |
They formalized the correctness of the lexer, but not the complexity.
|
|
1176 |
In the performance evaluation section, they simply analyzed the run time
|
|
1177 |
of matching $a$ with the string $\underbrace{a \ldots a}_{\text{$n$ $a$'s}}$
|
|
1178 |
and concluded that the algorithm is quadratic in terms of input length.
|
|
1179 |
When we tried out their extracted OCaml code with our example $(a+aa)^*$,
|
|
1180 |
the time it took to lex only 40 $a$'s was 5 minutes.
|
|
1181 |
|
|
1182 |
|
|
1183 |
|
|
1184 |
\subsection{Related Work}
|
|
1185 |
We are aware
|
|
1186 |
of a mechanised correctness proof of Brzozowski's derivative-based matcher in HOL4 by
|
|
1187 |
Owens and Slind~\parencite{Owens2008}. Another one in Isabelle/HOL is part
|
|
1188 |
of the work by Krauss and Nipkow \parencite{Krauss2011}. And another one
|
|
1189 |
in Coq is given by Coquand and Siles \parencite{Coquand2012}.
|
|
1190 |
Also Ribeiro and Du Bois give one in Agda \parencite{RibeiroAgda2017}.
|
|
1191 |
|
538
|
1192 |
|
|
1193 |
When a regular expression does not behave as intended,
|
|
1194 |
people usually try to rewrite the regex to some equivalent form
|
|
1195 |
or they try to avoid the possibly problematic patterns completely,
|
|
1196 |
for which many false positives exist~\parencite{Davis18}.
|
|
1197 |
Animated tools to ``debug'' regular expressions such as
|
|
1198 |
\parencite{regexploit2021,regex101} are also popular.
|
|
1199 |
We are also aware of static analysis work on regular expressions that
|
|
1200 |
aims to detect potentially exponential regex patterns. Rathnayake and Thielecke
|
|
1201 |
\parencite{Rathnayake2014StaticAF} proposed an algorithm
|
|
1202 |
that detects regular expressions triggering exponential
|
|
1203 |
behaviours on backtracking matchers.
|
|
1204 |
Weideman \parencite{Weideman2017Static} came up with
|
|
1205 |
non-linear polynomial worst-time estimates
|
|
1206 |
for regexes, attack strings that exploit the worst-time
|
|
1207 |
scenario, and ``attack automata'' that generate
|
|
1208 |
attack strings.
|
|
1209 |
|
|
1210 |
|
532
|
1211 |
|
|
1212 |
|
|
1213 |
\section{Structure of the thesis}
|
538
|
1214 |
In Chapter~\ref{Inj} we will introduce the concepts
|
532
|
1215 |
and notations we
|
|
1216 |
use for describing the lexing algorithm by Sulzmann and Lu,
|
538
|
1217 |
and then give the lexing algorithm.
|
|
1218 |
We will give its variant in Chapter~\ref{Bitcoded1}.
|
|
1219 |
Then we illustrate in Chapter~\ref{Bitcoded2}
|
532
|
1220 |
how the algorithm without bitcodes falls short for such aggressive
|
|
1221 |
simplifications and therefore introduce our version of the
|
538
|
1222 |
bit-coded algorithm and
|
532
|
1223 |
its correctness proof.
|
538
|
1224 |
In Chapter~\ref{Finite} we give the second guarantee
|
532
|
1225 |
of our bitcoded algorithm, that is a finite bound on the size of any
|
|
1226 |
regex's derivatives.
|
538
|
1227 |
In Chapter~\ref{Cubic} we discuss stronger simplifications to improve the finite bound
|
|
1228 |
in Chapter~\ref{Finite} to a polynomial one, and demonstrate how one can extend the
|
532
|
1229 |
algorithm to include constructs such as bounded repetitions and negations.
|
|
1230 |
|
|
1231 |
|
|
1232 |
|
|
1233 |
|
|
1234 |
|
|
1235 |
%----------------------------------------------------------------------------------------
|
|
1236 |
|
|
1237 |
|
|
1238 |
%----------------------------------------------------------------------------------------
|
|
1239 |
|
|
1240 |
%----------------------------------------------------------------------------------------
|
|
1241 |
|
|
1242 |
%----------------------------------------------------------------------------------------
|
|
1243 |
|
|
1244 |
|