\documentclass[dvipsnames,14pt,t,xelatex]{beamer}
\usepackage{./slides}
\usepackage{./graph}
\usepackage{./langs}
\usepackage{./data}
\usepackage{changepage}
\hfuzz=220pt
\lstset{language=Scala,
style=mystyle,
numbersep=0pt,
numbers=none,
xleftmargin=0mm}
\newcommand{\bl}[1]{\textcolor{blue}{#1}}
% beamer stuff
\renewcommand{\slidecaption}{CFL 01, King's College London}
\begin{filecontents}{re-python2.data}
1 0.033
5 0.036
10 0.034
15 0.036
18 0.059
19 0.084
20 0.141
21 0.248
22 0.485
23 0.878
24 1.71
25 3.40
26 7.08
27 14.12
28 26.69
\end{filecontents}
\begin{filecontents}{re-java.data}
5 0.00298
10 0.00418
15 0.00996
16 0.01710
17 0.03492
18 0.03303
19 0.05084
20 0.10177
21 0.19960
22 0.41159
23 0.82234
24 1.70251
25 3.36112
26 6.63998
27 13.35120
28 29.81185
\end{filecontents}
\begin{filecontents}{re-java9.data}
1000 0.01410
2000 0.04882
3000 0.10609
4000 0.17456
5000 0.27530
6000 0.41116
7000 0.53741
8000 0.70261
9000 0.93981
10000 0.97419
11000 1.28697
12000 1.51387
14000 2.07079
16000 2.69846
20000 4.41823
24000 6.46077
26000 7.64373
30000 9.99446
34000 12.966885
38000 16.281621
42000 19.180228
46000 21.984721
50000 26.950203
60000 43.0327746
\end{filecontents}
\begin{filecontents}{re-usize.data}
1 16
2 33
3 63
4 108
5 181
6 297
7 484
8 785
9 1271
10 2056
11 3325
12 5377
13 8696
14 14065
15 22751
16 36804
17 59541
18 96329
19 155852
20 252161
21 407991
22 660128
23 1068093
24 1728193
25 2796256
26 4524417
27 7320639
\end{filecontents}
\begin{filecontents}{re-ssize.data}
1 12
2 19
3 19
4 19
5 19
6 19
7 19
8 19
9 19
10 19
11 19
12 19
13 19
14 19
15 19
16 19
17 19
18 19
19 19
20 19
21 19
22 19
23 19
24 19
25 19
26 19
27 19
28 19
29 19
30 19
31 19
32 19
33 19
34 19
35 19
36 19
37 19
38 19
39 19
40 19
\end{filecontents}
\begin{document}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[t]
\frametitle{%
\begin{tabular}{@ {}c@ {}}
\\[-1mm]
\LARGE Fast Regular Expression \\[-1mm]
\LARGE Matching Using Derivatives\\[-3mm]
\end{tabular}}
\begin{center}
%\includegraphics[scale=0.3]{pics/ante1.jpg}\hspace{5mm}
%\includegraphics[scale=0.31]{pics/ante2.jpg}\\
%\footnotesize\textcolor{gray}{Antikythera automaton, 100 BC (Archimedes?)}
\end{center}
\normalsize
\begin{center}
\begin{tabular}{ll}
%Email: & christian.urban at kcl.ac.uk\\
%Office: & N\liningnums{7.07} (North Wing, Bush House)\\
%Slides: & KEATS
Student: & Chengsong Tan\\
Supervisor: & Christian Urban \\
Date: & 2019/5/10
\end{tabular}
\end{center}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[t]
\frametitle{Regular Expressions}
Their inductive definition:\\
\hspace{10pt}
\vspace{100pt}
\begin{textblock}{6}(1.5,3.5)
\begin{tabular}{@ {}rrl@ {\hspace{13mm}}l}
\bl{$r$} & \bl{$::=$} & \bl{$\ZERO$} & nothing\\
& \bl{$\mid$} & \bl{$\ONE$} & empty string / \pcode{""} / $[]$\\
& \bl{$\mid$} & \bl{$c$} & character\\
& \bl{$\mid$} & \bl{$r_1 + r_2$} & alternative / choice\\
& \bl{$\mid$} & \bl{$r_1 \cdot r_2$} & sequence\\
& \bl{$\mid$} & \bl{$r^*$} & star (zero or more)\\
\end{tabular}
\end{textblock}
\begin{itemize}
\item { Formalized by Stephen Coles Kleene in 1950s}
\item {\bf What could be possibly interesting?}
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\iffalse
\begin{frame}[t]
\frametitle{Regular Expressions}
John Regehr {\small(Univ.~Utah, LLVM compiler hacker)}\smallskip\\
\begin{bubble}[10.5cm]
\bf ``\ldots{}It’s effectively a perpetual
employment act for solid compiler hackers.''
\end{bubble}
\onslide<1->{
\only<2>{
\begin{itemize}
\item {\bf Hardware is getting weirder
rather than getting clocked faster}
\begin{itemize}
\item Almost all processors are
multicores nowadays and it looks like there is increasing asymmetry in
resources across cores. Processors come with vector units, crypto
accelerators etc. We have DSPs, GPUs,
ARM big.little, and Xeon Phi. This is only scratching the
surface.
\end{itemize}
\end{itemize}}
\only<3>{
\begin{itemize}
\item {\bf We’re getting tired of low-level languages and
their associated security disasters}
\begin{itemize}
\item
We want to write new code, to
whatever extent possible, in safer, higher-level
languages. Compilers are caught right in the middle of these
opposing trends: one of their main jobs is to help bridge the large
and growing gap between increasingly high-level languages and
increasingly wacky platforms.
\end{itemize}
\end{itemize}}}
\end{frame}
\fi
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\iffalse
\begin{frame}[c]
\frametitle{Why Bother?}
\begin{columns}[t]
\begin{column}{.5\textwidth}
Ruby, Python, Java 8\medskip\\
\begin{tikzpicture}\footnotesize
\begin{axis}[
xlabel={$n$},
x label style={at={(1.05,0.0)}},
ylabel={time in secs},
enlargelimits=false,
xtick={0,5,...,30},
xmax=33,
ymax=35,
ytick={0,5,...,30},
scaled ticks=false,
axis lines=left,
width=5.5cm,
height=4cm,
legend entries={Python,Ruby},
legend pos=north west,
legend cell align=left]
\addplot[blue,mark=*, mark options={fill=white}] table {re-python.data};
\addplot[brown,mark=triangle*, mark options={fill=white}] table {re-ruby.data};
\end{axis}
\end{tikzpicture}
\begin{tikzpicture}\footnotesize
\begin{axis}[
xlabel={$n$},
x label style={at={(1.05,0.0)}},
ylabel={time in secs},
enlargelimits=false,
xtick={0,5,...,30},
xmax=33,
ymax=35,
ytick={0,5,...,30},
scaled ticks=false,
axis lines=left,
width=5.5cm,
height=4cm,
legend entries={Python, Java 8},
legend pos=north west,
legend cell align=left]
\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
\end{axis}
\end{tikzpicture}
\end{column}
\begin{column}{.5\textwidth}
Us (after next lecture)\medskip\\
\begin{tikzpicture}\footnotesize
\begin{axis}[
xlabel={$n$},
x label style={at={(1.07,0.0)}},
ylabel={time in secs},
enlargelimits=false,
xtick={0,5000,...,10000},
xmax=11000,
ymax=35,
ytick={0,5,...,30},
scaled ticks=false,
axis lines=left,
width=5.5cm,
height=4cm]
\addplot[green,mark=square*,mark options={fill=white}] table {re2.data};
\addplot[black,mark=square*,mark options={fill=white}] table {re3.data};
\end{axis}
\end{tikzpicture}
\begin{tikzpicture}\footnotesize
\begin{axis}[
xlabel={$n$},
x label style={at={(1.07,0.0)}},
ylabel={time in secs},
enlargelimits=false,
ymax=35,
ytick={0,5,...,30},
scaled ticks=false,
axis lines=left,
width=5.5cm,
height=4cm]
\addplot[black,mark=square*,mark options={fill=white}] table {re3a.data};
\end{axis}
\end{tikzpicture}
\end{column}
\end{columns}\bigskip
\small\centering
matching \texttt{[a?]\{n\}[a]\{n\}} and \texttt{(a*)*b}
against $\underbrace{\texttt{a}...\texttt{a}}_n$
\end{frame}
\fi
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[c]
\frametitle{Catastrophic Backtracking}
\begin{columns}
\begin{column}{.5\textwidth}
\begin{tikzpicture}
\begin{axis}[
xlabel={$n$},
x label style={at={(1.07,0.0)}},
ylabel={time in secs},
%y label style={at={(0.06,0.5)}},
enlargelimits=false,
xtick={0,5,...,30},
xmax=33,
ymax=45,
ytick={0,10,...,40},
scaled ticks=false,
axis lines=left,
width=5cm,
height=4cm,
legend entries={Python, Java 8},
legend pos=north west]
\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
\end{axis}
\end{tikzpicture}
\begin{tikzpicture}
\begin{axis}[
xlabel={$n$},
x label style={at={(1.07,0.0)}},
ylabel={time in secs},
%y label style={at={(0.06,0.5)}},
%enlargelimits=false,
%xtick={0,5000,...,30000},
xmax=65000,
ymax=45,
ytick={0,10,...,40},
scaled ticks=false,
axis lines=left,
width=5cm,
height=4cm,
legend entries={Java 9},
legend pos=north west]
\addplot[cyan,mark=*, mark options={fill=white}] table {re-java9.data};
\end{axis}
\end{tikzpicture}
\end{column}
\end{columns}\bigskip
\center
matching \texttt{(a*)*b}
against $\underbrace{\texttt{a}...\texttt{a}}_n$
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{Impact}
\begin{itemize}
\item Network Traffic Analysis
\begin{itemize}
\item Snort: > 5 million downloads
\item Bro: > 10000 downloads
\item thousands of regexes
\item \alert{R}egular \alert{e}xpression \alert{D}enial \alert{o}f \alert{S}ervice (ReDoS)\medskip
\end{itemize}
\begin{verbatim}
Jan 2 00:53:19 talisker sshd: Received disconnect from 110.53.183.227: [preauth]
Jan 2 00:58:31 talisker sshd: Received disconnect from 110.53.183.252: [preauth]
Jan 2 01:01:28 talisker sshd: Received disconnect from 221.194.47.236: [preauth]
Jan 2 01:03:59 talisker sshd: Received disconnect from 110.53.183.228: [preauth]
Jan 2 01:06:53 talisker sshd: Received disconnect from 221.194.47.245: [preauth]
...
\end{verbatim}
\item Compilers: lexical analysis
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[c]
\frametitle{Evil Regular Expressions}
\begin{itemize}
%\item \alert{R}egular \alert{e}xpression \alert{D}enial \alert{o}f \alert{S}ervice (ReDoS)\medskip
\item Evil regular expressions\medskip
\begin{itemize}
\item \bl{$(a^{?\{n\}}) \cdot a^{\{n\}}$}
\item \bl{$(a^*)^*\cdot b$}
\item \bl{$([a$\,-\,$z]^+)^*$}
\item \bl{$(a + a \cdot a)^*$}
\item \bl{$(a + a^?)^*$}
\item \bl{$\^(.*?,)\{11\}P$}
\item \bl{$<html>.*?<head>.*?<title>.*?</title>.*?</head>.*?<body[^>]*>.*?</body>.*?</html>$}
\end{itemize}
\item Pearl Compatible Regular Expression(PCRE) \bl{$(r) \backslash 1$}
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[c]
\frametitle{A Concrete Example}
\begin{adjustwidth*}{}{-1.2cm}
\resizebox{13cm}{4cm}{
\begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto]
\node[state,initial] (q_0) {$0$};
\node[state] (q_1) [right = of q_0] {$1$};
\node[state] (q_2) [right=of q_1] {$2$};
\node (q_dots1) [right = of q_2] {$\cdots$};
\node [state] (q_n) [right = of q_dots1] {$n$};
\node [state] (q_n1) [right = of q_n] {$n+1$};
\node[state] (q_n2) [right=of q_n1] {$n+2$};
\node[state,accepting] (q_n3) [right=of q_n2] {$n+3$};
\path[->]
(q_0) edge [loop above] node {$*$} (q_0)
(q_0) edge node {$a$} (q_1)
(q_1) edge node {$*$} (q_2)
(q_2) edge node {$*$} (q_dots1)
(q_dots1) edge node{$*$} (q_n)
(q_n) edge node{$*$} (q_n1)
(q_n1) edge node{$*$} (q_n2)
(q_n2) edge node{$*$} (q_n3);
\draw [decorate,decoration={brace,amplitude=10pt,mirror,raise=10pt},yshift=0pt]
(q_2.south west) -- (q_n1.south east) node [black,midway,yshift = -2cm] {\footnotesize n states};
%\draw [decorate,decoration={brace,amplitude=10pt,mirror,raise=4pt},yshift=0pt]
%(3.5,0.65) -- (3.5,6.5) node [black,midway,xshift=0.8cm] {\footnotesize
%$P_2$};
\end{tikzpicture}
}
\end{adjustwidth*}
\begin{itemize}
\item
NFA accepting Regex $.*a.{n}bc$
\item
Matching the string aaaaaaaaaaaaaaa.....abc
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[c]
\frametitle{Brzozowski Derivative}
\begin{itemize}
\item Brzozowski invented in his 1964 Phd thesis
\item Intuition: Chopping off the first character
\begin{center}
\bl{$A \backslash c \dn \{ s \;|\; c\!::\!s \in A\}$ }
\end{center}
\item
For \bl{$A = \{\textit{foo}, \textit{bar}, \textit{frak}\}$} then
\begin{center}
\bl{\begin{tabular}{l@{\hspace{2mm}}c@{\hspace{2mm}}l}
$A \backslash f $ & $=$ & $\{\textit{oo}, \textit{rak}\}$\\
$A \backslash b $ & $=$ & $\{\textit{ar}\}$\\
$A \backslash a $ & $=$ & $\{\}$\pause
\end{tabular}}
\end{center}
\item
\begin{tabular}{rp{4cm}}
\bl {$r \,matches \,s=c_1c_2...c_n$}
\bl{$\Leftrightarrow$} \bl{$c_1c_2...c_n \in L(r)$} \bl{$\Leftrightarrow$}\\
\bl{$c_2...c_n \in L(r) \backslash c_1$}\\
\bl{$\Leftrightarrow$} \bl{$\cdots$}\\
\bl{$\Leftrightarrow$} \bl{$[] \in ((L(r)\backslash c_1) \backslash c_2)\cdots\backslash c_n$}
\end{tabular}
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[c]
\frametitle{Previous Example}
\begin{center}
\begin{tikzpicture}
\begin{axis}[
xlabel={$n$},
x label style={at={(1.09,-0.15)}},
ylabel={time in secs},
scaled x ticks=false,
enlargelimits=false,
xtick distance=10000,
xmax=44000,
ytick={0,10,...,30},
ymax=35,
axis lines=left,
width=7cm,
height=3.5cm,
legend entries={Java \liningnums{9}+},
legend pos=north west,
legend cell align=left]
\addplot[blue,mark=square*,mark options={fill=white}] table {re-java9.data};
\end{axis}
\end{tikzpicture}
\end{center}
\begin{center}
\begin{tikzpicture}
\begin{axis}[
xlabel={$n$},
x label style={at={(1.09,0.0)}},
ylabel={time in secs},
scaled x ticks=false,
xtick distance=2000000,
enlargelimits=false,
xmax=6400000,
ytick={0,10,...,30},
ymax=35,
axis lines=left,
width=7cm,
height=3.5cm,
legend entries={Simple Scala},
legend pos=north west,
legend cell align=left]
%\addplot[green,mark=square*,mark options={fill=white}] table {re2a.data};
\addplot[black,mark=square*,mark options={fill=white}] table {re3a.data};
\end{axis}
\end{tikzpicture}
\end{center}
Regex: \bl{$(a^*)^* \cdot b$} \space\space\space\space\space\space
Strings of the form \bl{$\underbrace{\,a\ldots a\,}_{n}$}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[t]
\frametitle{Optimization}
\begin{center}
\begin{tikzpicture}
\begin{semilogyaxis}[
xlabel={$n$},
x label style={at={(1.05,0.0)}},
ylabel={regex size},
enlargelimits=false,
xtick={1,4,...,30},
xmax=33,
%ytick={0, 10,...,100},
scaled ticks=false,
axis lines=left,
width=9cm,
height=4.5cm,
legend entries={Simple Scala},
legend pos= south east,
legend cell align=left
]
\addplot[black,mark=square*, mark options={fill=white}] table {re-usize.data};
%\addplot[brown,mark=pentagon*, mark options={fill=white}] table {re-ssize.data};
%\addplot[red,mark=triangle*,mark options={fill=white}] table {re1.data};
%\addplot[green,mark=square*,mark options={fill=white}] table {re2.data};
\end{semilogyaxis}
\end{tikzpicture}
\end{center}
Regex: \bl{$(a+aa)^* \cdot b$} \space\space\space\space\space\space
Strings of the form \bl{$\underbrace{\,a\ldots a\,}_{n}$}\\
\begin{itemize}
\item
Solution: Simplification%(Sulzmann \& Lu)
\begin{itemize}
\item $r+(r+s) = r+r+s = r+s$
\item $1 \cdot r= r$
\end{itemize}
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[t]
\frametitle{Optimization}
Regex: \bl{$(a+aa)^* \cdot b$} \space\space\space\space\space\space
Strings of the form \bl{$\underbrace{\,a\ldots a\,}_{n}$}
\begin{center}
\begin{tikzpicture}
\begin{axis}[
xlabel={$n$},
x label style={at={(1.05,0.0)}},
ylabel={regex size},
enlargelimits=false,
xtick={1,4,...,30},
xmax=33,
%ytick={0, 10,...,100},
scaled ticks=false,
axis lines=left,
width=9cm,
height=4.5cm,
legend entries={Simplification Applied},
legend pos= south east,
legend cell align=left
]
%\addplot[black,mark=square*, mark options={fill=white}] table {re-usize.data};
\addplot[black,mark=triangle*, mark options={fill=white}] table {re-ssize.data};
%\addplot[red,mark=triangle*,mark options={fill=white}] table {re1.data};
%\addplot[green,mark=square*,mark options={fill=white}] table {re2.data};
\end{axis}
\end{tikzpicture}
\end{center}
\begin{itemize}
\item
Lexer? Not just matcher
\begin{itemize}
\item
Bit-Coded Regular Expression Parsing (Lasse Nielsen, Fritz Henglein 2011)
\item
Bit-Coded Regular Expression Parsing with Simplification (Sulzmann and Lu 2014)
\end{itemize}
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[t]
\frametitle{Ongoing Work}
\begin{tikzpicture}
\begin{semilogyaxis}[
xlabel={$n$},
x label style={at={(1.05,0.0)}},
ylabel={regex size},
enlargelimits=false,
xtick={1,4,...,30},
xmax=33,
%ytick={0, 10,...,100},
scaled ticks=false,
axis lines=left,
width=9cm,
height=4.5cm,
legend entries={Naive, Simp},
legend pos= north west,
legend cell align=left
]
\addplot[black,mark=square*, mark options={fill=white}] table {re-usize.data};
\addplot[black,mark=triangle*, mark options={fill=white}] table {re-ssize.data};
%\addplot[red,mark=triangle*,mark options={fill=white}] table {re1.data};
%\addplot[green,mark=square*,mark options={fill=white}] table {re2.data};
\end{semilogyaxis}
\end{tikzpicture}
\begin{itemize}
\item
Correctness of Sulzmann \& Lu's algorithm
\item
Size Bound $O(n^2t^2)$ of Dervatives (1995 Antimirov "Partial Derivative")
\item
More radical simplifications.
\item
Extend the algorithm to back-references.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[c]
\frametitle{\begin{tabular}{c}\\[3cm]\alert{Questions?}\end{tabular}}
\mbox{}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\end{document}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: t
%%% End: