text/proposal.tex
author Chengsong
Mon, 10 Jul 2023 14:32:48 +0100
changeset 663 0d1e68268d0f
parent 359 fedc16924b76
permissions -rw-r--r--
more explanation for the name "closed form" and their intuition
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
359
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
     1
% !TEX program = pfdlatex
283
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
     2
\documentclass{article}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
     3
\usepackage{url}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
     4
\usepackage{color}\definecolor{darkblue}{rgb}{0,0,0.5}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
     5
\usepackage[a4paper,colorlinks=true,linkcolor=darkblue,citecolor=darkblue,filecolor=darkblue,pagecolor=darkblue,urlcolor=darkblue]{hyperref}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
     6
\usepackage{style}
359
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
     7
%\usepackage{langs}
283
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
     8
\usepackage{tikz}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
     9
\usepackage{pgf}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    10
\usepackage{pgfplots}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    11
\usepackage{stackengine}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    12
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    13
\addtolength{\oddsidemargin}{-6mm}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    14
\addtolength{\evensidemargin}{-6mm}
359
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
    15
\addtolength{\textwidth}{12mm} 
283
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    16
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    17
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    18
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    19
%% \usepackage{accents}
359
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
    20
\newcommand\barbelow[1]{\stackunbder[1.2pt]{#1}{\raisebox{-4mm}{\boldmath$\uparrow$}}}
283
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    21
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    22
\begin{filecontents}{re-python2.data}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    23
1 0.033
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    24
5 0.036
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    25
10 0.034
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    26
15 0.036
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    27
18 0.059
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    28
19 0.084 
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    29
20 0.141
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    30
21 0.248
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    31
22 0.485
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    32
23 0.878
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    33
24 1.71
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    34
25 3.40
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    35
26 7.08
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    36
27 14.12
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    37
28 26.69
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    38
\end{filecontents}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    39
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    40
\begin{filecontents}{re-java.data}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    41
5  0.00298
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    42
10  0.00418
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    43
15  0.00996
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    44
16  0.01710
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    45
17  0.03492
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    46
18  0.03303
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    47
19  0.05084
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    48
20  0.10177
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    49
21  0.19960
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    50
22  0.41159
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    51
23  0.82234
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    52
24  1.70251
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    53
25  3.36112
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    54
26  6.63998
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    55
27  13.35120
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    56
28  29.81185
290
ed3169a567ea updated
Christian Urban <urbanc@in.tum.de>
parents: 285
diff changeset
    57
\end{filecontents} 
283
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    58
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    59
\begin{filecontents}{re-java9.data}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    60
1000  0.01410
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    61
2000  0.04882
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    62
3000  0.10609
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    63
4000  0.17456
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    64
5000  0.27530
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    65
6000  0.41116
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    66
7000  0.53741
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    67
8000  0.70261
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    68
9000  0.93981
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    69
10000 0.97419
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    70
11000 1.28697
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    71
12000 1.51387
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    72
14000 2.07079
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    73
16000 2.69846
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    74
20000 4.41823
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    75
24000 6.46077
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    76
26000 7.64373
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    77
30000 9.99446
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    78
34000 12.966885
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    79
38000 16.281621
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    80
42000 19.180228
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    81
46000 21.984721
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    82
50000 26.950203
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    83
60000 43.0327746
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    84
\end{filecontents}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    85
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    86
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    87
\begin{document}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    88
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    89
  
285
acc027964d10 updated
Christian Urban <urbanc@in.tum.de>
parents: 284
diff changeset
    90
\section*{PPProposal: \\
283
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    91
  Fast Regular Expression Matching\\
359
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
    92
  with Brzozowski bderivatives}
283
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    93
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    94
\subsubsection*{Summary}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    95
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    96
If you want to connect a computer directly to the Internet, it must be
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    97
immediately \mbox{hardened} against outside attacks. The current
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    98
technology for this is to use regular expressions in order to
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    99
automatically scan all incoming network traffic for signs when a
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   100
computer is under attack and if found, to take appropriate
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   101
counter-actions. One possible action could be to slow down the traffic
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   102
from sites where repeated password-guesses originate. Well-known
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   103
network intrusion prevention systems that use regular expressions for
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   104
traffic analysis are Snort and Bro.
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   105
359
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   106
Given the large volume of Internet traffic even the smallest servers
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   107
can handle nowadays, the regular expressions for traffic analysis have
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   108
become a real bottleneck.  This is not just a nuisance, but actually a
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   109
security vulnerability in itself: it can lead to denial-of-service
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   110
attacks.  The proposed project aims to remove this bottleneck by using
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   111
a little-known technique of building bderivatives of regular
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   112
expressions. These derivatives have not yet been used in the area of
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   113
network traffic analysis, but have the potential to solve some
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   114
tenacious problems with existing approaches. The project will require
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   115
theoretical work, but also a practical implementation (a
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   116
proof-of-concept at least).  The work will build on earlier work by
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   117
Ausaf and Urban from King's College London
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   118
\cite{AusafDyckhoffUrban2016}.
283
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   119
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   120
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   121
\subsubsection*{Background}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   122
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   123
If every 10 minutes a thief turned up at your car and rattled
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   124
violently at the doorhandles to see if he could get in, you would move
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   125
your car to a safer neighbourhood. Computers, however, need to stay
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   126
connected to the Internet all the time and there they have to withstand such
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   127
attacks. A rather typical instance of an attack can be seen in the
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   128
following log entries from a server at King's:
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   129
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   130
{\small
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   131
\begin{verbatim}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   132
 Jan 2 00:53:19 talisker sshd: Received disconnect from 110.53.183.227: [preauth]
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   133
 Jan 2 00:58:31 talisker sshd: Received disconnect from 110.53.183.252: [preauth]
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   134
 Jan 2 01:01:28 talisker sshd: Received disconnect from 221.194.47.236: [preauth]
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   135
 Jan 2 01:03:59 talisker sshd: Received disconnect from 110.53.183.228: [preauth]
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   136
 Jan 2 01:06:53 talisker sshd: Received disconnect from 221.194.47.245: [preauth]
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   137
 ...
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   138
\end{verbatim}}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   139
  
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   140
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   141
\noindent
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   142
This is a record of the network activities from the server
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   143
\href{http://talisker.inf.kcl.ac.uk/cgi-bin/repos.cgi}{talisker.inf.kcl.ac.uk},
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   144
which hosts lecture material for students. The log indicates several
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   145
unsuccessful login attempts via the \texttt{ssh} program from
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   146
computers with the addresses \texttt{110.53.183.227} and so on. This
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   147
is a clear sign of a \emph{brute-force attack} where hackers
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   148
systematically try out password candidates.  Such attacks are blunt,
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   149
but unfortunately very powerful.  They can originate from anywhere in
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   150
the World; they are automated and often conducted from computers that
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   151
have been previously hijacked.  Once the attacker ``hits'' on the
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   152
correct password, then he or she gets access to the server. The only
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   153
way to prevent this methodical password-guessing is to spot the
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   154
corresponding traces in the log and then slow down the traffic from
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   155
such sites in a way that perhaps only one password per hour can be
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   156
tried out. This does not affect ``normal'' operations of the server,
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   157
but drastically decreases the chances of hitting the correct password
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   158
by a brute-force attack.
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   159
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   160
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   161
Server administrators use \emph{regular expressions} for scanning log
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   162
files. The purpose of these expressions is to specify patterns of
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   163
interest (for example \texttt{Received disconnect from 110.53.183.227}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   164
where the address needs to be variable). A program will
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   165
then continuously scan for such patterns and trigger an action if a
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   166
pattern is found. Popular examples of such programs are Snort and Bro
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   167
\cite{snort,bro}.  Clearly, there are also other kinds of
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   168
vulnerabilities hackers try to take advantage of---mainly programming
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   169
mistakes that can be abused to gain access to a computer. This means
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   170
server administrators have a suite of sometimes thousands of regular
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   171
expressions, all prescribing some suspicious pattern for when a
359
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   172
computer is unbder attack.
283
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   173
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   174
359
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   175
The unbderlying algorithmic problem is to decide whether a string in
283
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   176
the logs matches the patterns determined by the regular
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   177
expressions. Such decisions need to be done as fast as possible,
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   178
otherwise the scanning programs would not be useful as a hardening
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   179
technique for servers. The quest for speed with these decisions is
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   180
presently a rather big challenge for the amount of traffic typical
305
6e2cef17a9b3 updated
Christian Urban <urbanc@in.tum.de>
parents: 290
diff changeset
   181
servers have to cope with and the large number of patterns that might
283
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   182
be of interest.  The problem is that when thousands of regular
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   183
expressions are involved, the process of detecting whether a string
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   184
matches a regular expression can be excruciating slow. This might not
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   185
happen in most instances, but in some small number of
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   186
instances. However in these small number of instances the algorithm
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   187
for regular expression matching can grind to a halt.  This phenomenon
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   188
is called \emph{catastrophic backtracking} \cite{catast}.
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   189
Unfortunately, catastrophic backtracking is not just a nuisance, but a
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   190
security vulnerability in itself. The reason is that attackers look
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   191
for these instances where the scan is very slow and then trigger a
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   192
\emph{denial-of-service attack} against the server\ldots meaning the
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   193
server will not be reachable anymore to normal users.
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   194
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   195
Existing approaches try to mitigate the speed problem with regular
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   196
expressions by implementing the matching algorithms in hardware,
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   197
rather than in software \cite{hardware}. Although this solution offers
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   198
speed, it is often unappealing because attack patterns can change or
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   199
need to be augmented. A software solution is clearly more desirable in
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   200
these circumstances.  Also given that regular expressions were
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   201
introduced in 1950 by Kleene, one might think regular expressions have
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   202
since been studied and implemented to death. But this would definitely
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   203
be a mistake: in fact they are still an active research area and
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   204
off-the-shelf implementations are still extremely prone to the
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   205
problem of catastrophic backtracking. This can be seen in the
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   206
following two graphs:
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   207
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   208
\begin{center}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   209
\begin{tabular}{@{}cc@{}}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   210
%\multicolumn{2}{c}{Graph: $(a^*)^*\cdot b$ and strings 
359
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   211
%           $\unbderbrace{a\ldots a}_{n}$}\smallskip\\  
283
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   212
\begin{tikzpicture}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   213
\begin{axis}[
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   214
    xlabel={$n$},
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   215
    x label style={at={(1.05,0.0)}},
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   216
    ylabel={time in secs},
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   217
    y label style={at={(0.06,0.5)}},
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   218
    enlargelimits=false,
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   219
    xtick={0,5,...,30},
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   220
    xmax=33,
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   221
    ymax=45,
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   222
    ytick={0,10,...,40},
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   223
    scaled ticks=false,
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   224
    axis lines=left,
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   225
    width=6cm,
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   226
    height=4.5cm, 
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   227
    legend entries={Python, Java 8},  
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   228
    legend pos=north west]
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   229
\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   230
\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   231
\end{axis}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   232
\end{tikzpicture}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   233
  & 
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   234
\begin{tikzpicture}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   235
\begin{axis}[
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   236
    xlabel={$n$},
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   237
    x label style={at={(1.05,0.0)}},
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   238
    ylabel={time in secs},
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   239
    y label style={at={(0.06,0.5)}},
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   240
    %enlargelimits=false,
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   241
    %xtick={0,5000,...,30000},
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   242
    xmax=65000,
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   243
    ymax=45,
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   244
    ytick={0,10,...,40},
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   245
    scaled ticks=false,
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   246
    axis lines=left,
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   247
    width=6cm,
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   248
    height=4.5cm, 
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   249
    legend entries={Java 9},  
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   250
    legend pos=north west]
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   251
\addplot[cyan,mark=*, mark options={fill=white}] table {re-java9.data};
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   252
\end{axis}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   253
\end{tikzpicture}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   254
\end{tabular}  
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   255
\end{center}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   256
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   257
\noindent
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   258
These graphs show how long strings can be when using the rather simple
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   259
regular expression $(a^*)^* \,b$ and the built-in regular expression
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   260
matchers in the popular programming languages Python and Java (Version
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   261
8 and Version 9). There are many more regular expressions that show
284
913205c1fd0d updated
Christian Urban <urbanc@in.tum.de>
parents: 283
diff changeset
   262
the same behaviour.  On the left-hand side the graphs show that for a
359
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   263
string consisting of just 28 $a\,$'s, Python and the olbder Java (which
283
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   264
was the latest version of Java until September 2017) already need 30
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   265
seconds to decide whether this string is matched or not. This is an
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   266
abysmal result given that Python and Java are very popular programming
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   267
languages. We already know that this problem can be decided much, much
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   268
faster.  If these slow regular expression matchers would be employed
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   269
in network traffic analysis, then this example is already an
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   270
opportunity for mounting an easy denial-of-service attack: one just
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   271
has to send to the server a large enough string of a's. The newer
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   272
version of Java is better---it can match strings of around 50,000
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   273
$a\,$'s in 30 seconds---however this would still not be good enough
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   274
for being a useful tool for network traffic analysis. What is
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   275
interesting is that even a relatively simple-minded regular expression
359
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   276
matcher based on Brzozowski bderivatives can out-compete the regular
283
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   277
expressions matchers in Java and Python on such catastrophic
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   278
backtracking examples. This gain in speed is the motivation and
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   279
starting point for the proposed project. \smallskip
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   280
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   281
\subsection*{Research Plan}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   282
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   283
Regular expressions are already an old and well-studied topic in
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   284
Computer Science. They were originally introduced by Kleene in 1951
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   285
and are utilised in text search algorithms for ``find'' or ``find and
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   286
replace'' operations \cite{Brian,Kleene51}. Because of their wide
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   287
range of applications, many programming languages provide capabilities
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   288
for regular expression matching via libraries.  The classic theory
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   289
of regular expressions involves just 6 different kinds of regular
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   290
expressions, often defined in terms of the following grammar:
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   291
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   292
\begin{center}\small
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   293
\begin{tabular}{lcll}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   294
  $r$ & $::=$ & $\ZERO$     & cannot match anything\\
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   295
      &   $|$ & $\ONE$      & can only match the empty string\\
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   296
      &   $|$ & $c$         & can match a single character (in this case $c$)\\
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   297
      &   $|$ & $r_1 + r_2$ & can match a string either with $r_1$ or with $r_2$\\
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   298
  &   $|$ & $r_1\cdot r_2$ & can match the first part of a string with $r_1$ and\\
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   299
          &  & & then the second part with $r_2$\\
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   300
      &   $|$ & $r^*$       & can match zero or more times $r$\\
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   301
\end{tabular}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   302
\end{center}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   303
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   304
\noindent
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   305
For practical purposes, however, regular expressions have been
359
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   306
extended in various ways in orbder to make them even more powerful and
283
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   307
even more useful for applications. Some of the problems to do with
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   308
catastrophic backtracking stem, unfortunately, from these extensions.
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   309
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   310
The crux in this project is to not use automata for regular expression
359
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   311
matching (the traditional technique), but to use bderivatives of
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   312
regular expressions instead.  These bderivatives were introduced by
283
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   313
Brzozowski in 1964 \cite{Brzozowski1964}, but somehow had been lost
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   314
``in the sands of time'' \cite{Owens2009}. However, they recently have
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   315
become again a ``hot'' research topic with numerous research papers
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   316
appearing in the last five years. One reason for this interest is that
359
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   317
Brzozowski bderivatives straightforwardly extend to more general
283
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   318
regular expression constructors, which cannot be easily treated with
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   319
standard techniques. They can also be implemented with ease in any
359
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   320
functional programming language: the definition for bderivatives
283
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   321
consists of just two simple recursive functions shown in
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   322
Figure~\ref{f}. Moreover, it can be easily shown (by a mathematical
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   323
proof) that the resulting regular matcher is in fact always producing
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   324
a correct answer. This is an area where Urban can provide a lot of
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   325
expertise for this project: he is one of the main developers of the
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   326
Isabelle theorem prover, which is a program designed for such proofs
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   327
\cite{Isabelle}.
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   328
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   329
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   330
\begin{figure}[t]
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   331
\begin{center}\small
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   332
\begin{tabular}{lcl}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   333
$\textit{nullable}(\ZERO)$ & $\dn$ & $\textit{false}$\\
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   334
$\textit{nullable}(\ONE)$  & $\dn$ & $\textit{true}$\\
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   335
$\textit{nullable}(c)$     & $\dn$ & $\textit{false}$\\
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   336
$\textit{nullable}(r_1 + r_2)$ & $\dn$ & $\textit{nullable}(r_1) \vee \textit{nullable}(r_2)$\\
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   337
$\textit{nullable}(r_1 \cdot r_2)$ & $\dn$ & $\textit{nullable}(r_1) \wedge \textit{nullable}(r_2)$\\
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   338
$\textit{nullable}(r^*)$ & $\dn$ & $\textit{true}$\medskip\\
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   339
359
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   340
$\textit{bder}\;c\;(\ZERO)$ & $\dn$ & $\ZERO$\\
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   341
$\textit{bder}\;c\;(\ONE)$  & $\dn$ & $\ZERO$\\
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   342
$\textit{bder}\;c\;(d)$     & $\dn$ & $\textit{if}\; c = d\;\textit{then} \;\ONE \; \textit{else} \;\ZERO$\\
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   343
$\textit{bder}\;c\;(r_1 + r_2)$ & $\dn$ & $(\textit{bder}\;c\;r_1) + (\textit{bder}\;c\;r_2)$\\
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   344
$\textit{bder}\;c\;(r_1 \cdot r_2)$ & $\dn$ & $\textit{if}\;\textit{nullable}(r_1)$\\
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   345
      & & $\textit{then}\;((\textit{bder}\;c\;r_1)\cdot r_2) + (\textit{bder}\;c\;r_2)$\\
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   346
      & & $\textit{else}\;(\textit{bder}\;c\;r_1)\cdot r_2$\\
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   347
$\textit{bder}\;c\;(r^*)$ & $\dn$ & $(\textit{bder}\;c\;r)\cdot (r^*)$\\[-5mm]
283
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   348
\end{tabular}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   349
\end{center}
359
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   350
\caption{The complete definition of bderivatives of regular expressions
283
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   351
  \cite{Brzozowski1964}.
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   352
  This definition can be implemented with ease in functional programming languages and can be easily extended to regular expressions used in practice.
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   353
  The are more flexible for applications and easier to manipulate in
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   354
  mathematical proofs, than traditional techniques for regular expression
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   355
  matching.\medskip\label{f}}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   356
\end{figure}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   357
359
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   358
There are a number of avenues for research on Brzozowski bderivatives.
283
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   359
One problem I like to immediately tackle in this project is how to
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   360
handle \emph{back-references} in regular expressions. It is known that
359
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   361
the inclusion of back-references causes the unbderlying algorithmic
283
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   362
problem to become NP-hard \cite{Aho}, and is the main reason why
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   363
regular expression matchers suffer from the catastrophic backtracking
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   364
problem. However, the full generality of back-references are not
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   365
needed in practical applications. The goal therefore is to sufficiently
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   366
restrict the problem so that an efficient algorithm can be
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   367
designed. There seem to be good indications that Brzozowski
359
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   368
bderivatives are useful for that.
283
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   369
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   370
Another problem is \emph{how} regular expressions match a string
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   371
\cite{Sulzmann2014}. In this case the algorithm does not just give a
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   372
yes/no answer about whether the regular expression matches the string,
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   373
but also generates a value that encodes which part of the string is
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   374
matched by which part of the regular expression. This is important in
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   375
applications, like network analysis where from a matching log entry a
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   376
specific computer address needs to be extracted.  Also compilers make
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   377
extensive use of such an extension when they lex their source
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   378
programs, i.e.~break up code into word-like components.  Again Ausaf
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   379
and Urban from King's made some initial progress about proving the
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   380
correctness of such an extension, but their algorithm is not yet fast
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   381
enough for practical purposes \cite{AusafDyckhoffUrban2016}. It would
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   382
be desirable to study optimisations that make the algorithm faster,
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   383
but preserve the correctness guaranties obtained by Ausaf and Urban.
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   384
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   385
\mbox{}\\[-11mm]
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   386
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   387
\subsubsection*{Conclusion}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   388
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   389
Much research has already gone into regular expressions, and one might
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   390
think they have been studied and implemented to death. But this is far
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   391
from the truth. In fact regular expressions have become a ``hot''
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   392
research topic in recent years because of problems with the
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   393
traditional methods (in applications such as network traffic
359
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   394
analysis), but also because the technique of bderivatives of regular
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   395
expression has been re-discovered. These bderivatives provide an
283
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   396
alternative approach to regular expression matching. Their potential
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   397
lies in the fact that they are more flexible in implementations and
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   398
much easier to manipulate in mathematical proofs. Therefore I like to
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   399
research them in this project.\medskip
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   400
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   401
\noindent {\bf Impact:} Regular expression matching is a core
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   402
technology in Computer Science with numerous applications. I will
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   403
focus on the area of network traffic analysis and also lexical
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   404
analysis. In these areas this project can have a quite large impact:
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   405
for example the program Bro has been downloaded at least 10,000 times and Snort
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   406
even has 5 million downloads and over 600,000 registered users
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   407
\cite{snort,bro}. Lexical analysis is a core component in every
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   408
compiler, which are the bedrock on which nearly all programming is
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   409
built on.
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   410
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   411
% {\small
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   412
% \begin{itemize}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   413
% \item[$\bullet$] \url{http://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   414
% \item[$\bullet$] \url{https://vimeo.com/112065252}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   415
% \item[$\bullet$] \url{http://davidvgalbraith.com/how-i-fixed-atom/}  
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   416
% \end{itemize}}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   417
\mbox{}\\[-11mm]
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   418
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   419
\small
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   420
\bibliographystyle{plain}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   421
\renewcommand{\refname}{References\\[-7mm]\mbox{}}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   422
\bibliography{proposal}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   423
359
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   424
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   425
\newpage
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   426
\begin{center}\small 
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   427
\begin{tabular}{lcl}
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   428
$\textit{bder}\;c\;_{[]}(\ZERO)$ & $\dn$ & $_{[]}\ZERO$\\
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   429
$\textit{bder}\;c\;_{bs}(\ONE)$  & $\dn$ & $_{[]}\ZERO$\\
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   430
$\textit{bder}\;c\;_{bs}(d)$     & $\dn$ & $\textit{if}\; c = d\;\textit{then} \;_{bs}\ONE \; \textit{else} \;_{[]}\ZERO$\\
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   431
$\textit{bder}\;c\;_{bs}(r_1 + r_2)$ & $\dn$ & $_{bs}(\textit{bder}\;c\;r_1 + \textit{bder}\;c\;r_2)$\\
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   432
$\textit{bder}\;c\;_{bs}(r_1 \cdot r_2)$ & $\dn$ & $\textit{if}\;\textit{nullable}(r_1)$\\
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   433
      & & $\textit{then}\;_{bs}(_{[]}(\textit{bder}\;c\;r_1)\cdot r_2) + (_{\textit{bmkeps}\,r_1\rightarrow}\textit{bder}\;c\;r_2)$\\
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   434
      & & $\textit{else}\;_{bs}(\textit{bder}\;c\;r_1)\cdot r_2$\\
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   435
$\textit{bder}\;c\;_{bs}(r^*)$ & $\dn$ & $_{bs}(_{0\rightarrow}\textit{bder}\;c\;r)\cdot (r^*)$\\[-5mm]
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   436
\end{tabular}
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   437
\end{center}
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   438
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   439
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   440
fedc16924b76 updated
Christian Urban <christian.urban@kcl.ac.uk>
parents: 305
diff changeset
   441
283
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   442
\end{document}
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   443
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   444
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   445
%%% Local Variables: 
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   446
%%% mode: latex
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   447
%%% TeX-master: t
c4d821c6309d updated
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
   448
%%% End: