9ms/ecoop_paper.tex
author Chengsong
Sat, 29 Jun 2019 12:28:49 +0100
changeset 28 1a92233763bd
child 29 cf8ebf62bded
permissions -rw-r--r--
created folder for 9mth report
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
28
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
     1
\documentclass[a4paper,UKenglish]{lipics}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
     2
\usepackage{graphic}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
     3
\usepackage{data}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
     4
% \documentclass{article}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
     5
%\usepackage[utf8]{inputenc}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
     6
%\usepackage[english]{babel}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
     7
%\usepackage{listings}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
     8
% \usepackage{amsthm}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
     9
% \usepackage{hyperref}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    10
% \usepackage[margin=0.5in]{geometry}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    11
%\usepackage{pmboxdraw}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    12
 
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    13
\title{POSIX Regular Expression Matching and Lexing}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    14
\author{Chengsong Tan}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    15
\affil{King's College London\\
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    16
London, UK\\
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    17
\texttt{chengsong.tan@kcl.ac.uk}}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    18
\authorrunning{Chengsong Tan}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    19
\Copyright{Chengsong Tan}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    20
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    21
\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    22
\newcommand{\ZERO}{\mbox{\bf 0}}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    23
\newcommand{\ONE}{\mbox{\bf 1}}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    24
\def\lexer{\mathit{lexer}}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    25
\def\mkeps{\mathit{mkeps}}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    26
\def\inj{\mathit{inj}}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    27
\def\Empty{\mathit{Empty}}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    28
\def\Left{\mathit{Left}}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    29
\def\Right{\mathit{Right}}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    30
\def\Stars{\mathit{Stars}}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    31
\def\Char{\mathit{Char}}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    32
\def\Seq{\mathit{Seq}}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    33
\def\Der{\mathit{Der}}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    34
\def\nullable{\mathit{nullable}}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    35
\def\Z{\mathit{Z}}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    36
\def\S{\mathit{S}}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    37
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    38
%\theoremstyle{theorem}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    39
%\newtheorem{theorem}{Theorem}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    40
%\theoremstyle{lemma}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    41
%\newtheorem{lemma}{Lemma}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    42
%\newcommand{\lemmaautorefname}{Lemma}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    43
%\theoremstyle{definition}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    44
%\newtheorem{definition}{Definition}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    45
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    46
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    47
\begin{document}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    48
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    49
\maketitle
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    50
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    51
\begin{abstract}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    52
  Brzozowski introduced in 1964 a beautifully simple algorithm for
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    53
  regular expression matching based on the notion of derivatives of
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    54
  regular expressions. In 2014, Sulzmann and Lu extended this
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    55
  algorithm to not just give a YES/NO answer for whether or not a regular
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    56
  expression matches a string, but in case it matches also \emph{how}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    57
  it matches the string.  This is important for applications such as
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    58
  lexing (tokenising a string). The problem is to make the algorithm
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    59
  by Sulzmann and Lu fast on all inputs without breaking its
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    60
  correctness.
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    61
\end{abstract}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    62
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    63
\section{Introduction}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    64
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    65
This PhD-project is about regular expression matching and
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    66
lexing. Given the maturity of this topic, the reader might wonder:
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    67
Surely, regular expressions must have already been studied to death?
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    68
What could possibly be \emph{not} known in this area? And surely all
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    69
implemented algorithms for regular expression matching are blindingly
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    70
fast?
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    71
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    72
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    73
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    74
Unfortunately these preconceptions are not supported by evidence: Take
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    75
for example the regular expression $(a^*)^*\,b$ and ask whether
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    76
strings of the form $aa..a$ match this regular
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    77
expression. Obviously they do not match---the expected $b$ in the last
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    78
position is missing. One would expect that modern regular expression
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    79
matching engines can find this out very quickly. Alas, if one tries
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    80
this example in JavaScript, Python or Java 8 with strings like 28
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    81
$a$'s, one discovers that this decision takes around 30 seconds and
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    82
takes considerably longer when adding a few more $a$'s, as the graphs
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    83
below show:
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    84
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    85
\begin{center}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    86
\begin{tabular}{@{}c@{\hspace{0mm}}c@{\hspace{0mm}}c@{}}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    87
\begin{tikzpicture}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    88
\begin{axis}[
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    89
    xlabel={$n$},
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    90
    x label style={at={(1.05,-0.05)}},
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    91
    ylabel={time in secs},
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    92
    enlargelimits=false,
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    93
    xtick={0,5,...,30},
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    94
    xmax=33,
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    95
    ymax=35,
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    96
    ytick={0,5,...,30},
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    97
    scaled ticks=false,
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    98
    axis lines=left,
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
    99
    width=5cm,
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   100
    height=4cm, 
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   101
    legend entries={JavaScript},  
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   102
    legend pos=north west,
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   103
    legend cell align=left]
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   104
\addplot[red,mark=*, mark options={fill=white}] table {re-js.data};
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   105
\end{axis}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   106
\end{tikzpicture}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   107
  &
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   108
\begin{tikzpicture}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   109
\begin{axis}[
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   110
    xlabel={$n$},
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   111
    x label style={at={(1.05,-0.05)}},
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   112
    %ylabel={time in secs},
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   113
    enlargelimits=false,
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   114
    xtick={0,5,...,30},
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   115
    xmax=33,
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   116
    ymax=35,
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   117
    ytick={0,5,...,30},
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   118
    scaled ticks=false,
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   119
    axis lines=left,
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   120
    width=5cm,
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   121
    height=4cm, 
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   122
    legend entries={Python},  
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   123
    legend pos=north west,
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   124
    legend cell align=left]
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   125
\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   126
\end{axis}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   127
\end{tikzpicture}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   128
  &
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   129
\begin{tikzpicture}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   130
\begin{axis}[
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   131
    xlabel={$n$},
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   132
    x label style={at={(1.05,-0.05)}},
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   133
    %ylabel={time in secs},
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   134
    enlargelimits=false,
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   135
    xtick={0,5,...,30},
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   136
    xmax=33,
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   137
    ymax=35,
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   138
    ytick={0,5,...,30},
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   139
    scaled ticks=false,
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   140
    axis lines=left,
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   141
    width=5cm,
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   142
    height=4cm, 
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   143
    legend entries={Java 8},  
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   144
    legend pos=north west,
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   145
    legend cell align=left]
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   146
\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   147
\end{axis}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   148
\end{tikzpicture}\\
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   149
\multicolumn{3}{c}{Graphs: Runtime for matching $(a^*)^*\,b$ with strings 
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   150
           of the form $\underbrace{aa..a}_{n}$.}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   151
\end{tabular}    
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   152
\end{center}  
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   153
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   154
\noindent These are clearly abysmal and possibly surprising results.
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   155
One would expect these systems doing much better than that---after
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   156
all, given a DFA and a string, whether a string is matched by this DFA
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   157
should be linear.
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   158
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   159
Admittedly, the regular expression $(a^*)^*\,b$ is carefully chosen to
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   160
exhibit this ``exponential behaviour''.  Unfortunately, such regular
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   161
expressions are not just a few ``outliers'', but actually they are
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   162
frequent enough that a separate name has been created for
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   163
them---\emph{evil regular expressions}. In empiric work, Davis et al
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   164
report that they have found thousands of such evil regular expressions
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   165
in the JavaScript and Python ecosystems \cite{Davis18}.
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   166
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   167
This exponential blowup sometimes causes real pain in real life:
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   168
for example on 20 July 2016 one evil regular expression brought the
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   169
webpage \href{http://stackexchange.com}{Stack Exchange} to its knees \footnote{https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}.
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   170
In this instance, a regular expression intended to just trim white
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   171
spaces from the beginning and the end of a line actually consumed
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   172
massive amounts of CPU-resources and because of this the web servers
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   173
ground to a halt. This happened when a post with 20,000 white spaces
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   174
was submitted, but importantly the white spaces were neither at the
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   175
beginning nor at the end. As a result, the regular expression matching
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   176
engine needed to backtrack over many choices.
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   177
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   178
The underlying problem is that many ``real life'' regular expression
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   179
matching engines do not use DFAs for matching. This is because they
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   180
support regular expressions that are not covered by the classical
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   181
automata theory, and in this more general setting there are quite a
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   182
few research questions still unanswered and fast algorithms still need
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   183
to be developed.
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   184
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   185
There is also another under-researched problem to do with regular
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   186
expressions and lexing, i.e.~the process of breaking up strings into
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   187
sequences of tokens according to some regular expressions. In this
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   188
setting one is not just interested in whether or not a regular
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   189
expression matches a string, but if it matches also in \emph{how} it
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   190
matches the string.  Consider for example a regular expression
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   191
$r_{key}$ for recognising keywords such as \textit{if}, \textit{then}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   192
and so on; and a regular expression $r_{id}$ for recognising
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   193
identifiers (say, a single character followed by characters or
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   194
numbers). One can then form the compound regular expression
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   195
$(r_{key} + r_{id})^*$ and use it to tokenise strings.  But then how
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   196
should the string \textit{iffoo} be tokenised?  It could be tokenised
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   197
as a keyword followed by an identifier, or the entire string as a
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   198
single identifier.  Similarly, how should the string \textit{if} be
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   199
tokenised? Both regular expressions, $r_{key}$ and $r_{id}$, would
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   200
``fire''---so is it an identifier or a keyword?  While in applications
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   201
there is a well-known strategy to decide these questions, called POSIX
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   202
matching, only relatively recently precise definitions of what POSIX
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   203
matching actually means have been formalised
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   204
\cite{AusafDyckhoffUrban2016,OkuiSuzuki2010,Vansummeren2006}. Roughly,
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   205
POSIX matching means matching the longest initial substring and
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   206
in the case of a tie, the initial submatch is chosen according to some priorities attached to the
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   207
regular expressions (e.g.~keywords have a higher priority than
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   208
identifiers). This sounds rather simple, but according to Grathwohl et
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   209
al \cite[Page 36]{CrashCourse2014} this is not the case. They wrote:
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   210
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   211
\begin{quote}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   212
\it{}``The POSIX strategy is more complicated than the greedy because of 
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   213
the dependence on information about the length of matched strings in the 
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   214
various subexpressions.''
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   215
\end{quote}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   216
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   217
\noindent
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   218
This is also supported by evidence collected by Kuklewicz
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   219
\cite{Kuklewicz} who noticed that a number of POSIX regular expression
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   220
matchers calculate incorrect results.
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   221
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   222
Our focus is on an algorithm introduced by Sulzmann and Lu in 2014 for
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   223
regular expression matching according to the POSIX strategy
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   224
\cite{Sulzmann2014}. Their algorithm is based on an older algorithm by
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   225
Brzozowski from 1964 where he introduced the notion of derivatives of
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   226
regular expressions \cite{Brzozowski1964}. We shall briefly explain
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   227
the algorithms next.
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   228
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   229
\section{The Algorithms by  Brzozowski, and Sulzmann and Lu}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   230
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   231
Suppose regular expressions are given by the following grammar (for
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   232
the moment ignore the grammar for values on the right-hand side):
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   233
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   234
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   235
\begin{center}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   236
	\begin{tabular}{c@{\hspace{20mm}}c}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   237
		\begin{tabular}{@{}rrl@{}}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   238
			\multicolumn{3}{@{}l}{\textbf{Regular Expressions}}\medskip\\
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   239
			$r$ & $::=$  & $\ZERO$\\
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   240
			& $\mid$ & $\ONE$   \\
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   241
			& $\mid$ & $c$          \\
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   242
			& $\mid$ & $r_1 \cdot r_2$\\
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   243
			& $\mid$ & $r_1 + r_2$   \\
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   244
			\\
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   245
			& $\mid$ & $r^*$         \\
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   246
		\end{tabular}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   247
		&
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   248
		\begin{tabular}{@{\hspace{0mm}}rrl@{}}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   249
			\multicolumn{3}{@{}l}{\textbf{Values}}\medskip\\
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   250
			$v$ & $::=$  & \\
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   251
			&        & $\Empty$   \\
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   252
			& $\mid$ & $\Char(c)$          \\
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   253
			& $\mid$ & $\Seq\,v_1\, v_2$\\
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   254
			& $\mid$ & $\Left(v)$   \\
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   255
			& $\mid$ & $\Right(v)$  \\
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   256
			& $\mid$ & $\Stars\,[v_1,\ldots\,v_n]$ \\
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   257
		\end{tabular}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   258
	\end{tabular}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   259
\end{center}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   260
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   261
\noindent
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   262
The intended meaning of the regular expressions is as usual: $\ZERO$
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   263
cannot match any string, $\ONE$ can match the empty string, the
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   264
character regular expression $c$ can match the character $c$, and so
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   265
on. The brilliant contribution by Brzozowski is the notion of
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   266
\emph{derivatives} of regular expressions.  The idea behind this
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   267
notion is as follows: suppose a regular expression $r$ can match a
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   268
string of the form $c\!::\! s$ (that is a list of characters starting
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   269
with $c$), what does the regular expression look like that can match
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   270
just $s$? Brzozowski gave a neat answer to this question. He defined
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   271
the following operation on regular expressions, written
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   272
$r\backslash c$ (the derivative of $r$ w.r.t.~the character $c$):
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   273
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   274
\begin{center}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   275
\begin{tabular}{lcl}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   276
		$\ZERO \backslash c$ & $\dn$ & $\ZERO$\\  
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   277
		$\ONE \backslash c$  & $\dn$ & $\ZERO$\\
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   278
		$d \backslash c$     & $\dn$ & 
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   279
		$\mathit{if} \;c = d\;\mathit{then}\;\ONE\;\mathit{else}\;\ZERO$\\
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   280
$(r_1 + r_2)\backslash c$     & $\dn$ & $r_1 \backslash c \,+\, r_2 \backslash c$\\
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   281
$(r_1 \cdot r_2)\backslash c$ & $\dn$ & $\mathit{if} \nullable(r_1)$\\
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   282
	&   & $\mathit{then}\;(r_1\backslash c) \cdot r_2 \,+\, r_2\backslash c$\\
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   283
	&   & $\mathit{else}\;(r_1\backslash c) \cdot r_2$\\
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   284
	$(r^*)\backslash c$           & $\dn$ & $(r\backslash c) \cdot r^*$\\
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   285
\end{tabular}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   286
\end{center}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   287
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   288
\noindent
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   289
In this definition $\nullable(\_)$ stands for a simple recursive
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   290
function that tests whether a regular expression can match the empty
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   291
string (its definition is omitted). Assuming the classic notion of a
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   292
\emph{language} of a regular expression, written $L(\_)$, the main
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   293
property of the derivative operation is that
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   294
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   295
\begin{center}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   296
$c\!::\!s \in L(r)$ holds
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   297
if and only if $s \in L(r\backslash c)$.
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   298
\end{center}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   299
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   300
\noindent
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   301
The beauty of derivatives is that they lead to a really simple regular
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   302
expression matching algorithm: To find out whether a string $s$
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   303
matches with a regular expression $r$, build the derivatives of $r$
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   304
w.r.t.\ (in succession) all the characters of the string $s$. Finally,
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   305
test whether the resulting regular expression can match the empty
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   306
string.  If yes, then $r$ matches $s$, and no in the negative
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   307
case. For us the main advantage is that derivatives can be
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   308
straightforwardly implemented in any functional programming language,
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   309
and are easily definable and reasoned about in theorem provers---the
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   310
definitions just consist of inductive datatypes and simple recursive
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   311
functions. Moreover, the notion of derivatives can be easily
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   312
generalised to cover extended regular expression constructors such as
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   313
the not-regular expression, written $\neg\,r$, or bounded repetitions
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   314
(for example $r^{\{n\}}$ and $r^{\{n..m\}}$), which cannot be so
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   315
straightforwardly realised within the classic automata approach.
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   316
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   317
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   318
One limitation, however, of Brzozowski's algorithm is that it only
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   319
produces a YES/NO answer for whether a string is being matched by a
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   320
regular expression.  Sulzmann and Lu~\cite{Sulzmann2014} extended this
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   321
algorithm to allow generation of an actual matching, called a
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   322
\emph{value}---see the grammar above for its definition.  Assuming a
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   323
regular expression matches a string, values encode the information of
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   324
\emph{how} the string is matched by the regular expression---that is,
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   325
which part of the string is matched by which part of the regular
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   326
expression. To illustrate this consider the string $xy$ and the
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   327
regular expression $(x + (y + xy))^*$. We can view this regular
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   328
expression as a tree and if the string $xy$ is matched by two Star
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   329
``iterations'', then the $x$ is matched by the left-most alternative
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   330
in this tree and the $y$ by the right-left alternative. This suggests
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   331
to record this matching as
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   332
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   333
\begin{center}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   334
$\Stars\,[\Left\,(\Char\,x), \Right(\Left(\Char\,y))]$
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   335
\end{center}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   336
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   337
\noindent
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   338
where $\Stars$ records how many
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   339
iterations were used; and $\Left$, respectively $\Right$, which
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   340
alternative is used. The value for
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   341
matching $xy$ in a single ``iteration'', i.e.~the POSIX value,
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   342
would look as follows
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   343
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   344
\begin{center}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   345
$\Stars\,[\Seq\,(\Char\,x)\,(\Char\,y)]$
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   346
\end{center}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   347
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   348
\noindent
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   349
where $\Stars$ has only a single-element list for the single iteration
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   350
and $\Seq$ indicates that $xy$ is matched by a sequence regular
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   351
expression.
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   352
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   353
The contribution of Sulzmann and Lu is an extension of Brzozowski's
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   354
algorithm by a second phase (the first phase being building successive
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   355
derivatives). In this second phase, for every successful match the
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   356
corresponding POSIX value is computed. As mentioned before, from this
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   357
value we can extract the information \emph{how} a regular expression
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   358
matched a string. We omit the details here on how Sulzmann and Lu
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   359
achieved this~(see \cite{Sulzmann2014}). Rather, we shall focus next on the
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   360
process of simplification of regular expressions, which is needed in
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   361
order to obtain \emph{fast} versions of the Brzozowski's, and Sulzmann
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   362
and Lu's algorithms.  This is where the PhD-project hopes to advance
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   363
the state-of-the-art.
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   364
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   365
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   366
\section{Simplification of Regular Expressions}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   367
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   368
The main drawback of building successive derivatives according to
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   369
Brzozowski's definition is that they can grow very quickly in size.
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   370
This is mainly due to the fact that the derivative operation generates
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   371
often ``useless'' $\ZERO$s and $\ONE$s in derivatives.  As a result,
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   372
if implemented naively both algorithms by Brzozowski and by Sulzmann
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   373
and Lu are excruciatingly slow. For example when starting with the
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   374
regular expression $(a + aa)^*$ and building 12 successive derivatives
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   375
w.r.t.~the character $a$, one obtains a derivative regular expression
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   376
with more than 8000 nodes (when viewed as a tree). Operations like
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   377
derivative and $\nullable$ need to traverse such trees and
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   378
consequently the bigger the size of the derivative the slower the
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   379
algorithm. Fortunately, one can simplify regular expressions after
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   380
each derivative step. Various simplifications of regular expressions
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   381
are possible, such as the simplifications of $\ZERO + r$,
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   382
$r + \ZERO$, $\ONE\cdot r$, $r \cdot \ONE$, and $r + r$ to just
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   383
$r$. These simplifications do not affect the answer for whether a
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   384
regular expression matches a string or not, but fortunately also do
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   385
not affect the POSIX strategy of how regular expressions match
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   386
strings---although the latter is much harder to establish. Some
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   387
initial results in this regard have been obtained in
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   388
\cite{AusafDyckhoffUrban2016}. However, what has not been achieved yet
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   389
is a very tight bound for the size. Such a tight bound is suggested by
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   390
work of Antimirov who proved that (partial) derivatives can be bound
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   391
by the number of characters contained in the initial regular
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   392
expression \cite{Antimirov95}. We believe, and have generated test
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   393
data, that a similar bound can be obtained for the derivatives in
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   394
Sulzmann and Lu's algorithm. Let us give some details about this next.
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   395
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   396
We first followed Sulzmann and Lu's idea of introducing
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   397
\emph{annotated regular expressions}~\cite{Sulzmann2014}. They are
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   398
defined by the following grammar:
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   399
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   400
\begin{center}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   401
\begin{tabular}{lcl}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   402
  $\textit{a}$ & $::=$  & $\textit{ZERO}$\\
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   403
                  & $\mid$ & $\textit{ONE}\;\;bs$\\
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   404
                  & $\mid$ & $\textit{CHAR}\;\;bs\,c$\\
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   405
                  & $\mid$ & $\textit{ALTS}\;\;bs\,as$\\
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   406
                  & $\mid$ & $\textit{SEQ}\;\;bs\,a_1\,a_2$\\
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   407
                  & $\mid$ & $\textit{STAR}\;\;bs\,a$
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   408
\end{tabular}    
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   409
\end{center}  
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   410
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   411
\noindent
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   412
where $bs$ stands for bitsequences, and $as$ (in \textit{ALTS}) for a
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   413
list of annotated regular expressions. These bitsequences encode
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   414
information about the (POSIX) value that should be generated by the
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   415
Sulzmann and Lu algorithm. There are operations that can transform the
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   416
usual (un-annotated) regular expressions into annotated regular
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   417
expressions, and there are operations for encoding/decoding values to
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   418
or from bitsequences. For example the encoding function for values is
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   419
defined as follows:
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   420
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   421
\begin{center}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   422
\begin{tabular}{lcl}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   423
  $\textit{code}(\Empty)$ & $\dn$ & $[]$\\
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   424
  $\textit{code}(\Char\,c)$ & $\dn$ & $[]$\\
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   425
  $\textit{code}(\Left\,v)$ & $\dn$ & $\Z :: code(v)$\\
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   426
  $\textit{code}(\Right\,v)$ & $\dn$ & $\S :: code(v)$\\
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   427
  $\textit{code}(\Seq\,v_1\,v_2)$ & $\dn$ & $code(v_1) \,@\, code(v_2)$\\
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   428
  $\textit{code}(\Stars\,[])$ & $\dn$ & $[\S]$\\
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   429
  $\textit{code}(\Stars\,(v\!::\!vs))$ & $\dn$ & $\Z :: code(v) \;@\;
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   430
                                                 code(\Stars\,vs)$
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   431
\end{tabular}    
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   432
\end{center} 
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   433
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   434
\noindent
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   435
where $\Z$ and $\S$ are arbitrary names for the ``bits'' in the
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   436
bitsequences. Although this encoding is ``lossy'' in the sense of not
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   437
recording all details of a value, Sulzmann and Lu have defined the
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   438
decoding function such that with additional information (namely the
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   439
corresponding regular expression) a value can be precisely extracted
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   440
from a bitsequence.
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   441
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   442
The main point of the bitsequences and annotated regular expressions
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   443
is that we can apply rather aggressive (in terms of size)
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   444
simplification rules in order to keep derivatives small.  We have
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   445
developed such ``aggressive'' simplification rules and generated test
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   446
data that show that the expected bound can be achieved. Obviously we
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   447
could only partially cover  the search space as there are infinitely
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   448
many regular expressions and strings. One modification we introduced
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   449
is to allow a list of annotated regular expressions in the
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   450
\textit{ALTS} constructor. This allows us to not just delete
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   451
unnecessary $\ZERO$s and $\ONE$s from regular expressions, but also
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   452
unnecessary ``copies'' of regular expressions (very similar to
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   453
simplifying $r + r$ to just $r$, but in a more general
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   454
setting). Another modification is that we use simplification rules
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   455
inspired by Antimirov's work on partial derivatives. They maintain the
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   456
idea that only the first ``copy'' of a regular expression in an
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   457
alternative contributes to the calculation of a POSIX value. All
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   458
subsequent copies can be prunned from the regular expression.
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   459
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   460
We are currently engaged with proving that our simplification rules
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   461
actually do not affect the POSIX value that should be generated by the
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   462
algorithm according to the specification of a POSIX value and
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   463
furthermore that our derivatives stay small for all derivatives. For
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   464
this proof we use the theorem prover Isabelle. Once completed, this
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   465
result will advance the state-of-the-art: Sulzmann and Lu wrote in
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   466
their paper \cite{Sulzmann2014} about the bitcoded ``incremental
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   467
parsing method'' (that is the matching algorithm outlined in this
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   468
section):
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   469
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   470
\begin{quote}\it
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   471
  ``Correctness Claim: We further claim that the incremental parsing
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   472
  method in Figure~5 in combination with the simplification steps in
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   473
  Figure 6 yields POSIX parse trees. We have tested this claim
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   474
  extensively by using the method in Figure~3 as a reference but yet
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   475
  have to work out all proof details.''
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   476
\end{quote}  
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   477
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   478
\noindent
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   479
We would settle the correctness claim and furthermore obtain a much
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   480
tighter bound on the sizes of derivatives. The result is that our
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   481
algorithm should be correct and faster on all inputs.  The original
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   482
blow-up, as observed in JavaScript, Python and Java, would be excluded
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   483
from happening in our algorithm.
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   484
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   485
\section{Conclusion}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   486
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   487
In this PhD-project we are interested in fast algorithms for regular
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   488
expression matching. While this seems to be a ``settled'' area, in
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   489
fact interesting research questions are popping up as soon as one steps
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   490
outside the classic automata theory (for example in terms of what kind
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   491
of regular expressions are supported). The reason why it is
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   492
interesting for us to look at the derivative approach introduced by
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   493
Brzozowski for regular expression matching, and then much further
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   494
developed by Sulzmann and Lu, is that derivatives can elegantly deal
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   495
with some of the regular expressions that are of interest in ``real
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   496
life''. This includes the not-regular expression, written $\neg\,r$
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   497
(that is all strings that are not recognised by $r$), but also bounded
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   498
regular expressions such as $r^{\{n\}}$ and $r^{\{n..m\}}$). There is
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   499
also hope that the derivatives can provide another angle for how to
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   500
deal more efficiently with back-references, which are one of the
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   501
reasons why regular expression engines in JavaScript, Python and Java
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   502
choose to not implement the classic automata approach of transforming
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   503
regular expressions into NFAs and then DFAs---because we simply do not
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   504
know how such back-references can be represented by DFAs.
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   505
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   506
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   507
\bibliographystyle{plain}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   508
\bibliography{root}
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   509
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   510
1a92233763bd created folder for 9mth report
Chengsong
parents:
diff changeset
   511
\end{document}