ChengsongTanPhdThesis/Chapters/Chapter1.tex
author Chengsong
Thu, 26 May 2022 20:51:40 +0100
changeset 518 ff7945a988a3
parent 516 6fecb7fe8cd0
child 519 856d025dbc15
permissions -rwxr-xr-x
more to thesis
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
     1
% Chapter 1
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
     2
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
     3
\chapter{Introduction} % Main chapter title
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
     4
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
     5
\label{Chapter1} % For referencing the chapter elsewhere, use \ref{Chapter1} 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
     6
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
     7
%----------------------------------------------------------------------------------------
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
     8
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
     9
% Define some commands to keep the formatting separated from the content 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    10
\newcommand{\keyword}[1]{\textbf{#1}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    11
\newcommand{\tabhead}[1]{\textbf{#1}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    12
\newcommand{\code}[1]{\texttt{#1}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    13
\newcommand{\file}[1]{\texttt{\bfseries#1}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    14
\newcommand{\option}[1]{\texttt{\itshape#1}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    15
503
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    16
%\newcommand{\sflataux}[1]{\textit{sflat}\_\textit{aux} \, #1}
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    17
\newcommand\sflat[1]{\llparenthesis #1 \rrparenthesis }
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    18
\newcommand{\ASEQ}[3]{\textit{ASEQ}_{#1} \, #2 \, #3}
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    19
\newcommand{\bderssimp}[2]{#1 \backslash_{bsimp} #2}
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    20
\newcommand{\rderssimp}[2]{#1 \backslash_{rsimp} #2}
506
69ad05398894 thesis chapter 2 section 2.4 2.5
Chengsong
parents: 505
diff changeset
    21
\newcommand{\bders}[2]{#1 \backslash #2}
503
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    22
\newcommand{\bsimp}[1]{\textit{bsimp}(#1)}
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    23
\newcommand{\rsimp}[1]{\textit{rsimp}(#1)}
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    24
\newcommand{\sflataux}[1]{\llparenthesis #1 \rrparenthesis'}
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    25
\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    26
\newcommand{\ZERO}{\mbox{\bf 0}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    27
\newcommand{\ONE}{\mbox{\bf 1}}
503
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    28
\newcommand{\AALTS}[2]{\oplus {\scriptstyle #1}\, #2}
505
5ce3bd8e5696 thesis chapter2 section 2.4
Chengsong
parents: 503
diff changeset
    29
\newcommand{\rdistinct}[2]{\textit{distinct} \; \textit{#1} \; #2}
5ce3bd8e5696 thesis chapter2 section 2.4
Chengsong
parents: 503
diff changeset
    30
\newcommand\hflat[1]{\llparenthesis  #1 \rrparenthesis_*}
5ce3bd8e5696 thesis chapter2 section 2.4
Chengsong
parents: 503
diff changeset
    31
\newcommand\hflataux[1]{\llparenthesis #1 \rrparenthesis_*'}
506
69ad05398894 thesis chapter 2 section 2.4 2.5
Chengsong
parents: 505
diff changeset
    32
\newcommand\createdByStar[1]{\textit{createdByStar}(#1)}
503
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    33
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    34
\newcommand\myequiv{\mathrel{\stackrel{\makebox[0pt]{\mbox{\normalfont\tiny equiv}}}{=}}}
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    35
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    36
\def\lexer{\mathit{lexer}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    37
\def\mkeps{\mathit{mkeps}}
505
5ce3bd8e5696 thesis chapter2 section 2.4
Chengsong
parents: 503
diff changeset
    38
\newcommand{\rder}[2]{#2 \backslash #1}
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    39
500
Chengsong
parents: 472
diff changeset
    40
\def\AZERO{\textit{AZERO}}
Chengsong
parents: 472
diff changeset
    41
\def\AONE{\textit{AONE}}
Chengsong
parents: 472
diff changeset
    42
\def\ACHAR{\textit{ACHAR}}
503
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    43
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
    44
\def\POSIX{\textit{POSIX}}
503
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    45
\def\ALTS{\textit{ALTS}}
500
Chengsong
parents: 472
diff changeset
    46
\def\ASTAR{\textit{ASTAR}}
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    47
\def\DFA{\textit{DFA}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    48
\def\bmkeps{\textit{bmkeps}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    49
\def\retrieve{\textit{retrieve}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    50
\def\blexer{\textit{blexer}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    51
\def\flex{\textit{flex}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    52
\def\inj{\mathit{inj}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    53
\def\Empty{\mathit{Empty}}
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
    54
\def\Left{\mathit{Left}}
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    55
\def\Right{\mathit{Right}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    56
\def\Stars{\mathit{Stars}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    57
\def\Char{\mathit{Char}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    58
\def\Seq{\mathit{Seq}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    59
\def\Der{\mathit{Der}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    60
\def\nullable{\mathit{nullable}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    61
\def\Z{\mathit{Z}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    62
\def\S{\mathit{S}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    63
\def\rup{r^\uparrow}
500
Chengsong
parents: 472
diff changeset
    64
%\def\bderssimp{\mathit{bders}\_\mathit{simp}}
Chengsong
parents: 472
diff changeset
    65
\def\distinctWith{\textit{distinctWith}}
Chengsong
parents: 472
diff changeset
    66
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
    67
\def\size{\mathit{size}}
503
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    68
\def\rexp{\mathbf{rexp}}
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    69
\def\simp{\mathit{simp}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    70
\def\simpALTs{\mathit{simp}\_\mathit{ALTs}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    71
\def\map{\mathit{map}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    72
\def\distinct{\mathit{distinct}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    73
\def\blexersimp{\mathit{blexer}\_\mathit{simp}}
500
Chengsong
parents: 472
diff changeset
    74
\def\map{\textit{map}}
503
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    75
%\def\vsuf{\textit{vsuf}}
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    76
%\def\sflataux{\textit{sflat}\_\textit{aux}}
500
Chengsong
parents: 472
diff changeset
    77
\def\rrexp{\textit{rrexp}}
503
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    78
\newcommand\rnullable[1]{\textit{rnullable}(#1)}
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    79
\newcommand\rsize[1]{\llbracket #1 \rrbracket_r}
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    80
\newcommand\asize[1]{\llbracket #1 \rrbracket}
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    81
\newcommand\rerase[1]{ (#1)\downarrow_r}
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    82
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
    83
500
Chengsong
parents: 472
diff changeset
    84
\def\erase{\textit{erase}}
Chengsong
parents: 472
diff changeset
    85
\def\STAR{\textit{STAR}}
Chengsong
parents: 472
diff changeset
    86
\def\flts{\textit{flts}}
Chengsong
parents: 472
diff changeset
    87
503
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    88
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    89
\def\RZERO{\mathbf{0}_r }
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    90
\def\RONE{\mathbf{1}_r}
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    91
\newcommand\RCHAR[1]{\mathbf{#1}_r}
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    92
\newcommand\RSEQ[2]{#1 \cdot #2}
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    93
\newcommand\RALTS[1]{\oplus #1}
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    94
\newcommand\RSTAR[1]{#1^*}
35b80e37dfe7 new writing
Chengsong
parents: 500
diff changeset
    95
\newcommand\vsuf[2]{\textit{vsuf} \;#1\;#2}
505
5ce3bd8e5696 thesis chapter2 section 2.4
Chengsong
parents: 503
diff changeset
    96
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    97
%----------------------------------------------------------------------------------------
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    98
%This part is about regular expressions, Brzozowski derivatives,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
    99
%and a bit-coded lexing algorithm with proven correctness and time bounds.
472
Chengsong
parents: 471
diff changeset
   100
Chengsong
parents: 471
diff changeset
   101
%TODO: look up snort rules to use here--give readers idea of what regexes look like
Chengsong
parents: 471
diff changeset
   102
Chengsong
parents: 471
diff changeset
   103
471
Chengsong
parents: 469
diff changeset
   104
Regular expressions are widely used in computer science: 
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   105
be it in text-editors\parencite{atomEditor} with syntax highlighting and auto-completion, 
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   106
command-line tools like $\mathit{grep}$ that facilitate easy 
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   107
text processing, network intrusion
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   108
detection systems that reject suspicious traffic, or compiler
500
Chengsong
parents: 472
diff changeset
   109
front ends--the majority of the solutions to these tasks 
Chengsong
parents: 472
diff changeset
   110
involve lexing with regular 
Chengsong
parents: 472
diff changeset
   111
expressions.
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   112
Given its usefulness and ubiquity, one would imagine that
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   113
modern regular expression matching implementations
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   114
are mature and fully studied.
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   115
Indeed, in a popular programming language' regex engine, 
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   116
supplying it with regular expressions and strings, one can
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   117
get rich matching information in a very short time.
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   118
Some network intrusion detection systems
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   119
use regex engines that are able to process 
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   120
megabytes or even gigabytes of data per second\parencite{Turo_ov__2020}.
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   121
Unfortunately, this is not the case for $\mathbf{all}$ inputs.
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   122
%TODO: get source for SNORT/BRO's regex matching engine/speed
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   123
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   124
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   125
Take $(a^*)^*\,b$ and ask whether
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   126
strings of the form $aa..a$ match this regular
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   127
expression. Obviously this is not the case---the expected $b$ in the last
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   128
position is missing. One would expect that modern regular expression
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   129
matching engines can find this out very quickly. Alas, if one tries
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   130
this example in JavaScript, Python or Java 8, even with strings of a small
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   131
length, say around 30 $a$'s, one discovers that 
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   132
this decision takes crazy time to finish given the simplicity of the problem.
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   133
This is clearly exponential behaviour, and 
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   134
is triggered by some relatively simple regex patterns, as the graphs
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   135
below show:
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   136
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   137
\begin{figure}
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   138
\centering
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   139
\begin{tabular}{@{}c@{\hspace{0mm}}c@{\hspace{0mm}}c@{}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   140
\begin{tikzpicture}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   141
\begin{axis}[
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   142
    xlabel={$n$},
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   143
    x label style={at={(1.05,-0.05)}},
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   144
    ylabel={time in secs},
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   145
    enlargelimits=false,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   146
    xtick={0,5,...,30},
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   147
    xmax=33,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   148
    ymax=35,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   149
    ytick={0,5,...,30},
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   150
    scaled ticks=false,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   151
    axis lines=left,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   152
    width=5cm,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   153
    height=4cm, 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   154
    legend entries={JavaScript},  
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   155
    legend pos=north west,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   156
    legend cell align=left]
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   157
\addplot[red,mark=*, mark options={fill=white}] table {re-js.data};
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   158
\end{axis}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   159
\end{tikzpicture}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   160
  &
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   161
\begin{tikzpicture}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   162
\begin{axis}[
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   163
    xlabel={$n$},
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   164
    x label style={at={(1.05,-0.05)}},
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   165
    %ylabel={time in secs},
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   166
    enlargelimits=false,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   167
    xtick={0,5,...,30},
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   168
    xmax=33,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   169
    ymax=35,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   170
    ytick={0,5,...,30},
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   171
    scaled ticks=false,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   172
    axis lines=left,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   173
    width=5cm,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   174
    height=4cm, 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   175
    legend entries={Python},  
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   176
    legend pos=north west,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   177
    legend cell align=left]
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   178
\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   179
\end{axis}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   180
\end{tikzpicture}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   181
  &
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   182
\begin{tikzpicture}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   183
\begin{axis}[
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   184
    xlabel={$n$},
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   185
    x label style={at={(1.05,-0.05)}},
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   186
    %ylabel={time in secs},
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   187
    enlargelimits=false,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   188
    xtick={0,5,...,30},
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   189
    xmax=33,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   190
    ymax=35,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   191
    ytick={0,5,...,30},
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   192
    scaled ticks=false,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   193
    axis lines=left,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   194
    width=5cm,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   195
    height=4cm, 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   196
    legend entries={Java 8},  
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   197
    legend pos=north west,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   198
    legend cell align=left]
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   199
\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   200
\end{axis}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   201
\end{tikzpicture}\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   202
\multicolumn{3}{c}{Graphs: Runtime for matching $(a^*)^*\,b$ with strings 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   203
           of the form $\underbrace{aa..a}_{n}$.}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   204
\end{tabular}    
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   205
\caption{aStarStarb} \label{fig:aStarStarb}
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   206
\end{figure}
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   207
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   208
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   209
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   210
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   211
This superlinear blowup in matching algorithms sometimes cause
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   212
considerable grief in real life: for example on 20 July 2016 one evil
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   213
regular expression brought the webpage
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   214
\href{http://stackexchange.com}{Stack Exchange} to its
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   215
knees.\footnote{\url{https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   216
In this instance, a regular expression intended to just trim white
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   217
spaces from the beginning and the end of a line actually consumed
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   218
massive amounts of CPU-resources---causing web servers to grind to a
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   219
halt. This happened when a post with 20,000 white spaces was submitted,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   220
but importantly the white spaces were neither at the beginning nor at
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   221
the end. As a result, the regular expression matching engine needed to
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   222
backtrack over many choices. In this example, the time needed to process
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   223
the string was $O(n^2)$ with respect to the string length. This
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   224
quadratic overhead was enough for the homepage of Stack Exchange to
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   225
respond so slowly that the load balancer assumed a $\mathit{DoS}$ 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   226
attack and therefore stopped the servers from responding to any
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   227
requests. This made the whole site become unavailable. 
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   228
A more recent example is a global outage of all Cloudflare servers on 2 July
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   229
2019. A poorly written regular expression exhibited exponential
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   230
behaviour and exhausted CPUs that serve HTTP traffic. Although the outage
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   231
had several causes, at the heart was a regular expression that
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   232
was used to monitor network
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   233
traffic.\footnote{\url{https://blog.cloudflare.com/details-of-the-cloudflare-outage-on-july-2-2019/}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   234
%TODO: data points for some new versions of languages
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   235
These problems with regular expressions 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   236
are not isolated events that happen
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   237
very occasionally, but actually widespread.
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   238
They occur so often that they get a 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   239
name--Regular-Expression-Denial-Of-Service (ReDoS)
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   240
attack.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   241
Davis et al. \parencite{Davis18} detected more
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   242
than 1000 super-linear (SL) regular expressions
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   243
in Node.js, Python core libraries, and npm and pypi. 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   244
They therefore concluded that evil regular expressions
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   245
are problems more than "a parlour trick", but one that
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   246
requires
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   247
more research attention.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   248
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   249
 \section{The Problem Behind Slow Cases}
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   250
471
Chengsong
parents: 469
diff changeset
   251
%find literature/find out for yourself that REGEX->DFA on basic regexes
Chengsong
parents: 469
diff changeset
   252
%does not blow up the size
Chengsong
parents: 469
diff changeset
   253
Shouldn't regular expression matching be linear?
Chengsong
parents: 469
diff changeset
   254
How can one explain the super-linear behaviour of the 
Chengsong
parents: 469
diff changeset
   255
regex matching engines we have?
Chengsong
parents: 469
diff changeset
   256
The time cost of regex matching algorithms in general
Chengsong
parents: 469
diff changeset
   257
involve two phases: 
Chengsong
parents: 469
diff changeset
   258
the construction phase, in which the algorithm builds some  
Chengsong
parents: 469
diff changeset
   259
suitable data structure from the input regex $r$, we denote
Chengsong
parents: 469
diff changeset
   260
the time cost by $P_1(r)$.
Chengsong
parents: 469
diff changeset
   261
The lexing
Chengsong
parents: 469
diff changeset
   262
phase, when the input string $s$ is read and the data structure
Chengsong
parents: 469
diff changeset
   263
representing that regex $r$ is being operated on. We represent the time
Chengsong
parents: 469
diff changeset
   264
it takes by $P_2(r, s)$.\\
Chengsong
parents: 469
diff changeset
   265
In the case of a $\mathit{DFA}$,
Chengsong
parents: 469
diff changeset
   266
we have $P_2(r, s) = O( |s| )$,
Chengsong
parents: 469
diff changeset
   267
because we take at most $|s|$ steps, 
Chengsong
parents: 469
diff changeset
   268
and each step takes
Chengsong
parents: 469
diff changeset
   269
at most one transition--
Chengsong
parents: 469
diff changeset
   270
a deterministic-finite-automata
Chengsong
parents: 469
diff changeset
   271
by definition has at most one state active and at most one
Chengsong
parents: 469
diff changeset
   272
transition upon receiving an input symbol.
Chengsong
parents: 469
diff changeset
   273
But unfortunately in the  worst case
Chengsong
parents: 469
diff changeset
   274
$P_1(r) = O(exp^{|r|})$. An example will be given later. \\
Chengsong
parents: 469
diff changeset
   275
For $\mathit{NFA}$s, we have $P_1(r) = O(|r|)$ if we do not unfold 
Chengsong
parents: 469
diff changeset
   276
expressions like $r^n$ into $\underbrace{r \cdots r}_{\text{n copies of r}}$.
Chengsong
parents: 469
diff changeset
   277
The $P_2(r, s)$ is bounded by $|r|\cdot|s|$, if we do not backtrack.
Chengsong
parents: 469
diff changeset
   278
On the other hand, if backtracking is used, the worst-case time bound bloats
Chengsong
parents: 469
diff changeset
   279
to $|r| * 2^|s|$ .
Chengsong
parents: 469
diff changeset
   280
%on the input
Chengsong
parents: 469
diff changeset
   281
%And when calculating the time complexity of the matching algorithm,
Chengsong
parents: 469
diff changeset
   282
%we are assuming that each input reading step requires constant time.
Chengsong
parents: 469
diff changeset
   283
%which translates to that the number of 
Chengsong
parents: 469
diff changeset
   284
%states active and transitions taken each time is bounded by a
Chengsong
parents: 469
diff changeset
   285
%constant $C$.
Chengsong
parents: 469
diff changeset
   286
%But modern  regex libraries in popular language engines
Chengsong
parents: 469
diff changeset
   287
% often want to support much richer constructs than just
Chengsong
parents: 469
diff changeset
   288
% sequences and Kleene stars,
Chengsong
parents: 469
diff changeset
   289
%such as negation, intersection, 
Chengsong
parents: 469
diff changeset
   290
%bounded repetitions and back-references.
Chengsong
parents: 469
diff changeset
   291
%And de-sugaring these "extended" regular expressions 
Chengsong
parents: 469
diff changeset
   292
%into basic ones might bloat the size exponentially.
Chengsong
parents: 469
diff changeset
   293
%TODO: more reference for exponential size blowup on desugaring. 
Chengsong
parents: 469
diff changeset
   294
\subsection{Tools that uses $\mathit{DFA}$s}
Chengsong
parents: 469
diff changeset
   295
%TODO:more tools that use DFAs?
Chengsong
parents: 469
diff changeset
   296
$\mathit{LEX}$ and $\mathit{JFLEX}$ are tools
Chengsong
parents: 469
diff changeset
   297
in $C$ and $\mathit{JAVA}$ that generates $\mathit{DFA}$-based
Chengsong
parents: 469
diff changeset
   298
lexers. The user provides a set of regular expressions
Chengsong
parents: 469
diff changeset
   299
and configurations to such lexer generators, and then 
Chengsong
parents: 469
diff changeset
   300
gets an output program encoding a minimized $\mathit{DFA}$
Chengsong
parents: 469
diff changeset
   301
that can be compiled and run. 
Chengsong
parents: 469
diff changeset
   302
The good things about $\mathit{DFA}$s is that once
Chengsong
parents: 469
diff changeset
   303
generated, they are fast and stable, unlike
Chengsong
parents: 469
diff changeset
   304
backtracking algorithms. 
Chengsong
parents: 469
diff changeset
   305
However, they do not scale well with bounded repetitions.\\
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   306
516
6fecb7fe8cd0 blexer2: modified for plotting
Chengsong
parents: 506
diff changeset
   307
6fecb7fe8cd0 blexer2: modified for plotting
Chengsong
parents: 506
diff changeset
   308
6fecb7fe8cd0 blexer2: modified for plotting
Chengsong
parents: 506
diff changeset
   309
471
Chengsong
parents: 469
diff changeset
   310
Bounded repetitions, usually written in the form
Chengsong
parents: 469
diff changeset
   311
$r^{\{c\}}$ (where $c$ is a constant natural number),
Chengsong
parents: 469
diff changeset
   312
denotes a regular expression accepting strings
Chengsong
parents: 469
diff changeset
   313
that can be divided into $c$ substrings, where each 
Chengsong
parents: 469
diff changeset
   314
substring is in $r$. 
Chengsong
parents: 469
diff changeset
   315
For the regular expression $(a|b)^*a(a|b)^{\{2\}}$,
Chengsong
parents: 469
diff changeset
   316
an $\mathit{NFA}$ describing it would look like:
Chengsong
parents: 469
diff changeset
   317
\begin{center}
Chengsong
parents: 469
diff changeset
   318
\begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto] 
Chengsong
parents: 469
diff changeset
   319
   \node[state,initial] (q_0)   {$q_0$}; 
Chengsong
parents: 469
diff changeset
   320
   \node[state, red] (q_1) [right=of q_0] {$q_1$}; 
Chengsong
parents: 469
diff changeset
   321
   \node[state, red] (q_2) [right=of q_1] {$q_2$}; 
Chengsong
parents: 469
diff changeset
   322
   \node[state, accepting, red](q_3) [right=of q_2] {$q_3$};
Chengsong
parents: 469
diff changeset
   323
    \path[->] 
Chengsong
parents: 469
diff changeset
   324
    (q_0) edge  node {a} (q_1)
Chengsong
parents: 469
diff changeset
   325
    	  edge [loop below] node {a,b} ()
Chengsong
parents: 469
diff changeset
   326
    (q_1) edge  node  {a,b} (q_2)
Chengsong
parents: 469
diff changeset
   327
    (q_2) edge  node  {a,b} (q_3);
Chengsong
parents: 469
diff changeset
   328
\end{tikzpicture}
Chengsong
parents: 469
diff changeset
   329
\end{center}
Chengsong
parents: 469
diff changeset
   330
The red states are "countdown states" which counts down 
Chengsong
parents: 469
diff changeset
   331
the number of characters needed in addition to the current
Chengsong
parents: 469
diff changeset
   332
string to make a successful match.
Chengsong
parents: 469
diff changeset
   333
For example, state $q_1$ indicates a match that has
Chengsong
parents: 469
diff changeset
   334
gone past the $(a|b)^*$ part of $(a|b)^*a(a|b)^{\{2\}}$,
Chengsong
parents: 469
diff changeset
   335
and just consumed the "delimiter" $a$ in the middle, and 
Chengsong
parents: 469
diff changeset
   336
need to match 2 more iterations of $(a|b)$ to complete.
Chengsong
parents: 469
diff changeset
   337
State $q_2$ on the other hand, can be viewed as a state
Chengsong
parents: 469
diff changeset
   338
after $q_1$ has consumed 1 character, and just waits
Chengsong
parents: 469
diff changeset
   339
for 1 more character to complete.
Chengsong
parents: 469
diff changeset
   340
$q_3$ is the last state, requiring 0 more character and is accepting.
Chengsong
parents: 469
diff changeset
   341
Depending on the suffix of the
Chengsong
parents: 469
diff changeset
   342
input string up to the current read location,
Chengsong
parents: 469
diff changeset
   343
the states $q_1$ and $q_2$, $q_3$
Chengsong
parents: 469
diff changeset
   344
may or may
Chengsong
parents: 469
diff changeset
   345
not be active, independent from each other.
Chengsong
parents: 469
diff changeset
   346
A $\mathit{DFA}$ for such an $\mathit{NFA}$ would
Chengsong
parents: 469
diff changeset
   347
contain at least $2^3$ non-equivalent states that cannot be merged, 
Chengsong
parents: 469
diff changeset
   348
because the subset construction during determinisation will generate
Chengsong
parents: 469
diff changeset
   349
all the elements in the power set $\mathit{Pow}\{q_1, q_2, q_3\}$.
Chengsong
parents: 469
diff changeset
   350
Generalizing this to regular expressions with larger
Chengsong
parents: 469
diff changeset
   351
bounded repetitions number, we have that
Chengsong
parents: 469
diff changeset
   352
regexes shaped like $r^*ar^{\{n\}}$ when converted to $\mathit{DFA}$s
Chengsong
parents: 469
diff changeset
   353
would require at least $2^{n+1}$ states, if $r$ contains
Chengsong
parents: 469
diff changeset
   354
more than 1 string.
Chengsong
parents: 469
diff changeset
   355
This is to represent all different 
Chengsong
parents: 469
diff changeset
   356
scenarios which "countdown" states are active.
Chengsong
parents: 469
diff changeset
   357
For those regexes, tools such as $\mathit{JFLEX}$ 
Chengsong
parents: 469
diff changeset
   358
would generate gigantic $\mathit{DFA}$'s or
Chengsong
parents: 469
diff changeset
   359
out of memory errors.
Chengsong
parents: 469
diff changeset
   360
For this reason, regex libraries that support 
Chengsong
parents: 469
diff changeset
   361
bounded repetitions often choose to use the $\mathit{NFA}$ 
Chengsong
parents: 469
diff changeset
   362
approach.
Chengsong
parents: 469
diff changeset
   363
\subsection{The $\mathit{NFA}$ approach to regex matching}
Chengsong
parents: 469
diff changeset
   364
One can simulate the $\mathit{NFA}$ running in two ways:
Chengsong
parents: 469
diff changeset
   365
one by keeping track of all active states after consuming 
Chengsong
parents: 469
diff changeset
   366
a character, and update that set of states iteratively.
Chengsong
parents: 469
diff changeset
   367
This can be viewed as a breadth-first-search of the $\mathit{NFA}$
Chengsong
parents: 469
diff changeset
   368
for a path terminating
Chengsong
parents: 469
diff changeset
   369
at an accepting state.
Chengsong
parents: 469
diff changeset
   370
Languages like $\mathit{Go}$ and $\mathit{Rust}$ use this
Chengsong
parents: 469
diff changeset
   371
type of $\mathit{NFA}$ simulation, and guarantees a linear runtime
Chengsong
parents: 469
diff changeset
   372
in terms of input string length.
Chengsong
parents: 469
diff changeset
   373
%TODO:try out these lexers
Chengsong
parents: 469
diff changeset
   374
The other way to use $\mathit{NFA}$ for matching is choosing  
Chengsong
parents: 469
diff changeset
   375
a single transition each time, keeping all the other options in 
Chengsong
parents: 469
diff changeset
   376
a queue or stack, and backtracking if that choice eventually 
Chengsong
parents: 469
diff changeset
   377
fails. This method, often called a  "depth-first-search", 
Chengsong
parents: 469
diff changeset
   378
is efficient in a lot of cases, but could end up
Chengsong
parents: 469
diff changeset
   379
with exponential run time.\\
Chengsong
parents: 469
diff changeset
   380
%TODO:COMPARE java python lexer speed with Rust and Go
Chengsong
parents: 469
diff changeset
   381
The reason behind backtracking algorithms in languages like
Chengsong
parents: 469
diff changeset
   382
Java and Python is that they support back-references.
Chengsong
parents: 469
diff changeset
   383
\subsection{Back References in Regex--Non-Regular part}
Chengsong
parents: 469
diff changeset
   384
If we have a regular expression like this (the sequence
Chengsong
parents: 469
diff changeset
   385
operator is omitted for brevity):
Chengsong
parents: 469
diff changeset
   386
\begin{center}
Chengsong
parents: 469
diff changeset
   387
	$r_1(r_2(r_3r_4))$
Chengsong
parents: 469
diff changeset
   388
\end{center}
Chengsong
parents: 469
diff changeset
   389
We could label sub-expressions of interest 
Chengsong
parents: 469
diff changeset
   390
by parenthesizing them and giving 
Chengsong
parents: 469
diff changeset
   391
them a number by the order in which their opening parentheses appear.
Chengsong
parents: 469
diff changeset
   392
One possible way of parenthesizing and labelling is given below:
Chengsong
parents: 469
diff changeset
   393
\begin{center}
Chengsong
parents: 469
diff changeset
   394
	$\underset{1}{(}r_1\underset{2}{(}r_2\underset{3}{(}r_3)\underset{4}{(}r_4)))$
Chengsong
parents: 469
diff changeset
   395
\end{center}
Chengsong
parents: 469
diff changeset
   396
$r_1r_2r_3r_4$, $r_1r_2r_3$, $r_3$, $r_4$ are labelled
Chengsong
parents: 469
diff changeset
   397
by 1 to 4. $1$ would refer to the entire expression 
Chengsong
parents: 469
diff changeset
   398
$(r_1(r_2(r_3)(r_4)))$, $2$ referring to $r_2(r_3)(r_4)$, etc.
Chengsong
parents: 469
diff changeset
   399
These sub-expressions are called "capturing groups".
Chengsong
parents: 469
diff changeset
   400
We can use the following syntax to denote that we want a string just matched by a 
Chengsong
parents: 469
diff changeset
   401
sub-expression (capturing group) to appear at a certain location again, 
Chengsong
parents: 469
diff changeset
   402
exactly as it was:
Chengsong
parents: 469
diff changeset
   403
\begin{center}
Chengsong
parents: 469
diff changeset
   404
$\ldots\underset{\text{i-th lparen}}{(}{r_i})\ldots 
Chengsong
parents: 469
diff changeset
   405
\underset{s_i \text{ which just matched} \;r_i}{\backslash i}$
Chengsong
parents: 469
diff changeset
   406
\end{center}
Chengsong
parents: 469
diff changeset
   407
The backslash and number $i$ are used to denote such 
Chengsong
parents: 469
diff changeset
   408
so-called "back-references".
Chengsong
parents: 469
diff changeset
   409
Let $e$ be an expression made of regular expressions 
Chengsong
parents: 469
diff changeset
   410
and back-references. $e$ contains the expression $e_i$
Chengsong
parents: 469
diff changeset
   411
as its $i$-th capturing group.
Chengsong
parents: 469
diff changeset
   412
The semantics of back-reference can be recursively
Chengsong
parents: 469
diff changeset
   413
written as:
Chengsong
parents: 469
diff changeset
   414
\begin{center}
Chengsong
parents: 469
diff changeset
   415
	\begin{tabular}{c}
Chengsong
parents: 469
diff changeset
   416
		$L ( e \cdot \backslash i) = \{s @ s_i \mid s \in L (e)\quad s_i \in L(r_i)$\\
Chengsong
parents: 469
diff changeset
   417
		$s_i\; \text{match of ($e$, $s$)'s $i$-th capturing group string}\}$
Chengsong
parents: 469
diff changeset
   418
	\end{tabular}
Chengsong
parents: 469
diff changeset
   419
\end{center}
Chengsong
parents: 469
diff changeset
   420
The concrete example
Chengsong
parents: 469
diff changeset
   421
$((a|b|c|\ldots|z)^*)\backslash 1$
Chengsong
parents: 469
diff changeset
   422
would match the string like $\mathit{bobo}$, $\mathit{weewee}$ and etc.\\
Chengsong
parents: 469
diff changeset
   423
Back-reference is a construct in the "regex" standard
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   424
that programmers found useful, but not exactly 
471
Chengsong
parents: 469
diff changeset
   425
regular any more.
Chengsong
parents: 469
diff changeset
   426
In fact, that allows the regex construct to express 
Chengsong
parents: 469
diff changeset
   427
languages that cannot be contained in context-free
Chengsong
parents: 469
diff changeset
   428
languages either.
Chengsong
parents: 469
diff changeset
   429
For example, the back-reference $((a^*)b\backslash1 b \backslash 1$
Chengsong
parents: 469
diff changeset
   430
expresses the language $\{a^n b a^n b a^n\mid n \in \mathbb{N}\}$,
Chengsong
parents: 469
diff changeset
   431
which cannot be expressed by context-free grammars\parencite{campeanu2003formal}.
Chengsong
parents: 469
diff changeset
   432
Such a language is contained in the context-sensitive hierarchy
Chengsong
parents: 469
diff changeset
   433
of formal languages. 
Chengsong
parents: 469
diff changeset
   434
Solving the back-reference expressions matching problem
Chengsong
parents: 469
diff changeset
   435
is NP-complete\parencite{alfred2014algorithms} and a non-bactracking,
Chengsong
parents: 469
diff changeset
   436
efficient solution is not known to exist.
Chengsong
parents: 469
diff changeset
   437
%TODO:read a bit more about back reference algorithms
Chengsong
parents: 469
diff changeset
   438
It seems that languages like Java and Python made the trade-off
Chengsong
parents: 469
diff changeset
   439
to support back-references at the expense of having to backtrack,
Chengsong
parents: 469
diff changeset
   440
even in the case of regexes not involving back-references.\\
Chengsong
parents: 469
diff changeset
   441
Summing these up, we can categorise existing 
Chengsong
parents: 469
diff changeset
   442
practical regex libraries into the ones  with  linear
Chengsong
parents: 469
diff changeset
   443
time guarantees like Go and Rust, which impose restrictions
Chengsong
parents: 469
diff changeset
   444
on the user input (not allowing back-references, 
Chengsong
parents: 469
diff changeset
   445
bounded repetitions canno exceed 1000 etc.), and ones  
Chengsong
parents: 469
diff changeset
   446
 that allows the programmer much freedom, but grinds to a halt
Chengsong
parents: 469
diff changeset
   447
 in some non-negligible portion of cases.
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   448
 %TODO: give examples such as RE2 GOLANG 1000 restriction, rust no repetitions 
471
Chengsong
parents: 469
diff changeset
   449
% For example, the Rust regex engine claims to be linear, 
Chengsong
parents: 469
diff changeset
   450
% but does not support lookarounds and back-references.
Chengsong
parents: 469
diff changeset
   451
% The GoLang regex library does not support over 1000 repetitions.  
Chengsong
parents: 469
diff changeset
   452
% Java and Python both support back-references, but shows
Chengsong
parents: 469
diff changeset
   453
%catastrophic backtracking behaviours on inputs without back-references(
Chengsong
parents: 469
diff changeset
   454
%when the language is still regular).
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   455
 %TODO: test performance of Rust on (((((a*a*)b*)b){20})*)c  baabaabababaabaaaaaaaaababaaaababababaaaabaaabaaaaaabaabaabababaababaaaaaaaaababaaaababababaaaaaaaaaaaaac
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   456
 %TODO: verify the fact Rust does not allow 1000+ reps
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   457
 %TODO: Java 17 updated graphs? Is it ok to still use Java 8 graphs?
471
Chengsong
parents: 469
diff changeset
   458
\section{Buggy Regex Engines} 
Chengsong
parents: 469
diff changeset
   459
Chengsong
parents: 469
diff changeset
   460
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   461
 Another thing about these libraries is that there
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   462
 is no correctness guarantee.
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   463
 In some cases, they either fail to generate a lexing result when there exists a match,
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   464
 or give the wrong way of matching.
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   465
 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   466
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   467
It turns out that regex libraries not only suffer from 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   468
exponential backtracking problems, 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   469
but also undesired (or even buggy) outputs.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   470
%TODO: comment from who
471
Chengsong
parents: 469
diff changeset
   471
Kuklewicz\parencite{KuklewiczHaskell} commented that most regex libraries are not
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   472
correctly implementing the POSIX (maximum-munch)
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   473
rule of regular expression matching.
471
Chengsong
parents: 469
diff changeset
   474
This experience is echoed by the writer's
Chengsong
parents: 469
diff changeset
   475
tryout of a few online regex testers:
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   476
A concrete example would be 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   477
the regex
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   478
\begin{verbatim}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   479
(((((a*a*)b*)b){20})*)c
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   480
\end{verbatim}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   481
and the string
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   482
\begin{verbatim}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   483
baabaabababaabaaaaaaaaababaa
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   484
aababababaaaabaaabaaaaaabaab
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   485
aabababaababaaaaaaaaababaaaa
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   486
babababaaaaaaaaaaaaac
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   487
\end{verbatim}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   488
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   489
This seemingly complex regex simply says "some $a$'s
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   490
followed by some $b$'s then followed by 1 single $b$,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   491
and this iterates 20 times, finally followed by a $c$.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   492
And a POSIX match would involve the entire string,"eating up"
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   493
all the $b$'s in it.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   494
%TODO: give a coloured example of how this matches POSIXly
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   495
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   496
This regex would trigger catastrophic backtracking in 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   497
languages like Python and Java,
471
Chengsong
parents: 469
diff changeset
   498
whereas it gives a non-POSIX  and uninformative 
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   499
match in languages like Go or .NET--The match with only 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   500
character $c$.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   501
471
Chengsong
parents: 469
diff changeset
   502
As Grathwohl\parencite{grathwohl2014crash} commented,
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   503
\begin{center}
471
Chengsong
parents: 469
diff changeset
   504
	``The POSIX strategy is more complicated than the greedy because of the dependence on information about the length of matched strings in the various subexpressions.''
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   505
\end{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   506
472
Chengsong
parents: 471
diff changeset
   507
%\section{How people solve problems with regexes}
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   508
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   509
472
Chengsong
parents: 471
diff changeset
   510
When a regular expression does not behave as intended,
Chengsong
parents: 471
diff changeset
   511
people usually try to rewrite the regex to some equivalent form
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   512
or they try to avoid the possibly problematic patterns completely,
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   513
for which many false positives exist\parencite{Davis18}.
472
Chengsong
parents: 471
diff changeset
   514
Animated tools to "debug" regular expressions
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   515
are also popular, regexploit\parencite{regexploit2021}, regex101\parencite{regex101} 
472
Chengsong
parents: 471
diff changeset
   516
to name a few.
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   517
We are also aware of static analysis work on regular expressions that
472
Chengsong
parents: 471
diff changeset
   518
aims to detect potentially expoential regex patterns. Rathnayake and Thielecke 
471
Chengsong
parents: 469
diff changeset
   519
\parencite{Rathnayake2014StaticAF} proposed an algorithm
Chengsong
parents: 469
diff changeset
   520
that detects regular expressions triggering exponential
Chengsong
parents: 469
diff changeset
   521
behavious on backtracking matchers.
472
Chengsong
parents: 471
diff changeset
   522
Weideman \parencite{Weideman2017Static} came up with 
Chengsong
parents: 471
diff changeset
   523
non-linear polynomial worst-time estimates
471
Chengsong
parents: 469
diff changeset
   524
for regexes, attack string that exploit the worst-time 
Chengsong
parents: 469
diff changeset
   525
scenario, and "attack automata" that generates
472
Chengsong
parents: 471
diff changeset
   526
attack strings.
Chengsong
parents: 471
diff changeset
   527
%Arguably these methods limits the programmers' freedom
Chengsong
parents: 471
diff changeset
   528
%or productivity when all they want is to come up with a regex
Chengsong
parents: 471
diff changeset
   529
%that solves the text processing problem.
Chengsong
parents: 471
diff changeset
   530
471
Chengsong
parents: 469
diff changeset
   531
%TODO:also the regex101 debugger
Chengsong
parents: 469
diff changeset
   532
\section{Our Solution--Formal Specification of POSIX and Brzozowski Derivatives}
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   533
 Is it possible to have a regex lexing algorithm with proven correctness and 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   534
 time complexity, which allows easy extensions to
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   535
  constructs like 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   536
 bounded repetitions, negation,  lookarounds, and even back-references? 
472
Chengsong
parents: 471
diff changeset
   537
  
Chengsong
parents: 471
diff changeset
   538
  We propose Brzozowski derivatives on regular expressions as
Chengsong
parents: 471
diff changeset
   539
  a solution to this.
Chengsong
parents: 471
diff changeset
   540
  
Chengsong
parents: 471
diff changeset
   541
  In the last fifteen or so years, Brzozowski's derivatives of regular
Chengsong
parents: 471
diff changeset
   542
expressions have sparked quite a bit of interest in the functional
Chengsong
parents: 471
diff changeset
   543
programming and theorem prover communities.  The beauty of
Chengsong
parents: 471
diff changeset
   544
Brzozowski's derivatives \parencite{Brzozowski1964} is that they are neatly
Chengsong
parents: 471
diff changeset
   545
expressible in any functional language, and easily definable and
Chengsong
parents: 471
diff changeset
   546
reasoned about in theorem provers---the definitions just consist of
Chengsong
parents: 471
diff changeset
   547
inductive datatypes and simple recursive functions. 
Chengsong
parents: 471
diff changeset
   548
And an algorithms based on it by 
Chengsong
parents: 471
diff changeset
   549
Suzmann and Lu  \parencite{Sulzmann2014} allows easy extension
Chengsong
parents: 471
diff changeset
   550
to include  extended regular expressions and 
Chengsong
parents: 471
diff changeset
   551
 simplification of internal data structures 
Chengsong
parents: 471
diff changeset
   552
 eliminating the exponential behaviours.
Chengsong
parents: 471
diff changeset
   553
 
Chengsong
parents: 471
diff changeset
   554
Chengsong
parents: 471
diff changeset
   555
  
471
Chengsong
parents: 469
diff changeset
   556
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   557
 
471
Chengsong
parents: 469
diff changeset
   558
Chengsong
parents: 469
diff changeset
   559
Chengsong
parents: 469
diff changeset
   560
Chengsong
parents: 469
diff changeset
   561
%----------------------------------------------------------------------------------------
Chengsong
parents: 469
diff changeset
   562
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   563
\section{Contribution}
472
Chengsong
parents: 471
diff changeset
   564
471
Chengsong
parents: 469
diff changeset
   565
Chengsong
parents: 469
diff changeset
   566
472
Chengsong
parents: 471
diff changeset
   567
This work addresses the vulnerability of super-linear and
Chengsong
parents: 471
diff changeset
   568
buggy regex implementations by the combination
Chengsong
parents: 471
diff changeset
   569
of Brzozowski's derivatives and interactive theorem proving. 
Chengsong
parents: 471
diff changeset
   570
We give an 
471
Chengsong
parents: 469
diff changeset
   571
improved version of  Sulzmann and Lu's bit-coded algorithm using 
Chengsong
parents: 469
diff changeset
   572
derivatives, which come with a formal guarantee in terms of correctness and 
Chengsong
parents: 469
diff changeset
   573
running time as an Isabelle/HOL proof.
Chengsong
parents: 469
diff changeset
   574
Then we improve the algorithm with an even stronger version of 
Chengsong
parents: 469
diff changeset
   575
simplification, and prove a time bound linear to input and
Chengsong
parents: 469
diff changeset
   576
cubic to regular expression size using a technique by
Chengsong
parents: 469
diff changeset
   577
Antimirov.
Chengsong
parents: 469
diff changeset
   578
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   579
 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   580
The main contribution of this thesis is a proven correct lexing algorithm
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   581
with formalized time bounds.
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   582
To our best knowledge, no lexing libraries using Brzozowski derivatives
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   583
have a provable time guarantee, 
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   584
and claims about running time are usually speculative and backed by thin empirical
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   585
evidence.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   586
%TODO: give references
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   587
For example, Sulzmann and Lu had proposed an algorithm  in which they
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   588
claim a linear running time.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   589
But that was falsified by our experiments and the running time 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   590
is actually $\Omega(2^n)$ in the worst case.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   591
A similar claim about a theoretical runtime of $O(n^2)$ is made for the Verbatim
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   592
%TODO: give references
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   593
lexer, which calculates POSIX matches and is based on derivatives.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   594
They formalized the correctness of the lexer, but not the complexity.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   595
In the performance evaluation section, they simply analyzed the run time
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   596
of matching $a$ with the string $\underbrace{a \ldots a}_{\text{n a's}}$
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   597
and concluded that the algorithm is quadratic in terms of input length.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   598
When we tried out their extracted OCaml code with our example $(a+aa)^*$,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   599
the time it took to lex only 40 $a$'s was 5 minutes.
472
Chengsong
parents: 471
diff changeset
   600
Chengsong
parents: 471
diff changeset
   601
We  believe our results of a proof of performance on general
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   602
inputs rather than specific examples a novel contribution.\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   603
472
Chengsong
parents: 471
diff changeset
   604
Chengsong
parents: 471
diff changeset
   605
\subsection{Related Work}
Chengsong
parents: 471
diff changeset
   606
We are aware
Chengsong
parents: 471
diff changeset
   607
of a mechanised correctness proof of Brzozowski's derivative-based matcher in HOL4 by
Chengsong
parents: 471
diff changeset
   608
Owens and Slind~\parencite{Owens2008}. Another one in Isabelle/HOL is part
Chengsong
parents: 471
diff changeset
   609
of the work by Krauss and Nipkow \parencite{Krauss2011}.  And another one
Chengsong
parents: 471
diff changeset
   610
in Coq is given by Coquand and Siles \parencite{Coquand2012}.
Chengsong
parents: 471
diff changeset
   611
Also Ribeiro and Du Bois give one in Agda \parencite{RibeiroAgda2017}.
Chengsong
parents: 471
diff changeset
   612
 
Chengsong
parents: 471
diff changeset
   613
 %We propose Brzozowski's derivatives as a solution to this problem.
Chengsong
parents: 471
diff changeset
   614
% about Lexing Using Brzozowski derivatives
Chengsong
parents: 471
diff changeset
   615
 \section{Preliminaries}
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   616
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   617
Suppose we have an alphabet $\Sigma$, the strings  whose characters
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   618
are from $\Sigma$
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   619
can be expressed as $\Sigma^*$.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   620
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   621
We use patterns to define a set of strings concisely. Regular expressions
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   622
are one of such patterns systems:
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   623
The basic regular expressions  are defined inductively
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   624
 by the following grammar:
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   625
\[			r ::=   \ZERO \mid  \ONE
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   626
			 \mid  c  
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   627
			 \mid  r_1 \cdot r_2
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   628
			 \mid  r_1 + r_2   
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   629
			 \mid r^*         
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   630
\]
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   631
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   632
The language or set of strings defined by regular expressions are defined as
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   633
%TODO: FILL in the other defs
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   634
\begin{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   635
\begin{tabular}{lcl}
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   636
$L \; (r_1 + r_2)$ & $\dn$ & $ L \; (r_1) \cup L \; ( r_2)$\\
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   637
$L \; (r_1 \cdot r_2)$ & $\dn$ & $ L \; (r_1) \cap L \; (r_2)$\\
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   638
\end{tabular}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   639
\end{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   640
Which are also called the "language interpretation".
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   641
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   642
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   643
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   644
The Brzozowski derivative w.r.t character $c$ is an operation on the regex,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   645
where the operation transforms the regex to a new one containing
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   646
strings without the head character $c$.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   647
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   648
Formally, we define first such a transformation on any string set, which
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   649
we call semantic derivative:
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   650
\begin{center}
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   651
$\Der \; c\; \textit{A} = \{s \mid c :: s \in A\}$
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   652
\end{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   653
Mathematically, it can be expressed as the 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   654
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   655
If the $\textit{StringSet}$ happen to have some structure, for example,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   656
if it is regular, then we have that it
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   657
472
Chengsong
parents: 471
diff changeset
   658
% Derivatives of a
Chengsong
parents: 471
diff changeset
   659
%regular expression, written $r \backslash c$, give a simple solution
Chengsong
parents: 471
diff changeset
   660
%to the problem of matching a string $s$ with a regular
Chengsong
parents: 471
diff changeset
   661
%expression $r$: if the derivative of $r$ w.r.t.\ (in
Chengsong
parents: 471
diff changeset
   662
%succession) all the characters of the string matches the empty string,
Chengsong
parents: 471
diff changeset
   663
%then $r$ matches $s$ (and {\em vice versa}).  
Chengsong
parents: 471
diff changeset
   664
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   665
The the derivative of regular expression, denoted as
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   666
$r \backslash c$, is a function that takes parameters
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   667
$r$ and $c$, and returns another regular expression $r'$,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   668
which is computed by the following recursive function:
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   669
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   670
\begin{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   671
\begin{tabular}{lcl}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   672
		$\ZERO \backslash c$ & $\dn$ & $\ZERO$\\  
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   673
		$\ONE \backslash c$  & $\dn$ & $\ZERO$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   674
		$d \backslash c$     & $\dn$ & 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   675
		$\mathit{if} \;c = d\;\mathit{then}\;\ONE\;\mathit{else}\;\ZERO$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   676
$(r_1 + r_2)\backslash c$     & $\dn$ & $r_1 \backslash c \,+\, r_2 \backslash c$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   677
$(r_1 \cdot r_2)\backslash c$ & $\dn$ & $\mathit{if} \, nullable(r_1)$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   678
	&   & $\mathit{then}\;(r_1\backslash c) \cdot r_2 \,+\, r_2\backslash c$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   679
	&   & $\mathit{else}\;(r_1\backslash c) \cdot r_2$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   680
	$(r^*)\backslash c$           & $\dn$ & $(r\backslash c) \cdot r^*$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   681
\end{tabular}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   682
\end{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   683
\noindent
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   684
\noindent
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   685
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   686
The $\nullable$ function tests whether the empty string $""$ 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   687
is in the language of $r$:
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   688
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   689
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   690
\begin{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   691
		\begin{tabular}{lcl}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   692
			$\nullable(\ZERO)$     & $\dn$ & $\mathit{false}$ \\  
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   693
			$\nullable(\ONE)$      & $\dn$ & $\mathit{true}$ \\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   694
			$\nullable(c)$ 	       & $\dn$ & $\mathit{false}$ \\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   695
			$\nullable(r_1 + r_2)$ & $\dn$ & $\nullable(r_1) \vee \nullable(r_2)$ \\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   696
			$\nullable(r_1\cdot r_2)$  & $\dn$ & $\nullable(r_1) \wedge \nullable(r_2)$ \\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   697
			$\nullable(r^*)$       & $\dn$ & $\mathit{true}$ \\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   698
		\end{tabular}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   699
\end{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   700
\noindent
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   701
The empty set does not contain any string and
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   702
therefore not the empty string, the empty string 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   703
regular expression contains the empty string
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   704
by definition, the character regular expression
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   705
is the singleton that contains character only,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   706
and therefore does not contain the empty string,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   707
the alternative regular expression(or "or" expression)
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   708
might have one of its children regular expressions
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   709
being nullable and any one of its children being nullable
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   710
would suffice. The sequence regular expression
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   711
would require both children to have the empty string
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   712
to compose an empty string and the Kleene star
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   713
operation naturally introduced the empty string.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   714
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   715
We can give the meaning of regular expressions derivatives
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   716
by language interpretation:
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   717
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   718
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   719
 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   720
  
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   721
\begin{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   722
\begin{tabular}{lcl}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   723
		$\ZERO \backslash c$ & $\dn$ & $\ZERO$\\  
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   724
		$\ONE \backslash c$  & $\dn$ & $\ZERO$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   725
		$d \backslash c$     & $\dn$ & 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   726
		$\mathit{if} \;c = d\;\mathit{then}\;\ONE\;\mathit{else}\;\ZERO$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   727
$(r_1 + r_2)\backslash c$     & $\dn$ & $r_1 \backslash c \,+\, r_2 \backslash c$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   728
$(r_1 \cdot r_2)\backslash c$ & $\dn$ & $\mathit{if} \, nullable(r_1)$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   729
	&   & $\mathit{then}\;(r_1\backslash c) \cdot r_2 \,+\, r_2\backslash c$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   730
	&   & $\mathit{else}\;(r_1\backslash c) \cdot r_2$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   731
	$(r^*)\backslash c$           & $\dn$ & $(r\backslash c) \cdot r^*$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   732
\end{tabular}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   733
\end{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   734
\noindent
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   735
\noindent
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   736
The function derivative, written $\backslash c$, 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   737
defines how a regular expression evolves into
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   738
a new regular expression after all the string it contains
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   739
is chopped off a certain head character $c$.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   740
The most involved cases are the sequence 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   741
and star case.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   742
The sequence case says that if the first regular expression
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   743
contains an empty string then the second component of the sequence
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   744
might be chosen as the target regular expression to be chopped
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   745
off its head character.
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   746
The star regular expression's derivative unwraps the iteration of
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   747
regular expression and attaches the star regular expression
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   748
to the sequence's second element to make sure a copy is retained
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   749
for possible more iterations in later phases of lexing.
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   750
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   751
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   752
The main property of the derivative operation
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   753
that enables us to reason about the correctness of
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   754
an algorithm using derivatives is 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   755
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   756
\begin{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   757
$c\!::\!s \in L(r)$ holds
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   758
if and only if $s \in L(r\backslash c)$.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   759
\end{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   760
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   761
\noindent
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   762
We can generalise the derivative operation shown above for single characters
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   763
to strings as follows:
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   764
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   765
\begin{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   766
\begin{tabular}{lcl}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   767
$r \backslash (c\!::\!s) $ & $\dn$ & $(r \backslash c) \backslash s$ \\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   768
$r \backslash [\,] $ & $\dn$ & $r$
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   769
\end{tabular}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   770
\end{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   771
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   772
\noindent
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   773
and then define Brzozowski's  regular-expression matching algorithm as:
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   774
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   775
\[
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   776
match\;s\;r \;\dn\; nullable(r\backslash s)
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   777
\]
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   778
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   779
\noindent
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   780
Assuming the a string is given as a sequence of characters, say $c_0c_1..c_n$, 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   781
this algorithm presented graphically is as follows:
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   782
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   783
\begin{equation}\label{graph:*}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   784
\begin{tikzcd}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   785
r_0 \arrow[r, "\backslash c_0"]  & r_1 \arrow[r, "\backslash c_1"] & r_2 \arrow[r, dashed]  & r_n  \arrow[r,"\textit{nullable}?"] & \;\textrm{YES}/\textrm{NO}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   786
\end{tikzcd}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   787
\end{equation}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   788
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   789
\noindent
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   790
where we start with  a regular expression  $r_0$, build successive
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   791
derivatives until we exhaust the string and then use \textit{nullable}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   792
to test whether the result can match the empty string. It can  be
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   793
relatively  easily shown that this matcher is correct  (that is given
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   794
an $s = c_0...c_{n-1}$ and an $r_0$, it generates YES if and only if $s \in L(r_0)$).
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   795
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   796
Beautiful and simple definition.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   797
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   798
If we implement the above algorithm naively, however,
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   799
the algorithm can be excruciatingly slow. 
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   800
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   801
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   802
\begin{figure}
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   803
\centering
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   804
\begin{tabular}{@{}c@{\hspace{0mm}}c@{\hspace{0mm}}c@{}}
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   805
\begin{tikzpicture}
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   806
\begin{axis}[
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   807
    xlabel={$n$},
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   808
    x label style={at={(1.05,-0.05)}},
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   809
    ylabel={time in secs},
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   810
    enlargelimits=false,
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   811
    xtick={0,5,...,30},
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   812
    xmax=33,
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   813
    ymax=10000,
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   814
    ytick={0,1000,...,10000},
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   815
    scaled ticks=false,
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   816
    axis lines=left,
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   817
    width=5cm,
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   818
    height=4cm, 
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   819
    legend entries={JavaScript},  
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   820
    legend pos=north west,
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   821
    legend cell align=left]
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   822
\addplot[red,mark=*, mark options={fill=white}] table {EightThousandNodes.data};
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   823
\end{axis}
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   824
\end{tikzpicture}\\
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   825
\multicolumn{3}{c}{Graphs: Runtime for matching $(a^*)^*\,b$ with strings 
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   826
           of the form $\underbrace{aa..a}_{n}$.}
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   827
\end{tabular}    
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   828
\caption{EightThousandNodes} \label{fig:EightThousandNodes}
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   829
\end{figure}
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   830
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   831
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   832
(8000 node data to be added here)
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   833
For example, when starting with the regular
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   834
expression $(a + aa)^*$ and building a few successive derivatives (around 10)
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   835
w.r.t.~the character $a$, one obtains a derivative regular expression
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   836
with more than 8000 nodes (when viewed as a tree)\ref{EightThousandNodes}.
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   837
The reason why $(a + aa) ^*$ explodes so drastically is that without
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   838
pruning, the algorithm will keep records of all possible ways of matching:
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   839
\begin{center}
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   840
$(a + aa) ^* \backslash (aa) = (\ZERO + \ONE \ONE)\cdot(a + aa)^* + (\ONE + \ONE a) \cdot (a + aa)^*$
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   841
\end{center}
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   842
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   843
\noindent
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   844
Each of the above alternative branches correspond to the match 
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   845
$aa $, $a \quad a$ and $a \quad a \cdot (a)$(incomplete).
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   846
These different ways of matching will grow exponentially with the string length,
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   847
and without simplifications that throw away some of these very similar matchings,
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   848
it is no surprise that these expressions grow so quickly.
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   849
Operations like
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   850
$\backslash$ and $\nullable$ need to traverse such trees and
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   851
consequently the bigger the size of the derivative the slower the
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   852
algorithm. 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   853
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   854
Brzozowski was quick in finding that during this process a lot useless
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   855
$\ONE$s and $\ZERO$s are generated and therefore not optimal.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   856
He also introduced some "similarity rules" such
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   857
as $P+(Q+R) = (P+Q)+R$ to merge syntactically 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   858
different but language-equivalent sub-regexes to further decrease the size
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   859
of the intermediate regexes. 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   860
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   861
More simplifications are possible, such as deleting duplicates
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   862
and opening up nested alternatives to trigger even more simplifications.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   863
And suppose we apply simplification after each derivative step, and compose
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   864
these two operations together as an atomic one: $a \backslash_{simp}\,c \dn
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   865
\textit{simp}(a \backslash c)$. Then we can build
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   866
a matcher without having  cumbersome regular expressions.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   867
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   868
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   869
If we want the size of derivatives in the algorithm to
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   870
stay even lower, we would need more aggressive simplifications.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   871
Essentially we need to delete useless $\ZERO$s and $\ONE$s, as well as
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   872
deleting duplicates whenever possible. For example, the parentheses in
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   873
$(a+b) \cdot c + b\cdot c$ can be opened up to get $a\cdot c + b \cdot c + b
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   874
\cdot c$, and then simplified to just $a \cdot c + b \cdot c$. Another
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   875
example is simplifying $(a^*+a) + (a^*+ \ONE) + (a +\ONE)$ to just
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   876
$a^*+a+\ONE$. Adding these more aggressive simplification rules help us
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   877
to achieve a very tight size bound, namely,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   878
 the same size bound as that of the \emph{partial derivatives}. 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   879
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   880
Building derivatives and then simplify them.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   881
So far so good. But what if we want to 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   882
do lexing instead of just a YES/NO answer?
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   883
This requires us to go back again to the world 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   884
without simplification first for a moment.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   885
Sulzmann and Lu~\cite{Sulzmann2014} first came up with a nice and 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   886
elegant(arguably as beautiful as the original
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   887
derivatives definition) solution for this.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   888
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   889
\subsection*{Values and the Lexing Algorithm by Sulzmann and Lu}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   890
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   891
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   892
They first defined the datatypes for storing the 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   893
lexing information called a \emph{value} or
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   894
sometimes also \emph{lexical value}.  These values and regular
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   895
expressions correspond to each other as illustrated in the following
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   896
table:
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   897
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   898
\begin{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   899
	\begin{tabular}{c@{\hspace{20mm}}c}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   900
		\begin{tabular}{@{}rrl@{}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   901
			\multicolumn{3}{@{}l}{\textbf{Regular Expressions}}\medskip\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   902
			$r$ & $::=$  & $\ZERO$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   903
			& $\mid$ & $\ONE$   \\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   904
			& $\mid$ & $c$          \\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   905
			& $\mid$ & $r_1 \cdot r_2$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   906
			& $\mid$ & $r_1 + r_2$   \\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   907
			\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   908
			& $\mid$ & $r^*$         \\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   909
		\end{tabular}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   910
		&
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   911
		\begin{tabular}{@{\hspace{0mm}}rrl@{}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   912
			\multicolumn{3}{@{}l}{\textbf{Values}}\medskip\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   913
			$v$ & $::=$  & \\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   914
			&        & $\Empty$   \\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   915
			& $\mid$ & $\Char(c)$          \\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   916
			& $\mid$ & $\Seq\,v_1\, v_2$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   917
			& $\mid$ & $\Left(v)$   \\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   918
			& $\mid$ & $\Right(v)$  \\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   919
			& $\mid$ & $\Stars\,[v_1,\ldots\,v_n]$ \\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   920
		\end{tabular}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   921
	\end{tabular}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   922
\end{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   923
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   924
\noindent
472
Chengsong
parents: 471
diff changeset
   925
Chengsong
parents: 471
diff changeset
   926
Building on top of Sulzmann and Lu's attempt to formalize the 
Chengsong
parents: 471
diff changeset
   927
notion of POSIX lexing rules \parencite{Sulzmann2014}, 
Chengsong
parents: 471
diff changeset
   928
Ausaf and Urban\parencite{AusafDyckhoffUrban2016} modelled
Chengsong
parents: 471
diff changeset
   929
POSIX matching as a ternary relation recursively defined in a
Chengsong
parents: 471
diff changeset
   930
natural deduction style.
Chengsong
parents: 471
diff changeset
   931
With the formally-specified rules for what a POSIX matching is,
Chengsong
parents: 471
diff changeset
   932
they proved in Isabelle/HOL that the algorithm gives correct results.
Chengsong
parents: 471
diff changeset
   933
505
5ce3bd8e5696 thesis chapter2 section 2.4
Chengsong
parents: 503
diff changeset
   934
But having a correct result is still not enough, 
5ce3bd8e5696 thesis chapter2 section 2.4
Chengsong
parents: 503
diff changeset
   935
we want at least some degree of $\mathbf{efficiency}$.
472
Chengsong
parents: 471
diff changeset
   936
Chengsong
parents: 471
diff changeset
   937
Chengsong
parents: 471
diff changeset
   938
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   939
One regular expression can have multiple lexical values. For example
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   940
for the regular expression $(a+b)^*$, it has a infinite list of
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   941
values corresponding to it: $\Stars\,[]$, $\Stars\,[\Left(Char(a))]$,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   942
$\Stars\,[\Right(Char(b))]$, $\Stars\,[\Left(Char(a),\,\Right(Char(b))]$,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   943
$\ldots$, and vice versa.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   944
Even for the regular expression matching a certain string, there could 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   945
still be more than one value corresponding to it.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   946
Take the example where $r= (a^*\cdot a^*)^*$ and the string 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   947
$s=\underbrace{aa\ldots a}_\text{n \textit{a}s}$.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   948
The number of different ways of matching 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   949
without allowing any value under a star to be flattened
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   950
to an empty string can be given by the following formula:
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   951
\begin{equation}
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   952
	C_n = (n+1)+n C_1+\ldots + 2 C_{n-1}
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   953
\end{equation}
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   954
and a closed form formula can be calculated to be
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   955
\begin{equation}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   956
	C_n =\frac{(2+\sqrt{2})^n - (2-\sqrt{2})^n}{4\sqrt{2}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   957
\end{equation}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   958
which is clearly in exponential order.
472
Chengsong
parents: 471
diff changeset
   959
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   960
A lexer aimed at getting all the possible values has an exponential
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   961
worst case runtime. Therefore it is impractical to try to generate
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   962
all possible matches in a run. In practice, we are usually 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   963
interested about POSIX values, which by intuition always
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   964
\begin{itemize}
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   965
\item
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   966
match the leftmost regular expression when multiple options of matching
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   967
are available  
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   968
\item 
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   969
always match a subpart as much as possible before proceeding
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   970
to the next token.
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   971
\end{itemize}
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   972
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   973
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   974
 For example, the above example has the POSIX value
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   975
$ \Stars\,[\Seq(Stars\,[\underbrace{\Char(a),\ldots,\Char(a)}_\text{n iterations}], Stars\,[])]$.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   976
The output of an algorithm we want would be a POSIX matching
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   977
encoded as a value.
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   978
The reason why we are interested in $\POSIX$ values is that they can
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   979
be practically used in the lexing phase of a compiler front end.
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   980
For instance, when lexing a code snippet 
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   981
$\textit{iffoo} = 3$ with the regular expression $\textit{keyword} + \textit{identifier}$, we want $\textit{iffoo}$ to be recognized
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   982
as an identifier rather than a keyword.
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
   983
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   984
The contribution of Sulzmann and Lu is an extension of Brzozowski's
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   985
algorithm by a second phase (the first phase being building successive
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   986
derivatives---see \eqref{graph:*}). In this second phase, a POSIX value 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   987
is generated in case the regular expression matches  the string. 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   988
Pictorially, the Sulzmann and Lu algorithm is as follows:
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   989
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   990
\begin{ceqn}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   991
\begin{equation}\label{graph:2}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   992
\begin{tikzcd}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   993
r_0 \arrow[r, "\backslash c_0"]  \arrow[d] & r_1 \arrow[r, "\backslash c_1"] \arrow[d] & r_2 \arrow[r, dashed] \arrow[d] & r_n \arrow[d, "mkeps" description] \\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   994
v_0           & v_1 \arrow[l,"inj_{r_0} c_0"]                & v_2 \arrow[l, "inj_{r_1} c_1"]              & v_n \arrow[l, dashed]         
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   995
\end{tikzcd}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   996
\end{equation}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   997
\end{ceqn}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   998
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
   999
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1000
\noindent
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1001
For convenience, we shall employ the following notations: the regular
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1002
expression we start with is $r_0$, and the given string $s$ is composed
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1003
of characters $c_0 c_1 \ldots c_{n-1}$. In  the first phase from the
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1004
left to right, we build the derivatives $r_1$, $r_2$, \ldots  according
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1005
to the characters $c_0$, $c_1$  until we exhaust the string and obtain
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1006
the derivative $r_n$. We test whether this derivative is
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1007
$\textit{nullable}$ or not. If not, we know the string does not match
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1008
$r$ and no value needs to be generated. If yes, we start building the
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1009
values incrementally by \emph{injecting} back the characters into the
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1010
earlier values $v_n, \ldots, v_0$. This is the second phase of the
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1011
algorithm from the right to left. For the first value $v_n$, we call the
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1012
function $\textit{mkeps}$, which builds a POSIX lexical value
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1013
for how the empty string has been matched by the (nullable) regular
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1014
expression $r_n$. This function is defined as
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1015
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1016
	\begin{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1017
		\begin{tabular}{lcl}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1018
			$\mkeps(\ONE)$ 		& $\dn$ & $\Empty$ \\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1019
			$\mkeps(r_{1}+r_{2})$	& $\dn$ 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1020
			& \textit{if} $\nullable(r_{1})$\\ 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1021
			& & \textit{then} $\Left(\mkeps(r_{1}))$\\ 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1022
			& & \textit{else} $\Right(\mkeps(r_{2}))$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1023
			$\mkeps(r_1\cdot r_2)$ 	& $\dn$ & $\Seq\,(\mkeps\,r_1)\,(\mkeps\,r_2)$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1024
			$mkeps(r^*)$	        & $\dn$ & $\Stars\,[]$
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1025
		\end{tabular}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1026
	\end{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1027
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1028
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1029
\noindent 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1030
After the $\mkeps$-call, we inject back the characters one by one in order to build
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1031
the lexical value $v_i$ for how the regex $r_i$ matches the string $s_i$
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1032
($s_i = c_i \ldots c_{n-1}$ ) from the previous lexical value $v_{i+1}$.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1033
After injecting back $n$ characters, we get the lexical value for how $r_0$
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1034
matches $s$. The POSIX value is maintained throught out the process.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1035
For this Sulzmann and Lu defined a function that reverses
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1036
the ``chopping off'' of characters during the derivative phase. The
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1037
corresponding function is called \emph{injection}, written
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1038
$\textit{inj}$; it takes three arguments: the first one is a regular
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1039
expression ${r_{i-1}}$, before the character is chopped off, the second
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1040
is a character ${c_{i-1}}$, the character we want to inject and the
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1041
third argument is the value ${v_i}$, into which one wants to inject the
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1042
character (it corresponds to the regular expression after the character
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1043
has been chopped off). The result of this function is a new value. The
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1044
definition of $\textit{inj}$ is as follows: 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1045
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1046
\begin{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1047
\begin{tabular}{l@{\hspace{1mm}}c@{\hspace{1mm}}l}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1048
  $\textit{inj}\,(c)\,c\,Empty$            & $\dn$ & $Char\,c$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1049
  $\textit{inj}\,(r_1 + r_2)\,c\,\Left(v)$ & $\dn$ & $\Left(\textit{inj}\,r_1\,c\,v)$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1050
  $\textit{inj}\,(r_1 + r_2)\,c\,Right(v)$ & $\dn$ & $Right(\textit{inj}\,r_2\,c\,v)$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1051
  $\textit{inj}\,(r_1 \cdot r_2)\,c\,Seq(v_1,v_2)$ & $\dn$  & $Seq(\textit{inj}\,r_1\,c\,v_1,v_2)$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1052
  $\textit{inj}\,(r_1 \cdot r_2)\,c\,\Left(Seq(v_1,v_2))$ & $\dn$  & $Seq(\textit{inj}\,r_1\,c\,v_1,v_2)$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1053
  $\textit{inj}\,(r_1 \cdot r_2)\,c\,Right(v)$ & $\dn$  & $Seq(\textit{mkeps}(r_1),\textit{inj}\,r_2\,c\,v)$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1054
  $\textit{inj}\,(r^*)\,c\,Seq(v,Stars\,vs)$         & $\dn$  & $Stars((\textit{inj}\,r\,c\,v)\,::\,vs)$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1055
\end{tabular}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1056
\end{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1057
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1058
\noindent This definition is by recursion on the ``shape'' of regular
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1059
expressions and values. 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1060
The clauses basically do one thing--identifying the ``holes'' on 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1061
value to inject the character back into.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1062
For instance, in the last clause for injecting back to a value
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1063
that would turn into a new star value that corresponds to a star,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1064
we know it must be a sequence value. And we know that the first 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1065
value of that sequence corresponds to the child regex of the star
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1066
with the first character being chopped off--an iteration of the star
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1067
that had just been unfolded. This value is followed by the already
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1068
matched star iterations we collected before. So we inject the character 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1069
back to the first value and form a new value with this new iteration
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1070
being added to the previous list of iterations, all under the $Stars$
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1071
top level.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1072
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1073
We have mentioned before that derivatives without simplification 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1074
can get clumsy, and this is true for values as well--they reflect
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1075
the regular expressions size by definition.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1076
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1077
One can introduce simplification on the regex and values, but have to
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1078
be careful in not breaking the correctness as the injection 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1079
function heavily relies on the structure of the regexes and values
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1080
being correct and match each other.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1081
It can be achieved by recording some extra rectification functions
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1082
during the derivatives step, and applying these rectifications in 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1083
each run during the injection phase.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1084
And we can prove that the POSIX value of how
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1085
regular expressions match strings will not be affected---although is much harder
472
Chengsong
parents: 471
diff changeset
  1086
to establish. 
Chengsong
parents: 471
diff changeset
  1087
Some initial results in this regard have been
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1088
obtained in \cite{AusafDyckhoffUrban2016}. 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1089
472
Chengsong
parents: 471
diff changeset
  1090
Chengsong
parents: 471
diff changeset
  1091
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1092
%Brzozowski, after giving the derivatives and simplification,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1093
%did not explore lexing with simplification or he may well be 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1094
%stuck on an efficient simplificaiton with a proof.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1095
%He went on to explore the use of derivatives together with 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1096
%automaton, and did not try lexing using derivatives.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1097
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1098
We want to get rid of complex and fragile rectification of values.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1099
Can we not create those intermediate values $v_1,\ldots v_n$,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1100
and get the lexing information that should be already there while
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1101
doing derivatives in one pass, without a second phase of injection?
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1102
In the meantime, can we make sure that simplifications
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1103
are easily handled without breaking the correctness of the algorithm?
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1104
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1105
Sulzmann and Lu solved this problem by
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1106
introducing additional informtaion to the 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1107
regular expressions called \emph{bitcodes}.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1108
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1109
\subsection*{Bit-coded Algorithm}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1110
Bits and bitcodes (lists of bits) are defined as:
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1111
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1112
\begin{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1113
		$b ::=   1 \mid  0 \qquad
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1114
bs ::= [] \mid b::bs    
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1115
$
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1116
\end{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1117
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1118
\noindent
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1119
The $1$ and $0$ are not in bold in order to avoid 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1120
confusion with the regular expressions $\ZERO$ and $\ONE$. Bitcodes (or
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1121
bit-lists) can be used to encode values (or potentially incomplete values) in a
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1122
compact form. This can be straightforwardly seen in the following
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1123
coding function from values to bitcodes: 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1124
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1125
\begin{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1126
\begin{tabular}{lcl}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1127
  $\textit{code}(\Empty)$ & $\dn$ & $[]$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1128
  $\textit{code}(\Char\,c)$ & $\dn$ & $[]$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1129
  $\textit{code}(\Left\,v)$ & $\dn$ & $0 :: code(v)$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1130
  $\textit{code}(\Right\,v)$ & $\dn$ & $1 :: code(v)$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1131
  $\textit{code}(\Seq\,v_1\,v_2)$ & $\dn$ & $code(v_1) \,@\, code(v_2)$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1132
  $\textit{code}(\Stars\,[])$ & $\dn$ & $[0]$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1133
  $\textit{code}(\Stars\,(v\!::\!vs))$ & $\dn$ & $1 :: code(v) \;@\;
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1134
                                                 code(\Stars\,vs)$
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1135
\end{tabular}    
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1136
\end{center} 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1137
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1138
\noindent
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1139
Here $\textit{code}$ encodes a value into a bitcodes by converting
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1140
$\Left$ into $0$, $\Right$ into $1$, and marks the start of a non-empty
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1141
star iteration by $1$. The border where a local star terminates
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1142
is marked by $0$. This coding is lossy, as it throws away the information about
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1143
characters, and also does not encode the ``boundary'' between two
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1144
sequence values. Moreover, with only the bitcode we cannot even tell
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1145
whether the $1$s and $0$s are for $\Left/\Right$ or $\Stars$. The
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1146
reason for choosing this compact way of storing information is that the
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1147
relatively small size of bits can be easily manipulated and ``moved
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1148
around'' in a regular expression. In order to recover values, we will 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1149
need the corresponding regular expression as an extra information. This
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1150
means the decoding function is defined as:
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1151
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1152
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1153
%\begin{definition}[Bitdecoding of Values]\mbox{}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1154
\begin{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1155
\begin{tabular}{@{}l@{\hspace{1mm}}c@{\hspace{1mm}}l@{}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1156
  $\textit{decode}'\,bs\,(\ONE)$ & $\dn$ & $(\Empty, bs)$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1157
  $\textit{decode}'\,bs\,(c)$ & $\dn$ & $(\Char\,c, bs)$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1158
  $\textit{decode}'\,(0\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1159
     $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}\;
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1160
       (\Left\,v, bs_1)$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1161
  $\textit{decode}'\,(1\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1162
     $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_2\;\textit{in}\;
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1163
       (\Right\,v, bs_1)$\\                           
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1164
  $\textit{decode}'\,bs\;(r_1\cdot r_2)$ & $\dn$ &
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1165
        $\textit{let}\,(v_1, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1166
  & &   $\textit{let}\,(v_2, bs_2) = \textit{decode}'\,bs_1\,r_2$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1167
  & &   \hspace{35mm}$\textit{in}\;(\Seq\,v_1\,v_2, bs_2)$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1168
  $\textit{decode}'\,(0\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1169
  $\textit{decode}'\,(1\!::\!bs)\,(r^*)$ & $\dn$ & 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1170
         $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r\;\textit{in}$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1171
  & &   $\textit{let}\,(\Stars\,vs, bs_2) = \textit{decode}'\,bs_1\,r^*$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1172
  & &   \hspace{35mm}$\textit{in}\;(\Stars\,v\!::\!vs, bs_2)$\bigskip\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1173
  
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1174
  $\textit{decode}\,bs\,r$ & $\dn$ &
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1175
     $\textit{let}\,(v, bs') = \textit{decode}'\,bs\,r\;\textit{in}$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1176
  & & $\textit{if}\;bs' = []\;\textit{then}\;\textit{Some}\,v\;
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1177
       \textit{else}\;\textit{None}$                       
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1178
\end{tabular}    
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1179
\end{center}    
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1180
%\end{definition}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1181
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1182
Sulzmann and Lu's integrated the bitcodes into regular expressions to
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1183
create annotated regular expressions \cite{Sulzmann2014}.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1184
\emph{Annotated regular expressions} are defined by the following
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1185
grammar:%\comment{ALTS should have  an $as$ in  the definitions, not  just $a_1$ and $a_2$}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1186
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1187
\begin{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1188
\begin{tabular}{lcl}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1189
  $\textit{a}$ & $::=$  & $\ZERO$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1190
                  & $\mid$ & $_{bs}\ONE$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1191
                  & $\mid$ & $_{bs}{\bf c}$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1192
                  & $\mid$ & $_{bs}\sum\,as$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1193
                  & $\mid$ & $_{bs}a_1\cdot a_2$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1194
                  & $\mid$ & $_{bs}a^*$
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1195
\end{tabular}    
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1196
\end{center}  
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1197
%(in \textit{ALTS})
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1198
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1199
\noindent
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1200
where $bs$ stands for bitcodes, $a$  for $\mathbf{a}$nnotated regular
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1201
expressions and $as$ for a list of annotated regular expressions.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1202
The alternative constructor($\sum$) has been generalized to 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1203
accept a list of annotated regular expressions rather than just 2.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1204
We will show that these bitcodes encode information about
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1205
the (POSIX) value that should be generated by the Sulzmann and Lu
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1206
algorithm.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1207
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1208
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1209
To do lexing using annotated regular expressions, we shall first
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1210
transform the usual (un-annotated) regular expressions into annotated
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1211
regular expressions. This operation is called \emph{internalisation} and
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1212
defined as follows:
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1213
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1214
%\begin{definition}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1215
\begin{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1216
\begin{tabular}{lcl}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1217
  $(\ZERO)^\uparrow$ & $\dn$ & $\ZERO$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1218
  $(\ONE)^\uparrow$ & $\dn$ & $_{[]}\ONE$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1219
  $(c)^\uparrow$ & $\dn$ & $_{[]}{\bf c}$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1220
  $(r_1 + r_2)^\uparrow$ & $\dn$ &
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1221
  $_{[]}\sum[\textit{fuse}\,[0]\,r_1^\uparrow,\,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1222
  \textit{fuse}\,[1]\,r_2^\uparrow]$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1223
  $(r_1\cdot r_2)^\uparrow$ & $\dn$ &
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1224
         $_{[]}r_1^\uparrow \cdot r_2^\uparrow$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1225
  $(r^*)^\uparrow$ & $\dn$ &
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1226
         $_{[]}(r^\uparrow)^*$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1227
\end{tabular}    
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1228
\end{center}    
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1229
%\end{definition}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1230
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1231
\noindent
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1232
We use up arrows here to indicate that the basic un-annotated regular
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1233
expressions are ``lifted up'' into something slightly more complex. In the
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1234
fourth clause, $\textit{fuse}$ is an auxiliary function that helps to
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1235
attach bits to the front of an annotated regular expression. Its
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1236
definition is as follows:
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1237
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1238
\begin{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1239
\begin{tabular}{lcl}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1240
  $\textit{fuse}\;bs \; \ZERO$ & $\dn$ & $\ZERO$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1241
  $\textit{fuse}\;bs\; _{bs'}\ONE$ & $\dn$ &
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1242
     $_{bs @ bs'}\ONE$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1243
  $\textit{fuse}\;bs\;_{bs'}{\bf c}$ & $\dn$ &
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1244
     $_{bs@bs'}{\bf c}$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1245
  $\textit{fuse}\;bs\,_{bs'}\sum\textit{as}$ & $\dn$ &
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1246
     $_{bs@bs'}\sum\textit{as}$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1247
  $\textit{fuse}\;bs\; _{bs'}a_1\cdot a_2$ & $\dn$ &
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1248
     $_{bs@bs'}a_1 \cdot a_2$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1249
  $\textit{fuse}\;bs\,_{bs'}a^*$ & $\dn$ &
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1250
     $_{bs @ bs'}a^*$
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1251
\end{tabular}    
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1252
\end{center}  
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1253
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1254
\noindent
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1255
After internalising the regular expression, we perform successive
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1256
derivative operations on the annotated regular expressions. This
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1257
derivative operation is the same as what we had previously for the
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1258
basic regular expressions, except that we beed to take care of
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1259
the bitcodes:
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1260
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1261
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1262
\iffalse
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1263
 %\begin{definition}{bder}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1264
\begin{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1265
  \begin{tabular}{@{}lcl@{}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1266
  $(\textit{ZERO})\,\backslash c$ & $\dn$ & $\textit{ZERO}$\\  
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1267
  $(\textit{ONE}\;bs)\,\backslash c$ & $\dn$ & $\textit{ZERO}$\\  
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1268
  $(\textit{CHAR}\;bs\,d)\,\backslash c$ & $\dn$ &
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1269
        $\textit{if}\;c=d\; \;\textit{then}\;
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1270
         \textit{ONE}\;bs\;\textit{else}\;\textit{ZERO}$\\  
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1271
  $(\textit{ALTS}\;bs\,as)\,\backslash c$ & $\dn$ &
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
  1272
  $\textit{ALTS}\;bs\,(map (\backslash c) as)$\\
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1273
  $(\textit{SEQ}\;bs\,a_1\,a_2)\,\backslash c$ & $\dn$ &
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1274
     $\textit{if}\;\textit{bnullable}\,a_1$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1275
					       & &$\textit{then}\;\textit{ALTS}\,bs\,List((\textit{SEQ}\,[]\,(a_1\,\backslash c)\,a_2),$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1276
					       & &$\phantom{\textit{then}\;\textit{ALTS}\,bs\,}(\textit{fuse}\,(\textit{bmkeps}\,a_1)\,(a_2\,\backslash c)))$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1277
  & &$\textit{else}\;\textit{SEQ}\,bs\,(a_1\,\backslash c)\,a_2$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1278
  $(\textit{STAR}\,bs\,a)\,\backslash c$ & $\dn$ &
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1279
      $\textit{SEQ}\;bs\,(\textit{fuse}\, [\Z] (r\,\backslash c))\,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1280
       (\textit{STAR}\,[]\,r)$
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1281
\end{tabular}    
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1282
\end{center}    
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1283
%\end{definition}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1284
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1285
\begin{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1286
  \begin{tabular}{@{}lcl@{}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1287
  $(\textit{ZERO})\,\backslash c$ & $\dn$ & $\textit{ZERO}$\\  
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1288
  $(_{bs}\textit{ONE})\,\backslash c$ & $\dn$ & $\textit{ZERO}$\\  
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1289
  $(_{bs}\textit{CHAR}\;d)\,\backslash c$ & $\dn$ &
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1290
        $\textit{if}\;c=d\; \;\textit{then}\;
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1291
         _{bs}\textit{ONE}\;\textit{else}\;\textit{ZERO}$\\  
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1292
  $(_{bs}\textit{ALTS}\;\textit{as})\,\backslash c$ & $\dn$ &
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1293
  $_{bs}\textit{ALTS}\;(\textit{as}.\textit{map}(\backslash c))$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1294
  $(_{bs}\textit{SEQ}\;a_1\,a_2)\,\backslash c$ & $\dn$ &
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1295
     $\textit{if}\;\textit{bnullable}\,a_1$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1296
					       & &$\textit{then}\;_{bs}\textit{ALTS}\,List((_{[]}\textit{SEQ}\,(a_1\,\backslash c)\,a_2),$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1297
					       & &$\phantom{\textit{then}\;_{bs}\textit{ALTS}\,}(\textit{fuse}\,(\textit{bmkeps}\,a_1)\,(a_2\,\backslash c)))$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1298
  & &$\textit{else}\;_{bs}\textit{SEQ}\,(a_1\,\backslash c)\,a_2$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1299
  $(_{bs}\textit{STAR}\,a)\,\backslash c$ & $\dn$ &
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1300
      $_{bs}\textit{SEQ}\;(\textit{fuse}\, [0] \; r\,\backslash c )\,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1301
       (_{bs}\textit{STAR}\,[]\,r)$
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1302
\end{tabular}    
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1303
\end{center}    
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1304
%\end{definition}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1305
\fi
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1306
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1307
\begin{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1308
  \begin{tabular}{@{}lcl@{}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1309
  $(\ZERO)\,\backslash c$ & $\dn$ & $\ZERO$\\  
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1310
  $(_{bs}\ONE)\,\backslash c$ & $\dn$ & $\ZERO$\\  
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1311
  $(_{bs}{\bf d})\,\backslash c$ & $\dn$ &
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1312
        $\textit{if}\;c=d\; \;\textit{then}\;
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1313
         _{bs}\ONE\;\textit{else}\;\ZERO$\\  
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1314
  $(_{bs}\sum \;\textit{as})\,\backslash c$ & $\dn$ &
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1315
  $_{bs}\sum\;(\textit{as.map}(\backslash c))$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1316
  $(_{bs}\;a_1\cdot a_2)\,\backslash c$ & $\dn$ &
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1317
     $\textit{if}\;\textit{bnullable}\,a_1$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1318
					       & &$\textit{then}\;_{bs}\sum\,[(_{[]}\,(a_1\,\backslash c)\cdot\,a_2),$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1319
					       & &$\phantom{\textit{then},\;_{bs}\sum\,}(\textit{fuse}\,(\textit{bmkeps}\,a_1)\,(a_2\,\backslash c))]$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1320
  & &$\textit{else}\;_{bs}\,(a_1\,\backslash c)\cdot a_2$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1321
  $(_{bs}a^*)\,\backslash c$ & $\dn$ &
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1322
      $_{bs}(\textit{fuse}\, [0] \; r\,\backslash c)\cdot
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1323
       (_{[]}r^*))$
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1324
\end{tabular}    
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1325
\end{center}    
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1326
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1327
%\end{definition}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1328
\noindent
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1329
For instance, when we do derivative of  $_{bs}a^*$ with respect to c,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1330
we need to unfold it into a sequence,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1331
and attach an additional bit $0$ to the front of $r \backslash c$
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
  1332
to indicate one more star iteration. Also the sequence clause
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1333
is more subtle---when $a_1$ is $\textit{bnullable}$ (here
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1334
\textit{bnullable} is exactly the same as $\textit{nullable}$, except
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1335
that it is for annotated regular expressions, therefore we omit the
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1336
definition). Assume that $\textit{bmkeps}$ correctly extracts the bitcode for how
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1337
$a_1$ matches the string prior to character $c$ (more on this later),
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1338
then the right branch of alternative, which is $\textit{fuse} \; \bmkeps \;  a_1 (a_2
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1339
\backslash c)$ will collapse the regular expression $a_1$(as it has
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1340
already been fully matched) and store the parsing information at the
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1341
head of the regular expression $a_2 \backslash c$ by fusing to it. The
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1342
bitsequence $\textit{bs}$, which was initially attached to the
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1343
first element of the sequence $a_1 \cdot a_2$, has
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1344
now been elevated to the top-level of $\sum$, as this information will be
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1345
needed whichever way the sequence is matched---no matter whether $c$ belongs
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1346
to $a_1$ or $ a_2$. After building these derivatives and maintaining all
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1347
the lexing information, we complete the lexing by collecting the
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1348
bitcodes using a generalised version of the $\textit{mkeps}$ function
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1349
for annotated regular expressions, called $\textit{bmkeps}$:
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1350
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1351
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1352
%\begin{definition}[\textit{bmkeps}]\mbox{}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1353
\begin{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1354
\begin{tabular}{lcl}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1355
  $\textit{bmkeps}\,(_{bs}\ONE)$ & $\dn$ & $bs$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1356
  $\textit{bmkeps}\,(_{bs}\sum a::\textit{as})$ & $\dn$ &
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1357
     $\textit{if}\;\textit{bnullable}\,a$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1358
  & &$\textit{then}\;bs\,@\,\textit{bmkeps}\,a$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1359
  & &$\textit{else}\;bs\,@\,\textit{bmkeps}\,(_{bs}\sum \textit{as})$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1360
  $\textit{bmkeps}\,(_{bs} a_1 \cdot a_2)$ & $\dn$ &
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1361
     $bs \,@\,\textit{bmkeps}\,a_1\,@\, \textit{bmkeps}\,a_2$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1362
  $\textit{bmkeps}\,(_{bs}a^*)$ & $\dn$ &
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1363
     $bs \,@\, [0]$
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1364
\end{tabular}    
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1365
\end{center}    
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1366
%\end{definition}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1367
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1368
\noindent
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1369
This function completes the value information by travelling along the
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1370
path of the regular expression that corresponds to a POSIX value and
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1371
collecting all the bitcodes, and using $S$ to indicate the end of star
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1372
iterations. If we take the bitcodes produced by $\textit{bmkeps}$ and
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1373
decode them, we get the value we expect. The corresponding lexing
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1374
algorithm looks as follows:
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1375
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1376
\begin{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1377
\begin{tabular}{lcl}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1378
  $\textit{blexer}\;r\,s$ & $\dn$ &
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1379
      $\textit{let}\;a = (r^\uparrow)\backslash s\;\textit{in}$\\                
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1380
  & & $\;\;\textit{if}\; \textit{bnullable}(a)$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1381
  & & $\;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,a)\,r$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1382
  & & $\;\;\textit{else}\;\textit{None}$
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1383
\end{tabular}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1384
\end{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1385
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1386
\noindent
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1387
In this definition $\_\backslash s$ is the  generalisation  of the derivative
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1388
operation from characters to strings (just like the derivatives for un-annotated
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1389
regular expressions).
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1390
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
  1391
Now we introduce the simplifications, which is why we introduce the 
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
  1392
bitcodes in the first place.
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1393
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
  1394
\subsection*{Simplification Rules}
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1395
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
  1396
This section introduces aggressive (in terms of size) simplification rules
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1397
on annotated regular expressions
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
  1398
to keep derivatives small. Such simplifications are promising
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1399
as we have
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1400
generated test data that show
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
  1401
that a good tight bound can be achieved. We could only
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
  1402
partially cover the search space as there are infinitely many regular
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1403
expressions and strings. 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1404
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1405
One modification we introduced is to allow a list of annotated regular
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1406
expressions in the $\sum$ constructor. This allows us to not just
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1407
delete unnecessary $\ZERO$s and $\ONE$s from regular expressions, but
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1408
also unnecessary ``copies'' of regular expressions (very similar to
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1409
simplifying $r + r$ to just $r$, but in a more general setting). Another
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1410
modification is that we use simplification rules inspired by Antimirov's
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1411
work on partial derivatives. They maintain the idea that only the first
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1412
``copy'' of a regular expression in an alternative contributes to the
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1413
calculation of a POSIX value. All subsequent copies can be pruned away from
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1414
the regular expression. A recursive definition of our  simplification function 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1415
that looks somewhat similar to our Scala code is given below:
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1416
%\comment{Use $\ZERO$, $\ONE$ and so on. 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1417
%Is it $ALTS$ or $ALTS$?}\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1418
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1419
\begin{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1420
  \begin{tabular}{@{}lcl@{}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1421
   
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1422
  $\textit{simp} \; (_{bs}a_1\cdot a_2)$ & $\dn$ & $ (\textit{simp} \; a_1, \textit{simp}  \; a_2) \; \textit{match} $ \\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1423
   &&$\quad\textit{case} \; (\ZERO, \_) \Rightarrow  \ZERO$ \\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1424
   &&$\quad\textit{case} \; (\_, \ZERO) \Rightarrow  \ZERO$ \\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1425
   &&$\quad\textit{case} \;  (\ONE, a_2') \Rightarrow  \textit{fuse} \; bs \;  a_2'$ \\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1426
   &&$\quad\textit{case} \; (a_1', \ONE) \Rightarrow  \textit{fuse} \; bs \;  a_1'$ \\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1427
   &&$\quad\textit{case} \; (a_1', a_2') \Rightarrow   _{bs}a_1' \cdot a_2'$ \\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1428
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1429
  $\textit{simp} \; (_{bs}\sum \textit{as})$ & $\dn$ & $\textit{distinct}( \textit{flatten} ( \textit{as.map(simp)})) \; \textit{match} $ \\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1430
  &&$\quad\textit{case} \; [] \Rightarrow  \ZERO$ \\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1431
   &&$\quad\textit{case} \; a :: [] \Rightarrow  \textit{fuse bs a}$ \\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1432
   &&$\quad\textit{case} \;  as' \Rightarrow _{bs}\sum \textit{as'}$\\ 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1433
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1434
   $\textit{simp} \; a$ & $\dn$ & $\textit{a} \qquad \textit{otherwise}$   
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1435
\end{tabular}    
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1436
\end{center}    
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1437
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1438
\noindent
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1439
The simplification does a pattern matching on the regular expression.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1440
When it detected that the regular expression is an alternative or
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
  1441
sequence, it will try to simplify its child regular expressions
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
  1442
recursively and then see if one of the children turns into $\ZERO$ or
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1443
$\ONE$, which might trigger further simplification at the current level.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1444
The most involved part is the $\sum$ clause, where we use two
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1445
auxiliary functions $\textit{flatten}$ and $\textit{distinct}$ to open up nested
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1446
alternatives and reduce as many duplicates as possible. Function
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
  1447
$\textit{distinct}$  keeps the first occurring copy only and removes all later ones
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1448
when detected duplicates. Function $\textit{flatten}$ opens up nested $\sum$s.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1449
Its recursive definition is given below:
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1450
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1451
 \begin{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1452
  \begin{tabular}{@{}lcl@{}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1453
  $\textit{flatten} \; (_{bs}\sum \textit{as}) :: \textit{as'}$ & $\dn$ & $(\textit{map} \;
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1454
     (\textit{fuse}\;bs)\; \textit{as}) \; @ \; \textit{flatten} \; as' $ \\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1455
  $\textit{flatten} \; \ZERO :: as'$ & $\dn$ & $ \textit{flatten} \;  \textit{as'} $ \\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1456
    $\textit{flatten} \; a :: as'$ & $\dn$ & $a :: \textit{flatten} \; \textit{as'}$ \quad(otherwise) 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1457
\end{tabular}    
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1458
\end{center}  
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1459
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1460
\noindent
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1461
Here $\textit{flatten}$ behaves like the traditional functional programming flatten
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1462
function, except that it also removes $\ZERO$s. Or in terms of regular expressions, it
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1463
removes parentheses, for example changing $a+(b+c)$ into $a+b+c$.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1464
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1465
Having defined the $\simp$ function,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1466
we can use the previous notation of  natural
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1467
extension from derivative w.r.t.~character to derivative
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1468
w.r.t.~string:%\comment{simp in  the [] case?}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1469
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1470
\begin{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1471
\begin{tabular}{lcl}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1472
$r \backslash_{simp} (c\!::\!s) $ & $\dn$ & $(r \backslash_{simp}\, c) \backslash_{simp}\, s$ \\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1473
$r \backslash_{simp} [\,] $ & $\dn$ & $r$
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1474
\end{tabular}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1475
\end{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1476
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1477
\noindent
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1478
to obtain an optimised version of the algorithm:
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1479
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1480
 \begin{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1481
\begin{tabular}{lcl}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1482
  $\textit{blexer\_simp}\;r\,s$ & $\dn$ &
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1483
      $\textit{let}\;a = (r^\uparrow)\backslash_{simp}\, s\;\textit{in}$\\                
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1484
  & & $\;\;\textit{if}\; \textit{bnullable}(a)$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1485
  & & $\;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,a)\,r$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1486
  & & $\;\;\textit{else}\;\textit{None}$
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1487
\end{tabular}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1488
\end{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1489
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1490
\noindent
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1491
This algorithm keeps the regular expression size small, for example,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1492
with this simplification our previous $(a + aa)^*$ example's 8000 nodes
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1493
will be reduced to just 6 and stays constant, no matter how long the
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1494
input string is.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1495
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1496
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1497
 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1498
Derivatives give a simple solution
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1499
to the problem of matching a string $s$ with a regular
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1500
expression $r$: if the derivative of $r$ w.r.t.\ (in
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1501
succession) all the characters of the string matches the empty string,
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1502
then $r$ matches $s$ (and {\em vice versa}).  
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1503
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1504
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1505
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
  1506
However, two difficulties with derivative-based matchers exist:
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1507
First, Brzozowski's original matcher only generates a yes/no answer
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1508
for whether a regular expression matches a string or not.  This is too
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1509
little information in the context of lexing where separate tokens must
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1510
be identified and also classified (for example as keywords
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1511
or identifiers).  Sulzmann and Lu~\cite{Sulzmann2014} overcome this
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1512
difficulty by cleverly extending Brzozowski's matching
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1513
algorithm. Their extended version generates additional information on
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1514
\emph{how} a regular expression matches a string following the POSIX
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1515
rules for regular expression matching. They achieve this by adding a
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1516
second ``phase'' to Brzozowski's algorithm involving an injection
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1517
function.  In our own earlier work we provided the formal
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1518
specification of what POSIX matching means and proved in Isabelle/HOL
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1519
the correctness
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1520
of Sulzmann and Lu's extended algorithm accordingly
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1521
\cite{AusafDyckhoffUrban2016}.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1522
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1523
The second difficulty is that Brzozowski's derivatives can 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1524
grow to arbitrarily big sizes. For example if we start with the
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1525
regular expression $(a+aa)^*$ and take
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1526
successive derivatives according to the character $a$, we end up with
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1527
a sequence of ever-growing derivatives like 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1528
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1529
\def\ll{\stackrel{\_\backslash{} a}{\longrightarrow}}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1530
\begin{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1531
\begin{tabular}{rll}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1532
$(a + aa)^*$ & $\ll$ & $(\ONE + \ONE{}a) \cdot (a + aa)^*$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1533
& $\ll$ & $(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* \;+\; (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1534
& $\ll$ & $(\ZERO + \ZERO{}a + \ZERO) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^* \;+\; $\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1535
& & $\qquad(\ZERO + \ZERO{}a + \ONE) \cdot (a + aa)^* + (\ONE + \ONE{}a) \cdot (a + aa)^*$\\
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1536
& $\ll$ & \ldots \hspace{15mm}(regular expressions of sizes 98, 169, 283, 468, 767, \ldots)
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1537
\end{tabular}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1538
\end{center}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1539
 
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1540
\noindent where after around 35 steps we run out of memory on a
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1541
typical computer (we shall define shortly the precise details of our
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1542
regular expressions and the derivative operation).  Clearly, the
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1543
notation involving $\ZERO$s and $\ONE$s already suggests
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1544
simplification rules that can be applied to regular regular
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1545
expressions, for example $\ZERO{}\,r \Rightarrow \ZERO$, $\ONE{}\,r
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1546
\Rightarrow r$, $\ZERO{} + r \Rightarrow r$ and $r + r \Rightarrow
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1547
r$. While such simple-minded simplifications have been proved in our
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1548
earlier work to preserve the correctness of Sulzmann and Lu's
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1549
algorithm \cite{AusafDyckhoffUrban2016}, they unfortunately do
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1550
\emph{not} help with limiting the growth of the derivatives shown
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1551
above: the growth is slowed, but the derivatives can still grow rather
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1552
quickly beyond any finite bound.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1553
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1554
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1555
Sulzmann and Lu overcome this ``growth problem'' in a second algorithm
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1556
\cite{Sulzmann2014} where they introduce bitcoded
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1557
regular expressions. In this version, POSIX values are
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1558
represented as bitsequences and such sequences are incrementally generated
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1559
when derivatives are calculated. The compact representation
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1560
of bitsequences and regular expressions allows them to define a more
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1561
``aggressive'' simplification method that keeps the size of the
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1562
derivatives finite no matter what the length of the string is.
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1563
They make some informal claims about the correctness and linear behaviour
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1564
of this version, but do not provide any supporting proof arguments, not
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1565
even ``pencil-and-paper'' arguments. They write about their bitcoded
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1566
\emph{incremental parsing method} (that is the algorithm to be formalised
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1567
in this paper):
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1568
518
ff7945a988a3 more to thesis
Chengsong
parents: 516
diff changeset
  1569
%motivation part
468
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1570
\begin{quote}\it
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1571
  ``Correctness Claim: We further claim that the incremental parsing
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1572
  method [..] in combination with the simplification steps [..]
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1573
  yields POSIX parse trees. We have tested this claim
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1574
  extensively [..] but yet
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1575
  have to work out all proof details.'' \cite[Page 14]{Sulzmann2014}
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1576
\end{quote}  
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1577
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1578
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1579
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1580
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1581
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1582
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1583
%----------------------------------------------------------------------------------------
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1584
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1585
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1586
%----------------------------------------------------------------------------------------
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1587
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1588
%----------------------------------------------------------------------------------------
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1589
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1590
%----------------------------------------------------------------------------------------
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1591
a0f27e21b42c all texrelated
Chengsong
parents:
diff changeset
  1592