# HG changeset patch # User Chengsong # Date 1561807729 -3600 # Node ID 1a92233763bdd5e609b770b495b360877fc4fd07 # Parent a28f1d5daa347a510d00a27fc3a9c458dc0f5190 created folder for 9mth report diff -r a28f1d5daa34 -r 1a92233763bd 9ms/cc-by.pdf Binary file 9ms/cc-by.pdf has changed diff -r a28f1d5daa34 -r 1a92233763bd 9ms/data.sty --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/9ms/data.sty Sat Jun 29 12:28:49 2019 +0100 @@ -0,0 +1,58 @@ +% The data files, written on the first run. + + +\begin{filecontents}{re-python2.data} +1 0.033 +5 0.036 +10 0.034 +15 0.036 +18 0.059 +19 0.084 +20 0.141 +21 0.248 +22 0.485 +23 0.878 +24 1.71 +25 3.40 +26 7.08 +27 14.12 +28 26.69 +\end{filecontents} + + +% JavaScript, example (a*)*b +\begin{filecontents}{re-js.data} +5 0.061 +10 0.061 +15 0.061 +20 0.070 +23 0.131 +25 0.308 +26 0.564 +28 1.994 +30 7.648 +31 15.881 +32 32.190 +\end{filecontents} + +% Java 8, example (a*)*b +\begin{filecontents}{re-java.data} +5 0.00298 +10 0.00418 +15 0.00996 +16 0.01710 +17 0.03492 +18 0.03303 +19 0.05084 +20 0.10177 +21 0.19960 +22 0.41159 +23 0.82234 +24 1.70251 +25 3.36112 +26 6.63998 +27 13.35120 +28 29.81185 +\end{filecontents} + + diff -r a28f1d5daa34 -r 1a92233763bd 9ms/ecoop_paper.tex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/9ms/ecoop_paper.tex Sat Jun 29 12:28:49 2019 +0100 @@ -0,0 +1,511 @@ +\documentclass[a4paper,UKenglish]{lipics} +\usepackage{graphic} +\usepackage{data} +% \documentclass{article} +%\usepackage[utf8]{inputenc} +%\usepackage[english]{babel} +%\usepackage{listings} +% \usepackage{amsthm} +% \usepackage{hyperref} +% \usepackage[margin=0.5in]{geometry} +%\usepackage{pmboxdraw} + +\title{POSIX Regular Expression Matching and Lexing} +\author{Chengsong Tan} +\affil{King's College London\\ +London, UK\\ +\texttt{chengsong.tan@kcl.ac.uk}} +\authorrunning{Chengsong Tan} +\Copyright{Chengsong Tan} + +\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}% +\newcommand{\ZERO}{\mbox{\bf 0}} +\newcommand{\ONE}{\mbox{\bf 1}} +\def\lexer{\mathit{lexer}} +\def\mkeps{\mathit{mkeps}} +\def\inj{\mathit{inj}} +\def\Empty{\mathit{Empty}} +\def\Left{\mathit{Left}} +\def\Right{\mathit{Right}} +\def\Stars{\mathit{Stars}} +\def\Char{\mathit{Char}} +\def\Seq{\mathit{Seq}} +\def\Der{\mathit{Der}} +\def\nullable{\mathit{nullable}} +\def\Z{\mathit{Z}} +\def\S{\mathit{S}} + +%\theoremstyle{theorem} +%\newtheorem{theorem}{Theorem} +%\theoremstyle{lemma} +%\newtheorem{lemma}{Lemma} +%\newcommand{\lemmaautorefname}{Lemma} +%\theoremstyle{definition} +%\newtheorem{definition}{Definition} + + +\begin{document} + +\maketitle + +\begin{abstract} + Brzozowski introduced in 1964 a beautifully simple algorithm for + regular expression matching based on the notion of derivatives of + regular expressions. In 2014, Sulzmann and Lu extended this + algorithm to not just give a YES/NO answer for whether or not a regular + expression matches a string, but in case it matches also \emph{how} + it matches the string. This is important for applications such as + lexing (tokenising a string). The problem is to make the algorithm + by Sulzmann and Lu fast on all inputs without breaking its + correctness. +\end{abstract} + +\section{Introduction} + +This PhD-project is about regular expression matching and +lexing. Given the maturity of this topic, the reader might wonder: +Surely, regular expressions must have already been studied to death? +What could possibly be \emph{not} known in this area? 
And surely all +implemented algorithms for regular expression matching are blindingly +fast? + + + +Unfortunately these preconceptions are not supported by evidence: Take +for example the regular expression $(a^*)^*\,b$ and ask whether +strings of the form $aa..a$ match this regular +expression. Obviously they do not match---the expected $b$ in the last +position is missing. One would expect that modern regular expression +matching engines can find this out very quickly. Alas, if one tries +this example in JavaScript, Python or Java 8 with strings like 28 +$a$'s, one discovers that this decision takes around 30 seconds and +takes considerably longer when adding a few more $a$'s, as the graphs +below show: + +\begin{center} +\begin{tabular}{@{}c@{\hspace{0mm}}c@{\hspace{0mm}}c@{}} +\begin{tikzpicture} +\begin{axis}[ + xlabel={$n$}, + x label style={at={(1.05,-0.05)}}, + ylabel={time in secs}, + enlargelimits=false, + xtick={0,5,...,30}, + xmax=33, + ymax=35, + ytick={0,5,...,30}, + scaled ticks=false, + axis lines=left, + width=5cm, + height=4cm, + legend entries={JavaScript}, + legend pos=north west, + legend cell align=left] +\addplot[red,mark=*, mark options={fill=white}] table {re-js.data}; +\end{axis} +\end{tikzpicture} + & +\begin{tikzpicture} +\begin{axis}[ + xlabel={$n$}, + x label style={at={(1.05,-0.05)}}, + %ylabel={time in secs}, + enlargelimits=false, + xtick={0,5,...,30}, + xmax=33, + ymax=35, + ytick={0,5,...,30}, + scaled ticks=false, + axis lines=left, + width=5cm, + height=4cm, + legend entries={Python}, + legend pos=north west, + legend cell align=left] +\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data}; +\end{axis} +\end{tikzpicture} + & +\begin{tikzpicture} +\begin{axis}[ + xlabel={$n$}, + x label style={at={(1.05,-0.05)}}, + %ylabel={time in secs}, + enlargelimits=false, + xtick={0,5,...,30}, + xmax=33, + ymax=35, + ytick={0,5,...,30}, + scaled ticks=false, + axis lines=left, + width=5cm, + height=4cm, + legend entries={Java 8}, + legend pos=north west, + legend cell align=left] +\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data}; +\end{axis} +\end{tikzpicture}\\ +\multicolumn{3}{c}{Graphs: Runtime for matching $(a^*)^*\,b$ with strings + of the form $\underbrace{aa..a}_{n}$.} +\end{tabular} +\end{center} + +\noindent These are clearly abysmal and possibly surprising results. +One would expect these systems doing much better than that---after +all, given a DFA and a string, whether a string is matched by this DFA +should be linear. + +Admittedly, the regular expression $(a^*)^*\,b$ is carefully chosen to +exhibit this ``exponential behaviour''. Unfortunately, such regular +expressions are not just a few ``outliers'', but actually they are +frequent enough that a separate name has been created for +them---\emph{evil regular expressions}. In empiric work, Davis et al +report that they have found thousands of such evil regular expressions +in the JavaScript and Python ecosystems \cite{Davis18}. + +This exponential blowup sometimes causes real pain in real life: +for example on 20 July 2016 one evil regular expression brought the +webpage \href{http://stackexchange.com}{Stack Exchange} to its knees \footnote{https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}. +In this instance, a regular expression intended to just trim white +spaces from the beginning and the end of a line actually consumed +massive amounts of CPU-resources and because of this the web servers +ground to a halt. 
This happened when a post with 20,000 white spaces +was submitted, but importantly the white spaces were neither at the +beginning nor at the end. As a result, the regular expression matching +engine needed to backtrack over many choices. + +The underlying problem is that many ``real life'' regular expression +matching engines do not use DFAs for matching. This is because they +support regular expressions that are not covered by the classical +automata theory, and in this more general setting there are quite a +few research questions still unanswered and fast algorithms still need +to be developed. + +There is also another under-researched problem to do with regular +expressions and lexing, i.e.~the process of breaking up strings into +sequences of tokens according to some regular expressions. In this +setting one is not just interested in whether or not a regular +expression matches a string, but if it matches also in \emph{how} it +matches the string. Consider for example a regular expression +$r_{key}$ for recognising keywords such as \textit{if}, \textit{then} +and so on; and a regular expression $r_{id}$ for recognising +identifiers (say, a single character followed by characters or +numbers). One can then form the compound regular expression +$(r_{key} + r_{id})^*$ and use it to tokenise strings. But then how +should the string \textit{iffoo} be tokenised? It could be tokenised +as a keyword followed by an identifier, or the entire string as a +single identifier. Similarly, how should the string \textit{if} be +tokenised? Both regular expressions, $r_{key}$ and $r_{id}$, would +``fire''---so is it an identifier or a keyword? While in applications +there is a well-known strategy to decide these questions, called POSIX +matching, only relatively recently precise definitions of what POSIX +matching actually means have been formalised +\cite{AusafDyckhoffUrban2016,OkuiSuzuki2010,Vansummeren2006}. Roughly, +POSIX matching means matching the longest initial substring and +in the case of a tie, the initial submatch is chosen according to some priorities attached to the +regular expressions (e.g.~keywords have a higher priority than +identifiers). This sounds rather simple, but according to Grathwohl et +al \cite[Page 36]{CrashCourse2014} this is not the case. They wrote: + +\begin{quote} +\it{}``The POSIX strategy is more complicated than the greedy because of +the dependence on information about the length of matched strings in the +various subexpressions.'' +\end{quote} + +\noindent +This is also supported by evidence collected by Kuklewicz +\cite{Kuklewicz} who noticed that a number of POSIX regular expression +matchers calculate incorrect results. + +Our focus is on an algorithm introduced by Sulzmann and Lu in 2014 for +regular expression matching according to the POSIX strategy +\cite{Sulzmann2014}. Their algorithm is based on an older algorithm by +Brzozowski from 1964 where he introduced the notion of derivatives of +regular expressions \cite{Brzozowski1964}. We shall briefly explain +the algorithms next. 
+ +\section{The Algorithms by Brzozowski, and Sulzmann and Lu} + +Suppose regular expressions are given by the following grammar (for +the moment ignore the grammar for values on the right-hand side): + + +\begin{center} + \begin{tabular}{c@{\hspace{20mm}}c} + \begin{tabular}{@{}rrl@{}} + \multicolumn{3}{@{}l}{\textbf{Regular Expressions}}\medskip\\ + $r$ & $::=$ & $\ZERO$\\ + & $\mid$ & $\ONE$ \\ + & $\mid$ & $c$ \\ + & $\mid$ & $r_1 \cdot r_2$\\ + & $\mid$ & $r_1 + r_2$ \\ + \\ + & $\mid$ & $r^*$ \\ + \end{tabular} + & + \begin{tabular}{@{\hspace{0mm}}rrl@{}} + \multicolumn{3}{@{}l}{\textbf{Values}}\medskip\\ + $v$ & $::=$ & \\ + & & $\Empty$ \\ + & $\mid$ & $\Char(c)$ \\ + & $\mid$ & $\Seq\,v_1\, v_2$\\ + & $\mid$ & $\Left(v)$ \\ + & $\mid$ & $\Right(v)$ \\ + & $\mid$ & $\Stars\,[v_1,\ldots\,v_n]$ \\ + \end{tabular} + \end{tabular} +\end{center} + +\noindent +The intended meaning of the regular expressions is as usual: $\ZERO$ +cannot match any string, $\ONE$ can match the empty string, the +character regular expression $c$ can match the character $c$, and so +on. The brilliant contribution by Brzozowski is the notion of +\emph{derivatives} of regular expressions. The idea behind this +notion is as follows: suppose a regular expression $r$ can match a +string of the form $c\!::\! s$ (that is a list of characters starting +with $c$), what does the regular expression look like that can match +just $s$? Brzozowski gave a neat answer to this question. He defined +the following operation on regular expressions, written +$r\backslash c$ (the derivative of $r$ w.r.t.~the character $c$): + +\begin{center} +\begin{tabular}{lcl} + $\ZERO \backslash c$ & $\dn$ & $\ZERO$\\ + $\ONE \backslash c$ & $\dn$ & $\ZERO$\\ + $d \backslash c$ & $\dn$ & + $\mathit{if} \;c = d\;\mathit{then}\;\ONE\;\mathit{else}\;\ZERO$\\ +$(r_1 + r_2)\backslash c$ & $\dn$ & $r_1 \backslash c \,+\, r_2 \backslash c$\\ +$(r_1 \cdot r_2)\backslash c$ & $\dn$ & $\mathit{if} \nullable(r_1)$\\ + & & $\mathit{then}\;(r_1\backslash c) \cdot r_2 \,+\, r_2\backslash c$\\ + & & $\mathit{else}\;(r_1\backslash c) \cdot r_2$\\ + $(r^*)\backslash c$ & $\dn$ & $(r\backslash c) \cdot r^*$\\ +\end{tabular} +\end{center} + +\noindent +In this definition $\nullable(\_)$ stands for a simple recursive +function that tests whether a regular expression can match the empty +string (its definition is omitted). Assuming the classic notion of a +\emph{language} of a regular expression, written $L(\_)$, the main +property of the derivative operation is that + +\begin{center} +$c\!::\!s \in L(r)$ holds +if and only if $s \in L(r\backslash c)$. +\end{center} + +\noindent +The beauty of derivatives is that they lead to a really simple regular +expression matching algorithm: To find out whether a string $s$ +matches with a regular expression $r$, build the derivatives of $r$ +w.r.t.\ (in succession) all the characters of the string $s$. Finally, +test whether the resulting regular expression can match the empty +string. If yes, then $r$ matches $s$, and no in the negative +case. For us the main advantage is that derivatives can be +straightforwardly implemented in any functional programming language, +and are easily definable and reasoned about in theorem provers---the +definitions just consist of inductive datatypes and simple recursive +functions. 
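For illustration, here is a minimal sketch of this matcher in Scala (our own rendering of the definitions above; the constructor and function names are ours and not taken from \cite{Brzozowski1964} or \cite{Sulzmann2014}):

\begin{verbatim}
// regular expressions as an inductive datatype
abstract class Rexp
case object ZERO extends Rexp                    // matches no string
case object ONE extends Rexp                     // matches the empty string
case class CHAR(c: Char) extends Rexp
case class ALT(r1: Rexp, r2: Rexp) extends Rexp
case class SEQ(r1: Rexp, r2: Rexp) extends Rexp
case class STAR(r: Rexp) extends Rexp

// can r match the empty string?
def nullable(r: Rexp): Boolean = r match {
  case ZERO        => false
  case ONE         => true
  case CHAR(_)     => false
  case ALT(r1, r2) => nullable(r1) || nullable(r2)
  case SEQ(r1, r2) => nullable(r1) && nullable(r2)
  case STAR(_)     => true
}

// Brzozowski derivative of r w.r.t. the character c
def der(c: Char, r: Rexp): Rexp = r match {
  case ZERO        => ZERO
  case ONE         => ZERO
  case CHAR(d)     => if (c == d) ONE else ZERO
  case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))
  case SEQ(r1, r2) =>
    if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
    else SEQ(der(c, r1), r2)
  case STAR(r1)    => SEQ(der(c, r1), STAR(r1))
}

// derivative w.r.t. a whole string, and the resulting matcher
def ders(s: List[Char], r: Rexp): Rexp = s match {
  case Nil     => r
  case c :: cs => ders(cs, der(c, r))
}

def matcher(r: Rexp, s: String): Boolean = nullable(ders(s.toList, r))
\end{verbatim}

\noindent
For example, \texttt{matcher(SEQ(STAR(CHAR('a')), CHAR('b')), "aab")} evaluates to \texttt{true}.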
Moreover, the notion of derivatives can be easily +generalised to cover extended regular expression constructors such as +the not-regular expression, written $\neg\,r$, or bounded repetitions +(for example $r^{\{n\}}$ and $r^{\{n..m\}}$), which cannot be so +straightforwardly realised within the classic automata approach. + + +One limitation, however, of Brzozowski's algorithm is that it only +produces a YES/NO answer for whether a string is being matched by a +regular expression. Sulzmann and Lu~\cite{Sulzmann2014} extended this +algorithm to allow generation of an actual matching, called a +\emph{value}---see the grammar above for its definition. Assuming a +regular expression matches a string, values encode the information of +\emph{how} the string is matched by the regular expression---that is, +which part of the string is matched by which part of the regular +expression. To illustrate this consider the string $xy$ and the +regular expression $(x + (y + xy))^*$. We can view this regular +expression as a tree and if the string $xy$ is matched by two Star +``iterations'', then the $x$ is matched by the left-most alternative +in this tree and the $y$ by the right-left alternative. This suggests +to record this matching as + +\begin{center} +$\Stars\,[\Left\,(\Char\,x), \Right(\Left(\Char\,y))]$ +\end{center} + +\noindent +where $\Stars$ records how many +iterations were used; and $\Left$, respectively $\Right$, which +alternative is used. The value for +matching $xy$ in a single ``iteration'', i.e.~the POSIX value, +would look as follows + +\begin{center} +$\Stars\,[\Seq\,(\Char\,x)\,(\Char\,y)]$ +\end{center} + +\noindent +where $\Stars$ has only a single-element list for the single iteration +and $\Seq$ indicates that $xy$ is matched by a sequence regular +expression. + +The contribution of Sulzmann and Lu is an extension of Brzozowski's +algorithm by a second phase (the first phase being building successive +derivatives). In this second phase, for every successful match the +corresponding POSIX value is computed. As mentioned before, from this +value we can extract the information \emph{how} a regular expression +matched a string. We omit the details here on how Sulzmann and Lu +achieved this~(see \cite{Sulzmann2014}). Rather, we shall focus next on the +process of simplification of regular expressions, which is needed in +order to obtain \emph{fast} versions of the Brzozowski's, and Sulzmann +and Lu's algorithms. This is where the PhD-project hopes to advance +the state-of-the-art. + + +\section{Simplification of Regular Expressions} + +The main drawback of building successive derivatives according to +Brzozowski's definition is that they can grow very quickly in size. +This is mainly due to the fact that the derivative operation generates +often ``useless'' $\ZERO$s and $\ONE$s in derivatives. As a result, +if implemented naively both algorithms by Brzozowski and by Sulzmann +and Lu are excruciatingly slow. For example when starting with the +regular expression $(a + aa)^*$ and building 12 successive derivatives +w.r.t.~the character $a$, one obtains a derivative regular expression +with more than 8000 nodes (when viewed as a tree). Operations like +derivative and $\nullable$ need to traverse such trees and +consequently the bigger the size of the derivative the slower the +algorithm. Fortunately, one can simplify regular expressions after +each derivative step. 
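The blowup mentioned above is easy to observe with the sketch from the previous section; the \texttt{size} function below is our own and simply counts the nodes of a regular expression viewed as a tree:

\begin{verbatim}
// number of nodes of a regular expression, viewed as a tree
def size(r: Rexp): Int = r match {
  case ZERO | ONE | CHAR(_) => 1
  case ALT(r1, r2)          => 1 + size(r1) + size(r2)
  case SEQ(r1, r2)          => 1 + size(r1) + size(r2)
  case STAR(r1)             => 1 + size(r1)
}

// (a + aa)* derived 12 times w.r.t. the character a
val r   = STAR(ALT(CHAR('a'), SEQ(CHAR('a'), CHAR('a'))))
val d12 = ders(List.fill(12)('a'), r)
println(size(d12))   // more than 8000 nodes without simplification
\end{verbatim}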
Various simplifications of regular expressions +are possible, such as the simplifications of $\ZERO + r$, +$r + \ZERO$, $\ONE\cdot r$, $r \cdot \ONE$, and $r + r$ to just +$r$. These simplifications do not affect the answer for whether a +regular expression matches a string or not, but fortunately also do +not affect the POSIX strategy of how regular expressions match +strings---although the latter is much harder to establish. Some +initial results in this regard have been obtained in +\cite{AusafDyckhoffUrban2016}. However, what has not been achieved yet +is a very tight bound for the size. Such a tight bound is suggested by +work of Antimirov who proved that (partial) derivatives can be bound +by the number of characters contained in the initial regular +expression \cite{Antimirov95}. We believe, and have generated test +data, that a similar bound can be obtained for the derivatives in +Sulzmann and Lu's algorithm. Let us give some details about this next. + +We first followed Sulzmann and Lu's idea of introducing +\emph{annotated regular expressions}~\cite{Sulzmann2014}. They are +defined by the following grammar: + +\begin{center} +\begin{tabular}{lcl} + $\textit{a}$ & $::=$ & $\textit{ZERO}$\\ + & $\mid$ & $\textit{ONE}\;\;bs$\\ + & $\mid$ & $\textit{CHAR}\;\;bs\,c$\\ + & $\mid$ & $\textit{ALTS}\;\;bs\,as$\\ + & $\mid$ & $\textit{SEQ}\;\;bs\,a_1\,a_2$\\ + & $\mid$ & $\textit{STAR}\;\;bs\,a$ +\end{tabular} +\end{center} + +\noindent +where $bs$ stands for bitsequences, and $as$ (in \textit{ALTS}) for a +list of annotated regular expressions. These bitsequences encode +information about the (POSIX) value that should be generated by the +Sulzmann and Lu algorithm. There are operations that can transform the +usual (un-annotated) regular expressions into annotated regular +expressions, and there are operations for encoding/decoding values to +or from bitsequences. For example the encoding function for values is +defined as follows: + +\begin{center} +\begin{tabular}{lcl} + $\textit{code}(\Empty)$ & $\dn$ & $[]$\\ + $\textit{code}(\Char\,c)$ & $\dn$ & $[]$\\ + $\textit{code}(\Left\,v)$ & $\dn$ & $\Z :: code(v)$\\ + $\textit{code}(\Right\,v)$ & $\dn$ & $\S :: code(v)$\\ + $\textit{code}(\Seq\,v_1\,v_2)$ & $\dn$ & $code(v_1) \,@\, code(v_2)$\\ + $\textit{code}(\Stars\,[])$ & $\dn$ & $[\S]$\\ + $\textit{code}(\Stars\,(v\!::\!vs))$ & $\dn$ & $\Z :: code(v) \;@\; + code(\Stars\,vs)$ +\end{tabular} +\end{center} + +\noindent +where $\Z$ and $\S$ are arbitrary names for the ``bits'' in the +bitsequences. Although this encoding is ``lossy'' in the sense of not +recording all details of a value, Sulzmann and Lu have defined the +decoding function such that with additional information (namely the +corresponding regular expression) a value can be precisely extracted +from a bitsequence. + +The main point of the bitsequences and annotated regular expressions +is that we can apply rather aggressive (in terms of size) +simplification rules in order to keep derivatives small. We have +developed such ``aggressive'' simplification rules and generated test +data that show that the expected bound can be achieved. Obviously we +could only partially cover the search space as there are infinitely +many regular expressions and strings. One modification we introduced +is to allow a list of annotated regular expressions in the +\textit{ALTS} constructor. 
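To give an idea of what such a simplification step can look like, here is a sketch on plain (un-annotated) regular expressions with a list-based alternative constructor; in the bitcoded setting the bitsequences have to be taken along as well, which is omitted here:

\begin{verbatim}
// alternatives over a list of regular expressions (bitsequences omitted)
case class ALTS(rs: List[Rexp]) extends Rexp

// splice in nested alternatives and drop ZEROs
def flts(rs: List[Rexp]): List[Rexp] = rs flatMap {
  case ALTS(rs1) => flts(rs1)
  case ZERO      => Nil
  case r         => List(r)
}

// keep only the first copy of each alternative (cf. r + r = r)
def simpALTS(rs: List[Rexp]): Rexp = flts(rs).distinct match {
  case Nil     => ZERO
  case List(r) => r
  case rs1     => ALTS(rs1)
}
\end{verbatim}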
This allows us to delete not only
unnecessary $\ZERO$s and $\ONE$s from regular expressions, but also
unnecessary ``copies'' of regular expressions (very similar to
simplifying $r + r$ to just $r$, but in a more general
setting). Another modification is that we use simplification rules
inspired by Antimirov's work on partial derivatives. They maintain the
idea that only the first ``copy'' of a regular expression in an
alternative contributes to the calculation of a POSIX value. All
subsequent copies can be pruned from the regular expression.

We are currently engaged in proving that our simplification rules
actually do not affect the POSIX value that should be generated by the
algorithm according to the specification of a POSIX value, and
furthermore that the derivatives stay small for all inputs. For
this proof we use the theorem prover Isabelle. Once completed, this
result will advance the state-of-the-art: Sulzmann and Lu wrote in
their paper \cite{Sulzmann2014} about the bitcoded ``incremental
parsing method'' (that is the matching algorithm outlined in this
section):

\begin{quote}\it
  ``Correctness Claim: We further claim that the incremental parsing
  method in Figure~5 in combination with the simplification steps in
  Figure~6 yields POSIX parse trees. We have tested this claim
  extensively by using the method in Figure~3 as a reference but yet
  have to work out all proof details.''
\end{quote}

\noindent
We would settle this correctness claim and furthermore obtain a much
tighter bound on the sizes of derivatives. The result is that our
algorithm should be correct and fast on all inputs. The original
blow-up, as observed in JavaScript, Python and Java, would then be
ruled out for our algorithm.

\section{Conclusion}

In this PhD-project we are interested in fast algorithms for regular
expression matching. While this seems to be a ``settled'' area,
interesting research questions pop up as soon as one steps
outside the classic automata theory (for example in terms of what kind
of regular expressions are supported). The reason why it is
interesting for us to look at the derivative approach introduced by
Brzozowski for regular expression matching, and then much further
developed by Sulzmann and Lu, is that derivatives can elegantly deal
with some of the regular expressions that are of interest in ``real
life''. This includes the not-regular expression, written $\neg\,r$
(that is all strings that are not recognised by $r$), but also bounded
regular expressions such as $r^{\{n\}}$ and $r^{\{n..m\}}$. There is
also hope that the derivatives can provide another angle for how to
deal more efficiently with back-references, which are one of the
reasons why regular expression engines in JavaScript, Python and Java
choose not to implement the classic automata approach of transforming
regular expressions into NFAs and then DFAs---because we simply do not
know how such back-references can be represented by DFAs.
+ + +\bibliographystyle{plain} +\bibliography{root} + + +\end{document} diff -r a28f1d5daa34 -r 1a92233763bd 9ms/graphic.sty --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/9ms/graphic.sty Sat Jun 29 12:28:49 2019 +0100 @@ -0,0 +1,12 @@ +\usepackage{tikz} +%\usepackage{pgf} +%\usetikzlibrary{positioning} +%\usetikzlibrary{calc} +%\usetikzlibrary{automata} +%\usetikzlibrary{arrows} +%\usetikzlibrary{backgrounds} +%\usetikzlibrary{fit} +%\usepackage{tikz-qtree} +\usepackage{pgfplots} + +%\pgfplotsset{compat=1.15} diff -r a28f1d5daa34 -r 1a92233763bd 9ms/lipics-logo-bw.pdf Binary file 9ms/lipics-logo-bw.pdf has changed diff -r a28f1d5daa34 -r 1a92233763bd 9ms/root.bib --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/9ms/root.bib Sat Jun 29 12:28:49 2019 +0100 @@ -0,0 +1,317 @@ +%% This BibTeX bibliography file was created using BibDesk. +%% https://bibdesk.sourceforge.io/ + +%% Created for CS TAN at 2019-06-26 17:07:31 +0100 + + +%% Saved with string encoding Unicode (UTF-8) + + + +@misc{SE16, + Author = {StackStatus}, + Date-Added = {2019-06-26 11:28:41 +0000}, + Date-Modified = {2019-06-26 16:07:31 +0000}, + Keywords = {ReDos Attack}, + Month = {July}, + Rating = {5}, + Read = {1}, + Title = {Stack Overflow Outage Postmortem}, + Url = {https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}, + Year = {2016}, + Bdsk-Url-1 = {https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}} + +@article{HosoyaVouillonPierce2005, + Author = {H.~Hosoya and J.~Vouillon and B.~C.~Pierce}, + Journal = {ACM Transactions on Programming Languages and Systems (TOPLAS)}, + Number = 1, + Pages = {46--90}, + Title = {{R}egular {E}xpression {T}ypes for {XML}}, + Volume = 27, + Year = {2005}} + +@misc{POSIX, + Note = {\url{http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html}}, + Title = {{T}he {O}pen {G}roup {B}ase {S}pecification {I}ssue 6 {IEEE} {S}td 1003.1 2004 {E}dition}, + Year = {2004}} + +@inproceedings{AusafDyckhoffUrban2016, + Author = {F.~Ausaf and R.~Dyckhoff and C.~Urban}, + Booktitle = {Proc.~of the 7th International Conference on Interactive Theorem Proving (ITP)}, + Pages = {69--86}, + Series = {LNCS}, + Title = {{POSIX} {L}exing with {D}erivatives of {R}egular {E}xpressions ({P}roof {P}earl)}, + Volume = {9807}, + Year = {2016}} + +@article{aduAFP16, + Author = {F.~Ausaf and R.~Dyckhoff and C.~Urban}, + Issn = {2150-914x}, + Journal = {Archive of Formal Proofs}, + Note = {\url{http://www.isa-afp.org/entries/Posix-Lexing.shtml}, Formal proof development}, + Title = {{POSIX} {L}exing with {D}erivatives of {R}egular {E}xpressions}, + Year = 2016} + +@techreport{CrashCourse2014, + Annote = {draft report}, + Author = {N.~B.~B.~Grathwohl and F.~Henglein and U.~T.~Rasmussen}, + Institution = {University of Copenhagen}, + Title = {{A} {C}rash-{C}ourse in {R}egular {E}xpression {P}arsing and {R}egular {E}xpressions as {T}ypes}, + Year = {2014}} + +@inproceedings{Sulzmann2014, + Author = {M.~Sulzmann and K.~Lu}, + Booktitle = {Proc.~of the 12th International Conference on Functional and Logic Programming (FLOPS)}, + Pages = {203--220}, + Series = {LNCS}, + Title = {{POSIX} {R}egular {E}xpression {P}arsing with {D}erivatives}, + Volume = {8475}, + Year = {2014}} + +@inproceedings{Sulzmann2014b, + Author = {M.~Sulzmann and P.~van Steenhoven}, + Booktitle = {Proc.~of the 23rd International Conference on Compiler Construction (CC)}, + Pages = {174--191}, + Series = {LNCS}, + Title = {{A} {F}lexible and {E}fficient {ML} {L}exer {T}ool {B}ased on 
{E}xtended {R}egular {E}xpression {S}ubmatching}, + Volume = {8409}, + Year = {2014}} + +@book{Pierce2015, + Author = {B.~C.~Pierce and C.~Casinghino and M.~Gaboardi and M.~Greenberg and C.~Hri\c{t}cu and V.~Sj\"{o}berg and B.~Yorgey}, + Note = {\url{http://www.cis.upenn.edu/~bcpierce/sf}}, + Publisher = {Electronic textbook}, + Title = {{S}oftware {F}oundations}, + Year = {2015}} + +@misc{Kuklewicz, + Author = {C.~Kuklewicz}, + Howpublished = {\url{https://wiki.haskell.org/Regex_Posix}}, + Title = {{R}egex {P}osix}} + +@article{Vansummeren2006, + Author = {S.~Vansummeren}, + Journal = {ACM Transactions on Programming Languages and Systems}, + Number = {3}, + Pages = {389--428}, + Title = {{T}ype {I}nference for {U}nique {P}attern {M}atching}, + Volume = {28}, + Year = {2006}} + +@inproceedings{Asperti12, + Author = {A.~Asperti}, + Booktitle = {Proc.~of the 3rd International Conference on Interactive Theorem Proving (ITP)}, + Pages = {283--298}, + Series = {LNCS}, + Title = {{A} {C}ompact {P}roof of {D}ecidability for {R}egular {E}xpression {E}quivalence}, + Volume = {7406}, + Year = {2012}} + +@inproceedings{Frisch2004, + Author = {A.~Frisch and L.~Cardelli}, + Booktitle = {Proc.~of the 31st International Conference on Automata, Languages and Programming (ICALP)}, + Pages = {618--629}, + Series = {LNCS}, + Title = {{G}reedy {R}egular {E}xpression {M}atching}, + Volume = {3142}, + Year = {2004}} + +@article{Antimirov95, + Author = {V.~Antimirov}, + Journal = {Theoretical Computer Science}, + Pages = {291--319}, + Title = {{P}artial {D}erivatives of {R}egular {E}xpressions and {F}inite {A}utomata {C}onstructions}, + Volume = {155}, + Year = {1995}} + +@inproceedings{Nipkow98, + Author = {T.~Nipkow}, + Booktitle = {Proc.~of the 11th International Conference on Theorem Proving in Higher Order Logics (TPHOLs)}, + Pages = {1--15}, + Series = {LNCS}, + Title = {{V}erified {L}exical {A}nalysis}, + Volume = 1479, + Year = 1998} + +@article{Brzozowski1964, + Author = {J.~A.~Brzozowski}, + Journal = {Journal of the {ACM}}, + Number = {4}, + Pages = {481--494}, + Title = {{D}erivatives of {R}egular {E}xpressions}, + Volume = {11}, + Year = {1964}} + +@article{Leroy2009, + Author = {X.~Leroy}, + Journal = {Communications of the ACM}, + Number = 7, + Pages = {107--115}, + Title = {{F}ormal {V}erification of a {R}ealistic {C}ompiler}, + Volume = 52, + Year = 2009} + +@inproceedings{Paulson2015, + Author = {L.~C.~Paulson}, + Booktitle = {Proc.~of the 25th International Conference on Automated Deduction (CADE)}, + Pages = {231--245}, + Series = {LNAI}, + Title = {{A} {F}ormalisation of {F}inite {A}utomata {U}sing {H}ereditarily {F}inite {S}ets}, + Volume = {9195}, + Year = {2015}} + +@article{Wu2014, + Author = {C.~Wu and X.~Zhang and C.~Urban}, + Journal = {Journal of Automatic Reasoning}, + Number = {4}, + Pages = {451--480}, + Title = {{A} {F}ormalisation of the {M}yhill-{N}erode {T}heorem based on {R}egular {E}xpressions}, + Volume = {52}, + Year = {2014}} + +@inproceedings{Regehr2011, + Author = {X.~Yang and Y.~Chen and E.~Eide and J.~Regehr}, + Booktitle = {Proc.~of the 32nd ACM SIGPLAN Conference on Programming Language Design and Implementation (PLDI)}, + Pages = {283--294}, + Title = {{F}inding and {U}nderstanding {B}ugs in {C} {C}ompilers}, + Year = {2011}} + +@article{Norrish2014, + Author = {A.~Barthwal and M.~Norrish}, + Journal = {Journal of Computer and System Sciences}, + Number = {2}, + Pages = {346--362}, + Title = {{A} {M}echanisation of {S}ome {C}ontext-{F}ree {L}anguage {T}heory in 
{HOL4}}, + Volume = {80}, + Year = {2014}} + +@article{Thompson1968, + Author = {K.~Thompson}, + Issue_Date = {June 1968}, + Journal = {Communications of the ACM}, + Number = {6}, + Pages = {419--422}, + Title = {{P}rogramming {T}echniques: {R}egular {E}xpression {S}earch {A}lgorithm}, + Volume = {11}, + Year = {1968}} + +@article{Owens2009, + Author = {S.~Owens and J.~H.~Reppy and A.~Turon}, + Journal = {Journal of Functinal Programming}, + Number = {2}, + Pages = {173--190}, + Title = {{R}egular-{E}xpression {D}erivatives {R}e-{E}xamined}, + Volume = {19}, + Year = {2009}} + +@inproceedings{Sulzmann2015, + Author = {M.~Sulzmann and P.~Thiemann}, + Booktitle = {Proc.~of the 9th International Conference on Language and Automata Theory and Applications (LATA)}, + Pages = {275--286}, + Series = {LNCS}, + Title = {{D}erivatives for {R}egular {S}huffle {E}xpressions}, + Volume = {8977}, + Year = {2015}} + +@inproceedings{Chen2012, + Author = {H.~Chen and S.~Yu}, + Booktitle = {Proc.~in the International Workshop on Theoretical Computer Science (WTCS)}, + Pages = {343--356}, + Series = {LNCS}, + Title = {{D}erivatives of {R}egular {E}xpressions and an {A}pplication}, + Volume = {7160}, + Year = {2012}} + +@article{Krauss2011, + Author = {A.~Krauss and T.~Nipkow}, + Journal = {Journal of Automated Reasoning}, + Pages = {95--106}, + Title = {{P}roof {P}earl: {R}egular {E}xpression {E}quivalence and {R}elation {A}lgebra}, + Volume = 49, + Year = 2012} + +@inproceedings{Traytel2015, + Author = {D.~Traytel}, + Booktitle = {Proc.~of the 24th Annual Conference on Computer Science Logic (CSL)}, + Pages = {487--503}, + Series = {LIPIcs}, + Title = {{A} {C}oalgebraic {D}ecision {P}rocedure for {WS1S}}, + Volume = {41}, + Year = {2015}} + +@inproceedings{Traytel2013, + Author = {D.~Traytel and T.~Nipkow}, + Booktitle = {Proc.~of the 18th ACM SIGPLAN International Conference on Functional Programming (ICFP)}, + Pages = {3-12}, + Title = {{A} {V}erified {D}ecision {P}rocedure for {MSO} on {W}ords {B}ased on {D}erivatives of {R}egular {E}xpressions}, + Year = 2013} + +@inproceedings{Coquand2012, + Author = {T.~Coquand and V.~Siles}, + Booktitle = {Proc.~of the 1st International Conference on Certified Programs and Proofs (CPP)}, + Pages = {119--134}, + Series = {LNCS}, + Title = {{A} {D}ecision {P}rocedure for {R}egular {E}xpression {E}quivalence in {T}ype {T}heory}, + Volume = {7086}, + Year = {2011}} + +@inproceedings{Almeidaetal10, + Author = {J.~B.~Almeida and N.~Moriera and D.~Pereira and S.~M.~de Sousa}, + Booktitle = {Proc.~of the 15th International Conference on Implementation and Application of Automata (CIAA)}, + Pages = {59-68}, + Series = {LNCS}, + Title = {{P}artial {D}erivative {A}utomata {F}ormalized in {C}oq}, + Volume = {6482}, + Year = {2010}} + +@article{Owens2008, + Author = {S.~Owens and K.~Slind}, + Journal = {Higher-Order and Symbolic Computation}, + Number = {4}, + Pages = {377--409}, + Title = {{A}dapting {F}unctional {P}rograms to {H}igher {O}rder {L}ogic}, + Volume = {21}, + Year = {2008}} + +@article{Owens2, + Author = {S.~Owens and K.~Slind}, + Bibsource = {dblp computer science bibliography, http://dblp.org}, + Biburl = {http://dblp.uni-trier.de/rec/bib/journals/lisp/OwensS08}, + Doi = {10.1007/s10990-008-9038-0}, + Journal = {Higher-Order and Symbolic Computation}, + Number = {4}, + Pages = {377--409}, + Timestamp = {Wed, 16 Dec 2009 13:51:02 +0100}, + Title = {Adapting functional programs to higher order logic}, + Url = {http://dx.doi.org/10.1007/s10990-008-9038-0}, + Volume = 
{21},
  Year = {2008},
  Bdsk-Url-1 = {http://dx.doi.org/10.1007/s10990-008-9038-0}}

@misc{PCRE,
  Title = {{PCRE - Perl Compatible Regular Expressions}},
  Url = {http://www.pcre.org},
  Bdsk-Url-1 = {http://www.pcre.org}}

@inproceedings{OkuiSuzuki2010,
  Author = {S.~Okui and T.~Suzuki},
  Booktitle = {Proc.~of the 15th International Conference on Implementation and Application of Automata (CIAA)},
  Pages = {231--240},
  Series = {LNCS},
  Title = {{D}isambiguation in {R}egular {E}xpression {M}atching via {P}osition {A}utomata with {A}ugmented {T}ransitions},
  Volume = {6482},
  Year = {2010}}

@techreport{OkuiSuzukiTech,
  Author = {S.~Okui and T.~Suzuki},
  Institution = {University of Aizu},
  Title = {{D}isambiguation in {R}egular {E}xpression {M}atching via {P}osition {A}utomata with {A}ugmented {T}ransitions},
  Year = {2013}}

@inproceedings{Davis18,
  Author = {J.~C.~Davis and C.~A.~Coghlan and F.~Servant and D.~Lee},
  Booktitle = {Proc.~of the 26th ACM Joint Meeting on European Software Engineering Conference and Symposium on the Foundations of Software Engineering (ESEC/FSE)},
  Numpages = {11},
  Pages = {246--256},
  Title = {{T}he {I}mpact of {R}egular {E}xpression {D}enial of {S}ervice ({ReDoS}) in {P}ractice: {A}n {E}mpirical {S}tudy at the {E}cosystem {S}cale},
  Year = {2018}}