# HG changeset patch
# User Chengsong
# Date 1646262839 0
# Node ID a73b2e553804bb8dadd6e1337ca4e82063da1907
# Parent 65e786a583652298dfeb4101da09bc1fd8a5e998
templateforPhd

diff -r 65e786a58365 -r a73b2e553804 ChengsongPhdThesis/ChengsongPhDThesis.tex
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ChengsongPhdThesis/ChengsongPhDThesis.tex Wed Mar 02 23:13:59 2022 +0000
@@ -0,0 +1,1650 @@
\documentclass[a4paper,UKenglish]{lipics}
\usepackage{graphic}
\usepackage{data}
\usepackage{tikz-cd}
%\usepackage{algorithm}
\usepackage{amsmath}
\usepackage[noend]{algpseudocode}
\usepackage{enumitem}
\usepackage{nccmath}

\usetikzlibrary{positioning}

\definecolor{darkblue}{rgb}{0,0,0.6}
\hypersetup{colorlinks=true,allcolors=darkblue}
\newcommand{\comment}[1]%
{{\color{red}$\Rightarrow$}\marginpar{\raggedright\small{\bf\color{red}#1}}}

% \documentclass{article}
%\usepackage[utf8]{inputenc}
%\usepackage[english]{babel}
%\usepackage{listings}
% \usepackage{amsthm}
%\usepackage{hyperref}
% \usepackage[margin=0.5in]{geometry}
%\usepackage{pmboxdraw}

\title{POSIX Regular Expression Matching and Lexing}
\author{Chengsong Tan}
\affil{King's College London\\
London, UK\\
\texttt{chengsong.tan@kcl.ac.uk}}
\authorrunning{Chengsong Tan}
\Copyright{Chengsong Tan}

\newcommand{\dn}{\stackrel{\mbox{\scriptsize def}}{=}}%
\newcommand{\ZERO}{\mbox{\bf 0}}
\newcommand{\ONE}{\mbox{\bf 1}}
\def\lexer{\mathit{lexer}}
\def\mkeps{\mathit{mkeps}}

\def\inj{\mathit{inj}}
\def\Empty{\mathit{Empty}}
\def\Left{\mathit{Left}}
\def\Right{\mathit{Right}}
\def\Stars{\mathit{Stars}}
\def\Char{\mathit{Char}}
\def\Seq{\mathit{Seq}}
\def\Der{\mathit{Der}}
\def\nullable{\mathit{nullable}}
\def\Z{\mathit{Z}}
\def\S{\mathit{S}}


\def\awidth{\mathit{awidth}}
\def\pder{\mathit{pder}}
\def\maxterms{\mathit{maxterms}}
\def\bsimp{\mathit{bsimp}}

%\theoremstyle{theorem}
%\newtheorem{theorem}{Theorem}
%\theoremstyle{lemma}
%\newtheorem{lemma}{Lemma}
%\newcommand{\lemmaautorefname}{Lemma}
%\theoremstyle{definition}
%\newtheorem{definition}{Definition}
\algnewcommand\algorithmicswitch{\textbf{switch}}
\algnewcommand\algorithmiccase{\textbf{case}}
\algnewcommand\algorithmicassert{\texttt{assert}}
\algnewcommand\Assert[1]{\State \algorithmicassert(#1)}%
% New "environments"
\algdef{SE}[SWITCH]{Switch}{EndSwitch}[1]{\algorithmicswitch\ #1\ \algorithmicdo}{\algorithmicend\ \algorithmicswitch}%
\algdef{SE}[CASE]{Case}{EndCase}[1]{\algorithmiccase\ #1}{\algorithmicend\ \algorithmiccase}%
\algtext*{EndSwitch}%
\algtext*{EndCase}%


\begin{document}

\maketitle

\begin{abstract}
  Brzozowski introduced in 1964 a beautifully simple algorithm for
  regular expression matching based on the notion of derivatives of
  regular expressions. In 2014, Sulzmann and Lu extended this
  algorithm not just to give a YES/NO answer for whether or not a
  regular expression matches a string, but, in case it does match, to
  answer also with \emph{how} it matches the string. This is important
  for applications such as lexing (tokenising a string). The problem is
  to make the algorithm by Sulzmann and Lu fast on all inputs without
  breaking its correctness. We have already developed some
  simplification rules for this, but have not yet proved that they
  preserve the correctness of the algorithm. We also have not yet
  looked at extended regular expressions, such as bounded repetitions,
  negation and back-references.
\end{abstract}

\section{Introduction}
\subsection{Practical Example of Regex}

%TODO: read rules libraries and the explanation for some of the rules
Consider the task of matching some string $s$ against a regex such as

\begin{verbatim}
(?:(?:\"|'|\]|\}|\\|\d|
(?:nan|infinity|true|false|null|undefined|symbol|math)
|\`|\-|\+)+[)]*;?((?:\s|-|~|!|{}|\|\||\+)*.*(?:.*=.*)))
\end{verbatim}


%Could be from a network intrusion detection algorithm.
%Checking whether there is some malicious code
%in the network data blocks being routed.
%If so, discard the data and identify the sender for future alert.

\subsection{The problem: Efficient Matching and Lexing}
A programmer writes patterns to process texts: a regex is a
structured, symbolic pattern specifying what the strings of interest
should look like.

The above regex looks complicated, but it can be described by a few
basic constructs.


Suppose (basic) regular expressions are given by the following grammar:
\[			r ::=   \ZERO \mid  \ONE
			 \mid  c
			 \mid  r_1 \cdot r_2
			 \mid  r_1 + r_2
			 \mid r^*
\]

\noindent
The intended meaning of the constructors is as follows: $\ZERO$
cannot match any string, $\ONE$ can match the empty string, the
character regular expression $c$ can match the character $c$, and so
on.

The underlying algorithmic problem is:
\begin{center}
\begin{tabular}{lcr}
$\textit{Match}(r, s)$ & $ = $ & $\textit{if}\; s \in L(r)\; \textit{output} \; \textit{YES}$\\
& & $\textit{else} \; \textit{output} \; \textit{NO}$
\end{tabular}
\end{center}

Deciding whether a string is in the language of the regex
can be done intuitively by constructing an NFA~\cite{Thompson_1968}
and simulating its running:
\begin{figure}
\centering
\includegraphics[scale=0.5]{pics/regex_nfa_base.png}
\end{figure}

\begin{figure}
\centering
\includegraphics[scale=0.5]{pics/regex_nfa_seq1.png}
\end{figure}

\begin{figure}
\centering
\includegraphics[scale=0.5]{pics/regex_nfa_seq2.png}
\end{figure}

\begin{figure}
\centering
\includegraphics[scale=0.5]{pics/regex_nfa_alt.png}
\end{figure}


\begin{figure}
\centering
\includegraphics[scale=0.5]{pics/regex_nfa_star.png}
\end{figure}

This should be simple enough that modern regular expression engines
have no problem with it at all. Or is it?

Take $(a^*)^*\,b$ and ask whether
strings of the form $aa..a$ match this regular
expression. Obviously this is not the case---the expected $b$ in the last
position is missing. One would expect that modern regular expression
matching engines can find this out very quickly.
Alas, if one tries
this example in JavaScript, Python or Java 8 with strings like 28
$a$'s, one discovers that this decision takes around 30 seconds and
takes considerably longer when adding a few more $a$'s, as the graphs
below show:

\begin{center}
\begin{tabular}{@{}c@{\hspace{0mm}}c@{\hspace{0mm}}c@{}}
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    ylabel={time in secs},
    enlargelimits=false,
    xtick={0,5,...,30},
    xmax=33,
    ymax=35,
    ytick={0,5,...,30},
    scaled ticks=false,
    axis lines=left,
    width=5cm,
    height=4cm,
    legend entries={JavaScript},
    legend pos=north west,
    legend cell align=left]
\addplot[red,mark=*, mark options={fill=white}] table {re-js.data};
\end{axis}
\end{tikzpicture}
  &
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    %ylabel={time in secs},
    enlargelimits=false,
    xtick={0,5,...,30},
    xmax=33,
    ymax=35,
    ytick={0,5,...,30},
    scaled ticks=false,
    axis lines=left,
    width=5cm,
    height=4cm,
    legend entries={Python},
    legend pos=north west,
    legend cell align=left]
\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
\end{axis}
\end{tikzpicture}
  &
\begin{tikzpicture}
\begin{axis}[
    xlabel={$n$},
    x label style={at={(1.05,-0.05)}},
    %ylabel={time in secs},
    enlargelimits=false,
    xtick={0,5,...,30},
    xmax=33,
    ymax=35,
    ytick={0,5,...,30},
    scaled ticks=false,
    axis lines=left,
    width=5cm,
    height=4cm,
    legend entries={Java 8},
    legend pos=north west,
    legend cell align=left]
\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
\end{axis}
\end{tikzpicture}\\
\multicolumn{3}{c}{Graphs: Runtime for matching $(a^*)^*\,b$ with strings
           of the form $\underbrace{aa..a}_{n}$.}
\end{tabular}
\end{center}

Why? These engines use $\textit{NFA}$s that can backtrack.
%TODO: what does it mean to do DFS BFS on NFA's


Then how about determinisation?
\begin{itemize}
\item
Turning NFAs into DFAs can cause the size of the automata
to blow up exponentially.
\item
We often want to extract submatch information. For example, when
$r_1 \cdot r_2$ matches $s$, we want to know the split $s = s_1@s_2$
where $s_i$ corresponds to $r_i$ (say, $s_i$ might be the attacker's
IP address).
\item
Variants such as counting automata exist, but they are usually made
super fast only for a certain class of regexes, like bounded
repetitions:
\begin{verbatim}
.*a.{100}
\end{verbatim}
On many inputs this works very well, with good practical performance
on average (around 10MiB per second). But such engines cannot be super
fast on all combinations of regexes and strings, and can be imprecise
(incorrect) when it comes to more complex regexes.

\end{itemize}
%TODO: real world example?



\subsection{Derivatives}
Q:
Is there an efficient lexing algorithm with provable guarantees on
correctness and running time?
A: Brzozowski derivatives~\cite{Brzozowski1964}!

Brzozowski started with the definition of $\nullable$:
\begin{center}
		\begin{tabular}{lcl}
			$\nullable(\ZERO)$     & $\dn$ & $\mathit{false}$ \\
			$\nullable(\ONE)$      & $\dn$ & $\mathit{true}$ \\
			$\nullable(c)$ 	       & $\dn$ & $\mathit{false}$ \\
			$\nullable(r_1 + r_2)$ & $\dn$ & $\nullable(r_1) \vee \nullable(r_2)$ \\
			$\nullable(r_1\cdot r_2)$  & $\dn$ & $\nullable(r_1) \wedge \nullable(r_2)$ \\
			$\nullable(r^*)$       & $\dn$ & $\mathit{true}$ \\
		\end{tabular}
	\end{center}



This function simply tests whether the empty string is in $L(r)$.
He then defined
the following operation on regular expressions, written
$r\backslash c$ (the derivative of $r$ w.r.t.~the character $c$):

\begin{center}
\begin{tabular}{lcl}
		$\ZERO \backslash c$ & $\dn$ & $\ZERO$\\
		$\ONE \backslash c$  & $\dn$ & $\ZERO$\\
		$d \backslash c$     & $\dn$ &
		$\mathit{if} \;c = d\;\mathit{then}\;\ONE\;\mathit{else}\;\ZERO$\\
$(r_1 + r_2)\backslash c$     & $\dn$ & $r_1 \backslash c \,+\, r_2 \backslash c$\\
$(r_1 \cdot r_2)\backslash c$ & $\dn$ & $\mathit{if} \, nullable(r_1)$\\
	&   & $\mathit{then}\;(r_1\backslash c) \cdot r_2 \,+\, r_2\backslash c$\\
	&   & $\mathit{else}\;(r_1\backslash c) \cdot r_2$\\
	$(r^*)\backslash c$           & $\dn$ & $(r\backslash c) \cdot r^*$\\
\end{tabular}
\end{center}

Sulzmann and Lu's lexing algorithm based on these derivatives works in
two phases:

\begin{ceqn}
\begin{equation}\label{graph:01}
\begin{tikzcd}
r_0 \arrow[r, "\backslash c_0"]  \arrow[d] & r_1 \arrow[r, "\backslash c_1"] \arrow[d] & r_2 \arrow[r, dashed] \arrow[d] & r_n \arrow[d, "mkeps" description] \\
v_0           & v_1 \arrow[l,"inj_{r_0} c_0"]                & v_2 \arrow[l, "inj_{r_1} c_1"]              & v_n \arrow[l, dashed]
\end{tikzcd}
\end{equation}
\end{ceqn}
This is nicely functional and its correctness is easily provable, but
it suffers from a large stack size on long strings and from the
inability to perform even moderate simplification.



Sulzmann and Lu's bit-coded algorithm:
\begin{figure}
\centering
\includegraphics[scale=0.3]{bitcoded_sulzmann.png}
\end{figure}

This one-phase algorithm is free from the burden of large stack usage:

\begin{center}
\begin{tikzpicture}[scale=2,node distance=1.9cm,
                    every node/.style={minimum size=7mm}]
\node (r0)  {$r_0$};
\node (r1) [right=of r0]{$r_1$};
\draw[->,line width=0.2mm](r0)--(r1) node[above,midway] {$\backslash\,c_0$};
\node (r2) [right=of r1]{$r_2$};
\draw[->, line width = 0.2mm](r1)--(r2) node[above,midway] {$\backslash\,c_1$};
\node (rn) [right=of r2]{$r_n$};
\draw[dashed,->,line width=0.2mm](r2)--(rn) node[above,midway] {} ;
\draw (rn) node[anchor=west] {\;\raisebox{3mm}{$\nullable$}};
\node (bs) [below=of rn]{$bs$};
\draw[->,line width=0.2mm](rn) -- (bs);
\node (v0) [left=of bs] {$v_0$};
\draw[->,line width=0.2mm](bs)--(v0) node[below,midway] {$\textit{decode}$};
\draw (rn) node[anchor=north west] {\;\raisebox{-8mm}{$\textit{collect bits}$}};
\draw[->, line width=0.2mm](v0)--(r0) node[below, midway] {};
\end{tikzpicture}
\end{center}



This is functional code, and easily provable (proved correct by Ausaf
and Urban). But it suffers from exponential blow-ups even with the
simplification steps:
\begin{figure}
\centering
\includegraphics[scale= 0.3]{pics/nub_filter_simp.png}
\end{figure}
Sulzmann and Lu claimed the algorithm to be linear w.r.t.~the input.
An example that blows it up: $(a+aa)^*$.
\section{Contributions}
\subsection{Our contribution 1}
We give an improved version of the above algorithm that solves most
blow-up cases, including the above example.

We also formalise closed forms for string derivatives:
\[ (\sum rs) \backslash_s s = simp(\sum_{r \in rs}(r \backslash_s s)) \]
\[ (r_1\cdot r_2) \backslash_s s = simp(r_1 \backslash_s s \cdot r_2 + \sum_{s' \in Suffix(s)} r_2 \backslash_s s' )\]
\[ r_0^* \backslash_s s = simp(\sum_{s' \in substr(s)} (r_0 \backslash_s s') \cdot r_0^*) \]



These come with a size guarantee that makes sure the size of the
derivatives does not grow unboundedly.


\begin{theorem}
Given a regular expression $r$, we have
\begin{center}
$\exists N_r.\;\forall s. \; |r \backslash_s s| < N_r$
\end{center}
\end{theorem}
The proof bounds the terms using partial derivatives:
\begin{center}
\begin{tabular}{lcl}
$|\maxterms (\bsimp ((a\cdot b) \backslash s))|$ & $=$ & $|\maxterms(\bsimp( (a\backslash s \cdot b) + \sum_{s'\in Suf(s)}(b\backslash s') ))|$\\
& $\leq$ & $| (\pder_{s} \; a ) \cdot b| + | (\bigcup_{s' \in Suf(s)} (\pder_{s'} \; b))|$\\
& $=$ & $\awidth(a) + \awidth(b)$ \\
& $=$ & $\awidth(a\cdot b)$
\end{tabular}
\end{center}


\subsection{Our Contribution 2}
We develop a more aggressive simplification that prunes away sub-parts
of a regex based on which terms have appeared before. This gives us a
bound that is truly linear in the size of the input.



\section{To be completed}

Benchmarking our algorithm against JFlex, counting-set automata, Silex
and other main regex engines (and incorporating their ideas, such as
zippers and other data structures that reduce memory use).

Extending the approach to back-references.






\noindent These are clearly abysmal and possibly surprising results. One
would expect these systems to do much better than that---after all,
given a DFA and a string, deciding whether a string is matched by this
DFA should be linear in terms of the size of the regular expression and
the string.

Admittedly, the regular expression $(a^*)^*\,b$ is carefully chosen to
exhibit this super-linear behaviour.  But unfortunately, such regular
expressions are not just a few outliers. They are actually
frequent enough to have a separate name created for
them---\emph{evil regular expressions}. In empirical work, Davis et al
report that they have found thousands of such evil regular expressions
in the JavaScript and Python ecosystems \cite{Davis18}. A static
analysis approach that is both sound and complete exists~\cite{17Bir},
but its running time on certain examples in the RegExLib and Snort
regular expression libraries is unacceptable. Therefore the problem of
efficiency still remains.

This superlinear blowup in matching algorithms sometimes causes
considerable grief in real life: for example on 20 July 2016 one evil
regular expression brought the webpage
\href{http://stackexchange.com}{Stack Exchange} to its
knees.\footnote{\url{https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}}
In this instance, a regular expression intended to just trim white
spaces from the beginning and the end of a line actually consumed
massive amounts of CPU-resources---causing web servers to grind to a
halt. This happened when a post with 20,000 white spaces was submitted,
but importantly the white spaces were neither at the beginning nor at
the end. As a result, the regular expression matching engine needed to
backtrack over many choices. In this example, the time needed to process
the string was $O(n^2)$ with respect to the string length. This
quadratic overhead was enough for the homepage of Stack Exchange to
respond so slowly that the load balancer assumed there must be some
attack and therefore stopped the servers from responding to any
requests. This made the whole site unavailable. Another very
recent example is a global outage of all Cloudflare servers on 2 July
2019. A poorly written regular expression exhibited exponential
behaviour and exhausted CPUs that serve HTTP traffic.
Although the
outage had several causes, at the heart was a regular expression that
was used to monitor network
traffic.\footnote{\url{https://blog.cloudflare.com/details-of-the-cloudflare-outage-on-july-2-2019/}}

The underlying problem is that many ``real life'' regular expression
matching engines do not use DFAs for matching. This is because they
support regular expressions that are not covered by the classical
automata theory, and in this more general setting there are quite a few
research questions still unanswered and fast algorithms still need to be
developed (for example how to efficiently handle bounded repetitions,
negation and back-references).
%question: dfa can have exponential states. isn't this the actual reason why they do not use dfas?
%how do they avoid dfas exponential states if they use them for fast matching?

There is also another under-researched problem to do with regular
expressions and lexing, i.e.~the process of breaking up strings into
sequences of tokens according to some regular expressions. In this
setting one is not just interested in whether or not a regular
expression matches a string, but also in \emph{how}. Consider for
example a regular expression $r_{key}$ for recognising keywords such as
\textit{if}, \textit{then} and so on; and a regular expression $r_{id}$
for recognising identifiers (say, a single character followed by
characters or numbers). One can then form the compound regular
expression $(r_{key} + r_{id})^*$ and use it to tokenise strings. But
then how should the string \textit{iffoo} be tokenised? It could be
tokenised as a keyword followed by an identifier, or the entire string
as a single identifier. Similarly, how should the string \textit{if} be
tokenised? Both regular expressions, $r_{key}$ and $r_{id}$, would
``fire''---so is it an identifier or a keyword? While in applications
there is a well-known strategy to decide these questions, called POSIX
matching, only relatively recently precise definitions of what POSIX
matching actually means have been formalised
\cite{AusafDyckhoffUrban2016,OkuiSuzuki2010,Vansummeren2006}. Such a
definition has also been given by Sulzmann and Lu \cite{Sulzmann2014},
but the corresponding correctness proof turned out to be faulty
\cite{AusafDyckhoffUrban2016}. Roughly, POSIX matching means matching
the longest initial substring. In the case of a tie, the initial
sub-match is chosen according to some priorities attached to the regular
expressions (e.g.~keywords have a higher priority than identifiers).
This sounds rather simple, but according to Grathwohl et al \cite[Page
36]{CrashCourse2014} this is not the case. They wrote:

\begin{quote}
\it{}``The POSIX strategy is more complicated than the greedy because of
the dependence on information about the length of matched strings in the
various subexpressions.''
\end{quote}

\noindent
This is also supported by evidence collected by Kuklewicz
\cite{Kuklewicz} who noticed that a number of POSIX regular expression
matchers calculate incorrect results.

Our focus in this project is on an algorithm introduced by Sulzmann and
Lu in 2014 for regular expression matching according to the POSIX
strategy \cite{Sulzmann2014}. Their algorithm is based on an older
algorithm by Brzozowski from 1964 where he introduced the notion of
derivatives of regular expressions~\cite{Brzozowski1964}. We shall
briefly explain this algorithm next.
\section{The Algorithm by Brzozowski based on Derivatives of Regular
Expressions}

Suppose (basic) regular expressions are given by the following grammar:
\[			r ::=   \ZERO \mid  \ONE
			 \mid  c
			 \mid  r_1 \cdot r_2
			 \mid  r_1 + r_2
			 \mid r^*
\]

\noindent
The intended meaning of the constructors is as follows: $\ZERO$
cannot match any string, $\ONE$ can match the empty string, the
character regular expression $c$ can match the character $c$, and so
on.

The ingenious contribution by Brzozowski is the notion of
\emph{derivatives} of regular expressions. The idea behind this
notion is as follows: suppose a regular expression $r$ can match a
string of the form $c\!::\! s$ (that is a list of characters starting
with $c$), what does the regular expression look like that can match
just $s$? Brzozowski gave a neat answer to this question. He started
with the definition of $\nullable$:
\begin{center}
		\begin{tabular}{lcl}
			$\nullable(\ZERO)$     & $\dn$ & $\mathit{false}$ \\
			$\nullable(\ONE)$      & $\dn$ & $\mathit{true}$ \\
			$\nullable(c)$ 	       & $\dn$ & $\mathit{false}$ \\
			$\nullable(r_1 + r_2)$ & $\dn$ & $\nullable(r_1) \vee \nullable(r_2)$ \\
			$\nullable(r_1\cdot r_2)$  & $\dn$ & $\nullable(r_1) \wedge \nullable(r_2)$ \\
			$\nullable(r^*)$       & $\dn$ & $\mathit{true}$ \\
		\end{tabular}
	\end{center}
This function simply tests whether the empty string is in $L(r)$.
He then defined
the following operation on regular expressions, written
$r\backslash c$ (the derivative of $r$ w.r.t.~the character $c$):

\begin{center}
\begin{tabular}{lcl}
		$\ZERO \backslash c$ & $\dn$ & $\ZERO$\\
		$\ONE \backslash c$  & $\dn$ & $\ZERO$\\
		$d \backslash c$     & $\dn$ &
		$\mathit{if} \;c = d\;\mathit{then}\;\ONE\;\mathit{else}\;\ZERO$\\
$(r_1 + r_2)\backslash c$     & $\dn$ & $r_1 \backslash c \,+\, r_2 \backslash c$\\
$(r_1 \cdot r_2)\backslash c$ & $\dn$ & $\mathit{if} \, nullable(r_1)$\\
	&   & $\mathit{then}\;(r_1\backslash c) \cdot r_2 \,+\, r_2\backslash c$\\
	&   & $\mathit{else}\;(r_1\backslash c) \cdot r_2$\\
	$(r^*)\backslash c$           & $\dn$ & $(r\backslash c) \cdot r^*$\\
\end{tabular}
\end{center}

%Assuming the classic notion of a
%\emph{language} of a regular expression, written $L(\_)$, t

\noindent
The main property of the derivative operation is that

\begin{center}
$c\!::\!s \in L(r)$ holds
if and only if $s \in L(r\backslash c)$.
\end{center}

\noindent
For us the main advantage is that derivatives can be
straightforwardly implemented in any functional programming language,
and are easily definable and reasoned about in theorem provers---the
definitions just consist of inductive datatypes and simple recursive
functions. Moreover, the notion of derivatives can be easily
generalised to cover extended regular expression constructors such as
the not-regular expression, written $\neg\,r$, or bounded repetitions
(for example $r^{\{n\}}$ and $r^{\{n..m\}}$), which cannot be so
straightforwardly realised within the classic automata approach.
For the moment however, we focus only on the usual basic regular expressions.


Now if we want to find out whether a string $s$ matches with a regular
expression $r$, we can build the derivatives of $r$ w.r.t.\ (in succession)
all the characters of the string $s$. Finally, test whether the
resulting regular expression can match the empty string. If yes, then
$r$ matches $s$; otherwise it does not.
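\noindent
As a concrete sketch, these definitions translate almost line by line
into Scala. The code below is our own illustration (names such as
\texttt{Rexp}, \texttt{nullable} and \texttt{der} are our choices, not
code from the cited works):

\begin{verbatim}
// Basic regular expressions as an algebraic datatype.
abstract class Rexp
case object ZERO extends Rexp                    // matches nothing
case object ONE extends Rexp                     // matches the empty string
case class CHAR(c: Char) extends Rexp            // matches the character c
case class SEQ(r1: Rexp, r2: Rexp) extends Rexp  // r1 . r2
case class ALT(r1: Rexp, r2: Rexp) extends Rexp  // r1 + r2
case class STAR(r: Rexp) extends Rexp            // r*

// Does r match the empty string?
def nullable(r: Rexp): Boolean = r match {
  case ZERO => false
  case ONE => true
  case CHAR(_) => false
  case ALT(r1, r2) => nullable(r1) || nullable(r2)
  case SEQ(r1, r2) => nullable(r1) && nullable(r2)
  case STAR(_) => true
}

// The derivative of r w.r.t. the character c.
def der(c: Char, r: Rexp): Rexp = r match {
  case ZERO => ZERO
  case ONE => ZERO
  case CHAR(d) => if (c == d) ONE else ZERO
  case ALT(r1, r2) => ALT(der(c, r1), der(c, r2))
  case SEQ(r1, r2) =>
    if (nullable(r1)) ALT(SEQ(der(c, r1), r2), der(c, r2))
    else SEQ(der(c, r1), r2)
  case STAR(r1) => SEQ(der(c, r1), STAR(r1))
}
\end{verbatim}

\noindent
Repeatedly applying \texttt{der} and finally testing with
\texttt{nullable} yields exactly the matcher described next.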
To implement this idea
we can generalise the derivative operation to strings like this:

\begin{center}
\begin{tabular}{lcl}
$r \backslash (c\!::\!s) $ & $\dn$ & $(r \backslash c) \backslash s$ \\
$r \backslash [\,] $ & $\dn$ & $r$
\end{tabular}
\end{center}

\noindent
and then define the regular-expression matching algorithm as:
\[
match\;s\;r \;\dn\; nullable(r\backslash s)
\]

\noindent
This algorithm looks graphically as follows:
\begin{equation}\label{graph:*}
\begin{tikzcd}
r_0 \arrow[r, "\backslash c_0"]  & r_1 \arrow[r, "\backslash c_1"] & r_2 \arrow[r, dashed]  & r_n  \arrow[r,"\textit{nullable}?"] & \;\textrm{YES}/\textrm{NO}
\end{tikzcd}
\end{equation}

\noindent
where we start with  a regular expression  $r_0$, build successive
derivatives until we exhaust the string and then use \textit{nullable}
to test whether the result can match the empty string. It can  be
relatively  easily shown that this matcher is correct (that is, given
an $s = c_0...c_{n-1}$ and an $r_0$, it generates YES if and only if $s \in L(r_0)$).


\section{Values and the Algorithm by Sulzmann and Lu}

One limitation of Brzozowski's algorithm is that it only produces a
YES/NO answer for whether a string is being matched by a regular
expression. Sulzmann and Lu~\cite{Sulzmann2014} extended this algorithm
to allow generation of an actual matching, called a \emph{value} or
sometimes also \emph{lexical value}. These values and regular
expressions correspond to each other as illustrated in the following
table:


\begin{center}
	\begin{tabular}{c@{\hspace{20mm}}c}
		\begin{tabular}{@{}rrl@{}}
			\multicolumn{3}{@{}l}{\textbf{Regular Expressions}}\medskip\\
			$r$ & $::=$  & $\ZERO$\\
			& $\mid$ & $\ONE$   \\
			& $\mid$ & $c$          \\
			& $\mid$ & $r_1 \cdot r_2$\\
			& $\mid$ & $r_1 + r_2$   \\
			\\
			& $\mid$ & $r^*$         \\
		\end{tabular}
		&
		\begin{tabular}{@{\hspace{0mm}}rrl@{}}
			\multicolumn{3}{@{}l}{\textbf{Values}}\medskip\\
			$v$ & $::=$  & \\
			&        & $\Empty$   \\
			& $\mid$ & $\Char(c)$          \\
			& $\mid$ & $\Seq\,v_1\, v_2$\\
			& $\mid$ & $\Left(v)$   \\
			& $\mid$ & $\Right(v)$  \\
			& $\mid$ & $\Stars\,[v_1,\ldots\,v_n]$ \\
		\end{tabular}
	\end{tabular}
\end{center}

\noindent
No value  corresponds to $\ZERO$; $\Empty$ corresponds to $\ONE$;
$\Char$ to the character regular expression; $\Seq$ to the sequence
regular expression and so on. The idea of values is to encode a kind of
lexical value for how the sub-parts of a regular expression match the
sub-parts of a string. To see this, suppose we have a \emph{flatten}
operation, written $|v|$, for values. We can use this function to
extract the underlying string of a value $v$. For example,
$|\Seq \, (\Char\,x) \, (\Char\,y)|$ is the string $xy$. Using
flatten, we can describe how values encode lexical values: $\Seq\,v_1\,
v_2$ encodes a tree with two children nodes that tells how the string
$|v_1| @ |v_2|$ matches the regex $r_1 \cdot r_2$ whereby $r_1$ matches
the substring $|v_1|$ and, respectively, $r_2$ matches the substring
$|v_2|$. Exactly how these two are matched is contained in the children
nodes $v_1$ and $v_2$ of parent $\textit{Seq}$.

To give a concrete example of how values work, consider the string $xy$
and the regular expression $(x + (y + xy))^*$. We can view this regular
expression as a tree and if the string $xy$ is matched by two Star
``iterations'', then the $x$ is matched by the left-most alternative in
this tree and the $y$ by the left alternative nested inside the right
alternative.
This suggests recording this matching as

\begin{center}
$\Stars\,[\Left\,(\Char\,x), \Right(\Left(\Char\,y))]$
\end{center}

\noindent
where $\Stars \; [\ldots]$ records all the
iterations; and $\Left$, respectively $\Right$, records which
alternative is used. The value for
matching $xy$ in a single ``iteration'', i.e.~the POSIX value,
would look as follows

\begin{center}
$\Stars\,[\Seq\,(\Char\,x)\,(\Char\,y)]$
\end{center}

\noindent
where $\Stars$ has only a single-element list for the single iteration
and $\Seq$ indicates that $xy$ is matched by a sequence regular
expression.

The contribution of Sulzmann and Lu is an extension of Brzozowski's
algorithm by a second phase (the first phase being building successive
derivatives---see \eqref{graph:*}). In this second phase, a POSIX value
is generated in case the regular expression matches the string.
Pictorially, the Sulzmann and Lu algorithm is as follows:

\begin{ceqn}
\begin{equation}\label{graph:2}
\begin{tikzcd}
r_0 \arrow[r, "\backslash c_0"]  \arrow[d] & r_1 \arrow[r, "\backslash c_1"] \arrow[d] & r_2 \arrow[r, dashed] \arrow[d] & r_n \arrow[d, "mkeps" description] \\
v_0           & v_1 \arrow[l,"inj_{r_0} c_0"]                & v_2 \arrow[l, "inj_{r_1} c_1"]              & v_n \arrow[l, dashed]
\end{tikzcd}
\end{equation}
\end{ceqn}

\noindent
For convenience, we shall employ the following notations: the regular
expression we start with is $r_0$, and the given string $s$ is composed
of characters $c_0 c_1 \ldots c_{n-1}$. In the first phase from the
left to right, we build the derivatives $r_1$, $r_2$, \ldots according
to the characters $c_0$, $c_1$ until we exhaust the string and obtain
the derivative $r_n$. We test whether this derivative is
$\textit{nullable}$ or not. If not, we know the string does not match
$r$ and no value needs to be generated. If yes, we start building the
values incrementally by \emph{injecting} back the characters into the
earlier values $v_n, \ldots, v_0$. This is the second phase of the
algorithm from the right to left. For the first value $v_n$, we call the
function $\textit{mkeps}$, which builds the lexical value
for how the empty string has been matched by the (nullable) regular
expression $r_n$. This function is defined as

	\begin{center}
		\begin{tabular}{lcl}
			$\mkeps(\ONE)$ 		& $\dn$ & $\Empty$ \\
			$\mkeps(r_{1}+r_{2})$	& $\dn$
			& \textit{if} $\nullable(r_{1})$\\
			& & \textit{then} $\Left(\mkeps(r_{1}))$\\
			& & \textit{else} $\Right(\mkeps(r_{2}))$\\
			$\mkeps(r_1\cdot r_2)$ 	& $\dn$ & $\Seq\,(\mkeps\,r_1)\,(\mkeps\,r_2)$\\
			$\mkeps(r^*)$	        & $\dn$ & $\Stars\,[]$
		\end{tabular}
	\end{center}


\noindent There are no cases for $\ZERO$ and $c$, since
these regular expressions cannot match the empty string. Note
also that in case of alternatives we give preference to the
regular expression on the left-hand side. This will become
important later on for the value that is calculated.

After the $\mkeps$-call, we inject back the characters one by one in order to build
the lexical value $v_i$ for how the regex $r_i$ matches the string $s_i$
($s_i = c_i \ldots c_{n-1}$) from the previous lexical value $v_{i+1}$.
After injecting back $n$ characters, we get the lexical value for how $r_0$
matches $s$. For this Sulzmann and Lu defined a function that reverses
the ``chopping off'' of characters during the derivative phase.
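\noindent
Before turning to that function, the values and $\textit{mkeps}$ of
this section can be added to our illustrative Scala sketch (again
hypothetical code with our own naming):

\begin{verbatim}
// Values, mirroring the table of values above.
abstract class Val
case object Empty extends Val
case class Chr(c: Char) extends Val
case class Sequ(v1: Val, v2: Val) extends Val
case class Left(v: Val) extends Val
case class Right(v: Val) extends Val
case class Stars(vs: List[Val]) extends Val

// mkeps: how a nullable regular expression matches the empty string.
// There are no cases for ZERO and CHAR: they cannot match the empty string.
def mkeps(r: Rexp): Val = r match {
  case ONE => Empty
  case ALT(r1, r2) =>
    if (nullable(r1)) Left(mkeps(r1)) else Right(mkeps(r2))
  case SEQ(r1, r2) => Sequ(mkeps(r1), mkeps(r2))
  case STAR(_) => Stars(Nil)
}
\end{verbatim}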
The
corresponding function is called \emph{injection}, written
$\textit{inj}$; it takes three arguments: the first one is a regular
expression ${r_{i-1}}$, before the character is chopped off, the second
is a character ${c_{i-1}}$, the character we want to inject and the
third argument is the value ${v_i}$, into which one wants to inject the
character (it corresponds to the regular expression after the character
has been chopped off). The result of this function is a new value. The
definition of $\textit{inj}$ is as follows:

\begin{center}
\begin{tabular}{l@{\hspace{1mm}}c@{\hspace{1mm}}l}
  $\textit{inj}\,(c)\,c\,\Empty$            & $\dn$ & $\Char\,c$\\
  $\textit{inj}\,(r_1 + r_2)\,c\,\Left(v)$ & $\dn$ & $\Left(\textit{inj}\,r_1\,c\,v)$\\
  $\textit{inj}\,(r_1 + r_2)\,c\,\Right(v)$ & $\dn$ & $\Right(\textit{inj}\,r_2\,c\,v)$\\
  $\textit{inj}\,(r_1 \cdot r_2)\,c\,\Seq(v_1,v_2)$ & $\dn$  & $\Seq(\textit{inj}\,r_1\,c\,v_1,v_2)$\\
  $\textit{inj}\,(r_1 \cdot r_2)\,c\,\Left(\Seq(v_1,v_2))$ & $\dn$  & $\Seq(\textit{inj}\,r_1\,c\,v_1,v_2)$\\
  $\textit{inj}\,(r_1 \cdot r_2)\,c\,\Right(v)$ & $\dn$  & $\Seq(\textit{mkeps}(r_1),\textit{inj}\,r_2\,c\,v)$\\
  $\textit{inj}\,(r^*)\,c\,\Seq(v,\Stars\,vs)$         & $\dn$  & $\Stars((\textit{inj}\,r\,c\,v)\,::\,vs)$\\
\end{tabular}
\end{center}

\noindent This definition is by recursion on the ``shape'' of regular
expressions and values. To understand this definition better, consider
the situation when we build the derivative of the regular expression
$r_{i-1}$. For this we chop off a character from $r_{i-1}$ to form
$r_i$. This leaves a ``hole'' in $r_i$ and its corresponding value
$v_i$. To calculate $v_{i-1}$, we need to locate where that hole is and
fill it. We can find this location by comparing $r_{i-1}$ and $v_i$.
For instance, if $r_{i-1}$ is of shape $r_a \cdot r_b$, and $v_i$ is of
shape $\Left(\Seq(v_1,v_2))$, we know immediately that $r_a$ must be
nullable and that
%
\[ (r_a \cdot r_b)\backslash c = (r_a\backslash c) \cdot r_b \,+\, r_b\backslash c;\]

\noindent
for if $r_a$ were not nullable, then
\[ (r_a \cdot r_b)\backslash c = (r_a\backslash c) \cdot r_b\]

\noindent
and the value $v_i$ would have to be of shape $\Seq(\ldots)$,
contradicting the fact that $v_i$ is actually of shape
$\Left(\ldots)$. Furthermore, since $v_i$ is of shape $\Left(\ldots)$
instead of $\Right(\ldots)$, we know that the left branch of
\[ (r_a \cdot r_b)\backslash c =
\underline{(r_a\backslash c) \cdot r_b} \,+\, r_b\backslash c\]
(underlined) is taken instead of the right one. This means $c$ was
chopped off from $r_a$ rather than $r_b$. We have therefore found out
that the hole is in $r_a$. So we recursively call
$\inj\,r_a\,c\,v_1$ to fill that hole in $v_1$. After injection, the
value $v_{i-1}$ for $r_{i-1} = r_a \cdot r_b$ should be
$\Seq\,(\inj\,r_a\,c\,v_1)\,v_2$. Other clauses can be understood in a
similar way.
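\noindent
In our running Scala sketch, $\textit{inj}$ might be rendered as
follows (hypothetical code; the pattern match is deliberately
non-exhaustive, mirroring exactly the clauses above):

\begin{verbatim}
// inj: reverse one derivative step by adding character c back into value v.
def inj(r: Rexp, c: Char, v: Val): Val = (r, v) match {
  case (CHAR(_), Empty)                 => Chr(c)
  case (ALT(r1, _), Left(v1))           => Left(inj(r1, c, v1))
  case (ALT(_, r2), Right(v2))          => Right(inj(r2, c, v2))
  case (SEQ(r1, _), Sequ(v1, v2))       => Sequ(inj(r1, c, v1), v2)
  case (SEQ(r1, _), Left(Sequ(v1, v2))) => Sequ(inj(r1, c, v1), v2)
  case (SEQ(r1, r2), Right(v2))         => Sequ(mkeps(r1), inj(r2, c, v2))
  case (STAR(r1), Sequ(v1, Stars(vs)))  => Stars(inj(r1, c, v1) :: vs)
}
\end{verbatim}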
%\comment{Other word: insight?}
The following example gives an insight into $\textit{inj}$'s effect and
how Sulzmann and Lu's algorithm works as a whole. Suppose we have a
regular expression $((((a+b)+ab)+c)+abc)^*$, and want to match it
against the string $abc$ (when $abc$ is written as a regular expression,
the standard way of expressing it is $a \cdot (b \cdot c)$, but we
usually omit the parentheses and dots here for better readability). This
algorithm returns a POSIX value, which means it will produce the longest
matching. Consequently, it matches the string $abc$ in one star
iteration, using the longest alternative $abc$ in the sub-expression (we shall use $r$ to denote this
sub-expression for conciseness):

\[((((a+b)+ab)+c)+\underbrace{abc}_r)\]

\noindent
Before $\textit{inj}$ is called, our lexer first builds derivatives using the
string $abc$ (we simplified some regular expressions like $\ZERO \cdot
b$ to $\ZERO$ for conciseness; we also omit parentheses if they are
clear from the context):

%Similarly, we allow
%$\textit{ALT}$ to take a list of regular expressions as an argument
%instead of just 2 operands to reduce the nested depth of
%$\textit{ALT}$

\begin{center}
\begin{tabular}{lcl}
$r^*$ & $\xrightarrow{\backslash a}$ & $r_1 = (\ONE+\ZERO+\ONE \cdot b + \ZERO + \ONE \cdot b \cdot c) \cdot r^*$\\
      & $\xrightarrow{\backslash b}$ & $r_2 = (\ZERO+\ZERO+\ONE \cdot \ONE + \ZERO + \ONE \cdot \ONE \cdot c) \cdot r^* +(\ZERO+\ONE+\ZERO  + \ZERO + \ZERO) \cdot r^*$\\
      & $\xrightarrow{\backslash c}$ & $r_3 = ((\ZERO+\ZERO+\ZERO + \ZERO + \ONE \cdot \ONE \cdot \ONE) \cdot r^* + (\ZERO+\ZERO+\ZERO  + \ONE + \ZERO) \cdot r^*) + $\\
      &                              & $\phantom{r_3 = (} ((\ZERO+\ONE+\ZERO  + \ZERO + \ZERO) \cdot r^* + (\ZERO+\ZERO+\ZERO  + \ONE + \ZERO) \cdot r^* )$
\end{tabular}
\end{center}

\noindent
Since $r_3$ is nullable, we can call $\textit{mkeps}$
to construct a lexical value for how $r_3$ matched the string $abc$.
This function gives the following value $v_3$:


\begin{center}
$\Left(\Left(\Seq(\Right(\Seq(\Empty, \Seq(\Empty,\Empty))), \Stars [])))$
\end{center}
The outer $\Left(\Left(\ldots))$ tells us the leftmost nullable part of $r_3$ (underlined):

\begin{center}
	\begin{tabular}{l@{\hspace{2mm}}l}
    & $\big(\underline{(\ZERO+\ZERO+\ZERO+ \ZERO+ \ONE \cdot \ONE \cdot \ONE) \cdot r^*} \;+\; (\ZERO+\ZERO+\ZERO + \ONE + \ZERO) \cdot r^*\big)$ \smallskip\\
    $+$ & $\big((\ZERO+\ONE+\ZERO  + \ZERO + \ZERO) \cdot r^* \;+\; (\ZERO+\ZERO+\ZERO  + \ONE + \ZERO) \cdot r^* \big)$
	\end{tabular}
 \end{center}

\noindent
 Note that the leftmost location of the term $(\ZERO+\ZERO+\ZERO + \ZERO + \ONE \cdot \ONE \cdot
 \ONE) \cdot r^*$ (which corresponds to the initial sub-match $abc$) allows
 $\textit{mkeps}$ to pick it up, because $\textit{mkeps}$ is defined to always choose the
 left one when it is nullable. In the case of this example, $abc$ is
 preferred over $a$ or $ab$. This $\Left(\Left(\ldots))$ location is
 generated by two applications of the splitting clause

\begin{center}
     $(r_1 \cdot r_2)\backslash c = (r_1\backslash c) \cdot r_2 \,+\, r_2\backslash c \qquad \text{(when $r_1$ is nullable)}.$
\end{center}

\noindent
By this clause, we put $r_1 \backslash c \cdot r_2 $ at the
$\textit{front}$ and $r_2 \backslash c$ at the $\textit{back}$. This
allows $\textit{mkeps}$ to always pick, among two matches, the one with the longer
initial sub-match. Removing the outside $\Left(\Left(...))$, the inside
sub-value

\begin{center}
 $\Seq(\Right(\Seq(\Empty, \Seq(\Empty, \Empty))), \Stars [])$
\end{center}

\noindent
tells us how the empty string $[]$ is matched with $(\ZERO+\ZERO+\ZERO + \ZERO + \ONE \cdot
\ONE \cdot \ONE) \cdot r^*$. We match $[]$ by a sequence of two nullable regular
expressions. The first one is an alternative, where we take the rightmost
alternative---the only one whose language contains the empty string. The second
nullable regular expression is a Kleene star. $\Stars$ tells us how it
generates the nullable regular expression: by zero iterations to form
$\ONE$.
Now $\textit{inj}$ injects characters back and incrementally
builds a lexical value based on $v_3$. Using the value $v_3$, the character
$c$, and the regular expression $r_2$, we can recover how $r_2$ matched
the string $[c]$: $\textit{inj} \; r_2 \; c \; v_3$ gives us
 \begin{center}
 $v_2 = \Left(\Seq(\Right(\Seq(\Empty, \Seq(\Empty, \Char\,c))), \Stars [])),$
 \end{center}
which tells us how $r_2$ matched $[c]$. After this we inject back the character $b$, and get
\begin{center}
$v_1 = \Seq(\Right(\Seq(\Empty, \Seq(\Char\,b, \Char\,c))), \Stars [])$
\end{center}
 for how
 \begin{center}
 $r_1= (\ONE+\ZERO+\ONE \cdot b + \ZERO + \ONE \cdot b \cdot c) \cdot r^*$
 \end{center}
  matched  the string $bc$ before it split into two substrings.
  Finally, after injecting the character $a$ back into $v_1$,
  we get  the lexical value tree
  \begin{center}
  $v_0= \Stars [\Right(\Seq(\Char\,a, \Seq(\Char\,b, \Char\,c)))]$
  \end{center}
   for how $r$ matched $abc$. This completes the algorithm.

%We omit the details of injection function, which is provided by Sulzmann and Lu's paper \cite{Sulzmann2014}.
Readers might have noticed that the lexical value information is actually
already available when doing derivatives. For example, immediately after
the operation $\backslash a$ we know that if we want to match a string
that starts with $a$, we can take the initial match to be

 \begin{center}
\begin{enumerate}
    \item[1)] just $a$, or
    \item[2)] the string $ab$, or
    \item[3)] the string $abc$.
\end{enumerate}
\end{center}

\noindent
In order to differentiate between these choices, we just need to
remember their positions---$a$ is on the left, $ab$ is in the middle,
and $abc$ is on the right. Which of these alternatives is chosen
later does not affect their relative position, because the algorithm does
not change this order. If this parsing information can be determined and
does not change because of later derivatives, there is no point in
traversing this information twice. This leads to an optimisation---if we
store the information for lexical values inside the regular expression,
update it when we take derivatives, and collect the information
once finished with the derivatives by calling $\textit{mkeps}$ to decide which
branch is POSIX, then we can generate the lexical value in one pass, instead of
performing the remaining $n$ injections. This leads to Sulzmann and Lu's novel
idea of using bitcodes in derivatives.

In the next section, we shall focus on the bitcoded algorithm and the
process of simplification of regular expressions. This is needed in
order to obtain \emph{fast} versions of Brzozowski's, and Sulzmann
and Lu's algorithms.  This is where the PhD-project aims to advance the
state-of-the-art.


\section{Simplification of Regular Expressions}

Using bitcodes to guide parsing is not a novel idea. It was applied to
context-free grammars and then adapted by Henglein and Nielson for
efficient regular expression lexing using DFAs~\cite{nielson11bcre}.
Sulzmann and Lu took this idea of bitcodes a step further by integrating
bitcodes into derivatives. The reason why we want to use bitcodes in
this project is that we want to introduce more aggressive simplification
rules in order to keep the size of derivatives small throughout. This is
because the main drawback of building successive derivatives according
to Brzozowski's definition is that they can grow very quickly in size.
This is mainly due to the fact that the derivative operation often
generates ``useless'' $\ZERO$s and $\ONE$s. As a result, if
implemented naively both algorithms by Brzozowski and by Sulzmann and Lu
are excruciatingly slow. For example when starting with the regular
expression $(a + aa)^*$ and building 12 successive derivatives
w.r.t.~the character $a$, one obtains a derivative regular expression
with more than 8000 nodes (when viewed as a tree). Operations like
$\textit{der}$ and $\nullable$ need to traverse such trees and
consequently the bigger the size of the derivative the slower the
algorithm.

Fortunately, one can simplify regular expressions after each derivative
step. Various simplifications of regular expressions are possible, such
as the simplification of $\ZERO + r$, $r + \ZERO$, $\ONE\cdot r$, $r
\cdot \ONE$, and $r + r$ to just $r$. These simplifications do not
affect the answer for whether a regular expression matches a string or
not, but fortunately also do not affect the POSIX strategy of how
regular expressions match strings---although the latter is much harder
to establish. Some initial results in this regard have been
obtained in \cite{AusafDyckhoffUrban2016}.

Unfortunately, the simplification rules outlined above  are not
sufficient to prevent a size explosion in all cases. We
believe a tighter bound can be achieved that prevents an explosion in
\emph{all} cases. Such a tighter bound is suggested by work of Antimirov who
proved that (partial) derivatives can be bounded by the number of
characters contained in the initial regular expression
\cite{Antimirov95}. He defined the \emph{partial derivatives} of regular
expressions as follows:

\begin{center}
\begin{tabular}{lcl}
 $\textit{pder} \; c \; \ZERO$ & $\dn$ & $\emptyset$\\
 $\textit{pder} \; c \; \ONE$ & $\dn$ & $\emptyset$ \\
 $\textit{pder} \; c \; d$ & $\dn$ & $\textit{if} \; c \,=\, d \; \{  \ONE   \}  \; \textit{else} \; \emptyset$ \\
  $\textit{pder} \; c \; (r_1+r_2)$ & $\dn$ & $\textit{pder} \; c \; r_1 \cup \textit{pder} \; c \;  r_2$ \\
   $\textit{pder} \; c \; (r_1 \cdot r_2)$ & $\dn$ & $\textit{if} \; \textit{nullable} \; r_1 $\\
     & & $\textit{then} \; \{  r \cdot r_2 \mid r \in \textit{pder} \; c \; r_1   \}  \cup \textit{pder} \; c \; r_2 \;$\\
     & & $\textit{else} \; \{  r \cdot r_2 \mid r \in \textit{pder} \; c \; r_1   \} $ \\
     $\textit{pder} \; c \; (r^*)$ & $\dn$ & $ \{  r' \cdot r^* \mid r' \in \textit{pder} \; c \; r   \}  $ \\
 \end{tabular}
 \end{center}

\noindent
A partial derivative of a regular expression $r$ is essentially a set of
regular expressions that are either $r$'s children expressions or a
concatenation of them. Antimirov proved a tight bound on the sum of
the sizes of \emph{all} partial derivatives, no matter what the string
looks like. Roughly speaking, this size sum is at most cubic in the
size of the regular expression.

If we want the size of derivatives in Sulzmann and Lu's algorithm to
stay below this bound, we would need more aggressive simplifications.
Essentially we need to delete useless $\ZERO$s and $\ONE$s, as well as
delete duplicates whenever possible. For example, the parentheses in
$(a+b) \cdot c + bc$ can be opened up to get $a\cdot c + b \cdot c + b
\cdot c$, and then simplified to just $a \cdot c + b \cdot c$. Another
example is simplifying $(a^*+a) + (a^*+ \ONE) + (a +\ONE)$ to just
$a^*+a+\ONE$. Adding these more aggressive simplification rules helps us
to achieve the same size bound as that of the partial derivatives.
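\noindent
For illustration, partial derivatives can be implemented with sets in
our running Scala sketch (hypothetical code; \texttt{Rexp} and
\texttt{nullable} as before):

\begin{verbatim}
// pder: Antimirov's partial derivative, a set of regular expressions.
def pder(c: Char, r: Rexp): Set[Rexp] = r match {
  case ZERO => Set()
  case ONE => Set()
  case CHAR(d) => if (c == d) Set(ONE) else Set()
  case ALT(r1, r2) => pder(c, r1) ++ pder(c, r2)
  case SEQ(r1, r2) =>
    val s: Set[Rexp] = for (r1d <- pder(c, r1)) yield SEQ(r1d, r2)
    if (nullable(r1)) s ++ pder(c, r2) else s
  case STAR(r1) =>
    for (r1d <- pder(c, r1)) yield SEQ(r1d, STAR(r1))
}
\end{verbatim}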
In order to implement the idea of ``spilling out alternatives'' and to
make them compatible with the $\textit{inj}$-mechanism, we use
\emph{bitcodes}. Bits and bitcodes (lists of bits) are just:

%This allows us to prove a tight
%bound on the size of regular expression during the running time of the
%algorithm if we can establish the connection between our simplification
%rules and partial derivatives.

 %We believe, and have generated test
%data, that a similar bound can be obtained for the derivatives in
%Sulzmann and Lu's algorithm. Let us give some details about this next.


\begin{center}
		$b ::=   S \mid  Z \qquad
bs ::= [] \mid b:bs
$
\end{center}

\noindent
The $S$ and $Z$ are arbitrary names for the bits in order to avoid
confusion with the regular expressions $\ZERO$ and $\ONE$. Bitcodes (or
bit-lists) can be used to encode values (or incomplete values) in a
compact form. This can be straightforwardly seen in the following
coding function from values to bitcodes:

\begin{center}
\begin{tabular}{lcl}
  $\textit{code}(\Empty)$ & $\dn$ & $[]$\\
  $\textit{code}(\Char\,c)$ & $\dn$ & $[]$\\
  $\textit{code}(\Left\,v)$ & $\dn$ & $\Z :: code(v)$\\
  $\textit{code}(\Right\,v)$ & $\dn$ & $\S :: code(v)$\\
  $\textit{code}(\Seq\,v_1\,v_2)$ & $\dn$ & $code(v_1) \,@\, code(v_2)$\\
  $\textit{code}(\Stars\,[])$ & $\dn$ & $[\S]$\\
  $\textit{code}(\Stars\,(v\!::\!vs))$ & $\dn$ & $\Z :: code(v) \;@\;
                                                 code(\Stars\,vs)$
\end{tabular}
\end{center}

\noindent
Here $\textit{code}$ encodes a value into a bitcode by converting
$\Left$ into $\Z$, $\Right$ into $\S$, the start point of a non-empty
star iteration into $\Z$, and the border where a local star terminates
into $\S$ (this agrees with the bits attached by the derivative and
$\textit{bmkeps}$ operations below). This coding is lossy, as it throws
away the information about characters, and also does not encode the
``boundary'' between two sequence values. Moreover, with only the
bitcode we cannot even tell whether the $\S$s and $\Z$s are for
$\Left/\Right$ or $\Stars$. The reason for choosing this compact way of
storing information is that the relatively small size of bits can be
easily manipulated and ``moved around'' in a regular expression. In
order to recover values, we will need the corresponding regular
expression as extra information.
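\noindent
In the running Scala sketch, bits and the coding function might be
rendered as follows (hypothetical code; \texttt{Val} as before):

\begin{verbatim}
// Bits: S and Z, to avoid confusion with the regexes ZERO and ONE.
abstract class Bit
case object Z extends Bit
case object S extends Bit
type Bits = List[Bit]

// code: encode a value as a bitcode (lossy: characters are dropped).
def code(v: Val): Bits = v match {
  case Empty => Nil
  case Chr(_) => Nil
  case Left(v1) => Z :: code(v1)
  case Right(v1) => S :: code(v1)
  case Sequ(v1, v2) => code(v1) ++ code(v2)
  case Stars(Nil) => List(S)                    // S: the star terminates here
  case Stars(v1 :: vs) => Z :: (code(v1) ++ code(Stars(vs)))  // Z: one more iteration
}
\end{verbatim}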
Accordingly, the decoding function takes the corresponding regular
expression as its second argument; it is defined as:


%\begin{definition}[Bitdecoding of Values]\mbox{}
\begin{center}
\begin{tabular}{@{}l@{\hspace{1mm}}c@{\hspace{1mm}}l@{}}
  $\textit{decode}'\,bs\,(\ONE)$ & $\dn$ & $(\Empty, bs)$\\
  $\textit{decode}'\,bs\,(c)$ & $\dn$ & $(\Char\,c, bs)$\\
  $\textit{decode}'\,(\Z\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
     $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}\;
       (\Left\,v, bs_1)$\\
  $\textit{decode}'\,(\S\!::\!bs)\;(r_1 + r_2)$ & $\dn$ &
     $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r_2\;\textit{in}\;
       (\Right\,v, bs_1)$\\
  $\textit{decode}'\,bs\;(r_1\cdot r_2)$ & $\dn$ &
        $\textit{let}\,(v_1, bs_1) = \textit{decode}'\,bs\,r_1\;\textit{in}$\\
  & &   $\textit{let}\,(v_2, bs_2) = \textit{decode}'\,bs_1\,r_2$\\
  & &   \hspace{35mm}$\textit{in}\;(\Seq\,v_1\,v_2, bs_2)$\\
  $\textit{decode}'\,(\S\!::\!bs)\,(r^*)$ & $\dn$ & $(\Stars\,[], bs)$\\
  $\textit{decode}'\,(\Z\!::\!bs)\,(r^*)$ & $\dn$ &
         $\textit{let}\,(v, bs_1) = \textit{decode}'\,bs\,r\;\textit{in}$\\
  & &   $\textit{let}\,(\Stars\,vs, bs_2) = \textit{decode}'\,bs_1\,r^*$\\
  & &   \hspace{35mm}$\textit{in}\;(\Stars\,v\!::\!vs, bs_2)$\bigskip\\

  $\textit{decode}\,bs\,r$ & $\dn$ &
     $\textit{let}\,(v, bs') = \textit{decode}'\,bs\,r\;\textit{in}$\\
  & & $\textit{if}\;bs' = []\;\textit{then}\;\textit{Some}\,v\;
       \textit{else}\;\textit{None}$
\end{tabular}
\end{center}
%\end{definition}

Sulzmann and Lu integrated the bitcodes into regular expressions to
create annotated regular expressions \cite{Sulzmann2014}.
\emph{Annotated regular expressions} are defined by the following
grammar:%\comment{ALTS should have  an $as$ in  the definitions, not  just $a_1$ and $a_2$}

\begin{center}
\begin{tabular}{lcl}
  $\textit{a}$ & $::=$  & $\textit{ZERO}$\\
                  & $\mid$ & $\textit{ONE}\;\;bs$\\
                  & $\mid$ & $\textit{CHAR}\;\;bs\,c$\\
                  & $\mid$ & $\textit{ALTS}\;\;bs\,as$\\
                  & $\mid$ & $\textit{SEQ}\;\;bs\,a_1\,a_2$\\
                  & $\mid$ & $\textit{STAR}\;\;bs\,a$
\end{tabular}
\end{center}
%(in \textit{ALTS})

\noindent
where $bs$ stands for bitcodes, $a$ for \textbf{a}nnotated regular
expressions and $as$ for a list of annotated regular expressions.
The alternative constructor ($\textit{ALTS}$) has been generalised to
accept a list of annotated regular expressions rather than just two.
We will show that these bitcodes encode information about
the (POSIX) value that should be generated by the Sulzmann and Lu
algorithm.


To do lexing using annotated regular expressions, we shall first
transform the usual (un-annotated) regular expressions into annotated
regular expressions. This operation is called \emph{internalisation} and
defined as follows:

%\begin{definition}
\begin{center}
\begin{tabular}{lcl}
  $(\ZERO)^\uparrow$ & $\dn$ & $\textit{ZERO}$\\
  $(\ONE)^\uparrow$ & $\dn$ & $\textit{ONE}\,[]$\\
  $(c)^\uparrow$ & $\dn$ & $\textit{CHAR}\,[]\,c$\\
  $(r_1 + r_2)^\uparrow$ & $\dn$ &
         $\textit{ALTS}\;[]\,List((\textit{fuse}\,[\Z]\,r_1^\uparrow),\,
                            (\textit{fuse}\,[\S]\,r_2^\uparrow))$\\
  $(r_1\cdot r_2)^\uparrow$ & $\dn$ &
         $\textit{SEQ}\;[]\,r_1^\uparrow\,r_2^\uparrow$\\
  $(r^*)^\uparrow$ & $\dn$ &
         $\textit{STAR}\;[]\,r^\uparrow$\\
\end{tabular}
\end{center}
%\end{definition}

\noindent
We use up arrows here to indicate that the basic un-annotated regular
expressions are ``lifted up'' into something slightly more complex.
In the
fourth clause, $\textit{fuse}$ is an auxiliary function that helps to
attach bits to the front of an annotated regular expression. Its
definition is as follows:

\begin{center}
\begin{tabular}{lcl}
  $\textit{fuse}\;bs\,(\textit{ZERO})$ & $\dn$ & $\textit{ZERO}$\\
  $\textit{fuse}\;bs\,(\textit{ONE}\,bs')$ & $\dn$ &
     $\textit{ONE}\,(bs\,@\,bs')$\\
  $\textit{fuse}\;bs\,(\textit{CHAR}\,bs'\,c)$ & $\dn$ &
     $\textit{CHAR}\,(bs\,@\,bs')\,c$\\
  $\textit{fuse}\;bs\,(\textit{ALTS}\,bs'\,as)$ & $\dn$ &
     $\textit{ALTS}\,(bs\,@\,bs')\,as$\\
  $\textit{fuse}\;bs\,(\textit{SEQ}\,bs'\,a_1\,a_2)$ & $\dn$ &
     $\textit{SEQ}\,(bs\,@\,bs')\,a_1\,a_2$\\
  $\textit{fuse}\;bs\,(\textit{STAR}\,bs'\,a)$ & $\dn$ &
     $\textit{STAR}\,(bs\,@\,bs')\,a$
\end{tabular}
\end{center}

\noindent
After internalising the regular expression, we perform successive
derivative operations on the annotated regular expressions. This
derivative operation is the same as what we had previously for the
basic regular expressions, except that we need to take care of
the bitcodes:

 %\begin{definition}{bder}
\begin{center}
  \begin{tabular}{@{}lcl@{}}
  $(\textit{ZERO})\,\backslash c$ & $\dn$ & $\textit{ZERO}$\\
  $(\textit{ONE}\;bs)\,\backslash c$ & $\dn$ & $\textit{ZERO}$\\
  $(\textit{CHAR}\;bs\,d)\,\backslash c$ & $\dn$ &
        $\textit{if}\;c=d\; \;\textit{then}\;
         \textit{ONE}\;bs\;\textit{else}\;\textit{ZERO}$\\
  $(\textit{ALTS}\;bs\,as)\,\backslash c$ & $\dn$ &
        $\textit{ALTS}\;bs\,(as.map(\backslash c))$\\
  $(\textit{SEQ}\;bs\,a_1\,a_2)\,\backslash c$ & $\dn$ &
     $\textit{if}\;\textit{bnullable}\,a_1$\\
  & &$\textit{then}\;\textit{ALTS}\,bs\,List((\textit{SEQ}\,[]\,(a_1\,\backslash c)\,a_2),$\\
  & &$\phantom{\textit{then}\;\textit{ALTS}\,bs\,}(\textit{fuse}\,(\textit{bmkeps}\,a_1)\,(a_2\,\backslash c)))$\\
  & &$\textit{else}\;\textit{SEQ}\,bs\,(a_1\,\backslash c)\,a_2$\\
  $(\textit{STAR}\,bs\,a)\,\backslash c$ & $\dn$ &
      $\textit{SEQ}\;bs\,(\textit{fuse}\, [\Z]\, (a\,\backslash c))\,
       (\textit{STAR}\,[]\,a)$
\end{tabular}
\end{center}
%\end{definition}

\noindent
For instance, when we unfold $\textit{STAR} \; bs \; a$ into a sequence,
we need to attach an additional bit $\Z$ to the front of $a \backslash c$
to indicate that there is one more star iteration. The $\textit{SEQ}$ clause
is also more subtle when $a_1$ is $\textit{bnullable}$ (here
\textit{bnullable} is exactly the same as $\textit{nullable}$, except
that it is for annotated regular expressions; therefore we omit the
definition). Assume that $\textit{bmkeps}$ correctly extracts the bitcode for how
$a_1$ matches the string prior to character $c$ (more on this later).
Then the right branch of the $\textit{ALTS}$, which is
$\textit{fuse}\,(\textit{bmkeps}\,a_1)\,(a_2\backslash c)$, will collapse
the regular expression $a_1$ (as it has
already been fully matched) and store the lexing information at the
head of the regular expression $a_2 \backslash c$ by fusing it there. The
bitsequence $bs$, which was initially attached to the head of $\textit{SEQ}$, has
now been elevated to the top-level of $\textit{ALTS}$, as this information will be
needed whichever way the $\textit{SEQ}$ is matched---no matter whether $c$ belongs
to $a_1$ or $a_2$.
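\noindent
In the running Scala sketch, annotated regular expressions,
$\textit{fuse}$ and internalisation might look as below (hypothetical
code; a rendering of the derivative itself is deferred to the end of
this document, since it also needs $\textit{bnullable}$ and
$\textit{bmkeps}$):

\begin{verbatim}
// Annotated regular expressions; ALTS carries a list of alternatives.
abstract class ARexp
case object AZERO extends ARexp
case class AONE(bs: Bits) extends ARexp
case class ACHAR(bs: Bits, c: Char) extends ARexp
case class AALTS(bs: Bits, as: List[ARexp]) extends ARexp
case class ASEQ(bs: Bits, a1: ARexp, a2: ARexp) extends ARexp
case class ASTAR(bs: Bits, a: ARexp) extends ARexp

// fuse: attach bits to the front of an annotated regular expression.
def fuse(bs: Bits, a: ARexp): ARexp = a match {
  case AZERO => AZERO
  case AONE(bs1) => AONE(bs ++ bs1)
  case ACHAR(bs1, c) => ACHAR(bs ++ bs1, c)
  case AALTS(bs1, as) => AALTS(bs ++ bs1, as)
  case ASEQ(bs1, a1, a2) => ASEQ(bs ++ bs1, a1, a2)
  case ASTAR(bs1, a1) => ASTAR(bs ++ bs1, a1)
}

// internalise: lift an un-annotated regex into an annotated one.
def internalise(r: Rexp): ARexp = r match {
  case ZERO => AZERO
  case ONE => AONE(Nil)
  case CHAR(c) => ACHAR(Nil, c)
  case ALT(r1, r2) =>
    AALTS(Nil, List(fuse(List(Z), internalise(r1)),
                    fuse(List(S), internalise(r2))))
  case SEQ(r1, r2) => ASEQ(Nil, internalise(r1), internalise(r2))
  case STAR(r1) => ASTAR(Nil, internalise(r1))
}
\end{verbatim}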
After building these derivatives and maintaining all
the lexing information, we complete the lexing by collecting the
bitcodes using a generalised version of the $\textit{mkeps}$ function
for annotated regular expressions, called $\textit{bmkeps}$:


%\begin{definition}[\textit{bmkeps}]\mbox{}
\begin{center}
\begin{tabular}{lcl}
  $\textit{bmkeps}\,(\textit{ONE}\;bs)$ & $\dn$ & $bs$\\
  $\textit{bmkeps}\,(\textit{ALTS}\;bs\,a::as)$ & $\dn$ &
     $\textit{if}\;\textit{bnullable}\,a$\\
  & &$\textit{then}\;bs\,@\,\textit{bmkeps}\,a$\\
  & &$\textit{else}\;\textit{bmkeps}\,(\textit{ALTS}\;bs\,as)$\\
  $\textit{bmkeps}\,(\textit{SEQ}\;bs\,a_1\,a_2)$ & $\dn$ &
     $bs \,@\,\textit{bmkeps}\,a_1\,@\, \textit{bmkeps}\,a_2$\\
  $\textit{bmkeps}\,(\textit{STAR}\;bs\,a)$ & $\dn$ &
     $bs \,@\, [\S]$
\end{tabular}
\end{center}
%\end{definition}

\noindent
This function completes the value information by travelling along the
path of the regular expression that corresponds to a POSIX value and
collecting all the bitcodes, and using $\S$ to indicate the end of star
iterations. If we take the bitcodes produced by $\textit{bmkeps}$ and
decode them, we get the value we expect. The corresponding lexing
algorithm looks as follows:

\begin{center}
\begin{tabular}{lcl}
  $\textit{blexer}\;r\,s$ & $\dn$ &
      $\textit{let}\;a = (r^\uparrow)\backslash s\;\textit{in}$\\
  & & $\;\;\textit{if}\; \textit{bnullable}(a)$\\
  & & $\;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,a)\,r$\\
  & & $\;\;\textit{else}\;\textit{None}$
\end{tabular}
\end{center}

\noindent
In this definition $\_\backslash s$ is the  generalisation  of the derivative
operation from characters to strings (just like the derivatives for un-annotated
regular expressions).

The main point of the bitcodes and annotated regular expressions is that
we can apply rather aggressive (in terms of size) simplification rules
in order to keep derivatives small. We have developed such
``aggressive'' simplification rules and generated test data that show
that the expected bound can be achieved. Obviously we could only
partially cover  the search space as there are infinitely many regular
expressions and strings.

One modification we introduced is to allow a list of annotated regular
expressions in the \textit{ALTS} constructor. This allows us to not just
delete unnecessary $\ZERO$s and $\ONE$s from regular expressions, but
also unnecessary ``copies'' of regular expressions (very similar to
simplifying $r + r$ to just $r$, but in a more general setting). Another
modification is that we use simplification rules inspired by Antimirov's
work on partial derivatives. They maintain the idea that only the first
``copy'' of a regular expression in an alternative contributes to the
calculation of a POSIX value. All subsequent copies can be pruned away from
the regular expression. A recursive definition of our  simplification function
that looks somewhat similar to our Scala code is given below:
%\comment{Use $\ZERO$, $\ONE$ and so on. Is it $ALT$ or $ALTS$?}\\
\begin{center}
  \begin{tabular}{@{}lcl@{}}

  $\textit{simp} \; (\textit{SEQ}\;bs\,a_1\,a_2)$ & $\dn$ & $ (\textit{simp} \; a_1, \textit{simp}  \; a_2) \; \textit{match} $ \\
   &&$\quad\textit{case} \; (\ZERO, \_) \Rightarrow  \ZERO$ \\
   &&$\quad\textit{case} \; (\_, \ZERO) \Rightarrow  \ZERO$ \\
   &&$\quad\textit{case} \;  (\ONE, a_2') \Rightarrow  \textit{fuse} \; bs \;  a_2'$ \\
   &&$\quad\textit{case} \; (a_1', \ONE) \Rightarrow  \textit{fuse} \; bs \;  a_1'$ \\
   &&$\quad\textit{case} \; (a_1', a_2') \Rightarrow  \textit{SEQ} \; bs \; a_1' \;  a_2'$ \\

  $\textit{simp} \; (\textit{ALTS}\;bs\,as)$ & $\dn$ & $\textit{distinct}(\textit{flatten}(\textit{map} \; \textit{simp} \; as)) \; \textit{match} $ \\
  &&$\quad\textit{case} \; [] \Rightarrow  \ZERO$ \\
   &&$\quad\textit{case} \; a :: [] \Rightarrow  \textit{fuse}\;bs\;a$ \\
   &&$\quad\textit{case} \;  as' \Rightarrow  \textit{ALTS}\;bs\;as'$\\

   $\textit{simp} \; a$ & $\dn$ & $\textit{a} \qquad \textit{otherwise}$
\end{tabular}
\end{center}

\noindent
The simplification function pattern-matches on the regular expression.
When it detects that the regular expression is an alternative or a
sequence, it first simplifies its children regular expressions
recursively and then checks whether one of the children has turned into $\ZERO$ or
$\ONE$, which might trigger further simplification at the current level.
The most involved part is the $\textit{ALTS}$ clause, where we use two
auxiliary functions $\textit{flatten}$ and $\textit{distinct}$ to open up nested
$\textit{ALTS}$ and reduce as many duplicates as possible. Function
$\textit{distinct}$ keeps only the first occurring copy and removes all
later duplicates. Function $\textit{flatten}$ opens up nested \textit{ALTS}.
Its recursive definition is given below:

 \begin{center}
  \begin{tabular}{@{}lcl@{}}
  $\textit{flatten} \; ((\textit{ALTS}\;bs\,as) :: as')$ & $\dn$ & $(\textit{map} \;
     (\textit{fuse}\;bs)\; \textit{as}) \; @ \; \textit{flatten} \; as' $ \\
  $\textit{flatten} \; (\textit{ZERO} :: as')$ & $\dn$ & $ \textit{flatten} \;  as' $ \\
    $\textit{flatten} \; (a :: as')$ & $\dn$ & $a :: \textit{flatten} \; as'$ \quad(otherwise)
\end{tabular}
\end{center}

\noindent
Here $\textit{flatten}$ behaves like the traditional functional programming flatten
function, except that it also removes $\ZERO$s. In terms of regular expressions, it
removes parentheses, for example changing $a+(b+c)$ into $a+b+c$.
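\noindent
A slightly more concrete (and still hypothetical) Scala rendering of
this simplification might be the following; here \texttt{distinct} is
the standard library method that keeps first occurrences, and we omit
the clause for a $\textit{ONE}$ on the right of a sequence, since
treating its bits correctly needs more care than the pseudocode above
suggests:

\begin{verbatim}
// simp: simplify an annotated regular expression after a derivative step.
def simp(a: ARexp): ARexp = a match {
  case ASEQ(bs, a1, a2) => (simp(a1), simp(a2)) match {
    case (AZERO, _) => AZERO
    case (_, AZERO) => AZERO
    case (AONE(bs1), a2s) => fuse(bs ++ bs1, a2s)  // keep the ONE's bits too
    case (a1s, a2s) => ASEQ(bs, a1s, a2s)
  }
  case AALTS(bs, as) => flats(as.map(simp)).distinct match {
    case Nil => AZERO
    case a1 :: Nil => fuse(bs, a1)
    case as1 => AALTS(bs, as1)
  }
  case _ => a
}

// flats: open up nested ALTS and drop ZEROs (the flatten of the text).
def flats(as: List[ARexp]): List[ARexp] = as match {
  case Nil => Nil
  case AZERO :: rest => flats(rest)
  case AALTS(bs, as1) :: rest => as1.map(fuse(bs, _)) ++ flats(rest)
  case a :: rest => a :: flats(rest)
}
\end{verbatim}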
+Suppose we apply simplification after each derivative step, and view
+these two operations as an atomic one: $a \backslash_{simp}\,c \dn
+\textit{simp}(a \backslash c)$. Then we can use the natural
+extension from derivatives w.r.t.~characters to derivatives
+w.r.t.~strings:%\comment{simp in the [] case?}
+
+\begin{center}
+\begin{tabular}{lcl}
+$r \backslash_{simp} (c\!::\!s) $ & $\dn$ & $(r \backslash_{simp}\, c) \backslash_{simp}\, s$ \\
+$r \backslash_{simp} [\,] $ & $\dn$ & $r$
+\end{tabular}
+\end{center}
+
+\noindent
+With this we obtain an optimised version of the algorithm:
+
+ \begin{center}
+\begin{tabular}{lcl}
+  $\textit{blexer\_simp}\;r\,s$ & $\dn$ &
+      $\textit{let}\;a = (r^\uparrow)\backslash_{simp}\, s\;\textit{in}$\\
+  & & $\;\;\textit{if}\; \textit{bnullable}(a)$\\
+  & & $\;\;\textit{then}\;\textit{decode}\,(\textit{bmkeps}\,a)\,r$\\
+  & & $\;\;\textit{else}\;\textit{None}$
+\end{tabular}
+\end{center}
+
+\noindent
+This algorithm keeps the regular expression size small: with this
+simplification the 8000 nodes of our earlier $(a + aa)^*$ example are
+reduced to just 6, and the size stays constant no matter how long the
+input string is.
+
+
+
+\section{Current Work}
+
+We are currently engaged in two tasks related to this algorithm. The
+first task is to prove that our simplification rules do not
+affect the POSIX value that should be generated by the algorithm
+according to the specification of a POSIX value, and furthermore to obtain a
+much tighter bound on the sizes of derivatives. The result would be that our
+algorithm is correct and fast on all inputs: the catastrophic
+blow-up observed in JavaScript, Python and Java could then no longer
+happen with our algorithm. For this proof we use the theorem prover
+Isabelle. Once completed, this result will advance the state-of-the-art:
+Sulzmann and Lu wrote in their paper~\cite{Sulzmann2014} about the
+bitcoded ``incremental parsing method'' (that is the lexing algorithm
+outlined in this section):
+
+\begin{quote}\it
+  ``Correctness Claim: We further claim that the incremental parsing
+  method in Figure~5 in combination with the simplification steps in
+  Figure 6 yields POSIX parse tree [our lexical values]. We have tested this claim
+  extensively by using the method in Figure~3 as a reference but yet
+  have to work out all proof details.''
+\end{quote}
+
+\noindent
+We would like to settle this correctness claim. It is relatively
+straightforward to establish that after one simplification step, the part of a
+nullable derivative that corresponds to a POSIX value remains intact and can
+still be collected; in other words, we can show that
+%\comment{Double-check....I
+%think this is not the case}
+%\comment{If i remember correctly, you have proved this lemma.
+%I feel this is indeed not true because you might place arbitrary
+%bits on the regex r, however if this is the case, did i remember wrongly that
+%you proved something like simplification does not affect $\textit{bmkeps}$ results?
+%Anyway, i have amended this a little bit so it does not allow arbitrary bits attached
+%to a regex. Maybe it works now.}
+
+\begin{center}
+	$\textit{bmkeps} \; a = \textit{bmkeps} \; (\textit{bsimp} \; a) \qquad \textit{provided $a$ is bnullable}$
+\end{center}
+
+\noindent
+as this basically comes down to proving that actions like removing the
+additional $r$ in $r+r$ do not delete important POSIX information from
+a regular expression.
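+
+While the Isabelle proof is still in progress, this property can at
+least be checked on concrete cases. A minimal Scala sketch of such a
+check, transcribing $\textit{bnullable}$ and $\textit{bmkeps}$ from the
+previous section for the datatype above (our transcription;
+\texttt{bmkeps} is partial, exactly as in the definition, and
+\texttt{simp} plays the role of $\textit{bsimp}$):
+
+\begin{verbatim}
+def bnullable(r: ARexp): Boolean = r match {
+  case ZERO => false
+  case ONE(_) => true
+  case CHAR(_, _) => false
+  case SEQ(_, r1, r2) => bnullable(r1) && bnullable(r2)
+  case ALTS(_, rs) => rs.exists(bnullable)
+  case STAR(_, _) => true
+}
+
+// partial: only defined on bnullable regexes, as in the text
+def bmkeps(r: ARexp): List[Bit] = r match {
+  case ONE(bs) => bs
+  case ALTS(bs, a :: as) =>
+    if (bnullable(a)) bs ++ bmkeps(a) else bmkeps(ALTS(bs, as))
+  case SEQ(bs, r1, r2) => bs ++ bmkeps(r1) ++ bmkeps(r2)
+  case STAR(bs, _) => bs ++ List(S)
+}
+
+// the conjectured property as a runtime check
+def simpPreservesBmkeps(a: ARexp): Boolean =
+  !bnullable(a) || bmkeps(a) == bmkeps(simp(a))
+\end{verbatim}
+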
+The hard part of this proof is to establish that
+
+\begin{center}
+	$\textit{blexer\_simp}\;r\,s = \textit{blexer}\;r\,s$
+\end{center}
+%comment{This is not true either...look at the definion blexer/blexer-simp}
+
+\noindent That is, if we repeatedly take derivatives of a regular expression
+$r$ and simplify after each step until we have exhausted the string, we obtain
+a regular expression $r''$ ($\textit{LHS}$) that provides the same POSIX
+matching information as the regular expression $r'$ ($\textit{RHS}$) obtained
+by the plain derivative algorithm, which only takes derivatives repeatedly and
+performs no simplification at all. This might at first glance seem very
+unintuitive, as $r'$ can be exponentially larger than $r''$, but it can be
+explained in the following way: the simplification prunes away the possible
+matches that are not POSIX. Since there can be exponentially many
+non-POSIX matchings but only one POSIX matching, it
+is understandable that $r''$ can be a lot smaller and still provide
+the same POSIX value, if there is one. This is not as straightforward as the
+previous proposition, as the two regular expressions $r'$ and $r''$ might have
+become very different. The crucial point is to find the
+$\textit{POSIX}$ information of a regular expression and how it is modified,
+augmented and propagated
+during simplification, in parallel with the regular expression that
+has not been simplified in the subsequent derivative operations. To aid this,
+we use the helper function $\textit{retrieve}$ described by Sulzmann and Lu:
+\begin{center}
+\begin{tabular}{@{}l@{\hspace{2mm}}c@{\hspace{2mm}}l@{}}
+  $\textit{retrieve}\,(\textit{ONE}\,bs)\,\Empty$ & $\dn$ & $bs$\\
+  $\textit{retrieve}\,(\textit{CHAR}\,bs\,c)\,(\Char\,d)$ & $\dn$ & $bs$\\
+  $\textit{retrieve}\,(\textit{ALTS}\,bs\,a::as)\,(\Left\,v)$ & $\dn$ &
+     $bs \,@\, \textit{retrieve}\,a\,v$\\
+  $\textit{retrieve}\,(\textit{ALTS}\,bs\,a::as)\,(\Right\,v)$ & $\dn$ &
+  $bs \,@\, \textit{retrieve}\,(\textit{ALTS}\,[]\,as)\,v$\\
+  $\textit{retrieve}\,(\textit{SEQ}\,bs\,a_1\,a_2)\,(\Seq\,v_1\,v_2)$ & $\dn$ &
+     $bs \,@\,\textit{retrieve}\,a_1\,v_1\,@\, \textit{retrieve}\,a_2\,v_2$\\
+  $\textit{retrieve}\,(\textit{STAR}\,bs\,a)\,(\Stars\,[])$ & $\dn$ &
+     $bs \,@\, [\S]$\\
+  $\textit{retrieve}\,(\textit{STAR}\,bs\,a)\,(\Stars\,(v\!::\!vs))$ & $\dn$ &\\
+  \multicolumn{3}{l}{
+     \hspace{3cm}$bs \,@\, [\Z] \,@\, \textit{retrieve}\,a\,v\,@\,
+                    \textit{retrieve}\,(\textit{STAR}\,[]\,a)\,(\Stars\,vs)$}\\
+\end{tabular}
+\end{center}
+%\comment{Did not read further}\\
+This function assembles the bitcode
+%that corresponds to a lexical value for how
+%the current derivative matches the suffix of the string(the characters that
+%have not yet appeared, but will appear as the successive derivatives go on.
+%How do we get this "future" information? By the value $v$, which is
+%computed by a pass of the algorithm that uses
+%$inj$ as described in the previous section).
+using information from both the derivative regular expression and the
+value. Sulzmann and Lu proposed this function, but did not prove
+anything about it. Ausaf and Urban used it to connect the bitcoded
+algorithm to the older algorithm by the following equation:
+
+ \begin{center} $\inj\;r\;c\;v = \textit{decode} \; (\textit{retrieve}\;
+	 ((r^\uparrow)\backslash c)\;v)\;r$
+ \end{center}
+
+\noindent
+whereby $r^\uparrow$ stands for the internalised version of $r$. Ausaf
+and Urban also used this fact to prove the correctness of the bitcoded
+algorithm without simplification.
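+
+For reference, the $\textit{retrieve}$ function just given can be
+transcribed into Scala along the following lines (a sketch; the value
+datatype is the obvious one for lexical values, with \texttt{Chr} and
+\texttt{Sequ} chosen to avoid clashes with Scala's built-in
+\texttt{Char} and \texttt{Seq}):
+
+\begin{verbatim}
+// lexical values, as in the earlier sections
+abstract class Val
+case object Empty extends Val
+case class Chr(c: Char) extends Val
+case class Sequ(v1: Val, v2: Val) extends Val
+case class Left(v: Val) extends Val
+case class Right(v: Val) extends Val
+case class Stars(vs: List[Val]) extends Val
+
+// read off the bitcode of value v from the annotated regex a;
+// partial: a and v are assumed to "fit" each other
+def retrieve(a: ARexp, v: Val): List[Bit] = (a, v) match {
+  case (ONE(bs), Empty) => bs
+  case (CHAR(bs, _), Chr(_)) => bs
+  case (ALTS(bs, r :: _), Left(v1)) => bs ++ retrieve(r, v1)
+  case (ALTS(bs, _ :: rs), Right(v1)) =>
+    bs ++ retrieve(ALTS(Nil, rs), v1)
+  case (SEQ(bs, r1, r2), Sequ(v1, v2)) =>
+    bs ++ retrieve(r1, v1) ++ retrieve(r2, v2)
+  case (STAR(bs, _), Stars(Nil)) => bs ++ List(S)
+  case (STAR(bs, r), Stars(v1 :: vs)) =>
+    bs ++ List(Z) ++ retrieve(r, v1) ++
+      retrieve(STAR(Nil, r), Stars(vs))
+}
+\end{verbatim}
+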
+Our purpose of using $\textit{retrieve}$, however,
+is to establish
+
+\begin{center}
+$ \textit{retrieve} \;
+a \; v \;=\; \textit{retrieve} \; (\textit{simp}\,a) \; v'.$
+\end{center}
+The idea is that using $v'$, a simplified version of $v$ that has gone
+through the same simplification steps as $\textit{simp}(a)$, we are able
+to extract a bitcode that gives the same parsing information as the
+unsimplified one. However, we have noticed that constructing such a $v'$
+from $v$ is not so straightforward. The point of this is that we might
+be able to finally bridge the gap by proving
+
+\begin{center}
+$\textit{retrieve} \; (r^\uparrow \backslash s) \; v = \;\textit{retrieve} \;
+(\textit{simp}(r^\uparrow) \backslash s) \; v'$
+\end{center}
+
+\noindent
+and subsequently
+
+\begin{center}
+$\textit{retrieve} \; (r^\uparrow \backslash s) \; v\; = \; \textit{retrieve} \;
+(r^\uparrow \backslash_{simp} \, s) \; v'$.
+\end{center}
+
+\noindent
+The $\textit{LHS}$ of the above equation is the bitcode we want. This
+would prove that our simplified version of the regular expression still
+contains all the bitcodes needed. The task here is to find a way to
+compute the correct $v'$.
+
+The second task is to speed up our more aggressive simplification. Currently
+it is slower than the original naive simplification by Ausaf and Urban (which,
+of course, can ``explode'' in some cases). It is therefore not surprising that
+our implementation is also much slower than the regular expression engines in
+popular programming languages such as Java and Python on most inputs on which
+those engines behave linearly. For example, just rewriting the example regular
+expression from the beginning of this report, $(a^*)^*\,b$, into $a^*\,b$
+eliminates the ambiguity in the matching and makes the matching time linear
+with respect to the input string size. This allows the
+DFA approach to become blindingly fast, dwarfing the speed of our current
+implementation. For example, here is a comparison of the Java regex engine
+and our implementation on this example.
+
+\begin{center}
+\begin{tabular}{@{}c@{\hspace{0mm}}c@{\hspace{0mm}}c@{}}
+\begin{tikzpicture}
+\begin{axis}[
+    xlabel={$n*1000$},
+    x label style={at={(1.05,-0.05)}},
+    ylabel={time in secs},
+    enlargelimits=false,
+    xtick={0,5,...,30},
+    xmax=33,
+    ymax=9,
+    scaled ticks=true,
+    axis lines=left,
+    width=5cm,
+    height=4cm,
+    legend entries={Bitcoded Algorithm},
+    legend pos=north west,
+    legend cell align=left]
+\addplot[red,mark=*, mark options={fill=white}] table {bad-scala.data};
+\end{axis}
+\end{tikzpicture}
+  &
+\begin{tikzpicture}
+\begin{axis}[
+    xlabel={$n*1000$},
+    x label style={at={(1.05,-0.05)}},
+    %ylabel={time in secs},
+    enlargelimits=false,
+    xtick={0,5,...,30},
+    xmax=33,
+    ymax=9,
+    scaled ticks=false,
+    axis lines=left,
+    width=5cm,
+    height=4cm,
+    legend entries={Java},
+    legend pos=north west,
+    legend cell align=left]
+\addplot[cyan,mark=*, mark options={fill=white}] table {good-java.data};
+\end{axis}
+\end{tikzpicture}\\
+\multicolumn{3}{c}{Graphs: Runtime for matching $a^*\,b$ with strings
+           of the form $\underbrace{aa..a}_{n}$.}
+\end{tabular}
+\end{center}
+
+
+The Java regex engine can match strings of thousands of characters in a few
+milliseconds, whereas our current algorithm becomes excruciatingly slow on
+inputs of this size. In theory the running time is linear, but this does not
+appear to be the case in the actual implementation. We therefore need to
+explore how to make our algorithm faster on all inputs. It could be the
+recursive calls needed to manipulate bits that are causing the slowdown; a
+possible solution is to rewrite these recursive functions in tail-recursive
+form, as sketched below.
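+
+A minimal sketch of this rewriting for \texttt{bmkeps} (our
+illustration only: it uses an explicit worklist and an accumulator, and
+the \texttt{bnullable} from the earlier sketch, so that every recursive
+call ends up in tail position):
+
+\begin{verbatim}
+import scala.annotation.tailrec
+
+// worklist-and-accumulator version of bmkeps; every call is a
+// tail call, so the Scala compiler turns the recursion into a
+// loop (partial on the same inputs as bmkeps itself)
+@tailrec
+def bmkepsTR(todo: List[ARexp], acc: List[Bit]): List[Bit] =
+  todo match {
+    case Nil => acc
+    case ONE(bs) :: rest => bmkepsTR(rest, acc ++ bs)
+    case ALTS(bs, a :: as) :: rest =>
+      if (bnullable(a)) bmkepsTR(a :: rest, acc ++ bs)
+      else bmkepsTR(ALTS(bs, as) :: rest, acc)
+    case SEQ(bs, r1, r2) :: rest =>
+      bmkepsTR(r1 :: r2 :: rest, acc ++ bs)
+    case STAR(bs, _) :: rest => bmkepsTR(rest, acc ++ bs ++ List(S))
+  }
+
+// bmkepsTR(List(a), Nil) == bmkeps(a) for bnullable a
+\end{verbatim}
+
+\noindent
+Whether this removes the observed overhead, or whether the repeated
+appends to the accumulated bit list then dominate, remains to be
+measured.
+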
+Another possibility would be to explore
+again the connection to DFAs to speed up the algorithm on
+subcalls that are small enough. This is very much work in progress.
+
+\section{Conclusion}
+
+In this PhD-project we are interested in fast algorithms for regular
+expression matching. While this seems to be a ``settled'' area, in
+fact interesting research questions are popping up as soon as one steps
+outside the classic automata theory (for example in terms of what kind
+of regular expressions are supported). The reason why it is
+interesting for us to look at the derivative approach introduced by
+Brzozowski for regular expression matching, and then much further
+developed by Sulzmann and Lu, is that derivatives can elegantly deal
+with some of the regular expressions that are of interest in ``real
+life''. This includes the not-regular expression, written $\neg\,r$
+(that is all strings that are not recognised by $r$), but also bounded
+regular expressions such as $r^{\{n\}}$ and $r^{\{n..m\}}$. There is
+also hope that the derivatives can provide another angle for how to
+deal more efficiently with back-references, which are one of the
+reasons why regular expression engines in JavaScript, Python and Java
+choose not to implement the classic automata approach of transforming
+regular expressions into NFAs and then DFAs---because we simply do not
+know how such back-references can be represented by DFAs.
+We also plan to implement the bitcoded algorithm
+in an imperative language like C to see if the inefficiency of the
+Scala implementation
+is language-specific. To make this research more comprehensive we also plan
+to contrast our (faster) version of the bitcoded algorithm with the
+Symbolic Regex Matcher, RE2, the Rust Regex Engine, and the static
+analysis approach by implementing them in the same language and then
+comparing their performance.
+
+\bibliographystyle{plain}
+\bibliography{root,regex_time_complexity}
+
+
+
+\end{document}
diff -r 65e786a58365 -r a73b2e553804 ChengsongPhdThesis/bad-scala.data
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ChengsongPhdThesis/bad-scala.data	Wed Mar 02 23:13:59 2022 +0000
@@ -0,0 +1,34 @@
+%% LaTeX2e file `bad-scala.data'
+%% generated by the `filecontents' environment
+%% from source `mentoringMeeting' on 2022/02/22.
+%%
+1 0.048028
+2 0.126308
+3 0.169670
+4 0.264180
+5 0.384297
+6 0.612785
+7 0.817211
+8 1.089386
+9 1.363222
+10 1.644981
+11 1.987046
+12 2.416012
+13 2.920956
+14 3.398956
+15 3.739649
+16 4.329977
+17 4.932896
+18 5.633909
+19 6.436107
+20 6.927233
+21 7.441163
+22 8.242139
+23 8.901034
+24 9.680487
+25 10.496226
+26 11.385733
+27 12.247613
+28 13.990491
+29 14.245804
+30 15.206283
diff -r 65e786a58365 -r a73b2e553804 ChengsongPhdThesis/bitcoded_sulzmann.png
Binary file ChengsongPhdThesis/bitcoded_sulzmann.png has changed
diff -r 65e786a58365 -r a73b2e553804 ChengsongPhdThesis/cc-by.pdf
Binary file ChengsongPhdThesis/cc-by.pdf has changed
diff -r 65e786a58365 -r a73b2e553804 ChengsongPhdThesis/data.sty
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ChengsongPhdThesis/data.sty	Wed Mar 02 23:13:59 2022 +0000
@@ -0,0 +1,123 @@
+% The data files, written on the first run.
+ + +\begin{filecontents}{re-python2.data} +1 0.033 +5 0.036 +10 0.034 +15 0.036 +18 0.059 +19 0.084 +20 0.141 +21 0.248 +22 0.485 +23 0.878 +24 1.71 +25 3.40 +26 7.08 +27 14.12 +28 26.69 +\end{filecontents} + + +% JavaScript, example (a*)*b +\begin{filecontents}{re-js.data} +5 0.061 +10 0.061 +15 0.061 +20 0.070 +23 0.131 +25 0.308 +26 0.564 +28 1.994 +30 7.648 +31 15.881 +32 32.190 +\end{filecontents} + +% Java 8, example (a*)*b +\begin{filecontents}{re-java.data} +5 0.00298 +10 0.00418 +15 0.00996 +16 0.01710 +17 0.03492 +18 0.03303 +19 0.05084 +20 0.10177 +21 0.19960 +22 0.41159 +23 0.82234 +24 1.70251 +25 3.36112 +26 6.63998 +27 13.35120 +28 29.81185 +\end{filecontents} + +% Java, example (a*)b +\begin{filecontents}{good-java.data} +1 1.5633E-5 +2 1.299E-5 +3 1.1451E-5 +4 1.5846E-5 +5 1.9934E-5 +6 2.174E-5 +7 2.7669E-5 +8 2.8657E-5 +9 2.8161E-5 +10 2.8729E-5 +11 3.5367E-5 +12 3.701E-5 +13 3.84E-5 +14 4.1329E-5 +15 4.8116E-5 +16 5.3597E-5 +17 4.6792E-5 +18 5.8618E-5 +19 6.2078E-5 +20 6.4702E-5 +21 6.1464E-5 +22 6.4693E-5 +23 6.1667E-5 +24 7.1466E-5 +25 7.8089E-5 +26 7.4661E-5 +27 7.5628E-5 +28 8.9169E-5 +29 9.4161E-5 +30 9.8494E-5 +\end{filecontents} + +\begin{filecontents}{bad-scala.data} +1 0.048028 +2 0.126308 +3 0.169670 +4 0.264180 +5 0.384297 +6 0.612785 +7 0.817211 +8 1.089386 +9 1.363222 +10 1.644981 +11 1.987046 +12 2.416012 +13 2.920956 +14 3.398956 +15 3.739649 +16 4.329977 +17 4.932896 +18 5.633909 +19 6.436107 +20 6.927233 +21 7.441163 +22 8.242139 +23 8.901034 +24 9.680487 +25 10.496226 +26 11.385733 +27 12.247613 +28 13.990491 +29 14.245804 +30 15.206283 +\end{filecontents} \ No newline at end of file diff -r 65e786a58365 -r a73b2e553804 ChengsongPhdThesis/good-java.data --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ChengsongPhdThesis/good-java.data Wed Mar 02 23:13:59 2022 +0000 @@ -0,0 +1,34 @@ +%% LaTeX2e file `good-java.data' +%% generated by the `filecontents' environment +%% from source `mentoringMeeting' on 2022/02/22. +%% +1 1.5633E-5 +2 1.299E-5 +3 1.1451E-5 +4 1.5846E-5 +5 1.9934E-5 +6 2.174E-5 +7 2.7669E-5 +8 2.8657E-5 +9 2.8161E-5 +10 2.8729E-5 +11 3.5367E-5 +12 3.701E-5 +13 3.84E-5 +14 4.1329E-5 +15 4.8116E-5 +16 5.3597E-5 +17 4.6792E-5 +18 5.8618E-5 +19 6.2078E-5 +20 6.4702E-5 +21 6.1464E-5 +22 6.4693E-5 +23 6.1667E-5 +24 7.1466E-5 +25 7.8089E-5 +26 7.4661E-5 +27 7.5628E-5 +28 8.9169E-5 +29 9.4161E-5 +30 9.8494E-5 diff -r 65e786a58365 -r a73b2e553804 ChengsongPhdThesis/graphic.sty --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ChengsongPhdThesis/graphic.sty Wed Mar 02 23:13:59 2022 +0000 @@ -0,0 +1,12 @@ +\usepackage{tikz} +%\usepackage{pgf} +%\usetikzlibrary{positioning} +%\usetikzlibrary{calc} +%\usetikzlibrary{automata} +%\usetikzlibrary{arrows} +%\usetikzlibrary{backgrounds} +%\usetikzlibrary{fit} +%\usepackage{tikz-qtree} +\usepackage{pgfplots} + +%\pgfplotsset{compat=1.15} diff -r 65e786a58365 -r a73b2e553804 ChengsongPhdThesis/lipics-logo-bw.pdf Binary file ChengsongPhdThesis/lipics-logo-bw.pdf has changed diff -r 65e786a58365 -r a73b2e553804 ChengsongPhdThesis/lipics.cls --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ChengsongPhdThesis/lipics.cls Wed Mar 02 23:13:59 2022 +0000 @@ -0,0 +1,646 @@ +%% +%% This is file `lipics.cls', +%% generated with the docstrip utility. 
+%% +%% The original source files were: +%% +%% lipics.dtx (with options: `class') +%% +%% ----------------------------------------------------------------- +%% Author: le-tex publishing services +%% +%% This file is part of the lipics package for preparing +%% LIPICS articles. +%% +%% Copyright (C) 2010 Schloss Dagstuhl +%% ----------------------------------------------------------------- +\NeedsTeXFormat{LaTeX2e}[2005/12/01] +\ProvidesClass{lipics} + [2010/09/27 v1.1 LIPIcs articles] +\emergencystretch1em +\advance\hoffset-1in +\advance\voffset-1in +\advance\hoffset2.95mm +\newif\if@nobotseplist \@nobotseplistfalse +\def\@endparenv{% + \addpenalty\@endparpenalty\if@nobotseplist\else\addvspace\@topsepadd\fi\@endpetrue} +\def\@doendpe{% + \@endpetrue + \def\par{\@restorepar + \everypar{}% + \par + \if@nobotseplist + \addvspace\topsep + \addvspace\partopsep + \global\@nobotseplistfalse + \fi + \@endpefalse}% + \everypar{{\setbox\z@\lastbox}% + \everypar{}% + \if@nobotseplist\global\@nobotseplistfalse\fi + \@endpefalse}} +\def\enumerate{% + \ifnum \@enumdepth >\thr@@\@toodeep\else + \advance\@enumdepth\@ne + \edef\@enumctr{enum\romannumeral\the\@enumdepth}% + \expandafter + \list + \csname label\@enumctr\endcsname + {\advance\partopsep\topsep + \topsep\z@\@plus\p@ + \ifnum\@listdepth=\@ne + \labelsep0.72em + \else + \ifnum\@listdepth=\tw@ + \labelsep0.3em + \else + \labelsep0.5em + \fi + \fi + \usecounter\@enumctr\def\makelabel##1{\hss\llap{##1}}}% + \fi} +\def\endenumerate{\ifnum\@listdepth=\@ne\global\@nobotseplisttrue\fi\endlist} +\def\itemize{% + \ifnum \@itemdepth >\thr@@\@toodeep\else + \advance\@itemdepth\@ne + \edef\@itemitem{labelitem\romannumeral\the\@itemdepth}% + \expandafter + \list + \csname\@itemitem\endcsname + {\advance\partopsep\topsep + \topsep\z@\@plus\p@ + \ifnum\@listdepth=\@ne + \labelsep0.83em + \else + \ifnum\@listdepth=\tw@ + \labelsep0.75em + \else + \labelsep0.5em + \fi + \fi + \def\makelabel##1{\hss\llap{##1}}}% + \fi} +\def\enditemize{\ifnum\@listdepth=\@ne\global\@nobotseplisttrue\fi\endlist} +\def\@sect#1#2#3#4#5#6[#7]#8{% + \ifnum #2>\c@secnumdepth + \let\@svsec\@empty + \else + \refstepcounter{#1}% + \protected@edef\@svsec{\@seccntformat{#1}\relax}% + \fi + \@tempskipa #5\relax + \ifdim \@tempskipa>\z@ + \begingroup + #6{% + \@hangfrom{\hskip #3\relax + \ifnum #2=1 + \colorbox[rgb]{0.99,0.78,0.07}{\kern0.15em\@svsec\kern0.15em}\quad + \else + \@svsec\quad + \fi}% + \interlinepenalty \@M #8\@@par}% + \endgroup + \csname #1mark\endcsname{#7}% + \addcontentsline{toc}{#1}{% + \ifnum #2>\c@secnumdepth \else + \protect\numberline{\csname the#1\endcsname}% + \fi + #7}% + \else + \def\@svsechd{% + #6{\hskip #3\relax + \@svsec #8}% + \csname #1mark\endcsname{#7}% + \addcontentsline{toc}{#1}{% + \ifnum #2>\c@secnumdepth \else + \protect\numberline{\csname the#1\endcsname}% + \fi + #7}}% + \fi + \@xsect{#5}} +\def\@seccntformat#1{\csname the#1\endcsname} +\def\@biblabel#1{\textcolor{darkgray}{\sffamily\bfseries#1}} +\def\copyrightline{% + \ifx\@serieslogo\@empty + \else + \setbox\@tempboxa\hbox{\includegraphics[height=42\p@]{\@serieslogo}}% + \rlap{\hspace\textwidth\hspace{-\wd\@tempboxa}\hspace{\z@}% + \vtop to\z@{\vskip-0mm\unhbox\@tempboxa\vss}}% + \fi + \scriptsize + \vtop{\hsize\textwidth + \nobreakspace\\ + \@Copyright + \ifx\@Event\@empty\else\@Event.\\\fi + \ifx\@Editors\@empty\else + \@Eds: \@Editors + ; pp. 
\thepage--\pageref{LastPage}% + \\ + \fi + \setbox\@tempboxa\hbox{\includegraphics[height=14\p@,trim=0 15 0 0]{lipics-logo-bw}}% + \hspace*{\wd\@tempboxa}\enskip + \href{http://www.dagstuhl.de/lipics/}% + {Leibniz International Proceedings in Informatics}\\ + \smash{\unhbox\@tempboxa}\enskip + \href{http://www.dagstuhl.de}% + {Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik, Dagstuhl Publishing, Germany}}} +\def\ps@plain{\let\@mkboth\@gobbletwo + \let\@oddhead\@empty + \let\@evenhead\@empty + \let\@evenfoot\copyrightline + \let\@oddfoot\copyrightline} +\def\lipics@opterrshort{Option "\CurrentOption" not supported} +\def\lipics@opterrlong{The option "\CurrentOption" from article.cls is not supported by lipics.cls.} +\DeclareOption{a5paper}{\@latexerr{\lipics@opterrshort}{\lipics@opterrlong}} +\DeclareOption{b5paper}{\@latexerr{\lipics@opterrshort}{\lipics@opterrlong}} +\DeclareOption{legalpaper}{\@latexerr{\lipics@opterrshort}{\lipics@opterrlong}} +\DeclareOption{executivepaper}{\@latexerr{\lipics@opterrshort}{\lipics@opterrlong}} +\DeclareOption{landscape}{\@latexerr{\lipics@opterrshort}{\lipics@opterrlong}} +\DeclareOption{10pt}{\@latexerr{\lipics@opterrshort}{\lipics@opterrlong}} +\DeclareOption{11pt}{\@latexerr{\lipics@opterrshort}{\lipics@opterrlong}} +\DeclareOption{12pt}{\@latexerr{\lipics@opterrshort}{\lipics@opterrlong}} +\DeclareOption{oneside}{\@latexerr{\lipics@opterrshort}{\lipics@opterrlong}} +\DeclareOption{twoside}{\@latexerr{\lipics@opterrshort}{\lipics@opterrlong}} +\DeclareOption{titlepage}{\@latexerr{\lipics@opterrshort}{\lipics@opterrlong}} +\DeclareOption{notitlepage}{\@latexerr{\lipics@opterrshort}{\lipics@opterrlong}} +\DeclareOption{onecolumn}{\@latexerr{\lipics@opterrshort}{\lipics@opterrlong}} +\DeclareOption{twocolumn}{\@latexerr{\lipics@opterrshort}{\lipics@opterrlong}} +\DeclareOption{fleqn}{\@latexerr{\lipics@opterrshort}{\lipics@opterrlong}} +\DeclareOption{openbib}{\@latexerr{\lipics@opterrshort}{\lipics@opterrlong}} +\DeclareOption{a4paper}{\PassOptionsToClass{\CurrentOption}{article} + \advance\hoffset-2.95mm + \advance\voffset8.8mm} +\DeclareOption{numberwithinsect}{\let\numberwithinsect\relax} +\DeclareOption*{\PassOptionsToClass{\CurrentOption}{article}} +\ProcessOptions +\LoadClass[twoside,notitlepage,fleqn]{article} +\renewcommand\normalsize{% + \@setfontsize\normalsize\@xpt{13}% + \abovedisplayskip 10\p@ \@plus2\p@ \@minus5\p@ + \abovedisplayshortskip \z@ \@plus3\p@ + \belowdisplayshortskip 6\p@ \@plus3\p@ \@minus3\p@ + \belowdisplayskip \abovedisplayskip + \let\@listi\@listI} +\normalsize +\renewcommand\small{% + \@setfontsize\small\@ixpt{11.5}% + \abovedisplayskip 8.5\p@ \@plus3\p@ \@minus4\p@ + \abovedisplayshortskip \z@ \@plus2\p@ + \belowdisplayshortskip 4\p@ \@plus2\p@ \@minus2\p@ + \def\@listi{\leftmargin\leftmargini + \topsep 4\p@ \@plus2\p@ \@minus2\p@ + \parsep 2\p@ \@plus\p@ \@minus\p@ + \itemsep \parsep}% + \belowdisplayskip \abovedisplayskip +} +\renewcommand\footnotesize{% + \@setfontsize\footnotesize{8.5}{9.5}% + \abovedisplayskip 6\p@ \@plus2\p@ \@minus4\p@ + \abovedisplayshortskip \z@ \@plus\p@ + \belowdisplayshortskip 3\p@ \@plus\p@ \@minus2\p@ + \def\@listi{\leftmargin\leftmargini + \topsep 3\p@ \@plus\p@ \@minus\p@ + \parsep 2\p@ \@plus\p@ \@minus\p@ + \itemsep \parsep}% + \belowdisplayskip \abovedisplayskip +} +\renewcommand\large{\@setfontsize\large{10.5}{13}} +\renewcommand\Large{\@setfontsize\Large{12}{14}} +\setlength\parindent{1.5em} +\setlength\headheight{3mm} +\setlength\headsep {10mm} +\setlength\footskip{3mm} 
+\setlength\textwidth{140mm} +\setlength\textheight{222mm} +\setlength\oddsidemargin{32mm} +\setlength\evensidemargin{38mm} +\setlength\marginparwidth{25mm} +\setlength\topmargin{13mm} +\setlength{\skip\footins}{2\baselineskip \@plus 4\p@ \@minus 2\p@} +\def\@listi{\leftmargin\leftmargini + \parsep\z@ \@plus\p@ + \topsep 8\p@ \@plus2\p@ \@minus4\p@ + \itemsep \parsep} +\let\@listI\@listi +\@listi +\def\@listii {\leftmargin\leftmarginii + \labelwidth\leftmarginii + \advance\labelwidth-\labelsep + \topsep 4\p@ \@plus2\p@ \@minus\p@ + \parsep\z@ \@plus\p@ + \itemsep \parsep} +\def\@listiii{\leftmargin\leftmarginiii + \labelwidth\leftmarginiii + \advance\labelwidth-\labelsep + \topsep 2\p@ \@plus\p@\@minus\p@ + \parsep \z@ + \partopsep \p@ \@plus\z@ \@minus\p@ + \itemsep \z@ \@plus\p@} +\def\ps@headings{% + \def\@evenhead{\large\sffamily\bfseries + \llap{\hbox to0.5\oddsidemargin{\thepage\hss}}\leftmark\hfil}% + \def\@oddhead{\large\sffamily\bfseries\rightmark\hfil + \rlap{\hbox to0.5\oddsidemargin{\hss\thepage}}}% + \def\@oddfoot{\hfil + \rlap{% + \vtop{% + \vskip10mm + \colorbox[rgb]{0.99,0.78,0.07} + {\@tempdima\evensidemargin + \advance\@tempdima1in + \advance\@tempdima\hoffset + \hb@xt@\@tempdima{% + \textcolor{darkgray}{\normalsize\sffamily + \bfseries\quad + \expandafter\textsolittle + \expandafter{\@EventShortName}}% + \strut\hss}}}}} + \let\@evenfoot\@empty + \let\@mkboth\markboth + \let\sectionmark\@gobble + \let\subsectionmark\@gobble} +\pagestyle{headings} +\renewcommand\maketitle{\par + \begingroup + \renewcommand\thefootnote{\@fnsymbol\c@footnote}% + \if@twocolumn + \ifnum \col@number=\@ne + \@maketitle + \else + \twocolumn[\@maketitle]% + \fi + \else + \newpage + \global\@topnum\z@ % Prevents figures from going at top of page. + \@maketitle + \fi + \thispagestyle{plain}\@thanks + \endgroup + \setcounter{footnote}{0}% + \global\let\thanks\relax + \global\let\maketitle\relax + \global\let\@maketitle\relax + \global\let\@thanks\@empty + \global\let\@author\@empty + \global\let\@date\@empty + \global\let\@title\@empty + \global\let\title\relax + \global\let\author\relax + \global\let\date\relax + \global\let\and\relax +} +\newwrite\tocfile +\def\@maketitle{% + \newpage + \null\vskip-\baselineskip + \vskip-\headsep + \@titlerunning + \@authorrunning + \let \footnote \thanks + \parindent\z@ \raggedright + {\LARGE\sffamily\bfseries\mathversion{bold}\@title \par}% + \vskip 1.5em% + \ifnum\c@authors=0 % + \@latexerr{No \noexpand\author given}% + {Provide at least one author. 
See the LIPIcs class documentation.}% + \else + \@author + \fi + \bgroup + \let\footnote\@gobble + \immediate\openout\tocfile=\jobname.vtc + \protected@write\tocfile{}{% + \string\contitem + \string\title{\@title}% + \string\author{\AB@authfortoc}% + \string\page{\thepage}}% + \closeout\tocfile + \egroup + \par} +\setcounter{secnumdepth}{4} +\renewcommand\section{\@startsection {section}{1}{\z@}% + {-3.5ex \@plus -1ex \@minus -.2ex}% + {2.3ex \@plus.2ex}% + {\sffamily\Large\bfseries\raggedright}} +\renewcommand\subsection{\@startsection{subsection}{2}{\z@}% + {-3.25ex\@plus -1ex \@minus -.2ex}% + {1.5ex \@plus .2ex}% + {\sffamily\Large\bfseries\raggedright}} +\renewcommand\subsubsection{\@startsection{subsubsection}{3}{\z@}% + {-3.25ex\@plus -1ex \@minus -.2ex}% + {1.5ex \@plus .2ex}% + {\sffamily\Large\bfseries\raggedright}} +\renewcommand\paragraph{\@startsection{paragraph}{4}{\z@}% + {-3.25ex \@plus-1ex \@minus-.2ex}% + {1.5ex \@plus .2ex}% + {\sffamily\large\bfseries\raggedright}} +\renewcommand\subparagraph{\@startsection{subparagraph}{5}{\z@}% + {3.25ex \@plus1ex \@minus .2ex}% + {-1em}% + {\sffamily\normalsize\bfseries}} +\setlength\leftmargini \parindent +\setlength\leftmarginii {1.2em} +\setlength\leftmarginiii{1.2em} +\setlength\leftmarginiv {1.2em} +\setlength\leftmarginv {1.2em} +\setlength\leftmarginvi {1.2em} +\renewcommand\labelenumi{% + \textcolor{darkgray}{\sffamily\bfseries\mathversion{bold}\theenumi.}} +\renewcommand\labelenumii{% + \textcolor{darkgray}{\sffamily\bfseries\mathversion{bold}\theenumii.}} +\renewcommand\labelenumiii{% + \textcolor{darkgray}{\sffamily\bfseries\mathversion{bold}\theenumiii.}} +\renewcommand\labelenumiv{% + \textcolor{darkgray}{\sffamily\bfseries\mathversion{bold}\theenumiv.}} +\renewcommand\labelitemi{% + \textcolor[rgb]{0.6,0.6,0.61}{\ifnum\@listdepth=\@ne + \rule{0.67em}{0.33em}% + \else + \rule{0.45em}{0.225em}% + \fi}} +\renewcommand\labelitemii{% + \textcolor[rgb]{0.6,0.6,0.61}{\rule{0.45em}{0.225em}}} +\renewcommand\labelitemiii{% + \textcolor[rgb]{0.6,0.6,0.61}{\sffamily\bfseries\textasteriskcentered}} +\renewcommand\labelitemiv{% + \textcolor[rgb]{0.6,0.6,0.61}{\sffamily\bfseries\textperiodcentered}} +\renewenvironment{description} + {\list{}{\advance\partopsep\topsep\topsep\z@\@plus\p@ + \labelwidth\z@ \itemindent-\leftmargin + \let\makelabel\descriptionlabel}} + {\ifnum\@listdepth=\@ne\global\@nobotseplisttrue\fi\endlist} +\renewcommand*\descriptionlabel[1]{% + \hspace\labelsep\textcolor{darkgray}{\sffamily\bfseries\mathversion{bold}#1}} +\renewenvironment{abstract}{% + \vskip\bigskipamount + \noindent + \rlap{\color[rgb]{0.51,0.50,0.52}\vrule\@width\textwidth\@height1\p@}% + \hspace*{7mm}\fboxsep1.5mm\colorbox[rgb]{1,1,1}{\raisebox{-0.4ex}{% + \large\selectfont\sffamily\bfseries\abstractname}}% + \vskip3\p@ + \fontsize{9.5}{12.5}\selectfont + \noindent\ignorespaces} + {\ifx\@subjclass\@empty\else + \vskip\baselineskip\noindent + \subjclassHeading\@subjclass + \fi + \ifx\@keywords\@empty\else + \vskip\baselineskip\noindent + \keywordsHeading\@keywords + \fi + \ifx\@DOI\@empty\else + \vskip\baselineskip\noindent + \doiHeading\doi{\@DOI}% + \fi} +\renewenvironment{thebibliography}[1] + {\if@noskipsec \leavevmode \fi + \par + \@tempskipa-3.5ex \@plus -1ex \@minus -.2ex\relax + \@afterindenttrue + \@tempskipa -\@tempskipa \@afterindentfalse + \if@nobreak + \everypar{}% + \else + \addpenalty\@secpenalty\addvspace\@tempskipa + \fi + \noindent + \rlap{\color[rgb]{0.51,0.50,0.52}\vrule\@width\textwidth\@height1\p@}% + 
\hspace*{7mm}\fboxsep1.5mm\colorbox[rgb]{1,1,1}{\raisebox{-0.4ex}{% + \normalsize\sffamily\bfseries\refname}}% + \@xsect{1ex \@plus.2ex}% + \list{\@biblabel{\@arabic\c@enumiv}}% + {\leftmargin8.5mm + \labelsep\leftmargin + \settowidth\labelwidth{\@biblabel{#1}}% + \advance\labelsep-\labelwidth + \usecounter{enumiv}% + \let\p@enumiv\@empty + \renewcommand\theenumiv{\@arabic\c@enumiv}}% + \fontsize{9.5}{12.5}\selectfont + \sloppy + \clubpenalty4000 + \@clubpenalty \clubpenalty + \widowpenalty4000% + \sfcode`\.\@m} + {\def\@noitemerr + {\@latex@warning{Empty `thebibliography' environment}}% + \endlist} +\renewcommand\footnoterule{% + \kern-8\p@ + {\color[rgb]{0.60,0.60,0.61}\hrule\@width40mm\@height1\p@}% + \kern6.6\p@} +\renewcommand\@makefntext[1]{% + \parindent\z@\hangindent1em + \leavevmode + \hb@xt@1em{\@makefnmark\hss}#1} +\usepackage[utf8]{inputenc} +\IfFileExists{lmodern.sty}{\RequirePackage{lmodern}}{} +\RequirePackage[T1]{fontenc} +\RequirePackage{textcomp} +\RequirePackage[mathscr]{eucal} +\RequirePackage{amssymb} +\RequirePackage{soul} +\sodef\textsolittle{}{.12em}{.5em\@plus.08em\@minus.06em}% + {.4em\@plus.275em\@minus.183em} +\RequirePackage{color} +\definecolor{darkgray}{rgb}{0.31,0.31,0.33} +\RequirePackage{babel} +\RequirePackage[tbtags,fleqn]{amsmath} +\RequirePackage{amsthm} +\thm@headfont{% + \textcolor{darkgray}{$\blacktriangleright$}\nobreakspace\sffamily\bfseries} +\def\th@remark{% + \thm@headfont{% + \textcolor{darkgray}{$\blacktriangleright$}\nobreakspace\sffamily}% + \normalfont % body font + \thm@preskip\topsep \divide\thm@preskip\tw@ + \thm@postskip\thm@preskip +} +\def\@endtheorem{\endtrivlist}%\@endpefalse +\renewcommand\qedsymbol{\textcolor{darkgray}{\ensuremath{\blacktriangleleft}}} +\renewenvironment{proof}[1][\proofname]{\par + \pushQED{\qed}% + \normalfont \topsep6\p@\@plus6\p@\relax + \trivlist + \item[\hskip\labelsep + \color{darkgray}\sffamily\bfseries + #1\@addpunct{.}]\ignorespaces +}{% + \popQED\endtrivlist%\@endpefalse +} +\theoremstyle{plain} +\newtheorem{theorem}{Theorem} +\newtheorem{lemma}[theorem]{Lemma} +\newtheorem{corollary}[theorem]{Corollary} +\theoremstyle{definition} +\newtheorem{definition}[theorem]{Definition} +\newtheorem{example}[theorem]{Example} +\theoremstyle{remark} +\newtheorem*{remark}{Remark} +\ifx\numberwithinsect\relax + \@addtoreset{theorem}{section} + \edef\thetheorem{\expandafter\noexpand\thesection\@thmcountersep\@thmcounter{theorem}} +\fi +\RequirePackage{graphicx} +\RequirePackage{array} +\let\@classzold\@classz +\def\@classz{% + \expandafter\ifx\d@llarbegin\begingroup + \toks \count@ = + \expandafter{\expandafter\small\the\toks\count@}% + \fi + \@classzold} +\RequirePackage{multirow} +\RequirePackage{tabularx} +\RequirePackage[online]{threeparttable} +\def\TPTtagStyle#1{#1)} +\def\tablenotes{\small\TPT@defaults + \@ifnextchar[\TPT@setuptnotes\TPTdoTablenotes} % ] +\RequirePackage{listings} +\lstset{basicstyle=\small\ttfamily,% + backgroundcolor=\color[rgb]{0.85,0.85,0.86},% + frame=single,framerule=0pt,xleftmargin=\fboxsep,xrightmargin=\fboxsep} +\RequirePackage{lastpage} +\IfFileExists{doi.sty} + {\RequirePackage{doi}% + \renewcommand*{\doitext}{}} + {\RequirePackage{hyperref}% + \def\doi##1{##1}} +\hypersetup{pdfborder={0 0 0}} +\RequirePackage[labelsep=space,singlelinecheck=false,% + font={up,small},labelfont={sf,bf},% + listof=false]{caption}%"listof" instead of "list" for backward compatibility +\@ifpackagelater{hyperref}{2009/12/09} + {\captionsetup{compatibility=false}}%cf. 
http://groups.google.de/group/comp.text.tex/browse_thread/thread/db9310eb540fbbd8/42e30f3b7b3aa17a?lnk=raot + {} +\DeclareCaptionLabelFormat{boxed}{% + \kern0.05em{\color[rgb]{0.99,0.78,0.07}\rule{0.73em}{0.73em}}% + \hspace*{0.67em}\bothIfFirst{#1}{~}#2} +\captionsetup{labelformat=boxed} +\captionsetup[table]{position=top} +\RequirePackage[figuresright]{rotating} +\RequirePackage{subfig} +\def\titlerunning#1{\gdef\@titlerunning{{\let\footnote\@gobble\markboth{#1}{#1}}}} +\def\authorrunning#1{% + \gdef\@authorrunning{\expandafter\def\expandafter\@tempa\expandafter{#1}% + \ifx\@tempa\@empty\else\markright{#1}\fi}} +\titlerunning{\@title} +\authorrunning{\AB@authrunning} +\newcommand*\volumeinfo[6]{% + {\gdef\@Editors{#1}% + \gdef\@Eds{Editor}\ifnum #2>1 \gdef\@Eds{Editors}\fi + \gdef\@Event{#3}% + \setcounter{page}{#6}}} +\volumeinfo{}{1}{}{}{}{1} +\RequirePackage{authblk} +\renewcommand*\Authand{{ and }} +\renewcommand*\Authfont{\Large\bfseries\mathversion{bold}} +\renewcommand*\AB@authnote[1]{\textsuperscript{#1}} +\renewcommand*\AB@affilnote[1]{\protect\item[#1]} +\renewcommand*\Affilfont{\fontsize{9.5}{12}\selectfont} +\setlength\affilsep{\baselineskip} +\newcommand\AB@authrunning{} +\newcommand\AB@authfortoc{} +\renewcommand\author[2][]% + {\ifnewaffil\addtocounter{affil}{1}% + \edef\AB@thenote{\arabic{affil}}\fi + \if\relax#1\relax\def\AB@note{\AB@thenote}\else\def\AB@note{#1}% + \setcounter{Maxaffil}{0}\fi + \ifnum\value{authors}>1\relax + \@namedef{@sep\number\c@authors}{\Authsep}\fi + \addtocounter{authors}{1}% + \begingroup + \let\protect\@unexpandable@protect \let\and\AB@pand + \def\thanks{\protect\thanks}\def\footnote{\protect\footnote}% + \@temptokena=\expandafter{\AB@authors}% + {\def\\{\protect\\[\@affilsep]\protect\Affilfont + \protect\AB@resetsep}% + \xdef\AB@author{\AB@blk@and#2}% + \ifnewaffil\gdef\AB@las{}\gdef\AB@lasx{\protect\Authand}\gdef\AB@as{}% + \xdef\AB@authors{\the\@temptokena\AB@blk@and}% + \else + \xdef\AB@authors{\the\@temptokena\AB@as\AB@au@str}% + \global\let\AB@las\AB@lasx\gdef\AB@lasx{\protect\Authands}% + \gdef\AB@as{\Authsep}% + \fi + \gdef\AB@au@str{#2}}% + \@temptokena=\expandafter{\AB@authlist}% + \let\\=\authorcr + \xdef\AB@authlist{\the\@temptokena + \protect\@nameuse{@sep\number\c@authors}% + \protect\Authfont#2\AB@authnote{\AB@note}}% + %new + \@temptokena=\expandafter{\AB@authrunning}% + \let\\=\authorcr + \xdef\AB@authrunning{\the\@temptokena + \protect\@nameuse{@sep\number\c@authors}#2}% + % + %new + \@temptokena=\expandafter{\AB@authfortoc}% + \let\\=\authorcr + \xdef\AB@authfortoc{\the\@temptokena + \expandafter\noexpand\csname @sep\number\c@authors\endcsname#2}% + % + \endgroup + \ifnum\value{authors}>2\relax + \@namedef{@sep\number\c@authors}{\Authands}\fi + \newaffilfalse +} +\renewcommand\affil[2][]% + {\newaffiltrue\let\AB@blk@and\AB@pand + \if\relax#1\relax\def\AB@note{\AB@thenote}\else\def\AB@note{#1}% + \setcounter{Maxaffil}{0}\fi + \begingroup + \let\protect\@unexpandable@protect + \def\thanks{\protect\thanks}\def\footnote{\protect\footnote}% + \@temptokena=\expandafter{\AB@authors}% + {\def\\{\protect\\\protect\Affilfont}\xdef\AB@temp{#2}}% + \xdef\AB@authors{\the\@temptokena\AB@las\AB@au@str + \protect\\[\affilsep]\protect\Affilfont\AB@temp}% + \gdef\AB@las{}\gdef\AB@au@str{}% + {\xdef\AB@temp{#2}}% + \@temptokena=\expandafter{\AB@affillist}% + \xdef\AB@affillist{\the\@temptokena \AB@affilsep + \AB@affilnote{\AB@note}\protect\Affilfont\AB@temp}% + \endgroup + \let\AB@affilsep\AB@affilsepx} 
+\renewcommand\@author{\ifx\AB@affillist\AB@empty\AB@authrunning\else + \ifnum\value{affil}>\value{Maxaffil}\def\rlap##1{##1}% + \AB@authlist\\[\affilsep] + \labelwidth1.5em\labelsep\z@\leftmargini\labelwidth + \edef\@enumctr{enumi}% + \list\theenumi{\usecounter\@enumctr\def\makelabel##1{\rlap{##1}\hss}}% + \AB@affillist + \endlist + \else \AB@authors\fi\fi} +\newcommand*\Copyright[1]{% + \def\@Copyright{% + \setbox\@tempboxa\hbox{\includegraphics[height=14\p@,clip]{cc-by}}% + \hspace*{\wd\@tempboxa}\enskip\ifx#1\@empty \else \textcopyright\ #1;\\\fi + \href{http://creativecommons.org/licenses/by/3.0/}% + {\smash{\unhbox\@tempboxa}}\enskip + licensed under Creative Commons License CC-BY\\ + }} +\Copyright{\@empty} +\def\keywords#1{\def\@keywords{#1}} +\let\@keywords\@empty +\def\keywordsHeading{% + \textcolor{darkgray}{\fontsize{9.5}{12.5}\sffamily\bfseries + Keywords and phrases\enskip}} +\def\subjclass#1{\gdef\@subjclass{#1}} +\let\@subjclass\@empty +\def\subjclassHeading{% + \textcolor{darkgray}{\fontsize{9.5}{12.5}\sffamily\bfseries + 1998 ACM Subject Classification\enskip}} +\def\doiHeading{% + \textcolor{darkgray}{\fontsize{9.5}{12.5}\sffamily\bfseries + Digital Object Identifier\enskip}} +\def\serieslogo#1{\gdef\@serieslogo{#1}} +\serieslogo{} +\def\EventShortName#1{\gdef\@EventShortName{#1}} +\EventShortName{} +\def\DOI#1{\gdef\@DOI{#1}} +\DOI{} +\endinput +%% +%% End of file `lipics.cls'. diff -r 65e786a58365 -r a73b2e553804 ChengsongPhdThesis/pics/nub_filter_simp.png Binary file ChengsongPhdThesis/pics/nub_filter_simp.png has changed diff -r 65e786a58365 -r a73b2e553804 ChengsongPhdThesis/pics/regex_nfa_alt.png Binary file ChengsongPhdThesis/pics/regex_nfa_alt.png has changed diff -r 65e786a58365 -r a73b2e553804 ChengsongPhdThesis/pics/regex_nfa_base.png Binary file ChengsongPhdThesis/pics/regex_nfa_base.png has changed diff -r 65e786a58365 -r a73b2e553804 ChengsongPhdThesis/pics/regex_nfa_seq1.png Binary file ChengsongPhdThesis/pics/regex_nfa_seq1.png has changed diff -r 65e786a58365 -r a73b2e553804 ChengsongPhdThesis/pics/regex_nfa_seq2.png Binary file ChengsongPhdThesis/pics/regex_nfa_seq2.png has changed diff -r 65e786a58365 -r a73b2e553804 ChengsongPhdThesis/pics/regex_nfa_star.png Binary file ChengsongPhdThesis/pics/regex_nfa_star.png has changed diff -r 65e786a58365 -r a73b2e553804 ChengsongPhdThesis/re-java.data --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ChengsongPhdThesis/re-java.data Wed Mar 02 23:13:59 2022 +0000 @@ -0,0 +1,20 @@ +%% LaTeX2e file `re-java.data' +%% generated by the `filecontents' environment +%% from source `mentoringMeeting' on 2022/02/22. +%% +5 0.00298 +10 0.00418 +15 0.00996 +16 0.01710 +17 0.03492 +18 0.03303 +19 0.05084 +20 0.10177 +21 0.19960 +22 0.41159 +23 0.82234 +24 1.70251 +25 3.36112 +26 6.63998 +27 13.35120 +28 29.81185 diff -r 65e786a58365 -r a73b2e553804 ChengsongPhdThesis/re-js.data --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ChengsongPhdThesis/re-js.data Wed Mar 02 23:13:59 2022 +0000 @@ -0,0 +1,15 @@ +%% LaTeX2e file `re-js.data' +%% generated by the `filecontents' environment +%% from source `mentoringMeeting' on 2022/02/22. 
+%% +5 0.061 +10 0.061 +15 0.061 +20 0.070 +23 0.131 +25 0.308 +26 0.564 +28 1.994 +30 7.648 +31 15.881 +32 32.190 diff -r 65e786a58365 -r a73b2e553804 ChengsongPhdThesis/re-python2.data --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ChengsongPhdThesis/re-python2.data Wed Mar 02 23:13:59 2022 +0000 @@ -0,0 +1,19 @@ +%% LaTeX2e file `re-python2.data' +%% generated by the `filecontents' environment +%% from source `mentoringMeeting' on 2022/02/22. +%% +1 0.033 +5 0.036 +10 0.034 +15 0.036 +18 0.059 +19 0.084 +20 0.141 +21 0.248 +22 0.485 +23 0.878 +24 1.71 +25 3.40 +26 7.08 +27 14.12 +28 26.69 diff -r 65e786a58365 -r a73b2e553804 ChengsongPhdThesis/regex_time_complexity.bib --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ChengsongPhdThesis/regex_time_complexity.bib Wed Mar 02 23:13:59 2022 +0000 @@ -0,0 +1,66 @@ +%% This BibTeX bibliography file was created using BibDesk. +%% https://bibdesk.sourceforge.io/ + +%% Created for CS TAN at 2022-02-22 23:40:24 +0000 + + +%% Saved with string encoding Unicode (UTF-8) + + + +@article{SulzmannLu14, + author = {Martin Sulzmann, Kennie Zhuo Ming Lu}, + date-added = {2022-02-22 23:36:08 +0000}, + date-modified = {2022-02-22 23:40:23 +0000}, + journal = {FLOPS}, + title = {POSIX Regular Expression Parsing with Derivatives}, + year = {2014}, + bdsk-file-1 = {YnBsaXN0MDDSAQIDBFxyZWxhdGl2ZVBhdGhZYWxpYXNEYXRhXxAdcmVnZXgtcGFyc2luZy1kZXJpdmF0aXZlcy5wZGZPEQGoAAAAAAGoAAIAAAxNYWNpbnRvc2ggSEQAAAAAAAAAAAAAAAAAAAAAAAAAQkQAAf////8dcmVnZXgtcGFyc2luZy1kZXJpdmF0aXZlcy5wZGYAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA/////wAAAAAAAAAAAAAAAAABAAIAAAogY3UAAAAAAAAAAAAAAAAACXJlZ2V4X2JpYgAAAgBHLzpVc2Vyczpjc3RhbjpEcm9wYm94OldvcmtzcGFjZTpyZWdleF9iaWI6cmVnZXgtcGFyc2luZy1kZXJpdmF0aXZlcy5wZGYAAA4APAAdAHIAZQBnAGUAeAAtAHAAYQByAHMAaQBuAGcALQBkAGUAcgBpAHYAYQB0AGkAdgBlAHMALgBwAGQAZgAPABoADABNAGEAYwBpAG4AdABvAHMAaAAgAEgARAASAEVVc2Vycy9jc3Rhbi9Ecm9wYm94L1dvcmtzcGFjZS9yZWdleF9iaWIvcmVnZXgtcGFyc2luZy1kZXJpdmF0aXZlcy5wZGYAABMAAS8AABUAAgAM//8AAAAIAA0AGgAkAEQAAAAAAAACAQAAAAAAAAAFAAAAAAAAAAAAAAAAAAAB8A==}} + +@article{Thompson_1968, + author = {Ken Thompson}, + date-added = {2022-02-22 23:24:32 +0000}, + date-modified = {2022-02-22 23:24:32 +0000}, + doi = {10.1145/363347.363387}, + journal = {Communications of the {ACM}}, + month = {jun}, + number = {6}, + pages = {419--422}, + publisher = {Association for Computing Machinery ({ACM})}, + title = {Programming Techniques: Regular expression search algorithm}, + url = {https://doi.org/10.1145%2F363347.363387}, + volume = {11}, + year = 1968, + bdsk-url-1 = {https://doi.org/10.1145%2F363347.363387}, + bdsk-url-2 = {https://doi.org/10.1145/363347.363387}} + +@article{Baeza_Yates_1996, + author = {Ricardo A. Baeza-Yates and Gaston H. Gonnet}, + date-added = {2022-02-22 22:54:46 +0000}, + date-modified = {2022-02-22 22:54:46 +0000}, + doi = {10.1145/235809.235810}, + journal = {Journal of the {ACM}}, + month = {nov}, + number = {6}, + pages = {915--936}, + publisher = {Association for Computing Machinery ({ACM})}, + title = {Fast text searching for regular expressions or automaton searching on tries}, + url = {https://doi.org/10.1145%2F235809.235810}, + volume = {43}, + year = 1996, + bdsk-url-1 = {https://doi.org/10.1145%2F235809.235810}, + bdsk-url-2 = {https://doi.org/10.1145/235809.235810}} + +@article{BaezaYates96, + author = {Baeza-Yates, Ricardo A.; Gonnet, Gaston H.}, + date-added = {2022-02-22 22:43:23 +0000}, + date-modified = {2022-02-22 22:51:43 +0000}, + journal = {J. ACM}, + number = {no. 
6}, + pages = {915--936.}, + rss-description = {MR1434907 68Q25 (68P05) +Baeza-Yates, Ricardo A.; Gonnet, Gaston H. Fast text searching for regular expressions or automaton searching on tries. J. ACM 43 (1996), no. 6, 915--936.}, + title = {Fast text searching for regular expressions or automaton searching on tries.}, + volume = {43 (1996)}, + year = {1996}, + bdsk-file-1 = {YnBsaXN0MDDSAQIDBFxyZWxhdGl2ZVBhdGhZYWxpYXNEYXRhXxAmcHRyZWVzX3N1YmxpbmVhcl9yZWdleDk2QmFlemVZYXRlcy5wZGZPEQHKAAAAAAHKAAIAAAxNYWNpbnRvc2ggSEQAAAAAAAAAAAAAAAAAAAAAAAAAQkQAAf////8fcHRyZWVzX3N1YmxpbmVhcl9yI0ZGRkZGRkZGLnBkZgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA/////wAAAAAAAAAAAAAAAAABAAIAAAogY3UAAAAAAAAAAAAAAAAACXJlZ2V4X2JpYgAAAgBQLzpVc2Vyczpjc3RhbjpEcm9wYm94OldvcmtzcGFjZTpyZWdleF9iaWI6cHRyZWVzX3N1YmxpbmVhcl9yZWdleDk2QmFlemVZYXRlcy5wZGYADgBOACYAcAB0AHIAZQBlAHMAXwBzAHUAYgBsAGkAbgBlAGEAcgBfAHIAZQBnAGUAeAA5ADYAQgBhAGUAegBlAFkAYQB0AGUAcwAuAHAAZABmAA8AGgAMAE0AYQBjAGkAbgB0AG8AcwBoACAASABEABIATlVzZXJzL2NzdGFuL0Ryb3Bib3gvV29ya3NwYWNlL3JlZ2V4X2JpYi9wdHJlZXNfc3VibGluZWFyX3JlZ2V4OTZCYWV6ZVlhdGVzLnBkZgATAAEvAAAVAAIADP//AAAACAANABoAJABNAAAAAAAAAgEAAAAAAAAABQAAAAAAAAAAAAAAAAAAAhs=}} diff -r 65e786a58365 -r a73b2e553804 ChengsongPhdThesis/root.bib --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ChengsongPhdThesis/root.bib Wed Mar 02 23:13:59 2022 +0000 @@ -0,0 +1,351 @@ +%% This BibTeX bibliography file was created using BibDesk. +%% https://bibdesk.sourceforge.io/ + +%% Created for CS TAN at 2022-02-23 13:45:03 +0000 + + +%% Saved with string encoding Unicode (UTF-8) + + + +@article{Thompson_1968, + author = {Ken Thompson}, + date-added = {2022-02-23 13:44:42 +0000}, + date-modified = {2022-02-23 13:44:42 +0000}, + doi = {10.1145/363347.363387}, + journal = {Communications of the {ACM}}, + month = {jun}, + number = {6}, + pages = {419--422}, + publisher = {Association for Computing Machinery ({ACM})}, + title = {Programming Techniques: Regular expression search algorithm}, + url = {https://doi.org/10.1145%2F363347.363387}, + volume = {11}, + year = 1968, + bdsk-url-1 = {https://doi.org/10.1145%2F363347.363387}, + bdsk-url-2 = {https://doi.org/10.1145/363347.363387}} + +@article{17Bir, + author = {Asiri Rathnayake and Hayo Thielecke}, + date-added = {2019-08-18 17:57:30 +0000}, + date-modified = {2019-08-18 18:00:13 +0000}, + journal = {arXiv:1405.7058}, + title = {Static Analysis for Regular Expression Exponential Runtime via Substructural Logics}, + year = {2017}} + +@article{nielson11bcre, + author = {Lasse Nielsen, Fritz Henglein}, + date-added = {2019-07-03 21:09:39 +0000}, + date-modified = {2019-07-03 21:17:33 +0000}, + journal = {LATA}, + title = {Bit-coded Regular Expression Parsing}, + year = {2011}, + bdsk-file-1 = {YnBsaXN0MDDSAQIDBFxyZWxhdGl2ZVBhdGhZYWxpYXNEYXRhXxAyLi4vLi4vTXkgTWFjIChNYWNCb29rLVBybykvRGVza3RvcC9mcml0ei1wYXBlci5wZGZPEQF+AAAAAAF+AAIAAAxNYWNpbnRvc2ggSEQAAAAAAAAAAAAAAAAAAAAAAAAAQkQAAf////8PZnJpdHotcGFwZXIucGRmAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA/////wAAAAAAAAAAAAAAAAACAAMAAAogY3UAAAAAAAAAAAAAAAAAB0Rlc2t0b3AAAAIAQi86VXNlcnM6Y3N0YW46RHJvcGJveDpNeSBNYWMgKE1hY0Jvb2stUHJvKTpEZXNrdG9wOmZyaXR6LXBhcGVyLnBkZgAOACAADwBmAHIAaQB0AHoALQBwAGEAcABlAHIALgBwAGQAZgAPABoADABNAGEAYwBpAG4AdABvAHMAaAAgAEgARAASAEBVc2Vycy9jc3Rhbi9Ecm9wYm94L015IE1hYyAoTWFjQm9vay1Qcm8pL0Rlc2t0b3AvZnJpdHotcGFwZXIucGRmABMAAS8AABUAAgAM//8AAAAIAA0AGgAkAFkAAAAAAAACAQAAAAAAAAAFAAAAAAAAAAAAAAAAAAAB2w==}} + +@misc{SE16, + author = {StackStatus}, + date-added = {2019-06-26 11:28:41 +0000}, + date-modified = {2019-06-26 16:07:31 
+0000}, + keywords = {ReDos Attack}, + month = {July}, + rating = {5}, + read = {1}, + title = {Stack Overflow Outage Postmortem}, + url = {https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}, + year = {2016}, + bdsk-url-1 = {https://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}} + +@article{HosoyaVouillonPierce2005, + author = {H.~Hosoya and J.~Vouillon and B.~C.~Pierce}, + journal = {ACM Transactions on Programming Languages and Systems (TOPLAS)}, + number = 1, + pages = {46--90}, + title = {{R}egular {E}xpression {T}ypes for {XML}}, + volume = 27, + year = {2005}} + +@misc{POSIX, + note = {\url{http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html}}, + title = {{T}he {O}pen {G}roup {B}ase {S}pecification {I}ssue 6 {IEEE} {S}td 1003.1 2004 {E}dition}, + year = {2004}} + +@inproceedings{AusafDyckhoffUrban2016, + author = {F.~Ausaf and R.~Dyckhoff and C.~Urban}, + booktitle = {Proc.~of the 7th International Conference on Interactive Theorem Proving (ITP)}, + pages = {69--86}, + series = {LNCS}, + title = {{POSIX} {L}exing with {D}erivatives of {R}egular {E}xpressions ({P}roof {P}earl)}, + volume = {9807}, + year = {2016}} + +@article{aduAFP16, + author = {F.~Ausaf and R.~Dyckhoff and C.~Urban}, + issn = {2150-914x}, + journal = {Archive of Formal Proofs}, + note = {\url{http://www.isa-afp.org/entries/Posix-Lexing.shtml}, Formal proof development}, + title = {{POSIX} {L}exing with {D}erivatives of {R}egular {E}xpressions}, + year = 2016} + +@techreport{CrashCourse2014, + annote = {draft report}, + author = {N.~B.~B.~Grathwohl and F.~Henglein and U.~T.~Rasmussen}, + institution = {University of Copenhagen}, + title = {{A} {C}rash-{C}ourse in {R}egular {E}xpression {P}arsing and {R}egular {E}xpressions as {T}ypes}, + year = {2014}} + +@inproceedings{Sulzmann2014, + author = {M.~Sulzmann and K.~Lu}, + booktitle = {Proc.~of the 12th International Conference on Functional and Logic Programming (FLOPS)}, + pages = {203--220}, + series = {LNCS}, + title = {{POSIX} {R}egular {E}xpression {P}arsing with {D}erivatives}, + volume = {8475}, + year = {2014}} + +@inproceedings{Sulzmann2014b, + author = {M.~Sulzmann and P.~van Steenhoven}, + booktitle = {Proc.~of the 23rd International Conference on Compiler Construction (CC)}, + pages = {174--191}, + series = {LNCS}, + title = {{A} {F}lexible and {E}fficient {ML} {L}exer {T}ool {B}ased on {E}xtended {R}egular {E}xpression {S}ubmatching}, + volume = {8409}, + year = {2014}} + +@book{Pierce2015, + author = {B.~C.~Pierce and C.~Casinghino and M.~Gaboardi and M.~Greenberg and C.~Hri\c{t}cu and V.~Sj\"{o}berg and B.~Yorgey}, + note = {\url{http://www.cis.upenn.edu/~bcpierce/sf}}, + publisher = {Electronic textbook}, + title = {{S}oftware {F}oundations}, + year = {2015}} + +@misc{Kuklewicz, + author = {C.~Kuklewicz}, + howpublished = {\url{https://wiki.haskell.org/Regex_Posix}}, + title = {{R}egex {P}osix}} + +@article{Vansummeren2006, + author = {S.~Vansummeren}, + journal = {ACM Transactions on Programming Languages and Systems}, + number = {3}, + pages = {389--428}, + title = {{T}ype {I}nference for {U}nique {P}attern {M}atching}, + volume = {28}, + year = {2006}} + +@inproceedings{Asperti12, + author = {A.~Asperti}, + booktitle = {Proc.~of the 3rd International Conference on Interactive Theorem Proving (ITP)}, + pages = {283--298}, + series = {LNCS}, + title = {{A} {C}ompact {P}roof of {D}ecidability for {R}egular {E}xpression {E}quivalence}, + volume = {7406}, + year = {2012}} + 
+@inproceedings{Frisch2004, + author = {A.~Frisch and L.~Cardelli}, + booktitle = {Proc.~of the 31st International Conference on Automata, Languages and Programming (ICALP)}, + pages = {618--629}, + series = {LNCS}, + title = {{G}reedy {R}egular {E}xpression {M}atching}, + volume = {3142}, + year = {2004}} + +@article{Antimirov95, + author = {V.~Antimirov}, + journal = {Theoretical Computer Science}, + pages = {291--319}, + title = {{P}artial {D}erivatives of {R}egular {E}xpressions and {F}inite {A}utomata {C}onstructions}, + volume = {155}, + year = {1995}} + +@inproceedings{Nipkow98, + author = {T.~Nipkow}, + booktitle = {Proc.~of the 11th International Conference on Theorem Proving in Higher Order Logics (TPHOLs)}, + pages = {1--15}, + series = {LNCS}, + title = {{V}erified {L}exical {A}nalysis}, + volume = 1479, + year = 1998} + +@article{Brzozowski1964, + author = {J.~A.~Brzozowski}, + journal = {Journal of the {ACM}}, + number = {4}, + pages = {481--494}, + title = {{D}erivatives of {R}egular {E}xpressions}, + volume = {11}, + year = {1964}} + +@article{Leroy2009, + author = {X.~Leroy}, + journal = {Communications of the ACM}, + number = 7, + pages = {107--115}, + title = {{F}ormal {V}erification of a {R}ealistic {C}ompiler}, + volume = 52, + year = 2009} + +@inproceedings{Paulson2015, + author = {L.~C.~Paulson}, + booktitle = {Proc.~of the 25th International Conference on Automated Deduction (CADE)}, + pages = {231--245}, + series = {LNAI}, + title = {{A} {F}ormalisation of {F}inite {A}utomata {U}sing {H}ereditarily {F}inite {S}ets}, + volume = {9195}, + year = {2015}} + +@article{Wu2014, + author = {C.~Wu and X.~Zhang and C.~Urban}, + journal = {Journal of Automatic Reasoning}, + number = {4}, + pages = {451--480}, + title = {{A} {F}ormalisation of the {M}yhill-{N}erode {T}heorem based on {R}egular {E}xpressions}, + volume = {52}, + year = {2014}} + +@inproceedings{Regehr2011, + author = {X.~Yang and Y.~Chen and E.~Eide and J.~Regehr}, + booktitle = {Proc.~of the 32nd ACM SIGPLAN Conference on Programming Language Design and Implementation (PLDI)}, + pages = {283--294}, + title = {{F}inding and {U}nderstanding {B}ugs in {C} {C}ompilers}, + year = {2011}} + +@article{Norrish2014, + author = {A.~Barthwal and M.~Norrish}, + journal = {Journal of Computer and System Sciences}, + number = {2}, + pages = {346--362}, + title = {{A} {M}echanisation of {S}ome {C}ontext-{F}ree {L}anguage {T}heory in {HOL4}}, + volume = {80}, + year = {2014}} + +@article{Thompson1968, + author = {K.~Thompson}, + issue_date = {June 1968}, + journal = {Communications of the ACM}, + number = {6}, + pages = {419--422}, + title = {{P}rogramming {T}echniques: {R}egular {E}xpression {S}earch {A}lgorithm}, + volume = {11}, + year = {1968}} + +@article{Owens2009, + author = {S.~Owens and J.~H.~Reppy and A.~Turon}, + journal = {Journal of Functinal Programming}, + number = {2}, + pages = {173--190}, + title = {{R}egular-{E}xpression {D}erivatives {R}e-{E}xamined}, + volume = {19}, + year = {2009}} + +@inproceedings{Sulzmann2015, + author = {M.~Sulzmann and P.~Thiemann}, + booktitle = {Proc.~of the 9th International Conference on Language and Automata Theory and Applications (LATA)}, + pages = {275--286}, + series = {LNCS}, + title = {{D}erivatives for {R}egular {S}huffle {E}xpressions}, + volume = {8977}, + year = {2015}} + +@inproceedings{Chen2012, + author = {H.~Chen and S.~Yu}, + booktitle = {Proc.~in the International Workshop on Theoretical Computer Science (WTCS)}, + pages = {343--356}, + series = {LNCS}, + title 
= {{D}erivatives of {R}egular {E}xpressions and an {A}pplication}, + volume = {7160}, + year = {2012}} + +@article{Krauss2011, + author = {A.~Krauss and T.~Nipkow}, + journal = {Journal of Automated Reasoning}, + pages = {95--106}, + title = {{P}roof {P}earl: {R}egular {E}xpression {E}quivalence and {R}elation {A}lgebra}, + volume = 49, + year = 2012} + +@inproceedings{Traytel2015, + author = {D.~Traytel}, + booktitle = {Proc.~of the 24th Annual Conference on Computer Science Logic (CSL)}, + pages = {487--503}, + series = {LIPIcs}, + title = {{A} {C}oalgebraic {D}ecision {P}rocedure for {WS1S}}, + volume = {41}, + year = {2015}} + +@inproceedings{Traytel2013, + author = {D.~Traytel and T.~Nipkow}, + booktitle = {Proc.~of the 18th ACM SIGPLAN International Conference on Functional Programming (ICFP)}, + pages = {3-12}, + title = {{A} {V}erified {D}ecision {P}rocedure for {MSO} on {W}ords {B}ased on {D}erivatives of {R}egular {E}xpressions}, + year = 2013} + +@inproceedings{Coquand2012, + author = {T.~Coquand and V.~Siles}, + booktitle = {Proc.~of the 1st International Conference on Certified Programs and Proofs (CPP)}, + pages = {119--134}, + series = {LNCS}, + title = {{A} {D}ecision {P}rocedure for {R}egular {E}xpression {E}quivalence in {T}ype {T}heory}, + volume = {7086}, + year = {2011}} + +@inproceedings{Almeidaetal10, + author = {J.~B.~Almeida and N.~Moriera and D.~Pereira and S.~M.~de Sousa}, + booktitle = {Proc.~of the 15th International Conference on Implementation and Application of Automata (CIAA)}, + pages = {59-68}, + series = {LNCS}, + title = {{P}artial {D}erivative {A}utomata {F}ormalized in {C}oq}, + volume = {6482}, + year = {2010}} + +@article{Owens2008, + author = {S.~Owens and K.~Slind}, + journal = {Higher-Order and Symbolic Computation}, + number = {4}, + pages = {377--409}, + title = {{A}dapting {F}unctional {P}rograms to {H}igher {O}rder {L}ogic}, + volume = {21}, + year = {2008}} + +@article{Owens2, + author = {S.~Owens and K.~Slind}, + bibsource = {dblp computer science bibliography, http://dblp.org}, + biburl = {http://dblp.uni-trier.de/rec/bib/journals/lisp/OwensS08}, + doi = {10.1007/s10990-008-9038-0}, + journal = {Higher-Order and Symbolic Computation}, + number = {4}, + pages = {377--409}, + timestamp = {Wed, 16 Dec 2009 13:51:02 +0100}, + title = {Adapting functional programs to higher order logic}, + url = {http://dx.doi.org/10.1007/s10990-008-9038-0}, + volume = {21}, + year = {2008}, + bdsk-url-1 = {http://dx.doi.org/10.1007/s10990-008-9038-0}} + +@misc{PCRE, + title = {{PCRE - Perl Compatible Regular Expressions}}, + url = {http://www.pcre.org}, + bdsk-url-1 = {http://www.pcre.org}} + +@inproceedings{OkuiSuzuki2010, + author = {S.~Okui and T.~Suzuki}, + booktitle = {Proc.~of the 15th International Conference on Implementation and Application of Automata (CIAA)}, + pages = {231--240}, + series = {LNCS}, + title = {{D}isambiguation in {R}egular {E}xpression {M}atching via {P}osition {A}utomata with {A}ugmented {T}ransitions}, + volume = {6482}, + year = {2010}} + +@techreport{OkuiSuzukiTech, + author = {S.~Okui and T.~Suzuki}, + institution = {University of Aizu}, + title = {{D}isambiguation in {R}egular {E}xpression {M}atching via {P}osition {A}utomata with {A}ugmented {T}ransitions}, + year = {2013}} + +@inproceedings{Davis18, + author = {J.~C.~Davis and C.~.A.~Coghlan and F.~Servant and D.~Lee}, + booktitle = {Proc.~of the 26th ACM Joint Meeting on European Software Engineering Conference and Symposium on the Foundations of Software Engineering 
diff -r 65e786a58365 -r a73b2e553804 thys2/SizeBound6CT.thy
--- a/thys2/SizeBound6CT.thy	Tue Mar 01 11:14:17 2022 +0000
+++ b/thys2/SizeBound6CT.thy	Wed Mar 02 23:13:59 2022 +0000
@@ -21,6 +21,12 @@
 |"orderedPrefAux 0 ss = Nil"
 
+fun ordsuf :: "char list \<Rightarrow> char list list"
+  where
+  "ordsuf [] = []"
+| "ordsuf (x # xs) = (ordsuf xs) @ [(x # xs)]"
+
+
 fun orderedPref :: "char list \<Rightarrow> char list list"
   where
 "orderedPref s = orderedPrefAux (length s) s"
@@ -46,6 +52,26 @@
 lemma shape_of_suf_3list:
   shows "orderedSuf [c1, c2, c3] = [[c3], [c2, c3], [c1, c2, c3]]"
   by auto
+
+
+lemma ordsuf_last:
+  shows "ordsuf (xs @ [x]) = [x] # (map (\<lambda>s. s @ [x]) (ordsuf xs))"
+  apply(induct xs)
+   apply(auto)
+  done
+
+lemma ordsuf_append:
+  shows "ordsuf (s1 @ s) = (ordsuf s) @ (map (\<lambda>s11. s11 @ s) (ordsuf s1))"
+  apply(induct s1 arbitrary: s rule: rev_induct)
+   apply(simp)
+  apply(drule_tac x="[x] @ s" in meta_spec)
+  apply(simp)
+  apply(subst ordsuf_last)
+  apply(simp)
+  done
+
+
+
 (*
 lemma throwing_elem_around:
   shows "orderedSuf (s1 @ [a] @ s) = (orderedSuf s) @ (map (\<lambda>s11. s11 @ s) (orderedSuf (s1 @ [a])))"
@@ -90,10 +116,16 @@
 *)
 
+datatype cchar =
+Achar
+|Bchar
+|Cchar
+|Dchar
+
 datatype rrexp =
   RZERO
 | RONE
-| RCHAR char
+| RCHAR cchar
 | RSEQ rrexp rrexp
 | RALTS "rrexp list"
 | RSTAR rrexp
@@ -113,13 +145,19 @@
 | "rnullable (RSEQ r1 r2) = (rnullable r1 \<and> rnullable r2)"
 | "rnullable (RSTAR r) = True"
 
+fun convert_cchar_char :: "cchar \<Rightarrow> char"
+  where
+"convert_cchar_char Achar = (CHR 0x41)"
+| "convert_cchar_char Bchar = (CHR 0x42)"
+| "convert_cchar_char Cchar = (CHR 0x43)"
+| "convert_cchar_char Dchar = (CHR 0x44)"
 
 fun rder :: "char \<Rightarrow> rrexp \<Rightarrow> rrexp"
   where
  "rder c (RZERO) = RZERO"
 | "rder c (RONE) = RZERO"
-| "rder c (RCHAR d) = (if c = d then RONE else RZERO)"
+| "rder c (RCHAR d) = (if c = (convert_cchar_char d) then RONE else RZERO)"
 | "rder c (RALTS rs) = RALTS (map (rder c) rs)"
 | "rder c (RSEQ r1 r2) =
     (if rnullable r1
@@ -252,6 +290,12 @@
 "cond_list r1 r2 s = rders_cond_list r2 (nullable_bools r1 (orderedPref s)) (orderedSuf s)"
 
+fun seq_update :: "char \<Rightarrow> rrexp \<Rightarrow> char list list \<Rightarrow> char list list"
+  where
+"seq_update c r Ss = (if (rnullable r) then ([c] # (map (\<lambda>s1. s1 @ [c]) Ss)) else (map (\<lambda>s1. s1 @ [c]) Ss))"
+
+
+
 lemma rSEQ_mono:
   shows "rsize (rsimp_SEQ r1 r2) \<le> rsize (RSEQ r1 r2)"
   apply auto
@@ -378,7 +422,7 @@
   apply(case_tac r2)
      apply simp_all
   done
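+(* Editorial sketch: ordsuf collects the non-empty suffixes of a string,
+   shortest first, by direct recursion (compare orderedSuf above, which
+   goes through orderedSufAux). A small sanity check in the style of
+   shape_of_suf_3list; the lemma name and the by-simp proof are
+   editorial assumptions, not part of the original development: *)
+lemma ordsuf_shape_3list:
+  shows "ordsuf [c1, c2, c3] = [[c3], [c2, c3], [c1, c2, c3]]"
+  by simp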
-
+(*
 lemma rsimp_aalts_another:
   shows "\<forall>r \<in> (set (map rsimp ((RSEQ (rders r1 s) r2) # (rders_cond_list r2 (nullable_bools r1 (orderedPref s)) (orderedSuf s))))).
 (rsize r) < N"
@@ -424,8 +468,55 @@
   prefer 2
   apply auto
   sorry
+*)
+(*
+lemma shape_derssimpseq_onechar2:
+  shows "rders_simp (RSEQ r1 r2) [c] =
+         rsimp (RALTS ((RSEQ (rders_simp r1 [c]) r2) #
+               (rders_cond_list r2 (nullable_bools r1 (orderedPref [c])) (orderedSuf [c]))))"
+  sorry
+*)
+
+lemma shape_derssimpseq_onechar:
+  shows " (rders_simp r [c]) = (rsimp (rders r [c]))"
+  and "rders_simp (RSEQ r1 r2) [c] =
+         rsimp (RALTS ((RSEQ (rders r1 [c]) r2) #
+               (rders_cond_list r2 (nullable_bools r1 (orderedPref [c])) (orderedSuf [c]))))"
+  apply simp
+  apply(simp add: rders.simps)
+  apply(case_tac "rsimp (rder c r1) = RZERO")
+   apply auto
+  apply(case_tac "rsimp (rder c r1) = RONE")
+   apply auto
+  apply(subgoal_tac "rsimp_SEQ RONE (rsimp r2) = rsimp r2")
+   prefer 2
+  using idiot
+   apply simp
+  apply(subgoal_tac "rsimp_SEQ RONE (rsimp r2) = rsimp_ALTs (rdistinct (rflts [rsimp r2]) {})")
+   prefer 2
+   apply auto
+  apply(case_tac "rsimp r2")
+       apply auto
+  apply(subgoal_tac "rdistinct x5 {} = x5")
+   prefer 2
+  using no_further_dB_after_simp
+   apply metis
+  apply(subgoal_tac "rsimp_ALTs (rdistinct x5 {}) = rsimp_ALTs x5")
+   prefer 2
+   apply fastforce
+  apply auto
+   apply (metis no_alt_short_list_after_simp)
+  apply (case_tac "rsimp r2 = RZERO")
+   apply(subgoal_tac "rsimp_SEQ (rsimp (rder c r1)) (rsimp r2) = RZERO")
+    prefer 2
+    apply(case_tac "rsimp (rder c r1)")
+         apply auto
+  apply(subgoal_tac "rsimp_SEQ (rsimp (rder c r1)) (rsimp r2) = RSEQ (rsimp (rder c r1)) (rsimp r2)")
+   prefer 2
+   apply auto
+  sorry
 
 lemma shape_derssimpseq_onechar2:
   shows "rders_simp (RSEQ r1 r2) [c] =
@@ -433,6 +524,31 @@
          (rders_cond_list r2 (nullable_bools r1 (orderedPref [c])) (orderedSuf [c]))))"
   sorry
 
+(*
+lemma simp_helps_der_pierce_dB:
+  shows " rsimp
+            (rsimp_ALTs
+              (map (rder x)
+                (rdistinct rs {}))) =
+          rsimp (rsimp_ALTs (rdistinct (map (rder x) rs) {}))"
+
+  sorry
+*)
+(*
+lemma simp_helps_der_pierce_flts:
+  shows " rsimp
+            (rsimp_ALTs
+              (rdistinct
+                (map (rder x)
+                  (rflts rs)
+                ) {}
+              )
+            ) =
+          rsimp (rsimp_ALTs (rdistinct (rflts (map (rder x) rs)) {}))"
+
+  sorry
+
+*)
 
 lemma rders__onechar:
   shows " (rders_simp r [c]) = (rsimp (rders r [c]))"
@@ -484,29 +600,6 @@
 )"
   sorry
 
-lemma simp_helps_der_pierce_dB:
-  shows " rsimp
-            (rsimp_ALTs
-              (map (rder x)
-                (rdistinct rs {}))) =
-          rsimp (rsimp_ALTs (rdistinct (map (rder x) rs) {}))"
-
-  sorry
-
-lemma simp_helps_der_pierce_flts:
-  shows " rsimp
-            (rsimp_ALTs
-              (rdistinct
-                (map (rder x)
-                  (rflts rs)
-                ) {}
-              )
-            ) =
-          rsimp (rsimp_ALTs (rdistinct (rflts (map (rder x) rs)) {}))"
-
-  sorry
-
-
 lemma unfold_ders_simp_inside_only:
   shows " (rders_simp (RSEQ r1 r2) xs = rsimp (RALTS (RSEQ (rders_simp r1 xs) r2 # rders_cond_list r2 (nullable_bools r1 (orderedPref xs)) (orderedSuf xs))))
@@ -594,6 +687,55 @@
   sorry
+
+
+lemma seq_update_seq_ders:
+  shows "rsimp (rder c (rsimp (RALTS ((RSEQ (rders_simp r1 s) r2) #
+(map (rders_simp r2) Ss))))) =
+         rsimp (RALTS ((RSEQ (rders_simp r1 (s @ [c])) r2) #
+(map (rders_simp r2) (seq_update c (rders_simp r1 s) Ss))))"
+  sorry
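+(* Editorial sketch: seq_update is the one-character step of the closed
+   form being developed here -- extending the derivative string by c
+   shifts every recorded suffix by c, and additionally opens the fresh
+   suffix [c] exactly when the derivative of r1 taken so far is
+   nullable. A hedged concrete check (the lemma name and the by-simp
+   proof are editorial assumptions; RONE is nullable, RZERO is not): *)
+lemma seq_update_two_cases:
+  shows "seq_update c RONE [[a]] = [[c], [a, c]]"
+  and   "seq_update c RZERO [[a]] = [[a, c]]"
+  by simp_all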
+lemma seq_ders_closed_form1:
+  shows "\<exists>Ss. rders_simp (RSEQ r1 r2) [c] = rsimp (RALTS ((RSEQ (rders_simp r1 [c]) r2) #
+(map (rders_simp r2) Ss)))"
+  apply(case_tac "rnullable r1")
+   apply(subgoal_tac "rders_simp (RSEQ r1 r2) [c] =
+rsimp (RALTS ((RSEQ (rders_simp r1 [c]) r2) # (map (rders_simp r2) [[c]])))")
+    prefer 2
+    apply (simp add: rsimp_idem)
+   apply(rule_tac x = "[[c]]" in exI)
+   apply simp
+  apply(subgoal_tac "rders_simp (RSEQ r1 r2) [c] =
+rsimp (RALTS ((RSEQ (rders_simp r1 [c]) r2) # (map (rders_simp r2) [])))")
+   apply blast
+  apply(simp add: rsimp_idem)
+  sorry
+
+
+lemma seq_ders_closed_form:
+  shows "s \<noteq> [] \<Longrightarrow> \<exists>Ss. rders_simp (RSEQ r1 r2) s = rsimp (RALTS ((RSEQ (rders_simp r1 s) r2) #
+(map (rders_simp r2) Ss)))"
+  apply(induct s rule: rev_induct)
+   apply simp
+  apply(case_tac xs)
+  using seq_ders_closed_form1 apply auto[1]
+  apply(subgoal_tac "\<exists>Ss. rders_simp (RSEQ r1 r2) xs = rsimp (RALTS (RSEQ (rders_simp r1 xs) r2 # map (rders_simp r2) Ss))")
+   prefer 2
+   apply blast
+  apply(erule exE)
+  apply(rule_tac x = "seq_update x (rders_simp r1 xs) Ss" in exI)
+  apply(subst rders_simp_append)
+  apply(subgoal_tac "rders_simp (rders_simp (RSEQ r1 r2) xs) [x] = rsimp (rder x (rders_simp (RSEQ r1 r2) xs))")
+   apply(simp only:)
+   apply(subst seq_update_seq_ders)
+   apply blast
+  using rders_simp_one_char by presburger
+
+
+
 (*nullable_seq_with_list1 related *)
 lemma LHS0_def_der_alt:
   shows "rsimp (rder x (RALTS ((RSEQ (rders_simp r1 xs) r2) # (cond_list r1 r2 xs)))) =
@@ -710,32 +852,6 @@
   apply simp
   using r1r2ders_whole rders_simp_append rders_simp_one_char by presburger
 
-(*
-
-  apply(subgoal_tac " rsimp
-     (rder x
-       (rsimp_ALTs
-         (rdistinct
-           (rflts
-             (rsimp_SEQ (rsimp (rders_simp r1 xs)) (rsimp r2) #
-              map rsimp
-               (rders_cond_list r2 (nullable_bools r1 (orderedPrefAux (length xs) xs)) (orderedSufAux (length xs) xs))))
-           {}))) =
-   rsimp
-     (
-       (rsimp_ALTs
-         (map (rder x)
-           (rdistinct
-             (rflts
-               (rsimp_SEQ (rsimp (rders_simp r1 xs)) (rsimp r2) #
-                map rsimp
-                 (rders_cond_list r2 (nullable_bools r1 (orderedPrefAux (length xs) xs)) (orderedSufAux (length xs) xs))))
-             {})
-       )
-     )
-   ) ")
-  prefer 2
-*)
 lemma shape_derssimp_alts:
   shows "s \<noteq> [] \<Longrightarrow> rders_simp (RALTS rs) s = rsimp (RALTS (map (\<lambda>r. rders r s) rs))"
@@ -751,13 +867,10 @@
 |"rexp_encode (RCHAR c) = 2"
 |"rexp_encode (RSEQ r1 r2) = ( 2 ^ (rexp_encode r1)) "
 *)
-lemma finite_chars:
-  shows " \<exists>N. ( (\<forall>r \<in> (set rs). \<exists>c. r = RCHAR c) \<and> (distinct rs) \<longrightarrow> length rs < N)"
-  apply(rule_tac x = "Suc 256" in exI)
-  sorry
+
-definition all_chars :: "int \<Rightarrow> char list"
-  where "all_chars n = map char_of [0..n]"
+
+
 (*
 fun rexp_enum :: "nat \<Rightarrow> rrexp list"
   where
 "rexp_enum 0 = Nil"
 |"rexp_enum (Suc n) = [(RSEQ r1 r2). r1 \<in> set (rexp_enum i) \<and> r2 \<in> set (rexp_enum j) \<and> i + j = n]"
 *)
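+(* Editorial note: switching the alphabet from the 256-character char
+   type to the four-element cchar type keeps every size class of
+   regular expressions finite, which the enumeration below depends on;
+   the removed finite_chars lemma tried to obtain a similar bound for
+   the full character set. A hedged sanity check of the conversion
+   function (the expected output in the comment is an assumption): *)
+value "convert_cchar_char Achar"  (* expected: CHR ''A'' *)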
+context notes rev_conj_cong[fundef_cong] begin
+function (sequential) rexp_enum :: "nat \<Rightarrow> rrexp set"
+  where
+"rexp_enum 0 = {}"
+|"rexp_enum (Suc 0) = {RALTS []} \<union> {RZERO} \<union> {RONE} \<union> {(RCHAR c) |c. c \<in> {Achar, Bchar, Cchar, Dchar}}"
+|"rexp_enum (Suc n) = {(RSEQ r1 r2)|r1 r2 i j. r1 \<in> (rexp_enum i) \<and> r2 \<in> (rexp_enum j) \<and> i + j = n} \<union>
+{(RALTS (a # rs)) |a rs i j. a \<in> (rexp_enum i) \<and> (RALTS rs) \<in> (rexp_enum j) \<and> i + j = Suc n \<and> i \<le> n \<and> j \<le> n} \<union>
+{RSTAR r0 |r0. r0 \<in> (rexp_enum n)} \<union>
+(rexp_enum n)"
+  by pat_completeness auto
+termination
+  by (relation "measure size") auto
+end
+
+lemma rexp_enum_inclusion:
+  shows "(rexp_enum n) \<subseteq> (rexp_enum (Suc n))"
+  apply(induct n)
+   apply auto[1]
+  apply(simp)
+  done
+
+lemma rexp_enum_mono:
+  shows "n \<le> m \<Longrightarrow> (rexp_enum n) \<subseteq> (rexp_enum m)"
+  by (simp add: lift_Suc_mono_le rexp_enum_inclusion)
+
+lemma enum_inductive_cases:
+  shows "rsize (RSEQ r1 r2) = Suc n \<Longrightarrow> \<exists>i j. rsize r1 = i \<and> rsize r2 = j \<and> i + j = n"
+  by (metis Suc_inject rsize.simps(5))
+thm rsize.simps(5)
+
+lemma enumeration_finite:
+  shows "\<exists>Nn. card (rexp_enum n) < Nn"
+  apply(simp add: no_top_class.gt_ex)
+  done
+
+lemma size1_rexps:
+  shows "RCHAR x \<in> rexp_enum 1"
+  apply(cases x)
+     apply auto
+  done
+
+lemma non_zero_size:
+  shows "rsize r \<ge> Suc 0"
+  apply(induct r)
+  apply auto
+  done
+
+corollary size_geq1:
+  shows "rsize r \<ge> 1"
+  by (simp add: non_zero_size)
+
+
+lemma rexp_size_induct:
+  shows "\<And>N r x5 a list.
+    \<lbrakk>rsize r = Suc N; r = RALTS x5;
+     x5 = a # list\<rbrakk> \<Longrightarrow> \<exists>i j. rsize a = i \<and> rsize (RALTS list) = j \<and> i + j = Suc N \<and> i \<le> N \<and> j \<le> N"
+  apply(rule_tac x = "rsize a" in exI)
+  apply(rule_tac x = "rsize (RALTS list)" in exI)
+  apply(subgoal_tac "rsize a \<ge> 1")
+   prefer 2
+  using One_nat_def non_zero_size apply presburger
+  apply(subgoal_tac "rsize (RALTS list) \<ge> 1")
+   prefer 2
+  using size_geq1 apply blast
+  apply simp
+  done
-
-lemma finite_sized_rexp_forms_finite_set:
-  shows " \<exists>SN. ( \<forall>r.( rsize r < N \<longrightarrow> r \<in> SN)) \<and> (finite SN)"
-  apply(induct N)
+lemma rexp_enum_case3:
+  shows "N \<ge> Suc 0 \<Longrightarrow> rexp_enum (Suc N) = {(RSEQ r1 r2)|r1 r2 i j. r1 \<in> (rexp_enum i) \<and> r2 \<in> (rexp_enum j) \<and> i + j = N} \<union>
+{(RALTS (a # rs)) |a rs i j. a \<in> (rexp_enum i) \<and> (RALTS rs) \<in> (rexp_enum j) \<and> i + j = Suc N \<and> i \<le> N \<and> j \<le> N} \<union>
+{RSTAR r0 |r0. r0 \<in> (rexp_enum N)} \<union>
+(rexp_enum N)"
+  apply(case_tac N)
   apply simp
-  apply auto
-  (*\<lbrakk>\<forall>r. rsize r < N \<longrightarrow> r \<in> SN; finite SN\<rbrakk> \<Longrightarrow> \<exists>SN. (\<forall>r. rsize r < Suc N \<longrightarrow> r \<in> SN) \<and> finite SN*)
-  (* \<forall>N. \<exists>SN. (\<forall>r. rsize r < N \<longrightarrow> r \<in> SN) \<and> finite SN \<Longrightarrow> \<exists>SN. (\<forall>r. rsize r < Suc N \<longrightarrow> r \<in> SN) \<and> finite SN*)
+  apply auto
+  done
+
+
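+(* Editorial sketch: rexp_enum 1 contains exactly the seven atoms
+   RALTS [], RZERO, RONE and the four RCHARs, and a compound regex
+   lands in the class of its own size, e.g. RSEQ RONE (RCHAR Achar)
+   has size 3 and lies in rexp_enum 3 via i = j = 1 in the RSEQ clause.
+   Note also that enumeration_finite above does not by itself establish
+   finiteness of rexp_enum n: in HOL the card of an infinite set is 0,
+   so "card (rexp_enum n) < Nn" holds vacuously. The instance below is
+   editorial; it is just size1_rexps specialised: *)
+lemma "RCHAR Bchar \<in> rexp_enum 1"
+  by (rule size1_rexps)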
+lemma def_enum_alts:
+  shows "\<lbrakk>r = RALTS x5; x5 = a # list;
+          rsize a = i \<and> rsize (RALTS list) = j \<and> i + j = Suc N \<and> a \<in> (rexp_enum i) \<and> (RALTS list) \<in> (rexp_enum j)\<rbrakk>
+         \<Longrightarrow> r \<in> rexp_enum (Suc N)"
+  apply(subgoal_tac "N \<ge> 1")
+   prefer 2
+   apply (metis One_nat_def add_is_1 less_Suc0 linorder_le_less_linear non_zero_size)
+  apply(subgoal_tac "rexp_enum (Suc N) = {(RSEQ r1 r2)|r1 r2 i j. r1 \<in> (rexp_enum i) \<and> r2 \<in> (rexp_enum j) \<and> i + j = N} \<union>
+{(RALTS (a # rs)) |a rs i j. a \<in> (rexp_enum i) \<and> (RALTS rs) \<in> (rexp_enum j) \<and> i + j = Suc N \<and> i \<le> N \<and> j \<le> N} \<union>
+{RSTAR r0 |r0. r0 \<in> (rexp_enum N)} \<union>
+(rexp_enum N)")
+   prefer 2
+  using One_nat_def rexp_enum_case3 apply presburger
+  apply(subgoal_tac "i \<le> N \<and> j \<le> N")
+   prefer 2
+  using non_zero_size apply auto[1]
+  apply(subgoal_tac "r \<in> {uu.
+    \<exists>a rs i j. uu = RALTS (a # rs) \<and> a \<in> rexp_enum i \<and> RALTS rs \<in> rexp_enum j \<and> i + j = Suc N \<and> i \<le> N \<and> j \<le> N}")
+   apply auto[1]
+  apply(subgoal_tac "RALTS (a # list) \<in> {uu.
+    \<exists>a rs i j. uu = RALTS (a # rs) \<and> a \<in> rexp_enum i \<and> RALTS rs \<in> rexp_enum j \<and> i + j = Suc N \<and> i \<le> N \<and> j \<le> N}")
+   apply fastforce
+  apply(subgoal_tac "a \<in> rexp_enum i")
+   prefer 2
+   apply linarith
+  by blast
+
+lemma rexp_enum_covers:
+  shows "rsize r \<le> N \<Longrightarrow> r \<in> rexp_enum N \<and> r \<in> rexp_enum (rsize r)"
+  apply(induct N arbitrary: r)
+   apply simp
+  using rsize.elims apply auto[1]
+  apply(case_tac "rsize r \<le> N")
+  using enumeration_finite
+   apply (meson in_mono rexp_enum_inclusion)
+  apply(subgoal_tac "rsize r = Suc N")
+   prefer 2
+  using le_Suc_eq apply blast
+  apply(case_tac r)
+       prefer 5
+       apply(case_tac x5)
+        apply(subgoal_tac "rsize r = 1")
+         prefer 2
+  using hand_made_def_rlist_size rlist_size.simps(2) rsize.simps(4) apply presburger
+        apply simp
+       apply(subgoal_tac "a \<in> rexp_enum (rsize a)")
+        apply(subgoal_tac "RALTS list \<in> rexp_enum (rsize (RALTS list))")
+         apply (meson def_enum_alts rexp_size_induct)
+        apply(subgoal_tac "rsize (RALTS list) \<le> N")
+         apply(subgoal_tac "RALTS list \<in> rexp_enum N")
+          prefer 2
+          apply presburger
+  sorry
 
 lemma finite_size_finite_regx:
   shows " \<exists>l. \<forall>rs. ((\<forall>r \<in> (set rs). rsize r < N) \<and> (distinct rs) \<longrightarrow> (length rs) < l)"
+  sorry
 
 (* below probably needs to be proved concurrently *)
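+(* Editorial sketch of the counting argument intended for
+   finite_size_finite_regx: once finiteness of rexp_enum N is
+   established (e.g. by induction on N, using finiteness of the set
+   comprehensions at each step), a distinct list of regular expressions
+   all of size below N injects into rexp_enum N, so
+   Suc (card (rexp_enum N)) bounds its length. Both prerequisites are
+   made explicit as assumptions below; the lemma name and proof are
+   editorial, not part of the original development: *)
+lemma finite_size_finite_regx_sketch:
+  assumes fin: "finite (rexp_enum N)"
+    and cov: "\<And>r. rsize r < N \<Longrightarrow> r \<in> rexp_enum N"
+  shows "\<exists>l. \<forall>rs. ((\<forall>r \<in> (set rs). rsize r < N) \<and> (distinct rs) \<longrightarrow> (length rs) < l)"
+proof -
+  have bound: "length rs \<le> card (rexp_enum N)"
+    if a: "\<forall>r \<in> set rs. rsize r < N" and d: "distinct rs" for rs
+  proof -
+    (* every member of rs is one of the finitely many small regexes *)
+    have "set rs \<subseteq> rexp_enum N" using a cov by auto
+    then have "card (set rs) \<le> card (rexp_enum N)" using fin card_mono by blast
+    (* distinctness turns the cardinality bound into a length bound *)
+    then show ?thesis using d by (metis distinct_card)
+  qed
+  then have "\<forall>rs. ((\<forall>r \<in> set rs. rsize r < N) \<and> distinct rs \<longrightarrow> length rs < Suc (card (rexp_enum N)))"
+    using le_imp_less_Suc by blast
+  then show ?thesis by blast
+qed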