# HG changeset patch
# User Chengsong
# Date 1648425582 -3600
# Node ID 23818853a710f588c5dd8d7beeaed08fb82a30f6
# Parent  7a8cef3f5234a4183a0d7440ead9dc2c902a5260
thesis

diff -r 7a8cef3f5234 -r 23818853a710 ChengsongTanPhdThesis/Chapters/Chapter1.tex
--- a/ChengsongTanPhdThesis/Chapters/Chapter1.tex	Sat Mar 26 11:24:36 2022 +0000
+++ b/ChengsongTanPhdThesis/Chapters/Chapter1.tex	Mon Mar 28 00:59:42 2022 +0100
@@ -45,10 +45,10 @@
 %----------------------------------------------------------------------------------------
 %This part is about regular expressions, Brzozowski derivatives,
 %and a bit-coded lexing algorithm with proven correctness and time bounds.
-Regular expressions are widely used in modern day programming tasks.
-Be it IDE with syntax hightlighting and auto completion, 
+Regular expressions are widely used in computer science: 
+be it in IDEs with syntax hightlighting and auto completion, 
 command line tools like $\mathit{grep}$ that facilitates easy 
-processing of text by search and replace, network intrusion
+text processing , network intrusion
 detection systems that rejects suspicious traffic, or compiler
 front ends--there is always an important phase of the
 task that involves matching a regular 
@@ -197,38 +197,213 @@
 
  \section{Why are current algorithm for regexes slow?}
 
-Theoretical results say that regular expression matching
-should be linear with respect to the input. 
-You could construct
-an NFA via Thompson construction, and simulate  running it.
-This would be linear.
-Or you could determinize the NFA into a DFA, and minimize that
-DFA. Once you have the DFA, the running time is also linear, requiring just 
-one scanning pass of the input.
+%find literature/find out for yourself that REGEX->DFA on basic regexes
+%does not blow up the size
+Shouldn't regular expression matching be linear?
+How can one explain the super-linear behaviour of the 
+regex matching engines we have?
+The time cost of regex matching algorithms in general
+involve two phases: 
+the construction phase, in which the algorithm builds some  
+suitable data structure from the input regex $r$, we denote
+the time cost by $P_1(r)$.
+The lexing
+phase, when the input string $s$ is read and the data structure
+representing that regex $r$ is being operated on. We represent the time
+it takes by $P_2(r, s)$.\\
+In the case of a $\mathit{DFA}$,
+we have $P_2(r, s) = O( |s| )$,
+because we take at most $|s|$ steps, 
+and each step takes
+at most one transition--
+a deterministic-finite-automata
+by definition has at most one state active and at most one
+transition upon receiving an input symbol.
+But unfortunately in the  worst case
+$P_1(r) = O(exp^{|r|})$. An example will be given later. \\
+For $\mathit{NFA}$s, we have $P_1(r) = O(|r|)$ if we do not unfold 
+expressions like $r^n$ into $\underbrace{r \cdots r}_{\text{n copies of r}}$.
+The $P_2(r, s)$ is bounded by $|r|\cdot|s|$, if we do not backtrack.
+On the other hand, if backtracking is used, the worst-case time bound bloats
+to $|r| * 2^|s|$ .
+%on the input
+%And when calculating the time complexity of the matching algorithm,
+%we are assuming that each input reading step requires constant time.
+%which translates to that the number of 
+%states active and transitions taken each time is bounded by a
+%constant $C$.
+%But modern  regex libraries in popular language engines
+% often want to support much richer constructs than just
+% sequences and Kleene stars,
+%such as negation, intersection, 
+%bounded repetitions and back-references.
+%And de-sugaring these "extended" regular expressions 
+%into basic ones might bloat the size exponentially.
+%TODO: more reference for exponential size blowup on desugaring. 
+\subsection{Tools that uses $\mathit{DFA}$s}
+%TODO:more tools that use DFAs?
+$\mathit{LEX}$ and $\mathit{JFLEX}$ are tools
+in $C$ and $\mathit{JAVA}$ that generates $\mathit{DFA}$-based
+lexers. The user provides a set of regular expressions
+and configurations to such lexer generators, and then 
+gets an output program encoding a minimized $\mathit{DFA}$
+that can be compiled and run. 
+The good things about $\mathit{DFA}$s is that once
+generated, they are fast and stable, unlike
+backtracking algorithms. 
+However, they do not scale well with bounded repetitions.\\
 
-But modern  regex libraries in popular language engines
- often want to support richer constructs
-such as bounded repetitions and back-references
-rather than  the most basic regular expressions.
-%put in illustrations
-%a{1}{3}
-These make a DFA construction impossible because
-of an exponential states explosion.
- They also want to support lexing rather than just matching
- for tasks that involves text processing.
- 
- Existing regex libraries either pose restrictions on the user input, or does 
- not give linear running time guarantee.
+Bounded repetitions, usually written in the form
+$r^{\{c\}}$ (where $c$ is a constant natural number),
+denotes a regular expression accepting strings
+that can be divided into $c$ substrings, where each 
+substring is in $r$. 
+For the regular expression $(a|b)^*a(a|b)^{\{2\}}$,
+an $\mathit{NFA}$ describing it would look like:
+\begin{center}
+\begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto] 
+   \node[state,initial] (q_0)   {$q_0$}; 
+   \node[state, red] (q_1) [right=of q_0] {$q_1$}; 
+   \node[state, red] (q_2) [right=of q_1] {$q_2$}; 
+   \node[state, accepting, red](q_3) [right=of q_2] {$q_3$};
+    \path[->] 
+    (q_0) edge  node {a} (q_1)
+    	  edge [loop below] node {a,b} ()
+    (q_1) edge  node  {a,b} (q_2)
+    (q_2) edge  node  {a,b} (q_3);
+\end{tikzpicture}
+\end{center}
+The red states are "countdown states" which counts down 
+the number of characters needed in addition to the current
+string to make a successful match.
+For example, state $q_1$ indicates a match that has
+gone past the $(a|b)^*$ part of $(a|b)^*a(a|b)^{\{2\}}$,
+and just consumed the "delimiter" $a$ in the middle, and 
+need to match 2 more iterations of $(a|b)$ to complete.
+State $q_2$ on the other hand, can be viewed as a state
+after $q_1$ has consumed 1 character, and just waits
+for 1 more character to complete.
+$q_3$ is the last state, requiring 0 more character and is accepting.
+Depending on the suffix of the
+input string up to the current read location,
+the states $q_1$ and $q_2$, $q_3$
+may or may
+not be active, independent from each other.
+A $\mathit{DFA}$ for such an $\mathit{NFA}$ would
+contain at least $2^3$ non-equivalent states that cannot be merged, 
+because the subset construction during determinisation will generate
+all the elements in the power set $\mathit{Pow}\{q_1, q_2, q_3\}$.
+Generalizing this to regular expressions with larger
+bounded repetitions number, we have that
+regexes shaped like $r^*ar^{\{n\}}$ when converted to $\mathit{DFA}$s
+would require at least $2^{n+1}$ states, if $r$ contains
+more than 1 string.
+This is to represent all different 
+scenarios which "countdown" states are active.
+For those regexes, tools such as $\mathit{JFLEX}$ 
+would generate gigantic $\mathit{DFA}$'s or
+out of memory errors.
+For this reason, regex libraries that support 
+bounded repetitions often choose to use the $\mathit{NFA}$ 
+approach.
+\subsection{The $\mathit{NFA}$ approach to regex matching}
+One can simulate the $\mathit{NFA}$ running in two ways:
+one by keeping track of all active states after consuming 
+a character, and update that set of states iteratively.
+This can be viewed as a breadth-first-search of the $\mathit{NFA}$
+for a path terminating
+at an accepting state.
+Languages like $\mathit{Go}$ and $\mathit{Rust}$ use this
+type of $\mathit{NFA}$ simulation, and guarantees a linear runtime
+in terms of input string length.
+%TODO:try out these lexers
+The other way to use $\mathit{NFA}$ for matching is choosing  
+a single transition each time, keeping all the other options in 
+a queue or stack, and backtracking if that choice eventually 
+fails. This method, often called a  "depth-first-search", 
+is efficient in a lot of cases, but could end up
+with exponential run time.\\
+%TODO:COMPARE java python lexer speed with Rust and Go
+The reason behind backtracking algorithms in languages like
+Java and Python is that they support back-references.
+\subsection{Back References in Regex--Non-Regular part}
+If we have a regular expression like this (the sequence
+operator is omitted for brevity):
+\begin{center}
+	$r_1(r_2(r_3r_4))$
+\end{center}
+We could label sub-expressions of interest 
+by parenthesizing them and giving 
+them a number by the order in which their opening parentheses appear.
+One possible way of parenthesizing and labelling is given below:
+\begin{center}
+	$\underset{1}{(}r_1\underset{2}{(}r_2\underset{3}{(}r_3)\underset{4}{(}r_4)))$
+\end{center}
+$r_1r_2r_3r_4$, $r_1r_2r_3$, $r_3$, $r_4$ are labelled
+by 1 to 4. $1$ would refer to the entire expression 
+$(r_1(r_2(r_3)(r_4)))$, $2$ referring to $r_2(r_3)(r_4)$, etc.
+These sub-expressions are called "capturing groups".
+We can use the following syntax to denote that we want a string just matched by a 
+sub-expression (capturing group) to appear at a certain location again, 
+exactly as it was:
+\begin{center}
+$\ldots\underset{\text{i-th lparen}}{(}{r_i})\ldots 
+\underset{s_i \text{ which just matched} \;r_i}{\backslash i}$
+\end{center}
+The backslash and number $i$ are used to denote such 
+so-called "back-references".
+Let $e$ be an expression made of regular expressions 
+and back-references. $e$ contains the expression $e_i$
+as its $i$-th capturing group.
+The semantics of back-reference can be recursively
+written as:
+\begin{center}
+	\begin{tabular}{c}
+		$L ( e \cdot \backslash i) = \{s @ s_i \mid s \in L (e)\quad s_i \in L(r_i)$\\
+		$s_i\; \text{match of ($e$, $s$)'s $i$-th capturing group string}\}$
+	\end{tabular}
+\end{center}
+The concrete example
+$((a|b|c|\ldots|z)^*)\backslash 1$
+would match the string like $\mathit{bobo}$, $\mathit{weewee}$ and etc.\\
+Back-reference is a construct in the "regex" standard
+that programmers found quite useful, but not exactly 
+regular any more.
+In fact, that allows the regex construct to express 
+languages that cannot be contained in context-free
+languages either.
+For example, the back-reference $((a^*)b\backslash1 b \backslash 1$
+expresses the language $\{a^n b a^n b a^n\mid n \in \mathbb{N}\}$,
+which cannot be expressed by context-free grammars\parencite{campeanu2003formal}.
+Such a language is contained in the context-sensitive hierarchy
+of formal languages. 
+Solving the back-reference expressions matching problem
+is NP-complete\parencite{alfred2014algorithms} and a non-bactracking,
+efficient solution is not known to exist.
+%TODO:read a bit more about back reference algorithms
+It seems that languages like Java and Python made the trade-off
+to support back-references at the expense of having to backtrack,
+even in the case of regexes not involving back-references.\\
+Summing these up, we can categorise existing 
+practical regex libraries into the ones  with  linear
+time guarantees like Go and Rust, which impose restrictions
+on the user input (not allowing back-references, 
+bounded repetitions canno exceed 1000 etc.), and ones  
+ that allows the programmer much freedom, but grinds to a halt
+ in some non-negligible portion of cases.
  %TODO: give examples such as RE2 GOLANG 1000 restriction, rust no repetitions 
- For example, the Rust regex engine claims to be linear, 
- but does not support lookarounds and back-references.
- The GoLang regex library does not support over 1000 repetitions.  
- Java and Python both support back-references, but shows
-catastrophic backtracking behaviours on inputs without back-references(
-when the language is still regular).
+% For example, the Rust regex engine claims to be linear, 
+% but does not support lookarounds and back-references.
+% The GoLang regex library does not support over 1000 repetitions.  
+% Java and Python both support back-references, but shows
+%catastrophic backtracking behaviours on inputs without back-references(
+%when the language is still regular).
  %TODO: test performance of Rust on (((((a*a*)b*)b){20})*)c  baabaabababaabaaaaaaaaababaaaababababaaaabaaabaaaaaabaabaabababaababaaaaaaaaababaaaababababaaaaaaaaaaaaac
  %TODO: verify the fact Rust does not allow 1000+ reps
  %TODO: Java 17 updated graphs? Is it ok to still use Java 8 graphs?
+\section{Buggy Regex Engines} 
+
+
  Another thing about the these libraries is that there
  is no correctness guarantee.
  In some cases they either fails to generate a lexing result when there is a match,
@@ -239,9 +414,11 @@
 exponential backtracking problems, 
 but also undesired (or even buggy) outputs.
 %TODO: comment from who
-xxx commented that most regex libraries are not
+Kuklewicz\parencite{KuklewiczHaskell} commented that most regex libraries are not
 correctly implementing the POSIX (maximum-munch)
 rule of regular expression matching.
+This experience is echoed by the writer's
+tryout of a few online regex testers:
 A concrete example would be 
 the regex
 \begin{verbatim}
@@ -264,125 +441,86 @@
 
 This regex would trigger catastrophic backtracking in 
 languages like Python and Java,
-whereas it gives a correct but uninformative (non-POSIX)
+whereas it gives a non-POSIX  and uninformative 
 match in languages like Go or .NET--The match with only 
 character $c$.
 
-Backtracking or depth-first search might give us
-exponential running time, and quite a few tools 
-avoid that by determinising the $\mathit{NFA}$ 
-into a $\mathit{DFA}$ and minimizes it.
-For example, $\mathit{LEX}$ and $\mathit{JFLEX}$ are tools
-in $C$ and $\mathit{JAVA}$ that generates $\mathit{DFA}$-based
-lexers.
-However, they do not scale well with bounded repetitions.
-Bounded repetitions, usually written in the form
-$r^{\{c\}}$ (where $c$ is a constant natural number),
-denotes a regular expression accepting strings
-that can be divided into $c$ substrings, and each 
-substring is in $r$. 
-%TODO: draw example NFA.
-For the regular expression $(a|b)^*a(a|b)^{\{2\}}$,
-an $\mathit{NFA}$ describing it would look like:
+As Grathwohl\parencite{grathwohl2014crash} commented,
 \begin{center}
-\begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto] 
-   \node[state,initial] (q_0)   {$q_0$}; 
-   \node[state, red] (q_1) [right=of q_0] {$q_1$}; 
-   \node[state, red] (q_2) [right=of q_1] {$q_2$}; 
-   \node[state,accepting](q_3) [right=of q_2] {$q_3$};
-    \path[->] 
-    (q_0) edge  node {a} (q_1)
-    	  edge [loop below] node {a,b} ()
-    (q_1) edge  node  {a,b} (q_2)
-          edge [loop above] node {0} ()
-    (q_2) edge  node  {a,b} (q_3);
-\end{tikzpicture}
+	``The POSIX strategy is more complicated than the greedy because of the dependence on information about the length of matched strings in the various subexpressions.''
 \end{center}
-The red states are "counter states" which counts down 
-the number of characters needed in addition to the current
-string to make a successful match.
-For example, state $q_1$ indicates a match that has
-gone past the $(a|b)$ part of $(a|b)^*a(a|b)^{\{2\}}$,
-and just consumed the "delimiter" $a$ in the middle, and 
-need to match 2 more iterations of $a|b$ to complete.
-State $q_2$ on the other hand, can be viewed as a state
-after $q_1$ has consumed 1 character, and just waits
-for 1 more character to complete.
-Depending on the actual characters appearing in the 
-input string, the states $q_1$ and $q_2$ may or may
-not be active, independent from each other.
-A $\mathit{DFA}$ for such an $\mathit{NFA}$ would
-contain at least 4 non-equivalent states that cannot be merged, 
-because subset states indicating which of $q_1$ and $q_2$
-are active are at least four: $\phi$, $\{q_1\}$, 
-$\{q_2\}$, $\{q_1, q_2\}$.
-Generalizing this to regular expressions with larger
-bounded repetitions number, we have $r^*ar^{\{n\}}$
-in general would require at least $2^n$ states
-in a $\mathit{DFA}$. This is to represent all different 
-configurations of "countdown" states.
-For those regexes, tools such as $\mathit{JFLEX}$ 
-would generate gigantic $\mathit{DFA}$'s or even
-give out memory errors.
 
-For this reason, regex libraries that support 
-bounded repetitions often choose to use the $\mathit{NFA}$ 
-approach.
-One can simulate the $\mathit{NFA}$ running in two ways:
-one by keeping track of all active states after consuming 
-a character, and update that set of states iteratively.
-This is a breadth-first-search of the $\mathit{NFA}$.
-for a path terminating
-at an accepting state.
-Languages like $\mathit{GO}$ and $\mathit{RUST}$ use this
-type of $\mathit{NFA}$ simulation, and guarantees a linear runtime
-in terms of input string length.
-The other way to use $\mathit{NFA}$ for matching is to take 
-a single state in a path each time, and backtrack if that path
-fails. This is a depth-first-search that could end up
-with exponential run time.
-The reason behind backtracking algorithms in languages like
-Java and Python is that they support back-references.
-\subsection{Back References in Regex--Non-Regular part}
-If we label sub-expressions by parenthesizing them and give 
-them a number by the order their opening parenthesis appear,
-$\underset{1}{(}\ldots\underset{2}{(}\ldots\underset{3}{(}\ldots\underset{4}{(}\ldots)\ldots)\ldots)\ldots)$
-We can use the following syntax to denote that we want a string just matched by a 
-sub-expression to appear at a certain location again exactly:
-$(.*)\backslash 1$
-would match the string like $\mathit{bobo}$, $\mathit{weewee}$ and etc.
-
-Back-reference is a construct in the "regex" standard
-that programmers found quite useful, but not exactly 
-regular any more.
-In fact, that allows the regex construct to express 
-languages that cannot be contained in context-free
-languages
-For example, the back reference $(a^*)\backslash 1 \backslash 1$
-expresses the language $\{a^na^na^n\mid n \in N\}$,
-which cannot be expressed by context-free grammars.
-To express such a language one would need context-sensitive
-grammars, and matching/parsing for such grammars is NP-hard in 
-general.
-%TODO:cite reference for NP-hardness of CSG matching
-For such constructs, the most intuitive way of matching is
-using backtracking algorithms, and there are no known algorithms 
-that does not backtrack.
-%TODO:read a bit more about back reference algorithms
+\section{Engineering and Academic Approaches to Deal with Catastrophic Backtracking}
 
 
-
-
-\section{Our Solution--Brzozowski Derivatives}
-
-
- 
+There is also static analysis work on regular expression that
+have potential expoential behavious. Rathnayake and Thielecke 
+\parencite{Rathnayake2014StaticAF} proposed an algorithm
+that detects regular expressions triggering exponential
+behavious on backtracking matchers.
+People also developed static analysis methods for
+generating non-linear polynomial worst-time estimates
+for regexes, attack string that exploit the worst-time 
+scenario, and "attack automata" that generates
+attack strings \parencite{Weideman2017Static}.
+There are also tools to "debug" regular expressions
+that allows people to see why a match failed or was especially slow
+by showing the steps a back-tracking regex engine took\parencite{regexploit2021}.
+%TODO:also the regex101 debugger
+\section{Our Solution--Formal Specification of POSIX and Brzozowski Derivatives}
  Is it possible to have a regex lexing algorithm with proven correctness and 
  time complexity, which allows easy extensions to
   constructs like 
  bounded repetitions, negation,  lookarounds, and even back-references? 
+Building on top of Sulzmann and Lu's attempt to formalize the 
+notion of POSIX lexing rules \parencite{Sulzmann2014}, 
+Ausaf and Urban\parencite{AusafDyckhoffUrban2016} modelled
+POSIX matching as a ternary relation recursively defined in a
+natural deduction style.
+With the formally-specified rules for what a POSIX matching is,
+they designed a regex matching algorithm based on Brzozowski derivatives, and 
+proved in Isabelle/HOL that the algorithm gives correct results.
+
  
- We propose Brzozowski's derivatives as a solution to this problem.
+
+
+
+%----------------------------------------------------------------------------------------
+
+\section{Our Approach}
+In the last fifteen or so years, Brzozowski's derivatives of regular
+expressions have sparked quite a bit of interest in the functional
+programming and theorem prover communities.  The beauty of
+Brzozowski's derivatives \parencite{Brzozowski1964} is that they are neatly
+expressible in any functional language, and easily definable and
+reasoned about in theorem provers---the definitions just consist of
+inductive datatypes and simple recursive functions.  Derivatives of a
+regular expression, written $r \backslash c$, give a simple solution
+to the problem of matching a string $s$ with a regular
+expression $r$: if the derivative of $r$ w.r.t.\ (in
+succession) all the characters of the string matches the empty string,
+then $r$ matches $s$ (and {\em vice versa}).  
+
+
+This work aims to address the above vulnerability by the combination
+of Brzozowski's derivatives and interactive theorem proving. We give an 
+improved version of  Sulzmann and Lu's bit-coded algorithm using 
+derivatives, which come with a formal guarantee in terms of correctness and 
+running time as an Isabelle/HOL proof.
+Then we improve the algorithm with an even stronger version of 
+simplification, and prove a time bound linear to input and
+cubic to regular expression size using a technique by
+Antimirov.
+
+\subsection{Existing Work}
+We are aware
+of a mechanised correctness proof of Brzozowski's derivative-based matcher in HOL4 by
+Owens and Slind~\parencite{Owens2008}. Another one in Isabelle/HOL is part
+of the work by Krauss and Nipkow \parencite{Krauss2011}.  And another one
+in Coq is given by Coquand and Siles \parencite{Coquand2012}.
+Also Ribeiro and Du Bois give one in Agda \parencite{RibeiroAgda2017}.
+ 
+ %We propose Brzozowski's derivatives as a solution to this problem.
  
 The main contribution of this thesis is a proven correct lexing algorithm
 with formalized time bounds.
@@ -1314,69 +1452,6 @@
 
 %----------------------------------------------------------------------------------------
 
-\section{Engineering and Academic Approaches to Deal with Catastrophic Backtracking}
-
-The reason behind is that regex libraries in popular language engines
- often want to support richer constructs
-than  the most basic regular expressions, and lexing rather than matching
-is needed for sub-match extraction.
-
-There is also static analysis work on regular expression that
-have potential expoential behavious. Rathnayake and Thielecke 
-\parencite{Rathnayake2014StaticAF} proposed an algorithm
-that detects regular expressions triggering exponential
-behavious on backtracking matchers.
-People also developed static analysis methods for
-generating non-linear polynomial worst-time estimates
-for regexes, attack string that exploit the worst-time 
-scenario, and "attack automata" that generates
-attack strings.
-For a comprehensive analysis, please refer to Weideman's thesis 
-\parencite{Weideman2017Static}.
-
-\subsection{DFA Approach}
-
-Exponential states.
-
-\subsection{NFA Approach}
-Backtracking.
-
-
-
-%----------------------------------------------------------------------------------------
-
-\section{Our Approach}
-In the last fifteen or so years, Brzozowski's derivatives of regular
-expressions have sparked quite a bit of interest in the functional
-programming and theorem prover communities.  The beauty of
-Brzozowski's derivatives \parencite{Brzozowski1964} is that they are neatly
-expressible in any functional language, and easily definable and
-reasoned about in theorem provers---the definitions just consist of
-inductive datatypes and simple recursive functions.  Derivatives of a
-regular expression, written $r \backslash c$, give a simple solution
-to the problem of matching a string $s$ with a regular
-expression $r$: if the derivative of $r$ w.r.t.\ (in
-succession) all the characters of the string matches the empty string,
-then $r$ matches $s$ (and {\em vice versa}).  
-
-
-This work aims to address the above vulnerability by the combination
-of Brzozowski's derivatives and interactive theorem proving. We give an 
-improved version of  Sulzmann and Lu's bit-coded algorithm using 
-derivatives, which come with a formal guarantee in terms of correctness and 
-running time as an Isabelle/HOL proof.
-Then we improve the algorithm with an even stronger version of 
-simplification, and prove a time bound linear to input and
-cubic to regular expression size using a technique by
-Antimirov.
-
-\subsection{Existing Work}
-We are aware
-of a mechanised correctness proof of Brzozowski's derivative-based matcher in HOL4 by
-Owens and Slind~\parencite{Owens2008}. Another one in Isabelle/HOL is part
-of the work by Krauss and Nipkow \parencite{Krauss2011}.  And another one
-in Coq is given by Coquand and Siles \parencite{Coquand2012}.
-Also Ribeiro and Du Bois give one in Agda \parencite{RibeiroAgda2017}.
 
 %----------------------------------------------------------------------------------------
 
diff -r 7a8cef3f5234 -r 23818853a710 ChengsongTanPhdThesis/example.bib
--- a/ChengsongTanPhdThesis/example.bib	Sat Mar 26 11:24:36 2022 +0000
+++ b/ChengsongTanPhdThesis/example.bib	Mon Mar 28 00:59:42 2022 +0100
@@ -57,6 +57,44 @@
 	journal = {arXiv:1405.7058},
 	title = {Static Analysis for Regular Expression Exponential Runtime via Substructural Logics},
 	year = {2017}}
+@article{campeanu2003formal,
+  title={A formal study of practical regular expressions},
+  author={C{\^a}mpeanu, Cezar and Salomaa, Kai and Yu, Sheng},
+  journal={International Journal of Foundations of Computer Science},
+  volume={14},
+  number={06},
+  pages={1007--1018},
+  year={2003},
+  publisher={World Scientific}
+}
+
+@article{alfred2014algorithms,
+  title={Algorithms for finding patterns in strings},
+  author={Alfred, V},
+  journal={Algorithms and Complexity},
+  volume={1},
+  pages={255},
+  year={2014},
+  publisher={Elsevier}
+}
+
+
+@article{CAMPEANU2009Intersect,
+title = {On the intersection of regex languages with regular languages},
+journal = {Theoretical Computer Science},
+volume = {410},
+number = {24},
+pages = {2336-2344},
+year = {2009},
+note = {Formal Languages and Applications: A Collection of Papers in Honor of Sheng Yu},
+issn = {0304-3975},
+doi = {https://doi.org/10.1016/j.tcs.2009.02.022},
+url = {https://www.sciencedirect.com/science/article/pii/S0304397509001789},
+author = {Cezar Câmpeanu and Nicolae Santean},
+keywords = {Extended regular expression, Regex automata system, Regex},
+abstract = {In this paper we revisit the semantics of extended regular expressions (regex), defined succinctly in the 90s [A.V. Aho, Algorithms for finding patterns in strings, in: Jan van Leeuwen (Ed.), Handbook of Theoretical Computer Science, in: Algorithms and Complexity, vol. A, Elsevier and MIT Press, 1990, pp. 255–300] and rigorously in 2003 by Câmpeanu, Salomaa and Yu [C. Câmpeanu, K. Salomaa, S. Yu, A formal study of practical regular expressions, IJFCS 14 (6) (2003) 1007–1018], when the authors reported an open problem, namely whether regex languages are closed under the intersection with regular languages. We give a positive answer; and for doing so, we propose a new class of machines — regex automata systems (RAS) — which are equivalent to regex. Among others, these machines provide a consistent and convenient method of implementing regex in practice. We also prove, as a consequence of this closure property, that several languages, such as the mirror language, the language of palindromes, and the language of balanced words are not regex languages.}
+}
+
 
 @article{nielson11bcre,
 	author = {Lasse Nielsen, Fritz Henglein},
@@ -77,6 +115,22 @@
 	year = {2021}
 }
 	
+@misc{KuklewiczHaskell,
+	title = {Regex Posix},
+	author = {Kuklewicz},
+	keywords = {Buggy C POSIX Lexing Libraries},
+	url = {https://wiki.haskell.org/Regex_Posix},
+	year = {2017}
+}
+
+
+@techreport{grathwohl2014crash,
+  title={A Crash-Course in Regular Expression Parsing and Regular Expressions as Types},
+  author={Grathwohl, Niels Bj{\o}rn Bugge and Henglein, Fritz and Rasmussen, Ulrik Terp},
+  year={2014},
+  institution={Technical report, University of Copenhagen}
+}
+
 @misc{SE16,
 	author = {StackStatus},
 	date-added = {2019-06-26 11:28:41 +0000},
diff -r 7a8cef3f5234 -r 23818853a710 ChengsongTanPhdThesis/main.tex
--- a/ChengsongTanPhdThesis/main.tex	Sat Mar 26 11:24:36 2022 +0000
+++ b/ChengsongTanPhdThesis/main.tex	Mon Mar 28 00:59:42 2022 +0100
@@ -35,7 +35,7 @@
 %nohyperref, % Uncomment to not load the hyperref package
 headsepline, % Uncomment to get a line under the header
 %chapterinoneline, % Uncomment to place the chapter title next to the number on one line
-%consistentlayout, % Uncomment to change the layout of the declaration, abstract and acknowledgements pages to match the default layout
+consistentlayout, % Uncomment to change the layout of the declaration, abstract and acknowledgements pages to match the default layout
 ]{MastersDoctoralThesis} % The class file specifying the document structure
 
 \usepackage[utf8]{inputenc} % Required for inputting international characters
@@ -91,7 +91,7 @@
 \keywords{} % Keywords for your thesis, this is not currently used anywhere in the template, print it elsewhere with \keywordnames
 \university{\href{https://www.kcl.ac.uk}{King's College London}} % Your university's name and URL, this is used in the title page and abstract, print it elsewhere with \univname
 \department{\href{https://www.kcl.ac.uk/informatics}{Department or Informatics}} % Your department's name and URL, this is used in the title page and abstract, print it elsewhere with \deptname
-\group{\href{https://www.kcl.ac.uk/research/ssy}{Software Systems}} % Your research group's name and URL, this is used in the title page, print it elsewhere with \groupname
+\group{\href{https://www.kcl.ac.uk/research/ssy}{Software Systems Group}} % Your research group's name and URL, this is used in the title page, print it elsewhere with \groupname
 \faculty{\href{http://faculty.university.com}{Chengsong Tan}} % Your faculty's name and URL, this is used in the title page and abstract, print it elsewhere with \facname
 
 \AtBeginDocument{
@@ -193,11 +193,7 @@
 
 \begin{abstract}
 \addchaptertocentry{\abstractname} % Add the abstract to the table of contents
-This work is a combination of functional algorithms
-and formal methods.
-Regular expression matching and lexing has been 
- widely-used and well-implemented
-in software industry. 
+This work is about regular expressions and derivatives. It combines functional algorithms and their formal verification in the Isabelle/HOL theorem prover.
 
 Theoretical results say that regular expression matching
 should be linear with respect to the input.
diff -r 7a8cef3f5234 -r 23818853a710 thys2/ClosedForms.thy
--- a/thys2/ClosedForms.thy	Sat Mar 26 11:24:36 2022 +0000
+++ b/thys2/ClosedForms.thy	Mon Mar 28 00:59:42 2022 +0100
@@ -494,6 +494,117 @@
 
 
 
+
+inductive frewrite:: "rrexp list \<Rightarrow> rrexp list \<Rightarrow> bool" ("_ \<leadsto>f _" [10, 10] 10)
+  where
+  "(RZERO # rs) \<leadsto>f rs"
+| "((RALTS rs) # rsa) \<leadsto>f (rs @ rsa)"
+| "rs1 \<leadsto>f rs2 \<Longrightarrow> (r # rs1) \<leadsto>f (r # rs2)"
+
+
+inductive 
+  frewrites:: "rrexp list \<Rightarrow> rrexp list \<Rightarrow> bool" ("_ \<leadsto>f* _" [10, 10] 10)
+where 
+  rs1[intro, simp]:"rs \<leadsto>f* rs"
+| rs2[intro]: "\<lbrakk>rs1 \<leadsto>f* rs2; rs2 \<leadsto>f rs3\<rbrakk> \<Longrightarrow> rs1 \<leadsto>f* rs3"
+
+lemma fr_in_rstar : "r1 \<leadsto>f r2 \<Longrightarrow> r1 \<leadsto>f* r2"
+  using frewrites.intros(1) frewrites.intros(2) by blast
+ 
+lemma freal_trans[trans]: 
+  assumes a1: "r1 \<leadsto>f* r2"  and a2: "r2 \<leadsto>f* r3"
+  shows "r1 \<leadsto>f* r3"
+  using a2 a1
+  apply(induct r2 r3 arbitrary: r1 rule: frewrites.induct) 
+  apply(auto)
+  done
+
+
+lemma  many_steps_later: "\<lbrakk>r1 \<leadsto>f r2; r2 \<leadsto>f* r3 \<rbrakk> \<Longrightarrow> r1 \<leadsto>f* r3"
+  by (meson fr_in_rstar freal_trans)
+
+
+lemma frewrite_append:
+  shows "\<lbrakk> rsa \<leadsto>f rsb \<rbrakk> \<Longrightarrow> rs @ rsa \<leadsto>f rs @ rsb"
+  apply(induct rs)
+  apply simp+
+  oops
+
+
+lemma frewrites_cons:
+  shows "\<lbrakk> rsa \<leadsto>f* rsb \<rbrakk> \<Longrightarrow> r # rsa \<leadsto>f* r # rsb"
+
+  oops
+
+lemma frewrites_append:
+  shows " \<lbrakk>rsa \<leadsto>f* rsb\<rbrakk> \<Longrightarrow> (rs @ rsa) \<leadsto>f* (rs @ rsb)"
+  apply(induct rsa rsb rule: frewrites.induct)
+   apply simp
+  oops
+
+
+
+lemma frewrites_concat:
+  shows "\<lbrakk>rs1 \<leadsto>f rs2; rsa \<leadsto>f* rsb \<rbrakk> \<Longrightarrow> (rs1 @ rsa) \<leadsto>f* (rs2 @ rsb)"
+  apply(induct rs1 rs2 rule: frewrite.induct)
+    apply(simp)
+  apply(subgoal_tac "(RZERO # rs @ rsa) \<leadsto>f (rs @ rsa)")
+  prefer 2
+  using frewrite.intros(1) apply blast
+    apply(subgoal_tac "(rs @ rsa) \<leadsto>f* (rs @ rsb)")
+  using many_steps_later apply blast
+
+
+
+
+
+
+
+lemma many_steps_later1:
+  shows " \<lbrakk>rs1 \<leadsto>f* rs2\<rbrakk>
+       \<Longrightarrow> (RONE # rs1) \<leadsto>f* (RONE # rs2)"
+  oops
+
+lemma early_late_der_frewrites:
+  shows "map (rder x) (rflts rs) \<leadsto>f* rflts (map (rder x) rs)"
+  apply(induct rs)
+   apply simp
+  apply(case_tac a)
+       apply simp+
+  using frewrite.intros(1) many_steps_later apply blast
+     apply(case_tac "x = x3")
+     apply simp
+  
+
+
+
+
+fun alt_set:: "rrexp \<Rightarrow> rrexp set"
+  where
+  "alt_set (RALTS rs) = set rs"
+| "alt_set r = {r}"
+
+
+
+lemma rd_flts_set:
+  shows "rs1 \<leadsto>f* rs2 \<Longrightarrow> rdistinct rs1 (insert RZERO (rset \<union> (\<Union>(alt_set ` rset)))) \<leadsto>f*  
+                          rdistinct rs2 (rset \<union> (\<Union>(alt_set ` rset)))"
+  by (meson frewrite.intros(2) frewrites.simps)
+  
+lemma rd_flts_set2:
+  shows "rs1 \<leadsto>f* rs2 \<Longrightarrow> rdistinct rs1 ((rset \<union> (\<Union>(alt_set ` rset)))) \<leadsto>f*  
+                          rdistinct rs2 (rset \<union> (\<Union>(alt_set ` rset)))"
+  using frewrite.intros(2) by blast
+
+lemma rd_flts_incorrect:
+  shows "rs1 \<leadsto>f* rs2 \<Longrightarrow> rdistinct rs1 rset \<leadsto>f* rdistinct rs2 rset"
+  sledgehammer
+  by (smt (verit, ccfv_threshold) frewrites.simps)
+
+lemma flts_does_rewrite:
+  shows "rs1 \<leadsto>f* rs2 \<Longrightarrow> rflts rs1 = rflts rs2"
+  oops
+
 lemma rflts_fltsder_derflts:
   shows "rflts (map rsimp (rdistinct (map (rder x3) (rflts rs)) rset)) = 
          rflts (map rsimp (rdistinct (rflts (map (rder x3) rs)) rset))"
@@ -501,13 +612,15 @@
 
 
 lemma simp_der_flts:
-  shows "rsimp (RALTS (rdistinct (map (rder x) (rflts rs)) rset))= 
-         rsimp (RALTS (rdistinct (rflts (map (rder x) rs)) rset))"
+  shows "rsimp (RALTS (rdistinct (map (rder x) (rflts rs)) {})) = 
+         rsimp (RALTS (rdistinct (rflts (map (rder x) rs)) {}))"
 
-  apply(induct rs arbitrary: rset)
+  apply(induct rs)
    apply simp
   apply(case_tac a)
-       apply simp
+       apply simp+
+      prefer 2
+  apply simp
       apply(case_tac "RZERO \<in> rset")
        apply simp+
   using distinct_flts_no0 apply presburger
@@ -522,8 +635,13 @@
 
 
 lemma simp_der_pierce_flts:
-  shows " rsimp (rsimp_ALTs (rdistinct (map (rder x) (rflts (map (rsimp \<circ> (\<lambda>r. rders_simp r xs)) rs))) {})) =
-           rsimp (rsimp_ALTs (rdistinct (rflts (map (rder x) (map (rsimp \<circ> (\<lambda>r. rders_simp r xs)) rs))) {}))"
+  shows " rsimp (
+rsimp_ALTs (rdistinct (map (rder x) (rflts (map (rsimp \<circ> (\<lambda>r. rders_simp r xs)) rs))) {})
+) =
+          rsimp (
+rsimp_ALTs (rdistinct (rflts (map (rder x) (map (rsimp \<circ> (\<lambda>r. rders_simp r xs)) rs))) {})
+)"
+  
   sorry