# HG changeset patch # User Chengsong # Date 1648425582 -3600 # Node ID 23818853a710f588c5dd8d7beeaed08fb82a30f6 # Parent 7a8cef3f5234a4183a0d7440ead9dc2c902a5260 thesis diff -r 7a8cef3f5234 -r 23818853a710 ChengsongTanPhdThesis/Chapters/Chapter1.tex --- a/ChengsongTanPhdThesis/Chapters/Chapter1.tex Sat Mar 26 11:24:36 2022 +0000 +++ b/ChengsongTanPhdThesis/Chapters/Chapter1.tex Mon Mar 28 00:59:42 2022 +0100 @@ -45,10 +45,10 @@ %---------------------------------------------------------------------------------------- %This part is about regular expressions, Brzozowski derivatives, %and a bit-coded lexing algorithm with proven correctness and time bounds. -Regular expressions are widely used in modern day programming tasks. -Be it IDE with syntax hightlighting and auto completion, +Regular expressions are widely used in computer science: +be it in IDEs with syntax hightlighting and auto completion, command line tools like $\mathit{grep}$ that facilitates easy -processing of text by search and replace, network intrusion +text processing , network intrusion detection systems that rejects suspicious traffic, or compiler front ends--there is always an important phase of the task that involves matching a regular @@ -197,38 +197,213 @@ \section{Why are current algorithm for regexes slow?} -Theoretical results say that regular expression matching -should be linear with respect to the input. -You could construct -an NFA via Thompson construction, and simulate running it. -This would be linear. -Or you could determinize the NFA into a DFA, and minimize that -DFA. Once you have the DFA, the running time is also linear, requiring just -one scanning pass of the input. +%find literature/find out for yourself that REGEX->DFA on basic regexes +%does not blow up the size +Shouldn't regular expression matching be linear? +How can one explain the super-linear behaviour of the +regex matching engines we have? +The time cost of regex matching algorithms in general +involve two phases: +the construction phase, in which the algorithm builds some +suitable data structure from the input regex $r$, we denote +the time cost by $P_1(r)$. +The lexing +phase, when the input string $s$ is read and the data structure +representing that regex $r$ is being operated on. We represent the time +it takes by $P_2(r, s)$.\\ +In the case of a $\mathit{DFA}$, +we have $P_2(r, s) = O( |s| )$, +because we take at most $|s|$ steps, +and each step takes +at most one transition-- +a deterministic-finite-automata +by definition has at most one state active and at most one +transition upon receiving an input symbol. +But unfortunately in the worst case +$P_1(r) = O(exp^{|r|})$. An example will be given later. \\ +For $\mathit{NFA}$s, we have $P_1(r) = O(|r|)$ if we do not unfold +expressions like $r^n$ into $\underbrace{r \cdots r}_{\text{n copies of r}}$. +The $P_2(r, s)$ is bounded by $|r|\cdot|s|$, if we do not backtrack. +On the other hand, if backtracking is used, the worst-case time bound bloats +to $|r| * 2^|s|$ . +%on the input +%And when calculating the time complexity of the matching algorithm, +%we are assuming that each input reading step requires constant time. +%which translates to that the number of +%states active and transitions taken each time is bounded by a +%constant $C$. +%But modern regex libraries in popular language engines +% often want to support much richer constructs than just +% sequences and Kleene stars, +%such as negation, intersection, +%bounded repetitions and back-references. +%And de-sugaring these "extended" regular expressions +%into basic ones might bloat the size exponentially. +%TODO: more reference for exponential size blowup on desugaring. +\subsection{Tools that uses $\mathit{DFA}$s} +%TODO:more tools that use DFAs? +$\mathit{LEX}$ and $\mathit{JFLEX}$ are tools +in $C$ and $\mathit{JAVA}$ that generates $\mathit{DFA}$-based +lexers. The user provides a set of regular expressions +and configurations to such lexer generators, and then +gets an output program encoding a minimized $\mathit{DFA}$ +that can be compiled and run. +The good things about $\mathit{DFA}$s is that once +generated, they are fast and stable, unlike +backtracking algorithms. +However, they do not scale well with bounded repetitions.\\ -But modern regex libraries in popular language engines - often want to support richer constructs -such as bounded repetitions and back-references -rather than the most basic regular expressions. -%put in illustrations -%a{1}{3} -These make a DFA construction impossible because -of an exponential states explosion. - They also want to support lexing rather than just matching - for tasks that involves text processing. - - Existing regex libraries either pose restrictions on the user input, or does - not give linear running time guarantee. +Bounded repetitions, usually written in the form +$r^{\{c\}}$ (where $c$ is a constant natural number), +denotes a regular expression accepting strings +that can be divided into $c$ substrings, where each +substring is in $r$. +For the regular expression $(a|b)^*a(a|b)^{\{2\}}$, +an $\mathit{NFA}$ describing it would look like: +\begin{center} +\begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto] + \node[state,initial] (q_0) {$q_0$}; + \node[state, red] (q_1) [right=of q_0] {$q_1$}; + \node[state, red] (q_2) [right=of q_1] {$q_2$}; + \node[state, accepting, red](q_3) [right=of q_2] {$q_3$}; + \path[->] + (q_0) edge node {a} (q_1) + edge [loop below] node {a,b} () + (q_1) edge node {a,b} (q_2) + (q_2) edge node {a,b} (q_3); +\end{tikzpicture} +\end{center} +The red states are "countdown states" which counts down +the number of characters needed in addition to the current +string to make a successful match. +For example, state $q_1$ indicates a match that has +gone past the $(a|b)^*$ part of $(a|b)^*a(a|b)^{\{2\}}$, +and just consumed the "delimiter" $a$ in the middle, and +need to match 2 more iterations of $(a|b)$ to complete. +State $q_2$ on the other hand, can be viewed as a state +after $q_1$ has consumed 1 character, and just waits +for 1 more character to complete. +$q_3$ is the last state, requiring 0 more character and is accepting. +Depending on the suffix of the +input string up to the current read location, +the states $q_1$ and $q_2$, $q_3$ +may or may +not be active, independent from each other. +A $\mathit{DFA}$ for such an $\mathit{NFA}$ would +contain at least $2^3$ non-equivalent states that cannot be merged, +because the subset construction during determinisation will generate +all the elements in the power set $\mathit{Pow}\{q_1, q_2, q_3\}$. +Generalizing this to regular expressions with larger +bounded repetitions number, we have that +regexes shaped like $r^*ar^{\{n\}}$ when converted to $\mathit{DFA}$s +would require at least $2^{n+1}$ states, if $r$ contains +more than 1 string. +This is to represent all different +scenarios which "countdown" states are active. +For those regexes, tools such as $\mathit{JFLEX}$ +would generate gigantic $\mathit{DFA}$'s or +out of memory errors. +For this reason, regex libraries that support +bounded repetitions often choose to use the $\mathit{NFA}$ +approach. +\subsection{The $\mathit{NFA}$ approach to regex matching} +One can simulate the $\mathit{NFA}$ running in two ways: +one by keeping track of all active states after consuming +a character, and update that set of states iteratively. +This can be viewed as a breadth-first-search of the $\mathit{NFA}$ +for a path terminating +at an accepting state. +Languages like $\mathit{Go}$ and $\mathit{Rust}$ use this +type of $\mathit{NFA}$ simulation, and guarantees a linear runtime +in terms of input string length. +%TODO:try out these lexers +The other way to use $\mathit{NFA}$ for matching is choosing +a single transition each time, keeping all the other options in +a queue or stack, and backtracking if that choice eventually +fails. This method, often called a "depth-first-search", +is efficient in a lot of cases, but could end up +with exponential run time.\\ +%TODO:COMPARE java python lexer speed with Rust and Go +The reason behind backtracking algorithms in languages like +Java and Python is that they support back-references. +\subsection{Back References in Regex--Non-Regular part} +If we have a regular expression like this (the sequence +operator is omitted for brevity): +\begin{center} + $r_1(r_2(r_3r_4))$ +\end{center} +We could label sub-expressions of interest +by parenthesizing them and giving +them a number by the order in which their opening parentheses appear. +One possible way of parenthesizing and labelling is given below: +\begin{center} + $\underset{1}{(}r_1\underset{2}{(}r_2\underset{3}{(}r_3)\underset{4}{(}r_4)))$ +\end{center} +$r_1r_2r_3r_4$, $r_1r_2r_3$, $r_3$, $r_4$ are labelled +by 1 to 4. $1$ would refer to the entire expression +$(r_1(r_2(r_3)(r_4)))$, $2$ referring to $r_2(r_3)(r_4)$, etc. +These sub-expressions are called "capturing groups". +We can use the following syntax to denote that we want a string just matched by a +sub-expression (capturing group) to appear at a certain location again, +exactly as it was: +\begin{center} +$\ldots\underset{\text{i-th lparen}}{(}{r_i})\ldots +\underset{s_i \text{ which just matched} \;r_i}{\backslash i}$ +\end{center} +The backslash and number $i$ are used to denote such +so-called "back-references". +Let $e$ be an expression made of regular expressions +and back-references. $e$ contains the expression $e_i$ +as its $i$-th capturing group. +The semantics of back-reference can be recursively +written as: +\begin{center} + \begin{tabular}{c} + $L ( e \cdot \backslash i) = \{s @ s_i \mid s \in L (e)\quad s_i \in L(r_i)$\\ + $s_i\; \text{match of ($e$, $s$)'s $i$-th capturing group string}\}$ + \end{tabular} +\end{center} +The concrete example +$((a|b|c|\ldots|z)^*)\backslash 1$ +would match the string like $\mathit{bobo}$, $\mathit{weewee}$ and etc.\\ +Back-reference is a construct in the "regex" standard +that programmers found quite useful, but not exactly +regular any more. +In fact, that allows the regex construct to express +languages that cannot be contained in context-free +languages either. +For example, the back-reference $((a^*)b\backslash1 b \backslash 1$ +expresses the language $\{a^n b a^n b a^n\mid n \in \mathbb{N}\}$, +which cannot be expressed by context-free grammars\parencite{campeanu2003formal}. +Such a language is contained in the context-sensitive hierarchy +of formal languages. +Solving the back-reference expressions matching problem +is NP-complete\parencite{alfred2014algorithms} and a non-bactracking, +efficient solution is not known to exist. +%TODO:read a bit more about back reference algorithms +It seems that languages like Java and Python made the trade-off +to support back-references at the expense of having to backtrack, +even in the case of regexes not involving back-references.\\ +Summing these up, we can categorise existing +practical regex libraries into the ones with linear +time guarantees like Go and Rust, which impose restrictions +on the user input (not allowing back-references, +bounded repetitions canno exceed 1000 etc.), and ones + that allows the programmer much freedom, but grinds to a halt + in some non-negligible portion of cases. %TODO: give examples such as RE2 GOLANG 1000 restriction, rust no repetitions - For example, the Rust regex engine claims to be linear, - but does not support lookarounds and back-references. - The GoLang regex library does not support over 1000 repetitions. - Java and Python both support back-references, but shows -catastrophic backtracking behaviours on inputs without back-references( -when the language is still regular). +% For example, the Rust regex engine claims to be linear, +% but does not support lookarounds and back-references. +% The GoLang regex library does not support over 1000 repetitions. +% Java and Python both support back-references, but shows +%catastrophic backtracking behaviours on inputs without back-references( +%when the language is still regular). %TODO: test performance of Rust on (((((a*a*)b*)b){20})*)c baabaabababaabaaaaaaaaababaaaababababaaaabaaabaaaaaabaabaabababaababaaaaaaaaababaaaababababaaaaaaaaaaaaac %TODO: verify the fact Rust does not allow 1000+ reps %TODO: Java 17 updated graphs? Is it ok to still use Java 8 graphs? +\section{Buggy Regex Engines} + + Another thing about the these libraries is that there is no correctness guarantee. In some cases they either fails to generate a lexing result when there is a match, @@ -239,9 +414,11 @@ exponential backtracking problems, but also undesired (or even buggy) outputs. %TODO: comment from who -xxx commented that most regex libraries are not +Kuklewicz\parencite{KuklewiczHaskell} commented that most regex libraries are not correctly implementing the POSIX (maximum-munch) rule of regular expression matching. +This experience is echoed by the writer's +tryout of a few online regex testers: A concrete example would be the regex \begin{verbatim} @@ -264,125 +441,86 @@ This regex would trigger catastrophic backtracking in languages like Python and Java, -whereas it gives a correct but uninformative (non-POSIX) +whereas it gives a non-POSIX and uninformative match in languages like Go or .NET--The match with only character $c$. -Backtracking or depth-first search might give us -exponential running time, and quite a few tools -avoid that by determinising the $\mathit{NFA}$ -into a $\mathit{DFA}$ and minimizes it. -For example, $\mathit{LEX}$ and $\mathit{JFLEX}$ are tools -in $C$ and $\mathit{JAVA}$ that generates $\mathit{DFA}$-based -lexers. -However, they do not scale well with bounded repetitions. -Bounded repetitions, usually written in the form -$r^{\{c\}}$ (where $c$ is a constant natural number), -denotes a regular expression accepting strings -that can be divided into $c$ substrings, and each -substring is in $r$. -%TODO: draw example NFA. -For the regular expression $(a|b)^*a(a|b)^{\{2\}}$, -an $\mathit{NFA}$ describing it would look like: +As Grathwohl\parencite{grathwohl2014crash} commented, \begin{center} -\begin{tikzpicture}[shorten >=1pt,node distance=2cm,on grid,auto] - \node[state,initial] (q_0) {$q_0$}; - \node[state, red] (q_1) [right=of q_0] {$q_1$}; - \node[state, red] (q_2) [right=of q_1] {$q_2$}; - \node[state,accepting](q_3) [right=of q_2] {$q_3$}; - \path[->] - (q_0) edge node {a} (q_1) - edge [loop below] node {a,b} () - (q_1) edge node {a,b} (q_2) - edge [loop above] node {0} () - (q_2) edge node {a,b} (q_3); -\end{tikzpicture} + ``The POSIX strategy is more complicated than the greedy because of the dependence on information about the length of matched strings in the various subexpressions.'' \end{center} -The red states are "counter states" which counts down -the number of characters needed in addition to the current -string to make a successful match. -For example, state $q_1$ indicates a match that has -gone past the $(a|b)$ part of $(a|b)^*a(a|b)^{\{2\}}$, -and just consumed the "delimiter" $a$ in the middle, and -need to match 2 more iterations of $a|b$ to complete. -State $q_2$ on the other hand, can be viewed as a state -after $q_1$ has consumed 1 character, and just waits -for 1 more character to complete. -Depending on the actual characters appearing in the -input string, the states $q_1$ and $q_2$ may or may -not be active, independent from each other. -A $\mathit{DFA}$ for such an $\mathit{NFA}$ would -contain at least 4 non-equivalent states that cannot be merged, -because subset states indicating which of $q_1$ and $q_2$ -are active are at least four: $\phi$, $\{q_1\}$, -$\{q_2\}$, $\{q_1, q_2\}$. -Generalizing this to regular expressions with larger -bounded repetitions number, we have $r^*ar^{\{n\}}$ -in general would require at least $2^n$ states -in a $\mathit{DFA}$. This is to represent all different -configurations of "countdown" states. -For those regexes, tools such as $\mathit{JFLEX}$ -would generate gigantic $\mathit{DFA}$'s or even -give out memory errors. -For this reason, regex libraries that support -bounded repetitions often choose to use the $\mathit{NFA}$ -approach. -One can simulate the $\mathit{NFA}$ running in two ways: -one by keeping track of all active states after consuming -a character, and update that set of states iteratively. -This is a breadth-first-search of the $\mathit{NFA}$. -for a path terminating -at an accepting state. -Languages like $\mathit{GO}$ and $\mathit{RUST}$ use this -type of $\mathit{NFA}$ simulation, and guarantees a linear runtime -in terms of input string length. -The other way to use $\mathit{NFA}$ for matching is to take -a single state in a path each time, and backtrack if that path -fails. This is a depth-first-search that could end up -with exponential run time. -The reason behind backtracking algorithms in languages like -Java and Python is that they support back-references. -\subsection{Back References in Regex--Non-Regular part} -If we label sub-expressions by parenthesizing them and give -them a number by the order their opening parenthesis appear, -$\underset{1}{(}\ldots\underset{2}{(}\ldots\underset{3}{(}\ldots\underset{4}{(}\ldots)\ldots)\ldots)\ldots)$ -We can use the following syntax to denote that we want a string just matched by a -sub-expression to appear at a certain location again exactly: -$(.*)\backslash 1$ -would match the string like $\mathit{bobo}$, $\mathit{weewee}$ and etc. - -Back-reference is a construct in the "regex" standard -that programmers found quite useful, but not exactly -regular any more. -In fact, that allows the regex construct to express -languages that cannot be contained in context-free -languages -For example, the back reference $(a^*)\backslash 1 \backslash 1$ -expresses the language $\{a^na^na^n\mid n \in N\}$, -which cannot be expressed by context-free grammars. -To express such a language one would need context-sensitive -grammars, and matching/parsing for such grammars is NP-hard in -general. -%TODO:cite reference for NP-hardness of CSG matching -For such constructs, the most intuitive way of matching is -using backtracking algorithms, and there are no known algorithms -that does not backtrack. -%TODO:read a bit more about back reference algorithms +\section{Engineering and Academic Approaches to Deal with Catastrophic Backtracking} - - -\section{Our Solution--Brzozowski Derivatives} - - - +There is also static analysis work on regular expression that +have potential expoential behavious. Rathnayake and Thielecke +\parencite{Rathnayake2014StaticAF} proposed an algorithm +that detects regular expressions triggering exponential +behavious on backtracking matchers. +People also developed static analysis methods for +generating non-linear polynomial worst-time estimates +for regexes, attack string that exploit the worst-time +scenario, and "attack automata" that generates +attack strings \parencite{Weideman2017Static}. +There are also tools to "debug" regular expressions +that allows people to see why a match failed or was especially slow +by showing the steps a back-tracking regex engine took\parencite{regexploit2021}. +%TODO:also the regex101 debugger +\section{Our Solution--Formal Specification of POSIX and Brzozowski Derivatives} Is it possible to have a regex lexing algorithm with proven correctness and time complexity, which allows easy extensions to constructs like bounded repetitions, negation, lookarounds, and even back-references? +Building on top of Sulzmann and Lu's attempt to formalize the +notion of POSIX lexing rules \parencite{Sulzmann2014}, +Ausaf and Urban\parencite{AusafDyckhoffUrban2016} modelled +POSIX matching as a ternary relation recursively defined in a +natural deduction style. +With the formally-specified rules for what a POSIX matching is, +they designed a regex matching algorithm based on Brzozowski derivatives, and +proved in Isabelle/HOL that the algorithm gives correct results. + - We propose Brzozowski's derivatives as a solution to this problem. + + + +%---------------------------------------------------------------------------------------- + +\section{Our Approach} +In the last fifteen or so years, Brzozowski's derivatives of regular +expressions have sparked quite a bit of interest in the functional +programming and theorem prover communities. The beauty of +Brzozowski's derivatives \parencite{Brzozowski1964} is that they are neatly +expressible in any functional language, and easily definable and +reasoned about in theorem provers---the definitions just consist of +inductive datatypes and simple recursive functions. Derivatives of a +regular expression, written $r \backslash c$, give a simple solution +to the problem of matching a string $s$ with a regular +expression $r$: if the derivative of $r$ w.r.t.\ (in +succession) all the characters of the string matches the empty string, +then $r$ matches $s$ (and {\em vice versa}). + + +This work aims to address the above vulnerability by the combination +of Brzozowski's derivatives and interactive theorem proving. We give an +improved version of Sulzmann and Lu's bit-coded algorithm using +derivatives, which come with a formal guarantee in terms of correctness and +running time as an Isabelle/HOL proof. +Then we improve the algorithm with an even stronger version of +simplification, and prove a time bound linear to input and +cubic to regular expression size using a technique by +Antimirov. + +\subsection{Existing Work} +We are aware +of a mechanised correctness proof of Brzozowski's derivative-based matcher in HOL4 by +Owens and Slind~\parencite{Owens2008}. Another one in Isabelle/HOL is part +of the work by Krauss and Nipkow \parencite{Krauss2011}. And another one +in Coq is given by Coquand and Siles \parencite{Coquand2012}. +Also Ribeiro and Du Bois give one in Agda \parencite{RibeiroAgda2017}. + + %We propose Brzozowski's derivatives as a solution to this problem. The main contribution of this thesis is a proven correct lexing algorithm with formalized time bounds. @@ -1314,69 +1452,6 @@ %---------------------------------------------------------------------------------------- -\section{Engineering and Academic Approaches to Deal with Catastrophic Backtracking} - -The reason behind is that regex libraries in popular language engines - often want to support richer constructs -than the most basic regular expressions, and lexing rather than matching -is needed for sub-match extraction. - -There is also static analysis work on regular expression that -have potential expoential behavious. Rathnayake and Thielecke -\parencite{Rathnayake2014StaticAF} proposed an algorithm -that detects regular expressions triggering exponential -behavious on backtracking matchers. -People also developed static analysis methods for -generating non-linear polynomial worst-time estimates -for regexes, attack string that exploit the worst-time -scenario, and "attack automata" that generates -attack strings. -For a comprehensive analysis, please refer to Weideman's thesis -\parencite{Weideman2017Static}. - -\subsection{DFA Approach} - -Exponential states. - -\subsection{NFA Approach} -Backtracking. - - - -%---------------------------------------------------------------------------------------- - -\section{Our Approach} -In the last fifteen or so years, Brzozowski's derivatives of regular -expressions have sparked quite a bit of interest in the functional -programming and theorem prover communities. The beauty of -Brzozowski's derivatives \parencite{Brzozowski1964} is that they are neatly -expressible in any functional language, and easily definable and -reasoned about in theorem provers---the definitions just consist of -inductive datatypes and simple recursive functions. Derivatives of a -regular expression, written $r \backslash c$, give a simple solution -to the problem of matching a string $s$ with a regular -expression $r$: if the derivative of $r$ w.r.t.\ (in -succession) all the characters of the string matches the empty string, -then $r$ matches $s$ (and {\em vice versa}). - - -This work aims to address the above vulnerability by the combination -of Brzozowski's derivatives and interactive theorem proving. We give an -improved version of Sulzmann and Lu's bit-coded algorithm using -derivatives, which come with a formal guarantee in terms of correctness and -running time as an Isabelle/HOL proof. -Then we improve the algorithm with an even stronger version of -simplification, and prove a time bound linear to input and -cubic to regular expression size using a technique by -Antimirov. - -\subsection{Existing Work} -We are aware -of a mechanised correctness proof of Brzozowski's derivative-based matcher in HOL4 by -Owens and Slind~\parencite{Owens2008}. Another one in Isabelle/HOL is part -of the work by Krauss and Nipkow \parencite{Krauss2011}. And another one -in Coq is given by Coquand and Siles \parencite{Coquand2012}. -Also Ribeiro and Du Bois give one in Agda \parencite{RibeiroAgda2017}. %---------------------------------------------------------------------------------------- diff -r 7a8cef3f5234 -r 23818853a710 ChengsongTanPhdThesis/example.bib --- a/ChengsongTanPhdThesis/example.bib Sat Mar 26 11:24:36 2022 +0000 +++ b/ChengsongTanPhdThesis/example.bib Mon Mar 28 00:59:42 2022 +0100 @@ -57,6 +57,44 @@ journal = {arXiv:1405.7058}, title = {Static Analysis for Regular Expression Exponential Runtime via Substructural Logics}, year = {2017}} +@article{campeanu2003formal, + title={A formal study of practical regular expressions}, + author={C{\^a}mpeanu, Cezar and Salomaa, Kai and Yu, Sheng}, + journal={International Journal of Foundations of Computer Science}, + volume={14}, + number={06}, + pages={1007--1018}, + year={2003}, + publisher={World Scientific} +} + +@article{alfred2014algorithms, + title={Algorithms for finding patterns in strings}, + author={Alfred, V}, + journal={Algorithms and Complexity}, + volume={1}, + pages={255}, + year={2014}, + publisher={Elsevier} +} + + +@article{CAMPEANU2009Intersect, +title = {On the intersection of regex languages with regular languages}, +journal = {Theoretical Computer Science}, +volume = {410}, +number = {24}, +pages = {2336-2344}, +year = {2009}, +note = {Formal Languages and Applications: A Collection of Papers in Honor of Sheng Yu}, +issn = {0304-3975}, +doi = {https://doi.org/10.1016/j.tcs.2009.02.022}, +url = {https://www.sciencedirect.com/science/article/pii/S0304397509001789}, +author = {Cezar Câmpeanu and Nicolae Santean}, +keywords = {Extended regular expression, Regex automata system, Regex}, +abstract = {In this paper we revisit the semantics of extended regular expressions (regex), defined succinctly in the 90s [A.V. Aho, Algorithms for finding patterns in strings, in: Jan van Leeuwen (Ed.), Handbook of Theoretical Computer Science, in: Algorithms and Complexity, vol. A, Elsevier and MIT Press, 1990, pp. 255–300] and rigorously in 2003 by Câmpeanu, Salomaa and Yu [C. Câmpeanu, K. Salomaa, S. Yu, A formal study of practical regular expressions, IJFCS 14 (6) (2003) 1007–1018], when the authors reported an open problem, namely whether regex languages are closed under the intersection with regular languages. We give a positive answer; and for doing so, we propose a new class of machines — regex automata systems (RAS) — which are equivalent to regex. Among others, these machines provide a consistent and convenient method of implementing regex in practice. We also prove, as a consequence of this closure property, that several languages, such as the mirror language, the language of palindromes, and the language of balanced words are not regex languages.} +} + @article{nielson11bcre, author = {Lasse Nielsen, Fritz Henglein}, @@ -77,6 +115,22 @@ year = {2021} } +@misc{KuklewiczHaskell, + title = {Regex Posix}, + author = {Kuklewicz}, + keywords = {Buggy C POSIX Lexing Libraries}, + url = {https://wiki.haskell.org/Regex_Posix}, + year = {2017} +} + + +@techreport{grathwohl2014crash, + title={A Crash-Course in Regular Expression Parsing and Regular Expressions as Types}, + author={Grathwohl, Niels Bj{\o}rn Bugge and Henglein, Fritz and Rasmussen, Ulrik Terp}, + year={2014}, + institution={Technical report, University of Copenhagen} +} + @misc{SE16, author = {StackStatus}, date-added = {2019-06-26 11:28:41 +0000}, diff -r 7a8cef3f5234 -r 23818853a710 ChengsongTanPhdThesis/main.tex --- a/ChengsongTanPhdThesis/main.tex Sat Mar 26 11:24:36 2022 +0000 +++ b/ChengsongTanPhdThesis/main.tex Mon Mar 28 00:59:42 2022 +0100 @@ -35,7 +35,7 @@ %nohyperref, % Uncomment to not load the hyperref package headsepline, % Uncomment to get a line under the header %chapterinoneline, % Uncomment to place the chapter title next to the number on one line -%consistentlayout, % Uncomment to change the layout of the declaration, abstract and acknowledgements pages to match the default layout +consistentlayout, % Uncomment to change the layout of the declaration, abstract and acknowledgements pages to match the default layout ]{MastersDoctoralThesis} % The class file specifying the document structure \usepackage[utf8]{inputenc} % Required for inputting international characters @@ -91,7 +91,7 @@ \keywords{} % Keywords for your thesis, this is not currently used anywhere in the template, print it elsewhere with \keywordnames \university{\href{https://www.kcl.ac.uk}{King's College London}} % Your university's name and URL, this is used in the title page and abstract, print it elsewhere with \univname \department{\href{https://www.kcl.ac.uk/informatics}{Department or Informatics}} % Your department's name and URL, this is used in the title page and abstract, print it elsewhere with \deptname -\group{\href{https://www.kcl.ac.uk/research/ssy}{Software Systems}} % Your research group's name and URL, this is used in the title page, print it elsewhere with \groupname +\group{\href{https://www.kcl.ac.uk/research/ssy}{Software Systems Group}} % Your research group's name and URL, this is used in the title page, print it elsewhere with \groupname \faculty{\href{http://faculty.university.com}{Chengsong Tan}} % Your faculty's name and URL, this is used in the title page and abstract, print it elsewhere with \facname \AtBeginDocument{ @@ -193,11 +193,7 @@ \begin{abstract} \addchaptertocentry{\abstractname} % Add the abstract to the table of contents -This work is a combination of functional algorithms -and formal methods. -Regular expression matching and lexing has been - widely-used and well-implemented -in software industry. +This work is about regular expressions and derivatives. It combines functional algorithms and their formal verification in the Isabelle/HOL theorem prover. Theoretical results say that regular expression matching should be linear with respect to the input. diff -r 7a8cef3f5234 -r 23818853a710 thys2/ClosedForms.thy --- a/thys2/ClosedForms.thy Sat Mar 26 11:24:36 2022 +0000 +++ b/thys2/ClosedForms.thy Mon Mar 28 00:59:42 2022 +0100 @@ -494,6 +494,117 @@ + +inductive frewrite:: "rrexp list \ rrexp list \ bool" ("_ \f _" [10, 10] 10) + where + "(RZERO # rs) \f rs" +| "((RALTS rs) # rsa) \f (rs @ rsa)" +| "rs1 \f rs2 \ (r # rs1) \f (r # rs2)" + + +inductive + frewrites:: "rrexp list \ rrexp list \ bool" ("_ \f* _" [10, 10] 10) +where + rs1[intro, simp]:"rs \f* rs" +| rs2[intro]: "\rs1 \f* rs2; rs2 \f rs3\ \ rs1 \f* rs3" + +lemma fr_in_rstar : "r1 \f r2 \ r1 \f* r2" + using frewrites.intros(1) frewrites.intros(2) by blast + +lemma freal_trans[trans]: + assumes a1: "r1 \f* r2" and a2: "r2 \f* r3" + shows "r1 \f* r3" + using a2 a1 + apply(induct r2 r3 arbitrary: r1 rule: frewrites.induct) + apply(auto) + done + + +lemma many_steps_later: "\r1 \f r2; r2 \f* r3 \ \ r1 \f* r3" + by (meson fr_in_rstar freal_trans) + + +lemma frewrite_append: + shows "\ rsa \f rsb \ \ rs @ rsa \f rs @ rsb" + apply(induct rs) + apply simp+ + oops + + +lemma frewrites_cons: + shows "\ rsa \f* rsb \ \ r # rsa \f* r # rsb" + + oops + +lemma frewrites_append: + shows " \rsa \f* rsb\ \ (rs @ rsa) \f* (rs @ rsb)" + apply(induct rsa rsb rule: frewrites.induct) + apply simp + oops + + + +lemma frewrites_concat: + shows "\rs1 \f rs2; rsa \f* rsb \ \ (rs1 @ rsa) \f* (rs2 @ rsb)" + apply(induct rs1 rs2 rule: frewrite.induct) + apply(simp) + apply(subgoal_tac "(RZERO # rs @ rsa) \f (rs @ rsa)") + prefer 2 + using frewrite.intros(1) apply blast + apply(subgoal_tac "(rs @ rsa) \f* (rs @ rsb)") + using many_steps_later apply blast + + + + + + + +lemma many_steps_later1: + shows " \rs1 \f* rs2\ + \ (RONE # rs1) \f* (RONE # rs2)" + oops + +lemma early_late_der_frewrites: + shows "map (rder x) (rflts rs) \f* rflts (map (rder x) rs)" + apply(induct rs) + apply simp + apply(case_tac a) + apply simp+ + using frewrite.intros(1) many_steps_later apply blast + apply(case_tac "x = x3") + apply simp + + + + + +fun alt_set:: "rrexp \ rrexp set" + where + "alt_set (RALTS rs) = set rs" +| "alt_set r = {r}" + + + +lemma rd_flts_set: + shows "rs1 \f* rs2 \ rdistinct rs1 (insert RZERO (rset \ (\(alt_set ` rset)))) \f* + rdistinct rs2 (rset \ (\(alt_set ` rset)))" + by (meson frewrite.intros(2) frewrites.simps) + +lemma rd_flts_set2: + shows "rs1 \f* rs2 \ rdistinct rs1 ((rset \ (\(alt_set ` rset)))) \f* + rdistinct rs2 (rset \ (\(alt_set ` rset)))" + using frewrite.intros(2) by blast + +lemma rd_flts_incorrect: + shows "rs1 \f* rs2 \ rdistinct rs1 rset \f* rdistinct rs2 rset" + sledgehammer + by (smt (verit, ccfv_threshold) frewrites.simps) + +lemma flts_does_rewrite: + shows "rs1 \f* rs2 \ rflts rs1 = rflts rs2" + oops + lemma rflts_fltsder_derflts: shows "rflts (map rsimp (rdistinct (map (rder x3) (rflts rs)) rset)) = rflts (map rsimp (rdistinct (rflts (map (rder x3) rs)) rset))" @@ -501,13 +612,15 @@ lemma simp_der_flts: - shows "rsimp (RALTS (rdistinct (map (rder x) (rflts rs)) rset))= - rsimp (RALTS (rdistinct (rflts (map (rder x) rs)) rset))" + shows "rsimp (RALTS (rdistinct (map (rder x) (rflts rs)) {})) = + rsimp (RALTS (rdistinct (rflts (map (rder x) rs)) {}))" - apply(induct rs arbitrary: rset) + apply(induct rs) apply simp apply(case_tac a) - apply simp + apply simp+ + prefer 2 + apply simp apply(case_tac "RZERO \ rset") apply simp+ using distinct_flts_no0 apply presburger @@ -522,8 +635,13 @@ lemma simp_der_pierce_flts: - shows " rsimp (rsimp_ALTs (rdistinct (map (rder x) (rflts (map (rsimp \ (\r. rders_simp r xs)) rs))) {})) = - rsimp (rsimp_ALTs (rdistinct (rflts (map (rder x) (map (rsimp \ (\r. rders_simp r xs)) rs))) {}))" + shows " rsimp ( +rsimp_ALTs (rdistinct (map (rder x) (rflts (map (rsimp \ (\r. rders_simp r xs)) rs))) {}) +) = + rsimp ( +rsimp_ALTs (rdistinct (rflts (map (rder x) (map (rsimp \ (\r. rders_simp r xs)) rs))) {}) +)" + sorry