# HG changeset patch # User Chengsong # Date 1653594700 -3600 # Node ID ff7945a988a3107f796e7fccfada80dc489816ba # Parent 3c5e58d08939c4e2b74f1a5d3863222be6a12938 more to thesis diff -r 3c5e58d08939 -r ff7945a988a3 ChengsongTanPhdThesis/Chapters/Chapter1.tex --- a/ChengsongTanPhdThesis/Chapters/Chapter1.tex Fri May 20 18:52:03 2022 +0100 +++ b/ChengsongTanPhdThesis/Chapters/Chapter1.tex Thu May 26 20:51:40 2022 +0100 @@ -41,7 +41,7 @@ \def\AONE{\textit{AONE}} \def\ACHAR{\textit{ACHAR}} - +\def\POSIX{\textit{POSIX}} \def\ALTS{\textit{ALTS}} \def\ASTAR{\textit{ASTAR}} \def\DFA{\textit{DFA}} @@ -51,7 +51,7 @@ \def\flex{\textit{flex}} \def\inj{\mathit{inj}} \def\Empty{\mathit{Empty}} -\def\Left{\mathit{Left}}xc +\def\Left{\mathit{Left}} \def\Right{\mathit{Right}} \def\Stars{\mathit{Stars}} \def\Char{\mathit{Char}} @@ -64,6 +64,7 @@ %\def\bderssimp{\mathit{bders}\_\mathit{simp}} \def\distinctWith{\textit{distinctWith}} +\def\size{\mathit{size}} \def\rexp{\mathbf{rexp}} \def\simp{\mathit{simp}} \def\simpALTs{\mathit{simp}\_\mathit{ALTs}} @@ -79,6 +80,7 @@ \newcommand\asize[1]{\llbracket #1 \rrbracket} \newcommand\rerase[1]{ (#1)\downarrow_r} + \def\erase{\textit{erase}} \def\STAR{\textit{STAR}} \def\flts{\textit{flts}} @@ -100,31 +102,24 @@ Regular expressions are widely used in computer science: -be it in text-editors\parencite{atomEditor} with syntax hightlighting and auto completion, -command line tools like $\mathit{grep}$ that facilitates easy -text processing , network intrusion -detection systems that rejects suspicious traffic, or compiler +be it in text-editors\parencite{atomEditor} with syntax highlighting and auto-completion, +command-line tools like $\mathit{grep}$ that facilitate easy +text processing, network intrusion +detection systems that reject suspicious traffic, or compiler front ends--the majority of the solutions to these tasks involve lexing with regular expressions. - Given its usefulness and ubiquity, one would imagine that modern regular expression matching implementations -are mature and fully-studied. - -If you go to a popular programming language's -regex engine, -you can supply it with regex and strings of your own, -and get matching/lexing information such as how a -sub-part of the regex matches a sub-part of the string. -These lexing libraries are on average quite fast. +are mature and fully studied. +Indeed, in a popular programming language' regex engine, +supplying it with regular expressions and strings, one can +get rich matching information in a very short time. +Some network intrusion detection systems +use regex engines that are able to process +megabytes or even gigabytes of data per second\parencite{Turo_ov__2020}. +Unfortunately, this is not the case for $\mathbf{all}$ inputs. %TODO: get source for SNORT/BRO's regex matching engine/speed -For example, the regex engines some network intrusion detection -systems use are able to process - megabytes or even gigabytes of network traffic per second. - -Why do we need to have our version, if the algorithms are -blindingly fast already? Well it turns out it is not always the case. Take $(a^*)^*\,b$ and ask whether @@ -132,12 +127,15 @@ expression. Obviously this is not the case---the expected $b$ in the last position is missing. One would expect that modern regular expression matching engines can find this out very quickly. Alas, if one tries -this example in JavaScript, Python or Java 8 with strings like 28 -$a$'s, one discovers that this decision takes around 30 seconds and -takes considerably longer when adding a few more $a$'s, as the graphs +this example in JavaScript, Python or Java 8, even with strings of a small +length, say around 30 $a$'s, one discovers that +this decision takes crazy time to finish given the simplicity of the problem. +This is clearly exponential behaviour, and +is triggered by some relatively simple regex patterns, as the graphs below show: -\begin{center} +\begin{figure} +\centering \begin{tabular}{@{}c@{\hspace{0mm}}c@{\hspace{0mm}}c@{}} \begin{tikzpicture} \begin{axis}[ @@ -204,11 +202,10 @@ \multicolumn{3}{c}{Graphs: Runtime for matching $(a^*)^*\,b$ with strings of the form $\underbrace{aa..a}_{n}$.} \end{tabular} -\end{center} +\caption{aStarStarb} \label{fig:aStarStarb} +\end{figure} -This is clearly exponential behaviour, and -is triggered by some relatively simple regex patterns. This superlinear blowup in matching algorithms sometimes cause @@ -228,17 +225,16 @@ respond so slowly that the load balancer assumed a $\mathit{DoS}$ attack and therefore stopped the servers from responding to any requests. This made the whole site become unavailable. -A more -recent example is a global outage of all Cloudflare servers on 2 July +A more recent example is a global outage of all Cloudflare servers on 2 July 2019. A poorly written regular expression exhibited exponential -behaviour and exhausted CPUs that serve HTTP traffic. Although the -outage had several causes, at the heart was a regular expression that +behaviour and exhausted CPUs that serve HTTP traffic. Although the outage +had several causes, at the heart was a regular expression that was used to monitor network traffic.\footnote{\url{https://blog.cloudflare.com/details-of-the-cloudflare-outage-on-july-2-2019/}} %TODO: data points for some new versions of languages These problems with regular expressions are not isolated events that happen -very occasionally, but actually quite widespread. +very occasionally, but actually widespread. They occur so often that they get a name--Regular-Expression-Denial-Of-Service (ReDoS) attack. @@ -250,7 +246,7 @@ requires more research attention. - \section{Why are current algorithm for regexes slow?} + \section{The Problem Behind Slow Cases} %find literature/find out for yourself that REGEX->DFA on basic regexes %does not blow up the size @@ -309,50 +305,6 @@ However, they do not scale well with bounded repetitions.\\ -\begin{center} -\begin{tabular}{@{}c@{\hspace{0mm}}c@{\hspace{0mm}}c@{}} -\begin{tikzpicture} -\begin{axis}[ - ymode=log, - xlabel={$n$}, - x label style={at={(1.05,-0.05)}}, - ylabel={time in secs}, - enlargelimits=false, - xtick={1,2,...,8}, - xmax=9, - ymax=16000000, - ytick={0,500,...,3500}, - scaled ticks=false, - axis lines=left, - width=5cm, - height=4cm, - legend entries={JavaScript}, - legend pos=north west, - legend cell align=left] -\addplot[red,mark=*, mark options={fill=white}] table {re-chengsong.data}; -\end{axis} -\end{tikzpicture} -\end{tabular} -\end{center} - -Another graph: -\begin{center} -\begin{tikzpicture} - \begin{axis}[ - height=0.5\textwidth, - width=\textwidth, - xlabel=number of a's, - xtick={0,...,9}, - ylabel=maximum size, - ymode=log, - log basis y={2} -] - \addplot[mark=*,blue] table {re-chengsong.data}; - \end{axis} -\end{tikzpicture} -\end{center} - - Bounded repetitions, usually written in the form @@ -469,7 +421,7 @@ $((a|b|c|\ldots|z)^*)\backslash 1$ would match the string like $\mathit{bobo}$, $\mathit{weewee}$ and etc.\\ Back-reference is a construct in the "regex" standard -that programmers found quite useful, but not exactly +that programmers found useful, but not exactly regular any more. In fact, that allows the regex construct to express languages that cannot be contained in context-free @@ -506,10 +458,10 @@ \section{Buggy Regex Engines} - Another thing about the these libraries is that there + Another thing about these libraries is that there is no correctness guarantee. - In some cases they either fails to generate a lexing result when there is a match, - or gives the wrong way of matching. + In some cases, they either fail to generate a lexing result when there exists a match, + or give the wrong way of matching. It turns out that regex libraries not only suffer from @@ -557,12 +509,12 @@ When a regular expression does not behave as intended, people usually try to rewrite the regex to some equivalent form -or they try to avoid the possibly problematic patterns completely\parencite{Davis18}, -of which there are many false positives. +or they try to avoid the possibly problematic patterns completely, +for which many false positives exist\parencite{Davis18}. Animated tools to "debug" regular expressions -are also quite popular, regexploit\parencite{regexploit2021}, regex101\parencite{regex101} +are also popular, regexploit\parencite{regexploit2021}, regex101\parencite{regex101} to name a few. -There is also static analysis work on regular expressions that +We are also aware of static analysis work on regular expressions that aims to detect potentially expoential regex patterns. Rathnayake and Thielecke \parencite{Rathnayake2014StaticAF} proposed an algorithm that detects regular expressions triggering exponential @@ -608,7 +560,7 @@ %---------------------------------------------------------------------------------------- -\section{Our Contribution} +\section{Contribution} @@ -627,8 +579,8 @@ The main contribution of this thesis is a proven correct lexing algorithm with formalized time bounds. -To our best knowledge, there is no lexing libraries using Brzozowski derivatives -that have a provable time guarantee, +To our best knowledge, no lexing libraries using Brzozowski derivatives +have a provable time guarantee, and claims about running time are usually speculative and backed by thin empirical evidence. %TODO: give references @@ -681,8 +633,8 @@ %TODO: FILL in the other defs \begin{center} \begin{tabular}{lcl} -$L \; r_1 + r_2$ & $\dn$ & $ L \; r_1 \cup L \; r_2$\\ -$L \; r_1 \cdot r_2$ & $\dn$ & $ L \; r_1 \cap L \; r_2$\\ +$L \; (r_1 + r_2)$ & $\dn$ & $ L \; (r_1) \cup L \; ( r_2)$\\ +$L \; (r_1 \cdot r_2)$ & $\dn$ & $ L \; (r_1) \cap L \; (r_2)$\\ \end{tabular} \end{center} Which are also called the "language interpretation". @@ -696,7 +648,7 @@ Formally, we define first such a transformation on any string set, which we call semantic derivative: \begin{center} -$\Der \; c\; \textit{StringSet} = \{s \mid c :: s \in StringSet\}$ +$\Der \; c\; \textit{A} = \{s \mid c :: s \in A\}$ \end{center} Mathematically, it can be expressed as the @@ -788,13 +740,13 @@ The most involved cases are the sequence and star case. The sequence case says that if the first regular expression -contains an empty string then second component of the sequence +contains an empty string then the second component of the sequence might be chosen as the target regular expression to be chopped off its head character. -The star regular expression unwraps the iteration of -regular expression and attack the star regular expression -to its back again to make sure there are 0 or more iterations -following this unfolded iteration. +The star regular expression's derivative unwraps the iteration of +regular expression and attaches the star regular expression +to the sequence's second element to make sure a copy is retained +for possible more iterations in later phases of lexing. The main property of the derivative operation @@ -844,10 +796,57 @@ Beautiful and simple definition. If we implement the above algorithm naively, however, -the algorithm can be excruciatingly slow. For example, when starting with the regular -expression $(a + aa)^*$ and building 12 successive derivatives +the algorithm can be excruciatingly slow. + + +\begin{figure} +\centering +\begin{tabular}{@{}c@{\hspace{0mm}}c@{\hspace{0mm}}c@{}} +\begin{tikzpicture} +\begin{axis}[ + xlabel={$n$}, + x label style={at={(1.05,-0.05)}}, + ylabel={time in secs}, + enlargelimits=false, + xtick={0,5,...,30}, + xmax=33, + ymax=10000, + ytick={0,1000,...,10000}, + scaled ticks=false, + axis lines=left, + width=5cm, + height=4cm, + legend entries={JavaScript}, + legend pos=north west, + legend cell align=left] +\addplot[red,mark=*, mark options={fill=white}] table {EightThousandNodes.data}; +\end{axis} +\end{tikzpicture}\\ +\multicolumn{3}{c}{Graphs: Runtime for matching $(a^*)^*\,b$ with strings + of the form $\underbrace{aa..a}_{n}$.} +\end{tabular} +\caption{EightThousandNodes} \label{fig:EightThousandNodes} +\end{figure} + + +(8000 node data to be added here) +For example, when starting with the regular +expression $(a + aa)^*$ and building a few successive derivatives (around 10) w.r.t.~the character $a$, one obtains a derivative regular expression -with more than 8000 nodes (when viewed as a tree). Operations like +with more than 8000 nodes (when viewed as a tree)\ref{EightThousandNodes}. +The reason why $(a + aa) ^*$ explodes so drastically is that without +pruning, the algorithm will keep records of all possible ways of matching: +\begin{center} +$(a + aa) ^* \backslash (aa) = (\ZERO + \ONE \ONE)\cdot(a + aa)^* + (\ONE + \ONE a) \cdot (a + aa)^*$ +\end{center} + +\noindent +Each of the above alternative branches correspond to the match +$aa $, $a \quad a$ and $a \quad a \cdot (a)$(incomplete). +These different ways of matching will grow exponentially with the string length, +and without simplifications that throw away some of these very similar matchings, +it is no surprise that these expressions grow so quickly. +Operations like $\backslash$ and $\nullable$ need to traverse such trees and consequently the bigger the size of the derivative the slower the algorithm. @@ -949,9 +948,9 @@ The number of different ways of matching without allowing any value under a star to be flattened to an empty string can be given by the following formula: -\begin{center} - $C_n = (n+1)+n C_1+\ldots + 2 C_{n-1}$ -\end{center} +\begin{equation} + C_n = (n+1)+n C_1+\ldots + 2 C_{n-1} +\end{equation} and a closed form formula can be calculated to be \begin{equation} C_n =\frac{(2+\sqrt{2})^n - (2-\sqrt{2})^n}{4\sqrt{2}} @@ -962,12 +961,26 @@ worst case runtime. Therefore it is impractical to try to generate all possible matches in a run. In practice, we are usually interested about POSIX values, which by intuition always -match the leftmost regular expression when there is a choice -and always match a sub part as much as possible before proceeding -to the next token. For example, the above example has the POSIX value +\begin{itemize} +\item +match the leftmost regular expression when multiple options of matching +are available +\item +always match a subpart as much as possible before proceeding +to the next token. +\end{itemize} + + + For example, the above example has the POSIX value $ \Stars\,[\Seq(Stars\,[\underbrace{\Char(a),\ldots,\Char(a)}_\text{n iterations}], Stars\,[])]$. The output of an algorithm we want would be a POSIX matching encoded as a value. +The reason why we are interested in $\POSIX$ values is that they can +be practically used in the lexing phase of a compiler front end. +For instance, when lexing a code snippet +$\textit{iffoo} = 3$ with the regular expression $\textit{keyword} + \textit{identifier}$, we want $\textit{iffoo}$ to be recognized +as an identifier rather than a keyword. + The contribution of Sulzmann and Lu is an extension of Brzozowski's algorithm by a second phase (the first phase being building successive derivatives---see \eqref{graph:*}). In this second phase, a POSIX value @@ -1256,7 +1269,7 @@ $\textit{if}\;c=d\; \;\textit{then}\; \textit{ONE}\;bs\;\textit{else}\;\textit{ZERO}$\\ $(\textit{ALTS}\;bs\,as)\,\backslash c$ & $\dn$ & - $\textit{ALTS}\;bs\,(as.map(\backslash c))$\\ + $\textit{ALTS}\;bs\,(map (\backslash c) as)$\\ $(\textit{SEQ}\;bs\,a_1\,a_2)\,\backslash c$ & $\dn$ & $\textit{if}\;\textit{bnullable}\,a_1$\\ & &$\textit{then}\;\textit{ALTS}\,bs\,List((\textit{SEQ}\,[]\,(a_1\,\backslash c)\,a_2),$\\ @@ -1316,7 +1329,7 @@ For instance, when we do derivative of $_{bs}a^*$ with respect to c, we need to unfold it into a sequence, and attach an additional bit $0$ to the front of $r \backslash c$ -to indicate that there is one more star iteration. Also the sequence clause +to indicate one more star iteration. Also the sequence clause is more subtle---when $a_1$ is $\textit{bnullable}$ (here \textit{bnullable} is exactly the same as $\textit{nullable}$, except that it is for annotated regular expressions, therefore we omit the @@ -1375,19 +1388,18 @@ operation from characters to strings (just like the derivatives for un-annotated regular expressions). -Remember tha one of the important reasons we introduced bitcodes -is that they can make simplification more structured and therefore guaranteeing -the correctness. +Now we introduce the simplifications, which is why we introduce the +bitcodes in the first place. -\subsection*{Our Simplification Rules} +\subsection*{Simplification Rules} -In this section we introduce aggressive (in terms of size) simplification rules +This section introduces aggressive (in terms of size) simplification rules on annotated regular expressions -in order to keep derivatives small. Such simplifications are promising +to keep derivatives small. Such simplifications are promising as we have generated test data that show -that a good tight bound can be achieved. Obviously we could only -partially cover the search space as there are infinitely many regular +that a good tight bound can be achieved. We could only +partially cover the search space as there are infinitely many regular expressions and strings. One modification we introduced is to allow a list of annotated regular @@ -1426,13 +1438,13 @@ \noindent The simplification does a pattern matching on the regular expression. When it detected that the regular expression is an alternative or -sequence, it will try to simplify its children regular expressions -recursively and then see if one of the children turn into $\ZERO$ or +sequence, it will try to simplify its child regular expressions +recursively and then see if one of the children turns into $\ZERO$ or $\ONE$, which might trigger further simplification at the current level. The most involved part is the $\sum$ clause, where we use two auxiliary functions $\textit{flatten}$ and $\textit{distinct}$ to open up nested alternatives and reduce as many duplicates as possible. Function -$\textit{distinct}$ keeps the first occurring copy only and remove all later ones +$\textit{distinct}$ keeps the first occurring copy only and removes all later ones when detected duplicates. Function $\textit{flatten}$ opens up nested $\sum$s. Its recursive definition is given below: @@ -1491,7 +1503,7 @@ -However, there are two difficulties with derivative-based matchers: +However, two difficulties with derivative-based matchers exist: First, Brzozowski's original matcher only generates a yes/no answer for whether a regular expression matches a string or not. This is too little information in the context of lexing where separate tokens must @@ -1554,7 +1566,7 @@ \emph{incremental parsing method} (that is the algorithm to be formalised in this paper): - +%motivation part \begin{quote}\it ``Correctness Claim: We further claim that the incremental parsing method [..] in combination with the simplification steps [..] diff -r 3c5e58d08939 -r ff7945a988a3 ChengsongTanPhdThesis/Chapters/Chapter2.tex --- a/ChengsongTanPhdThesis/Chapters/Chapter2.tex Fri May 20 18:52:03 2022 +0100 +++ b/ChengsongTanPhdThesis/Chapters/Chapter2.tex Thu May 26 20:51:40 2022 +0100 @@ -7,200 +7,16 @@ -%---------------------------------------------------------------------------------------- -% SECTION 1 -%---------------------------------------------------------------------------------------- -\section{Properties of $\backslash c$} - -To have a clear idea of what we can and -need to prove about the algorithms involving -Brzozowski's derivatives, there are a few -properties we need to be clear about . -\subsection{function $\backslash c$ is not 1-to-1} -\begin{center} -The derivative $w.r.t$ character $c$ is not one-to-one. -Formally, - $\exists r_1 \;r_2. r_1 \neq r_2 \mathit{and} r_1 \backslash c = r_2 \backslash c$ -\end{center} -This property is trivially true for the -character regex example: -\begin{center} - $r_1 = e; \; r_2 = d;\; r_1 \backslash c = \ZERO = r_2 \backslash c$ -\end{center} -But apart from the cases where the derivative -output is $\ZERO$, are there non-trivial results -of derivatives which contain strings? -The answer is yes. -For example, -\begin{center} - Let $r_1 = a^*b\;\quad r_2 = (a\cdot a^*)\cdot b + b$.\\ - where $a$ is not nullable.\\ - $r_1 \backslash c = ((a \backslash c)\cdot a^*)\cdot c + b \backslash c$\\ - $r_2 \backslash c = ((a \backslash c)\cdot a^*)\cdot c + b \backslash c$ -\end{center} -We start with two syntactically different regexes, -and end up with the same derivative result, which is -a "meaningful" regex because it contains strings. -We have rediscovered Arden's lemma:\\ -\begin{center} - $A^*B = A\cdot A^* \cdot B + B$ -\end{center} - - %----------------------------------- % SUBSECTION 1 %----------------------------------- -\subsection{Subsection 1} +\section{Specifications of Certain Functions to be Used} To be completed. - - - -%----------------------------------- -% SECTION 2 -%----------------------------------- - -\section{Finiteness Property} -In this section let us sketch our argument for why the size of the simplified -derivatives with the aggressive simplification function can be finitely bounded. - -For convenience, we use a new datatype which we call $\rrexp$ to denote -the difference between it and annotated regular expressions. -\[ \rrexp ::= \RZERO \mid \RONE - \mid \RCHAR{c} - \mid \RSEQ{r_1}{r_2} - \mid \RALTS{rs} - \mid \RSTAR{r} -\] -For $\rrexp$ we throw away the bitcodes on the annotated regular expressions, -but keep everything else intact. -It is similar to annotated regular expressions being $\erase$-ed, but with all its structure preserved -(the $\erase$ function unfortunately does not preserve structure in the case of empty and singleton -$\ALTS$). -We denote the operation of erasing the bits and turning an annotated regular expression -into an $\rrexp{}$ as $\rerase{}$. -\begin{center} -\begin{tabular}{lcr} -$\rerase{\AZERO}$ & $=$ & $\RZERO$\\ -$\rerase{\ASEQ{bs}{r_1}{r_2}}$ & $=$ & $\RSEQ{\rerase{r_1}}{\rerase{r_2}}$\\ -$\rerase{\AALTS{bs}{rs}}$ & $ =$ & $ \RALTS{rs}$ -\end{tabular} -\end{center} -%TODO: FINISH definition of rerase -Similarly we could define the derivative and simplification on -$\rrexp$, which would be identical to those we defined for plain $\rexp$s in chapter1, -except that now they can operate on alternatives taking multiple arguments. - -\begin{center} -\begin{tabular}{lcr} -$\RALTS{rs} \backslash c$ & $=$ & $\RALTS{map\; (\_ \backslash c) \;rs}$\\ -(other clauses omitted) -\end{tabular} -\end{center} - -Now that $\rrexp$s do not have bitcodes on them, we can do the -duplicate removal without an equivalence relation: -\begin{center} -\begin{tabular}{lcl} -$\rdistinct{r :: rs}{rset}$ & $=$ & $\textit{if}(r \in \textit{rset}) \; \textit{then} \; \rdistinct{rs}{rset}$\\ - & & $\textit{else}\; r::\rdistinct{rs}{(rset \cup \{r\})}$ -\end{tabular} -\end{center} -%TODO: definition of rsimp (maybe only the alternative clause) - - -The reason why these definitions mirror precisely the corresponding operations -on annotated regualar expressions is that we can calculate sizes more easily: - -\begin{lemma} -$\rsize{\rerase a} = \asize a$ -\end{lemma} -A similar equation holds for annotated regular expressions' simplification -and the plain regular expressions' simplification: -\begin{lemma} -$\asize{\bsimp{a}} = \rsize{\rsimp{\rerase{a}}}$ -\end{lemma} -Putting these two together we get a property that allows us to estimate the -size of an annotated regular expression derivative using its un-annotated counterpart: -\begin{lemma} -$\asize{\bderssimp{r}{s}} = \rsize{\rderssimp{\rerase{r}}{s}}$ -\end{lemma} -Unless stated otherwise in this chapter all $\textit{rexp}$s without - bitcodes are seen as $\rrexp$s. - We also use $r_1 + r_2$ and $\RALTS{[r_1, r_2]}$ interchageably - as the former suits people's intuitive way of stating a binary alternative - regular expression. - - Suppose -we have a size function for bitcoded regular expressions, written -$\llbracket r\rrbracket$, which counts the number of nodes if we regard $r$ as a tree -(we omit the precise definition; ditto for lists $\llbracket r\!s\rrbracket$). For this we show that for every $r$ -there exists a bound $N$ -such that - -\begin{center} -$\forall s. \; \llbracket{\bderssimp{r}{s}}\rrbracket \leq N$ -\end{center} - -\noindent -We prove this by induction on $r$. The base cases for $\AZERO$, -$\AONE \textit{bs}$ and $\ACHAR \textit{bs} c$ are straightforward. The interesting case is -for sequences of the form $\ASEQ{bs}{r_1}{r_2}$. In this case our induction -hypotheses state $\exists N_1. \forall s. \; \llbracket \bderssimp{r}{s} \rrbracket \leq N_1$ and -$\exists N_2. \forall s. \; \llbracket \bderssimp{r_2}{s} \rrbracket \leq N_2$. We can reason as follows -% -\begin{center} -\begin{tabular}{lcll} -& & $ \llbracket \bderssimp{\ASEQ{bs}{r_1}{r_2} }{s} \rrbracket$\\ -& $ = $ & $\llbracket bsimp\,(\textit{ALTs}\;bs\;(\ASEQ{nil}{\bderssimp{ r_1}{s}}{ r_2} :: - [\bderssimp{ r_2}{s'} \;|\; s' \in \textit{Suffix}( r_1, s)]))\rrbracket $ & (1) \\ -& $\leq$ & - $\llbracket\textit{\distinctWith}\,((\ASEQ{nil}{\bderssimp{r_1}{s}}{r_2}$) :: - [$\bderssimp{ r_2}{s'} \;|\; s' \in \textit{Suffix}( r_1, s)])\,\approx\;{}\rrbracket + 1 $ & (2) \\ -& $\leq$ & $\llbracket\ASEQ{bs}{\bderssimp{ r_1}{s}}{r_2}\rrbracket + - \llbracket\textit{distinctWith}\,[\bderssimp{r_2}{s'} \;|\; s' \in \textit{Suffix}( r_1, s)]\,\approx\;{}\rrbracket + 1 $ & (3) \\ -& $\leq$ & $N_1 + \llbracket r_2\rrbracket + 2 + - \llbracket \distinctWith\,[ \bderssimp{r_2}{s'} \;|\; s' \in \textit{Suffix}( r_1, s)] \,\approx\;\rrbracket$ & (4)\\ -& $\leq$ & $N_1 + \llbracket r_2\rrbracket + 2 + l_{N_{2}} * N_{2}$ & (5) -\end{tabular} -\end{center} - -% tell Chengsong about Indian paper of closed forms of derivatives - -\noindent -where in (1) the $\textit{Suffix}( r_1, s)$ are the all the suffixes of $s$ where $\bderssimp{ r_1}{s'}$ is nullable ($s'$ being a suffix of $s$). -The reason why we could write the derivatives of a sequence this way is described in section 2. -The term (2) is used to control (1). -That is because one can obtain an overall -smaller regex list -by flattening it and removing $\ZERO$s first before applying $\distinctWith$ on it. -Section 3 is dedicated to its proof. -In (3) we know that $\llbracket \ASEQ{bs}{(\bderssimp{ r_1}{s}}{r_2}\rrbracket$ is -bounded by $N_1 + \llbracket{}r_2\rrbracket + 1$. In (5) we know the list comprehension contains only regular expressions of size smaller -than $N_2$. The list length after $\distinctWith$ is bounded by a number, which we call $l_{N_2}$. It stands -for the number of distinct regular expressions smaller than $N_2$ (there can only be finitely many of them). -We reason similarly for $\STAR$.\medskip - -\noindent -Clearly we give in this finiteness argument (Step (5)) a very loose bound that is -far from the actual bound we can expect. We can do better than this, but this does not improve -the finiteness property we are proving. If we are interested in a polynomial bound, -one would hope to obtain a similar tight bound as for partial -derivatives introduced by Antimirov \cite{Antimirov95}. After all the idea with - $\distinctWith$ is to maintain a ``set'' of alternatives (like the sets in -partial derivatives). Unfortunately to obtain the exact same bound would mean -we need to introduce simplifications, such as - $(r_1 + r_2) \cdot r_3 \longrightarrow (r_1 \cdot r_3) + (r_2 \cdot r_3)$, -which exist for partial derivatives. However, if we introduce them in our -setting we would lose the POSIX property of our calculated values. We leave better -bounds for future work.\\[-6.5mm] - - - %----------------------------------- % SECTION ? %----------------------------------- @@ -247,9 +63,324 @@ \begin{lemma} $r \rightarrow_f r' \implies \rsimp{r} \rightarrow \rsimp{r'}$ \end{lemma} -. + Fortunately this is provable by induction on the list $rs$ + + +%----------------------------------- +% SECTION 2 +%----------------------------------- + +\section{Finiteness Property} +In this section let us describe our argument for why the size of the simplified +derivatives with the aggressive simplification function is finitely bounded. + Suppose +we have a size function for bitcoded regular expressions, written +$\llbracket r\rrbracket$, which counts the number of nodes if we regard $r$ as a tree + +\begin{center} +\begin{tabular}{ccc} +$\llbracket \ACHAR{bs}{c} \rrbracket $ & $\dn$ & $1$\\ +\end{tabular} +\end{center} +(TODO: COMPLETE this defn and for $rs$) + +The size is based on a recursive function on the structure of the regex, +not the bitcodes. +Therefore we may as well talk about size of an annotated regular expression +in their un-annotated form: +\begin{center} +$\asize(a) = \size(\erase(a))$. +\end{center} +(TODO: turn equals to roughly equals) + +But there is a minor nuisance here: +the erase function actually messes with the structure of the regular expression: +\begin{center} +\begin{tabular}{ccc} +$\erase(\AALTS{bs}{[]} )$ & $\dn$ & $\ZERO$\\ +\end{tabular} +\end{center} +(TODO: complete this definition with singleton r in alts) + +An alternative regular expression with an empty list of children + is turned into an $\ZERO$ during the +$\erase$ function, thereby changing the size and structure of the regex. +These will likely be fixable if we really want to use plain $\rexp$s for dealing +with size, but we choose a more straightforward (or stupid) method by +defining a new datatype that is similar to plain $\rexp$s but can take +non-binary arguments for its alternative constructor, + which we call $\rrexp$ to denote +the difference between it and plain regular expressions. +\[ \rrexp ::= \RZERO \mid \RONE + \mid \RCHAR{c} + \mid \RSEQ{r_1}{r_2} + \mid \RALTS{rs} + \mid \RSTAR{r} +\] +For $\rrexp$ we throw away the bitcodes on the annotated regular expressions, +but keep everything else intact. +It is similar to annotated regular expressions being $\erase$-ed, but with all its structure preserved +(the $\erase$ function unfortunately does not preserve structure in the case of empty and singleton +$\ALTS$). +We denote the operation of erasing the bits and turning an annotated regular expression +into an $\rrexp{}$ as $\rerase{}$. +\begin{center} +\begin{tabular}{lcl} +$\rerase{\AZERO}$ & $=$ & $\RZERO$\\ +$\rerase{\ASEQ{bs}{r_1}{r_2}}$ & $=$ & $\RSEQ{\rerase{r_1}}{\rerase{r_2}}$\\ +$\rerase{\AALTS{bs}{rs}}$ & $ =$ & $ \RALTS{rs}$ +\end{tabular} +\end{center} +%TODO: FINISH definition of rerase +Similarly we could define the derivative and simplification on +$\rrexp$, which would be identical to those we defined for plain $\rexp$s in chapter1, +except that now they can operate on alternatives taking multiple arguments. + +\begin{center} +\begin{tabular}{lcr} +$\RALTS{rs} \backslash c$ & $=$ & $\RALTS{map\; (\_ \backslash c) \;rs}$\\ +(other clauses omitted) +\end{tabular} +\end{center} + +Now that $\rrexp$s do not have bitcodes on them, we can do the +duplicate removal without an equivalence relation: +\begin{center} +\begin{tabular}{lcl} +$\rdistinct{r :: rs}{rset}$ & $=$ & $\textit{if}(r \in \textit{rset}) \; \textit{then} \; \rdistinct{rs}{rset}$\\ + & & $\textit{else}\; r::\rdistinct{rs}{(rset \cup \{r\})}$ +\end{tabular} +\end{center} +%TODO: definition of rsimp (maybe only the alternative clause) + + +The reason why these definitions mirror precisely the corresponding operations +on annotated regualar expressions is that we can calculate sizes more easily: + +\begin{lemma} +$\rsize{\rerase a} = \asize a$ +\end{lemma} +This lemma says that the size of an r-erased regex is the same as the annotated regex. +this does not hold for plain $\rexp$s due to their ways of how alternatives are represented. +\begin{lemma} +$\asize{\bsimp{a}} = \rsize{\rsimp{\rerase{a}}}$ +\end{lemma} +Putting these two together we get a property that allows us to estimate the +size of an annotated regular expression derivative using its un-annotated counterpart: +\begin{lemma} +$\asize{\bderssimp{r}{s}} = \rsize{\rderssimp{\rerase{r}}{s}}$ +\end{lemma} +Unless stated otherwise in this chapter all $\textit{rexp}$s without + bitcodes are seen as $\rrexp$s. + We also use $r_1 + r_2$ and $\RALTS{[r_1, r_2]}$ interchageably + as the former suits people's intuitive way of stating a binary alternative + regular expression. + + +\begin{theorem} +For any regex $r$, $\exists N_r. \forall s. \; \llbracket{\bderssimp{r}{s}}\rrbracket \leq N_r$ +\end{theorem} + +\noindent +\begin{proof} +We prove this by induction on $r$. The base cases for $\AZERO$, +$\AONE \textit{bs}$ and $\ACHAR \textit{bs} c$ are straightforward. The interesting case is +for sequences of the form $\ASEQ{bs}{r_1}{r_2}$. In this case our induction +hypotheses state $\exists N_1. \forall s. \; \llbracket \bderssimp{r}{s} \rrbracket \leq N_1$ and +$\exists N_2. \forall s. \; \llbracket \bderssimp{r_2}{s} \rrbracket \leq N_2$. We can reason as follows +% +\begin{center} +\begin{tabular}{lcll} +& & $ \llbracket \bderssimp{\ASEQ{bs}{r_1}{r_2} }{s} \rrbracket$\\ +& $ = $ & $\llbracket bsimp\,(\textit{ALTs}\;bs\;(\ASEQ{nil}{\bderssimp{ r_1}{s}}{ r_2} :: + [\bderssimp{ r_2}{s'} \;|\; s' \in \textit{Suffix}( r_1, s)]))\rrbracket $ & (1) \\ +& $\leq$ & + $\llbracket\textit{\distinctWith}\,((\ASEQ{nil}{\bderssimp{r_1}{s}}{r_2}$) :: + [$\bderssimp{ r_2}{s'} \;|\; s' \in \textit{Suffix}( r_1, s)])\,\approx\;{}\rrbracket + 1 $ & (2) \\ +& $\leq$ & $\llbracket\ASEQ{bs}{\bderssimp{ r_1}{s}}{r_2}\rrbracket + + \llbracket\textit{distinctWith}\,[\bderssimp{r_2}{s'} \;|\; s' \in \textit{Suffix}( r_1, s)]\,\approx\;{}\rrbracket + 1 $ & (3) \\ +& $\leq$ & $N_1 + \llbracket r_2\rrbracket + 2 + + \llbracket \distinctWith\,[ \bderssimp{r_2}{s'} \;|\; s' \in \textit{Suffix}( r_1, s)] \,\approx\;\rrbracket$ & (4)\\ +& $\leq$ & $N_1 + \llbracket r_2\rrbracket + 2 + l_{N_{2}} * N_{2}$ & (5) +\end{tabular} +\end{center} + + +\noindent +where in (1) the $\textit{Suffix}( r_1, s)$ are the all the suffixes of $s$ where $\bderssimp{ r_1}{s'}$ is nullable ($s'$ being a suffix of $s$). +The reason why we could write the derivatives of a sequence this way is described in section 2. +The term (2) is used to control (1). +That is because one can obtain an overall +smaller regex list +by flattening it and removing $\ZERO$s first before applying $\distinctWith$ on it. +Section 3 is dedicated to its proof. +In (3) we know that $\llbracket \ASEQ{bs}{(\bderssimp{ r_1}{s}}{r_2}\rrbracket$ is +bounded by $N_1 + \llbracket{}r_2\rrbracket + 1$. In (5) we know the list comprehension contains only regular expressions of size smaller +than $N_2$. The list length after $\distinctWith$ is bounded by a number, which we call $l_{N_2}$. It stands +for the number of distinct regular expressions smaller than $N_2$ (there can only be finitely many of them). +We reason similarly for $\STAR$.\medskip +\end{proof} + +What guarantee does this bound give us? + +Whatever the regex is, it will not grow indefinitely. +Take our previous example $(a + aa)^*$ as an example: +\begin{center} +\begin{tabular}{@{}c@{\hspace{0mm}}c@{\hspace{0mm}}c@{}} +\begin{tikzpicture} +\begin{axis}[ + xlabel={number of $a$'s}, + x label style={at={(1.05,-0.05)}}, + ylabel={regex size}, + enlargelimits=false, + xtick={0,5,...,30}, + xmax=33, + ymax= 40, + ytick={0,10,...,40}, + scaled ticks=false, + axis lines=left, + width=5cm, + height=4cm, + legend entries={$(a + aa)^*$}, + legend pos=north west, + legend cell align=left] +\addplot[red,mark=*, mark options={fill=white}] table {a_aa_star.data}; +\end{axis} +\end{tikzpicture} +\end{tabular} +\end{center} +We are able to limit the size of the regex $(a + aa)^*$'s derivatives + with our simplification +rules very effectively. + + +As the graphs of some more randomly generated regexes show, the size of +the derivative might grow quickly at the start of the input, + but after a sufficiently long input string, the trend will stop. + + +%a few sample regular experessions' derivatives +%size change +%TODO: giving graphs showing a few regexes' size changes +%w;r;t the input characters number +%a*, aa*, aaa*, ..... +%randomly generated regexes +\begin{center} +\begin{tabular}{@{}c@{\hspace{0mm}}c@{\hspace{0mm}}c@{}} +\begin{tikzpicture} +\begin{axis}[ + xlabel={number of $a$'s}, + x label style={at={(1.05,-0.05)}}, + ylabel={regex size}, + enlargelimits=false, + xtick={0,5,...,30}, + xmax=33, + ymax=1000, + ytick={0,100,...,1000}, + scaled ticks=false, + axis lines=left, + width=5cm, + height=4cm, + legend entries={regex1}, + legend pos=north west, + legend cell align=left] +\addplot[red,mark=*, mark options={fill=white}] table {regex1_size_change.data}; +\end{axis} +\end{tikzpicture} + & +\begin{tikzpicture} +\begin{axis}[ + xlabel={$n$}, + x label style={at={(1.05,-0.05)}}, + %ylabel={time in secs}, + enlargelimits=false, + xtick={0,5,...,30}, + xmax=33, + ymax=1000, + ytick={0,100,...,1000}, + scaled ticks=false, + axis lines=left, + width=5cm, + height=4cm, + legend entries={regex2}, + legend pos=north west, + legend cell align=left] +\addplot[blue,mark=*, mark options={fill=white}] table {regex2_size_change.data}; +\end{axis} +\end{tikzpicture} + & +\begin{tikzpicture} +\begin{axis}[ + xlabel={$n$}, + x label style={at={(1.05,-0.05)}}, + %ylabel={time in secs}, + enlargelimits=false, + xtick={0,5,...,30}, + xmax=33, + ymax=1000, + ytick={0,100,...,1000}, + scaled ticks=false, + axis lines=left, + width=5cm, + height=4cm, + legend entries={regex3}, + legend pos=north west, + legend cell align=left] +\addplot[cyan,mark=*, mark options={fill=white}] table {regex3_size_change.data}; +\end{axis} +\end{tikzpicture}\\ +\multicolumn{3}{c}{Graphs: size change of 3 randomly generated regexes $w.r.t.$ input string length.} +\end{tabular} +\end{center} + + + + + +\noindent +Clearly we give in this finiteness argument (Step (5)) a very loose bound that is +far from the actual bound we can expect. +In our proof for the inductive case $r_1 \cdot r_2$, the dominant term in the bound +is $l_{N_2} * N_2$, where $N_2$ is the bound we have for $\llbracket \bderssimp{r_2}{s} \rrbracket$. +Given that $l_{N_2}$ is roughly the size $4^{N_2}$, the size bound $\llbracket \bderssimp{r_1 \cdot r_2}{s} \rrbracket$ +inflates the size bound of $\llbracket \bderssimp{r_2}{s} \rrbracket$ with the function +$f(x) = x * 2^x$. +This means the bound we have will surge up at least +tower-exponentially with a linear increase of the depth. +For a regex of depth $n$, the bound +would be approximately $4^n$. +%TODO: change this to tower exponential! + +We can do better than this, but this does not improve +the finiteness property we are proving. If we are interested in a polynomial bound, +one would hope to obtain a similar tight bound as for partial +derivatives introduced by Antimirov \cite{Antimirov95}. After all the idea with + $\distinctWith$ is to maintain a ``set'' of alternatives (like the sets in +partial derivatives). +To obtain the exact same bound would mean +we need to introduce simplifications, such as + $(r_1 + r_2) \cdot r_3 \longrightarrow (r_1 \cdot r_3) + (r_2 \cdot r_3)$, +which exist for partial derivatives. + +However, if we introduce them in our +setting we would lose the POSIX property of our calculated values. +A simple example for this would be the regex $(a + a\cdot b)\cdot(b\cdot c + c)$. +If we split this regex up into $a\cdot(b\cdot c + c) + a\cdot b \cdot (b\cdot c + c)$, the lexer +would give back $\Left(\Seq(\Char(a), \Left(\Char(b \cdot c))))$ instead of +what we want: $\Seq(\Right(ab), \Right(c))$. Unless we can store the structural information +in all the places where a transformation of the form $(r_1 + r_2)\cdot r \rightarrow r_1 \cdot r + r_2 \cdot r$ +occurs, and apply them in the right order once we get +a result of the "aggressively simplified" regex, it would be impossible to still get a $\POSIX$ value. +This is unlike the simplification we had before, where the rewriting rules +such as $\ONE \cdot r \rightsquigarrow r$, under which our lexer will give the same value. +We will discuss better +bounds in the last section of this chapter.\\[-6.5mm] + + + + %---------------------------------------------------------------------------------------- % SECTION ?? %---------------------------------------------------------------------------------------- @@ -369,10 +500,62 @@ The intuition is that if we remove duplicates from the $\textit{LHS}$, at least the same amount of duplicates will be removed from the list $\textit{rs}$ in the $\textit{RHS}$. + +%---------------------------------------------------------------------------------------- +% SECTION ALTS CLOSED FORM +%---------------------------------------------------------------------------------------- +\section{A Closed Form for \textit{ALTS}} +Now we prove that $rsimp (rders\_simp (RALTS rs) s) = rsimp (RALTS (map (\lambda r. rders\_simp r s) rs))$. + + +There are a few key steps, one of these steps is +$rsimp (rsimp\_ALTs (map (rder x) (rdistinct (rflts (map (rsimp \circ (\lambda r. rders\_simp r xs)) rs)) {}))) += rsimp (rsimp\_ALTs (rdistinct (map (rder x) (rflts (map (rsimp \circ (\lambda r. rders\_simp r xs)) rs))) {}))$ + +One might want to prove this by something a simple statement like: +$map (rder x) (rdistinct rs rset) = rdistinct (map (rder x) rs) (rder x) ` rset)$. + +For this to hold we want the $\textit{distinct}$ function to pick up +the elements before and after derivatives correctly: +$r \in rset \equiv (rder x r) \in (rder x rset)$. +which essentially requires that the function $\backslash$ is an injective mapping. + +Unfortunately the function $\backslash c$ is not an injective mapping. + +\subsection{function $\backslash c$ is not injective (1-to-1)} +\begin{center} +The derivative $w.r.t$ character $c$ is not one-to-one. +Formally, + $\exists r_1 \;r_2. r_1 \neq r_2 \mathit{and} r_1 \backslash c = r_2 \backslash c$ +\end{center} +This property is trivially true for the +character regex example: +\begin{center} + $r_1 = e; \; r_2 = d;\; r_1 \backslash c = \ZERO = r_2 \backslash c$ +\end{center} +But apart from the cases where the derivative +output is $\ZERO$, are there non-trivial results +of derivatives which contain strings? +The answer is yes. +For example, +\begin{center} + Let $r_1 = a^*b\;\quad r_2 = (a\cdot a^*)\cdot b + b$.\\ + where $a$ is not nullable.\\ + $r_1 \backslash c = ((a \backslash c)\cdot a^*)\cdot c + b \backslash c$\\ + $r_2 \backslash c = ((a \backslash c)\cdot a^*)\cdot c + b \backslash c$ +\end{center} +We start with two syntactically different regexes, +and end up with the same derivative result. +This is not surprising as we have such +equality as below in the style of Arden's lemma:\\ +\begin{center} + $L(A^*B) = L(A\cdot A^* \cdot B + B)$ +\end{center} + %---------------------------------------------------------------------------------------- % SECTION 4 %---------------------------------------------------------------------------------------- -\section{a bound for the star regular expression} +\section{A Bound for the Star Regular Expression} We have shown how to control the size of the sequence regular expression $r_1\cdot r_2$ using the "closed form" of $(r_1 \cdot r_2) \backslash s$ and then the property of the $\distinct$ function. @@ -410,7 +593,7 @@ $\hflat{r_1 + r_2}$ & $=$ & $\RALTS{\hflataux{r_1} @ \hflataux{r_2}}$\\ $\hflat r$ & $=$ & $ r$ \end{tabular} -\end{center} +\end{center}s Again these definitions are tailor-made for dealing with alternatives that have originated from a star's derivatives, so we don't attempt to open up all possible regexes of the form $\RALTS{rs}$, where $\textit{rs}$ might not contain precisely 2 @@ -452,7 +635,7 @@ proof for a size bound, we are happy to restrict ourselves to unannotated regular expressions, and obtain such equalities as \begin{lemma} - $\rsimp{(r_1 + r_2)} = \rsimp{\RALTS{\hflataux{r_1} @ \hflataux{r_2}}}$ + $\rsimp{r_1 + r_2} = \rsimp{\RALTS{\hflataux{r_1} @ \hflataux{r_2}}}$ \end{lemma} \begin{proof} @@ -480,30 +663,104 @@ One might wonder the actual bound rather than the loose bound we gave -for the convenience of a easier proof. -How much can the regex $r^* \backslash s$ grow? As hinted earlier, +for the convenience of an easier proof. +How much can the regex $r^* \backslash s$ grow? +As earlier graphs have shown, +%TODO: reference that graph where size grows quickly they can grow at a maximum speed exponential $w.r.t$ the number of characters, -but will eventually level off when the string $s$ is long enough, -as suggested by the finiteness bound proof. -And unfortunately we have concrete examples +but will eventually level off when the string $s$ is long enough. +If they grow to a size exponential $w.r.t$ the original regex, our algorithm +would still be slow. +And unfortunately, we have concrete examples where such regexes grew exponentially large before levelling off: $(a ^ * + (aa) ^ * + (aaa) ^ * + \ldots + (\underbrace{a \ldots a}_{\text{n a's}})^*$ will already have a maximum -size that is exponential on the number $n$. -%TODO: give out a graph showing how the size of the regex grows and levels off at a size exponential to the original regex - + size that is exponential on the number $n$ +under our current simplification rules: +%TODO: graph of a regex whose size increases exponentially. +\begin{center} +\begin{tikzpicture} + \begin{axis}[ + height=0.5\textwidth, + width=\textwidth, + xlabel=number of a's, + xtick={0,...,9}, + ylabel=maximum size, + ymode=log, + log basis y={2} +] + \addplot[mark=*,blue] table {re-chengsong.data}; + \end{axis} +\end{tikzpicture} +\end{center} -While the tight upper bound will definitely be a lot lower than -the one we gave for the finiteness proof, -it will still be at least expoential, under our current simplification rules. +For convenience we use $(\oplus_{i=1}^{n} (\underbrace{a \ldots a}_{\text{i a's}})^*)^*$ +to express $(a ^ * + (aa) ^ * + (aaa) ^ * + \ldots + +(\underbrace{a \ldots a}_{\text{n a's}})^*$ in the below discussion. +The exponential size is triggered by that the regex +$\oplus_{i=1}^{n} (\underbrace{a \ldots a}_{\text{i a's}})^*$ +inside the $(\ldots) ^*$ having exponentially many +different derivatives, despite those difference being minor. +$(\oplus_{i=1}^{n} (\underbrace{a \ldots a}_{\text{i a's}})^*)^*\backslash \underbrace{a \ldots a}_{\text{m a's}}$ +will therefore contain the following terms (after flattening out all nested +alternatives): +\begin{center} +$(\oplus_{i = 1]{n} (\underbrace{a \ldots a}_{\text{((i - (m' \% i))\%i) a's}})\cdot (\underbrace{a \ldots a}_{\text{i a's}})^* })\cdot (\oplus_{i=1}^{n} (\underbrace{a \ldots a}_{\text{i a's}})^*)$\\ +$(1 \leq m' \leq m )$ +\end{center} +These terms are distinct for $m' \leq L.C.M.(1, \ldots, n)$ (will be explained in appendix). + With each new input character taking the derivative against the intermediate result, more and more such distinct + terms will accumulate, +until the length reaches $L.C.M.(1, \ldots, n)$. +$\textit{distinctBy}$ will not be able to de-duplicate any two of these terms +$(\oplus_{i = 1}^{n} (\underbrace{a \ldots a}_{\text{((i - (m' \% i))\%i) a's}})\cdot (\underbrace{a \ldots a}_{\text{i a's}})^* )\cdot (\oplus_{i=1}^{n} (\underbrace{a \ldots a}_{\text{i a's}})^*)^*$\\ -This suggests stronger simplifications that keep the tight bound polynomial. +$(\oplus_{i = 1}^{n} (\underbrace{a \ldots a}_{\text{((i - (m'' \% i))\%i) a's}})\cdot (\underbrace{a \ldots a}_{\text{i a's}})^* )\cdot (\oplus_{i=1}^{n} (\underbrace{a \ldots a}_{\text{i a's}})^*)^*$\\ + where $m' \neq m''$ \\ + as they are slightly different. + This means that with our current simplification methods, + we will not be able to control the derivative so that + $\llbracket \bderssimp{r}{s} \rrbracket$ stays polynomial %\leq O((\llbracket r\rrbacket)^c)$ + as there are already exponentially many terms. + These terms are similar in the sense that the head of those terms + are all consisted of sub-terms of the form: + $(\underbrace{a \ldots a}_{\text{j a's}})\cdot (\underbrace{a \ldots a}_{\text{i a's}})^* $. + For $\oplus_{i=1}^{n} (\underbrace{a \ldots a}_{\text{i a's}})^*$, there will be at most + $n * (n + 1) / 2$ such terms. + For example, $(a^* + (aa)^* + (aaa)^*) ^*$'s derivatives + can be described by 6 terms: + $a^*$, $a\cdot (aa)^*$, $ (aa)^*$, + $aa \cdot (aaa)^*$, $a \cdot (aaa)^*$, and $(aaa)^*$. +The total number of different "head terms", $n * (n + 1) / 2$, + is proportional to the number of characters in the regex +$(\oplus_{i=1}^{n} (\underbrace{a \ldots a}_{\text{i a's}})^*)^*$. +This suggests a slightly different notion of size, which we call the +alphabetic width: +%TODO: +(TODO: Alphabetic width def.) + + +Antimirov\parencite{Antimirov95} has proven that +$\textit{PDER}_{UNIV}(r) \leq \textit{awidth}(r)$. +where $\textit{PDER}_{UNIV}(r)$ is a set of all possible subterms +created by doing derivatives of $r$ against all possible strings. +If we can make sure that at any moment in our lexing algorithm our +intermediate result hold at most one copy of each of the +subterms then we can get the same bound as Antimirov's. +This leads to the algorithm in the next section. %---------------------------------------------------------------------------------------- % SECTION 5 %---------------------------------------------------------------------------------------- -\section{A Stronger Version of Simplification} +\section{A Stronger Version of Simplification Inspired by Antimirov} %TODO: search for isabelle proofs of algorithms that check equivalence + + + + + + + diff -r 3c5e58d08939 -r ff7945a988a3 ChengsongTanPhdThesis/example.bib --- a/ChengsongTanPhdThesis/example.bib Fri May 20 18:52:03 2022 +0100 +++ b/ChengsongTanPhdThesis/example.bib Thu May 26 20:51:40 2022 +0100 @@ -1,24 +1,41 @@ %% This BibTeX bibliography file was created using BibDesk. %% https://bibdesk.sourceforge.io/ -%% Created for CS TAN at 2022-03-16 16:38:47 +0000 +%% Created for CS TAN at 2022-05-23 18:43:50 +0100 %% Saved with string encoding Unicode (UTF-8) + + +@article{Turo_ov__2020, + author = {Lenka Turo{\v{n}}ov{\'{a}} and Luk{\'{a}}{\v{s}} Hol{\'{\i}}k and Ond{\v{r}}ej Leng{\'{a}}l and Olli Saarikivi and Margus Veanes and Tom{\'{a}}{\v{s}} Vojnar}, + date-added = {2022-05-23 18:43:04 +0100}, + date-modified = {2022-05-23 18:43:04 +0100}, + doi = {10.1145/3428286}, + journal = {Proceedings of the {ACM} on Programming Languages}, + month = {nov}, + number = {{OOPSLA}}, + pages = {1--30}, + publisher = {Association for Computing Machinery ({ACM})}, + title = {Regex matching with counting-set automata}, + url = {https://doi.org/10.1145%2F3428286}, + volume = {4}, + year = 2020, + bdsk-url-1 = {https://doi.org/10.1145%2F3428286}, + bdsk-url-2 = {https://doi.org/10.1145/3428286}} + @article{Rathnayake2014StaticAF, - title={Static Analysis for Regular Expression Exponential Runtime via Substructural Logics}, - author={Asiri Rathnayake and Hayo Thielecke}, - journal={ArXiv}, - year={2014}, - volume={abs/1405.7058} -} + author = {Asiri Rathnayake and Hayo Thielecke}, + journal = {ArXiv}, + title = {Static Analysis for Regular Expression Exponential Runtime via Substructural Logics}, + volume = {abs/1405.7058}, + year = {2014}} @inproceedings{Weideman2017Static, - title={Static analysis of regular expressions}, - author={Nicolaas Weideman}, - year={2017} -} + author = {Nicolaas Weideman}, + title = {Static analysis of regular expressions}, + year = {2017}} @inproceedings{RibeiroAgda2017, abstract = {We describe the formalization of a regular expression (RE) parsing algorithm that produces a bit representation of its parse tree in the dependently typed language Agda. The algorithm computes bit-codes using Brzozowski derivatives and we prove that produced codes are equivalent to parse trees ensuring soundness and completeness w.r.t an inductive RE semantics. We include the certified algorithm in a tool developed by us, named verigrep, for regular expression based search in the style of the well known GNU grep. Practical experiments conducted with this tool are reported.}, @@ -63,44 +80,42 @@ journal = {arXiv:1405.7058}, title = {Static Analysis for Regular Expression Exponential Runtime via Substructural Logics}, year = {2017}} + @article{campeanu2003formal, - title={A formal study of practical regular expressions}, - author={C{\^a}mpeanu, Cezar and Salomaa, Kai and Yu, Sheng}, - journal={International Journal of Foundations of Computer Science}, - volume={14}, - number={06}, - pages={1007--1018}, - year={2003}, - publisher={World Scientific} -} + author = {C{\^a}mpeanu, Cezar and Salomaa, Kai and Yu, Sheng}, + journal = {International Journal of Foundations of Computer Science}, + number = {06}, + pages = {1007--1018}, + publisher = {World Scientific}, + title = {A formal study of practical regular expressions}, + volume = {14}, + year = {2003}} @article{alfred2014algorithms, - title={Algorithms for finding patterns in strings}, - author={Alfred, V}, - journal={Algorithms and Complexity}, - volume={1}, - pages={255}, - year={2014}, - publisher={Elsevier} -} - + author = {Alfred, V}, + journal = {Algorithms and Complexity}, + pages = {255}, + publisher = {Elsevier}, + title = {Algorithms for finding patterns in strings}, + volume = {1}, + year = {2014}} @article{CAMPEANU2009Intersect, -title = {On the intersection of regex languages with regular languages}, -journal = {Theoretical Computer Science}, -volume = {410}, -number = {24}, -pages = {2336-2344}, -year = {2009}, -note = {Formal Languages and Applications: A Collection of Papers in Honor of Sheng Yu}, -issn = {0304-3975}, -doi = {https://doi.org/10.1016/j.tcs.2009.02.022}, -url = {https://www.sciencedirect.com/science/article/pii/S0304397509001789}, -author = {Cezar Câmpeanu and Nicolae Santean}, -keywords = {Extended regular expression, Regex automata system, Regex}, -abstract = {In this paper we revisit the semantics of extended regular expressions (regex), defined succinctly in the 90s [A.V. Aho, Algorithms for finding patterns in strings, in: Jan van Leeuwen (Ed.), Handbook of Theoretical Computer Science, in: Algorithms and Complexity, vol. A, Elsevier and MIT Press, 1990, pp. 255–300] and rigorously in 2003 by Câmpeanu, Salomaa and Yu [C. Câmpeanu, K. Salomaa, S. Yu, A formal study of practical regular expressions, IJFCS 14 (6) (2003) 1007–1018], when the authors reported an open problem, namely whether regex languages are closed under the intersection with regular languages. We give a positive answer; and for doing so, we propose a new class of machines — regex automata systems (RAS) — which are equivalent to regex. Among others, these machines provide a consistent and convenient method of implementing regex in practice. We also prove, as a consequence of this closure property, that several languages, such as the mirror language, the language of palindromes, and the language of balanced words are not regex languages.} -} - + abstract = {In this paper we revisit the semantics of extended regular expressions (regex), defined succinctly in the 90s [A.V. Aho, Algorithms for finding patterns in strings, in: Jan van Leeuwen (Ed.), Handbook of Theoretical Computer Science, in: Algorithms and Complexity, vol. A, Elsevier and MIT Press, 1990, pp. 255--300] and rigorously in 2003 by C{\^a}mpeanu, Salomaa and Yu [C. C{\^a}mpeanu, K. Salomaa, S. Yu, A formal study of practical regular expressions, IJFCS 14 (6) (2003) 1007--1018], when the authors reported an open problem, namely whether regex languages are closed under the intersection with regular languages. We give a positive answer; and for doing so, we propose a new class of machines --- regex automata systems (RAS) --- which are equivalent to regex. Among others, these machines provide a consistent and convenient method of implementing regex in practice. We also prove, as a consequence of this closure property, that several languages, such as the mirror language, the language of palindromes, and the language of balanced words are not regex languages.}, + author = {Cezar C{\^a}mpeanu and Nicolae Santean}, + doi = {https://doi.org/10.1016/j.tcs.2009.02.022}, + issn = {0304-3975}, + journal = {Theoretical Computer Science}, + keywords = {Extended regular expression, Regex automata system, Regex}, + note = {Formal Languages and Applications: A Collection of Papers in Honor of Sheng Yu}, + number = {24}, + pages = {2336-2344}, + title = {On the intersection of regex languages with regular languages}, + url = {https://www.sciencedirect.com/science/article/pii/S0304397509001789}, + volume = {410}, + year = {2009}, + bdsk-url-1 = {https://www.sciencedirect.com/science/article/pii/S0304397509001789}, + bdsk-url-2 = {https://doi.org/10.1016/j.tcs.2009.02.022}} @article{nielson11bcre, author = {Lasse Nielsen, Fritz Henglein}, @@ -110,6 +125,7 @@ title = {Bit-coded Regular Expression Parsing}, year = {2011}, bdsk-file-1 = {YnBsaXN0MDDSAQIDBFxyZWxhdGl2ZVBhdGhZYWxpYXNEYXRhXxA1Li4vLi4vLi4vTXkgTWFjIChNYWNCb29rLVBybykvRGVza3RvcC9mcml0ei1wYXBlci5wZGZPEQF+AAAAAAF+AAIAAAxNYWNpbnRvc2ggSEQAAAAAAAAAAAAAAAAAAAAAAAAAQkQAAf////8PZnJpdHotcGFwZXIucGRmAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA/////wAAAAAAAAAAAAAAAAADAAMAAAogY3UAAAAAAAAAAAAAAAAAB0Rlc2t0b3AAAAIAQi86VXNlcnM6Y3N0YW46RHJvcGJveDpNeSBNYWMgKE1hY0Jvb2stUHJvKTpEZXNrdG9wOmZyaXR6LXBhcGVyLnBkZgAOACAADwBmAHIAaQB0AHoALQBwAGEAcABlAHIALgBwAGQAZgAPABoADABNAGEAYwBpAG4AdABvAHMAaAAgAEgARAASAEBVc2Vycy9jc3Rhbi9Ecm9wYm94L015IE1hYyAoTWFjQm9vay1Qcm8pL0Rlc2t0b3AvZnJpdHotcGFwZXIucGRmABMAAS8AABUAAgAM//8AAAAIAA0AGgAkAFwAAAAAAAACAQAAAAAAAAAFAAAAAAAAAAAAAAAAAAAB3g==}} + @software{regexploit2021, author = {Ben Caller, Luca Carettoni}, date-added = {2020-11-24 00:00:00 +0000}, @@ -118,36 +134,38 @@ month = {May}, title = {regexploit}, url = {https://github.com/doyensec/regexploit}, - year = {2021} -} - + year = {2021}, + bdsk-url-1 = {https://github.com/doyensec/regexploit}} + @misc{KuklewiczHaskell, - title = {Regex Posix}, author = {Kuklewicz}, keywords = {Buggy C POSIX Lexing Libraries}, + title = {Regex Posix}, url = {https://wiki.haskell.org/Regex_Posix}, - year = {2017} -} + year = {2017}, + bdsk-url-1 = {https://wiki.haskell.org/Regex_Posix}} + @misc{regex101, - title = {regex101}, author = {Firas Dib}, - year = {2011}, + keywords = {regex tester debugger}, + title = {regex101}, url = {https://regex101.com/}, - keywords = {regex tester debugger} -} + year = {2011}, + bdsk-url-1 = {https://regex101.com/}} + @misc{atomEditor, + author = {Dunno}, + keywords = {text editor}, title = {Atom Editor}, - author = {Dunno}, + url = {https://atom.io}, year = {2022}, - url = {https://atom.io}, - keywords = {text editor} -} + bdsk-url-1 = {https://atom.io}} + @techreport{grathwohl2014crash, - title={A Crash-Course in Regular Expression Parsing and Regular Expressions as Types}, - author={Grathwohl, Niels Bj{\o}rn Bugge and Henglein, Fritz and Rasmussen, Ulrik Terp}, - year={2014}, - institution={Technical report, University of Copenhagen} -} + author = {Grathwohl, Niels Bj{\o}rn Bugge and Henglein, Fritz and Rasmussen, Ulrik Terp}, + institution = {Technical report, University of Copenhagen}, + title = {A Crash-Course in Regular Expression Parsing and Regular Expressions as Types}, + year = {2014}} @misc{SE16, author = {StackStatus}, diff -r 3c5e58d08939 -r ff7945a988a3 ChengsongTanPhdThesis/main.tex --- a/ChengsongTanPhdThesis/main.tex Fri May 20 18:52:03 2022 +0100 +++ b/ChengsongTanPhdThesis/main.tex Thu May 26 20:51:40 2022 +0100 @@ -245,9 +245,9 @@ \tableofcontents % Prints the main table of contents -\listoffigures % Prints the list of figures +%\listoffigures % Prints the list of figures -\listoftables % Prints the list of tables +%\listoftables % Prints the list of tables %---------------------------------------------------------------------------------------- % ABBREVIATIONS @@ -255,9 +255,6 @@ \begin{abbreviations}{ll} % Include a list of abbreviations (a table of two columns) -\textbf{LAH} & \textbf{L}ist \textbf{A}bbreviations \textbf{H}ere\\ -\textbf{WSF} & \textbf{W}hat (it) \textbf{S}tands \textbf{F}or\\ - \newtheorem{theorem}{Theorem} \newtheorem{lemma}{Lemma} diff -r 3c5e58d08939 -r ff7945a988a3 RegexExplosionPlot/evilForBsimp.sc --- a/RegexExplosionPlot/evilForBsimp.sc Fri May 20 18:52:03 2022 +0100 +++ b/RegexExplosionPlot/evilForBsimp.sc Thu May 26 20:51:40 2022 +0100 @@ -1024,10 +1024,10 @@ //val pderSTAR = pderUNIV(STARREG) //val refSize = pderSTAR.map(size(_)).sum - for(n <- 6 to 6){ - val STARREG = n_astar(n) + for(n <- 1 to 12){ + val STARREG = n_astar_alts(n) val iMax = (lcm((1 to n).toList)) - for(i <- 1 to iMax + 50){// 100, 400, 800, 840, 841, 900 + for(i <- iMax to iMax ){// 100, 400, 800, 840, 841, 900 val prog0 = "a" * i //println(s"test: $prog0") print(n) diff -r 3c5e58d08939 -r ff7945a988a3 thys2/blexer2.sc --- a/thys2/blexer2.sc Fri May 20 18:52:03 2022 +0100 +++ b/thys2/blexer2.sc Thu May 26 20:51:40 2022 +0100 @@ -1,3 +1,5 @@ +//Strong Bsimp to obtain Antimirov's cubic bound + // A simple lexer inspired by work of Sulzmann & Lu //================================================== // @@ -496,12 +498,37 @@ val rs_simp = rs.map(strongBsimp5(_)) val flat_res = flats(rs_simp) var dist_res = distinctBy5(flat_res)//distinctBy(flat_res, erase) - var dist2_res = distinctBy5(dist_res) - while(dist_res != dist2_res){ - dist_res = dist2_res - dist2_res = distinctBy5(dist_res) + // var dist2_res = distinctBy5(dist_res) + // while(dist_res != dist2_res){ + // dist_res = dist2_res + // dist2_res = distinctBy5(dist_res) + // } + (dist_res) match { + case Nil => AZERO + case s :: Nil => fuse(bs1, s) + case rs => AALTS(bs1, rs) } - (dist2_res) match { + } + case r => r + } +} + +def strongBsimp6(r: ARexp): ARexp = +{ + // println("was this called?") + r match { + case ASEQ(bs1, r1, r2) => (strongBsimp6(r1), strongBsimp6(r2)) match { + case (AZERO, _) => AZERO + case (_, AZERO) => AZERO + case (AONE(bs2), r2s) => fuse(bs1 ++ bs2, r2s) + case (r1s, r2s) => ASEQ(bs1, r1s, r2s) + } + case AALTS(bs1, rs) => { + // println("alts case") + val rs_simp = rs.map(strongBsimp6(_)) + val flat_res = flats(rs_simp) + var dist_res = distinctBy6(flat_res)//distinctBy(flat_res, erase) + (dist_res) match { case Nil => AZERO case s :: Nil => fuse(bs1, s) case rs => AALTS(bs1, rs) @@ -688,6 +715,90 @@ } } +def attachCtx(r: ARexp, ctx: List[Rexp]) : List[Rexp] = { + val res = breakIntoTerms(oneSimp(L(erase(r), ctx))).map(oneSimp) + res +} + +def ABIncludedByC[A, B, C](a: A, b: B, c: C, f: (A, B) => C, inclusionPred: (C, C) => Boolean) : Boolean = { + inclusionPred(f(a, b), c) +} +def rexpListInclusion(rs1: List[Rexp], rs2: List[Rexp]) : Boolean = { + rs1.forall(rs2.contains) +} +def pAKC6(acc: List[Rexp], r: ARexp, ctx: List[Rexp]) : ARexp = { + // println("pakc") + // println(shortRexpOutput(erase(r))) + // println("acc") + // rsprint(acc) + // println("ctx---------") + // rsprint(ctx) + // println("ctx---------end") + // rsprint(breakIntoTerms(L(erase(r), ctx)).map(oneSimp)) + + // rprint(L(erase(r), ctx)) + //breakIntoTerms(L(erase(r), ctx)).map(oneSimp).forall(acc.contains) + if (ABIncludedByC(r, ctx, acc, attachCtx, rexpListInclusion)) {//acc.flatMap(breakIntoTerms + //println("included in acc!!!") + AZERO + } + else{ + r match { + case ASEQ(bs, r1, r2) => + (pAKC6(acc, r1, erase(r2) :: ctx)) match{ + case AZERO => + AZERO + case AONE(bs1) => + fuse(bs1, r2) + case r1p => + ASEQ(bs, r1p, r2) + } + case AALTS(bs, rs0) => + // println("before pruning") + // println(s"ctx is ") + // ctx.foreach(r => println(shortRexpOutput(r))) + // println(s"rs0 is ") + // rs0.foreach(r => println(shortRexpOutput(erase(r)))) + // println(s"acc is ") + // acc.foreach(r => println(shortRexpOutput(r))) + rs0.map(r => pAKC6(acc, r, ctx)).filter(_ != AZERO) match { + case Nil => + // println("after pruning Nil") + AZERO + case r :: Nil => + // println("after pruning singleton") + // println(shortRexpOutput(erase(r))) + r + case rs0p => + // println("after pruning non-singleton") + AALTS(bs, rs0p) + } + case r => r + } + } +} + + +def distinctBy6(xs: List[ARexp], acc: List[Rexp] = Nil) : List[ARexp] = xs match { + case Nil => + Nil + case x :: xs => { + val erased = erase(x) + if(acc.contains(erased)){ + distinctBy6(xs, acc) + } + else{ + val pruned = pAKC6(acc, x, Nil) + val newTerms = breakIntoTerms(erase(pruned)) + pruned match { + case AZERO => + distinctBy6(xs, acc) + case xPrime => + xPrime :: distinctBy6(xs, newTerms.map(oneSimp) ::: acc)//distinctBy5(xs, addToAcc.map(oneSimp(_)) ::: acc) + } + } + } +} def breakIntoTerms(r: Rexp) : List[Rexp] = r match { case SEQ(r1, r2) => breakIntoTerms(r1).map(r11 => SEQ(r11, r2)) @@ -806,7 +917,10 @@ case Nil => r case c::s => bdersStrong5(s, strongBsimp5(bder(c, r))) } - + def bdersStrong6(s: List[Char], r: ARexp) : ARexp = s match { + case Nil => r + case c::s => bdersStrong6(s, strongBsimp6(bder(c, r))) + } def bdersSimp(s: String, r: Rexp) : ARexp = bders_simp(s.toList, internalise(r)) def bdersStrong(s: List[Char], r: ARexp) : ARexp = s match { @@ -1024,13 +1138,13 @@ //val pderSTAR = pderUNIV(STARREG) //val refSize = pderSTAR.map(size(_)).sum - for(n <- 6 to 6){ + for(n <- 5 to 5){ val STARREG = n_astar(n) val iMax = (lcm((1 to n).toList)) - for(i <- 1 to iMax + 50){// 100, 400, 800, 840, 841, 900 + for(i <- 1 to iMax + 2){// 100, 400, 800, 840, 841, 900 val prog0 = "a" * i //println(s"test: $prog0") - print(n) + print(i) print(" ") // print(i) // print(" ") @@ -1089,34 +1203,17 @@ //STAR(SEQ(ALTS(STAR(STAR(STAR(STAR(CHAR(c))))),ALTS(CHAR(c),CHAR(b))),ALTS(ZERO,SEQ(ALTS(ALTS(STAR(CHAR(c)),SEQ(CHAR(b),CHAR(a))),CHAR(c)),STAR(ALTS(ALTS(ONE,CHAR(a)),STAR(CHAR(c)))))))) //(depth5) //STAR(SEQ(ALTS(ALTS(STAR(CHAR(b)),SEQ(ONE,CHAR(b))),SEQ(STAR(CHAR(a)),CHAR(b))),ALTS(ZERO,ALTS(STAR(CHAR(b)),STAR(CHAR(a)))))) - test(rexp(4), 10000000) { (r: Rexp) => + test(rexp(4), 100000) { (r: Rexp) => // ALTS(SEQ(SEQ(ONE,CHAR('a')),STAR(CHAR('a'))),SEQ(ALTS(CHAR('c'),ONE),STAR(ZERO))))))), 1) { (r: Rexp) => val ss = stringsFromRexp(r) - val boolList = ss.map(s => { - val bdStrong = bdersStrong(s.toList, internalise(r)) - val bdStrong5 = bdersStrong5(s.toList, internalise(r)) - val bdFurther5 = strongBsimp5(bdStrong5) - val sizeFurther5 = asize(bdFurther5) - val pdersSet = pderUNIV(r) - val size5 = asize(bdStrong5) - val size0 = asize(bdStrong) + val boolList = ss.filter(s => s != "").map(s => { + //val bdStrong = bdersStrong(s.toList, internalise(r)) + val bdStrong6 = bdersStrong6(s.toList, internalise(r)) + val bdStrong6Set = breakIntoTerms(erase(bdStrong6)) + val pdersSet = pderUNIV(r).flatMap(r => breakIntoTerms(r)) // println(s) - // println("pdersSet size") - // println(pdersSet.size) - // pdersSet.foreach(r => println(shortRexpOutput(r))) - // println("after bdStrong5") - - // println(shortRexpOutput(erase(bdStrong5))) - // println(breakIntoTerms(erase(bdStrong5)).size) - - // println("after bdStrong") - // println(shortRexpOutput(erase(bdStrong))) - // println(breakIntoTerms(erase(bdStrong)).size) - // println(size5, size0, sizeFurther5) - // aprint(strongBsimp5(bdStrong)) - // println(asize(strongBsimp5(bdStrong5))) - // println(s) - size5 <= size0 + // println(bdStrong6Set.size, pdersSet.size) + bdStrong6Set.size <= pdersSet.size }) // println(boolList) //!boolList.exists(b => b == false) @@ -1125,11 +1222,40 @@ } -small() -// generator_test() +// small() +generator_test() +def counterexample_check() { + val r = STAR(SEQ(ALTS(ALTS(CHAR('b'),CHAR('c')), + SEQ(CHAR('b'),CHAR('b'))),ALTS(SEQ(ONE,CHAR('b')),CHAR('a')))) + val s = "bbbb" + val bdStrong5 = bdersStrong6(s.toList, internalise(r)) + val bdStrong5Set = breakIntoTerms(erase(bdStrong5)) + val pdersSet = pderUNIV(r)//.map(oneSimp).flatMap(r => breakIntoTerms(r)) + println("original regex ") + rprint(r) + println("after strong bsimp") + aprint(bdStrong5) + println("turned into a set %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% ") + rsprint(bdStrong5Set) + println("after pderUNIV") + rsprint(pdersSet.toList) +} +// counterexample_check() // 1 - +def newStrong_test() { + val r2 = (CHAR('b') | ONE) + val r0 = CHAR('d') + val r1 = (ONE | CHAR('c')) + val expRexp = (SEQ(r2, r0) | SEQ(SEQ(r1, r2), r0)) + println(s"original regex is: ") + rprint(expRexp) + val expSimp5 = strongBsimp5(internalise(expRexp)) + val expSimp6 = strongBsimp6(internalise(expRexp)) + aprint(expSimp5) + aprint(expSimp6) +} +// newStrong_test() // SEQ(SEQ(SEQ(ONE,CHAR('b')),STAR(CHAR('b'))), // SEQ(ALTS(ALTS(ZERO,STAR(CHAR('b'))), // STAR(ALTS(CHAR('a'),SEQ(SEQ(STAR(ALTS(STAR(CHAR('c')),CHAR('a'))),