# HG changeset patch # User Chengsong # Date 1658431355 -3600 # Node ID 692911c0b9819e71f8cac6f82ab1b47d17de14de # Parent 454ced5576055bad4ceb06edd9aae57839f0f5f0# Parent 344a834a093a060cd7dee700a03f5187e911afc7 merge chritian changes diff -r 344a834a093a -r 692911c0b981 ChengsongTanPhdThesis/Chapters/Inj.tex --- a/ChengsongTanPhdThesis/Chapters/Inj.tex Sun Jul 17 20:28:46 2022 +0100 +++ b/ChengsongTanPhdThesis/Chapters/Inj.tex Thu Jul 21 20:22:35 2022 +0100 @@ -939,7 +939,7 @@ \noindent The central property of the $\lexer$ is that it gives the correct result by $\POSIX$ standards: -\begin{theorem} +\begin{theorem}\label{lexerCorrectness} The $\lexer$ based on derivatives and injections is correct: \begin{center} \begin{tabular}{lcl} @@ -1008,32 +1008,148 @@ This is a highly ambiguous regular expression, with many ways to split up the string into multiple segments for different star iteratioins, -and each segment will have multiple ways of splitting between +and for each segment +multiple ways of splitting between the two $a^*$ sub-expressions. -It is not surprising there are exponentially many -distinct lexical values -for such a pair of regular expression and string. +When $n$ is equal to $1$, there are two lexical values for +the match: +\[ + \Stars \; [\Seq \; (\Stars \; [\Char \; a])\; (\Stars \; [])] \quad (value 1) +\] +and +\[ + \Stars \; [\Seq \; (\Stars \; [])\; (\Stars \; [\Char \; a])] \quad (value 2) +\] +The derivative of $\derssimp \;s \; r$ is +\[ + (a^*a^* + a^*)\cdot(a^*a^*)^*. +\] +The $a^*a^*$ and $a^*$ in the first child of the above sequence +correspond to value 1 and value 2, respectively. +When $n=2$, the number goes up to 7: +\[ + \Stars \; [\Seq \; (\Stars \; [\Char \; a, \Char \; a])\; (\Stars \; [])] +\] +, +\[ + \Stars \; [\Seq \; (\Stars \; [\Char \; a])\; (\Stars \; [\Char \; a])] +\] +, +\[ + \Stars \; [\Seq \; (\Stars \; [])\; (\Stars \; [\Char \; a, \Char \; a])] +\] +, +\[ + \Stars \; [\Seq \; (\Stars \; [\Char \; a])\; (\Stars \; []), \Seq \; (\Stars \; [\Char\;a])\; (\Stars\; []) ] +\] +, +\[ + \Stars \; [\Seq \; (\Stars \; [\Char \; a])\; (\Stars \; []), + \Seq \; (\Stars \; [])\; (\Stars \; [\Char \; a]) + ] +\] +, +\[ + \Stars \; [ + \Seq \; (\Stars \; [])\; (\Stars \; [\Char \; a]), + \Seq \; (\Stars \; [])\; (\Stars \; [\Char \; a]) + ] +\] +and +\[ + \Stars \; [ + \Seq \; (\Stars \; [])\; (\Stars \; [\Char \; a]), + \Seq \; (\Stars \; [\Char \; a])\; (\Stars \; []) + ] +\] +And $\derssimp \; aa \; (a^*a^*)^*$ would be +\[ + ((a^*a^* + a^*)+a^*)\cdot(a^*a^*)^* + + (a^*a^* + a^*)\cdot(a^*a^*)^*. +\] +which removes two out of the seven terms corresponding to the +seven distinct lexical values. + +It is not surprising that there are exponentially many +distinct lexical values that cannot be eliminated by +the simple-minded simplification of $\derssimp$. + A lexer without a good enough strategy to deduplicate will naturally have an exponential runtime on ambiguous regular expressions. -Somehow one has to make sure which - lexical values are $\POSIX$ and must be kept in a lexing algorithm. - For example, the above $r= (a^*\cdot a^*)^*$ and -$s=\underbrace{aa\ldots a}_\text{n \textit{a}s}$ example has the POSIX value -$ \Stars\,[\Seq(Stars\,[\underbrace{\Char(a),\ldots,\Char(a)}_\text{n iterations}], Stars\,[])]$. -We want to keep this value only, and remove all the regular expression subparts -not corresponding to this value during lexing. -To do this, a two-phase algorithm with rectification is a bit too fragile. -Can we not create those intermediate values $v_1,\ldots v_n$, -and get the lexing information that should be already there while -doing derivatives in one pass, without a second injection phase? -In the meantime, can we ensure that simplifications -are easily handled without breaking the correctness of the algorithm? +On the other hand, the + $\POSIX$ value for $r= (a^*\cdot a^*)^*$ and +$s=\underbrace{aa\ldots a}_\text{n \textit{a}s}$ is +\[ + \Stars\, + [\Seq \; (\Stars\,[\underbrace{\Char(a),\ldots,\Char(a)}_\text{n iterations}]), \Stars\,[]] +\] +and at any moment the subterms in a regular expression +that will result in a POSIX value is only +a minority among the many other terms, +and one can remove ones that are absolutely not possible to +be POSIX. +In the above example, +\[ + ((a^*a^* + \underbrace{a^*}_\text{A})+\underbrace{a^*}_\text{duplicate of A})\cdot(a^*a^*)^* + + \underbrace{(a^*a^* + a^*)\cdot(a^*a^*)^*}_\text{further simp removes this}. +\] +can be further simplified by +removing the underlined term first, +which would open up possibilities +of further simplification that removes the +underbraced part. +The result would be +\[ + (\underbrace{a^*a^*}_\text{term 1} + \underbrace{a^*}_\text{term 2})\cdot(a^*a^*)^*. +\] +with corresponding values +\begin{center} + \begin{tabular}{lr} + $\Stars \; [\Seq \; (\Stars \; [\Char \; a, \Char \; a])\; (\Stars \; [])]$ & $(\text{term 1})$\\ + $\Stars \; [\Seq \; (\Stars \; [\Char \; a])\; (\Stars \; [\Char \; a])] $ & $(\text{term 2})$ + \end{tabular} +\end{center} +Other terms with an underlying value such as +\[ + \Stars \; [\Seq \; (\Stars \; [])\; (\Stars \; [\Char \; a, \Char \; a])] +\] +is simply too hopeless to contribute a POSIX lexical value, +and is therefore thrown away. -Sulzmann and Lu solved this problem by -introducing additional information to the -regular expressions called \emph{bitcodes}. +Ausaf and Dyckhoff and Urban \cite{AusafDyckhoffUrban2016} +have come up with some simplification steps, however those steps +are not yet sufficiently strong so that they achieve the above effects. +And even with these relatively mild simplifications the proof +is already quite a bit complicated than the theorem \ref{lexerCorrectness}. +One would prove something like: +\[ + \textit{If}\; (\textit{snd} \; (\textit{simp} \; r\backslash c), s) \rightarrow v \;\; + \textit{then}\;\; (r, c::s) \rightarrow + \inj\;\, r\, \;c \;\, ((\textit{fst} \; (\textit{simp} \; r \backslash c))\; v) +\] +instead of the simple lemma \ref{injPosix}, where now $\textit{simp}$ +not only has to return a simplified regular expression, +but also what specific simplifications +has been done as a function on values +showing how one can transform the value +underlying the simplified regular expression +to the unsimplified one. + +We therefore choose a slightly different approach to +get better simplifications, which uses +some augmented data structures compared to +plain regular expressions. +We call them \emph{annotated} +regular expressions. +With annotated regular expressions, +we can avoid creating the intermediate values $v_1,\ldots v_n$ and a +second phase altogether. +In the meantime, we can also ensure that simplifications +are easily handled without breaking the correctness of the algorithm. +We introduce this new datatype and the +corresponding algorithm in the next chapter. diff -r 344a834a093a -r 692911c0b981 ChengsongTanPhdThesis/Chapters/Introduction.tex --- a/ChengsongTanPhdThesis/Chapters/Introduction.tex Sun Jul 17 20:28:46 2022 +0100 +++ b/ChengsongTanPhdThesis/Chapters/Introduction.tex Thu Jul 21 20:22:35 2022 +0100 @@ -83,13 +83,13 @@ \def\retrieve{\textit{retrieve}} \def\blexer{\textit{blexer}} \def\flex{\textit{flex}} -\def\inj{\mathit{inj}} +\def\inj{\textit{inj}} \def\Empty{\textit{Empty}} \def\Left{\textit{Left}} \def\Right{\textit{Right}} -\def\Stars{\mathit{Stars}} -\def\Char{\mathit{Char}} -\def\Seq{\mathit{Seq}} +\def\Stars{\textit{Stars}} +\def\Char{\textit{Char}} +\def\Seq{\textit{Seq}} \def\Der{\textit{Der}} \def\Ders{\textit{Ders}} \def\nullable{\mathit{nullable}} diff -r 344a834a093a -r 692911c0b981 thys2/blexer2.sc --- a/thys2/blexer2.sc Sun Jul 17 20:28:46 2022 +0100 +++ b/thys2/blexer2.sc Thu Jul 21 20:22:35 2022 +0100 @@ -1466,35 +1466,42 @@ } } // aaa_star() + +def matching_ways_counting(n: Int): Int = + { + if(n == 0) 1 + else if(n == 1) 2 + else (for(i <- 1 to n - 1) + yield matching_ways_counting(i) * matching_ways_counting(n - i) ).sum + (n + 1) + } def naive_matcher() { val r = STAR(STAR("a") ~ STAR("a")) - for(i <- 0 to 20) { - val s = "a" * i - val t1 = System.nanoTime - matcher(s, r) - val duration = (System.nanoTime - t1) / 1e9d - print(i) - print(" ") - // print(duration) - // print(" ") - print(asize(bders_simp(s.toList, internalise(r)))) - //print(size(ders_simp(s.toList, r))) - println() - } - // for(i <- 1 to 40000000 by 500000) { - // val s = "a" * i + // for(i <- 0 to 10) { + // val s = "a" * i // val t1 = System.nanoTime - // val derssimp_result = ders_simp(s.toList, r) + // matcher(s, r) // val duration = (System.nanoTime - t1) / 1e9d // print(i) // print(" ") - // print(duration) + // // print(duration) + // // print(" ") + // (aprint(bders_simp(s.toList, internalise(r)))) + // //print(size(ders_simp(s.toList, r))) // println() // } + for(i <- 1 to 3) { + val s = "a" * i + val t1 = System.nanoTime + val derssimp_result = ders_simp(s.toList, r) + println(i) + println(matching_ways_counting(i)) + println(size(derssimp_result)) + rprint(derssimp_result) + } } -// naive_matcher() +naive_matcher() //single(SEQ(SEQ(STAR(CHAR('b')),STAR(STAR(SEQ(CHAR('a'),CHAR('b'))))), // SEQ(SEQ(CHAR('b'),STAR(ALTS(CHAR('a'),ONE))),ONE)))