--- a/ChengsongTanPhdThesis/Chapters/Cubic.tex Fri Jun 03 16:45:30 2022 +0100
+++ b/ChengsongTanPhdThesis/Chapters/Cubic.tex Mon Jun 06 03:05:31 2022 +0100
@@ -11,52 +11,278 @@
%----------------------------------------------------------------------------------------
\section{A Stronger Version of Simplification}
%TODO: search for isabelle proofs of algorithms that check equivalence
-Two alternative (distinct) sub-matches for the (sub-)string and (sub-)regex pair
+In our bit-coded lexing algorithm, with or without simplification,
+two alternative (distinct) sub-matches for the (sub-)string and (sub-)regex pair
are always expressed in the "derivative regular expression" as two
disjoint alternative terms at the current (sub-)regex level. The fact that they
are parallel tells us when we retrieve the information from this (sub-)regex
there will always be a choice of which alternative term to take.
+As an example, the regular expression $aa \cdot a^*+ a \cdot a^*$ (omitting bit-codes)
+expresses two possibilities it will match future input.
+It will either match 2 $a$'s, then 0 or more $a$'s, in other words, at least 2 more $a$'s
+\begin{figure}\label{string_3parts1}
+\begin{center}
+\begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}]
+ \node [rectangle split, rectangle split horizontal, rectangle split parts=3]
+ {Consumed Input
+ \nodepart{two} Expects $aa$
+ \nodepart{three} Then expects $a^*$};
+
+\end{tikzpicture}
+\end{center}
+\end{figure}
+\noindent
+Or it will match at least 1 more $a$:
+\begin{figure}\label{string_3parts2}
+\begin{center}
+\begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}]
+ \node [rectangle split, rectangle split horizontal, rectangle split parts=3]
+ {Consumed
+ \nodepart{two} Expects $a$
+ \nodepart{three} Then expects $a^*$};
+
+\end{tikzpicture}
+\end{center}
+\end{figure}
+If these two possibilities are identical, we can eliminate
+the second one as we know the second one corresponds to
+a match that is not $\POSIX$.
+
+
+If two identical regexes
+happen to be grouped into different sequences, one cannot use
+the $a + a \rightsquigarrow a$ rule to eliminate them, even if they
+are "parallel" to each other:
+\[
+(a+b) \cdot r_1 + (a+c) \cdot r_2 \rightsquigarrow (a+b) \cdot r_1 + (c) \cdot r_2
+\]
+and that's because they are followed by possibly
+different "suffixes" $r_1$ and $r_2$, and if
+they do turn out to be different then doing
+\[
+(a+b) \cdot r_1 + (a+c) \cdot r_2 \rightsquigarrow (a+b) \cdot r_1 + (c) \cdot r_2
+\]
+might cause a possible match where the string is in $L(a \cdot r_2)$ to be lost.
Here is an example for this.
-
Assume we have the derivative regex
-\[(a^* + (aa)^* + (aaa)^*)\cdot(a^* + (aa)^* + (aaa)^*)^* +
-(a^* + a\cdot(aa)^* + (aa)\cdot (aaa)^*)\cdot(a^* + (aa)^* + (aaa)^*)^*
+\[( r_1 + r_2 + r_3)\cdot r+
+( r_1 + r_5 + r_6)\cdot( r_1 + r_2 + r_3)^*
\]
-occurring in an intermediate step in successive
-derivatives of an input string $\underbrace{aa\ldots a}_{\text{n \; a's}}$.
+occurring in an intermediate step in our bit-coded lexing algorithm.
+
In this expression, there will be 6 "parallel" terms if we break up regexes
of shape $(a+b)\cdot c$ using the distributivity law (into $ac + bc$).
\begin{align}
(\nonumber \\
-a^* + & \label{term:1} \\
-(aa)^* + & \label{term:2} \\
-(aaa)^* & \label{term:3} \\
-& )\cdot(a^* + (aa)^* + (aaa)^*)^* \; + \; (\nonumber \\
-a^* + & \label{term:4} \\
-a \cdot (aa)^* + & \label{term:5} \\
-aa \cdot (aaa)^* \label{term:6}\\
-& )\cdot(a^* + (aa)^* + (aaa)^*)^*\nonumber
+ r_1 + & \label{term:1} \\
+ r_2 + & \label{term:2} \\
+ r_3 & \label{term:3} \\
+& )\cdot r\; + \; (\nonumber \\
+ r_1 + & \label{term:4} \\
+ r_5 + & \label{term:5} \\
+ r_6 \label{term:6}\\
+& )\cdot r\nonumber
\end{align}
-
They have been labelled, and each label denotes
one term, for example, \ref{term:1} denotes
\[
-a^*\cdot(a^* + (aa)^* + (aaa)^*)^*
+ r_1 \cdot r
\]
\noindent
and \ref{term:3} denotes
\[
-(aaa)^*\cdot(a^* + (aa)^* + (aaa)^*)^*.
+ r_3\cdot r.
\]
-These terms can be interpreted in terms of
-their current input position's most recent
-partial match.
-Term \ref{term:1}, \ref{term:2}, and \ref{term:3}
-denote the partial-match ending at the current position:
+From a lexer's point of view, \ref{term:4} will never
+be picked up in later phases of matching because there
+is a term \ref{term:1} giving identical matching information.
+The first term \ref{term:1} will match a string in $L(r_1 \cdot r)$,
+and so on for term \ref{term:2}, \ref{term:3}, etc.
+
\mybox{previous input $\ldots$}\mybox{$aaa$ }\mybox{rest of input $\ldots$}
+\begin{center}\label{string_2parts}
+
+\begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}]
+ \node [rectangle split, rectangle split horizontal, rectangle split parts=2]
+ {$r_{x1}$
+ \nodepart{two} $r_1\cdot r$ };
+%\caption{term 1 \ref{term:1}'s matching configuration}
+\end{tikzpicture}
+
+\end{center}
+For term 1 \ref{term:1}, whatever was before the current
+input position was fully matched and the regex corresponding
+to it reduced to $\ONE$,
+and in the next input token, it will start on $r_1\cdot r$.
+The resulting value will be something of a similar configuration:
+\begin{center}\label{value_2parts}
+%\caption{term 1 \ref{term:1}'s matching configuration}
+\begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}]
+ \node [rectangle split, rectangle split horizontal, rectangle split parts=2]
+ {$v_{x1}$
+ \nodepart{two} $v_{r_1\cdot r}$ };
+\end{tikzpicture}
+\end{center}
+For term 2 \ref{term:2} we have a similar value partition:
+\begin{center}\label{value_2parts2}
+%\caption{term 1 \ref{term:1}'s matching configuration}
+\begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}]
+ \node [rectangle split, rectangle split horizontal, rectangle split parts=2]
+ {$v_{x2}$
+ \nodepart{two} $v_{r_2\cdot r}$ };
+\end{tikzpicture}
+\end{center}
+\noindent
+and so on.
+We note that for term 4 \ref{term:4} its result value
+after any position beyond the current one will always
+be the same:
+\begin{center}\label{value_2parts4}
+%\caption{term 1 \ref{term:1}'s matching configuration}
+\begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}]
+ \node [rectangle split, rectangle split horizontal, rectangle split parts=2]
+ {$v_{x4}$
+ \nodepart{two} $v_{r_1\cdot r}$ };
+\end{tikzpicture}
+\end{center}
+And $\POSIX$ rules says that we can eliminate at least one of them.
+Our algorithm always puts the regex with the longest initial sub-match at the left of the
+alternatives, so we safely throw away \ref{term:4}.
+The fact that term 1 and 4 are not immediately in the same alternative
+expression does not prevent them from being two duplicate matches at
+the current point of view.
+To implement this idea into an algorithm, we define a "pruning function"
+$\textit{prune}$, that takes away parts of all terms in $r$ that belongs to
+$\textit{acc}$, which acts as an element
+is a stronger version of $\textit{distinctBy}$.
+Here is a concise version of it (in the style of Scala):
+\begin{verbatim}
+def distinctByPlus(rs: List[ARexp], acc: Set[Rexp] = Set()) :
+List[ARexp] = rs match {
+ case Nil => Nil
+ case r :: rs =>
+ if(acc.contains(erase(r)))
+ distinctByPlus(rs, acc)
+ else
+ prune(r, acc) :: distinctByPlus(rs, prune(r, acc) +: acc)
+}
+
+\end{verbatim}
+But for the function $\textit{prune}$ there is a difficulty:
+how could one define the $L$ function in a "computable" way,
+so that they generate a (lazy infinite) set of strings in $L(r)$.
+We also need a function that tests whether $L(r_1) \subseteq L(r_2)$
+is true.
+For the moment we cut corners and do not define these two functions
+as they are not used by Antimirov (and will probably not contribute
+to a bound better than Antimirov's cubic bound).
+Rather, we do not try to eliminate in every instance of regular expressions
+that have "language duplicates", but only eliminate the "exact duplicates".
+For this we need to break up terms $(a+b)\cdot c$ into two
+terms $a\cdot c$ and $b\cdot c$ before we add them to the accumulator:
+\begin{verbatim}
+def distinctWith(rs: List[ARexp],
+ pruneFunction: (ARexp, Set[Rexp]) => ARexp,
+ acc: Set[Rexp] = Set()) : List[ARexp] =
+ rs match{
+ case Nil => Nil
+ case r :: rs =>
+ if(acc(erase(r)))
+ distinctWith(rs, pruneFunction, acc)
+ else {
+ val pruned_r = pruneFunction(r, acc)
+ pruned_r ::
+ distinctWith(rs,
+ pruneFunction,
+ turnIntoTerms(erase(pruned_r)) ++: acc
+ )
+ }
+ }
+\end{verbatim}
+\noindent
+This process is done by the function $\textit{turnIntoTerms}$:
+\begin{verbatim}
+def turnIntoTerms(r: Rexp): List[Rexp] = r match {
+ case SEQ(r1, r2) => if(isOne(r1))
+ turnIntoTerms(r2)
+ else
+ turnIntoTerms(r1).map(r11 => SEQ(r11, r2))
+ case ALTS(r1, r2) => turnIntoTerms(r1) ::: turnIntoTerms(r2)
+ case ZERO => Nil
+ case _ => r :: Nil
+}
+\end{verbatim}
+\noindent
+The pruning function can be defined recursively:
+\begin{verbatim}
+def prune7(r: ARexp, acc: Set[Rexp]) : ARexp = r match{
+ case AALTS(bs, rs) => rs.map(r => prune7(r, acc)).filter(_ != ZERO) match
+ {
+ case Nil => AZERO
+ case r::Nil => fuse(bs, r)
+ case rs1 => AALTS(bs, rs1)
+ }
+ case ASEQ(bs, r1, r2) => prune7(r1, acc.map(r => removeSeqTail(r, erase(r2)))) match {
+ case AZERO => AZERO
+ case r1p if(isOne(erase(r1p))) => fuse(bs ++ mkepsBC(r1p), r2)
+ case r1p => ASEQ(bs, r1p, r2)
+ }
+ case r => if(acc(erase(r))) AZERO else r
+}
+\end{verbatim}
+
+\begin{figure}
+\centering
+\begin{tabular}{@{}c@{\hspace{0mm}}c@{\hspace{0mm}}c@{}}
+\begin{tikzpicture}
+\begin{axis}[
+ xlabel={$n$},
+ x label style={at={(1.05,-0.05)}},
+ ylabel={size},
+ enlargelimits=false,
+ xtick={0,5,...,30},
+ xmax=30,
+ ymax=800,
+ ytick={0,200,...,800},
+ scaled ticks=false,
+ axis lines=left,
+ width=5cm,
+ height=4cm,
+ legend entries={$bsimpStrong$ size growth},
+ legend pos=north west,
+ legend cell align=left]
+\addplot[red,mark=*, mark options={fill=white}] table {strongSimpCurve.data};
+\end{axis}
+\end{tikzpicture}
+ &
+\begin{tikzpicture}
+\begin{axis}[
+ xlabel={$n$},
+ x label style={at={(1.05,-0.05)}},
+ ylabel={size},
+ enlargelimits=false,
+ xtick={0,5,...,30},
+ xmax=30,
+ ymax=42000,
+ ytick={0,10000,...,40000},
+ scaled ticks=true,
+ axis lines=left,
+ width=5cm,
+ height=4cm,
+ legend entries={$bsimp$ size growth},
+ legend pos=north west,
+ legend cell align=left]
+\addplot[blue,mark=*, mark options={fill=white}] table {bsimpExponential.data};
+\end{axis}
+\end{tikzpicture}\\
+\multicolumn{2}{c}{Graphs: Runtime for matching $((a^* + (aa)^* + \ldots + (aaaaa)^* )^*)^*$ with strings
+ of the form $\underbrace{aa..a}_{n}$.}
+\end{tabular}
+\caption{aaaaaStarStar} \label{fig:aaaaaStarStar}
+\end{figure}
@@ -113,11 +339,11 @@
Another option would be to either store the string $s$ that resulted in
a mis-match for $r$ or a dummy value as a placeholder:
\[
- \vdash \textit{Not}(abcd) : ~(a^*)
+ \vdash \textit{Not}(abcd) : ~( r_1 )
\]
or
\[
- \vdash \textit{Not}(\textit{Dummy}) : ~(a^*)
+ \vdash \textit{Not}(\textit{Dummy}) : ~( r_1 )
\]
We choose to implement this as it is most straightforward:
\[
--- a/ChengsongTanPhdThesis/Chapters/tape.tex Fri Jun 03 16:45:30 2022 +0100
+++ b/ChengsongTanPhdThesis/Chapters/tape.tex Mon Jun 06 03:05:31 2022 +0100
@@ -1,55 +1,16 @@
-\documentclass{standalone}
-\usepackage[utf8]{inputenc}
-\usepackage{ifthen}
-\usepackage{tikz}
-\usetikzlibrary{chains}
-\usetikzlibrary{calc, decorations.pathmorphing}
-\tikzstyle{tape}=[thick, draw, minimum height=5mm, minimum width=7mm, fill=green!35,outer sep=0pt]
-\tikzstyle{head}=[very thick, draw, minimum height=6mm, minimum width=7mm, fill=none,outer sep=0pt]
+\documentclass[tikz, border=10pt]{standalone}
+\usetikzlibrary{shapes}
+
+
+
-\def\DrawTape#1{
-\begin{scope}[start chain=1 going right,node distance=-0.15mm]
- % start tape
- \node [on chain=1,tape,draw=none, tape, fill=none, draw=none] at (0,-\i*1) (node\i0){\ldots\ };
- \draw [thick,fill=green!35] (node\i0.north east) -- ++(-.07,0) decorate [decoration={zigzag, segment length=.10cm, amplitude=.015cm}]
- {-- ($(node\i0.south east)+(-.15,0)$)} -- (node\i0.south east) -- cycle;
- % draw tape
- \foreach [count=\j] \element in \adlist {
- \ifthenelse{\j = \head}{ % read-write heads current position
- \node [on chain=1,tape, fill=red!35] (node\i\j){\element};
- }
- {
- \node [on chain=1,tape] (node\i\j) {\element};
- }
- }
- % close tape
- \draw [thick,fill=green!35] (node\i\j.north east) -- ++(.15,0) decorate [decoration={zigzag, segment length=.10cm, amplitude=.015cm}]
- {-- ($(node\i\j.south east)+(.07,0)$)} -- (node\i\j.south east) -- cycle;
- \node [on chain=1,tape, fill=none, draw=none] (end\i) {\ \ldots};
- \node [right = of end\i]{\tiny tape~\i};
-
- % draw read-write head
- \prevnode=\head
- \advance\prevnode by -1
- \node [head,right = of node\i\the\prevnode] (head\i){};
- \node [above = of head\i]{\tiny head \i};
-\end{scope}
-}
-\newcount\prevnode
\begin{document}
-% Define the turing machine as a list of pairs, where each pair consists of the read-writes head position the delimiter "/" and the list of symbols on the tape.
-\newcommand{\tapes}{ 3/{, , ,1,1, }, 1/{, 1, 0, 1 , 0,}, 2/{ , 1 , 1, 0, 1,}}
-\begin{tikzpicture}
-% Draw program node
-\draw (-1.5,-2) node [draw, thick, rectangle, minimum height= 2.5cm, minimum width=2cm, fill=white, rounded corners] (program){program};
-% Draw tapes and heads
-\foreach [count=\i] \head/\adlist in \tapes {
- % Draw tape and head
- \DrawTape{\i,\head,\adlist}
- % Draw link to program
- \draw[thick, rounded corners=3pt] (program) {-- ($(node\i0.south west) + (0,-0.15)$)}
- {-- ($(head\i.south) + (0,-0.1)$)}-- (head\i.south);
-}
+\begin{tikzpicture}[every node/.append style={draw, rounded corners, inner sep=10pt}]
+ \node [rectangle split, rectangle split horizontal, rectangle split parts=3]
+ {Services
+ \nodepart{two} Event Driven Framework
+ \nodepart{three} OS Adapter Layer};
+
\end{tikzpicture}
\end{document}
\ No newline at end of file
--- a/ChengsongTanPhdThesis/good-java.data Fri Jun 03 16:45:30 2022 +0100
+++ b/ChengsongTanPhdThesis/good-java.data Mon Jun 06 03:05:31 2022 +0100
@@ -1,6 +1,6 @@
%% LaTeX2e file `good-java.data'
%% generated by the `filecontents' environment
-%% from source `main' on 2022/03/16.
+%% from source `main' on 2022/06/05.
%%
1 1.5633E-5
2 1.299E-5
--- a/thys2/blexer2.sc Fri Jun 03 16:45:30 2022 +0100
+++ b/thys2/blexer2.sc Mon Jun 06 03:05:31 2022 +0100
@@ -542,19 +542,33 @@
}
def distinctWith(rs: List[ARexp],
- f: (ARexp, Set[Rexp]) => ARexp,
+ pruneFunction: (ARexp, Set[Rexp]) => ARexp,
acc: Set[Rexp] = Set()) : List[ARexp] =
rs match{
case Nil => Nil
case r :: rs =>
if(acc(erase(r)))
- distinctWith(rs, f, acc)
+ distinctWith(rs, pruneFunction, acc)
else {
- val pruned_r = f(r, acc)
- pruned_r :: distinctWith(rs, f, strongBreakIntoTerms(erase(pruned_r)) ++: acc)
+ val pruned_r = pruneFunction(r, acc)
+ pruned_r ::
+ distinctWith(rs,
+ pruneFunction,
+ turnIntoTerms(erase(pruned_r)) ++: acc
+ )
}
}
+// def distinctByPlus(rs: List[ARexp], acc: Set[Rexp] = Set()) :
+// List[ARexp] = rs match {
+// case Nil => Nil
+// case r :: rs =>
+// if(acc.contains(erase(r)))
+// distinctByPlus(rs, acc)
+// else
+// prune(r, acc) :: distinctByPlus(rs, prune(r, acc) +: acc)
+// }
+
//r = r' ~ tail => returns r'
def removeSeqTail(r: Rexp, tail: Rexp) : Rexp = r match {
case SEQ(r1, r2) =>
@@ -805,19 +819,18 @@
}
-def strongBreakIntoTerms(r: Rexp): List[Rexp] = r match {
+def turnIntoTerms(r: Rexp): List[Rexp] = r match {
case SEQ(r1, r2) => if(isOne(r1))
- //strongBreakIntoTerms(r1).map(r11 => SEQ(r11, r2)) :::
- strongBreakIntoTerms(r2)
+ turnIntoTerms(r2)
else
- strongBreakIntoTerms(r1).map(r11 => SEQ(r11, r2))
- case ALTS(r1, r2) => strongBreakIntoTerms(r1) ::: strongBreakIntoTerms(r2)
+ turnIntoTerms(r1).map(r11 => SEQ(r11, r2))
+ case ALTS(r1, r2) => turnIntoTerms(r1) ::: turnIntoTerms(r2)
case ZERO => Nil
case _ => r :: Nil
}
def attachCtx(r: ARexp, ctx: List[Rexp]) : Set[Rexp] = {
- val res = strongBreakIntoTerms((L(erase(r), ctx))).map(oneSimp)
+ val res = turnIntoTerms((L(erase(r), ctx))).map(oneSimp)
res.toSet
}
@@ -868,7 +881,7 @@
}
else{
val pruned = prune6(acc, x, Nil)
- val newTerms = strongBreakIntoTerms(erase(pruned))
+ val newTerms = turnIntoTerms(erase(pruned))
pruned match {
case AZERO =>
distinctBy6(xs, acc)
@@ -1201,11 +1214,11 @@
def n_astar_alts(d: Int) : Rexp = d match {
case 0 => n_astar_list(0)
case d => STAR(n_astar_list(d))
- // case r1 :: r2 :: Nil => ALTS(r1, r2)
- // case r1 :: Nil => r1
- // case r :: rs => ALTS(r, n_astar_alts(rs))
- // case Nil => throw new Error("should give at least 1 elem")
-}
+ }
+def n_astar_alts2(d: Int) : Rexp = d match {
+ case 0 => n_astar_list(0)
+ case d => STAR(STAR(n_astar_list(d)))
+ }
def n_astar_aux(d: Int) = {
if(d == 0) n_astar_alts(0)
else ALTS(n_astar_alts(d), n_astar_alts(d - 1))
@@ -1233,9 +1246,9 @@
//val refSize = pderSTAR.map(size(_)).sum
for(n <- 5 to 5){
- val STARREG = n_astar(n)
+ val STARREG = n_astar_alts2(n)
val iMax = (lcm((1 to n).toList))
- for(i <- 1 to iMax + 2){// 100, 400, 800, 840, 841, 900
+ for(i <- 0 to iMax ){// 100, 400, 800, 840, 841, 900
val prog0 = "a" * i
//println(s"test: $prog0")
print(i)
@@ -1243,10 +1256,11 @@
// print(i)
// print(" ")
println(asize(bders_simp(prog0.toList, internalise(STARREG))))
+ // println(asize(bdersStrong7(prog0.toList, internalise(STARREG))))
}
}
}
-
+small()
def generator_test() {
test(rexp(4), 1000000) { (r: Rexp) =>
@@ -1255,9 +1269,9 @@
val boolList = ss.filter(s => s != "").map(s => {
//val bdStrong = bdersStrong(s.toList, internalise(r))
val bdStrong6 = bdersStrong7(s.toList, internalise(r))
- val bdStrong6Set = strongBreakIntoTerms(erase(bdStrong6))
- val pdersSet = pderUNIV(r)//.flatMap(r => strongBreakIntoTerms(r))
- val pdersSetBroken = pdersSet.flatMap(r => strongBreakIntoTerms(r))
+ val bdStrong6Set = turnIntoTerms(erase(bdStrong6))
+ val pdersSet = pderUNIV(r)//.flatMap(r => turnIntoTerms(r))
+ val pdersSetBroken = pdersSet.flatMap(r => turnIntoTerms(r))
bdStrong6Set.size <= pdersSet.size || bdStrong6Set.size <= pdersSetBroken.size
})
//println(boolList)
@@ -1280,7 +1294,7 @@
val s = "b"
val bdStrong5 = bdersStrong7(s.toList, internalise(r))
val bdStrong5Set = breakIntoTerms(erase(bdStrong5))
- val pdersSet = pderUNIV(r)//.map(r => erase(bsimp(internalise(r))))//.flatMap(r => strongBreakIntoTerms(r))//.map(oneSimp).flatMap(r => breakIntoTerms(r))
+ val pdersSet = pderUNIV(r)//.map(r => erase(bsimp(internalise(r))))//.flatMap(r => turnIntoTerms(r))//.map(oneSimp).flatMap(r => breakIntoTerms(r))
println("original regex ")
rprint(r)
println("after strong bsimp")
@@ -1291,7 +1305,7 @@
rsprint(pdersSet.toList)
}
-counterexample_check()
+// counterexample_check()
def linform_test() {
val r = STAR(SEQ(STAR(CHAR('c')),ONE))//SEQ(STAR(CHAR('c')),STAR(SEQ(STAR(CHAR('c')),ONE))) //