--- a/Spiral.scala Thu Jul 25 21:02:06 2019 +0100
+++ b/Spiral.scala Sun Aug 18 22:19:46 2019 +0100
@@ -580,6 +580,22 @@
}
}
+ def speed_test(){
+ val s0 = "a"*1000
+ val r = SEQ(STAR("a"), "b")
+ for(i <- 1 to 30){
+ val s = s0*i
+ val start = System.nanoTime()
+ try{
+ blex_simp(internalise(r), s.toList)
+ }
+ catch{
+ case x: Exception =>
+ }
+ val end = System.nanoTime()
+ printf("%d %f\n",i, (end - start)/1.0e9)
+ }
+ }
def main(args: Array[String]) {
//check_all()
//radical_correctness()
@@ -587,9 +603,10 @@
//retrieve_experience()
//neat_retrieve()
//test_bsimp2()
- christian_def2()
+ //christian_def2()
//christian_def()
//essence_posix()
+ speed_test()
}
}
Binary file ninems/.DS_Store has changed
--- a/ninems/data.sty Thu Jul 25 21:02:06 2019 +0100
+++ b/ninems/data.sty Sun Aug 18 22:19:46 2019 +0100
@@ -55,4 +55,69 @@
28 29.81185
\end{filecontents}
+% Java, example (a*)b
+\begin{filecontents}{good-java.data}
+1 1.5633E-5
+2 1.299E-5
+3 1.1451E-5
+4 1.5846E-5
+5 1.9934E-5
+6 2.174E-5
+7 2.7669E-5
+8 2.8657E-5
+9 2.8161E-5
+10 2.8729E-5
+11 3.5367E-5
+12 3.701E-5
+13 3.84E-5
+14 4.1329E-5
+15 4.8116E-5
+16 5.3597E-5
+17 4.6792E-5
+18 5.8618E-5
+19 6.2078E-5
+20 6.4702E-5
+21 6.1464E-5
+22 6.4693E-5
+23 6.1667E-5
+24 7.1466E-5
+25 7.8089E-5
+26 7.4661E-5
+27 7.5628E-5
+28 8.9169E-5
+29 9.4161E-5
+30 9.8494E-5
+\end{filecontents}
+\begin{filecontents}{bad-scala.data}
+1 0.048028
+2 0.126308
+3 0.169670
+4 0.264180
+5 0.384297
+6 0.612785
+7 0.817211
+8 1.089386
+9 1.363222
+10 1.644981
+11 1.987046
+12 2.416012
+13 2.920956
+14 3.398956
+15 3.739649
+16 4.329977
+17 4.932896
+18 5.633909
+19 6.436107
+20 6.927233
+21 7.441163
+22 8.242139
+23 8.901034
+24 9.680487
+25 10.496226
+26 11.385733
+27 12.247613
+28 13.990491
+29 14.245804
+30 15.206283
+\end{filecontents}
\ No newline at end of file
--- a/ninems/ninems.tex Thu Jul 25 21:02:06 2019 +0100
+++ b/ninems/ninems.tex Sun Aug 18 22:19:46 2019 +0100
@@ -187,7 +187,10 @@
frequent enough to have a separate name created for
them---\emph{evil regular expressions}. In empiric work, Davis et al
report that they have found thousands of such evil regular expressions
-in the JavaScript and Python ecosystems \cite{Davis18}.
+in the JavaScript and Python ecosystems \cite{Davis18}. Static analysis
+approach that is both sound and complete exists\cite{17Bir}, but the running
+time on certain examples in the RegExLib and Snort regular expressions
+libraries is unacceptable. Therefore the problem of efficiency still remains.
This superlinear blowup in matching algorithms sometimes causes
considerable grief in real life: for example on 20 July 2016 one evil
@@ -1274,13 +1277,76 @@
contains all the bitcodes needed. The task here is to find a way to
compute the correct $v'$.
-The second task is to speed up the more aggressive simplification.
-Currently it is slower than the original naive simplification by Ausaf
-and Urban (the naive version as implemented by Ausaf and Urban of
-course can ``explode'' in some cases). So it needs to be explored how to
-make our algorithm faster on all inputs. Our possibility would be to
-explore again the connection to DFAs. This is very much work in
-progress.
+The second task is to speed up the more aggressive simplification. Currently
+it is slower than the original naive simplification by Ausaf and Urban (the
+naive version as implemented by Ausaf and Urban of course can ``explode'' in
+some cases). It is therefore not surprising that the speed is also much slower
+than regular expression engines in popular programming languages such as Java
+and Python on most inputs that are linear. For example, just by rewriting the
+example regular expression in the beginning of this report $(a^*)^*\,b$ into
+$a^*\,b$ would eliminate the ambiguity in the matching and make the time
+for matching linear with respect to the input string size. This allows the
+DFA approach to become blindingly fast, and dwarf the speed of our current
+implementation. For example, here is a comparison of Java regex engine
+and our implementation on this example.
+
+\begin{center}
+\begin{tabular}{@{}c@{\hspace{0mm}}c@{\hspace{0mm}}c@{}}
+\begin{tikzpicture}
+\begin{axis}[
+ xlabel={$n*1000$},
+ x label style={at={(1.05,-0.05)}},
+ ylabel={time in secs},
+ enlargelimits=false,
+ xtick={0,5,...,30},
+ xmax=33,
+ ymax=9,
+ scaled ticks=true,
+ axis lines=left,
+ width=5cm,
+ height=4cm,
+ legend entries={Bitcoded Algorithm},
+ legend pos=north west,
+ legend cell align=left]
+\addplot[red,mark=*, mark options={fill=white}] table {bad-scala.data};
+\end{axis}
+\end{tikzpicture}
+ &
+\begin{tikzpicture}
+\begin{axis}[
+ xlabel={$n*1000$},
+ x label style={at={(1.05,-0.05)}},
+ %ylabel={time in secs},
+ enlargelimits=false,
+ xtick={0,5,...,30},
+ xmax=33,
+ ymax=9,
+ scaled ticks=false,
+ axis lines=left,
+ width=5cm,
+ height=4cm,
+ legend entries={Java},
+ legend pos=north west,
+ legend cell align=left]
+\addplot[cyan,mark=*, mark options={fill=white}] table {good-java.data};
+\end{axis}
+\end{tikzpicture}\\
+\multicolumn{3}{c}{Graphs: Runtime for matching $(a^*)^*\,b$ with strings
+ of the form $\underbrace{aa..a}_{n}$.}
+\end{tabular}
+\end{center}
+
+
+Java regex engine can match string of thousands of characters in a few milliseconds,
+whereas our current algorithm gets excruciatingly slow on input of this size.
+The running time in theory is linear, however it does not appear to be the
+case in an actual implementation. So it needs to be explored how to
+make our algorithm faster on all inputs. It could be the recursive calls that are
+needed to manipulate bits that are causing the slow down. A possible solution
+is to write recursive functions into tail-recusive form.
+Another possibility would be to explore
+again the connection to DFAs to speed up the algorithm on
+subcalls that are small enough. This is very much work in progress.
\section{Conclusion}
--- a/ninems/root.bib Thu Jul 25 21:02:06 2019 +0100
+++ b/ninems/root.bib Sun Aug 18 22:19:46 2019 +0100
@@ -1,15 +1,23 @@
%% This BibTeX bibliography file was created using BibDesk.
%% https://bibdesk.sourceforge.io/
-%% Created for CS TAN at 2019-07-03 22:17:58 +0100
+%% Created for CS TAN at 2019-08-18 19:00:13 +0100
%% Saved with string encoding Unicode (UTF-8)
+@article{17Bir,
+ Author = {Asiri Rathnayake and Hayo Thielecke},
+ Date-Added = {2019-08-18 17:57:30 +0000},
+ Date-Modified = {2019-08-18 18:00:13 +0000},
+ Journal = {arXiv:1405.7058},
+ Title = {Static Analysis for Regular Expression Exponential Runtime via Substructural Logics},
+ Year = {2017}}
+
@article{nielson11bcre,
- Author = { Lasse Nielsen, Fritz Henglein},
+ Author = {Lasse Nielsen, Fritz Henglein},
Date-Added = {2019-07-03 21:09:39 +0000},
Date-Modified = {2019-07-03 21:17:33 +0000},
Journal = {LATA},