# HG changeset patch # User Chengsong # Date 1566163186 -3600 # Node ID 9c52c21b5db30d2b8d5b13a1ef4ad1037ce08ac4 # Parent 51ac1ab6e1fd33a55c6cb63029fa7eec7259379b changes to report diff -r 51ac1ab6e1fd -r 9c52c21b5db3 Spiral.scala --- a/Spiral.scala Thu Jul 25 21:02:06 2019 +0100 +++ b/Spiral.scala Sun Aug 18 22:19:46 2019 +0100 @@ -580,6 +580,22 @@ } } + def speed_test(){ + val s0 = "a"*1000 + val r = SEQ(STAR("a"), "b") + for(i <- 1 to 30){ + val s = s0*i + val start = System.nanoTime() + try{ + blex_simp(internalise(r), s.toList) + } + catch{ + case x: Exception => + } + val end = System.nanoTime() + printf("%d %f\n",i, (end - start)/1.0e9) + } + } def main(args: Array[String]) { //check_all() //radical_correctness() @@ -587,9 +603,10 @@ //retrieve_experience() //neat_retrieve() //test_bsimp2() - christian_def2() + //christian_def2() //christian_def() //essence_posix() + speed_test() } } diff -r 51ac1ab6e1fd -r 9c52c21b5db3 ninems/.DS_Store Binary file ninems/.DS_Store has changed diff -r 51ac1ab6e1fd -r 9c52c21b5db3 ninems/data.sty --- a/ninems/data.sty Thu Jul 25 21:02:06 2019 +0100 +++ b/ninems/data.sty Sun Aug 18 22:19:46 2019 +0100 @@ -55,4 +55,69 @@ 28 29.81185 \end{filecontents} +% Java, example (a*)b +\begin{filecontents}{good-java.data} +1 1.5633E-5 +2 1.299E-5 +3 1.1451E-5 +4 1.5846E-5 +5 1.9934E-5 +6 2.174E-5 +7 2.7669E-5 +8 2.8657E-5 +9 2.8161E-5 +10 2.8729E-5 +11 3.5367E-5 +12 3.701E-5 +13 3.84E-5 +14 4.1329E-5 +15 4.8116E-5 +16 5.3597E-5 +17 4.6792E-5 +18 5.8618E-5 +19 6.2078E-5 +20 6.4702E-5 +21 6.1464E-5 +22 6.4693E-5 +23 6.1667E-5 +24 7.1466E-5 +25 7.8089E-5 +26 7.4661E-5 +27 7.5628E-5 +28 8.9169E-5 +29 9.4161E-5 +30 9.8494E-5 +\end{filecontents} +\begin{filecontents}{bad-scala.data} +1 0.048028 +2 0.126308 +3 0.169670 +4 0.264180 +5 0.384297 +6 0.612785 +7 0.817211 +8 1.089386 +9 1.363222 +10 1.644981 +11 1.987046 +12 2.416012 +13 2.920956 +14 3.398956 +15 3.739649 +16 4.329977 +17 4.932896 +18 5.633909 +19 6.436107 +20 6.927233 +21 7.441163 +22 8.242139 +23 8.901034 +24 9.680487 +25 10.496226 +26 11.385733 +27 12.247613 +28 13.990491 +29 14.245804 +30 15.206283 +\end{filecontents} \ No newline at end of file diff -r 51ac1ab6e1fd -r 9c52c21b5db3 ninems/ninems.tex --- a/ninems/ninems.tex Thu Jul 25 21:02:06 2019 +0100 +++ b/ninems/ninems.tex Sun Aug 18 22:19:46 2019 +0100 @@ -187,7 +187,10 @@ frequent enough to have a separate name created for them---\emph{evil regular expressions}. In empiric work, Davis et al report that they have found thousands of such evil regular expressions -in the JavaScript and Python ecosystems \cite{Davis18}. +in the JavaScript and Python ecosystems \cite{Davis18}. Static analysis +approach that is both sound and complete exists\cite{17Bir}, but the running +time on certain examples in the RegExLib and Snort regular expressions +libraries is unacceptable. Therefore the problem of efficiency still remains. This superlinear blowup in matching algorithms sometimes causes considerable grief in real life: for example on 20 July 2016 one evil @@ -1274,13 +1277,76 @@ contains all the bitcodes needed. The task here is to find a way to compute the correct $v'$. -The second task is to speed up the more aggressive simplification. -Currently it is slower than the original naive simplification by Ausaf -and Urban (the naive version as implemented by Ausaf and Urban of -course can ``explode'' in some cases). So it needs to be explored how to -make our algorithm faster on all inputs. Our possibility would be to -explore again the connection to DFAs. This is very much work in -progress. +The second task is to speed up the more aggressive simplification. Currently +it is slower than the original naive simplification by Ausaf and Urban (the +naive version as implemented by Ausaf and Urban of course can ``explode'' in +some cases). It is therefore not surprising that the speed is also much slower +than regular expression engines in popular programming languages such as Java +and Python on most inputs that are linear. For example, just by rewriting the +example regular expression in the beginning of this report $(a^*)^*\,b$ into +$a^*\,b$ would eliminate the ambiguity in the matching and make the time +for matching linear with respect to the input string size. This allows the +DFA approach to become blindingly fast, and dwarf the speed of our current +implementation. For example, here is a comparison of Java regex engine +and our implementation on this example. + +\begin{center} +\begin{tabular}{@{}c@{\hspace{0mm}}c@{\hspace{0mm}}c@{}} +\begin{tikzpicture} +\begin{axis}[ + xlabel={$n*1000$}, + x label style={at={(1.05,-0.05)}}, + ylabel={time in secs}, + enlargelimits=false, + xtick={0,5,...,30}, + xmax=33, + ymax=9, + scaled ticks=true, + axis lines=left, + width=5cm, + height=4cm, + legend entries={Bitcoded Algorithm}, + legend pos=north west, + legend cell align=left] +\addplot[red,mark=*, mark options={fill=white}] table {bad-scala.data}; +\end{axis} +\end{tikzpicture} + & +\begin{tikzpicture} +\begin{axis}[ + xlabel={$n*1000$}, + x label style={at={(1.05,-0.05)}}, + %ylabel={time in secs}, + enlargelimits=false, + xtick={0,5,...,30}, + xmax=33, + ymax=9, + scaled ticks=false, + axis lines=left, + width=5cm, + height=4cm, + legend entries={Java}, + legend pos=north west, + legend cell align=left] +\addplot[cyan,mark=*, mark options={fill=white}] table {good-java.data}; +\end{axis} +\end{tikzpicture}\\ +\multicolumn{3}{c}{Graphs: Runtime for matching $(a^*)^*\,b$ with strings + of the form $\underbrace{aa..a}_{n}$.} +\end{tabular} +\end{center} + + +Java regex engine can match string of thousands of characters in a few milliseconds, +whereas our current algorithm gets excruciatingly slow on input of this size. +The running time in theory is linear, however it does not appear to be the +case in an actual implementation. So it needs to be explored how to +make our algorithm faster on all inputs. It could be the recursive calls that are +needed to manipulate bits that are causing the slow down. A possible solution +is to write recursive functions into tail-recusive form. +Another possibility would be to explore +again the connection to DFAs to speed up the algorithm on +subcalls that are small enough. This is very much work in progress. \section{Conclusion} diff -r 51ac1ab6e1fd -r 9c52c21b5db3 ninems/root.bib --- a/ninems/root.bib Thu Jul 25 21:02:06 2019 +0100 +++ b/ninems/root.bib Sun Aug 18 22:19:46 2019 +0100 @@ -1,15 +1,23 @@ %% This BibTeX bibliography file was created using BibDesk. %% https://bibdesk.sourceforge.io/ -%% Created for CS TAN at 2019-07-03 22:17:58 +0100 +%% Created for CS TAN at 2019-08-18 19:00:13 +0100 %% Saved with string encoding Unicode (UTF-8) +@article{17Bir, + Author = {Asiri Rathnayake and Hayo Thielecke}, + Date-Added = {2019-08-18 17:57:30 +0000}, + Date-Modified = {2019-08-18 18:00:13 +0000}, + Journal = {arXiv:1405.7058}, + Title = {Static Analysis for Regular Expression Exponential Runtime via Substructural Logics}, + Year = {2017}} + @article{nielson11bcre, - Author = { Lasse Nielsen, Fritz Henglein}, + Author = {Lasse Nielsen, Fritz Henglein}, Date-Added = {2019-07-03 21:09:39 +0000}, Date-Modified = {2019-07-03 21:17:33 +0000}, Journal = {LATA},