2 \usepackage{../style} |
2 \usepackage{../style} |
3 \usepackage{../langs} |
3 \usepackage{../langs} |
4 \usepackage{../graphics} |
4 \usepackage{../graphics} |
5 \usepackage{../data} |
5 \usepackage{../data} |
6 |
6 |
|
7 |
7 \begin{document} |
8 \begin{document} |
8 |
9 |
9 \section*{Handout 2} |
10 \section*{Handout 2} |
10 |
11 |
11 This lecture is about implementing a more efficient regular |
12 This lecture is about implementing a more efficient regular |
12 expression matcher (the plots on the right)---more efficient |
13 expression matcher (the plots on the right)---more efficient |
13 than the matchers from regular expression libraries in Ruby and |
14 than the matchers from regular expression libraries in Ruby and |
14 Python (the plots on the left). These plots show the running |
15 Python (the plots on the left). These plots show the running |
15 time for the evil regular expression $a?^{\{n\}}a^{\{n\}}$. |
16 time for the evil regular expression $a?^{\{n\}}a^{\{n\}}$. |
16 |
17 Note the different scales in each plot. |
|
18 |
|
19 \pgfplotsset{compat=1.11} |
17 \begin{center} |
20 \begin{center} |
18 \begin{tabular}{@{}cc@{}} |
21 \begin{tabular}{@{}cc@{}} |
19 \begin{tikzpicture}[y=.072cm, x=.12cm] |
22 \begin{tikzpicture} |
20 %axis |
23 \begin{axis}[ |
21 \draw (0,0) -- coordinate (x axis mid) (30,0); |
24 xlabel={\pcode{a}s}, |
22 \draw (0,0) -- coordinate (y axis mid) (0,30); |
25 ylabel={time in secs}, |
23 %ticks |
26 enlargelimits=false, |
24 \foreach \x in {0,5,...,30} |
27 xtick={0,5,...,30}, |
25 \draw (\x,1pt) -- (\x,-3pt) node[anchor=north] {\x}; |
28 xmax=30, |
26 \foreach \y in {0,5,...,30} |
29 ymax=35, |
27 \draw (1pt,\y) -- (-3pt,\y) node[anchor=east] {\y}; |
30 ytick={0,5,...,30}, |
28 %labels |
31 scaled ticks=false, |
29 \node[below=0.6cm] at (x axis mid) {number of \texttt{a}s}; |
32 axis lines=left, |
30 \node[rotate=90,left=0.9cm] at (y axis mid) {time in secs}; |
33 width=5cm, |
31 %plots |
34 height=5cm, |
32 \draw[color=blue] plot[mark=*] |
35 legend entries={Python,Ruby}, |
33 file {re-python.data}; |
36 legend pos=north west, |
34 \draw[color=brown] plot[mark=triangle*] |
37 legend cell align=left |
35 file {re-ruby.data}; |
38 ] |
36 %legend |
39 \addplot[blue,mark=*, mark options={fill=white}] |
37 \begin{scope}[shift={(4,20)}] |
40 table {re-python.data}; |
38 \draw[color=blue] (0,0) -- |
41 \addplot[brown,mark=pentagon*, mark options={fill=white}] |
39 plot[mark=*] (0.25,0) -- (0.5,0) |
42 table {re-ruby.data}; |
40 node[right]{\small Python}; |
43 \end{axis} |
41 \draw[yshift=-4mm, color=brown] (0,0) -- |
44 \end{tikzpicture} |
42 plot[mark=triangle*] (0.25,0) -- (0.5,0) |
|
43 node[right]{\small Ruby}; |
|
44 \end{scope} |
|
45 \end{tikzpicture} |
|
46 & |
45 & |
47 \begin{tikzpicture}[y=.072cm, x=.0004cm] |
46 \begin{tikzpicture} |
48 %axis |
47 \begin{axis}[ |
49 \draw (0,0) -- coordinate (x axis mid) (12000,0); |
48 xlabel={\pcode{a}s}, |
50 \draw (0,0) -- coordinate (y axis mid) (0,30); |
49 ylabel={time in secs}, |
51 %ticks |
50 enlargelimits=false, |
52 \foreach \x in {0,3000,...,12000} |
51 xtick={0,3000,...,12000}, |
53 \draw (\x,1pt) -- (\x,-3pt) node[anchor=north] {\x}; |
52 xmax=12000, |
54 \foreach \y in {0,5,...,30} |
53 ymax=35, |
55 \draw (1pt,\y) -- (-3pt,\y) node[anchor=east] {\y}; |
54 ytick={0,5,...,30}, |
56 %labels |
55 scaled ticks=false, |
57 \node[below=0.6cm] at (x axis mid) {number of \texttt{a}s}; |
56 axis lines=left, |
58 \node[rotate=90,left=0.9cm] at (y axis mid) {time in secs}; |
57 width=6.5cm, |
59 |
58 height=5cm |
60 %plots |
59 ] |
61 \draw[color=green] plot[mark=square*, mark options={fill=white} ] |
60 \addplot[green,mark=square*,mark options={fill=white}] table {re2b.data}; |
62 file {re2b.data}; |
61 \addplot[black,mark=square*,mark options={fill=white}] table {re3.data}; |
63 \draw[color=black] plot[mark=square*, mark options={fill=white} ] |
62 \end{axis} |
64 file {re3.data}; |
|
65 \end{tikzpicture} |
63 \end{tikzpicture} |
66 \end{tabular} |
64 \end{tabular} |
67 \end{center}\medskip |
65 \end{center}\medskip |
68 |
66 |
69 |
67 |
421 |
419 |
422 \lstinputlisting[numbers=none]{../progs/app51.scala} |
420 \lstinputlisting[numbers=none]{../progs/app51.scala} |
423 |
421 |
424 \noindent Running the matcher with the example, we find it is |
422 \noindent Running the matcher with the example, we find it is |
425 slightly worse then the matcher in Ruby and Python. |
423 slightly worse then the matcher in Ruby and Python. |
426 Ooops\ldots\medskip |
424 Ooops\ldots |
|
425 |
|
426 \pgfplotsset{compat=1.11} |
|
427 \begin{center} |
|
428 \begin{tikzpicture} |
|
429 \begin{axis}[ |
|
430 xlabel={\pcode{a}s}, |
|
431 ylabel={time in secs}, |
|
432 enlargelimits=false, |
|
433 xtick={0,5,...,30}, |
|
434 xmax=30, |
|
435 ytick={0,5,...,30}, |
|
436 scaled ticks=false, |
|
437 axis lines=left, |
|
438 width=6cm, |
|
439 height=5cm, |
|
440 legend entries={Python,Ruby,Scala V1}, |
|
441 legend pos=outer north east, |
|
442 legend cell align=left |
|
443 ] |
|
444 \addplot[blue,mark=*, mark options={fill=white}] |
|
445 table {re-python.data}; |
|
446 \addplot[brown,mark=pentagon*, mark options={fill=white}] |
|
447 table {re-ruby.data}; |
|
448 \addplot[red,mark=triangle*,mark options={fill=white}] |
|
449 table {re1.data}; |
|
450 \end{axis} |
|
451 \end{tikzpicture} |
|
452 \end{center} |
427 |
453 |
428 \noindent Analysing this failure a bit we notice that |
454 \noindent Analysing this failure a bit we notice that |
429 for $a^{\{n\}}$ we generate quite big regular expressions: |
455 for $a^{\{n\}}$ we generate quite big regular expressions: |
430 |
456 |
431 \begin{center} |
457 \begin{center} |
433 1: & $a$\\ |
459 1: & $a$\\ |
434 2: & $a\cdot a$\\ |
460 2: & $a\cdot a$\\ |
435 3: & $a\cdot a\cdot a$\\ |
461 3: & $a\cdot a\cdot a$\\ |
436 & \ldots\\ |
462 & \ldots\\ |
437 13: & $a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a$\\ |
463 13: & $a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a\cdot a$\\ |
438 & \ldots\\ |
464 & \ldots |
439 20: |
|
440 \end{tabular} |
465 \end{tabular} |
441 \end{center} |
466 \end{center} |
442 |
467 |
443 \noindent Our algorithm traverses such regular expressions at |
468 \noindent Our algorithm traverses such regular expressions at |
444 least once every time a derivative is calculated. So having |
469 least once every time a derivative is calculated. So having |
445 large regular expressions, will cause problems. This problem |
470 large regular expressions will cause problems. This problem |
446 is aggravated with $a?$ being represented as $a + \epsilon$. |
471 is aggravated by $a?$ being represented as $a + \epsilon$. |
447 |
472 |
448 |
473 We can fix this by having an explicit constructor for |
|
474 $r^{\{n\}}$. In Scala we would introduce a constructor like |
|
475 |
|
476 \begin{center} |
|
477 \code{case class NTIMES(r: Rexp, n: Int) extends Rexp} |
|
478 \end{center} |
|
479 |
|
480 \noindent With this we have a constant ``size'' regular |
|
481 expression for our running example no matter how large $n$ |
|
482 is. This means we have to also add cases for $nullable$ and |
|
483 $der$. Does the change have any effect? |
|
484 |
|
485 \pgfplotsset{compat=1.11} |
|
486 \begin{center} |
|
487 \begin{tikzpicture} |
|
488 \begin{axis}[ |
|
489 xlabel={\pcode{a}s}, |
|
490 ylabel={time in secs}, |
|
491 enlargelimits=false, |
|
492 xtick={0,100,...,1000}, |
|
493 xmax=1000, |
|
494 ytick={0,5,...,30}, |
|
495 scaled ticks=false, |
|
496 axis lines=left, |
|
497 width=10cm, |
|
498 height=5cm, |
|
499 legend entries={Python,Ruby,Scala V1,Scala V2}, |
|
500 legend pos=outer north east, |
|
501 legend cell align=left |
|
502 ] |
|
503 \addplot[blue,mark=*, mark options={fill=white}] |
|
504 table {re-python.data}; |
|
505 \addplot[brown,mark=pentagon*, mark options={fill=white}] |
|
506 table {re-ruby.data}; |
|
507 \addplot[red,mark=triangle*,mark options={fill=white}] |
|
508 table {re1.data}; |
|
509 \addplot[green,mark=square*,mark options={fill=white}] |
|
510 table {re2b.data}; |
|
511 \end{axis} |
|
512 \end{tikzpicture} |
|
513 \end{center} |
|
514 |
|
515 \noindent Now we are talking business! The modified matcher |
|
516 can within 30 seconds handle regular expressions up to |
|
517 $n = 950$ before a StackOverflow is raised. |
|
518 |
|
519 The moral is that our algorithm is rather sensitive to the |
|
520 size of regular expressions it needs to handle. This is of |
|
521 course obvious because both $nullable$ and $der$ need to |
|
522 traverse the whole regular expression. There seems to be one |
|
523 more source of making the algorithm run faster. The derivative |
|
524 function often produces ``useless'' $\varnothing$s and |
|
525 $\epsilon$s. To see this, consider $r = ((a \cdot b) + b)^*$ |
|
526 and the following two derivatives |
|
527 |
|
528 \begin{center} |
|
529 \begin{tabular}{l} |
|
530 $der\,a\,r = ((\epsilon \cdot b) + \varnothing) \cdot r$\\ |
|
531 $der\,b\,r = ((\varnothing \cdot b) + \epsilon)\cdot r$\\ |
|
532 $der\,c\,r = ((\varnothing \cdot b) + \varnothing)\cdot r$ |
|
533 \end{tabular} |
|
534 \end{center} |
|
535 |
|
536 \noindent |
|
537 If we simplify them according to the simple rules from the |
|
538 beginning, we can replace the right-hand sides by the |
|
539 smaller equivalent regular expressions |
|
540 |
|
541 \begin{center} |
|
542 \begin{tabular}{l} |
|
543 $der\,a\,r \equiv b \cdot r$\\ |
|
544 $der\,b\,r \equiv r$\\ |
|
545 $der\,c\,r \equiv \varnothing$ |
|
546 \end{tabular} |
|
547 \end{center} |
|
548 |
|
549 \noindent I leave it to you to contemplate whether such a |
|
550 simplification can have any impact on the correctness of our |
|
551 algorithm (will it change any answers?). Figure~\ref{scala2} |
|
552 give a simplification function that recursively traverses a |
|
553 regular expression and simplifies it according to the rules |
|
554 given at the beginning. There are only rules for $+$, $\cdot$ |
|
555 and $n$-times (the latter because we added it in the second |
|
556 version of our matcher). There is no rule for a star, because |
|
557 empirical data and also a little thought showed that |
|
558 simplifying under a star is waste of computation time. The |
|
559 simplification function will be called after every derivation. |
|
560 This additional step removes all the ``junk'' the derivative |
|
561 function introduced. Does this improve the speed? You bet!! |
|
562 |
|
563 \begin{figure}[p] |
|
564 \lstinputlisting{../progs/app6.scala} |
|
565 \caption{The simplification function and modified |
|
566 \pcode{ders}-function.\label{scala2}} |
|
567 \end{figure} |
|
568 |
|
569 \pgfplotsset{compat=1.11} |
|
570 \begin{center} |
|
571 \begin{tikzpicture} |
|
572 \begin{axis}[ |
|
573 xlabel={\pcode{a}s}, |
|
574 ylabel={time in secs}, |
|
575 enlargelimits=false, |
|
576 xtick={0,2000,...,12000}, |
|
577 xmax=12000, |
|
578 ytick={0,5,...,30}, |
|
579 scaled ticks=false, |
|
580 axis lines=left, |
|
581 width=9cm, |
|
582 height=4cm, |
|
583 legend entries={Scala V2,Scala V3}, |
|
584 ] |
|
585 \addplot[green,mark=square*,mark options={fill=white}] table {re2b.data}; |
|
586 \addplot[black,mark=square*,mark options={fill=white}] table {re3.data}; |
|
587 \end{axis} |
|
588 \end{tikzpicture} |
|
589 \end{center} |
449 |
590 |
450 \end{document} |
591 \end{document} |
|
592 |
|
593 |
|
594 |
451 |
595 |
452 %%% Local Variables: |
596 %%% Local Variables: |
453 %%% mode: latex |
597 %%% mode: latex |
454 %%% TeX-master: t |
598 %%% TeX-master: t |
455 %%% End: |
599 %%% End: |