# HG changeset patch # User Christian Urban # Date 1489541079 0 # Node ID b78664a24f5d1303e3643525305b7f5fe3e99ee9 # Parent d922cc83b70ca7df546debdf9e032da62fb4de08 updated diff -r d922cc83b70c -r b78664a24f5d LINKS --- a/LINKS Mon Feb 13 23:22:45 2017 +0000 +++ b/LINKS Wed Mar 15 01:24:39 2017 +0000 @@ -3,3 +3,15 @@ https://dere.github.io/2017-02-12/beginners-assembly-part1/ +webassembly +https://sourceware.org/ml/binutils/2017-03/msg00044.html +https://hacks.mozilla.org/2017/02/a-cartoon-intro-to-webassembly/ + +webassembly explorer +https://mbebenita.github.io/WasmExplorer/ + + + +free books +https://github.com/vhf/free-programming-books/blob/master/free-programming-books.md +https://john.cs.olemiss.edu/~hcc/csci658/notes/Free_Prog_Lang_Textbooks.html \ No newline at end of file diff -r d922cc83b70c -r b78664a24f5d data.sty --- a/data.sty Mon Feb 13 23:22:45 2017 +0000 +++ b/data.sty Wed Mar 15 01:24:39 2017 +0000 @@ -142,18 +142,18 @@ %% re2.scala example a?{n} a{n} \begin{filecontents}{re2.data} -1 0.00006 -101 0.00605 -201 0.04343 -301 0.16469 -401 0.41954 -501 0.83703 -601 1.66925 -701 2.71086 -801 4.19745 -901 7.56495 -1001 12.42103 -1101 20.41763 +1 0.00050 +101 0.02030 +201 0.10587 +301 0.31188 +401 0.32794 +501 0.64490 +601 1.16738 +701 2.10815 +801 3.47144 +901 6.80621 +1001 12.35611 +1101 23.80084 \end{filecontents} %% re2.scala: example (a*)* b @@ -182,36 +182,35 @@ %% re3.scala: example a?{n} a{n} \begin{filecontents}{re3.data} -1 0.00005 -1001 0.63505 -2001 2.53029 -3001 5.72804 -4001 9.94246 -5001 15.52770 -6001 22.44126 -7001 30.86867 -8001 39.32242 -9001 48.96998 +1 0.00003 +1001 0.03887 +2001 0.15666 +3001 0.35910 +4001 0.63950 +5001 1.00241 +6001 1.50480 +7001 2.11568 +8001 2.71208 +9001 3.41157 +10001 4.19962 +11001 5.70387 \end{filecontents} %% re3.scala: example (a*)* b \begin{filecontents}{re3a.data} -1 0.00014 -500001 2.61059 -1000001 5.42773 -1500001 8.02603 -2000001 10.49844 -2500001 13.34234 -3000001 16.17491 -3500001 19.11650 -4000001 21.66151 -4500001 24.85496 -5000001 28.52113 -5500001 28.54548 -6000001 32.39523 -6500001 45.13486 -7000001 54.15018 -7500001 71.32218 +1 0.00015 +500001 0.28337 +1000001 0.53271 +1500001 0.84478 +2000001 1.11763 +2500001 1.76656 +3000001 2.13310 +3500001 2.39576 +4000001 2.98624 +4500001 5.96529 +5000001 13.56911 +5500001 18.43089 +6000001 40.33704 \end{filecontents} %% re4.scala example a?{n} a{n} diff -r d922cc83b70c -r b78664a24f5d handouts/ho01.pdf Binary file handouts/ho01.pdf has changed diff -r d922cc83b70c -r b78664a24f5d handouts/ho01.tex --- a/handouts/ho01.tex Mon Feb 13 23:22:45 2017 +0000 +++ b/handouts/ho01.tex Wed Mar 15 01:24:39 2017 +0000 @@ -3,6 +3,8 @@ \usepackage{../langs} \usepackage{../graphics} \usepackage{../data} +\usepackage{lstlinebgrd} +\definecolor{capri}{rgb}{0.0, 0.75, 1.0} %%http://regexcrossword.com/challenges/cities/puzzles/1 %%https://jex.im/regulex/ @@ -37,24 +39,24 @@ \section*{Handout 1} This module is about text processing, be it for web-crawlers, -compilers, dictionaries, DNA-data and so on. When looking for a +compilers, dictionaries, DNA-data and so on. When looking for a particular string, like $abc$ in a large text we can use the Knuth-Morris-Pratt algorithm, which is currently the most efficient general string search algorithm. But often we do \emph{not} just look -for a particular string, but for string patterns. For example in +for a particular string, but for string patterns. For example, in program code we need to identify what are the keywords (if, then, while, etc), what are the identifiers (variable names). A pattern for identifiers could be stated as: they start with a letter, followed by -zero or more letters, numbers and underscores. Also often we face the +zero or more letters, numbers and underscores. Often we also face the problem that we are given a string (for example some user input) and want to know whether it matches a particular pattern---be it an email address, for example. In this way we can exclude user input that would otherwise have nasty effects on our program (crashing it or making it -go into an infinite loop, if not worse). Scanning for computer viruses -or filtering out spam usually involves scanning for some signature -(essentially a pattern). The point is that the fast -Knuth-Morris-Pratt algorithm for strings is not good enough for such -string \emph{patterns}.\smallskip +go into an infinite loop, if not worse). In tools like Snort, scanning +for computer viruses or filtering out spam usually involves scanning +for some signature (essentially a string pattern). The point is that +the fast Knuth-Morris-Pratt algorithm for strings is not good enough +for such string \emph{patterns}.\smallskip \defn{Regular expressions} help with conveniently specifying such patterns. The idea behind regular expressions is that @@ -190,7 +192,31 @@ matching in Python, Ruby and Java. \begin{center} -\begin{tabular}{@{\hspace{-1mm}}c@{\hspace{-1mm}}c@{}} +\begin{tabular}{@{\hspace{-1mm}}c@{\hspace{1mm}}c@{}} +\begin{tikzpicture} +\begin{axis}[ + title={Graph: $\texttt{(a*)*\,b}$ and strings + $\underbrace{\texttt{a}\ldots \texttt{a}}_{n}$}, + xlabel={$n$}, + x label style={at={(1.05,0.0)}}, + ylabel={time in secs}, + enlargelimits=false, + xtick={0,5,...,30}, + xmax=33, + ymax=35, + ytick={0,5,...,30}, + scaled ticks=false, + axis lines=left, + width=5.5cm, + height=4.5cm, + legend entries={Python, Java}, + legend pos=north west, + legend cell align=left] +\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data}; +\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data}; +\end{axis} +\end{tikzpicture} +& \begin{tikzpicture} \begin{axis}[ title={Graph: $\texttt{a?\{n\}\,a{\{n\}}}$ and strings @@ -214,47 +240,24 @@ \addplot[brown,mark=triangle*, mark options={fill=white}] table {re-ruby.data}; \end{axis} \end{tikzpicture} -& -\begin{tikzpicture} -\begin{axis}[ - title={Graph: $\texttt{(a*)*\,b}$ and strings - $\underbrace{\texttt{a}\ldots \texttt{a}}_{n}$}, - xlabel={$n$}, - x label style={at={(1.05,0.0)}}, - ylabel={time in secs}, - enlargelimits=false, - xtick={0,5,...,30}, - xmax=33, - ymax=35, - ytick={0,5,...,30}, - scaled ticks=false, - axis lines=left, - width=5.5cm, - height=4.5cm, - legend entries={Python, Java}, - legend pos=north west, - legend cell align=left] -\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data}; -\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data}; -\end{axis} -\end{tikzpicture} \end{tabular} \end{center} -\noindent This first graph shows that Python needs approximately 29 -seconds for finding out whether a string of 28 \texttt{a}s matches the -regular expression \texttt{a?\{28\}\,a\{28\}}. Ruby is even slightly +\noindent This first graph shows that Python and Java need +approximately 30 seconds to find out that the regular expression +$\texttt{(a*)*\,b}$ does not match strings of 28 \texttt{a}s. +Similarly, the second shows that Python needs approximately 29 seconds +for finding out whether a string of 28 \texttt{a}s matches the regular +expression \texttt{a?\{28\}\,a\{28\}}. Ruby is even slightly worse.\footnote{In this example Ruby uses the slightly different regular expression \texttt{a?a?a?...a?a?aaa...aa}, where the \texttt{a?} and \texttt{a} each occur $n$ times. More such test cases can be found at \url{http://www.computerbytesman.com/redos/}.} -Simlarly, Python and Java needs approximately 30 seconds to find out -that the regular expression $\texttt{(a*)*\,b}$ does not match strings -of 28 \texttt{a}s. Admittedly, these regular expressions are -carefully chosen to exhibit this exponential behaviour, but similar -ones occur more often than one wants in ``real life''. For example, on -20 July 2016 a similar regular expression brought the webpage -\href{http://stackexchange.com}{Stack Exchange} to its knees: +Admittedly, these regular expressions are carefully chosen to exhibit +this exponential behaviour, but similar ones occur more often than one +wants in ``real life''. For example, on 20 July 2016 a similar regular +expression brought the webpage \href{http://stackexchange.com}{Stack + Exchange} to its knees: \begin{center} \url{http://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016} @@ -293,9 +296,9 @@ at a relatively simple algorithm that solves this problem much better than Python and Ruby do\ldots actually it will be two versions of the algorithm: the first one will be able to -process strings of approximately 1,000 \texttt{a}s in 30 +process strings of approximately 1,100 \texttt{a}s in 23 seconds, while the second version will even be able to process -up to 7,000(!) in 30 seconds, see the graph below: +up to 11,000(!) in 5 seconds, see the graph below: \begin{center} \begin{tikzpicture} @@ -306,8 +309,8 @@ x label style={at={(1.05,0.0)}}, ylabel={time in secs}, enlargelimits=false, - xtick={0,3000,...,9000}, - xmax=10000, + xtick={0,3000,...,12000}, + xmax=13000, ymax=32, ytick={0,5,...,30}, scaled ticks=false, @@ -696,8 +699,9 @@ \end{quote} -\begin{figure}[p] -\lstinputlisting{../progs/crawler1.scala} +\begin{figure}[p]\small + \lstinputlisting[linebackgroundcolor={\ifodd\value{lstnumber}\color{capri!3}\fi}] + {../progs/crawler1.scala} \caption{The Scala code for a simple web-crawler that checks for broken links in a web-page. It uses the regular expression @@ -708,8 +712,10 @@ \end{figure} -\begin{figure}[p] -\lstinputlisting{../progs/crawler2.scala} + +\begin{figure}[p]\small + \lstinputlisting[linebackgroundcolor={\ifodd\value{lstnumber}\color{capri!3}\fi}] + {../progs/crawler2.scala} \caption{A version of the web-crawler that only follows links in ``my'' domain---since these are the ones I am interested in @@ -722,8 +728,9 @@ \end{figure} -\begin{figure}[p] -\lstinputlisting{../progs/crawler3.scala} +\begin{figure}[p]\small + \lstinputlisting[linebackgroundcolor={\ifodd\value{lstnumber}\color{capri!3}\fi}] + {../progs/crawler3.scala} \caption{A small email harvester---whenever we download a web-page, we also check whether it contains any email diff -r d922cc83b70c -r b78664a24f5d handouts/ho02.pdf Binary file handouts/ho02.pdf has changed diff -r d922cc83b70c -r b78664a24f5d handouts/ho02.tex --- a/handouts/ho02.tex Mon Feb 13 23:22:45 2017 +0000 +++ b/handouts/ho02.tex Wed Mar 15 01:24:39 2017 +0000 @@ -3,7 +3,8 @@ \usepackage{../langs} \usepackage{../graphics} \usepackage{../data} - +\usepackage{lstlinebgrd} +\definecolor{capri}{rgb}{0.0, 0.75, 1.0} \begin{document} \fnote{\copyright{} Christian Urban, King's College London, 2014, 2015, 2016, 2017} @@ -54,8 +55,8 @@ x label style={at={(1.1,0.05)}}, ylabel={\small time in secs}, enlargelimits=false, - xtick={0,3000,...,9000}, - xmax=10000, + xtick={0,2500,...,11000}, + xmax=12000, ymax=35, ytick={0,5,...,30}, scaled ticks=false, @@ -86,9 +87,10 @@ axis lines=left, width=5cm, height=5cm, - legend entries={Java}, + legend entries={Java, Python}, legend pos=north west, legend cell align=left] +\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data}; \addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data}; \end{axis} \end{tikzpicture} @@ -461,7 +463,9 @@ for \pcode{matches} are shown in Figure~\ref{scala1}. \begin{figure}[p] -\lstinputlisting{../progs/app5.scala} + \lstinputlisting[numbers=left,linebackgroundcolor= + {\ifodd\value{lstnumber}\color{capri!3}\fi}] + {../progs/app5.scala} \caption{Scala implementation of the \textit{nullable} and derivative functions. These functions are easy to implement in functional languages, because their built-in pattern @@ -581,8 +585,8 @@ x label style={at={(1.01,0.0)}}, ylabel={time in secs}, enlargelimits=false, - xtick={0,100,...,1000}, - xmax=1100, + xtick={0,200,...,1100}, + xmax=1200, ytick={0,5,...,30}, scaled ticks=false, axis lines=left, @@ -600,8 +604,8 @@ \end{center} \noindent Now we are talking business! The modified matcher -can within 30 seconds handle regular expressions up to -$n = 950$ before a StackOverflow is raised. Recall that Python and Ruby +can within 25 seconds handle regular expressions up to +$n = 1,100$ before a StackOverflow is raised. Recall that Python and Ruby (and our first version, Scala V1) could only handle $n = 27$ or so in 30 seconds. There is no change for our second example $(a^*)^* \cdot b$---so this is still good. @@ -652,7 +656,9 @@ function introduced. Does this improve the speed? You bet!! \begin{figure}[p] -\lstinputlisting{../progs/app6.scala} +\lstinputlisting[numbers=left,linebackgroundcolor= + {\ifodd\value{lstnumber}\color{capri!3}\fi}] + {../progs/app6.scala} \caption{The simplification function and modified \texttt{ders}-function; this function now calls \texttt{der} first, but then simplifies diff -r d922cc83b70c -r b78664a24f5d progs/re1.scala --- a/progs/re1.scala Mon Feb 13 23:22:45 2017 +0000 +++ b/progs/re1.scala Wed Mar 15 01:24:39 2017 +0000 @@ -1,3 +1,5 @@ +// Simple matcher for basic regular expressions + abstract class Rexp case object ZERO extends Rexp // matches nothing @@ -40,22 +42,27 @@ // main matcher function def matches(r: Rexp, s: String) : Boolean = nullable(ders(s.toList, r)) + //examples from the homework + val r = STAR(ALT(SEQ(CHAR('a'), CHAR('b')), CHAR('b'))) der('a', r) der('b', r) der('c', r) -//optional (one or zero times) +//optional regular expression (one or zero times) def OPT(r: Rexp) = ALT(r, ONE) -//n-times (explicitly expanded) +//n-times regular expression (explicitly expanded) def NTIMES(r: Rexp, n: Int) : Rexp = n match { case 0 => ONE case 1 => r case n => SEQ(r, NTIMES(r, n - 1)) } + +// Test Cases + // the evil regular expression a?{n} a{n} def EVIL1(n: Int) = SEQ(NTIMES(OPT(CHAR('a')), n), NTIMES(CHAR('a'), n)) @@ -88,3 +95,23 @@ println(i + " " + "%.5f".format(time_needed(2, matches(EVIL2, "a" * i)))) } + + + +// size of a regular expressions - for testing purposes +def size(r: Rexp) : Int = r match { + case ZERO => 1 + case ONE => 1 + case CHAR(_) => 1 + case ALT(r1, r2) => 1 + size(r1) + size(r2) + case SEQ(r1, r2) => 1 + size(r1) + size(r2) + case STAR(r) => 1 + size(r) +} + +// the expicit expansion in EVIL1(n) increases +// drastically its size + +size(EVIL1(1)) // 5 +size(EVIL1(3)) // 17 +size(EVIL1(5)) // 29 +size(EVIL1(7)) // 41 diff -r d922cc83b70c -r b78664a24f5d progs/re2.scala --- a/progs/re2.scala Mon Feb 13 23:22:45 2017 +0000 +++ b/progs/re2.scala Wed Mar 15 01:24:39 2017 +0000 @@ -1,5 +1,6 @@ -// version with explicit an n-times regular expression -// this keeps the regular expression small +// Version with explicit an explicit n-times regular expression; +// this keeps the overall regular expression in the EVIL1 regular +// expression small abstract class Rexp case object ZERO extends Rexp @@ -8,7 +9,8 @@ case class ALT(r1: Rexp, r2: Rexp) extends Rexp case class SEQ(r1: Rexp, r2: Rexp) extends Rexp case class STAR(r: Rexp) extends Rexp -case class NTIMES(r: Rexp, n: Int) extends Rexp //explicit constructor +case class NTIMES(r: Rexp, n: Int) extends Rexp //explicit constructor for n-times + def nullable (r: Rexp) : Boolean = r match { case ZERO => false @@ -20,6 +22,7 @@ case NTIMES(r, i) => if (i == 0) true else nullable(r) } + def der (c: Char, r: Rexp) : Rexp = r match { case ZERO => ZERO case ONE => ZERO @@ -30,7 +33,7 @@ else SEQ(der(c, r1), r2) case STAR(r1) => SEQ(der(c, r1), STAR(r1)) case NTIMES(r1, i) => - if (i == 0) ZERO else der(c, SEQ(r1, NTIMES(r1, i - 1))) + if (i == 0) ZERO else SEQ(der(c, r1), NTIMES(r1, i - 1)) } def ders (s: List[Char], r: Rexp) : Rexp = s match { @@ -41,9 +44,12 @@ def matches(r: Rexp, s: String) : Boolean = nullable(ders(s.toList, r)) -//optional: one or zero times +//optional regular expression: one or zero times def OPT(r: Rexp) = ALT(r, ONE) + +// Test Cases + //evil regular expressions def EVIL1(n: Int) = SEQ(NTIMES(OPT(CHAR('a')), n), NTIMES(CHAR('a'), n)) val EVIL2 = SEQ(STAR(STAR(CHAR('a'))), CHAR('b')) @@ -57,11 +63,11 @@ //test: (a?{n}) (a{n}) -for (i <- 1 to 1001 by 10) { +for (i <- 1 to 1201 by 100) { println(i + " " + "%.5f".format(time_needed(2, matches(EVIL1(i), "a" * i)))) } -for (i <- 1 to 1001 by 10) { +for (i <- 1 to 1201 by 100) { println(i + " " + "%.5f".format(time_needed(2, matches(EVIL1(i), "a" * i)))) } @@ -76,3 +82,32 @@ } + +// size of a regular expressions - for testing purposes +def size(r: Rexp) : Int = r match { + case ZERO => 1 + case ONE => 1 + case CHAR(_) => 1 + case ALT(r1, r2) => 1 + size(r1) + size(r2) + case SEQ(r1, r2) => 1 + size(r1) + size(r2) + case STAR(r) => 1 + size(r) + case NTIMES(r, _) => 1 + size(r) +} + +// EVIL1(n) has now a constant size, no matter +// what n is + +size(EVIL1(1)) // 7 +size(EVIL1(3)) // 7 +size(EVIL1(5)) // 7 +size(EVIL1(7)) // 7 + + +// but the size of the derivatives can grow +// quite dramatically + +size(ders("".toList, EVIL2)) // 5 +size(ders("a".toList, EVIL2)) // 12 +size(ders("aa".toList, EVIL2)) // 28 +size(ders("aaa".toList, EVIL2)) // 58 +size(ders("aaaa".toList, EVIL2)) // 116 diff -r d922cc83b70c -r b78664a24f5d progs/re3.scala --- a/progs/re3.scala Mon Feb 13 23:22:45 2017 +0000 +++ b/progs/re3.scala Wed Mar 15 01:24:39 2017 +0000 @@ -1,3 +1,6 @@ +// Version with simplification during derivatives; +// this keeps the regular expressions small, which +// is good for run-time abstract class Rexp case object ZERO extends Rexp @@ -32,7 +35,7 @@ else SEQ(der(c, r1), r2) case STAR(r1) => SEQ(der(c, r1), STAR(r1)) case NTIMES(r, i) => - if (i == 0) ZERO else der(c, SEQ(r, NTIMES(r, i - 1))) + if (i == 0) ZERO else SEQ(der(c, r), NTIMES(r, i - 1)) } def simp(r: Rexp) : Rexp = r match { @@ -63,12 +66,12 @@ def matcher(r: Rexp, s: String) : Boolean = nullable(ders(s.toList, r)) -var regex = NTIMES(CHAR('a'),5) -println(matcher(regex,"aaaaa")) - //one or zero def OPT(r: Rexp) = ALT(r, ONE) + +// Test Cases + //evil regular expressions def EVIL1(n: Int) = SEQ(NTIMES(OPT(CHAR('a')), n), NTIMES(CHAR('a'), n)) val EVIL2 = SEQ(STAR(STAR(CHAR('a'))), CHAR('b')) @@ -83,20 +86,43 @@ //test: (a?{n}) (a{n}) -for (i <- 1 to 9001 by 10) { +for (i <- 1 to 11001 by 1000) { println(i + " " + "%.5f".format(time_needed(2, matcher(EVIL1(i), "a" * i)))) } -for (i <- 1 to 9001 by 1000) { +for (i <- 1 to 11001 by 1000) { println(i + " " + "%.5f".format(time_needed(2, matcher(EVIL1(i), "a" * i)))) } //test: (a*)* b -for (i <- 1 to 7000001 by 500000) { +for (i <- 1 to 6000001 by 500000) { + println(i + " " + "%.5f".format(time_needed(2, matcher(EVIL2, "a" * i)))) +} + +for (i <- 1 to 6000001 by 500000) { println(i + " " + "%.5f".format(time_needed(2, matcher(EVIL2, "a" * i)))) } -for (i <- 1 to 7000001 by 500000) { - println(i + " " + "%.5f".format(time_needed(2, matcher(EVIL2, "a" * i)))) + + +// size of a regular expressions - for testing purposes +def size(r: Rexp) : Int = r match { + case ZERO => 1 + case ONE => 1 + case CHAR(_) => 1 + case ALT(r1, r2) => 1 + size(r1) + size(r2) + case SEQ(r1, r2) => 1 + size(r1) + size(r2) + case STAR(r) => 1 + size(r) + case NTIMES(r, _) => 1 + size(r) } + +// now the size of the derivatives grows +// much, much slower + +size(ders("".toList, EVIL2)) // 5 +size(ders("a".toList, EVIL2)) // 8 +size(ders("aa".toList, EVIL2)) // 8 +size(ders("aaa".toList, EVIL2)) // 8 +size(ders("aaaa".toList, EVIL2)) // 8 +size(ders("aaaaa".toList, EVIL2)) // 8 diff -r d922cc83b70c -r b78664a24f5d progs/re4.scala --- a/progs/re4.scala Mon Feb 13 23:22:45 2017 +0000 +++ b/progs/re4.scala Wed Mar 15 01:24:39 2017 +0000 @@ -1,4 +1,6 @@ - +// Version which attempts to move whole strings, not +// just characters, under derivatives whenever possible + abstract class Rexp case object ZERO extends Rexp case object ONE extends Rexp @@ -50,6 +52,7 @@ case r => r } +// *new* // derivative w.r.t. a string (iterates der) def ders2(s: List[Char], r: Rexp) : Rexp = (s, r) match { case (Nil, r) => r @@ -68,6 +71,9 @@ //one or zero def OPT(r: Rexp) = ALT(r, ONE) + +// Test Cases + def EVIL1(n: Int) = SEQ(NTIMES(OPT(CHAR('a')), n), NTIMES(CHAR('a'), n)) val EVIL2 = SEQ(STAR(STAR(CHAR('a'))), CHAR('b')) @@ -79,7 +85,6 @@ (end - start)/(i * 1.0e9) } -// warmup //test: (a?{n}) (a{n}) for (i <- 1 to 7000001 by 500000) { println(i + " " + "%.5f".format(time_needed(2, matcher(EVIL1(i), "a" * i))))