--- a/LINKS Mon Feb 13 23:22:45 2017 +0000
+++ b/LINKS Wed Mar 15 01:24:39 2017 +0000
@@ -3,3 +3,15 @@
https://dere.github.io/2017-02-12/beginners-assembly-part1/
+webassembly
+https://sourceware.org/ml/binutils/2017-03/msg00044.html
+https://hacks.mozilla.org/2017/02/a-cartoon-intro-to-webassembly/
+
+webassembly explorer
+https://mbebenita.github.io/WasmExplorer/
+
+
+
+free books
+https://github.com/vhf/free-programming-books/blob/master/free-programming-books.md
+https://john.cs.olemiss.edu/~hcc/csci658/notes/Free_Prog_Lang_Textbooks.html
\ No newline at end of file
--- a/data.sty Mon Feb 13 23:22:45 2017 +0000
+++ b/data.sty Wed Mar 15 01:24:39 2017 +0000
@@ -142,18 +142,18 @@
%% re2.scala example a?{n} a{n}
\begin{filecontents}{re2.data}
-1 0.00006
-101 0.00605
-201 0.04343
-301 0.16469
-401 0.41954
-501 0.83703
-601 1.66925
-701 2.71086
-801 4.19745
-901 7.56495
-1001 12.42103
-1101 20.41763
+1 0.00050
+101 0.02030
+201 0.10587
+301 0.31188
+401 0.32794
+501 0.64490
+601 1.16738
+701 2.10815
+801 3.47144
+901 6.80621
+1001 12.35611
+1101 23.80084
\end{filecontents}
%% re2.scala: example (a*)* b
@@ -182,36 +182,35 @@
%% re3.scala: example a?{n} a{n}
\begin{filecontents}{re3.data}
-1 0.00005
-1001 0.63505
-2001 2.53029
-3001 5.72804
-4001 9.94246
-5001 15.52770
-6001 22.44126
-7001 30.86867
-8001 39.32242
-9001 48.96998
+1 0.00003
+1001 0.03887
+2001 0.15666
+3001 0.35910
+4001 0.63950
+5001 1.00241
+6001 1.50480
+7001 2.11568
+8001 2.71208
+9001 3.41157
+10001 4.19962
+11001 5.70387
\end{filecontents}
%% re3.scala: example (a*)* b
\begin{filecontents}{re3a.data}
-1 0.00014
-500001 2.61059
-1000001 5.42773
-1500001 8.02603
-2000001 10.49844
-2500001 13.34234
-3000001 16.17491
-3500001 19.11650
-4000001 21.66151
-4500001 24.85496
-5000001 28.52113
-5500001 28.54548
-6000001 32.39523
-6500001 45.13486
-7000001 54.15018
-7500001 71.32218
+1 0.00015
+500001 0.28337
+1000001 0.53271
+1500001 0.84478
+2000001 1.11763
+2500001 1.76656
+3000001 2.13310
+3500001 2.39576
+4000001 2.98624
+4500001 5.96529
+5000001 13.56911
+5500001 18.43089
+6000001 40.33704
\end{filecontents}
%% re4.scala example a?{n} a{n}
Binary file handouts/ho01.pdf has changed
--- a/handouts/ho01.tex Mon Feb 13 23:22:45 2017 +0000
+++ b/handouts/ho01.tex Wed Mar 15 01:24:39 2017 +0000
@@ -3,6 +3,8 @@
\usepackage{../langs}
\usepackage{../graphics}
\usepackage{../data}
+\usepackage{lstlinebgrd}
+\definecolor{capri}{rgb}{0.0, 0.75, 1.0}
%%http://regexcrossword.com/challenges/cities/puzzles/1
%%https://jex.im/regulex/
@@ -37,24 +39,24 @@
\section*{Handout 1}
This module is about text processing, be it for web-crawlers,
-compilers, dictionaries, DNA-data and so on. When looking for a
+compilers, dictionaries, DNA-data and so on. When looking for a
particular string, like $abc$ in a large text we can use the
Knuth-Morris-Pratt algorithm, which is currently the most efficient
general string search algorithm. But often we do \emph{not} just look
-for a particular string, but for string patterns. For example in
+for a particular string, but for string patterns. For example, in
program code we need to identify what are the keywords (if, then,
while, etc), what are the identifiers (variable names). A pattern for
identifiers could be stated as: they start with a letter, followed by
-zero or more letters, numbers and underscores. Also often we face the
+zero or more letters, numbers and underscores. Often we also face the
problem that we are given a string (for example some user input) and
want to know whether it matches a particular pattern---be it an email
address, for example. In this way we can exclude user input that would
otherwise have nasty effects on our program (crashing it or making it
-go into an infinite loop, if not worse). Scanning for computer viruses
-or filtering out spam usually involves scanning for some signature
-(essentially a pattern). The point is that the fast
-Knuth-Morris-Pratt algorithm for strings is not good enough for such
-string \emph{patterns}.\smallskip
+go into an infinite loop, if not worse). In tools like Snort, scanning
+for computer viruses or filtering out spam usually involves scanning
+for some signature (essentially a string pattern). The point is that
+the fast Knuth-Morris-Pratt algorithm for strings is not good enough
+for such string \emph{patterns}.\smallskip
\defn{Regular expressions} help with conveniently specifying
such patterns. The idea behind regular expressions is that
@@ -190,7 +192,31 @@
matching in Python, Ruby and Java.
\begin{center}
-\begin{tabular}{@{\hspace{-1mm}}c@{\hspace{-1mm}}c@{}}
+\begin{tabular}{@{\hspace{-1mm}}c@{\hspace{1mm}}c@{}}
+\begin{tikzpicture}
+\begin{axis}[
+ title={Graph: $\texttt{(a*)*\,b}$ and strings
+ $\underbrace{\texttt{a}\ldots \texttt{a}}_{n}$},
+ xlabel={$n$},
+ x label style={at={(1.05,0.0)}},
+ ylabel={time in secs},
+ enlargelimits=false,
+ xtick={0,5,...,30},
+ xmax=33,
+ ymax=35,
+ ytick={0,5,...,30},
+ scaled ticks=false,
+ axis lines=left,
+ width=5.5cm,
+ height=4.5cm,
+ legend entries={Python, Java},
+ legend pos=north west,
+ legend cell align=left]
+\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
+\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
+\end{axis}
+\end{tikzpicture}
+&
\begin{tikzpicture}
\begin{axis}[
title={Graph: $\texttt{a?\{n\}\,a{\{n\}}}$ and strings
@@ -214,47 +240,24 @@
\addplot[brown,mark=triangle*, mark options={fill=white}] table {re-ruby.data};
\end{axis}
\end{tikzpicture}
-&
-\begin{tikzpicture}
-\begin{axis}[
- title={Graph: $\texttt{(a*)*\,b}$ and strings
- $\underbrace{\texttt{a}\ldots \texttt{a}}_{n}$},
- xlabel={$n$},
- x label style={at={(1.05,0.0)}},
- ylabel={time in secs},
- enlargelimits=false,
- xtick={0,5,...,30},
- xmax=33,
- ymax=35,
- ytick={0,5,...,30},
- scaled ticks=false,
- axis lines=left,
- width=5.5cm,
- height=4.5cm,
- legend entries={Python, Java},
- legend pos=north west,
- legend cell align=left]
-\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
-\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
-\end{axis}
-\end{tikzpicture}
\end{tabular}
\end{center}
-\noindent This first graph shows that Python needs approximately 29
-seconds for finding out whether a string of 28 \texttt{a}s matches the
-regular expression \texttt{a?\{28\}\,a\{28\}}. Ruby is even slightly
+\noindent This first graph shows that Python and Java need
+approximately 30 seconds to find out that the regular expression
+$\texttt{(a*)*\,b}$ does not match strings of 28 \texttt{a}s.
+Similarly, the second shows that Python needs approximately 29 seconds
+for finding out whether a string of 28 \texttt{a}s matches the regular
+expression \texttt{a?\{28\}\,a\{28\}}. Ruby is even slightly
worse.\footnote{In this example Ruby uses the slightly different
regular expression \texttt{a?a?a?...a?a?aaa...aa}, where the
\texttt{a?} and \texttt{a} each occur $n$ times. More such test
cases can be found at \url{http://www.computerbytesman.com/redos/}.}
-Simlarly, Python and Java needs approximately 30 seconds to find out
-that the regular expression $\texttt{(a*)*\,b}$ does not match strings
-of 28 \texttt{a}s. Admittedly, these regular expressions are
-carefully chosen to exhibit this exponential behaviour, but similar
-ones occur more often than one wants in ``real life''. For example, on
-20 July 2016 a similar regular expression brought the webpage
-\href{http://stackexchange.com}{Stack Exchange} to its knees:
+Admittedly, these regular expressions are carefully chosen to exhibit
+this exponential behaviour, but similar ones occur more often than one
+wants in ``real life''. For example, on 20 July 2016 a similar regular
+expression brought the webpage \href{http://stackexchange.com}{Stack
+ Exchange} to its knees:
\begin{center}
\url{http://stackstatus.net/post/147710624694/outage-postmortem-july-20-2016}
@@ -293,9 +296,9 @@
at a relatively simple algorithm that solves this problem much
better than Python and Ruby do\ldots actually it will be two
versions of the algorithm: the first one will be able to
-process strings of approximately 1,000 \texttt{a}s in 30
+process strings of approximately 1,100 \texttt{a}s in 23
seconds, while the second version will even be able to process
-up to 7,000(!) in 30 seconds, see the graph below:
+up to 11,000(!) in 5 seconds, see the graph below:
\begin{center}
\begin{tikzpicture}
@@ -306,8 +309,8 @@
x label style={at={(1.05,0.0)}},
ylabel={time in secs},
enlargelimits=false,
- xtick={0,3000,...,9000},
- xmax=10000,
+ xtick={0,3000,...,12000},
+ xmax=13000,
ymax=32,
ytick={0,5,...,30},
scaled ticks=false,
@@ -696,8 +699,9 @@
\end{quote}
-\begin{figure}[p]
-\lstinputlisting{../progs/crawler1.scala}
+\begin{figure}[p]\small
+ \lstinputlisting[linebackgroundcolor={\ifodd\value{lstnumber}\color{capri!3}\fi}]
+ {../progs/crawler1.scala}
\caption{The Scala code for a simple web-crawler that checks
for broken links in a web-page. It uses the regular expression
@@ -708,8 +712,10 @@
\end{figure}
-\begin{figure}[p]
-\lstinputlisting{../progs/crawler2.scala}
+
+\begin{figure}[p]\small
+ \lstinputlisting[linebackgroundcolor={\ifodd\value{lstnumber}\color{capri!3}\fi}]
+ {../progs/crawler2.scala}
\caption{A version of the web-crawler that only follows links
in ``my'' domain---since these are the ones I am interested in
@@ -722,8 +728,9 @@
\end{figure}
-\begin{figure}[p]
-\lstinputlisting{../progs/crawler3.scala}
+\begin{figure}[p]\small
+ \lstinputlisting[linebackgroundcolor={\ifodd\value{lstnumber}\color{capri!3}\fi}]
+ {../progs/crawler3.scala}
\caption{A small email harvester---whenever we download a
web-page, we also check whether it contains any email
Binary file handouts/ho02.pdf has changed
--- a/handouts/ho02.tex Mon Feb 13 23:22:45 2017 +0000
+++ b/handouts/ho02.tex Wed Mar 15 01:24:39 2017 +0000
@@ -3,7 +3,8 @@
\usepackage{../langs}
\usepackage{../graphics}
\usepackage{../data}
-
+\usepackage{lstlinebgrd}
+\definecolor{capri}{rgb}{0.0, 0.75, 1.0}
\begin{document}
\fnote{\copyright{} Christian Urban, King's College London, 2014, 2015, 2016, 2017}
@@ -54,8 +55,8 @@
x label style={at={(1.1,0.05)}},
ylabel={\small time in secs},
enlargelimits=false,
- xtick={0,3000,...,9000},
- xmax=10000,
+ xtick={0,2500,...,11000},
+ xmax=12000,
ymax=35,
ytick={0,5,...,30},
scaled ticks=false,
@@ -86,9 +87,10 @@
axis lines=left,
width=5cm,
height=5cm,
- legend entries={Java},
+ legend entries={Java, Python},
legend pos=north west,
legend cell align=left]
+\addplot[blue,mark=*, mark options={fill=white}] table {re-python2.data};
\addplot[cyan,mark=*, mark options={fill=white}] table {re-java.data};
\end{axis}
\end{tikzpicture}
@@ -461,7 +463,9 @@
for \pcode{matches} are shown in Figure~\ref{scala1}.
\begin{figure}[p]
-\lstinputlisting{../progs/app5.scala}
+ \lstinputlisting[numbers=left,linebackgroundcolor=
+ {\ifodd\value{lstnumber}\color{capri!3}\fi}]
+ {../progs/app5.scala}
\caption{Scala implementation of the \textit{nullable} and
derivative functions. These functions are easy to
implement in functional languages, because their built-in pattern
@@ -581,8 +585,8 @@
x label style={at={(1.01,0.0)}},
ylabel={time in secs},
enlargelimits=false,
- xtick={0,100,...,1000},
- xmax=1100,
+ xtick={0,200,...,1100},
+ xmax=1200,
ytick={0,5,...,30},
scaled ticks=false,
axis lines=left,
@@ -600,8 +604,8 @@
\end{center}
\noindent Now we are talking business! The modified matcher
-can within 30 seconds handle regular expressions up to
-$n = 950$ before a StackOverflow is raised. Recall that Python and Ruby
+can within 25 seconds handle regular expressions up to
+$n = 1,100$ before a StackOverflow is raised. Recall that Python and Ruby
(and our first version, Scala V1) could only handle $n = 27$ or so in 30
seconds. There is no change for our second example
$(a^*)^* \cdot b$---so this is still good.
@@ -652,7 +656,9 @@
function introduced. Does this improve the speed? You bet!!
\begin{figure}[p]
-\lstinputlisting{../progs/app6.scala}
+\lstinputlisting[numbers=left,linebackgroundcolor=
+ {\ifodd\value{lstnumber}\color{capri!3}\fi}]
+ {../progs/app6.scala}
\caption{The simplification function and modified
\texttt{ders}-function; this function now
calls \texttt{der} first, but then simplifies
--- a/progs/re1.scala Mon Feb 13 23:22:45 2017 +0000
+++ b/progs/re1.scala Wed Mar 15 01:24:39 2017 +0000
@@ -1,3 +1,5 @@
+// Simple matcher for basic regular expressions
+
abstract class Rexp
case object ZERO extends Rexp // matches nothing
@@ -40,22 +42,27 @@
// main matcher function
def matches(r: Rexp, s: String) : Boolean = nullable(ders(s.toList, r))
+
//examples from the homework
+
val r = STAR(ALT(SEQ(CHAR('a'), CHAR('b')), CHAR('b')))
der('a', r)
der('b', r)
der('c', r)
-//optional (one or zero times)
+//optional regular expression (one or zero times)
def OPT(r: Rexp) = ALT(r, ONE)
-//n-times (explicitly expanded)
+//n-times regular expression (explicitly expanded)
def NTIMES(r: Rexp, n: Int) : Rexp = n match {
case 0 => ONE
case 1 => r
case n => SEQ(r, NTIMES(r, n - 1))
}
+
+// Test Cases
+
// the evil regular expression a?{n} a{n}
def EVIL1(n: Int) = SEQ(NTIMES(OPT(CHAR('a')), n), NTIMES(CHAR('a'), n))
@@ -88,3 +95,23 @@
println(i + " " + "%.5f".format(time_needed(2, matches(EVIL2, "a" * i))))
}
+
+
+
+// size of a regular expressions - for testing purposes
+def size(r: Rexp) : Int = r match {
+ case ZERO => 1
+ case ONE => 1
+ case CHAR(_) => 1
+ case ALT(r1, r2) => 1 + size(r1) + size(r2)
+ case SEQ(r1, r2) => 1 + size(r1) + size(r2)
+ case STAR(r) => 1 + size(r)
+}
+
+// the expicit expansion in EVIL1(n) increases
+// drastically its size
+
+size(EVIL1(1)) // 5
+size(EVIL1(3)) // 17
+size(EVIL1(5)) // 29
+size(EVIL1(7)) // 41
--- a/progs/re2.scala Mon Feb 13 23:22:45 2017 +0000
+++ b/progs/re2.scala Wed Mar 15 01:24:39 2017 +0000
@@ -1,5 +1,6 @@
-// version with explicit an n-times regular expression
-// this keeps the regular expression small
+// Version with explicit an explicit n-times regular expression;
+// this keeps the overall regular expression in the EVIL1 regular
+// expression small
abstract class Rexp
case object ZERO extends Rexp
@@ -8,7 +9,8 @@
case class ALT(r1: Rexp, r2: Rexp) extends Rexp
case class SEQ(r1: Rexp, r2: Rexp) extends Rexp
case class STAR(r: Rexp) extends Rexp
-case class NTIMES(r: Rexp, n: Int) extends Rexp //explicit constructor
+case class NTIMES(r: Rexp, n: Int) extends Rexp //explicit constructor for n-times
+
def nullable (r: Rexp) : Boolean = r match {
case ZERO => false
@@ -20,6 +22,7 @@
case NTIMES(r, i) => if (i == 0) true else nullable(r)
}
+
def der (c: Char, r: Rexp) : Rexp = r match {
case ZERO => ZERO
case ONE => ZERO
@@ -30,7 +33,7 @@
else SEQ(der(c, r1), r2)
case STAR(r1) => SEQ(der(c, r1), STAR(r1))
case NTIMES(r1, i) =>
- if (i == 0) ZERO else der(c, SEQ(r1, NTIMES(r1, i - 1)))
+ if (i == 0) ZERO else SEQ(der(c, r1), NTIMES(r1, i - 1))
}
def ders (s: List[Char], r: Rexp) : Rexp = s match {
@@ -41,9 +44,12 @@
def matches(r: Rexp, s: String) : Boolean = nullable(ders(s.toList, r))
-//optional: one or zero times
+//optional regular expression: one or zero times
def OPT(r: Rexp) = ALT(r, ONE)
+
+// Test Cases
+
//evil regular expressions
def EVIL1(n: Int) = SEQ(NTIMES(OPT(CHAR('a')), n), NTIMES(CHAR('a'), n))
val EVIL2 = SEQ(STAR(STAR(CHAR('a'))), CHAR('b'))
@@ -57,11 +63,11 @@
//test: (a?{n}) (a{n})
-for (i <- 1 to 1001 by 10) {
+for (i <- 1 to 1201 by 100) {
println(i + " " + "%.5f".format(time_needed(2, matches(EVIL1(i), "a" * i))))
}
-for (i <- 1 to 1001 by 10) {
+for (i <- 1 to 1201 by 100) {
println(i + " " + "%.5f".format(time_needed(2, matches(EVIL1(i), "a" * i))))
}
@@ -76,3 +82,32 @@
}
+
+// size of a regular expressions - for testing purposes
+def size(r: Rexp) : Int = r match {
+ case ZERO => 1
+ case ONE => 1
+ case CHAR(_) => 1
+ case ALT(r1, r2) => 1 + size(r1) + size(r2)
+ case SEQ(r1, r2) => 1 + size(r1) + size(r2)
+ case STAR(r) => 1 + size(r)
+ case NTIMES(r, _) => 1 + size(r)
+}
+
+// EVIL1(n) has now a constant size, no matter
+// what n is
+
+size(EVIL1(1)) // 7
+size(EVIL1(3)) // 7
+size(EVIL1(5)) // 7
+size(EVIL1(7)) // 7
+
+
+// but the size of the derivatives can grow
+// quite dramatically
+
+size(ders("".toList, EVIL2)) // 5
+size(ders("a".toList, EVIL2)) // 12
+size(ders("aa".toList, EVIL2)) // 28
+size(ders("aaa".toList, EVIL2)) // 58
+size(ders("aaaa".toList, EVIL2)) // 116
--- a/progs/re3.scala Mon Feb 13 23:22:45 2017 +0000
+++ b/progs/re3.scala Wed Mar 15 01:24:39 2017 +0000
@@ -1,3 +1,6 @@
+// Version with simplification during derivatives;
+// this keeps the regular expressions small, which
+// is good for run-time
abstract class Rexp
case object ZERO extends Rexp
@@ -32,7 +35,7 @@
else SEQ(der(c, r1), r2)
case STAR(r1) => SEQ(der(c, r1), STAR(r1))
case NTIMES(r, i) =>
- if (i == 0) ZERO else der(c, SEQ(r, NTIMES(r, i - 1)))
+ if (i == 0) ZERO else SEQ(der(c, r), NTIMES(r, i - 1))
}
def simp(r: Rexp) : Rexp = r match {
@@ -63,12 +66,12 @@
def matcher(r: Rexp, s: String) : Boolean = nullable(ders(s.toList, r))
-var regex = NTIMES(CHAR('a'),5)
-println(matcher(regex,"aaaaa"))
-
//one or zero
def OPT(r: Rexp) = ALT(r, ONE)
+
+// Test Cases
+
//evil regular expressions
def EVIL1(n: Int) = SEQ(NTIMES(OPT(CHAR('a')), n), NTIMES(CHAR('a'), n))
val EVIL2 = SEQ(STAR(STAR(CHAR('a'))), CHAR('b'))
@@ -83,20 +86,43 @@
//test: (a?{n}) (a{n})
-for (i <- 1 to 9001 by 10) {
+for (i <- 1 to 11001 by 1000) {
println(i + " " + "%.5f".format(time_needed(2, matcher(EVIL1(i), "a" * i))))
}
-for (i <- 1 to 9001 by 1000) {
+for (i <- 1 to 11001 by 1000) {
println(i + " " + "%.5f".format(time_needed(2, matcher(EVIL1(i), "a" * i))))
}
//test: (a*)* b
-for (i <- 1 to 7000001 by 500000) {
+for (i <- 1 to 6000001 by 500000) {
+ println(i + " " + "%.5f".format(time_needed(2, matcher(EVIL2, "a" * i))))
+}
+
+for (i <- 1 to 6000001 by 500000) {
println(i + " " + "%.5f".format(time_needed(2, matcher(EVIL2, "a" * i))))
}
-for (i <- 1 to 7000001 by 500000) {
- println(i + " " + "%.5f".format(time_needed(2, matcher(EVIL2, "a" * i))))
+
+
+// size of a regular expressions - for testing purposes
+def size(r: Rexp) : Int = r match {
+ case ZERO => 1
+ case ONE => 1
+ case CHAR(_) => 1
+ case ALT(r1, r2) => 1 + size(r1) + size(r2)
+ case SEQ(r1, r2) => 1 + size(r1) + size(r2)
+ case STAR(r) => 1 + size(r)
+ case NTIMES(r, _) => 1 + size(r)
}
+
+// now the size of the derivatives grows
+// much, much slower
+
+size(ders("".toList, EVIL2)) // 5
+size(ders("a".toList, EVIL2)) // 8
+size(ders("aa".toList, EVIL2)) // 8
+size(ders("aaa".toList, EVIL2)) // 8
+size(ders("aaaa".toList, EVIL2)) // 8
+size(ders("aaaaa".toList, EVIL2)) // 8
--- a/progs/re4.scala Mon Feb 13 23:22:45 2017 +0000
+++ b/progs/re4.scala Wed Mar 15 01:24:39 2017 +0000
@@ -1,4 +1,6 @@
-
+// Version which attempts to move whole strings, not
+// just characters, under derivatives whenever possible
+
abstract class Rexp
case object ZERO extends Rexp
case object ONE extends Rexp
@@ -50,6 +52,7 @@
case r => r
}
+// *new*
// derivative w.r.t. a string (iterates der)
def ders2(s: List[Char], r: Rexp) : Rexp = (s, r) match {
case (Nil, r) => r
@@ -68,6 +71,9 @@
//one or zero
def OPT(r: Rexp) = ALT(r, ONE)
+
+// Test Cases
+
def EVIL1(n: Int) = SEQ(NTIMES(OPT(CHAR('a')), n), NTIMES(CHAR('a'), n))
val EVIL2 = SEQ(STAR(STAR(CHAR('a'))), CHAR('b'))
@@ -79,7 +85,6 @@
(end - start)/(i * 1.0e9)
}
-// warmup
//test: (a?{n}) (a{n})
for (i <- 1 to 7000001 by 500000) {
println(i + " " + "%.5f".format(time_needed(2, matcher(EVIL1(i), "a" * i))))