--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Attic/crawler2.scala Wed May 06 15:37:31 2020 +0100
@@ -0,0 +1,44 @@
+// This version of the crawler only
+// checks links in the "domain" urbanc
+
+import io.Source
+import scala.util.matching.Regex
+import scala.util._
+
+// gets the first 10K of a web-page
+def get_page(url: String) : String = {
+ Try(Source.fromURL(url)("ISO-8859-1").take(10000).mkString).
+ getOrElse { println(s" Problem with: $url"); ""}
+}
+
+// regexes for URLs and "my" domain
+val http_pattern = """"https?://[^"]*"""".r
+val my_urls = """urban""".r /*@\label{myurlline}@*/
+//val my_urls = """kcl.ac.uk""".r
+
+def unquote(s: String) = s.drop(1).dropRight(1)
+
+def get_all_URLs(page: String) : Set[String] =
+ http_pattern.findAllIn(page).map(unquote).toSet
+
+def crawl(url: String, n: Int) : Unit = {
+ if (n == 0) () /*@\label{changestartline}@*/
+ else if (my_urls.findFirstIn(url) == None) {
+ println(s"Visiting: $n $url")
+ get_page(url); ()
+ } /*@\label{changeendline}@*/
+ else {
+ println(s"Visiting: $n $url")
+ for (u <- get_all_URLs(get_page(url)).par) crawl(u, n - 1)
+ }
+}
+
+// starting URL for the crawler
+val startURL = """https://nms.kcl.ac.uk/christian.urban/"""
+//val startURL = """https://nms.kcl.ac.uk/christian.urban/bsc-projects-17.html"""
+
+
+// can now deal with depth 3 and beyond
+crawl(startURL, 3)
+
+
--- a/coursework/cw03.tex Thu Apr 16 19:15:46 2020 +0100
+++ b/coursework/cw03.tex Wed May 06 15:37:31 2020 +0100
@@ -7,6 +7,10 @@
\section*{Coursework 3}
+TODO: Test cases for expressions
+\url{https://github.com/ArashPartow/math-parser-benchmark-project}
+
+
\noindent This coursework is worth 5\% and is due on \cwTHREE{}
at 18:00. You are asked to implement a parser for the
WHILE language and also an interpreter. You can do the
Binary file handouts/ho01.pdf has changed
--- a/handouts/ho01.tex Thu Apr 16 19:15:46 2020 +0100
+++ b/handouts/ho01.tex Wed May 06 15:37:31 2020 +0100
@@ -35,7 +35,6 @@
% compiler explorer
% https://gcc.godbolt.org
-%https://www.youtube.com/watch?v=gmhMQfFQu20
% good article how languages have been influenced
% 10 MOST(LY DEAD) INFLUENTIAL PROGRAMMING LANGUAGES
@@ -47,12 +46,13 @@
\section*{Handout 1}
-The purpose of a compiler is to transform a program a human can read
-and write into code the machine can run as fast as
-possible. Developping a compiler is an old craft going back to 1952
-with the first compiler written by Grace Hopper. Why studiying
-compilers nowadays? An interesting answer is given by John Regher in
-his compiler
+The purpose of a compiler is to transform a program a human can read and
+write into code the machine can run as fast as possible. Developing a
+compiler is an old craft going back to 1952 with the first compiler
+written by Grace Hopper.\footnote{Who many years ago was invited on a
+talk show hosted by David Letterman, see
+\url{https://youtu.be/3N_ywhx6_K0?t=31}.} Why study compilers
+nowadays? An interesting answer is given by John Regehr in his compiler
blog:\footnote{\url{http://blog.regehr.org/archives/1419}}
\begin{quote}\it{}``We can start off with a couple of observations
@@ -73,18 +73,23 @@
employment act for solid compiler hackers.''
\end{quote}
+\noindent
+So the goal of this module is for you to become a solid (beginner)
+compiler hacker and, as part of the coursework, to implement a small
+compiler for a very small programming language.
-The first part of this module is about text processing, be it for
-compilers, dictionaries, DNA-data, spam-filters and so on.
-When looking for a particular string, say \pcode{"foobar"}, in a large
-text we can use the Knuth-Morris-Pratt algorithm, which is currently the
-most efficient general string search algorithm. But often we do
-\emph{not} just look for a particular string, but for string patterns.
-For example, in program code we need to identify what are the keywords
-(\texttt{if}, \texttt{then}, \texttt{while}, \texttt{for}, etc) and what
-are the identifiers (variable names). A pattern for identifiers could be
-stated as: they start with a letter, followed by zero or more letters,
-numbers and underscores.
+The first part of the module is about the problem of text processing,
+which is needed for compilers, but also for dictionaries, DNA-data,
+spam-filters and so on. When looking for a particular string, say
+\pcode{"foobar"}, in a large text we can use the Knuth-Morris-Pratt
+algorithm, which is among the most efficient general string search
+algorithms. But often we do \emph{not} just look for a particular string,
+but for string patterns. For example, in program code we need to
+identify what the keywords are (\texttt{if}, \texttt{then},
+\texttt{while}, \texttt{for}, etc) and what the identifiers are
+(variable names). A pattern for identifiers could be stated as: they
+start with a letter, followed by zero or more letters, numbers and
+underscores.
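+For instance, in the regular expression notation discussed later in
+this handout, a first rough sketch of such an identifier pattern might
+look as follows (this is only meant as an illustration, not as a
+definitive definition):
+
+\[
+\pcode{[a-zA-Z][a-zA-Z0-9_]*}
+\]
+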
%You might also be surprised what
%constraints programming languages impose about numbers: for example
@@ -200,25 +205,23 @@
\end{tabular}
\end{center}
-\noindent With this table you can figure out the purpose of
-the regular expressions in the web-crawlers shown Figures
-\ref{crawler1}, \ref{crawler2} and
-\ref{crawler3}. Note, however, the regular expression for
-http-addresses in web-pages in Figure~\ref{crawler1}, Line 15,
-is intended to be
+\noindent With this table you can figure out the purpose of the regular
+expressions in the web-crawlers shown in Figures \ref{crawler1} and
+\ref{crawler3}. In Figure~\ref{crawler1}, however, be careful with
+the regular expression for http-addresses in Line~\ref{httpline}. It is
+intended to be
\[
\pcode{"https?://[^"]*"}
\]
-\noindent It specifies that web-addresses need to start with a
-double quote, then comes \texttt{http} followed by an optional
-\texttt{s} and so on until the closing double quote comes.
-Usually we would have to escape the double quotes in order to
-make sure we interpret the double quote as character, not as
-double quote for a string. But Scala's trick with triple
-quotes allows us to omit this kind of escaping. As a result we
-can just write:
+\noindent specifying that http-addresses need to start with a double
+quote, then comes \texttt{http} followed by an optional \texttt{s} and
+so on\ldots{}until the closing double quote comes at the end of the
+address. Normally we would have to escape the double quotes in order to
+make sure we interpret them as characters, not as the delimiters of a
+string. But Scala's trick with triple quotes allows us to
+omit this kind of ugly escaping. As a result we can just write:
\[
\pcode{""""https?://[^"]*"""".r}
@@ -348,17 +351,17 @@
\end{center}
Such troublesome regular expressions are sometimes called \emph{evil
- regular expressions} because they have the potential to make regular
- expression matching engines to topple over, like in Python, Ruby,
- JavaScript and Java 8. This ``toppling over'' is also sometimes called
- \emph{catastrophic backtracking}. I have also seen the term
- \emph{eternal matching} used for this. The problem with evil regular
- expressions is that they can have some serious consequences, for
- example, if you use them in your web-application. The reason is that
- hackers can look for these instances where the matching engine behaves
- badly and then mount a nice DoS-attack against your application. These
- attacks are already have their own name: \emph{Regular Expression
- Denial of Service Attacks (ReDoS)}.
+regular expressions} because they have the potential to make regular
+expression matching engines topple over, like in Python, Ruby,
+JavaScript and Java 8. This ``toppling over'' is also sometimes called
+\emph{catastrophic backtracking}. I have also seen the term
+\emph{eternal matching} used for this. The problem with evil regular
+expressions and catastrophic backtracking is that they can have some
+serious consequences, for example, if you use them in your
+web-application. The reason is that hackers can look for these instances
+where the matching engine behaves badly and then mount a nice DoS-attack
+against your application. These attacks already have their own name:
+\emph{Regular Expression Denial of Service Attacks (ReDoS)}.
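+
+\noindent
+If you want to observe this catastrophic backtracking yourself, a small
+experiment along the following lines can be used (it relies on Scala's
+standard \texttt{Regex} class, which is backed by Java's regex engine;
+how quickly the matching becomes unbearably slow depends on the Java
+version and on your machine):
+
+\begin{lstlisting}[numbers=none]
+// an "evil" regular expression and strings of a's of increasing length
+val evil = """(a*)*b""".r
+
+for (i <- 10 to 30) {
+  val input = "a" * i            // i copies of the letter a, no b
+  val start = System.nanoTime()
+  evil.findFirstIn(input)        // forces the engine to backtrack
+  println(s"$i: ${(System.nanoTime() - start) / 1.0e9} secs")
+  // increase the upper bound of the loop if the timings stay small
+}
+\end{lstlisting}
+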
It will be instructive to look behind the ``scenes'' to find out why
Python and Ruby (and others) behave so badly when matching strings with
@@ -420,6 +423,11 @@
\end{tikzpicture}
\end{center}
+\noindent
+You might have wondered above why I looked at the (now) old Java 8: the
+reason is that Java 9 and later versions are a bit better, but we will
+still beat them hands down with our regex matcher.
+
\subsection*{Basic Regular Expressions}
The regular expressions shown earlier for Scala, we
@@ -776,34 +784,34 @@
{../progs/crawler1.scala}
\caption{The Scala code for a simple web-crawler that checks
-for broken links in a web-page. It uses the regular expression
+for broken links in web-pages. It uses the regular expression
\texttt{http\_pattern} in Line~\ref{httpline} for recognising
URL-addresses. It finds all links using the library function
-\texttt{findAllIn} in Line~\ref{findallline}.\label{crawler1}}
+\texttt{findAllIn} in Line~\ref{findallline} (this function
+is part of Scala's regular expression library).\label{crawler1}}
\end{figure}
+
+%\begin{figure}[p]\small
+% \lstinputlisting[numbers=left,linebackgroundcolor={\ifodd\value{lstnumber}\color{capri!3}\fi}]
+% {../progs/crawler2.scala}
+%
+%\caption{A version of the web-crawler that only follows links
+%in ``my'' domain---since these are the ones I am interested in
+%to fix. It uses the regular expression \texttt{my\_urls} in
+%Line~\ref{myurlline} to check for my name in the links. The
+%main change is in
+%Lines~\ref{changestartline}--\ref{changeendline} where there
+%is a test whether URL is in ``my'' domain or
+%not.\label{crawler2}}
+%\end{figure}
\begin{figure}[p]\small
\lstinputlisting[numbers=left,linebackgroundcolor={\ifodd\value{lstnumber}\color{capri!3}\fi}]
{../progs/crawler2.scala}
-\caption{A version of the web-crawler that only follows links
-in ``my'' domain---since these are the ones I am interested in
-to fix. It uses the regular expression \texttt{my\_urls} in
-Line~\ref{myurlline} to check for my name in the links. The
-main change is in
-Lines~\ref{changestartline}--\ref{changeendline} where there
-is a test whether URL is in ``my'' domain or
-not.\label{crawler2}}
-
-\end{figure}
-
-\begin{figure}[p]\small
- \lstinputlisting[numbers=left,linebackgroundcolor={\ifodd\value{lstnumber}\color{capri!3}\fi}]
- {../progs/crawler3.scala}
-
\caption{A small email harvester---whenever we download a
web-page, we also check whether it contains any email
addresses. For this we use the regular expression
--- a/handouts/ho05.tex Thu Apr 16 19:15:46 2020 +0100
+++ b/handouts/ho05.tex Wed May 06 15:37:31 2020 +0100
@@ -16,6 +16,10 @@
% Language hierachy is about complexity
% How hard is it to recognise an element in a language
+% Pratt parsing
+% https://matklad.github.io/2020/04/13/simple-but-powerful-pratt-parsing.html
+% https://www.oilshell.org/blog/2017/03/31.html
+
\begin{document}
\section*{Handout 5 (Grammars \& Parser)}
--- a/handouts/ho09.tex Thu Apr 16 19:15:46 2020 +0100
+++ b/handouts/ho09.tex Wed May 06 15:37:31 2020 +0100
@@ -12,6 +12,12 @@
\fnote{\copyright{} Christian Urban, King's College London, 2019}
+% CPS translations
+% https://felleisen.org/matthias/4400-s20/lecture17.html
+
+%% pattern matching resources
+%% https://www.reddit.com/r/ProgrammingLanguages/comments/g1vno3/beginner_resources_for_compiling_pattern_matching/
+
\section*{Handout 9 (LLVM, SSA and CPS)}
Reflecting on our two tiny compilers targetting the JVM, the code
--- a/progs/c.scala Thu Apr 16 19:15:46 2020 +0100
+++ b/progs/c.scala Wed May 06 15:37:31 2020 +0100
@@ -18,6 +18,9 @@
for ((head, tail) <- parse(ts); if tail.isEmpty) yield head
}
+
+
+
class SeqParser[I : IsSeq, T, S](p: => Parser[I, T], q: => Parser[I, S]) extends Parser[I, ~[T, S]] {
def parse(sb: I) =
for ((head1, tail1) <- p.parse(sb);
@@ -67,7 +70,7 @@
implicit def ParserOps[I : IsSeq, T](p: Parser[I, T]) = new {
def ||(q : => Parser[I, T]) = new AltParser[I, T](p, q)
def ~[S](q : => Parser[I, S]) = new SeqParser[I, T, S](p, q)
- def map[S](f: => T => S) = new MapParser[I, T, S](p, f)
+ def map[S](f: => T => S) = new MapParser[I, T, S](p, f)
}
// these implicits allow us to use infic notation for
--- a/progs/crawler1.scala Thu Apr 16 19:15:46 2020 +0100
+++ b/progs/crawler1.scala Wed May 06 15:37:31 2020 +0100
@@ -10,18 +10,20 @@
Try(Source.fromURL(url)("ISO-8859-1").take(10000).mkString).
getOrElse { println(s" Problem with: $url"); ""}
}
-get_page("https://nms.kcl.ac.uk/christiana.urban/")
+
+// e.g. get_page("https://nms.kcl.ac.uk/christiana.urban/")
+
// regex for URLs
val http_pattern = """"https?://[^"]*"""".r /*@\label{httpline}@*/
-// drops the first and last character from a string
+// drops the first and last characters from a string
def unquote(s: String) = s.drop(1).dropRight(1)
def get_all_URLs(page: String) : Set[String] =
http_pattern.findAllIn(page).map(unquote).toSet /*@\label{findallline}@*/
-// naive version of crawl - searches until a given depth,
-// visits pages potentially more than once
+// a very naive version of crawl - searches until a given
+// depth, visits pages potentially more than once
def crawl(url: String, n: Int) : Unit = {
if (n == 0) ()
else {
@@ -31,6 +33,7 @@
}
// some starting URLs for the crawler
+
val startURL = """https://nms.kcl.ac.uk/christian.urban/"""
//val startURL = """https://nms.kcl.ac.uk/luc.moreau/"""
--- a/progs/crawler2.scala Thu Apr 16 19:15:46 2020 +0100
+++ b/progs/crawler2.scala Wed May 06 15:37:31 2020 +0100
@@ -1,44 +1,38 @@
-// This version of the crawler only
-// checks links in the "domain" urbanc
+// This version of the crawler also
+// "harvests" email addresses from webpages
import io.Source
import scala.util.matching.Regex
import scala.util._
-// gets the first 10K of a web-page
def get_page(url: String) : String = {
- Try(Source.fromURL(url)("ISO-8859-1").take(10000).mkString).
+ Try(Source.fromURL(url)("ISO-8859-1").take(10000).mkString).
getOrElse { println(s" Problem with: $url"); ""}
}
-// regexes for URLs and "my" domain
+// regexes for URLs, for "my" domain and for email addresses
val http_pattern = """"https?://[^"]*"""".r
-val my_urls = """urban""".r /*@\label{myurlline}@*/
-//val my_urls = """kcl.ac.uk""".r
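+// the email pattern uses three groups: the part before the @,
+// the domain and a top-level domain of 2 to 6 characters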
+val email_pattern = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r /*@\label{emailline}@*/
def unquote(s: String) = s.drop(1).dropRight(1)
def get_all_URLs(page: String) : Set[String] =
http_pattern.findAllIn(page).map(unquote).toSet
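+
+// print a string, but only if it is not the empty string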
+def print_str(s: String) =
+ if (s == "") () else println(s)
+
def crawl(url: String, n: Int) : Unit = {
- if (n == 0) () /*@\label{changestartline}@*/
- else if (my_urls.findFirstIn(url) == None) {
- println(s"Visiting: $n $url")
- get_page(url); ()
- } /*@\label{changeendline}@*/
+ if (n == 0) ()
else {
- println(s"Visiting: $n $url")
- for (u <- get_all_URLs(get_page(url)).par) crawl(u, n - 1)
+ println(s" Visiting: $n $url")
+ val page = get_page(url)
+ print_str(email_pattern.findAllIn(page).mkString("\n")) /*@\label{mainline}@*/
+ for (u <- get_all_URLs(page).par) crawl(u, n - 1)
}
}
-// starting URL for the crawler
+// starting URL for the crawler
val startURL = """https://nms.kcl.ac.uk/christian.urban/"""
-//val startURL = """https://nms.kcl.ac.uk/christian.urban/bsc-projects-17.html"""
-
-// can now deal with depth 3 and beyond
crawl(startURL, 3)
-
-
--- a/progs/crawler3.scala Thu Apr 16 19:15:46 2020 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,39 +0,0 @@
-// This version of the crawler that also
-// "harvests" email addresses from webpages
-
-import io.Source
-import scala.util.matching.Regex
-import scala.util._
-
-def get_page(url: String) : String = {
- Try(Source.fromURL(url)("ISO-8859-1").take(10000).mkString).
- getOrElse { println(s" Problem with: $url"); ""}
-}
-
-// regexes for URLs, for "my" domain and for email addresses
-val http_pattern = """"https?://[^"]*"""".r
-val email_pattern = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r /*@\label{emailline}@*/
-
-def unquote(s: String) = s.drop(1).dropRight(1)
-
-def get_all_URLs(page: String) : Set[String] =
- http_pattern.findAllIn(page).map(unquote).toSet
-
-def print_str(s: String) =
- if (s == "") () else println(s)
-
-def crawl(url: String, n: Int) : Unit = {
- if (n == 0) ()
- else {
- println(s" Visiting: $n $url")
- val page = get_page(url)
- print_str(email_pattern.findAllIn(page).mkString("\n")) /*@\label{mainline}@*/
- for (u <- get_all_URLs(page).par) crawl(u, n - 1)
- }
-}
-
-// staring URL for the crawler
-val startURL = """https://nms.kcl.ac.uk/christian.urban/"""
-
-
-crawl(startURL, 3)
--- a/slides/slides01.tex Thu Apr 16 19:15:46 2020 +0100
+++ b/slides/slides01.tex Wed May 06 15:37:31 2020 +0100
@@ -428,7 +428,7 @@
\begin{flushright}
\mbox{}\\[-6mm]
{\footnotesize\textcolor{gray}{(she made it to David Letterman's Tonight Show,\\[-2mm]
- \url{http://www.youtube.com/watch?v=aZOxtURhfEU})}}
+ \url{https://youtu.be/3N_ywhx6_K0?t=31})}}
\end{flushright}
\end{frame}
--- a/slides/slides04.tex Thu Apr 16 19:15:46 2020 +0100
+++ b/slides/slides04.tex Wed May 06 15:37:31 2020 +0100
@@ -9,6 +9,12 @@
\pgfplotsset{compat=1.11}
+
+% a hand written lexer for SML
+% https://ponyo.org/
+% https://github.com/eatonphil/ponyo/blob/master/src/Sml/Lexer.sml
+
+
\newcommand{\bl}[1]{\textcolor{blue}{#1}}
\renewcommand{\slidecaption}{CFL 04, King's College London}