# HG changeset patch
# User Christian Urban <urbanc@in.tum.de>
# Date 1474370709 -3600
# Node ID e3acf2bf38954d504123bcb10c01e8eca6851b51
# Parent  5deefcc8cffa06ce02a832264d7379f1d6faec3e# Parent  7a04f2c532c1c70d490aa43e5b767ab520f2303c
merged

diff -r 5deefcc8cffa -r e3acf2bf3895 coursework/cw01.pdf
Binary file coursework/cw01.pdf has changed
diff -r 5deefcc8cffa -r e3acf2bf3895 coursework/cw01.tex
--- a/coursework/cw01.tex	Tue Sep 20 12:24:29 2016 +0100
+++ b/coursework/cw01.tex	Tue Sep 20 12:25:09 2016 +0100
@@ -6,13 +6,14 @@
 
 \section*{Coursework 1 (Strand 1)}
 
-This coursework is worth 5\% and is due on 20 October at
+This coursework is worth 4\% and is due on 20 October at
 16:00. You are asked to implement a regular expression matcher
 and submit a document containing the answers for the questions
 below. You can do the implementation in any programming
 language you like, but you need to submit the source code with
 which you answered the questions, otherwise a mark of 0\% will
 be awarded. You can submit your answers in a txt-file or pdf.
+Code send as code.
 
 
 
@@ -21,17 +22,17 @@
 It should be understood that the work you submit represents
 your own effort. You have not copied from anyone else. An
 exception is the Scala code I showed during the lectures or
-uploaded to KEATS, which you can use.\bigskip
+uploaded to KEATS, which you can freely use.\bigskip
 
 
-\subsubsection*{Tasks}
+\subsubsection*{Task}
 
 The task is to implement a regular expression matcher based on
-derivatives. The implementation should be able to deal with the usual
-(basic) regular expressions
+derivatives of regular expressions. The implementation should
+be able to deal with the usual (basic) regular expressions
 
 \[
-\varnothing, \epsilon, c, r_1 + r_2, r_1 \cdot r_2, r^*
+\ZERO,\; \ONE,\; c,\; r_1 + r_2,\; r_1 \cdot r_2,\; r^*
 \]
 
 \noindent
@@ -47,9 +48,9 @@
 \end{tabular}
 \end{center}
 
-\noindent In the case of $r^{\{n,m\}}$ we have the convention
-that $0 \le n \le m$. The meanings of the extended regular
-expressions are
+\noindent In the case of $r^{\{n,m\}}$ you can assume the
+convention that $0 \le n \le m$. The meanings of the extended
+regular expressions are
 
 \begin{center}
 \begin{tabular}{r@{\hspace{2mm}}c@{\hspace{2mm}}l}
@@ -64,7 +65,7 @@
 \noindent whereby in the last clause the set $\Sigma^*$ stands
 for the set of \emph{all} strings over the alphabet $\Sigma$
 (in the implementation the alphabet can be just what is
-represented by, say, the type \pcode{char}). So $\sim{}r$
+represented by, say, the type \pcode{Char}). So $\sim{}r$
 means `all the strings that $r$ cannot match'. 
 
 Be careful that your implementation of \textit{nullable} and
@@ -86,12 +87,12 @@
 expressions.
 
 
-\subsection*{Question 1 (unmarked)}
+\subsection*{Question 1}
 
 What is your King's email address (you will need it in
 Question 3)?
 
-\subsection*{Question 2 (marked with 2\%)}
+\subsection*{Question 2}
 
 This question does not require any implementation. From the
 lectures you have seen the definitions for the functions
@@ -122,7 +123,7 @@
 \item $L(der\,c\,r)) = Der\,c\,(L(r))$
 \end{itemize}
 
-\subsection*{Question 3 (marked with 1\%)}
+\subsection*{Question 3}
 
 Implement the following regular expression for email addresses
 
@@ -132,8 +133,8 @@
 
 \noindent and calculate the derivative according to your email
 address. When calculating the derivative, simplify all regular
-expressions as much as possible, but at least apply the
-following six simplification rules:
+expressions as much as possible by applying the
+following 7 simplification rules:
 
 \begin{center}
 \begin{tabular}{l@{\hspace{2mm}}c@{\hspace{2mm}}ll}
@@ -147,13 +148,12 @@
 \end{tabular}
 \end{center}
 
-\noindent
-Write down your simplified derivative in the ``mathematicical''
-notation using parentheses where necessary. That means you should
-use the more readable infix notation $+$, $\cdot$ and $^*$, 
+\noindent Write down your simplified derivative in a readable
+notation using parentheses where necessary. That means you
+should use the infix notation $+$, $\cdot$, $^*$ and so on,
 instead of code.
  
-\subsection*{Question 4 (marked with 1\%)}
+\subsection*{Question 4}
 
 Suppose \textit{[a-z]} stands for the range regular expression
 $[a,b,c,\ldots,z]$.  Consider the regular expression $/ \cdot * \cdot
@@ -168,7 +168,22 @@
 \item \texttt{"/*test/*test*/"}
 \end{enumerate}
 
-\subsection*{Question 5 (marked with 1\%)}
+\noindent
+Also test your regular expression matcher with the regular
+expression $a^{\{3,5\}}$ and the strings
+
+\begin{enumerate}
+\setcounter{enumi}{4}
+\item \texttt{aa}
+\item \texttt{aaa}
+\item \texttt{aaaaa}
+\item \texttt{aaaaaa}
+\end{enumerate}
+
+\noindent
+Does your matcher produce the expected results?
+
+\subsection*{Question 5}
 
 Let $r_1$ be the regular expression $a\cdot a\cdot a$ and $r_2$ be
 $(a^{\{19,19\}}) \cdot (a^?)$.  Decide whether the following three
diff -r 5deefcc8cffa -r e3acf2bf3895 coursework/cw02.pdf
Binary file coursework/cw02.pdf has changed
diff -r 5deefcc8cffa -r e3acf2bf3895 coursework/cw02.tex
--- a/coursework/cw02.tex	Tue Sep 20 12:24:29 2016 +0100
+++ b/coursework/cw02.tex	Tue Sep 20 12:25:09 2016 +0100
@@ -6,25 +6,26 @@
 
 \section*{Coursework 2 (Strand 1)}
 
-\noindent This coursework is worth 5\% and is due on 6
+\noindent This coursework is worth 4\% and is due on 3
 November at 16:00. You are asked to implement the Sulzmann \&
-Lu tokeniser for the WHILE language. You can do the
+Lu lexer for the WHILE language. You can do the
 implementation in any programming language you like, but you
 need to submit the source code with which you answered the
 questions, otherwise a mark of 0\% will be awarded. You can
-submit your answers in a txt-file or as pdf.
+submit your answers in a txt-file or as pdf. Code submit as 
+code.
 
 \subsection*{Disclaimer}
 
 It should be understood that the work you submit represents
 your own effort. You have not copied from anyone else. An
 exception is the Scala code from KEATS and the code I showed
-during the lectures, which you can both use. You can also use
-your own code from the CW~1.
+during the lectures, which you can both freely use. You can
+also use your own code from the CW~1.
 
-\subsection*{Question 1 (marked with 1\%)}
+\subsection*{Question 1}
 
-To implement a tokeniser for the WHILE language, you first
+To implement a lexer for the WHILE language, you first
 need to design the appropriate regular expressions for the
 following eight syntactic entities:
 
@@ -78,7 +79,7 @@
 You can use the basic regular expressions 
 
 \[
-\varnothing, \epsilon, c, r_1 + r_2, r_1 \cdot r_2, r^*
+\ZERO,\; \ONE,\; c,\; r_1 + r_2,\; r_1 \cdot r_2,\; r^*
 \]
 
 \noindent
@@ -97,9 +98,9 @@
 small as possible. For example you should use character ranges
 for identifiers and numbers.
 
-\subsection*{Question 2 (marked with 3\%)}
+\subsection*{Question 2}
 
-Implement the Sulzmann \& Lu tokeniser from the lectures. For
+Implement the Sulzmann \& Lu lexer from the lectures. For
 this you need to implement the functions $nullable$ and $der$
 (you can use your code from CW~1), as well as $mkeps$ and
 $inj$. These functions need to be appropriately extended for
@@ -138,7 +139,7 @@
 
 
 Also add the record regular expression from the
-lectures to your tokeniser and implement a function, say
+lectures to your lexer and implement a function, say
 \pcode{env}, that returns all assignments from a value (such
 that you can extract easily the tokens from a value).\medskip 
 
@@ -154,11 +155,11 @@
 and use your \pcode{env} function to give the token sequence.
 
 
-\subsection*{Question 3 (marked with 1\%)}
+\subsection*{Question 3}
 
-Extend your tokenizer from Q2 to also simplify regular expressions
+Extend your lexer from Q2 to also simplify regular expressions
 after each derivation step and rectify the computed values after each
-injection. Use this tokenizer to tokenize the programs in
+injection. Use this lexer to tokenize the programs in
 Figure~\ref{fib} and \ref{loop}. Give the tokens of these
 programs where whitespaces are filtered out.
 
diff -r 5deefcc8cffa -r e3acf2bf3895 coursework/cw03.pdf
Binary file coursework/cw03.pdf has changed
diff -r 5deefcc8cffa -r e3acf2bf3895 coursework/cw03.tex
--- a/coursework/cw03.tex	Tue Sep 20 12:24:29 2016 +0100
+++ b/coursework/cw03.tex	Tue Sep 20 12:25:09 2016 +0100
@@ -4,9 +4,9 @@
 
 \begin{document}
 
-\section*{Coursework 3 (Strand 1)}
+\section*{Coursework 3}
 
-\noindent This coursework is worth 5\% and is due on 27
+\noindent This coursework is worth 5\% and is due on 24
 November at 16:00. You are asked to implement a parser for the
 WHILE language and also an interpreter. You can do the
 implementation in any programming language you like, but you
@@ -23,9 +23,9 @@
 CW~1 and CW~2.
 
 
-\subsection*{Question 1 (marked with 1\%)}
+\subsection*{Question 1}
 
-Design a grammar for the WHILE language and give the grammar 
+Design a grammar for the WHILE language and give the grammar
 rules. The main categories of non-terminal should be:
 
 \begin{itemize}
@@ -39,12 +39,12 @@
 \item blocks which are enclosed in curly parentheses
 \end{itemize}
 
-\subsection*{Question 2 (marked with 2\%)}
+\subsection*{Question 2}
 
 You should implement a parser for the WHILE language using
 parser combinators. Be careful that the parser takes as input
 a stream, or list, of tokens generated by the tokenizer from
-the previous coursework. For this you might filter out 
+the previous coursework. For this you might want to filter out 
 whitespaces and comments. Your parser should be able to handle
 the WHILE programs in Figures~\ref{fib} and \ref{loop}.
 In addition give the parse tree for the statement:
@@ -81,11 +81,11 @@
 \caption{The datatype for parse trees in Scala.\label{trees}}
 \end{figure}
 
-\subsection*{Question 3 (marked with 2\%)}
+\subsection*{Question 3}
 
 Implement an interpreter for the WHILE language you designed
 and parsed in Question 1 and 2. This interpreter should take
-as input a parse tree. However be careful because programs
+as input a parse tree. However be careful because, programs
 contain variables and variable assignments. This means
 you need to maintain a kind of memory, or environment,
 where you can look up a value of a variable and also
diff -r 5deefcc8cffa -r e3acf2bf3895 coursework/cw04.pdf
Binary file coursework/cw04.pdf has changed
diff -r 5deefcc8cffa -r e3acf2bf3895 coursework/cw04.tex
--- a/coursework/cw04.tex	Tue Sep 20 12:24:29 2016 +0100
+++ b/coursework/cw04.tex	Tue Sep 20 12:25:09 2016 +0100
@@ -9,7 +9,7 @@
 
 \section*{Coursework 4 (Strand 1)}
 
-\noindent This coursework is worth 10\% and is due on 11
+\noindent This coursework is worth 5\% and is due on 13
 December at 16:00. You are asked to implement a compiler for
 the WHILE language that targets the assembler language
 provided by Jasmin or Krakatau (both have very similar
@@ -124,7 +124,7 @@
 %submit your answers in a txt-file or as pdf.\bigskip
 
 
-\subsection*{Question 1 (marked with 5\%)}
+\subsection*{Question 1}
 
 You need to lex and parse WHILE programs, and then generate
 Java Byte Code instructions for the Jasmin assembler (or
@@ -141,7 +141,7 @@
 \caption{The Fibonacci program in the WHILE language.\label{fibs}}
 \end{figure}
 
-\subsection*{Question 2 (marked with 4\%)}
+\subsection*{Question 2}
 
 Extend the syntax of your language so that it contains also
 \texttt{for}-loops, like
diff -r 5deefcc8cffa -r e3acf2bf3895 coursework/cw05.pdf
Binary file coursework/cw05.pdf has changed
diff -r 5deefcc8cffa -r e3acf2bf3895 coursework/cw05.tex
--- a/coursework/cw05.tex	Tue Sep 20 12:24:29 2016 +0100
+++ b/coursework/cw05.tex	Tue Sep 20 12:25:09 2016 +0100
@@ -6,13 +6,11 @@
 
 \section*{Coursework (Strand 2)}
 
-\noindent This coursework is worth 25\% and is due on 11
-December at 16:00. You are asked to prove the correctness of 
-the
-regular expression matcher from the lectures using the
-Isabelle theorem prover. You need to submit a theory file
-containing this proof. The Isabelle theorem prover is
-available from 
+\noindent This coursework is worth 20\% and is due on 13 December at
+16:00. You are asked to prove the correctness of the regular
+expression matcher from the lectures using the Isabelle theorem
+prover. You need to submit a theory file containing this proof. The
+Isabelle theorem prover is available from
 
 \begin{center}
 \url{http://isabelle.in.tum.de}
@@ -47,8 +45,8 @@
 
 \noindent If you need more help or you are stuck somewhere,
 please feel free to contact me (christian.urban@kcl.ac.uk). I
-am a main developer of Isabelle and have used it for
-approximately the 14 years. One of the success stories of
+am one of the main developers of Isabelle and have used it for
+approximately the 16 years. One of the success stories of
 Isabelle is the recent verification of a microkernel operating
 system by an Australian group, see \url{http://sel4.systems}.
 Their operating system is the only one that has been proved
@@ -73,8 +71,8 @@
 
 \begin{lstlisting}[language={},numbers=none]
 datatype rexp =
-  NULL
-| EMPTY
+  ZERO
+| ONE
 | CHAR char
 | SEQ rexp rexp
 | ALT rexp rexp
@@ -86,8 +84,8 @@
 
 \begin{center}
 \begin{tabular}{rcl@{\hspace{10mm}}l}
-$L(\varnothing)$  & $\dn$ & $\varnothing$   & \pcode{NULL}\\
-$L(\epsilon)$     & $\dn$ & $\{[]\}$        & \pcode{EMPTY}\\ 
+$L(\ZERO)$  & $\dn$ & $\varnothing$   & \pcode{ZERO}\\
+$L(\ONE)$     & $\dn$ & $\{[]\}$        & \pcode{ONE}\\ 
 $L(c)$            & $\dn$ & $\{[c]\}$       & \pcode{CHAR}\\
 $L(r_1 + r_2)$     & $\dn$ & $L(r_1) \cup L(r_2)$ & \pcode{ALT}\\
 $L(r_1 \cdot r_2)$ & $\dn$ & $L(r_1) \,@\, L(r_2)$ & \pcode{SEQ}\\
@@ -136,8 +134,8 @@
 fun 
   nullable :: "rexp $\Rightarrow$ bool"
 where
-  "nullable NULL = False"
-| "nullable EMPTY = True"
+  "nullable ZERO = False"
+| "nullable ONE = True"
 | "nullable (CHAR _) = False"
 | "nullable (ALT r1 r2) = (nullable(r1) $\vee$ nullable(r2))"
 | "nullable (SEQ r1 r2) = (nullable(r1) $\wedge$ nullable(r2))"
@@ -146,9 +144,9 @@
 fun 
   der :: "char $\Rightarrow$ rexp $\Rightarrow$ rexp"
 where
-  "der c NULL = NULL"
-| "der c EMPTY = NULL"
-| "der c (CHAR d) = (if c = d then EMPTY else NULL)"
+  "der c ZERO = ZERO"
+| "der c ONE = ZERO"
+| "der c (CHAR d) = (if c = d then ONE else ZERO)"
 | "der c (ALT r1 r2) = ALT (der c r1) (der c r2)"
 | "der c (SEQ r1 r2) = 
      (if (nullable r1) then ALT (SEQ (der c r1) r2) (der c r2)
@@ -175,8 +173,8 @@
 fun 
   zeroable :: "rexp $\Rightarrow$ bool"
 where
-  "zeroable NULL = True"
-| "zeroable EMPTY = False"
+  "zeroable ZERO = True"
+| "zeroable ONE = False"
 | "zeroable (CHAR _) = False"
 | "zeroable (ALT r1 r2) = (zeroable(r1) $\wedge$ zeroable(r2))"
 | "zeroable (SEQ r1 r2) = (zeroable(r1) $\vee$ zeroable(r2))"
@@ -185,13 +183,13 @@
 lemma
   "zeroable r $\longleftrightarrow$ L r = {}"
 proof (induct)
-  case (NULL)
-  have "zeroable NULL" "L NULL = {}" by simp_all
-  then show "zeroable NULL $\longleftrightarrow$ (L NULL = {})" by simp
+  case (ZERO)
+  have "zeroable ZERO" "L ZERO = {}" by simp_all
+  then show "zeroable ZERO $\longleftrightarrow$ (L ZERO = {})" by simp
 next
-  case (EMPTY)
-  have "$\neg$ zeroable EMPTY" "L EMPTY = {[]}" by simp_all
-  then show "zeroable EMPTY $\longleftrightarrow$ (L EMPTY = {})" by simp
+  case (ONE)
+  have "$\neg$ zeroable ONE" "L ONE = {[]}" by simp_all
+  then show "zeroable ONE $\longleftrightarrow$ (L ONE = {})" by simp
 next
   case (CHAR c)
   have "$\neg$ zeroable (CHAR c)" "L (CHAR c) = {[c]}" by simp_all
diff -r 5deefcc8cffa -r e3acf2bf3895 handouts/ho01.tex
--- a/handouts/ho01.tex	Tue Sep 20 12:24:29 2016 +0100
+++ b/handouts/ho01.tex	Tue Sep 20 12:25:09 2016 +0100
@@ -27,6 +27,27 @@
 %%  reasons for a new prgramming language
 %% http://beautifulracket.com
 
+%regher
+% We can start off with a couple of observations about the role of
+% compilers. First, hardware is getting weirder rather than getting
+% clocked faster: almost all processors are multicores and it looks
+% like there is increasing asymmetry in resources across
+% cores. Processors come with vector units, crypto accelerators, bit
+% twiddling instructions, and lots of features to make virtualization
+% and concurrency work. We have DSPs, GPUs, big.little, and Xeon
+% Phi. This is only scratching the surface. Second, we’re getting
+% tired of low-level languages and their associated security
+% disasters, we want to write new code, to whatever extent possible,
+% in safer, higher-level languages. Compilers are caught right in the
+% middle of these opposing trends: one of their main jobs is to help
+% bridge the large and growing gap between increasingly high-level
+% languages and increasingly wacky platforms. It’s effectively a
+% perpetual employment act for solid compiler hackers.
+
+% compiler explorer
+% https://gcc.godbolt.org
+
+
 \begin{document}
 \fnote{\copyright{} Christian Urban, King's College London, 2014, 2015, 2016}
 
diff -r 5deefcc8cffa -r e3acf2bf3895 progs/catastrophic.java
--- a/progs/catastrophic.java	Tue Sep 20 12:24:29 2016 +0100
+++ b/progs/catastrophic.java	Tue Sep 20 12:25:09 2016 +0100
@@ -1,14 +1,21 @@
+// a case of catastrophic backtracking in Java
+//
+// regexp: (a*)*b
+// strings: aa....
+
 import java.util.regex.*;
 
 public class catastrophic {
     public static void main(String[] args) {
+
+        //we always run all the tests twice -> warmup of the JVM
         for (int runs = 0; runs < 2; runs++) {
-            //Pattern pattern = Pattern.compile("(0*)*A");
-            //Pattern pattern = Pattern.compile("a?{20}a{20}");
-
-            // Run from 5 to 25 characters
-            for (int length = 5; length < 30; length++) {
-                Pattern pattern = Pattern.compile("(a*)*b");
+            
+            Pattern pattern = Pattern.compile("(a*)*b");
+            
+            // Run from 5 to 28 characters
+            for (int length = 5; length < 28; length++) {
+                
                 // Build input of specified length
                 String input = "";
                 for (int i = 0; i < length; i++) { input += "a"; }
@@ -18,10 +25,11 @@
                 for (int i = 0; i < 2; i++) {
                     pattern.matcher(input).find();
                 }
+                
                 System.out.println(length + " " + input + ": " 
                        + ((System.nanoTime() - start) / 2000000000d) 
                        + "s");
             }
         }
     }
-} 
\ No newline at end of file
+} 
diff -r 5deefcc8cffa -r e3acf2bf3895 progs/catastrophic.py
--- a/progs/catastrophic.py	Tue Sep 20 12:24:29 2016 +0100
+++ b/progs/catastrophic.py	Tue Sep 20 12:25:09 2016 +0100
@@ -2,11 +2,23 @@
 import re
 import sys
 
+# case of catastrophic backtracking in Python
+#
+# regex: (a?){n} a{n}
+# strings: aa...
+#
+# call with timing as:
+#
+#   > time ./catastrophic.py 20
+
+# counter n given on the command line
 cn = sys.argv[1]
 
+# constructing the regex
 r1 = '((a?){%s})' % cn
 r2 = 'a{%s}' % cn
 
+# calling the matching function
 m = re.match(r1 + r2 , "a" * int(cn)) 
 
 print m.group(0)
diff -r 5deefcc8cffa -r e3acf2bf3895 progs/catastrophic.rb
--- a/progs/catastrophic.rb	Tue Sep 20 12:24:29 2016 +0100
+++ b/progs/catastrophic.rb	Tue Sep 20 12:25:09 2016 +0100
@@ -1,4 +1,4 @@
-# provided by Daniel Baldwin
+# example provided by Daniel Baldwin
 
 nums = (1..1000)
 
diff -r 5deefcc8cffa -r e3acf2bf3895 progs/crawler1.scala
--- a/progs/crawler1.scala	Tue Sep 20 12:24:29 2016 +0100
+++ b/progs/crawler1.scala	Tue Sep 20 12:25:09 2016 +0100
@@ -32,8 +32,8 @@
 }
 
 // some starting URLs for the crawler
-//val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc"""
-val startURL = """http://www.inf.kcl.ac.uk/staff/mcburney"""
+val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc"""
+//val startURL = """http://www.inf.kcl.ac.uk/staff/mcburney"""
 
 crawl(startURL, 2)
 
diff -r 5deefcc8cffa -r e3acf2bf3895 progs/crawler2.scala
--- a/progs/crawler2.scala	Tue Sep 20 12:24:29 2016 +0100
+++ b/progs/crawler2.scala	Tue Sep 20 12:25:09 2016 +0100
@@ -13,7 +13,7 @@
 
 // regexes for URLs and "my" domain
 val http_pattern = """"https?://[^"]*"""".r
-val my_urls = """urbanc""".r (*@\label{myurlline}@*) 
+val my_urls = """urbanc""".r       (*@\label{myurlline}@*) 
 
 def unquote(s: String) = s.drop(1).dropRight(1)
 
@@ -21,11 +21,11 @@
   http_pattern.findAllIn(page).map(unquote).toSet
 
 def crawl(url: String, n: Int) : Unit = {
-  if (n == 0) () (*@\label{changestartline}@*) 
+  if (n == 0) ()                   (*@\label{changestartline}@*) 
   else if (my_urls.findFirstIn(url) == None) { 
     println(s"Visiting: $n $url")
     get_page(url); () 
-  } (*@\label{changeendline}@*) 
+  }                                (*@\label{changeendline}@*) 
   else {
     println(s"Visiting: $n $url")
     for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1)