# HG changeset patch # User Christian Urban # Date 1474370709 -3600 # Node ID e3acf2bf38954d504123bcb10c01e8eca6851b51 # Parent 5deefcc8cffa06ce02a832264d7379f1d6faec3e# Parent 7a04f2c532c1c70d490aa43e5b767ab520f2303c merged diff -r 5deefcc8cffa -r e3acf2bf3895 coursework/cw01.pdf Binary file coursework/cw01.pdf has changed diff -r 5deefcc8cffa -r e3acf2bf3895 coursework/cw01.tex --- a/coursework/cw01.tex Tue Sep 20 12:24:29 2016 +0100 +++ b/coursework/cw01.tex Tue Sep 20 12:25:09 2016 +0100 @@ -6,13 +6,14 @@ \section*{Coursework 1 (Strand 1)} -This coursework is worth 5\% and is due on 20 October at +This coursework is worth 4\% and is due on 20 October at 16:00. You are asked to implement a regular expression matcher and submit a document containing the answers for the questions below. You can do the implementation in any programming language you like, but you need to submit the source code with which you answered the questions, otherwise a mark of 0\% will be awarded. You can submit your answers in a txt-file or pdf. +Code send as code. @@ -21,17 +22,17 @@ It should be understood that the work you submit represents your own effort. You have not copied from anyone else. An exception is the Scala code I showed during the lectures or -uploaded to KEATS, which you can use.\bigskip +uploaded to KEATS, which you can freely use.\bigskip -\subsubsection*{Tasks} +\subsubsection*{Task} The task is to implement a regular expression matcher based on -derivatives. The implementation should be able to deal with the usual -(basic) regular expressions +derivatives of regular expressions. The implementation should +be able to deal with the usual (basic) regular expressions \[ -\varnothing, \epsilon, c, r_1 + r_2, r_1 \cdot r_2, r^* +\ZERO,\; \ONE,\; c,\; r_1 + r_2,\; r_1 \cdot r_2,\; r^* \] \noindent @@ -47,9 +48,9 @@ \end{tabular} \end{center} -\noindent In the case of $r^{\{n,m\}}$ we have the convention -that $0 \le n \le m$. The meanings of the extended regular -expressions are +\noindent In the case of $r^{\{n,m\}}$ you can assume the +convention that $0 \le n \le m$. The meanings of the extended +regular expressions are \begin{center} \begin{tabular}{r@{\hspace{2mm}}c@{\hspace{2mm}}l} @@ -64,7 +65,7 @@ \noindent whereby in the last clause the set $\Sigma^*$ stands for the set of \emph{all} strings over the alphabet $\Sigma$ (in the implementation the alphabet can be just what is -represented by, say, the type \pcode{char}). So $\sim{}r$ +represented by, say, the type \pcode{Char}). So $\sim{}r$ means `all the strings that $r$ cannot match'. Be careful that your implementation of \textit{nullable} and @@ -86,12 +87,12 @@ expressions. -\subsection*{Question 1 (unmarked)} +\subsection*{Question 1} What is your King's email address (you will need it in Question 3)? -\subsection*{Question 2 (marked with 2\%)} +\subsection*{Question 2} This question does not require any implementation. From the lectures you have seen the definitions for the functions @@ -122,7 +123,7 @@ \item $L(der\,c\,r)) = Der\,c\,(L(r))$ \end{itemize} -\subsection*{Question 3 (marked with 1\%)} +\subsection*{Question 3} Implement the following regular expression for email addresses @@ -132,8 +133,8 @@ \noindent and calculate the derivative according to your email address. When calculating the derivative, simplify all regular -expressions as much as possible, but at least apply the -following six simplification rules: +expressions as much as possible by applying the +following 7 simplification rules: \begin{center} \begin{tabular}{l@{\hspace{2mm}}c@{\hspace{2mm}}ll} @@ -147,13 +148,12 @@ \end{tabular} \end{center} -\noindent -Write down your simplified derivative in the ``mathematicical'' -notation using parentheses where necessary. That means you should -use the more readable infix notation $+$, $\cdot$ and $^*$, +\noindent Write down your simplified derivative in a readable +notation using parentheses where necessary. That means you +should use the infix notation $+$, $\cdot$, $^*$ and so on, instead of code. -\subsection*{Question 4 (marked with 1\%)} +\subsection*{Question 4} Suppose \textit{[a-z]} stands for the range regular expression $[a,b,c,\ldots,z]$. Consider the regular expression $/ \cdot * \cdot @@ -168,7 +168,22 @@ \item \texttt{"/*test/*test*/"} \end{enumerate} -\subsection*{Question 5 (marked with 1\%)} +\noindent +Also test your regular expression matcher with the regular +expression $a^{\{3,5\}}$ and the strings + +\begin{enumerate} +\setcounter{enumi}{4} +\item \texttt{aa} +\item \texttt{aaa} +\item \texttt{aaaaa} +\item \texttt{aaaaaa} +\end{enumerate} + +\noindent +Does your matcher produce the expected results? + +\subsection*{Question 5} Let $r_1$ be the regular expression $a\cdot a\cdot a$ and $r_2$ be $(a^{\{19,19\}}) \cdot (a^?)$. Decide whether the following three diff -r 5deefcc8cffa -r e3acf2bf3895 coursework/cw02.pdf Binary file coursework/cw02.pdf has changed diff -r 5deefcc8cffa -r e3acf2bf3895 coursework/cw02.tex --- a/coursework/cw02.tex Tue Sep 20 12:24:29 2016 +0100 +++ b/coursework/cw02.tex Tue Sep 20 12:25:09 2016 +0100 @@ -6,25 +6,26 @@ \section*{Coursework 2 (Strand 1)} -\noindent This coursework is worth 5\% and is due on 6 +\noindent This coursework is worth 4\% and is due on 3 November at 16:00. You are asked to implement the Sulzmann \& -Lu tokeniser for the WHILE language. You can do the +Lu lexer for the WHILE language. You can do the implementation in any programming language you like, but you need to submit the source code with which you answered the questions, otherwise a mark of 0\% will be awarded. You can -submit your answers in a txt-file or as pdf. +submit your answers in a txt-file or as pdf. Code submit as +code. \subsection*{Disclaimer} It should be understood that the work you submit represents your own effort. You have not copied from anyone else. An exception is the Scala code from KEATS and the code I showed -during the lectures, which you can both use. You can also use -your own code from the CW~1. +during the lectures, which you can both freely use. You can +also use your own code from the CW~1. -\subsection*{Question 1 (marked with 1\%)} +\subsection*{Question 1} -To implement a tokeniser for the WHILE language, you first +To implement a lexer for the WHILE language, you first need to design the appropriate regular expressions for the following eight syntactic entities: @@ -78,7 +79,7 @@ You can use the basic regular expressions \[ -\varnothing, \epsilon, c, r_1 + r_2, r_1 \cdot r_2, r^* +\ZERO,\; \ONE,\; c,\; r_1 + r_2,\; r_1 \cdot r_2,\; r^* \] \noindent @@ -97,9 +98,9 @@ small as possible. For example you should use character ranges for identifiers and numbers. -\subsection*{Question 2 (marked with 3\%)} +\subsection*{Question 2} -Implement the Sulzmann \& Lu tokeniser from the lectures. For +Implement the Sulzmann \& Lu lexer from the lectures. For this you need to implement the functions $nullable$ and $der$ (you can use your code from CW~1), as well as $mkeps$ and $inj$. These functions need to be appropriately extended for @@ -138,7 +139,7 @@ Also add the record regular expression from the -lectures to your tokeniser and implement a function, say +lectures to your lexer and implement a function, say \pcode{env}, that returns all assignments from a value (such that you can extract easily the tokens from a value).\medskip @@ -154,11 +155,11 @@ and use your \pcode{env} function to give the token sequence. -\subsection*{Question 3 (marked with 1\%)} +\subsection*{Question 3} -Extend your tokenizer from Q2 to also simplify regular expressions +Extend your lexer from Q2 to also simplify regular expressions after each derivation step and rectify the computed values after each -injection. Use this tokenizer to tokenize the programs in +injection. Use this lexer to tokenize the programs in Figure~\ref{fib} and \ref{loop}. Give the tokens of these programs where whitespaces are filtered out. diff -r 5deefcc8cffa -r e3acf2bf3895 coursework/cw03.pdf Binary file coursework/cw03.pdf has changed diff -r 5deefcc8cffa -r e3acf2bf3895 coursework/cw03.tex --- a/coursework/cw03.tex Tue Sep 20 12:24:29 2016 +0100 +++ b/coursework/cw03.tex Tue Sep 20 12:25:09 2016 +0100 @@ -4,9 +4,9 @@ \begin{document} -\section*{Coursework 3 (Strand 1)} +\section*{Coursework 3} -\noindent This coursework is worth 5\% and is due on 27 +\noindent This coursework is worth 5\% and is due on 24 November at 16:00. You are asked to implement a parser for the WHILE language and also an interpreter. You can do the implementation in any programming language you like, but you @@ -23,9 +23,9 @@ CW~1 and CW~2. -\subsection*{Question 1 (marked with 1\%)} +\subsection*{Question 1} -Design a grammar for the WHILE language and give the grammar +Design a grammar for the WHILE language and give the grammar rules. The main categories of non-terminal should be: \begin{itemize} @@ -39,12 +39,12 @@ \item blocks which are enclosed in curly parentheses \end{itemize} -\subsection*{Question 2 (marked with 2\%)} +\subsection*{Question 2} You should implement a parser for the WHILE language using parser combinators. Be careful that the parser takes as input a stream, or list, of tokens generated by the tokenizer from -the previous coursework. For this you might filter out +the previous coursework. For this you might want to filter out whitespaces and comments. Your parser should be able to handle the WHILE programs in Figures~\ref{fib} and \ref{loop}. In addition give the parse tree for the statement: @@ -81,11 +81,11 @@ \caption{The datatype for parse trees in Scala.\label{trees}} \end{figure} -\subsection*{Question 3 (marked with 2\%)} +\subsection*{Question 3} Implement an interpreter for the WHILE language you designed and parsed in Question 1 and 2. This interpreter should take -as input a parse tree. However be careful because programs +as input a parse tree. However be careful because, programs contain variables and variable assignments. This means you need to maintain a kind of memory, or environment, where you can look up a value of a variable and also diff -r 5deefcc8cffa -r e3acf2bf3895 coursework/cw04.pdf Binary file coursework/cw04.pdf has changed diff -r 5deefcc8cffa -r e3acf2bf3895 coursework/cw04.tex --- a/coursework/cw04.tex Tue Sep 20 12:24:29 2016 +0100 +++ b/coursework/cw04.tex Tue Sep 20 12:25:09 2016 +0100 @@ -9,7 +9,7 @@ \section*{Coursework 4 (Strand 1)} -\noindent This coursework is worth 10\% and is due on 11 +\noindent This coursework is worth 5\% and is due on 13 December at 16:00. You are asked to implement a compiler for the WHILE language that targets the assembler language provided by Jasmin or Krakatau (both have very similar @@ -124,7 +124,7 @@ %submit your answers in a txt-file or as pdf.\bigskip -\subsection*{Question 1 (marked with 5\%)} +\subsection*{Question 1} You need to lex and parse WHILE programs, and then generate Java Byte Code instructions for the Jasmin assembler (or @@ -141,7 +141,7 @@ \caption{The Fibonacci program in the WHILE language.\label{fibs}} \end{figure} -\subsection*{Question 2 (marked with 4\%)} +\subsection*{Question 2} Extend the syntax of your language so that it contains also \texttt{for}-loops, like diff -r 5deefcc8cffa -r e3acf2bf3895 coursework/cw05.pdf Binary file coursework/cw05.pdf has changed diff -r 5deefcc8cffa -r e3acf2bf3895 coursework/cw05.tex --- a/coursework/cw05.tex Tue Sep 20 12:24:29 2016 +0100 +++ b/coursework/cw05.tex Tue Sep 20 12:25:09 2016 +0100 @@ -6,13 +6,11 @@ \section*{Coursework (Strand 2)} -\noindent This coursework is worth 25\% and is due on 11 -December at 16:00. You are asked to prove the correctness of -the -regular expression matcher from the lectures using the -Isabelle theorem prover. You need to submit a theory file -containing this proof. The Isabelle theorem prover is -available from +\noindent This coursework is worth 20\% and is due on 13 December at +16:00. You are asked to prove the correctness of the regular +expression matcher from the lectures using the Isabelle theorem +prover. You need to submit a theory file containing this proof. The +Isabelle theorem prover is available from \begin{center} \url{http://isabelle.in.tum.de} @@ -47,8 +45,8 @@ \noindent If you need more help or you are stuck somewhere, please feel free to contact me (christian.urban@kcl.ac.uk). I -am a main developer of Isabelle and have used it for -approximately the 14 years. One of the success stories of +am one of the main developers of Isabelle and have used it for +approximately the 16 years. One of the success stories of Isabelle is the recent verification of a microkernel operating system by an Australian group, see \url{http://sel4.systems}. Their operating system is the only one that has been proved @@ -73,8 +71,8 @@ \begin{lstlisting}[language={},numbers=none] datatype rexp = - NULL -| EMPTY + ZERO +| ONE | CHAR char | SEQ rexp rexp | ALT rexp rexp @@ -86,8 +84,8 @@ \begin{center} \begin{tabular}{rcl@{\hspace{10mm}}l} -$L(\varnothing)$ & $\dn$ & $\varnothing$ & \pcode{NULL}\\ -$L(\epsilon)$ & $\dn$ & $\{[]\}$ & \pcode{EMPTY}\\ +$L(\ZERO)$ & $\dn$ & $\varnothing$ & \pcode{ZERO}\\ +$L(\ONE)$ & $\dn$ & $\{[]\}$ & \pcode{ONE}\\ $L(c)$ & $\dn$ & $\{[c]\}$ & \pcode{CHAR}\\ $L(r_1 + r_2)$ & $\dn$ & $L(r_1) \cup L(r_2)$ & \pcode{ALT}\\ $L(r_1 \cdot r_2)$ & $\dn$ & $L(r_1) \,@\, L(r_2)$ & \pcode{SEQ}\\ @@ -136,8 +134,8 @@ fun nullable :: "rexp $\Rightarrow$ bool" where - "nullable NULL = False" -| "nullable EMPTY = True" + "nullable ZERO = False" +| "nullable ONE = True" | "nullable (CHAR _) = False" | "nullable (ALT r1 r2) = (nullable(r1) $\vee$ nullable(r2))" | "nullable (SEQ r1 r2) = (nullable(r1) $\wedge$ nullable(r2))" @@ -146,9 +144,9 @@ fun der :: "char $\Rightarrow$ rexp $\Rightarrow$ rexp" where - "der c NULL = NULL" -| "der c EMPTY = NULL" -| "der c (CHAR d) = (if c = d then EMPTY else NULL)" + "der c ZERO = ZERO" +| "der c ONE = ZERO" +| "der c (CHAR d) = (if c = d then ONE else ZERO)" | "der c (ALT r1 r2) = ALT (der c r1) (der c r2)" | "der c (SEQ r1 r2) = (if (nullable r1) then ALT (SEQ (der c r1) r2) (der c r2) @@ -175,8 +173,8 @@ fun zeroable :: "rexp $\Rightarrow$ bool" where - "zeroable NULL = True" -| "zeroable EMPTY = False" + "zeroable ZERO = True" +| "zeroable ONE = False" | "zeroable (CHAR _) = False" | "zeroable (ALT r1 r2) = (zeroable(r1) $\wedge$ zeroable(r2))" | "zeroable (SEQ r1 r2) = (zeroable(r1) $\vee$ zeroable(r2))" @@ -185,13 +183,13 @@ lemma "zeroable r $\longleftrightarrow$ L r = {}" proof (induct) - case (NULL) - have "zeroable NULL" "L NULL = {}" by simp_all - then show "zeroable NULL $\longleftrightarrow$ (L NULL = {})" by simp + case (ZERO) + have "zeroable ZERO" "L ZERO = {}" by simp_all + then show "zeroable ZERO $\longleftrightarrow$ (L ZERO = {})" by simp next - case (EMPTY) - have "$\neg$ zeroable EMPTY" "L EMPTY = {[]}" by simp_all - then show "zeroable EMPTY $\longleftrightarrow$ (L EMPTY = {})" by simp + case (ONE) + have "$\neg$ zeroable ONE" "L ONE = {[]}" by simp_all + then show "zeroable ONE $\longleftrightarrow$ (L ONE = {})" by simp next case (CHAR c) have "$\neg$ zeroable (CHAR c)" "L (CHAR c) = {[c]}" by simp_all diff -r 5deefcc8cffa -r e3acf2bf3895 handouts/ho01.tex --- a/handouts/ho01.tex Tue Sep 20 12:24:29 2016 +0100 +++ b/handouts/ho01.tex Tue Sep 20 12:25:09 2016 +0100 @@ -27,6 +27,27 @@ %% reasons for a new prgramming language %% http://beautifulracket.com +%regher +% We can start off with a couple of observations about the role of +% compilers. First, hardware is getting weirder rather than getting +% clocked faster: almost all processors are multicores and it looks +% like there is increasing asymmetry in resources across +% cores. Processors come with vector units, crypto accelerators, bit +% twiddling instructions, and lots of features to make virtualization +% and concurrency work. We have DSPs, GPUs, big.little, and Xeon +% Phi. This is only scratching the surface. Second, we’re getting +% tired of low-level languages and their associated security +% disasters, we want to write new code, to whatever extent possible, +% in safer, higher-level languages. Compilers are caught right in the +% middle of these opposing trends: one of their main jobs is to help +% bridge the large and growing gap between increasingly high-level +% languages and increasingly wacky platforms. It’s effectively a +% perpetual employment act for solid compiler hackers. + +% compiler explorer +% https://gcc.godbolt.org + + \begin{document} \fnote{\copyright{} Christian Urban, King's College London, 2014, 2015, 2016} diff -r 5deefcc8cffa -r e3acf2bf3895 progs/catastrophic.java --- a/progs/catastrophic.java Tue Sep 20 12:24:29 2016 +0100 +++ b/progs/catastrophic.java Tue Sep 20 12:25:09 2016 +0100 @@ -1,14 +1,21 @@ +// a case of catastrophic backtracking in Java +// +// regexp: (a*)*b +// strings: aa.... + import java.util.regex.*; public class catastrophic { public static void main(String[] args) { + + //we always run all the tests twice -> warmup of the JVM for (int runs = 0; runs < 2; runs++) { - //Pattern pattern = Pattern.compile("(0*)*A"); - //Pattern pattern = Pattern.compile("a?{20}a{20}"); - - // Run from 5 to 25 characters - for (int length = 5; length < 30; length++) { - Pattern pattern = Pattern.compile("(a*)*b"); + + Pattern pattern = Pattern.compile("(a*)*b"); + + // Run from 5 to 28 characters + for (int length = 5; length < 28; length++) { + // Build input of specified length String input = ""; for (int i = 0; i < length; i++) { input += "a"; } @@ -18,10 +25,11 @@ for (int i = 0; i < 2; i++) { pattern.matcher(input).find(); } + System.out.println(length + " " + input + ": " + ((System.nanoTime() - start) / 2000000000d) + "s"); } } } -} \ No newline at end of file +} diff -r 5deefcc8cffa -r e3acf2bf3895 progs/catastrophic.py --- a/progs/catastrophic.py Tue Sep 20 12:24:29 2016 +0100 +++ b/progs/catastrophic.py Tue Sep 20 12:25:09 2016 +0100 @@ -2,11 +2,23 @@ import re import sys +# case of catastrophic backtracking in Python +# +# regex: (a?){n} a{n} +# strings: aa... +# +# call with timing as: +# +# > time ./catastrophic.py 20 + +# counter n given on the command line cn = sys.argv[1] +# constructing the regex r1 = '((a?){%s})' % cn r2 = 'a{%s}' % cn +# calling the matching function m = re.match(r1 + r2 , "a" * int(cn)) print m.group(0) diff -r 5deefcc8cffa -r e3acf2bf3895 progs/catastrophic.rb --- a/progs/catastrophic.rb Tue Sep 20 12:24:29 2016 +0100 +++ b/progs/catastrophic.rb Tue Sep 20 12:25:09 2016 +0100 @@ -1,4 +1,4 @@ -# provided by Daniel Baldwin +# example provided by Daniel Baldwin nums = (1..1000) diff -r 5deefcc8cffa -r e3acf2bf3895 progs/crawler1.scala --- a/progs/crawler1.scala Tue Sep 20 12:24:29 2016 +0100 +++ b/progs/crawler1.scala Tue Sep 20 12:25:09 2016 +0100 @@ -32,8 +32,8 @@ } // some starting URLs for the crawler -//val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc""" -val startURL = """http://www.inf.kcl.ac.uk/staff/mcburney""" +val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc""" +//val startURL = """http://www.inf.kcl.ac.uk/staff/mcburney""" crawl(startURL, 2) diff -r 5deefcc8cffa -r e3acf2bf3895 progs/crawler2.scala --- a/progs/crawler2.scala Tue Sep 20 12:24:29 2016 +0100 +++ b/progs/crawler2.scala Tue Sep 20 12:25:09 2016 +0100 @@ -13,7 +13,7 @@ // regexes for URLs and "my" domain val http_pattern = """"https?://[^"]*"""".r -val my_urls = """urbanc""".r (*@\label{myurlline}@*) +val my_urls = """urbanc""".r (*@\label{myurlline}@*) def unquote(s: String) = s.drop(1).dropRight(1) @@ -21,11 +21,11 @@ http_pattern.findAllIn(page).map(unquote).toSet def crawl(url: String, n: Int) : Unit = { - if (n == 0) () (*@\label{changestartline}@*) + if (n == 0) () (*@\label{changestartline}@*) else if (my_urls.findFirstIn(url) == None) { println(s"Visiting: $n $url") get_page(url); () - } (*@\label{changeendline}@*) + } (*@\label{changeendline}@*) else { println(s"Visiting: $n $url") for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1)