# HG changeset patch # User Christian Urban # Date 1474640553 -3600 # Node ID 546f2090ce1201a40d6424e63b7cadb2f4aac6a4 # Parent 0debe6f4139674b221445c0b60ccdd3c48c858ce updated diff -r 0debe6f41396 -r 546f2090ce12 handouts/ho01.pdf Binary file handouts/ho01.pdf has changed diff -r 0debe6f41396 -r 546f2090ce12 handouts/ho02.pdf Binary file handouts/ho02.pdf has changed diff -r 0debe6f41396 -r 546f2090ce12 handouts/ho03.pdf Binary file handouts/ho03.pdf has changed diff -r 0debe6f41396 -r 546f2090ce12 handouts/ho04.pdf Binary file handouts/ho04.pdf has changed diff -r 0debe6f41396 -r 546f2090ce12 handouts/ho05.pdf Binary file handouts/ho05.pdf has changed diff -r 0debe6f41396 -r 546f2090ce12 handouts/ho06.pdf Binary file handouts/ho06.pdf has changed diff -r 0debe6f41396 -r 546f2090ce12 handouts/ho07.pdf Binary file handouts/ho07.pdf has changed diff -r 0debe6f41396 -r 546f2090ce12 handouts/ho08.pdf Binary file handouts/ho08.pdf has changed diff -r 0debe6f41396 -r 546f2090ce12 handouts/notation.pdf Binary file handouts/notation.pdf has changed diff -r 0debe6f41396 -r 546f2090ce12 handouts/scala-ho.pdf Binary file handouts/scala-ho.pdf has changed diff -r 0debe6f41396 -r 546f2090ce12 langs.sty --- a/langs.sty Tue Sep 20 12:47:46 2016 +0100 +++ b/langs.sty Fri Sep 23 15:22:33 2016 +0100 @@ -66,4 +66,5 @@ \newcommand{\scode}[1]{\mbox{\lstset{language={},basicstyle=\ttfamily\color{codegreen}}\lstinline!#1!}} \makeatother -\lstset{escapeinside={(*@}{@*)}} +%%\lstset{escapeinside={(*@}{@*)}} +\lstset{escapeinside={/*@}{@*/}} diff -r 0debe6f41396 -r 546f2090ce12 progs/crawler1.scala --- a/progs/crawler1.scala Tue Sep 20 12:47:46 2016 +0100 +++ b/progs/crawler1.scala Fri Sep 23 15:22:33 2016 +0100 @@ -7,18 +7,18 @@ // gets the first 10K of a web-page def get_page(url: String) : String = { - Try(Source.fromURL(url).take(10000).mkString) getOrElse + Try(Source.fromURL(url)("ISO-8859-1").take(10000).mkString) getOrElse { println(s" Problem with: $url"); ""} } // regex for URLs -val http_pattern = """"https?://[^"]*"""".r (*@\label{httpline}@*) +val http_pattern = """"https?://[^"]*"""".r /*@\label{httpline}@*/ // drops the first and last character from a string def unquote(s: String) = s.drop(1).dropRight(1) def get_all_URLs(page: String) : Set[String] = - http_pattern.findAllIn(page).map(unquote).toSet (*@\label{findallline}@*) + http_pattern.findAllIn(page).map(unquote).toSet /*@\label{findallline}@*/ // naive version of crawl - searches until a given depth, @@ -32,8 +32,9 @@ } // some starting URLs for the crawler -val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc""" +//val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc""" //val startURL = """http://www.inf.kcl.ac.uk/staff/mcburney""" +val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/bsc-projects-16.html""" crawl(startURL, 2) diff -r 0debe6f41396 -r 546f2090ce12 progs/crawler2.scala --- a/progs/crawler2.scala Tue Sep 20 12:47:46 2016 +0100 +++ b/progs/crawler2.scala Fri Sep 23 15:22:33 2016 +0100 @@ -7,13 +7,13 @@ // gets the first 10K of a web-page def get_page(url: String) : String = { - Try(Source.fromURL(url).take(10000).mkString) getOrElse + Try(Source.fromURL(url)("ISO-8859-1").take(10000).mkString) getOrElse { println(s" Problem with: $url"); ""} } // regexes for URLs and "my" domain val http_pattern = """"https?://[^"]*"""".r -val my_urls = """urbanc""".r (*@\label{myurlline}@*) +val my_urls = """urbanc""".r /*@\label{myurlline}@*/ def unquote(s: String) = s.drop(1).dropRight(1) @@ -21,11 +21,11 @@ http_pattern.findAllIn(page).map(unquote).toSet def crawl(url: String, n: Int) : Unit = { - if (n == 0) () (*@\label{changestartline}@*) + if (n == 0) () /*@\label{changestartline}@*/ else if (my_urls.findFirstIn(url) == None) { println(s"Visiting: $n $url") get_page(url); () - } (*@\label{changeendline}@*) + } /*@\label{changeendline}@*/ else { println(s"Visiting: $n $url") for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1) diff -r 0debe6f41396 -r 546f2090ce12 progs/crawler3.scala --- a/progs/crawler3.scala Tue Sep 20 12:47:46 2016 +0100 +++ b/progs/crawler3.scala Fri Sep 23 15:22:33 2016 +0100 @@ -6,13 +6,13 @@ import scala.util._ def get_page(url: String) : String = { - Try(Source.fromURL(url).take(10000).mkString) getOrElse + Try(Source.fromURL(url)("ISO-8859-1").take(10000).mkString) getOrElse { println(s" Problem with: $url"); ""} } // regexes for URLs, for "my" domain and for email addresses val http_pattern = """"https?://[^"]*"""".r -val email_pattern = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r (*@\label{emailline}@*) +val email_pattern = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r /*@\label{emailline}@*/ def unquote(s: String) = s.drop(1).dropRight(1) @@ -27,7 +27,7 @@ else { println(s"Visiting: $n $url") val page = get_page(url) - print_str(email_pattern.findAllIn(page).mkString("\n")) (*@\label{mainline}@*) + print_str(email_pattern.findAllIn(page).mkString("\n")) /*@\label{mainline}@*/ for (u <- get_all_URLs(page).par) crawl(u, n - 1) } }