Binary file handouts/ho01.pdf has changed
Binary file handouts/ho02.pdf has changed
Binary file handouts/ho03.pdf has changed
Binary file handouts/ho04.pdf has changed
Binary file handouts/ho05.pdf has changed
Binary file handouts/ho06.pdf has changed
Binary file handouts/ho07.pdf has changed
Binary file handouts/ho08.pdf has changed
Binary file handouts/notation.pdf has changed
Binary file handouts/scala-ho.pdf has changed
--- a/langs.sty Tue Sep 20 12:47:46 2016 +0100
+++ b/langs.sty Fri Sep 23 15:22:33 2016 +0100
@@ -66,4 +66,5 @@
\newcommand{\scode}[1]{\mbox{\lstset{language={},basicstyle=\ttfamily\color{codegreen}}\lstinline!#1!}}
\makeatother
-\lstset{escapeinside={(*@}{@*)}}
+%%\lstset{escapeinside={(*@}{@*)}}
+\lstset{escapeinside={/*@}{@*/}}
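The langs.sty hunk above swaps the listings escape delimiters from (*@ ... @*) to /*@ ... @*/, presumably because the new markers double as legal Scala block comments: the \label marks embedded in the sources below can then stay in the files without breaking compilation. A minimal sketch of the effect (the val line is taken from crawler1.scala below):

    // between /*@ and @*/ listings switches back to LaTeX, so the label is
    // typeset and can be referenced from the handout text; to the Scala
    // compiler the same span is just an ordinary block comment
    val http_pattern = """"https?://[^"]*"""".r /*@\label{httpline}@*/
    // the old (*@ ... @*) delimiters have no comment meaning in Scala, so
    // files carrying them could not be compiled directly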
--- a/progs/crawler1.scala Tue Sep 20 12:47:46 2016 +0100
+++ b/progs/crawler1.scala Fri Sep 23 15:22:33 2016 +0100
@@ -7,18 +7,18 @@
// gets the first 10K of a web-page
def get_page(url: String) : String = {
- Try(Source.fromURL(url).take(10000).mkString) getOrElse
+ Try(Source.fromURL(url)("ISO-8859-1").take(10000).mkString) getOrElse
{ println(s" Problem with: $url"); ""}
}
// regex for URLs
-val http_pattern = """"https?://[^"]*"""".r (*@\label{httpline}@*)
+val http_pattern = """"https?://[^"]*"""".r /*@\label{httpline}@*/
// drops the first and last character from a string
def unquote(s: String) = s.drop(1).dropRight(1)
def get_all_URLs(page: String) : Set[String] =
- http_pattern.findAllIn(page).map(unquote).toSet (*@\label{findallline}@*)
+ http_pattern.findAllIn(page).map(unquote).toSet /*@\label{findallline}@*/
// naive version of crawl - searches until a given depth,
@@ -32,8 +32,9 @@
}
// some starting URLs for the crawler
-val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc"""
+//val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc"""
//val startURL = """http://www.inf.kcl.ac.uk/staff/mcburney"""
+val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/bsc-projects-16.html"""
crawl(startURL, 2)
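All three crawlers gain an explicit character encoding in get_page. Without one, Source.fromURL reads with the platform's default codec, and mkString raises a MalformedInputException on bytes that are invalid in that encoding; the Try then falls through to the "Problem with" branch and the page is silently dropped. ISO-8859-1 maps every byte to some character, so no page is rejected for encoding reasons alone. A sketch with the codec made explicit (get_page_explicit is a hypothetical name; the string "ISO-8859-1" in the diff is lifted to a Codec by an implicit conversion in scala.io.Codec):

    import scala.io.{Codec, Source}
    import scala.util.Try

    // behaves like the patched get_page, but passes the codec explicitly
    // rather than relying on the String-to-Codec implicit conversion
    def get_page_explicit(url: String): String =
      Try(Source.fromURL(url)(Codec.ISO8859).take(10000).mkString)
        .getOrElse { println(s" Problem with: $url"); "" }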
--- a/progs/crawler2.scala Tue Sep 20 12:47:46 2016 +0100
+++ b/progs/crawler2.scala Fri Sep 23 15:22:33 2016 +0100
@@ -7,13 +7,13 @@
// gets the first 10K of a web-page
def get_page(url: String) : String = {
- Try(Source.fromURL(url).take(10000).mkString) getOrElse
+ Try(Source.fromURL(url)("ISO-8859-1").take(10000).mkString) getOrElse
{ println(s" Problem with: $url"); ""}
}
// regexes for URLs and "my" domain
val http_pattern = """"https?://[^"]*"""".r
-val my_urls = """urbanc""".r (*@\label{myurlline}@*)
+val my_urls = """urbanc""".r /*@\label{myurlline}@*/
def unquote(s: String) = s.drop(1).dropRight(1)
@@ -21,11 +21,11 @@
http_pattern.findAllIn(page).map(unquote).toSet
def crawl(url: String, n: Int) : Unit = {
- if (n == 0) () (*@\label{changestartline}@*)
+ if (n == 0) () /*@\label{changestartline}@*/
else if (my_urls.findFirstIn(url) == None) {
println(s"Visiting: $n $url")
get_page(url); ()
- } (*@\label{changeendline}@*)
+ } /*@\label{changeendline}@*/
else {
println(s"Visiting: $n $url")
for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1)
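In crawler2 the recursion is gated on the my_urls regex: a URL outside "my" domain is still visited once, but its links are not followed. The gate relies on findFirstIn returning an Option. A small check of that guard (the second URL is invented for illustration):

    // findFirstIn yields the first occurrence of the regex in the string,
    // or None if there is no match; crawl recurses only in the Some case
    val my_urls = """urbanc""".r
    my_urls.findFirstIn("http://www.inf.kcl.ac.uk/staff/urbanc")  // Some(urbanc)
    my_urls.findFirstIn("http://www.bbc.co.uk")                   // None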
--- a/progs/crawler3.scala Tue Sep 20 12:47:46 2016 +0100
+++ b/progs/crawler3.scala Fri Sep 23 15:22:33 2016 +0100
@@ -6,13 +6,13 @@
import scala.util._
def get_page(url: String) : String = {
- Try(Source.fromURL(url).take(10000).mkString) getOrElse
+ Try(Source.fromURL(url)("ISO-8859-1").take(10000).mkString) getOrElse
{ println(s" Problem with: $url"); ""}
}
// regexes for URLs, for "my" domain and for email addresses
val http_pattern = """"https?://[^"]*"""".r
-val email_pattern = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r (*@\label{emailline}@*)
+val email_pattern = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r /*@\label{emailline}@*/
def unquote(s: String) = s.drop(1).dropRight(1)
@@ -27,7 +27,7 @@
else {
println(s"Visiting: $n $url")
val page = get_page(url)
- print_str(email_pattern.findAllIn(page).mkString("\n")) (*@\label{mainline}@*)
+ print_str(email_pattern.findAllIn(page).mkString("\n")) /*@\label{mainline}@*/
for (u <- get_all_URLs(page).par) crawl(u, n - 1)
}
}
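crawler3 additionally scans every visited page for email addresses (the line labelled mainline) and, via .par, follows the collected links of a page in parallel rather than one after the other. A minimal check of email_pattern on an invented page fragment (the sample string and both addresses are made up for illustration):

    // the three groups capture local part, domain and top-level domain;
    // findAllIn yields the complete matches, which print_str then prints
    val email_pattern = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r
    val sample = "write to john.doe@example.ac.uk or info@example.org"
    email_pattern.findAllIn(sample).toList
    // => List(john.doe@example.ac.uk, info@example.org)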