diff -r 0debe6f41396 -r 546f2090ce12 progs/crawler1.scala --- a/progs/crawler1.scala Tue Sep 20 12:47:46 2016 +0100 +++ b/progs/crawler1.scala Fri Sep 23 15:22:33 2016 +0100 @@ -7,18 +7,18 @@ // gets the first 10K of a web-page def get_page(url: String) : String = { - Try(Source.fromURL(url).take(10000).mkString) getOrElse + Try(Source.fromURL(url)("ISO-8859-1").take(10000).mkString) getOrElse { println(s" Problem with: $url"); ""} } // regex for URLs -val http_pattern = """"https?://[^"]*"""".r (*@\label{httpline}@*) +val http_pattern = """"https?://[^"]*"""".r /*@\label{httpline}@*/ // drops the first and last character from a string def unquote(s: String) = s.drop(1).dropRight(1) def get_all_URLs(page: String) : Set[String] = - http_pattern.findAllIn(page).map(unquote).toSet (*@\label{findallline}@*) + http_pattern.findAllIn(page).map(unquote).toSet /*@\label{findallline}@*/ // naive version of crawl - searches until a given depth, @@ -32,8 +32,9 @@ } // some starting URLs for the crawler -val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc""" +//val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc""" //val startURL = """http://www.inf.kcl.ac.uk/staff/mcburney""" +val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/bsc-projects-16.html""" crawl(startURL, 2)