diff -r 0debe6f41396 -r 546f2090ce12 progs/crawler2.scala --- a/progs/crawler2.scala Tue Sep 20 12:47:46 2016 +0100 +++ b/progs/crawler2.scala Fri Sep 23 15:22:33 2016 +0100 @@ -7,13 +7,13 @@ // gets the first 10K of a web-page def get_page(url: String) : String = { - Try(Source.fromURL(url).take(10000).mkString) getOrElse + Try(Source.fromURL(url)("ISO-8859-1").take(10000).mkString) getOrElse { println(s" Problem with: $url"); ""} } // regexes for URLs and "my" domain val http_pattern = """"https?://[^"]*"""".r -val my_urls = """urbanc""".r (*@\label{myurlline}@*) +val my_urls = """urbanc""".r /*@\label{myurlline}@*/ def unquote(s: String) = s.drop(1).dropRight(1) @@ -21,11 +21,11 @@ http_pattern.findAllIn(page).map(unquote).toSet def crawl(url: String, n: Int) : Unit = { - if (n == 0) () (*@\label{changestartline}@*) + if (n == 0) () /*@\label{changestartline}@*/ else if (my_urls.findFirstIn(url) == None) { println(s"Visiting: $n $url") get_page(url); () - } (*@\label{changeendline}@*) + } /*@\label{changeendline}@*/ else { println(s"Visiting: $n $url") for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1)