# HG changeset patch # User Christian Urban # Date 1380137754 -3600 # Node ID 91145f6d9b0ebae40f5d41c402f41cf94ecb92dc # Parent 1f3d89fe9820727b6d2aa31ca7a93b4ecfe4d09e added diff -r 1f3d89fe9820 -r 91145f6d9b0e progs/crawler1.scala --- a/progs/crawler1.scala Tue Sep 24 23:31:53 2013 +0100 +++ b/progs/crawler1.scala Wed Sep 25 20:35:54 2013 +0100 @@ -3,10 +3,10 @@ import scala.util._ // gets the first ~10K of a page -def get_page(url: String) : String = +def get_page(url: String) : String = { Try(Source.fromURL(url).take(10000).mkString) getOrElse { println(s" Problem with: $url"); ""} - +} // regex for URLs val http_pattern = """\"https?://[^\"]*\"""".r diff -r 1f3d89fe9820 -r 91145f6d9b0e progs/crawler3.scala --- a/progs/crawler3.scala Tue Sep 24 23:31:53 2013 +0100 +++ b/progs/crawler3.scala Wed Sep 25 20:35:54 2013 +0100 @@ -3,9 +3,10 @@ import scala.util._ // gets the first ~10K of a page -def get_page(url: String) : String = +def get_page(url: String) : String = { Try(Source.fromURL(url).take(10000).mkString) getOrElse { println(s" Problem with: $url"); ""} +} // staring URL for the crawler val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/""" diff -r 1f3d89fe9820 -r 91145f6d9b0e progs/scraper.scala --- a/progs/scraper.scala Tue Sep 24 23:31:53 2013 +0100 +++ b/progs/scraper.scala Wed Sep 25 20:35:54 2013 +0100 @@ -12,8 +12,8 @@ //sending data val wr = new OutputStreamWriter(conn.getOutputStream()) -//wr.write("Fdate=2012-9-24&Tdate=2012-09-25") -//wr.write("Fdate=2012-9-18&Tdate=2012-09-25") +//wr.write("Fdate=2012-9-24&Tdate=2013-08-25") +//wr.write("Fdate=2012-9-18&Tdate=2012-09-24") wr.write("Fdate=2001-5-18&Tdate=2012-09-25") wr.flush wr.close @@ -21,7 +21,7 @@ //receiving data val page = fromInputStream(conn.getInputStream).getLines.mkString("\n") -//println(page) +println(page) // regular expression . excludes newlines, // therefore we have to use [\S\s] @@ -43,15 +43,15 @@ def compare(i: Int)(e: Array[String], f: Array[String]) = e(i).toInt < f(i).toInt //day with highest particle pollution (PM_10) -data.sortWith(compare(1)).last +println(data.sortWith(compare(1)).last) //day with highest sulfur dioxide (SO_2) -data.sortWith(compare(2)).last +println(data.sortWith(compare(2)).last) //day with highest nitro dioxide (NO_2) -data.sortWith(compare(3)).last +println(data.sortWith(compare(3)).last) //days with highest PM_10 val groups = data.groupBy(_(1).toInt) val max_key = groups.keySet.max -groups(max_key) +println(groups(max_key))