added
author Christian Urban <christian dot urban at kcl dot ac dot uk>
Fri, 27 Sep 2013 12:22:43 +0100
changeset 114 735f7bbfae9b
parent 113 db6862f6bf6c
child 115 86c1c049eb3e
added
progs/scraper.scala
--- a/progs/scraper.scala	Fri Sep 27 11:58:22 2013 +0100
+++ b/progs/scraper.scala	Fri Sep 27 12:22:43 2013 +0100
@@ -4,7 +4,7 @@
 //   http://www.envir.gov.cn/eng/airep/index.asp
 //
 //
-// Important! They stoped providing this data in November
+// Important! They stopped providing this data in November
 // 2012, but kept the historical data since 2001. So dates
 // must be in that range.
 
@@ -24,26 +24,25 @@
 val wr = new OutputStreamWriter(conn.getOutputStream())
 
 //possible date ranges
-wr.write("Fdate=2012-9-24&Tdate=2012-08-25")
-//wr.write("Fdate=2012-9-18&Tdate=2012-09-24")
-//wr.write("Fdate=2001-5-18&Tdate=2012-09-25")
+wr.write("Fdate=2012-8-24&Tdate=2012-09-25")
+//wr.write("Fdate=2001-9-18&Tdate=2012-09-24")
 wr.flush
 wr.close
 
 //receiving data
 val page = fromInputStream(conn.getInputStream).getLines.mkString("\n")
 
-//data encoded as a string, which you can see with
+//data encoded as an HTML-string, which you can see with
 //println(page)
 
-// regular expression . excludes newlines, 
+// regular expression: the dot (.) excludes newlines,
 // therefore we have to use [\S\s]
-val regex1 = """<tr align="center">[\S\s]*?</tr>""".r
+val regex1 = """<tr align=\"center\">[\S\s]*?</tr>""".r
 val rows = regex1.findAllIn(page).toList
 
 //print(rows)
 
-val regex2 = """<td align="center">([\S\s]*?)</td>""".r
+val regex2 = """<td align=\"center\">([\S\s]*?)</td>""".r
 
 def aux(s: String) : Array[String] = {
   for (m <- regex2.findAllIn(s).toArray) yield m match {
@@ -51,20 +50,32 @@
   }
 }
 
+//data completely extracted
 val data = rows.map { aux }
 
+//for comparing elements from an array
 def compare(i: Int)(e: Array[String], f: Array[String]) = e(i).toInt < f(i).toInt
 
-//day with highest particle pollution (PM_10)
-println(data.sortWith(compare(1)).last)
+println("The day with highest particle pollution (PM_10)")
+println(data.sortWith(compare(1)).last.mkString(","))
 
-//day with highest sulfur dioxide (SO_2)
-println(data.sortWith(compare(2)).last)
+println("The day with highest sulfur dioxide (SO_2)")
+println(data.sortWith(compare(2)).last.mkString(","))
+
+println("The day with highest nitro dioxide (NO_2)")
+println(data.sortWith(compare(3)).last.mkString(","))
 
-//day with highest nitro dioxide (NO_2)
-println(data.sortWith(compare(3)).last)
+println("The day(s) with highest PM_10")
+val groups1 = data.groupBy(_(1).toInt)
+val max_key1 = groups1.keySet.max
+println(groups1(max_key1).map(_.mkString(",")).mkString("\n"))
 
-//days with highest PM_10
-val groups = data.groupBy(_(1).toInt)
-val max_key = groups.keySet.max
-println(groups(max_key))
+println("The day(s) with highest SO_2")
+val groups2 = data.groupBy(_(2).toInt)
+val max_key2 = groups2.keySet.max
+println(groups2(max_key2).map(_.mkString(",")).mkString("\n"))
+
+println("The day(s) with highest NO_2")
+val groups3 = data.groupBy(_(3).toInt)
+val max_key3 = groups3.keySet.max
+println(groups3(max_key3).map(_.mkString(",")).mkString("\n"))