scraper.scala
changeset 1 b606c9439fa6
child 7 73cf4406b773
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scraper.scala	Wed Sep 26 02:08:55 2012 +0100
@@ -0,0 +1,57 @@
+import java.io.OutputStreamWriter
+import java.net.URL
+import scala.io.Source.fromInputStream
+
+val url = new URL("http://www.envir.gov.cn/eng/airep/index.asp")
+
+//connect to url
+val conn = url.openConnection
+conn.setRequestProperty("User-Agent", "")
+conn.setDoOutput(true)
+conn.connect
+
+//sending data
+val wr = new OutputStreamWriter(conn.getOutputStream())
+//wr.write("Fdate=2012-9-24&Tdate=2012-09-25")
+//wr.write("Fdate=2012-9-18&Tdate=2012-09-25")
+wr.write("Fdate=2001-5-18&Tdate=2012-09-25")
+wr.flush
+wr.close
+
+//receiving data
+val page = fromInputStream(conn.getInputStream).getLines.mkString("\n")
+
+println(page)
+
+// regular expression . excludes newlines, 
+// therefore we have to use [\S\s]
+val regex1 = """<tr align="center">[\S\s]*?</tr>""".r
+val rows = regex1.findAllIn(page).toList
+
+print(rows)
+
+val regex2 = """<td align="center">([\S\s]*?)</td>""".r
+
+def aux(s: String) : Array[String] = {
+  for (m <- regex2.findAllIn(s).toArray) yield m match {
+    case regex2(value) => value.trim
+  }
+}
+
+val data = rows.map { aux }
+
+def compare(i: Int)(e: Array[String], f: Array[String]) = e(i).toInt < f(i).toInt
+
+//day with highest particle pollution (PM_10)
+data.sortWith(compare(1)).last
+
+//day with highest sulfur dioxide (SO_2)
+data.sortWith(compare(2)).last
+
+//day with highest nitro dioxide (NO_2)
+data.sortWith(compare(3)).last
+
+//days with highest PM_10
+val groups = data.groupBy(_(1).toInt)
+val max_key = groups.keySet.max
+groups(max_key)