scraper.scala
changeset 92 e85600529ca5
parent 91 47f86885d481
child 93 4794759139ea
--- a/scraper.scala	Sun Dec 23 00:38:56 2012 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,57 +0,0 @@
-import java.io.OutputStreamWriter
-import java.net.URL
-import scala.io.Source.fromInputStream
-
-// Endpoint the request body below is posted to.
-val url = new URL("http://www.envir.gov.cn/eng/airep/index.asp")
-
-// Open the connection; setDoOutput(true) allows a request body to be sent.
-val conn = url.openConnection()
-conn.setRequestProperty("User-Agent", "")
-conn.setDoOutput(true)
-conn.connect()
-
-// Send the request body (Fdate/Tdate fields, presumably the from/to dates).
-// Narrower ranges tried earlier: "Fdate=2012-9-24&Tdate=2012-09-25" and
-// "Fdate=2012-9-18&Tdate=2012-09-25".
-val wr = new OutputStreamWriter(conn.getOutputStream())
-try {
-  wr.write("Fdate=2001-5-18&Tdate=2012-09-25")
-  wr.flush()
-} finally {
-  wr.close() // release the stream even if writing fails
-}
-
-// Read the whole response into one string, then close the source so the
-// underlying input stream is not left open.
-val response = fromInputStream(conn.getInputStream)
-val page = try response.getLines().mkString("\n") finally response.close()
-
-//println(page)
-
-// `.` in a regular expression does not match newlines, so [\S\s] is used
-// to let a single <tr ...>...</tr> match extend over several lines.
-val regex1 = """<tr align="center">[\S\s]*?</tr>""".r
-val rows = regex1.findAllMatchIn(page).map(_.matched).toList
-
-//print(rows)
-
-// Captures the text between <td align="center"> and the nearest </td>.
-val regex2 = """<td align="center">([\S\s]*?)</td>""".r
-
-/** Extracts the trimmed contents of every `<td align="center">` cell in `s`.
-  *
-  * @param s HTML fragment to search (the script passes one table row)
-  * @return the cell contents in document order; empty if no cell matches
-  */
-def aux(s: String) : Array[String] = {
-  // Take capture group 1 from each hit directly rather than collecting the
-  // matched substrings and running the pattern over each of them again.
-  regex2.findAllMatchIn(s).map(_.group(1).trim).toArray
-}
-
-// One array of cell strings per extracted table row.
-val data = rows.map(row => aux(row))
-
-/** Orders two rows by the integer parsed from column `i`: true when `e` sorts before `f`. */
-def compare(i: Int)(e: Array[String], f: Array[String]) = {
-  val left = e(i).toInt
-  val right = f(i).toInt
-  left < right
-}
-
-// Row with the largest integer in column i. Ties go to the later row, which
-// is the row a stable sort followed by `last` would pick, but this needs
-// only one pass over the data instead of a full sort. Throws if data is empty.
-def highest(i: Int) = data.reduceLeft((best, row) => if (compare(i)(row, best)) best else row)
-
-//day with highest particle pollution (PM_10)
-highest(1)
-
-//day with highest sulfur dioxide (SO_2)
-highest(2)
-
-//day with highest nitrogen dioxide (NO_2)
-highest(3)
-
-//days with highest PM_10
-// Bucket the rows by the integer in column 1, then look up the bucket
-// stored under the largest key.
-val groups = data.groupBy(row => row(1).toInt)
-val max_key = groups.keys.max
-groups(max_key)