// scraper.scala (introduced in changeset b606c9439fa6)
//
// Posts a date range (Fdate/Tdate) to an air-report page, prints the raw
// response, extracts rows and cells with regular expressions, and then
// looks up the rows with the highest value in selected columns.

import java.io.OutputStreamWriter
import java.net.URL
import scala.io.Source.fromInputStream

val url = new URL("http://www.envir.gov.cn/eng/airep/index.asp")

// Connect to the URL; doOutput must be enabled before we can write a
// request body to the connection.
val conn = url.openConnection()
conn.setRequestProperty("User-Agent", "")
conn.setDoOutput(true)
conn.connect()

// Send the form data. The writer is closed in `finally` so the stream is
// released even if the write fails.
val wr = new OutputStreamWriter(conn.getOutputStream())
try {
  //wr.write("Fdate=2012-9-24&Tdate=2012-09-25")
  //wr.write("Fdate=2012-9-18&Tdate=2012-09-25")
  wr.write("Fdate=2001-5-18&Tdate=2012-09-25")
  wr.flush()
} finally wr.close()

// Receive the response as one newline-joined string, closing the source
// afterwards so the underlying input stream is not leaked.
val response = fromInputStream(conn.getInputStream())
val page = try response.getLines().mkString("\n") finally response.close()

println(page)

// In a regular expression `.` excludes newlines, therefore we have to use
// [\S\s] to match across line breaks.
// NOTE(review): as written, neither pattern below has any delimiting text
// around the lazy quantifier, so each can only match the empty string.
// Presumably the enclosing HTML row/cell tags were lost from these
// literals -- restore them before relying on `rows`/`data`.
val regex1 = """[\S\s]*?""".r
val rows = regex1.findAllIn(page).toList

print(rows)

val regex2 = """([\S\s]*?)""".r

/** Extracts the trimmed contents of capture group 1 for every match of
  * `regex2` in `s`, in order of occurrence.
  *
  * @param s the text of one row
  * @return one trimmed string per match (empty array if nothing matches)
  */
def aux(s: String): Array[String] =
  regex2.findAllIn(s).matchData.map(_.group(1).trim).toArray

// One array of cell strings per row.
val data = rows.map(aux)

/** Ordering predicate for `sortWith`: true when column `i` of `e`, read as
  * an integer, is strictly smaller than column `i` of `f`.
  *
  * Throws NumberFormatException if either cell is not an integer, and
  * ArrayIndexOutOfBoundsException if a row has no column `i`.
  */
def compare(i: Int)(e: Array[String], f: Array[String]) = e(i).toInt < f(i).toInt

// The expressions below are evaluated for their value only (visible when the
// script is loaded into the REPL). `.last` throws NoSuchElementException when
// no rows were extracted.

//day with highest particle pollution (PM_10)
data.sortWith(compare(1)).last

//day with highest sulfur dioxide (SO_2)
data.sortWith(compare(2)).last

//day with highest nitrogen dioxide (NO_2)
data.sortWith(compare(3)).last

//days with highest PM_10: group rows by their column-1 value and pick the
//group belonging to the largest key
val groups = data.groupBy(_(1).toInt)
val max_key = groups.keySet.max
groups(max_key)