# HG changeset patch # User Christian Urban # Date 1380280963 -3600 # Node ID 735f7bbfae9bec23971fd1d62db9c99dac83b4b9 # Parent db6862f6bf6cd1b071524c91f8733c3f72f31e79 added diff -r db6862f6bf6c -r 735f7bbfae9b progs/scraper.scala --- a/progs/scraper.scala Fri Sep 27 11:58:22 2013 +0100 +++ b/progs/scraper.scala Fri Sep 27 12:22:43 2013 +0100 @@ -4,7 +4,7 @@ // http://www.envir.gov.cn/eng/airep/index.asp // // -// Important! They stoped providing this data in November +// Important! They stopped providing this data in November // 2012, but kept the historical data since 2001. So dates // must be in that range. @@ -24,26 +24,25 @@ val wr = new OutputStreamWriter(conn.getOutputStream()) //possible date ranges -wr.write("Fdate=2012-9-24&Tdate=2012-08-25") -//wr.write("Fdate=2012-9-18&Tdate=2012-09-24") -//wr.write("Fdate=2001-5-18&Tdate=2012-09-25") +wr.write("Fdate=2012-8-24&Tdate=2012-09-25") +//wr.write("Fdate=2001-9-18&Tdate=2012-09-24") wr.flush wr.close //receiving data val page = fromInputStream(conn.getInputStream).getLines.mkString("\n") -//data encoded as a string, which you can see with +//data encoded as an HTML-string, which you can see with //println(page) -// regular expression . excludes newlines, +// regular expression: excludes newlines, // therefore we have to use [\S\s] -val regex1 = """[\S\s]*?""".r +val regex1 = """[\S\s]*?""".r val rows = regex1.findAllIn(page).toList //print(rows) -val regex2 = """([\S\s]*?)""".r +val regex2 = """([\S\s]*?)""".r def aux(s: String) : Array[String] = { for (m <- regex2.findAllIn(s).toArray) yield m match { @@ -51,20 +50,32 @@ } } +//data completely extracted val data = rows.map { aux } +//for comparing elements from an array def compare(i: Int)(e: Array[String], f: Array[String]) = e(i).toInt < f(i).toInt -//day with highest particle pollution (PM_10) -println(data.sortWith(compare(1)).last) +println("The day with highest particle pollution (PM_10)") +println(data.sortWith(compare(1)).last.mkString(",")) -//day with highest sulfur dioxide (SO_2) -println(data.sortWith(compare(2)).last) +println("The day with highest sulfur dioxide (SO_2)") +println(data.sortWith(compare(2)).last.mkString(",")) + +println("The day with highest nitro dioxide (NO_2)") +println(data.sortWith(compare(3)).last.mkString(",")) -//day with highest nitro dioxide (NO_2) -println(data.sortWith(compare(3)).last) +println("The day(s) with highest PM_10") +val groups1 = data.groupBy(_(1).toInt) +val max_key1 = groups1.keySet.max +println(groups1(max_key1).map(_.mkString(",")).mkString("\n")) -//days with highest PM_10 -val groups = data.groupBy(_(1).toInt) -val max_key = groups.keySet.max -println(groups(max_key)) +println("The day(s) with highest SO_2") +val groups2 = data.groupBy(_(2).toInt) +val max_key2 = groups2.keySet.max +println(groups2(max_key2).map(_.mkString(",")).mkString("\n")) + +println("The day(s) with highest NO_2") +val groups3 = data.groupBy(_(3).toInt) +val max_key3 = groups3.keySet.max +println(groups3(max_key3).map(_.mkString(",")).mkString("\n"))