diff -r 377c59df7297 -r b79e704acb72 progs/scraper.scala --- a/progs/scraper.scala Mon Sep 15 04:54:01 2014 +0100 +++ b/progs/scraper.scala Mon Sep 15 07:25:17 2014 +0100 @@ -14,68 +14,71 @@ val url = new URL("http://www.envir.gov.cn/eng/airep/index.asp") -//connecting to url +// connecting to url val conn = url.openConnection -conn.setRequestProperty("User-Agent", "") conn.setDoOutput(true) conn.connect -//sending data +// sending data val wr = new OutputStreamWriter(conn.getOutputStream()) //possible date ranges wr.write("Fdate=2012-8-24&Tdate=2012-09-25") -//wr.write("Fdate=2001-9-18&Tdate=2012-09-24") +//wr.write("Fdate=2001-9-18&Tdate=2012-09-25") wr.flush wr.close -//receiving data +// receiving data as page made of HTML val page = fromInputStream(conn.getInputStream).getLines.mkString("\n") -//data encoded as an HTML-string, which you can see with -//println(page) +// received data can be seen with +// println(page) -// regular expression: excludes newlines, -// therefore we have to use [\S\s] -val regex1 = """[\S\s]*?""".r +// regular expression for extracting rows: +// - the usual . would exclude newlines, +// - therefore we have to use [\S\s], which really +// matches everything +// - *? is the "lazy" version of * +val regex1 = """[\S\s]*?""".r val rows = regex1.findAllIn(page).toList -//print(rows) - -val regex2 = """([\S\s]*?)""".r +// data in rows +// println(rows) -def aux(s: String) : Array[String] = { - for (m <- regex2.findAllIn(s).toArray) yield m match { - case regex2(value) => value.trim - } +// extracting row entries (date, PM_10, SO_2, NO_2) +// the use of (..) allows us to extract the matched text +val regex2 = """([\S\s]*?)""".r + +def extract(s: String) : List[String] = { + for (regex2(value) <- regex2.findAllIn(s).toList) yield value.trim } //data completely extracted -val data = rows.map { aux } +val data = rows.map(extract) //for comparing elements from an array -def compare(i: Int)(e: Array[String], f: Array[String]) = e(i).toInt < f(i).toInt +def compare(i: Int)(e: List[String], f: List[String]) = e(i).toInt > f(i).toInt -println("The day with highest particle pollution (PM_10)") -println(data.sortWith(compare(1)).last.mkString(",")) +println("The day with the highest particle pollution (PM_10)") +println(data.sortWith(compare(1)).head.mkString(",")) -println("The day with highest sulfur dioxide (SO_2)") -println(data.sortWith(compare(2)).last.mkString(",")) +println("The day with the highest sulfur dioxide (SO_2)") +println(data.sortWith(compare(2)).head.mkString(",")) -println("The day with highest nitro dioxide (NO_2)") -println(data.sortWith(compare(3)).last.mkString(",")) +println("The day with the highest nitro dioxide (NO_2)") +println(data.sortWith(compare(3)).head.mkString(",")) -println("The day(s) with highest PM_10") +println("The day(s) with the highest PM_10") val groups1 = data.groupBy(_(1).toInt) val max_key1 = groups1.keySet.max println(groups1(max_key1).map(_.mkString(",")).mkString("\n")) -println("The day(s) with highest SO_2") +println("The day(s) with the highest SO_2") val groups2 = data.groupBy(_(2).toInt) val max_key2 = groups2.keySet.max println(groups2(max_key2).map(_.mkString(",")).mkString("\n")) -println("The day(s) with highest NO_2") +println("The day(s) with the highest NO_2") val groups3 = data.groupBy(_(3).toInt) val max_key3 = groups3.keySet.max println(groups3(max_key3).map(_.mkString(",")).mkString("\n"))