// A web scraper that extracts the daily Shanghai pollution
// data from the web page
//
// http://www.envir.gov.cn/eng/airep/index.asp
//
//
// Important! They stopped providing this data in November
// 2012, but kept the historical data since 2001. So dates
// must be in that range.

import java.io.OutputStreamWriter
import java.net.URL
import scala.io.Source.fromInputStream

// Address of the air-quality report page; the requested date range is
// sent to it as Fdate/Tdate parameters in the request body.
val url = new URL("http://www.envir.gov.cn/eng/airep/index.asp")

// connecting to url
val conn = url.openConnection
// the User-Agent header is deliberately sent as an empty string
conn.setRequestProperty("User-Agent", "")
// output must be enabled before connecting so that a request body can be written
conn.setDoOutput(true)
conn.connect()

// sending data; the writer is closed even when writing or flushing fails
val wr = new OutputStreamWriter(conn.getOutputStream())
try {
  // possible date ranges
  wr.write("Fdate=2011-6-24&Tdate=2011-09-25")
  //wr.write("Fdate=2011-8-24&Tdate=2011-09-25")
  //wr.write("Fdate=2001-9-18&Tdate=2012-09-25")
  wr.flush()
} finally {
  wr.close()
}

// receiving data as page made of HTML; the Source is closed as soon as
// all lines have been read, so the response stream is not leaked
val page = {
  val source = fromInputStream(conn.getInputStream)
  try source.getLines().mkString("\n")
  finally source.close()
}

// received data can be seen with
// println(page)

// regular expression for extracting rows:
//  - the usual . would exclude newlines,
//  - therefore we have to use [\S\s], which really
//    matches everything
//  - *? is the "lazy" version of *, so each match ends at
//    the first closing </tr> that follows the opening tag
val regex1 = """<tr align="center">[\S\s]*?</tr>""".r

// every <tr align="center"> ... </tr> fragment of the page, in page order
val rows = regex1.findAllIn(page).toList

// data in rows
// println(rows)

// extracting row entries (date, PM_10, SO_2, NO_2)
// the use of (..) allows us to extract the matched text
val regex2 = """<td align="center">([\S\s]*?)</td>""".r

/** Returns the trimmed text of every `<td align="center">` cell found in `s`,
  * in the order the cells appear. A string without such cells yields `Nil`.
  *
  * Each fragment found by `findAllIn` is matched again as an extractor
  * pattern, which exposes the captured group (the cell contents).
  */
def extract(s: String): List[String] =
  regex2.findAllIn(s).toList.collect { case regex2(value) => value.trim }

// data completely extracted: one list of cell strings per matched table row
val data: List[List[String]] = rows.map(extract)

/** Predicate for `sortWith`: true when entry `i` of row `e`, read as an
  * integer, is strictly greater than entry `i` of row `f`. Sorting with
  * it therefore orders rows by that entry, largest first.
  *
  * Throws `NumberFormatException` when either entry is not an integer and
  * `IndexOutOfBoundsException` when a row has no entry `i`.
  */
def compare(i: Int)(e: List[String], f: List[String]): Boolean =
  e(i).toInt > f(i).toInt

// Prints `title`, then the row with the largest value in `column`, with its
// entries joined by commas. On ties the earliest row wins, which is the row
// a sort with compare(column) would place first. Prints "(no data)" instead
// of failing when no rows were extracted.
def printHighest(title: String, column: Int): Unit = {
  println(title)
  // single pass: replace the current best only by a strictly greater row
  val best = data.reduceOption { (top, row) =>
    if (compare(column)(row, top)) row else top
  }
  println(best.map(_.mkString(",")).getOrElse("(no data)"))
}

printHighest("The day with the highest particle pollution (PM_10)", 1)

printHighest("The day with the highest sulfur dioxide (SO_2)", 2)

printHighest("The day with the highest nitro dioxide (NO_2)", 3)

// Prints `title`, then every row whose value in `column` equals the largest
// value found in that column, one comma-joined row per line and in their
// original order. Prints "(no data)" instead of failing when no rows were
// extracted.
def printAllHighest(title: String, column: Int): Unit = {
  println(title)
  // rows grouped by the integer value of the chosen column
  val groups = data.groupBy(_(column).toInt)
  if (groups.isEmpty) println("(no data)")
  else {
    val maxKey = groups.keySet.max
    println(groups(maxKey).map(_.mkString(",")).mkString("\n"))
  }
}

printAllHighest("The day(s) with the highest PM_10", 1)

printAllHighest("The day(s) with the highest SO_2", 2)

printAllHighest("The day(s) with the highest NO_2", 3)