// A web-scraper that extracts the daily Shanghai pollution
// data from the web-page
//
//   http://www.envir.gov.cn/eng/airep/index.asp
//
// Important! They stopped providing this data in November
// 2012, but kept the historical data since 2001. So dates
// must be in that range.

import java.io.OutputStreamWriter
import java.net.URL
import scala.io.Source.fromInputStream

val url = new URL("http://www.envir.gov.cn/eng/airep/index.asp")

// connecting to the url
val conn = url.openConnection
conn.setRequestProperty("User-Agent", "")
conn.setDoOutput(true)
conn.connect

// sending the date range as form data (Fdate must not be later than Tdate)
val wr = new OutputStreamWriter(conn.getOutputStream())

// possible date ranges
wr.write("Fdate=2012-08-25&Tdate=2012-09-24")
//wr.write("Fdate=2012-9-18&Tdate=2012-09-24")
//wr.write("Fdate=2001-5-18&Tdate=2012-09-25")
wr.flush
wr.close

// receiving the data
val page = fromInputStream(conn.getInputStream).getLines.mkString("\n")

// the data comes encoded as a single string, which you can see with
//
//   println(page)

// in regular expressions . excludes newlines,
// therefore we have to use [\S\s] instead
val regex1 = """<tr align="center">[\S\s]*?</tr>""".r
val rows = regex1.findAllIn(page).toList

//print(rows)

// extracts the contents of the <td> cells of one table row
val regex2 = """<td align="center">([\S\s]*?)</td>""".r

def aux(s: String) : Array[String] = {
  for (m <- regex2.findAllIn(s).toArray) yield m match {
    case regex2(value) => value.trim
  }
}

val data = rows.map { aux }

// compares two rows according to the i-th column (read as an Int)
def compare(i: Int)(e: Array[String], f: Array[String]) =
  e(i).toInt < f(i).toInt

// day with the highest particle pollution (PM_10)
println(data.sortWith(compare(1)).last.mkString(" "))

// day with the highest sulfur dioxide (SO_2)
println(data.sortWith(compare(2)).last.mkString(" "))

// day with the highest nitrogen dioxide (NO_2)
println(data.sortWith(compare(3)).last.mkString(" "))

// days with the highest PM_10 (there can be more than one)
val groups = data.groupBy(_(1).toInt)
val max_key = groups.keySet.max
println(groups(max_key).map(_.mkString(" ")))
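
// A quick self-contained check of the extraction logic above. The HTML
// snippet below is made up for illustration (it is not taken from the
// actual page), but it has the same <td> shape as the scraped rows.
val sample = """<td align="center"> 2012-9-24 </td><td align="center"> 85 </td>"""
println(aux(sample).mkString(", "))   // prints: 2012-9-24, 85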
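
// As an aside, the maxima above can also be computed without sorting,
// using the standard library's maxBy; a minimal sketch, assuming data
// has been filled in as above and the indexed columns parse as Ints:
println(data.maxBy(_(1).toInt).mkString(" "))   // highest PM_10 row
println(data.maxBy(_(2).toInt).mkString(" "))   // highest SO_2 row
println(data.maxBy(_(3).toInt).mkString(" "))   // highest NO_2 row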