// A Web-Scraper that extracts the daily Shanghai pollution
// data from the web-page
//
//   http://www.envir.gov.cn/eng/airep/index.asp
//
//
// Important! They stopped providing this data in November
// 2012, but kept the historical data since 2001. So dates
// must be in that range.

import java.io.OutputStreamWriter
import java.net.URL
import scala.io.Source.fromInputStream
import scala.util.Try

val url = new URL("http://www.envir.gov.cn/eng/airep/index.asp")

// connecting to url
val conn = url.openConnection
conn.setRequestProperty("User-Agent", "")
conn.setDoOutput(true)
conn.connect()

// sending data; the writer is closed even if writing fails
val wr = new OutputStreamWriter(conn.getOutputStream())
try {
  //possible date ranges
  wr.write("Fdate=2011-6-24&Tdate=2011-09-25")
  //wr.write("Fdate=2011-8-24&Tdate=2011-09-25")
  //wr.write("Fdate=2001-9-18&Tdate=2012-09-25")
  wr.flush()
} finally {
  wr.close()
}

// receiving data as page made of HTML; the response stream is
// closed once the page has been read completely
val response = conn.getInputStream
val page =
  try {
    fromInputStream(response).getLines().mkString("\n")
  } finally {
    response.close()
  }

// received data can be seen with
// println(page)

// regular expression for extracting rows:
//  - the usual . would exclude newlines,
//  - therefore we have to use [\S\s], which really
//    matches everything
//  - *? is the "lazy" version of *
val regex1 = """<tr align="center">[\S\s]*?</tr>""".r

val rows = regex1.findAllIn(page).toList

// data in rows
// println(rows)

// extracting row entries (date, PM_10, SO_2, NO_2)
// the use of (..) allows us to extract the matched text
val regex2 = """<td align="center">([\S\s]*?)</td>""".r

/** Returns the trimmed contents of every `<td align="center">` cell
  * found in `s`, in document order.
  *
  * @param s the HTML text of one table row
  * @return the cell texts; empty if `s` contains no such cell
  */
def extract(s: String) : List[String] =
  regex2.findAllMatchIn(s).map(_.group(1).trim).toList

//data completely extracted
val data = rows.map(extract)

/** Orders two rows by the integer in column `i`, larger value first.
  *
  * Throws if either row lacks column `i` or the cell is not an integer,
  * so it should only be applied to rows already checked with `reading`.
  */
def compare(i: Int)(e: List[String], f: List[String]) = e(i).toInt > f(i).toInt

/** Reads column `i` of `row` as an integer.
  *
  * @return `None` when the row has no column `i` or the cell is not an
  *         integer, so that such rows can be skipped instead of
  *         aborting the whole script
  */
def reading(i: Int)(row: List[String]): Option[Int] =
  row.lift(i).flatMap(cell => Try(cell.toInt).toOption)

/** Returns all rows whose column `i` holds the largest value, in the
  * order in which they appear in `data`; empty if no row has a usable
  * value in that column.
  */
def highest(i: Int): List[List[String]] = {
  val measured = data.flatMap(row => reading(i)(row).map(value => (value, row)).toList)
  if (measured.isEmpty) Nil
  else {
    val top = measured.map(_._1).max
    measured.filter(_._1 == top).map(_._2)
  }
}

/** Formats rows as comma-separated lines, or a placeholder if there are none. */
def show(days: List[List[String]]): String =
  if (days.isEmpty) "(no data)"
  else days.map(_.mkString(",")).mkString("\n")

// For "the day" only the first row with the maximum is printed; this is
// the row a stable descending sort followed by head would select.
println("The day with the highest particle pollution (PM_10)")
println(show(highest(1).take(1)))

println("The day with the highest sulfur dioxide (SO_2)")
println(show(highest(2).take(1)))

println("The day with the highest nitro dioxide (NO_2)")
println(show(highest(3).take(1)))

println("The day(s) with the highest PM_10")
println(show(highest(1)))

println("The day(s) with the highest SO_2")
println(show(highest(2)))

println("The day(s) with the highest NO_2")
println(show(highest(3)))