progs/scraper.scala
author Christian Urban <christian dot urban at kcl dot ac dot uk>
Fri, 07 Nov 2014 16:01:50 +0000
changeset 301 e8c0269c8ff5
parent 258 1e4da6d2490c
permissions -rw-r--r--
update

// A Web-Scraper that extracts the daily Shanghai pollution
// data from the web-page
//
//   http://www.envir.gov.cn/eng/airep/index.asp
//
//
// Important! They stopped providing this data in November
// 2012, but kept the historical data since 2001. So dates
// must be in that range.

import java.io.OutputStreamWriter
import java.net.URL
import scala.io.Source.fromInputStream

// address of the page that serves the air-quality report
val url = new URL("http://www.envir.gov.cn/eng/airep/index.asp")

// connecting to url; setDoOutput(true) is needed so that a
// request body (the date range below) can be written
val conn = url.openConnection()
conn.setRequestProperty("User-Agent", "")
conn.setDoOutput(true)
conn.connect()

// sending data; try/finally guarantees that the writer is
// closed even when writing the request body fails
val wr = new OutputStreamWriter(conn.getOutputStream())
try {
  //possible date ranges
  wr.write("Fdate=2011-6-24&Tdate=2011-09-25")
  //wr.write("Fdate=2011-8-24&Tdate=2011-09-25")
  //wr.write("Fdate=2001-9-18&Tdate=2012-09-25")
  wr.flush()
} finally {
  wr.close()
}

// receiving data as page made of HTML; the Source is closed
// once all lines are read, so the response stream is not leaked
val page = {
  val response = fromInputStream(conn.getInputStream)
  try response.getLines().mkString("\n")
  finally response.close()
}

// received data can be seen with
// println(page)

// pattern for one table row of the page:
// - a plain . does not match newlines, so [\S\s] is used
//   to match any character, line breaks included
// - *? is the non-greedy ("lazy") form of *, so each match
//   ends at the first closing </tr>
val regex1 = """<tr align="center">[\S\s]*?</tr>""".r
val rows = regex1.findAllMatchIn(page).map(_.matched).toList

// data in rows
// println(rows)

// pattern for one cell of a row (date, PM_10, SO_2, NO_2);
// the group (..) captures the text between the td-tags
val regex2 = """<td align="center">([\S\s]*?)</td>""".r

// returns the trimmed contents of all cells found in s,
// in the order in which they occur
def extract(s: String) : List[String] =
  regex2.findAllMatchIn(s).map(cell => cell.group(1).trim).toList

// every row turned into the list of its cell contents
val data = for (row <- rows) yield extract(row)

// ordering test for two rows: true when the integer stored at
// position i of e is strictly larger than the one at position i of f
def compare(i: Int)(e: List[String], f: List[String]) = {
  val left = e(i).toInt
  val right = f(i).toInt
  left > right
}

// Reports for the columns 1 (PM_10), 2 (SO_2) and 3 (NO_2) of data.
// NOTE(review): every row is presumed to hold a date followed by
// three integer readings; rows of another shape still fail in
// apply / toInt -- confirm against the page format.

// renders rows as comma-separated lines; an empty result is
// reported explicitly instead of crashing the script
def render(found: List[List[String]]): String =
  if (found.isEmpty) "(no data)"
  else found.map(_.mkString(",")).mkString("\n")

// the row ranked first when data is sorted by column i in
// descending order; no row at all if data is empty
def highestDay(i: Int): List[List[String]] =
  data.sortWith(compare(i)).headOption.toList

// all rows sharing the largest value in column i;
// no rows at all if data is empty
def highestDays(i: Int): List[List[String]] =
  if (data.isEmpty) Nil
  else {
    val groups = data.groupBy(_(i).toInt)
    groups(groups.keySet.max)
  }

println("The day with the highest particle pollution (PM_10)")
println(render(highestDay(1)))

println("The day with the highest sulfur dioxide (SO_2)")
println(render(highestDay(2)))

println("The day with the highest nitro dioxide (NO_2)")
println(render(highestDay(3)))

println("The day(s) with the highest PM_10")
println(render(highestDays(1)))

println("The day(s) with the highest SO_2")
println(render(highestDays(2)))

println("The day(s) with the highest NO_2")
println(render(highestDays(3)))