progs/scraper.scala
author Christian Urban <christian dot urban at kcl dot ac dot uk>
Wed, 25 Sep 2013 20:35:54 +0100 (2013-09-25)
changeset 99 91145f6d9b0e
parent 93 4794759139ea
child 103 bea2dd1c7e73
permissions -rw-r--r--
added
import java.io.OutputStreamWriter
import java.net.URL
import scala.io.Source.fromInputStream

// Address of the air-quality report page that is queried below.
val url = new URL("http://www.envir.gov.cn/eng/airep/index.asp")

// Connect to the url. setDoOutput(true) is needed so that a request body
// can be written to the connection afterwards.
val conn = url.openConnection()
conn.setRequestProperty("User-Agent", "")
conn.setDoOutput(true)
conn.connect()

// Send the form data (the requested date range). The writer is closed in a
// finally clause so it is released even when writing fails.
val wr = new OutputStreamWriter(conn.getOutputStream())
try {
  //wr.write("Fdate=2012-9-24&Tdate=2013-08-25")
  //wr.write("Fdate=2012-9-18&Tdate=2012-09-24")
  wr.write("Fdate=2001-5-18&Tdate=2012-09-25")
  wr.flush()
} finally {
  wr.close()
}

// Receive the response and join its lines into one string. The underlying
// stream is closed once the whole page has been read.
val page = {
  val response = conn.getInputStream
  try fromInputStream(response).getLines().mkString("\n")
  finally response.close()
}

println(page)

// In a regular expression "." does not match newlines, so [\S\s] is used
// instead to let a single table row span several lines.
val regex1 = """<tr align="center">[\S\s]*?</tr>""".r

// Text of every centred table row found in the page, in document order.
val rows = regex1.findAllMatchIn(page).map(_.matched).toList

//print(rows)

// Matches one centred table cell and captures its content. The capture uses
// [\S\s] rather than "." so that a cell's content may span several lines.
val regex2 = """<td align="center">([\S\s]*?)</td>""".r

// Returns the whitespace-trimmed content of every centred cell found in s,
// in order of appearance; the result is empty when s contains no such cell.
def aux(s: String) : Array[String] =
  regex2.findAllMatchIn(s).map(cell => cell.group(1).trim).toArray

// One array of cell contents per table row, in the order the rows were found.
val data = for (row <- rows) yield aux(row)

// Ordering predicate for sortWith: true when column i of row e, read as an
// integer, is strictly smaller than column i of row f. Throws if either row
// has no column i or the cell is not a valid integer.
def compare(i: Int)(e: Array[String], f: Array[String]): Boolean = {
  val left = e(i).toInt
  val right = f(i).toInt
  left < right
}

// Renders one table row for printing. Arrays inherit Object.toString on the
// JVM, so printing a row directly would show something like
// "[Ljava.lang.String;@1b2c3d" instead of its cells.
def showRow(row: Array[String]): String = row.mkString(", ")

// NOTE: .last and .max below throw if no rows were scraped.

//day with highest particle pollution (PM_10)
println(showRow(data.sortWith(compare(1)).last))

//day with highest sulfur dioxide (SO_2)
println(showRow(data.sortWith(compare(2)).last))

//day with highest nitrogen dioxide (NO_2)
println(showRow(data.sortWith(compare(3)).last))

//days with highest PM_10: group the rows by their PM_10 value and print
//every row of the group with the largest value, one row per line
val groups = data.groupBy(_(1).toInt)
val max_key = groups.keySet.max
println(groups(max_key).map(showRow).mkString("\n"))