103
Christian Urban <christian dot urban at kcl dot ac dot uk>
diff
changeset
|
1 |
// A Web-Scraper that extracts the daily Shanghai pollution
|
Christian Urban <christian dot urban at kcl dot ac dot uk>
diff
changeset
|
2 |
// data from the web-page
|
Christian Urban <christian dot urban at kcl dot ac dot uk>
diff
changeset
|
3 |
//
|
Christian Urban <christian dot urban at kcl dot ac dot uk>
diff
changeset
|
4 |
// http://www.envir.gov.cn/eng/airep/index.asp
|
Christian Urban <christian dot urban at kcl dot ac dot uk>
diff
changeset
|
5 |
//
|
Christian Urban <christian dot urban at kcl dot ac dot uk>
diff
changeset
|
6 |
//
|
114
Christian Urban <christian dot urban at kcl dot ac dot uk>
diff
changeset
|
7 |
// Important! They stopped providing this data in November
|
103
Christian Urban <christian dot urban at kcl dot ac dot uk>
diff
changeset
|
8 |
// 2012, but kept the historical data since 2001. So dates
|
Christian Urban <christian dot urban at kcl dot ac dot uk>
diff
changeset
|
9 |
// must be in that range.
|
Christian Urban <christian dot urban at kcl dot ac dot uk>
diff
changeset
|
10 |
|
1
|
11 |
import java.io.OutputStreamWriter
|
|
12 |
import java.net.URL
|
|
13 |
import scala.io.Source.fromInputStream
|
|
14 |
|
|
15 |
val url = new URL("http://www.envir.gov.cn/eng/airep/index.asp")

//connecting to url
// setDoOutput(true) is required before a request body can be written to
// the connection's output stream.
val conn = url.openConnection()
conn.setRequestProperty("User-Agent", "")
conn.setDoOutput(true)
conn.connect()

//sending data
// The writer is closed in a finally block so the stream is released even
// when writing or flushing the form data fails.
val wr = new OutputStreamWriter(conn.getOutputStream())
try {
  //possible date ranges
  wr.write("Fdate=2012-8-24&Tdate=2012-09-25")
  //wr.write("Fdate=2001-9-18&Tdate=2012-09-24")
  wr.flush()
} finally {
  wr.close()
}

//receiving data
// The whole response is read into one string (lines joined by "\n") and
// the underlying Source is closed afterwards, whether or not reading succeeds.
val response = fromInputStream(conn.getInputStream)
val page =
  try response.getLines().mkString("\n")
  finally response.close()
|
|
34 |
|
114
Christian Urban <christian dot urban at kcl dot ac dot uk>
diff
changeset
|
35 |
//data encoded as an HTML-string, which you can see with
|
103
Christian Urban <christian dot urban at kcl dot ac dot uk>
diff
changeset
|
36 |
//println(page)
|
1
|
37 |
|
114
Christian Urban <christian dot urban at kcl dot ac dot uk>
diff
changeset
|
38 |
// regular expression: the wildcard '.' excludes newlines,
|
1
|
39 |
// therefore we have to use [\S\s]
|
114
Christian Urban <christian dot urban at kcl dot ac dot uk>
diff
changeset
|
40 |
// Pattern for a single centred table row; the lazy [\S\s]*? stops at the
// first closing </tr> and also spans line breaks inside the row.
val regex1 = """<tr align=\"center\">[\S\s]*?</tr>""".r

// The text of every row matched in the downloaded page, in document order.
val rows: List[String] = regex1.findAllMatchIn(page).map(_.matched).toList
|
|
42 |
|
7
|
43 |
//print(rows)
|
1
|
44 |
|
114
Christian Urban <christian dot urban at kcl dot ac dot uk>
diff
changeset
|
45 |
// Pattern for a single centred table cell; the capture group holds the raw
// cell text, and the lazy [\S\s]*? stops at the first closing </td>.
val regex2 = """<td align=\"center\">([\S\s]*?)</td>""".r

/** Extracts the contents of all centred cells found in `s`.
  *
  * @param s an HTML fragment, typically one table row
  * @return the whitespace-trimmed text of each matching cell, in order of
  *         appearance; empty when `s` contains no matching cell
  */
def aux(s: String) : Array[String] =
  regex2.findAllMatchIn(s).map(cell => cell.group(1).trim).toArray
|
|
52 |
|
114
Christian Urban <christian dot urban at kcl dot ac dot uk>
diff
changeset
|
53 |
// Fully extracted table: one array of trimmed cell strings per matched row.
val data: List[Array[String]] = rows.map(row => aux(row))
|
|
55 |
|
114
Christian Urban <christian dot urban at kcl dot ac dot uk>
diff
changeset
|
56 |
//for comparing elements from an array
|
1
|
57 |
/** Less-than test on column `i` of two rows, with the cells read as integers.
  *
  * @param i index of the column to compare
  * @param e left-hand row
  * @param f right-hand row
  * @return true when `e(i)`, parsed as an Int, is strictly smaller than `f(i)`
  */
def compare(i: Int)(e: Array[String], f: Array[String]): Boolean = {
  val left = e(i).toInt
  val right = f(i).toInt
  left < right
}
|
|
58 |
|
114
Christian Urban <christian dot urban at kcl dot ac dot uk>
diff
changeset
|
59 |
// Prints `title`, then the row that sorts last when `data` is ordered by the
// integer value in `column`, rendered as comma-separated cells.
// lastOption is used instead of last so that an empty `data` (no rows
// scraped) produces a notice rather than a NoSuchElementException.
def reportHighest(title: String, column: Int): Unit = {
  println(title)
  val highest = data.sortWith(compare(column)).lastOption
  println(highest.map(_.mkString(",")).getOrElse("no data available"))
}

reportHighest("The day with highest particle pollution (PM_10)", 1)

reportHighest("The day with highest sulfur dioxide (SO_2)", 2)

// Label corrected: NO_2 is nitrogen dioxide.
reportHighest("The day with highest nitrogen dioxide (NO_2)", 3)
|
1
|
67 |
|
114
Christian Urban <christian dot urban at kcl dot ac dot uk>
diff
changeset
|
68 |
// Prints `title`, then every row whose integer value in `column` equals the
// largest value found in that column (so ties are all listed), one
// comma-separated row per line.
// The isEmpty guard avoids calling max on an empty key collection, which
// throws when no rows were scraped.
def reportAllHighest(title: String, column: Int): Unit = {
  println(title)
  val byValue = data.groupBy(row => row(column).toInt)
  if (byValue.isEmpty) {
    println("no data available")
  } else {
    val top = byValue(byValue.keys.max)
    println(top.map(_.mkString(",")).mkString("\n"))
  }
}

reportAllHighest("The day(s) with highest PM_10", 1)

reportAllHighest("The day(s) with highest SO_2", 2)

reportAllHighest("The day(s) with highest NO_2", 3)
|