author | Christian Urban <christian dot urban at kcl dot ac dot uk> |
Thu, 26 Sep 2013 10:52:54 +0100 | |
changeset 103 | bea2dd1c7e73 |
parent 99 | 91145f6d9b0e |
child 114 | 735f7bbfae9b |
permissions | -rw-r--r-- |
103
bea2dd1c7e73
links
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
99
diff
changeset
|
1 |
// A Web-Scraper that extracts the daily Shanghai pollution |
bea2dd1c7e73
links
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
99
diff
changeset
|
2 |
// data from the web-page |
bea2dd1c7e73
links
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
99
diff
changeset
|
3 |
// |
bea2dd1c7e73
links
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
99
diff
changeset
|
4 |
// http://www.envir.gov.cn/eng/airep/index.asp |
bea2dd1c7e73
links
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
99
diff
changeset
|
5 |
// |
bea2dd1c7e73
links
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
99
diff
changeset
|
6 |
// |
bea2dd1c7e73
links
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
99
diff
changeset
|
7 |
// Important! They stopped providing this data in November |
bea2dd1c7e73
links
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
99
diff
changeset
|
8 |
// 2012, but kept the historical data since 2001. So dates |
bea2dd1c7e73
links
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
99
diff
changeset
|
9 |
// must be in that range. |
bea2dd1c7e73
links
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
99
diff
changeset
|
10 |
|
1 | 11 |
import java.io.OutputStreamWriter |
12 |
import java.net.URL |
|
13 |
import scala.io.Source.fromInputStream |
|
14 |
||
15 |
// Address of the air-quality report page that the date-range query is sent to.
val url = new URL("http://www.envir.gov.cn/eng/airep/index.asp")
|
16 |
||
103
bea2dd1c7e73
links
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
99
diff
changeset
|
17 |
// Connecting to the URL: an empty User-Agent header is set and output is
// enabled before connecting, because the query is written to the request body.
val conn = url.openConnection()
conn.setRequestProperty("User-Agent", "")
conn.setDoOutput(true)
conn.connect()

// Sending data: a writer over the connection's output stream.
val wr = new OutputStreamWriter(conn.getOutputStream())
|
103
bea2dd1c7e73
links
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
99
diff
changeset
|
25 |
|
bea2dd1c7e73
links
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
99
diff
changeset
|
26 |
//possible date ranges |
bea2dd1c7e73
links
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
99
diff
changeset
|
27 |
// Query written as the request body; Fdate and Tdate carry the date range.
// NOTE(review): Fdate (2012-9-24) is later than Tdate (2012-08-25), whereas the
// commented-out alternatives have Fdate before Tdate. Presumably the two dates
// are swapped here; confirm against the form before changing.
wr.write("Fdate=2012-9-24&Tdate=2012-08-25")
99
91145f6d9b0e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
93
diff
changeset
|
28 |
//wr.write("Fdate=2012-9-18&Tdate=2012-09-24") |
103
bea2dd1c7e73
links
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
99
diff
changeset
|
29 |
//wr.write("Fdate=2001-5-18&Tdate=2012-09-25") |
1 | 30 |
// Flush the written query and close the writer.
wr.flush()
wr.close()
|
32 |
||
33 |
// Receiving data: the whole response is read into one string, with the lines
// joined by "\n". The Source is closed once the lines have been consumed so
// that the underlying response stream is not left open.
val page = {
  val src = fromInputStream(conn.getInputStream)
  try src.getLines().mkString("\n") finally src.close()
}
|
35 |
||
103
bea2dd1c7e73
links
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
99
diff
changeset
|
36 |
//data encoded as a string, which you can see with |
bea2dd1c7e73
links
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
99
diff
changeset
|
37 |
//println(page) |
1 | 38 |
|
39 |
// In a regular expression . excludes newlines, therefore [\S\s] is used so
// that a single match can span several lines. Every match is the text of one
// <tr align="center"> ... </tr> element of the page.
val regex1 = """<tr align="center">[\S\s]*?</tr>""".r
val rows: List[String] = regex1.findAllIn(page).toList
|
43 |
||
7 | 44 |
//print(rows) |
1 | 45 |
|
46 |
// Pattern for one table cell. The group captures the cell content; [\S\s]
// lets that content span several lines.
val regex2 = """<td align="center">([\S\s]*?)</td>""".r

/** Extracts the contents of all cells found in the text of one table row.
  *
  * The captured group of every match is read directly, so the matched text
  * does not have to be matched against the pattern a second time.
  *
  * @param s text to search, typically one row of the page
  * @return the trimmed content of every `<td align="center">` cell in `s`,
  *         in order of appearance; an empty array if there is none
  */
def aux(s: String): Array[String] =
  regex2.findAllMatchIn(s).map(_.group(1).trim).toArray
|
53 |
||
54 |
// One array of trimmed cell values per table row.
val data = rows.map(aux)
|
55 |
||
56 |
/** Less-than test on two rows, based on the integer stored in column `i`.
  *
  * The cells are converted with `toInt`, so the comparison is numeric rather
  * than lexicographic ("5" is smaller than "10").
  *
  * @param i index of the column to compare
  * @param e first row
  * @param f second row
  * @return true if the value of `e` in column `i` is smaller than that of `f`
  * @throws NumberFormatException if either cell is not an integer
  * @throws ArrayIndexOutOfBoundsException if a row has no column `i`
  */
def compare(i: Int)(e: Array[String], f: Array[String]): Boolean =
  e(i).toInt < f(i).toInt
|
57 |
||
58 |
//day with highest particle pollution (PM_10) |
|
99
91145f6d9b0e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
93
diff
changeset
|
59 |
// The selected row is an Array, whose default string form is only an object
// reference; its cells are therefore joined explicitly before printing.
println(data.sortWith(compare(1)).last.mkString(", "))
1 | 60 |
|
61 |
//day with highest sulfur dioxide (SO_2) |
|
99
91145f6d9b0e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
93
diff
changeset
|
62 |
// The selected row is an Array, whose default string form is only an object
// reference; its cells are therefore joined explicitly before printing.
println(data.sortWith(compare(2)).last.mkString(", "))
1 | 63 |
|
64 |
//day with highest nitrogen dioxide (NO_2) |
|
99
91145f6d9b0e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
93
diff
changeset
|
65 |
// The selected row is an Array, whose default string form is only an object
// reference; its cells are therefore joined explicitly before printing.
println(data.sortWith(compare(3)).last.mkString(", "))
1 | 66 |
|
67 |
// Days with highest PM_10: the rows are grouped by the integer in column 1,
// and the largest of those values is taken as the key of interest.
val groups = data.groupBy(_(1).toInt)
val max_key = groups.keySet.max
|
99
91145f6d9b0e
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
93
diff
changeset
|
70 |
// Each row is an Array, which would print as an object reference; every row
// is rendered on its own line with its cells joined explicitly.
println(groups(max_key).map(_.mkString(", ")).mkString("\n"))