author | Christian Urban <christian dot urban at kcl dot ac dot uk> |
Tue, 04 Oct 2016 12:00:23 +0100 | |
changeset 440 | e14cd32ad497 |
parent 258 | 1e4da6d2490c |
permissions | -rw-r--r-- |
103
bea2dd1c7e73
links
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
99
diff
changeset
|
1 |
// A Web-Scraper that extracts the daily Shanghai pollution |
bea2dd1c7e73
links
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
99
diff
changeset
|
2 |
// data from the web-page |
bea2dd1c7e73
links
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
99
diff
changeset
|
3 |
// |
bea2dd1c7e73
links
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
99
diff
changeset
|
4 |
// http://www.envir.gov.cn/eng/airep/index.asp |
bea2dd1c7e73
links
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
99
diff
changeset
|
5 |
// |
bea2dd1c7e73
links
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
99
diff
changeset
|
6 |
// |
114
735f7bbfae9b
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
103
diff
changeset
|
7 |
// Important! They stopped providing this data in November |
103
bea2dd1c7e73
links
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
99
diff
changeset
|
8 |
// 2012, but kept the historical data since 2001. So dates |
bea2dd1c7e73
links
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
99
diff
changeset
|
9 |
// must be in that range. |
bea2dd1c7e73
links
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
99
diff
changeset
|
10 |
|
1 | 11 |
import java.io.OutputStreamWriter |
12 |
import java.net.URL |
|
13 |
import scala.io.Source.fromInputStream |
|
14 |
||
15 |
val url = new URL("http://www.envir.gov.cn/eng/airep/index.asp")

// connecting to url
val conn = url.openConnection()
conn.setRequestProperty("User-Agent", "")
// output must be enabled before the request body can be written
conn.setDoOutput(true)
conn.connect()

// sending data
val wr = new OutputStreamWriter(conn.getOutputStream())
try {
  // possible date ranges
  wr.write("Fdate=2011-6-24&Tdate=2011-09-25")
  //wr.write("Fdate=2011-8-24&Tdate=2011-09-25")
  //wr.write("Fdate=2001-9-18&Tdate=2012-09-25")
  wr.flush()
} finally {
  // release the writer even when writing the form data fails
  wr.close()
}

// receiving data as page made of HTML;
// the lines of the response are re-joined with "\n" into a single string
val page = {
  val response = fromInputStream(conn.getInputStream)
  // close the source (and its underlying stream) once the text is read
  try response.getLines().mkString("\n") finally response.close()
}
35 |
||
250
b79e704acb72
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
114
diff
changeset
|
36 |
// received data can be seen with |
b79e704acb72
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
114
diff
changeset
|
37 |
// println(page) |
1 | 38 |
|
250
b79e704acb72
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
114
diff
changeset
|
39 |
// regular expression for extracting rows: |
b79e704acb72
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
114
diff
changeset
|
40 |
// - the usual . would exclude newlines, |
b79e704acb72
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
114
diff
changeset
|
41 |
// - therefore we have to use [\S\s], which really |
b79e704acb72
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
114
diff
changeset
|
42 |
// matches everything |
b79e704acb72
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
114
diff
changeset
|
43 |
// - *? is the "lazy" version of * |
b79e704acb72
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
114
diff
changeset
|
44 |
// one table row: from the opening <tr align="center"> to the nearest </tr>
val regex1 = """<tr align="center">[\S\s]*?</tr>""".r

// the raw HTML text of every matched row, in the order found in the page
val rows = regex1.findAllMatchIn(page).map(_.matched).toList
46 |
||
250
b79e704acb72
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
114
diff
changeset
|
47 |
// data in rows |
b79e704acb72
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
114
diff
changeset
|
48 |
// println(rows) |
1 | 49 |
|
250
b79e704acb72
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
114
diff
changeset
|
50 |
// extracting row entries (date, PM_10, SO_2, NO_2) |
b79e704acb72
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
114
diff
changeset
|
51 |
// the use of (..) allows us to extract the matched text |
b79e704acb72
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
114
diff
changeset
|
52 |
val regex2 = """<td align="center">([\S\s]*?)</td>""".r

/** Extracts the contents of every `<td align="center">...</td>` cell in `s`.
  *
  * @param s HTML text to search (in this script: the text of one table row)
  * @return the text captured between the tags of each cell, trimmed of
  *         surrounding whitespace, in order of appearance; the empty list
  *         when `s` contains no such cell
  */
def extract(s: String) : List[String] = {
  // findAllMatchIn gives access to the capture group of each match, so a
  // cell is matched once instead of being found and then matched again.
  regex2.findAllMatchIn(s).map(_.group(1).trim).toList
}
57 |
||
114
735f7bbfae9b
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
103
diff
changeset
|
58 |
//data completely extracted |
250
b79e704acb72
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
114
diff
changeset
|
59 |
// one list of cell texts per scraped row
val data = for (row <- rows) yield extract(row)
1 | 60 |
|
114
735f7bbfae9b
added
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
103
diff
changeset
|
61 |
// for comparing two rows (lists of strings) by the integer value at a given index |
250
b79e704acb72
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
114
diff
changeset
|
62 |
/** Tells whether row `e` holds a strictly larger integer than row `f` in column `i`.
  *
  * The first parameter list is separate so that `compare(i)` can be handed
  * to `sortWith` as the ordering function.
  *
  * @param i index of the column to compare
  * @param e first row
  * @param f second row
  * @return `true` when `e(i)`, read as an `Int`, is greater than `f(i)`
  */
def compare(i: Int)(e: List[String], f: List[String]): Boolean = {
  val left = e(i).toInt
  val right = f(i).toInt
  left > right
}
1 | 63 |
|
250
b79e704acb72
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
114
diff
changeset
|
64 |
// Prints the row holding the largest integer in column `i`, with its fields
// joined by commas. One left-to-right pass replaces sorting the whole list
// just to take its head; on ties the earliest row is kept, which is the row
// the stable sort placed first. Prints nothing when `data` is empty.
def printMaxRow(i: Int): Unit =
  data
    .reduceOption((best, row) => if (compare(i)(row, best)) row else best)
    .foreach(row => println(row.mkString(",")))

println("The day with the highest particle pollution (PM_10)")
printMaxRow(1)

println("The day with the highest sulfur dioxide (SO_2)")
printMaxRow(2)

println("The day with the highest nitro dioxide (NO_2)")
printMaxRow(3)
1 | 72 |
|
250
b79e704acb72
updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents:
114
diff
changeset
|
73 |
// Prints every row whose integer in column `i` equals the largest value of
// that column, one comma-joined row per line. Rows are grouped by the parsed
// value and the group under the largest key is printed. Prints nothing when
// `data` is empty, since taking the max of no keys would throw.
def printRowsWithMax(i: Int): Unit = {
  val groups = data.groupBy(_(i).toInt)
  if (groups.nonEmpty) {
    val maxKey = groups.keySet.max
    println(groups(maxKey).map(_.mkString(",")).mkString("\n"))
  }
}

println("The day(s) with the highest PM_10")
printRowsWithMax(1)

println("The day(s) with the highest SO_2")
printRowsWithMax(2)

println("The day(s) with the highest NO_2")
printRowsWithMax(3)