| author | Christian Urban <urbanc@in.tum.de> | 
| Mon, 30 Sep 2019 12:27:15 +0100 | |
| changeset 638 | e951b9688bb2 | 
| parent 258 | 1e4da6d2490c | 
| permissions | -rw-r--r-- | 
| 
103
 
bea2dd1c7e73
links
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
99 
diff
changeset
 | 
1  | 
// A Web-Scraper that extracts the daily Shanghai pollution  | 
| 
 
bea2dd1c7e73
links
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
99 
diff
changeset
 | 
2  | 
// data from the web-page  | 
| 
 
bea2dd1c7e73
links
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
99 
diff
changeset
 | 
3  | 
//  | 
| 
 
bea2dd1c7e73
links
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
99 
diff
changeset
 | 
4  | 
// http://www.envir.gov.cn/eng/airep/index.asp  | 
| 
 
bea2dd1c7e73
links
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
99 
diff
changeset
 | 
5  | 
//  | 
| 
 
bea2dd1c7e73
links
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
99 
diff
changeset
 | 
6  | 
//  | 
| 
114
 
735f7bbfae9b
added
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
103 
diff
changeset
 | 
7  | 
// Important! They stopped providing this data in November  | 
| 
103
 
bea2dd1c7e73
links
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
99 
diff
changeset
 | 
8  | 
// 2012, but kept the historical data since 2001. So dates  | 
| 
 
bea2dd1c7e73
links
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
99 
diff
changeset
 | 
9  | 
// must be in that range.  | 
| 
 
bea2dd1c7e73
links
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
99 
diff
changeset
 | 
10  | 
|
| 1 | 11  | 
import java.io.OutputStreamWriter  | 
12  | 
import java.net.URL  | 
|
13  | 
import scala.io.Source.fromInputStream  | 
|
14  | 
||
15  | 
val url = new URL("http://www.envir.gov.cn/eng/airep/index.asp")
 | 
|
16  | 
||
| 
250
 
b79e704acb72
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
114 
diff
changeset
 | 
17  | 
// connecting to url  | 
| 1 | 18  | 
val conn = url.openConnection  | 
| 
257
 
70c307641d05
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
250 
diff
changeset
 | 
19  | 
conn.setRequestProperty("User-Agent", "")
 | 
| 1 | 20  | 
conn.setDoOutput(true)  | 
21  | 
conn.connect  | 
|
22  | 
||
| 
250
 
b79e704acb72
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
114 
diff
changeset
 | 
23  | 
// sending data  | 
| 1 | 24  | 
val wr = new OutputStreamWriter(conn.getOutputStream())  | 
| 
103
 
bea2dd1c7e73
links
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
99 
diff
changeset
 | 
25  | 
|
| 
 
bea2dd1c7e73
links
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
99 
diff
changeset
 | 
26  | 
//possible date ranges  | 
| 
258
 
1e4da6d2490c
updated programs
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
257 
diff
changeset
 | 
27  | 
wr.write("Fdate=2011-6-24&Tdate=2011-09-25")
 | 
| 
257
 
70c307641d05
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
250 
diff
changeset
 | 
28  | 
//wr.write("Fdate=2011-8-24&Tdate=2011-09-25")
 | 
| 
250
 
b79e704acb72
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
114 
diff
changeset
 | 
29  | 
//wr.write("Fdate=2001-9-18&Tdate=2012-09-25")
 | 
| 1 | 30  | 
wr.flush  | 
31  | 
wr.close  | 
|
32  | 
||
| 
250
 
b79e704acb72
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
114 
diff
changeset
 | 
33  | 
// receiving data as page made of HTML  | 
| 1 | 34  | 
// Read the whole HTTP response body into a single string, lines joined by "\n".
// The Source wrapping the connection's input stream is closed in `finally`,
// so the stream is released even if reading fails part-way through.
val page = {
  val response = fromInputStream(conn.getInputStream)
  try response.getLines().mkString("\n")
  finally response.close()
}
 | 
35  | 
||
| 
250
 
b79e704acb72
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
114 
diff
changeset
 | 
36  | 
// received data can be seen with  | 
| 
 
b79e704acb72
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
114 
diff
changeset
 | 
37  | 
// println(page)  | 
| 1 | 38  | 
|
| 
250
 
b79e704acb72
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
114 
diff
changeset
 | 
39  | 
// regular expression for extracting rows:  | 
| 
 
b79e704acb72
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
114 
diff
changeset
 | 
40  | 
// - the usual . would exclude newlines,  | 
| 
 
b79e704acb72
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
114 
diff
changeset
 | 
41  | 
// - therefore we have to use [\S\s], which really  | 
| 
 
b79e704acb72
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
114 
diff
changeset
 | 
42  | 
// matches everything  | 
| 
 
b79e704acb72
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
114 
diff
changeset
 | 
43  | 
// - *? is the "lazy" version of *  | 
| 
 
b79e704acb72
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
114 
diff
changeset
 | 
44  | 
val regex1 = """<tr align="center">[\S\s]*?</tr>""".r  | 
| 1 | 45  | 
val rows = regex1.findAllIn(page).toList  | 
46  | 
||
| 
250
 
b79e704acb72
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
114 
diff
changeset
 | 
47  | 
// data in rows  | 
| 
 
b79e704acb72
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
114 
diff
changeset
 | 
48  | 
// println(rows)  | 
| 1 | 49  | 
|
| 
250
 
b79e704acb72
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
114 
diff
changeset
 | 
50  | 
// extracting row entries (date, PM_10, SO_2, NO_2)  | 
| 
 
b79e704acb72
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
114 
diff
changeset
 | 
51  | 
// the use of (..) allows us to extract the matched text  | 
| 
 
b79e704acb72
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
114 
diff
changeset
 | 
52  | 
val regex2 = """<td align="center">([\S\s]*?)</td>""".r  | 
| 
 
b79e704acb72
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
114 
diff
changeset
 | 
53  | 
|
| 
 
b79e704acb72
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
114 
diff
changeset
 | 
54  | 
// Returns, for every match of regex2 in s, the text captured by its
// group (..), with leading and trailing whitespace removed.
def extract(s: String) : List[String] =
  regex2.findAllMatchIn(s).map(m => m.group(1).trim).toList
57  | 
||
| 
114
 
735f7bbfae9b
added
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
103 
diff
changeset
 | 
58  | 
//data completely extracted  | 
| 
250
 
b79e704acb72
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
114 
diff
changeset
 | 
59  | 
val data = rows.map(extract)  | 
| 1 | 60  | 
|
| 
114
 
735f7bbfae9b
added
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
103 
diff
changeset
 | 
61  | 
// for comparing the i-th entries of two rows (lists of strings)  | 
| 
250
 
b79e704acb72
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
114 
diff
changeset
 | 
62  | 
// Tells whether row e holds a strictly larger integer than row f in column i.
// Both cells are parsed as Int after trimming, because toInt rejects
// surrounding whitespace. Throws NumberFormatException if a cell is not an
// integer, and IndexOutOfBoundsException if a row has no column i.
def compare(i: Int)(e: List[String], f: List[String]): Boolean =
  e(i).trim.toInt > f(i).trim.toInt
| 1 | 63  | 
|
| 
250
 
b79e704acb72
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
114 
diff
changeset
 | 
64  | 
println("The day with the highest particle pollution (PM_10)")
 | 
| 
 
b79e704acb72
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
114 
diff
changeset
 | 
65  | 
println(data.sortWith(compare(1)).head.mkString(","))
 | 
| 1 | 66  | 
|
| 
250
 
b79e704acb72
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
114 
diff
changeset
 | 
67  | 
println("The day with the highest sulfur dioxide (SO_2)")
 | 
| 
 
b79e704acb72
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
114 
diff
changeset
 | 
68  | 
println(data.sortWith(compare(2)).head.mkString(","))
 | 
| 
114
 
735f7bbfae9b
added
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
103 
diff
changeset
 | 
69  | 
|
| 
250
 
b79e704acb72
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
114 
diff
changeset
 | 
70  | 
// heading for the NO_2 result; NO_2 is nitrogen dioxide
println("The day with the highest nitrogen dioxide (NO_2)")
 | 
| 
 
b79e704acb72
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
114 
diff
changeset
 | 
71  | 
println(data.sortWith(compare(3)).head.mkString(","))
 | 
| 1 | 72  | 
|
| 
250
 
b79e704acb72
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
114 
diff
changeset
 | 
73  | 
println("The day(s) with the highest PM_10")
 | 
| 
114
 
735f7bbfae9b
added
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
103 
diff
changeset
 | 
74  | 
val groups1 = data.groupBy(_(1).toInt)  | 
| 
 
735f7bbfae9b
added
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
103 
diff
changeset
 | 
75  | 
val max_key1 = groups1.keySet.max  | 
| 
 
735f7bbfae9b
added
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
103 
diff
changeset
 | 
76  | 
println(groups1(max_key1).map(_.mkString(",")).mkString("\n"))
 | 
| 1 | 77  | 
|
| 
250
 
b79e704acb72
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
114 
diff
changeset
 | 
78  | 
println("The day(s) with the highest SO_2")
 | 
| 
114
 
735f7bbfae9b
added
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
103 
diff
changeset
 | 
79  | 
val groups2 = data.groupBy(_(2).toInt)  | 
| 
 
735f7bbfae9b
added
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
103 
diff
changeset
 | 
80  | 
val max_key2 = groups2.keySet.max  | 
| 
 
735f7bbfae9b
added
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
103 
diff
changeset
 | 
81  | 
println(groups2(max_key2).map(_.mkString(",")).mkString("\n"))
 | 
| 
 
735f7bbfae9b
added
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
103 
diff
changeset
 | 
82  | 
|
| 
250
 
b79e704acb72
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
114 
diff
changeset
 | 
83  | 
println("The day(s) with the highest NO_2")
 | 
| 
114
 
735f7bbfae9b
added
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
103 
diff
changeset
 | 
84  | 
val groups3 = data.groupBy(_(3).toInt)  | 
| 
 
735f7bbfae9b
added
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
103 
diff
changeset
 | 
85  | 
val max_key3 = groups3.keySet.max  | 
| 
 
735f7bbfae9b
added
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
103 
diff
changeset
 | 
86  | 
println(groups3(max_key3).map(_.mkString(",")).mkString("\n"))
 |