|
1 // A Web-Scraper that extracts the daily Shanghai pollution |
|
2 // data from the web-page |
|
3 // |
|
4 // http://www.envir.gov.cn/eng/airep/index.asp |
|
5 // |
|
6 // |
|
7 // Important! They stopped providing this data in November |
|
8 // 2012, but kept the historical data since 2001. So dates |
|
9 // must be in that range. |
|
10 |
1 import java.io.OutputStreamWriter |
11 import java.io.OutputStreamWriter |
2 import java.net.URL |
12 import java.net.URL |
3 import scala.io.Source.fromInputStream |
13 import scala.io.Source.fromInputStream |
4 |
14 |
5 val url = new URL("http://www.envir.gov.cn/eng/airep/index.asp") |
15 val url = new URL("http://www.envir.gov.cn/eng/airep/index.asp") |
6 |
16 |
7 //connect to url |
17 //connecting to url |
8 val conn = url.openConnection |
18 val conn = url.openConnection |
9 conn.setRequestProperty("User-Agent", "") |
19 conn.setRequestProperty("User-Agent", "") |
10 conn.setDoOutput(true) |
20 conn.setDoOutput(true) |
11 conn.connect |
21 conn.connect |
12 |
22 |
13 //sending data |
23 //sending data |
14 val wr = new OutputStreamWriter(conn.getOutputStream()) |
24 val wr = new OutputStreamWriter(conn.getOutputStream()) |
15 //wr.write("Fdate=2012-9-24&Tdate=2013-08-25") |
25 |
|
26 //possible date ranges |
|
27 wr.write("Fdate=2012-9-24&Tdate=2012-08-25") |
16 //wr.write("Fdate=2012-9-18&Tdate=2012-09-24") |
28 //wr.write("Fdate=2012-9-18&Tdate=2012-09-24") |
17 wr.write("Fdate=2001-5-18&Tdate=2012-09-25") |
29 //wr.write("Fdate=2001-5-18&Tdate=2012-09-25") |
18 wr.flush |
30 wr.flush |
19 wr.close |
31 wr.close |
20 |
32 |
21 //receiving data |
33 //receiving data |
22 val page = fromInputStream(conn.getInputStream).getLines.mkString("\n") |
34 val page = fromInputStream(conn.getInputStream).getLines.mkString("\n") |
23 |
35 |
24 println(page) |
36 //data encoded as a string, which you can see with |
|
37 //println(page) |
25 |
38 |
26 // regular expression . excludes newlines, |
39 // regular expression . excludes newlines, |
27 // therefore we have to use [\S\s] |
40 // therefore we have to use [\S\s] |
28 val regex1 = """<tr align="center">[\S\s]*?</tr>""".r |
41 val regex1 = """<tr align="center">[\S\s]*?</tr>""".r |
29 val rows = regex1.findAllIn(page).toList |
42 val rows = regex1.findAllIn(page).toList |