| author | cu | 
| Tue, 17 Oct 2017 13:49:45 +0100 | |
| changeset 520 | fff0c9cab8d0 | 
| parent 258 | 1e4da6d2490c | 
| permissions | -rw-r--r-- | 
| 103 
bea2dd1c7e73
links
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
99diff
changeset | 1 | // A Web-Scraper that extracts the daily Shanghai pollution | 
| 
bea2dd1c7e73
links
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
99diff
changeset | 2 | // data from the web-page | 
| 
bea2dd1c7e73
links
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
99diff
changeset | 3 | // | 
| 
bea2dd1c7e73
links
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
99diff
changeset | 4 | // http://www.envir.gov.cn/eng/airep/index.asp | 
| 
bea2dd1c7e73
links
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
99diff
changeset | 5 | // | 
| 
bea2dd1c7e73
links
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
99diff
changeset | 6 | // | 
| 114 
735f7bbfae9b
added
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
103diff
changeset | 7 | // Important! They stopped providing this data in November | 
| 103 
bea2dd1c7e73
links
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
99diff
changeset | 8 | // 2012, but kept the historical data since 2001. So dates | 
| 
bea2dd1c7e73
links
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
99diff
changeset | 9 | // must be in that range. | 
| 
bea2dd1c7e73
links
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
99diff
changeset | 10 | |
| 1 | 11 | import java.io.OutputStreamWriter | 
| 12 | import java.net.URL | |
| 13 | import scala.io.Source.fromInputStream | |
| 14 | ||
| 15 | val url = new URL("http://www.envir.gov.cn/eng/airep/index.asp")
 | |
| 16 | ||
| 250 
b79e704acb72
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
114diff
changeset | 17 | // connecting to url | 
| 1 | 18 | val conn = url.openConnection | 
| 257 
70c307641d05
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
250diff
changeset | 19 | conn.setRequestProperty("User-Agent", "")
 | 
| 1 | 20 | conn.setDoOutput(true) | 
| 21 | conn.connect | |
| 22 | ||
| 250 
b79e704acb72
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
114diff
changeset | 23 | // sending data | 
| 1 | 24 | val wr = new OutputStreamWriter(conn.getOutputStream()) | 
| 103 
bea2dd1c7e73
links
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
99diff
changeset | 25 | |
| 
bea2dd1c7e73
links
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
99diff
changeset | 26 | //possible date ranges | 
| 258 
1e4da6d2490c
updated programs
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
257diff
changeset | 27 | wr.write("Fdate=2011-6-24&Tdate=2011-09-25")
 | 
| 257 
70c307641d05
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
250diff
changeset | 28 | //wr.write("Fdate=2011-8-24&Tdate=2011-09-25")
 | 
| 250 
b79e704acb72
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
114diff
changeset | 29 | //wr.write("Fdate=2001-9-18&Tdate=2012-09-25")
 | 
| 1 | 30 | wr.flush | 
| 31 | wr.close | |
| 32 | ||
| 250 
b79e704acb72
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
114diff
changeset | 33 | // receiving data as page made of HTML | 
| 1 | 34 | val page = fromInputStream(conn.getInputStream).getLines.mkString("\n")
 | 
| 35 | ||
| 250 
b79e704acb72
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
114diff
changeset | 36 | // received data can be seen with | 
| 
b79e704acb72
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
114diff
changeset | 37 | // println(page) | 
| 1 | 38 | |
| 250 
b79e704acb72
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
114diff
changeset | 39 | // regular expression for extracting rows: | 
| 
b79e704acb72
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
114diff
changeset | 40 | // - the usual . would exclude newlines, | 
| 
b79e704acb72
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
114diff
changeset | 41 | // - therefore we have to use [\S\s], which really | 
| 
b79e704acb72
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
114diff
changeset | 42 | // matches everything | 
| 
b79e704acb72
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
114diff
changeset | 43 | // - *? is the "lazy" version of * | 
| 
b79e704acb72
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
114diff
changeset | 44 | val regex1 = """<tr align="center">[\S\s]*?</tr>""".r | 
| 1 | 45 | val rows = regex1.findAllIn(page).toList | 
| 46 | ||
| 250 
b79e704acb72
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
114diff
changeset | 47 | // data in rows | 
| 
b79e704acb72
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
114diff
changeset | 48 | // println(rows) | 
| 1 | 49 | |
| 250 
b79e704acb72
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
114diff
changeset | 50 | // extracting row entries (date, PM_10, SO_2, NO_2) | 
| 
b79e704acb72
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
114diff
changeset | 51 | // the use of (..) allows us to extract the matched text | 
| 
b79e704acb72
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
114diff
changeset | 52 | val regex2 = """<td align="center">([\S\s]*?)</td>""".r | 
| 
b79e704acb72
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
114diff
changeset | 53 | |
| 
b79e704acb72
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
114diff
changeset | 54 | def extract(s: String) : List[String] = {
 | 
| 
b79e704acb72
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
114diff
changeset | 55 | for (regex2(value) <- regex2.findAllIn(s).toList) yield value.trim | 
| 1 | 56 | } | 
| 57 | ||
| 114 
735f7bbfae9b
added
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
103diff
changeset | 58 | //data completely extracted | 
| 250 
b79e704acb72
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
114diff
changeset | 59 | val data = rows.map(extract) | 
| 1 | 60 | |
| 114 
735f7bbfae9b
added
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
103diff
changeset | 61 | //for comparing two rows (lists of strings) by the integer value in column i, larger value first | 
| 250 
b79e704acb72
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
114diff
changeset | 62 | def compare(i: Int)(e: List[String], f: List[String]) = e(i).toInt > f(i).toInt | 
| 1 | 63 | |
| 250 
b79e704acb72
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
114diff
changeset | 64 | println("The day with the highest particle pollution (PM_10)")
 | 
| 
b79e704acb72
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
114diff
changeset | 65 | println(data.sortWith(compare(1)).head.mkString(","))
 | 
| 1 | 66 | |
| 250 
b79e704acb72
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
114diff
changeset | 67 | println("The day with the highest sulfur dioxide (SO_2)")
 | 
| 
b79e704acb72
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
114diff
changeset | 68 | println(data.sortWith(compare(2)).head.mkString(","))
 | 
| 114 
735f7bbfae9b
added
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
103diff
changeset | 69 | |
| 250 
b79e704acb72
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
114diff
changeset | 70 | println("The day with the highest nitro dioxide (NO_2)")
 | 
| 
b79e704acb72
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
114diff
changeset | 71 | println(data.sortWith(compare(3)).head.mkString(","))
 | 
| 1 | 72 | |
| 250 
b79e704acb72
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
114diff
changeset | 73 | println("The day(s) with the highest PM_10")
 | 
| 114 
735f7bbfae9b
added
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
103diff
changeset | 74 | val groups1 = data.groupBy(_(1).toInt) | 
| 
735f7bbfae9b
added
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
103diff
changeset | 75 | val max_key1 = groups1.keySet.max | 
| 
735f7bbfae9b
added
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
103diff
changeset | 76 | println(groups1(max_key1).map(_.mkString(",")).mkString("\n"))
 | 
| 1 | 77 | |
| 250 
b79e704acb72
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
114diff
changeset | 78 | println("The day(s) with the highest SO_2")
 | 
| 114 
735f7bbfae9b
added
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
103diff
changeset | 79 | val groups2 = data.groupBy(_(2).toInt) | 
| 
735f7bbfae9b
added
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
103diff
changeset | 80 | val max_key2 = groups2.keySet.max | 
| 
735f7bbfae9b
added
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
103diff
changeset | 81 | println(groups2(max_key2).map(_.mkString(",")).mkString("\n"))
 | 
| 
735f7bbfae9b
added
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
103diff
changeset | 82 | |
| 250 
b79e704acb72
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
114diff
changeset | 83 | println("The day(s) with the highest NO_2")
 | 
| 114 
735f7bbfae9b
added
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
103diff
changeset | 84 | val groups3 = data.groupBy(_(3).toInt) | 
| 
735f7bbfae9b
added
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
103diff
changeset | 85 | val max_key3 = groups3.keySet.max | 
| 
735f7bbfae9b
added
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
103diff
changeset | 86 | println(groups3(max_key3).map(_.mkString(",")).mkString("\n"))
 |