progs/scraper.scala
author Christian Urban <urbanc@in.tum.de>
Wed, 24 Oct 2018 16:03:07 +0100
changeset 586 451a95e1bc25
parent 258 1e4da6d2490c
permissions -rw-r--r--
typos
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
103
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 99
diff changeset
     1
// A Web-Scraper that extracts the daily Shanghai pollution
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 99
diff changeset
     2
// data from the web-page
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 99
diff changeset
     3
//
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 99
diff changeset
     4
//   http://www.envir.gov.cn/eng/airep/index.asp
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 99
diff changeset
     5
//
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 99
diff changeset
     6
//
114
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 103
diff changeset
     7
// Important! They stopped providing this data in November
103
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 99
diff changeset
     8
// 2012, but kept the historical data since 2001. So dates
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 99
diff changeset
     9
// must be in that range.
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 99
diff changeset
    10
1
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    11
import java.io.OutputStreamWriter
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    12
import java.net.URL
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    13
import scala.io.Source.fromInputStream
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    14
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    15
val url = new URL("http://www.envir.gov.cn/eng/airep/index.asp")
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    16
250
b79e704acb72 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 114
diff changeset
    17
// connecting to url
1
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    18
val conn = url.openConnection
257
70c307641d05 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 250
diff changeset
    19
conn.setRequestProperty("User-Agent", "")
1
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    20
conn.setDoOutput(true)
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    21
conn.connect
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    22
250
b79e704acb72 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 114
diff changeset
    23
// sending data
1
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    24
val wr = new OutputStreamWriter(conn.getOutputStream())
103
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 99
diff changeset
    25
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 99
diff changeset
    26
//possible date ranges
258
1e4da6d2490c updated programs
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 257
diff changeset
    27
wr.write("Fdate=2011-6-24&Tdate=2011-09-25")
257
70c307641d05 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 250
diff changeset
    28
//wr.write("Fdate=2011-8-24&Tdate=2011-09-25")
250
b79e704acb72 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 114
diff changeset
    29
//wr.write("Fdate=2001-9-18&Tdate=2012-09-25")
1
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    30
wr.flush
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    31
wr.close
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    32
250
b79e704acb72 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 114
diff changeset
    33
// receiving data as page made of HTML
1
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    34
// Read the whole response body into a single string, joining the lines
// with "\n". The Source is closed in a finally block so the underlying
// response stream is released even if reading fails part-way through.
val page = {
  val src = fromInputStream(conn.getInputStream)
  try src.getLines.mkString("\n") finally src.close()
}
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    35
250
b79e704acb72 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 114
diff changeset
    36
// received data can be seen with
b79e704acb72 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 114
diff changeset
    37
// println(page)
1
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    38
250
b79e704acb72 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 114
diff changeset
    39
// regular expression for extracting rows: 
b79e704acb72 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 114
diff changeset
    40
// - the usual . would exclude newlines, 
b79e704acb72 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 114
diff changeset
    41
// - therefore we have to use [\S\s], which really 
b79e704acb72 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 114
diff changeset
    42
//   matches everything
b79e704acb72 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 114
diff changeset
    43
// - *? is the "lazy" version of *
b79e704acb72 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 114
diff changeset
    44
val regex1 = """<tr align="center">[\S\s]*?</tr>""".r
1
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    45
val rows = regex1.findAllIn(page).toList
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    46
250
b79e704acb72 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 114
diff changeset
    47
// data in rows
b79e704acb72 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 114
diff changeset
    48
// println(rows)
1
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    49
250
b79e704acb72 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 114
diff changeset
    50
// extracting row entries (date, PM_10, SO_2, NO_2)
b79e704acb72 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 114
diff changeset
    51
// the use of (..) allows us to extract the matched text
b79e704acb72 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 114
diff changeset
    52
// one table cell; group 1 is the (untrimmed) text between the tags
val regex2 = """<td align="center">([\S\s]*?)</td>""".r

// Returns the trimmed text of every <td align="center"> cell found in s,
// in the order the cells appear; yields Nil when s contains no such cell.
def extract(s: String) : List[String] = {
  // findAllMatchIn hands back the capture group directly, so every cell is
  // matched once instead of being found and then re-matched by a pattern
  regex2.findAllMatchIn(s).map(_.group(1).trim).toList
}
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    57
114
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 103
diff changeset
    58
//data completely extracted
250
b79e704acb72 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 114
diff changeset
    59
val data = rows.map(extract)
1
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    60
114
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 103
diff changeset
    61
// for comparing the i-th entries of two rows (lists of strings) as integers
250
b79e704acb72 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 114
diff changeset
    62
def compare(i: Int)(e: List[String], f: List[String]) = {
  // parse column i of both rows; the strict > means that sortWith
  // puts the rows with the largest value first (descending order)
  val lhs = e(i).toInt
  val rhs = f(i).toInt
  lhs > rhs
}
1
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    63
250
b79e704acb72 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 114
diff changeset
    64
println("The day with the highest particle pollution (PM_10)")
b79e704acb72 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 114
diff changeset
    65
println(data.sortWith(compare(1)).head.mkString(","))
1
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    66
250
b79e704acb72 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 114
diff changeset
    67
println("The day with the highest sulfur dioxide (SO_2)")
b79e704acb72 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 114
diff changeset
    68
println(data.sortWith(compare(2)).head.mkString(","))
114
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 103
diff changeset
    69
250
b79e704acb72 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 114
diff changeset
    70
// NO_2 is nitrogen dioxide; the label previously read "nitro dioxide"
println("The day with the highest nitrogen dioxide (NO_2)")
b79e704acb72 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 114
diff changeset
    71
println(data.sortWith(compare(3)).head.mkString(","))
1
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    72
250
b79e704acb72 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 114
diff changeset
    73
println("The day(s) with the highest PM_10")
114
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 103
diff changeset
    74
val groups1 = data.groupBy(_(1).toInt)
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 103
diff changeset
    75
val max_key1 = groups1.keySet.max
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 103
diff changeset
    76
println(groups1(max_key1).map(_.mkString(",")).mkString("\n"))
1
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    77
250
b79e704acb72 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 114
diff changeset
    78
println("The day(s) with the highest SO_2")
114
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 103
diff changeset
    79
val groups2 = data.groupBy(_(2).toInt)
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 103
diff changeset
    80
val max_key2 = groups2.keySet.max
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 103
diff changeset
    81
println(groups2(max_key2).map(_.mkString(",")).mkString("\n"))
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 103
diff changeset
    82
250
b79e704acb72 updated
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 114
diff changeset
    83
println("The day(s) with the highest NO_2")
114
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 103
diff changeset
    84
val groups3 = data.groupBy(_(3).toInt)
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 103
diff changeset
    85
val max_key3 = groups3.keySet.max
Christian Urban <christian dot urban at kcl dot ac dot uk>
parents: 103
diff changeset
    86
println(groups3(max_key3).map(_.mkString(",")).mkString("\n"))