scraper.scala
author Christian Urban <urbanc@in.tum.de>
Wed, 26 Sep 2012 02:08:55 +0100
changeset 1 b606c9439fa6
child 7 73cf4406b773
permissions -rw-r--r--
new version
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
1
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
     1
import java.io.OutputStreamWriter
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
     2
import java.net.URL
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
     3
import scala.io.Source.fromInputStream
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
     4
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
     5
// Address the query is posted to.
val url = new URL("http://www.envir.gov.cn/eng/airep/index.asp")

// Open a connection to the address, set an empty User-Agent header and
// switch on output so that a request body can be written, then connect.
val conn = url.openConnection()
conn.setRequestProperty("User-Agent", "")
conn.setDoOutput(true)
conn.connect()
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    12
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    13
// Send the form data as the request body.
// The two narrower date ranges tried earlier are kept for reference.
//wr.write("Fdate=2012-9-24&Tdate=2012-09-25")
//wr.write("Fdate=2012-9-18&Tdate=2012-09-25")
val wr = new OutputStreamWriter(conn.getOutputStream())
try {
  wr.write("Fdate=2001-5-18&Tdate=2012-09-25")
  wr.flush()
} finally {
  // Close the writer even when writing fails, so the stream is not leaked.
  wr.close()
}
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    20
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    21
// Read the whole response into a single string, with the lines joined
// by "\n". The Source is closed in a finally clause so the response
// stream is released even if reading fails part-way through.
val page = {
  val response = fromInputStream(conn.getInputStream)
  try {
    response.getLines().mkString("\n")
  } finally {
    response.close()
  }
}

println(page)
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    25
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    26
// A row is everything from <tr align="center"> up to the nearest </tr>.
// "." does not match line breaks, so the pattern uses [\S\s] to let a
// row span several lines.
val regex1 = """<tr align="center">[\S\s]*?</tr>""".r
val rows = (for (row <- regex1.findAllIn(page)) yield row).toList

print(rows)
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    32
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    33
// A cell is everything between <td align="center"> and the nearest </td>;
// [\S\s] lets the captured content run across line breaks.
val regex2 = """<td align="center">([\S\s]*?)</td>""".r

/** Extracts the contents of every `<td align="center">` cell found in `s`.
  *
  * @param s text to scan (the script passes one table row at a time)
  * @return the captured cell contents, trimmed of surrounding whitespace,
  *         in the order they occur; an empty array when nothing matches
  */
def aux(s: String) : Array[String] = {
  // Read capture group 1 straight from each match instead of converting
  // the match to a string and running the pattern over it a second time.
  regex2.findAllMatchIn(s).map(_.group(1).trim).toArray
}
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    40
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    41
// One array of trimmed cell values per matched table row.
val data = for (row <- rows) yield aux(row)
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    42
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    43
/** Less-than test on one column of two rows, curried so that `compare(i)`
  * can be passed where a two-argument predicate is expected.
  *
  * @param i index of the column to compare
  * @param e left-hand row
  * @param f right-hand row
  * @return true when column `i` of `e`, converted with `toInt`, is strictly
  *         smaller than column `i` of `f`
  */
def compare(i: Int)(e: Array[String], f: Array[String]) = {
  val lhs = e(i).toInt
  val rhs = f(i).toInt
  lhs < rhs
}
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    44
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    45
// Day with the highest particle pollution (PM_10): order the rows
// ascending on column 1 and take the final one.
data.sortWith((e, f) => compare(1)(e, f)).last

// Day with the highest sulfur dioxide (SO_2) value, using column 2.
data.sortWith((e, f) => compare(2)(e, f)).last

// Day with the highest nitrogen dioxide (NO_2) value, using column 3.
data.sortWith((e, f) => compare(3)(e, f)).last
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    53
b606c9439fa6 new version
Christian Urban <urbanc@in.tum.de>
parents:
diff changeset
    54
// All days that share the highest PM_10 value: bucket the rows by
// column 1 read as an Int, then look up the bucket with the largest key.
val groups = data.groupBy(row => row(1).toInt)
val max_key = groups.keys.max
groups(max_key)