scraper.scala
changeset 92 e85600529ca5
parent 91 47f86885d481
child 93 4794759139ea
equal deleted inserted replaced
91:47f86885d481 92:e85600529ca5
     1 import java.io.OutputStreamWriter
       
     2 import java.net.URL
       
     3 import scala.io.Source.fromInputStream
       
     4 
       
     5 val url = new URL("http://www.envir.gov.cn/eng/airep/index.asp")
       
     6 
       
     7 //connect to url
       
     8 val conn = url.openConnection
       
     9 conn.setRequestProperty("User-Agent", "")
       
    10 conn.setDoOutput(true)
       
    11 conn.connect
       
    12 
       
    13 //sending data
       
    14 val wr = new OutputStreamWriter(conn.getOutputStream())
       
    15 //wr.write("Fdate=2012-9-24&Tdate=2012-09-25")
       
    16 //wr.write("Fdate=2012-9-18&Tdate=2012-09-25")
       
    17 wr.write("Fdate=2001-5-18&Tdate=2012-09-25")
       
    18 wr.flush
       
    19 wr.close
       
    20 
       
    21 //receiving data
       
    22 val page = fromInputStream(conn.getInputStream).getLines.mkString("\n")
       
    23 
       
    24 //println(page)
       
    25 
       
    26 // regular expression . excludes newlines, 
       
    27 // therefore we have to use [\S\s]
       
    28 val regex1 = """<tr align="center">[\S\s]*?</tr>""".r
       
    29 val rows = regex1.findAllIn(page).toList
       
    30 
       
    31 //print(rows)
       
    32 
       
    33 val regex2 = """<td align="center">([\S\s]*?)</td>""".r
       
    34 
       
    35 def aux(s: String) : Array[String] = {
       
    36   for (m <- regex2.findAllIn(s).toArray) yield m match {
       
    37     case regex2(value) => value.trim
       
    38   }
       
    39 }
       
    40 
       
    41 val data = rows.map { aux }
       
    42 
       
    43 def compare(i: Int)(e: Array[String], f: Array[String]) = e(i).toInt < f(i).toInt
       
    44 
       
    45 //day with highest particle pollution (PM_10)
       
    46 data.sortWith(compare(1)).last
       
    47 
       
    48 //day with highest sulfur dioxide (SO_2)
       
    49 data.sortWith(compare(2)).last
       
    50 
       
    51 //day with highest nitro dioxide (NO_2)
       
    52 data.sortWith(compare(3)).last
       
    53 
       
    54 //days with highest PM_10
       
    55 val groups = data.groupBy(_(1).toInt)
       
    56 val max_key = groups.keySet.max
       
    57 groups(max_key)