progs/scraper.scala
changeset 114 735f7bbfae9b
parent 103 bea2dd1c7e73
child 250 b79e704acb72
equal deleted inserted replaced
113:db6862f6bf6c 114:735f7bbfae9b
     2 // data from the web-page
     2 // data from the web-page
     3 //
     3 //
     4 //   http://www.envir.gov.cn/eng/airep/index.asp
     4 //   http://www.envir.gov.cn/eng/airep/index.asp
     5 //
     5 //
     6 //
     6 //
     7 // Important! They stoped providing this data in November
     7 // Important! They stopped providing this data in November
     8 // 2012, but kept the historical data since 2001. So dates
     8 // 2012, but kept the historical data since 2001. So dates
     9 // must be in that range.
     9 // must be in that range.
    10 
    10 
    11 import java.io.OutputStreamWriter
    11 import java.io.OutputStreamWriter
    12 import java.net.URL
    12 import java.net.URL
    22 
    22 
    23 //sending data
    23 //sending data
    24 val wr = new OutputStreamWriter(conn.getOutputStream())
    24 val wr = new OutputStreamWriter(conn.getOutputStream())
    25 
    25 
    26 //possible date ranges
    26 //possible date ranges
    27 wr.write("Fdate=2012-9-24&Tdate=2012-08-25")
    27 wr.write("Fdate=2012-8-24&Tdate=2012-09-25")
    28 //wr.write("Fdate=2012-9-18&Tdate=2012-09-24")
    28 //wr.write("Fdate=2001-9-18&Tdate=2012-09-24")
    29 //wr.write("Fdate=2001-5-18&Tdate=2012-09-25")
       
    30 wr.flush
    29 wr.flush
    31 wr.close
    30 wr.close
    32 
    31 
    33 //receiving data
    32 //receiving data
    34 val page = fromInputStream(conn.getInputStream).getLines.mkString("\n")
    33 val page = fromInputStream(conn.getInputStream).getLines.mkString("\n")
    35 
    34 
    36 //data encoded as a string, which you can see with
    35 //data encoded as an HTML-string, which you can see with
    37 //println(page)
    36 //println(page)
    38 
    37 
    39 // regular expression . excludes newlines, 
     38 // in a regular expression, . excludes newlines, 
    40 // therefore we have to use [\S\s]
    39 // therefore we have to use [\S\s]
    41 val regex1 = """<tr align="center">[\S\s]*?</tr>""".r
    40 val regex1 = """<tr align=\"center\">[\S\s]*?</tr>""".r
    42 val rows = regex1.findAllIn(page).toList
    41 val rows = regex1.findAllIn(page).toList
    43 
    42 
    44 //print(rows)
    43 //print(rows)
    45 
    44 
    46 val regex2 = """<td align="center">([\S\s]*?)</td>""".r
    45 val regex2 = """<td align=\"center\">([\S\s]*?)</td>""".r
    47 
    46 
    48 def aux(s: String) : Array[String] = {
    47 def aux(s: String) : Array[String] = {
    49   for (m <- regex2.findAllIn(s).toArray) yield m match {
    48   for (m <- regex2.findAllIn(s).toArray) yield m match {
    50     case regex2(value) => value.trim
    49     case regex2(value) => value.trim
    51   }
    50   }
    52 }
    51 }
    53 
    52 
       
    53 //data completely extracted
    54 val data = rows.map { aux }
    54 val data = rows.map { aux }
    55 
    55 
       
    56 //for comparing elements from an array
    56 def compare(i: Int)(e: Array[String], f: Array[String]) = e(i).toInt < f(i).toInt
    57 def compare(i: Int)(e: Array[String], f: Array[String]) = e(i).toInt < f(i).toInt
    57 
    58 
    58 //day with highest particle pollution (PM_10)
    59 println("The day with highest particle pollution (PM_10)")
    59 println(data.sortWith(compare(1)).last)
    60 println(data.sortWith(compare(1)).last.mkString(","))
    60 
    61 
    61 //day with highest sulfur dioxide (SO_2)
    62 println("The day with highest sulfur dioxide (SO_2)")
    62 println(data.sortWith(compare(2)).last)
    63 println(data.sortWith(compare(2)).last.mkString(","))
    63 
    64 
    64 //day with highest nitro dioxide (NO_2)
    65 println("The day with highest nitro dioxide (NO_2)")
    65 println(data.sortWith(compare(3)).last)
    66 println(data.sortWith(compare(3)).last.mkString(","))
    66 
    67 
    67 //days with highest PM_10
    68 println("The day(s) with highest PM_10")
    68 val groups = data.groupBy(_(1).toInt)
    69 val groups1 = data.groupBy(_(1).toInt)
    69 val max_key = groups.keySet.max
    70 val max_key1 = groups1.keySet.max
    70 println(groups(max_key))
    71 println(groups1(max_key1).map(_.mkString(",")).mkString("\n"))
       
    72 
       
    73 println("The day(s) with highest SO_2")
       
    74 val groups2 = data.groupBy(_(2).toInt)
       
    75 val max_key2 = groups2.keySet.max
       
    76 println(groups2(max_key2).map(_.mkString(",")).mkString("\n"))
       
    77 
       
    78 println("The day(s) with highest NO_2")
       
    79 val groups3 = data.groupBy(_(3).toInt)
       
    80 val max_key3 = groups3.keySet.max
       
    81 println(groups3(max_key3).map(_.mkString(",")).mkString("\n"))