progs/scraper.scala
changeset 250 b79e704acb72
parent 114 735f7bbfae9b
child 257 70c307641d05
equal deleted inserted replaced
249:377c59df7297 250:b79e704acb72
    12 import java.net.URL
    12 import java.net.URL
    13 import scala.io.Source.fromInputStream
    13 import scala.io.Source.fromInputStream
    14 
    14 
    15 val url = new URL("http://www.envir.gov.cn/eng/airep/index.asp")
    15 val url = new URL("http://www.envir.gov.cn/eng/airep/index.asp")
    16 
    16 
    17 //connecting to url
    17 // connecting to url
    18 val conn = url.openConnection
    18 val conn = url.openConnection
    19 conn.setRequestProperty("User-Agent", "")
       
    20 conn.setDoOutput(true)
    19 conn.setDoOutput(true)
    21 conn.connect
    20 conn.connect
    22 
    21 
    23 //sending data
    22 // sending data
    24 val wr = new OutputStreamWriter(conn.getOutputStream())
    23 val wr = new OutputStreamWriter(conn.getOutputStream())
    25 
    24 
    26 //possible date ranges
    25 //possible date ranges
    27 wr.write("Fdate=2012-8-24&Tdate=2012-09-25")
    26 wr.write("Fdate=2012-8-24&Tdate=2012-09-25")
    28 //wr.write("Fdate=2001-9-18&Tdate=2012-09-24")
    27 //wr.write("Fdate=2001-9-18&Tdate=2012-09-25")
    29 wr.flush
    28 wr.flush
    30 wr.close
    29 wr.close
    31 
    30 
    32 //receiving data
    31 // receiving data as page made of HTML
    33 val page = fromInputStream(conn.getInputStream).getLines.mkString("\n")
    32 val page = fromInputStream(conn.getInputStream).getLines.mkString("\n")
    34 
    33 
    35 //data encoded as an HTML-string, which you can see with
    34 // received data can be seen with
    36 //println(page)
    35 // println(page)
    37 
    36 
    38 // regular expression: excludes newlines, 
    37 // regular expression for extracting rows: 
    39 // therefore we have to use [\S\s]
    38 // - the usual . would exclude newlines, 
    40 val regex1 = """<tr align=\"center\">[\S\s]*?</tr>""".r
    39 // - therefore we have to use [\S\s], which really 
       
    40 //   matches everything
       
    41 // - *? is the "lazy" version of *
       
    42 val regex1 = """<tr align="center">[\S\s]*?</tr>""".r
    41 val rows = regex1.findAllIn(page).toList
    43 val rows = regex1.findAllIn(page).toList
    42 
    44 
    43 //print(rows)
    45 // data in rows
       
    46 // println(rows)
    44 
    47 
    45 val regex2 = """<td align=\"center\">([\S\s]*?)</td>""".r
    48 // extracting row entries (date, PM_10, SO_2, NO_2)
       
    49 // the use of (..) allows us to extract the matched text
       
    50 val regex2 = """<td align="center">([\S\s]*?)</td>""".r
    46 
    51 
    47 def aux(s: String) : Array[String] = {
    52 def extract(s: String) : List[String] = {
    48   for (m <- regex2.findAllIn(s).toArray) yield m match {
    53   for (regex2(value) <- regex2.findAllIn(s).toList) yield value.trim
    49     case regex2(value) => value.trim
       
    50   }
       
    51 }
    54 }
    52 
    55 
    53 //data completely extracted
    56 //data completely extracted
    54 val data = rows.map { aux }
    57 val data = rows.map(extract)
    55 
    58 
    56 //for comparing elements from an array
    59 //for comparing elements from an array
    57 def compare(i: Int)(e: Array[String], f: Array[String]) = e(i).toInt < f(i).toInt
    60 def compare(i: Int)(e: List[String], f: List[String]) = e(i).toInt > f(i).toInt
    58 
    61 
    59 println("The day with highest particle pollution (PM_10)")
    62 println("The day with the highest particle pollution (PM_10)")
    60 println(data.sortWith(compare(1)).last.mkString(","))
    63 println(data.sortWith(compare(1)).head.mkString(","))
    61 
    64 
    62 println("The day with highest sulfur dioxide (SO_2)")
    65 println("The day with the highest sulfur dioxide (SO_2)")
    63 println(data.sortWith(compare(2)).last.mkString(","))
    66 println(data.sortWith(compare(2)).head.mkString(","))
    64 
    67 
    65 println("The day with highest nitro dioxide (NO_2)")
    68 println("The day with the highest nitro dioxide (NO_2)")
    66 println(data.sortWith(compare(3)).last.mkString(","))
    69 println(data.sortWith(compare(3)).head.mkString(","))
    67 
    70 
    68 println("The day(s) with highest PM_10")
    71 println("The day(s) with the highest PM_10")
    69 val groups1 = data.groupBy(_(1).toInt)
    72 val groups1 = data.groupBy(_(1).toInt)
    70 val max_key1 = groups1.keySet.max
    73 val max_key1 = groups1.keySet.max
    71 println(groups1(max_key1).map(_.mkString(",")).mkString("\n"))
    74 println(groups1(max_key1).map(_.mkString(",")).mkString("\n"))
    72 
    75 
    73 println("The day(s) with highest SO_2")
    76 println("The day(s) with the highest SO_2")
    74 val groups2 = data.groupBy(_(2).toInt)
    77 val groups2 = data.groupBy(_(2).toInt)
    75 val max_key2 = groups2.keySet.max
    78 val max_key2 = groups2.keySet.max
    76 println(groups2(max_key2).map(_.mkString(",")).mkString("\n"))
    79 println(groups2(max_key2).map(_.mkString(",")).mkString("\n"))
    77 
    80 
    78 println("The day(s) with highest NO_2")
    81 println("The day(s) with the highest NO_2")
    79 val groups3 = data.groupBy(_(3).toInt)
    82 val groups3 = data.groupBy(_(3).toInt)
    80 val max_key3 = groups3.keySet.max
    83 val max_key3 = groups3.keySet.max
    81 println(groups3(max_key3).map(_.mkString(",")).mkString("\n"))
    84 println(groups3(max_key3).map(_.mkString(",")).mkString("\n"))