progs/scraper.scala
changeset 103 bea2dd1c7e73
parent 99 91145f6d9b0e
child 114 735f7bbfae9b
equal deleted inserted replaced
102:1ab41c59e3d3 103:bea2dd1c7e73
       
     1 // A Web-Scraper that extracts the daily Shanghai pollution
       
     2 // data from the web-page
       
     3 //
       
     4 //   http://www.envir.gov.cn/eng/airep/index.asp
       
     5 //
       
     6 //
       
     7 // Important! They stopped providing this data in November
       
     8 // 2012, but kept the historical data since 2001. So dates
       
     9 // must be in that range.
       
    10 
     1 import java.io.OutputStreamWriter
    11 import java.io.OutputStreamWriter
     2 import java.net.URL
    12 import java.net.URL
     3 import scala.io.Source.fromInputStream
    13 import scala.io.Source.fromInputStream
     4 
    14 
     5 val url = new URL("http://www.envir.gov.cn/eng/airep/index.asp")
    15 val url = new URL("http://www.envir.gov.cn/eng/airep/index.asp")
     6 
    16 
     7 //connect to url
    17 //connecting to url
     8 val conn = url.openConnection
    18 val conn = url.openConnection
     9 conn.setRequestProperty("User-Agent", "")
    19 conn.setRequestProperty("User-Agent", "")
    10 conn.setDoOutput(true)
    20 conn.setDoOutput(true)
    11 conn.connect
    21 conn.connect
    12 
    22 
    13 //sending data
    23 //sending data
    14 val wr = new OutputStreamWriter(conn.getOutputStream())
    24 val wr = new OutputStreamWriter(conn.getOutputStream())
    15 //wr.write("Fdate=2012-9-24&Tdate=2013-08-25")
    25 
       
    26 //possible date ranges
       
    27 wr.write("Fdate=2012-9-24&Tdate=2012-08-25")
    16 //wr.write("Fdate=2012-9-18&Tdate=2012-09-24")
    28 //wr.write("Fdate=2012-9-18&Tdate=2012-09-24")
    17 wr.write("Fdate=2001-5-18&Tdate=2012-09-25")
    29 //wr.write("Fdate=2001-5-18&Tdate=2012-09-25")
    18 wr.flush
    30 wr.flush
    19 wr.close
    31 wr.close
    20 
    32 
    21 //receiving data
    33 //receiving data
    22 val page = fromInputStream(conn.getInputStream).getLines.mkString("\n")
    34 val page = fromInputStream(conn.getInputStream).getLines.mkString("\n")
    23 
    35 
    24 println(page)
    36 //data encoded as a string, which you can see with
       
    37 //println(page)
    25 
    38 
    26 // regular expression . excludes newlines, 
    39 // regular expression . excludes newlines, 
    27 // therefore we have to use [\S\s]
    40 // therefore we have to use [\S\s]
    28 val regex1 = """<tr align="center">[\S\s]*?</tr>""".r
    41 val regex1 = """<tr align="center">[\S\s]*?</tr>""".r
    29 val rows = regex1.findAllIn(page).toList
    42 val rows = regex1.findAllIn(page).toList