# HG changeset patch
# User Christian Urban
# Date 1380189174 -3600
# Node ID bea2dd1c7e73f7fe52c14e4bb8e04db550e8457d
# Parent  1ab41c59e3d368cc243d7155999b5de52cbee25b
links

diff -r 1ab41c59e3d3 -r bea2dd1c7e73 progs/scraper.scala
--- a/progs/scraper.scala	Thu Sep 26 10:41:47 2013 +0100
+++ b/progs/scraper.scala	Thu Sep 26 10:52:54 2013 +0100
@@ -1,10 +1,20 @@
+// A web scraper that extracts the daily Shanghai pollution
+// data from the web page
+//
+// http://www.envir.gov.cn/eng/airep/index.asp
+//
+//
+// Important! They stopped providing this data in November
+// 2012, but kept the historical data since 2001. So dates
+// must be in that range.
+
 import java.io.OutputStreamWriter
 import java.net.URL
 import scala.io.Source.fromInputStream
 
 val url = new URL("http://www.envir.gov.cn/eng/airep/index.asp")
 
-//connect to url
+//connecting to the url
 val conn = url.openConnection
 conn.setRequestProperty("User-Agent", "")
 conn.setDoOutput(true)
@@ -12,16 +22,19 @@
 
 //sending data
 val wr = new OutputStreamWriter(conn.getOutputStream())
-//wr.write("Fdate=2012-9-24&Tdate=2013-08-25")
+
+//possible date ranges
+wr.write("Fdate=2012-9-24&Tdate=2012-08-25")
 //wr.write("Fdate=2012-9-18&Tdate=2012-09-24")
-wr.write("Fdate=2001-5-18&Tdate=2012-09-25")
+//wr.write("Fdate=2001-5-18&Tdate=2012-09-25")
 wr.flush
 wr.close
 
 //receiving data
 val page = fromInputStream(conn.getInputStream).getLines.mkString("\n")
 
-println(page)
+//data encoded as a string, which you can see with
+//println(page)
 
 // regular expression . excludes newlines,
 // therefore we have to use [\S\s]
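
Note (not part of the patch): the last two context lines explain why the scraper's regular expressions use [\S\s] rather than `.` — in Java/Scala regexes the dot does not match newline characters, while the downloaded page is a multi-line string. The sketch below illustrates that point; the HTML snippet and the pattern are hypothetical, since the patch does not show the actual extraction code.

// Sketch only: matching across newlines with [\S\s].
// The sample HTML and the pattern are made up for illustration.

val sample = """<td>2012-9-24</td>
               |<td>65</td>""".stripMargin

// `.` does not match '\n', so a dot-based pattern could not span the
// line break between the two cells; [\S\s] matches any character
val row = """<td>([\S\s]*?)</td>[\S\s]*?<td>([\S\s]*?)</td>""".r

for (m <- row.findAllMatchIn(sample))
  println(s"date: ${m.group(1)}, value: ${m.group(2)}")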