progs/scraper.scala
changeset 103 bea2dd1c7e73
parent 99 91145f6d9b0e
child 114 735f7bbfae9b
--- a/progs/scraper.scala	Thu Sep 26 10:41:47 2013 +0100
+++ b/progs/scraper.scala	Thu Sep 26 10:52:54 2013 +0100
@@ -1,10 +1,20 @@
+// A web scraper that extracts the daily Shanghai pollution
+// data from the web page
+//
+//   http://www.envir.gov.cn/eng/airep/index.asp
+//
+//
+// Important! They stopped providing this data in November
+// 2012, but kept the historical data since 2001. So the
+// requested dates must fall within that range.
+
 import java.io.OutputStreamWriter
 import java.net.URL
 import scala.io.Source.fromInputStream
 
 val url = new URL("http://www.envir.gov.cn/eng/airep/index.asp")
 
-//connect to url
+//connecting to url
 val conn = url.openConnection
 conn.setRequestProperty("User-Agent", "")
 conn.setDoOutput(true)
@@ -12,16 +22,19 @@
 
 //sending data
 val wr = new OutputStreamWriter(conn.getOutputStream())
-//wr.write("Fdate=2012-9-24&Tdate=2013-08-25")
+
+//possible date ranges
+wr.write("Fdate=2012-9-24&Tdate=2012-08-25")
 //wr.write("Fdate=2012-9-18&Tdate=2012-09-24")
-wr.write("Fdate=2001-5-18&Tdate=2012-09-25")
+//wr.write("Fdate=2001-5-18&Tdate=2012-09-25")
 wr.flush
 wr.close
 
 //receiving data
 val page = fromInputStream(conn.getInputStream).getLines.mkString("\n")
 
-println(page)
+//data encoded as a string, which you can see with
+//println(page)
 
 // regular expression . excludes newlines, 
 // therefore we have to use [\S\s]