--- a/progs/scraper.scala Thu Sep 26 10:41:47 2013 +0100
+++ b/progs/scraper.scala Thu Sep 26 10:52:54 2013 +0100
@@ -1,10 +1,20 @@
+// A Web-Scraper that extracts the daily Shanghai pollution
+// data from the web-page
+//
+// http://www.envir.gov.cn/eng/airep/index.asp
+//
+//
+// Important! They stopped providing this data in November
+// 2012, but kept the historical data since 2001. So dates
+// must be in that range.
+
import java.io.OutputStreamWriter
import java.net.URL
import scala.io.Source.fromInputStream
val url = new URL("http://www.envir.gov.cn/eng/airep/index.asp")
-//connect to url
+//connecting to url
val conn = url.openConnection
conn.setRequestProperty("User-Agent", "")
conn.setDoOutput(true)
@@ -12,16 +22,19 @@
//sending data
val wr = new OutputStreamWriter(conn.getOutputStream())
-//wr.write("Fdate=2012-9-24&Tdate=2013-08-25")
+
+//possible date ranges
+wr.write("Fdate=2012-9-24&Tdate=2012-08-25")
//wr.write("Fdate=2012-9-18&Tdate=2012-09-24")
-wr.write("Fdate=2001-5-18&Tdate=2012-09-25")
+//wr.write("Fdate=2001-5-18&Tdate=2012-09-25")
wr.flush
wr.close
//receiving data
val page = fromInputStream(conn.getInputStream).getLines.mkString("\n")
-println(page)
+//data encoded as a string, which you can see with
+//println(page)
// regular expression . excludes newlines,
// therefore we have to use [\S\s]