afl-material: scraper.scala@3840d09e4271 (annotated)

1 b606c9439fa6 new version Christian Urban <urbanc@in.tum.de> parents: diff changeset	1	import java.io.OutputStreamWriter
b606c9439fa6 new version Christian Urban <urbanc@in.tum.de> parents: diff changeset	2	import java.net.URL
b606c9439fa6 new version Christian Urban <urbanc@in.tum.de> parents: diff changeset	3	import scala.io.Source.fromInputStream
b606c9439fa6 new version Christian Urban <urbanc@in.tum.de> parents: diff changeset	4
// Address of the air-report page that is queried below.
val url = new URL("http://www.envir.gov.cn/eng/airep/index.asp")

// Connect to the url. Output must be enabled before connecting so that
// a request body can be written afterwards.
val conn = url.openConnection
conn.setRequestProperty("User-Agent", "")
conn.setDoOutput(true)
conn.connect()

// Sending data: the form fields select the date range of the report
// (presumably Fdate = from-date, Tdate = to-date -- confirm against the site).
// The writer is closed in a finally-block so it is released even if
// writing or flushing fails.
val wr = new OutputStreamWriter(conn.getOutputStream())
try {
  //wr.write("Fdate=2012-9-24&Tdate=2012-09-25")
  //wr.write("Fdate=2012-9-18&Tdate=2012-09-25")
  wr.write("Fdate=2001-5-18&Tdate=2012-09-25")
  wr.flush()
} finally {
  wr.close()
}

// Receiving data: read the whole response into one newline-separated string.
// The Source (and with it the underlying input stream) is closed once the
// text has been read, instead of being left open.
val page = {
  val response = fromInputStream(conn.getInputStream)
  try response.getLines().mkString("\n")
  finally response.close()
}
b606c9439fa6 new version Christian Urban <urbanc@in.tum.de> parents: diff changeset	23
7 73cf4406b773 updated Christian Urban <urbanc@in.tum.de> parents: 1 diff changeset	24	//println(page)
1 b606c9439fa6 new version Christian Urban <urbanc@in.tum.de> parents: diff changeset	25
// In a regular expression `.` does not match newlines, so [\S\s] is used
// instead to let a match extend across line breaks.
val regex1 = """<tr align="center">[\S\s]*?</tr>""".r

// Every centred table row found in the page, as raw text.
val rows = regex1.findAllMatchIn(page).map(_.matched).toList
b606c9439fa6 new version Christian Urban <urbanc@in.tum.de> parents: diff changeset	30
7 73cf4406b773 updated Christian Urban <urbanc@in.tum.de> parents: 1 diff changeset	31	//print(rows)
1 b606c9439fa6 new version Christian Urban <urbanc@in.tum.de> parents: diff changeset	32
// Matches one centred table cell; group 1 captures the cell content
// ([\S\s] is used so the content may contain line breaks).
val regex2 = """<td align="center">([\S\s]*?)</td>""".r

/** Extracts the contents of all centred table cells in `s`.
  *
  * @param s text to search, e.g. the text of one table row
  * @return the content of every cell matched by `regex2`, in order of
  *         appearance, with surrounding whitespace trimmed; empty if
  *         no cell is found
  */
def aux(s: String) : Array[String] =
  regex2.findAllMatchIn(s).map(cell => cell.group(1).trim).toArray
b606c9439fa6 new version Christian Urban <urbanc@in.tum.de> parents: diff changeset	40
// One array of trimmed cell contents per extracted table row.
val data = rows.map(row => aux(row))
b606c9439fa6 new version Christian Urban <urbanc@in.tum.de> parents: diff changeset	42
/** Ordering predicate on table rows, usable with `sortWith`.
  *
  * Both rows are read at column `i` and the cell texts are converted to
  * integers; `toInt` throws a NumberFormatException for non-numeric text.
  *
  * @param i index of the column to compare
  * @param e first row
  * @param f second row
  * @return true if the integer in column `i` of `e` is strictly smaller
  *         than the one in column `i` of `f`
  */
def compare(i: Int)(e: Array[String], f: Array[String]) = {
  val left = e(i).toInt
  val right = f(i).toInt
  left < right
}
b606c9439fa6 new version Christian Urban <urbanc@in.tum.de> parents: diff changeset	44
// Sorts the rows ascending by the integer in column `col` and returns
// the last one, i.e. a row with the largest value in that column.
def highestIn(col: Int) = data.sortWith(compare(col)).last

// day with highest particle pollution (PM_10)
highestIn(1)

// day with highest sulfur dioxide (SO_2)
highestIn(2)

// day with highest nitrogen dioxide (NO_2)
highestIn(3)
b606c9439fa6 new version Christian Urban <urbanc@in.tum.de> parents: diff changeset	53
// days with highest PM_10:
// bucket the rows by their integer PM_10 value (column 1), pick the
// largest value, and look up all rows that share it.
val groups = data.groupBy(row => row(1).toInt)
val max_key = groups.keys.max
groups(max_key)

author	Christian Urban <urbanc@in.tum.de>
	Wed, 24 Oct 2012 03:40:50 +0100
changeset 44	3840d09e4271
parent 7	73cf4406b773
permissions	-rw-r--r--