updated
author Christian Urban <christian dot urban at kcl dot ac dot uk>
Mon, 15 Sep 2014 07:25:17 +0100
changeset 250 b79e704acb72
parent 249 377c59df7297
child 251 5b5a68df6d16
updated
handouts/ho01.pdf
handouts/ho01.tex
progs/scraper.scala
Binary file handouts/ho01.pdf has changed
--- a/handouts/ho01.tex	Mon Sep 15 04:54:01 2014 +0100
+++ b/handouts/ho01.tex	Mon Sep 15 07:25:17 2014 +0100
@@ -101,14 +101,19 @@
 brackets\\
 \pcode{[^...]} & matches any single character not inside the 
 brackets\\
-\pcode{..-..} & character ranges\\
-\pcode{\\d} &	matches digits; equivalent to \pcode{[0-9]}
+\pcode{...-...} & character ranges\\
+\pcode{\\d} & matches digits; equivalent to \pcode{[0-9]}\\
+\pcode{.} & matches every character except newline\\
+\pcode{(re)}	& groups regular expressions and remembers 
+matched text
 \end{tabular}
 \end{center}
 
 \noindent With this table you can figure out the purpose of
 the regular expressions in the web-crawlers shown in Figures
-\ref{crawler1}, \ref{crawler2} and \ref{crawler3}. Note,
+\ref{crawler1}, \ref{crawler2} and
+\ref{crawler3}.\footnote{There is an interesting twist in the
+web-scraper where \pcode{re*?} is used instead of \pcode{re*}.} Note,
 however, the regular expression for http-addresses in
 web-pages is meant to be
 
--- a/progs/scraper.scala	Mon Sep 15 04:54:01 2014 +0100
+++ b/progs/scraper.scala	Mon Sep 15 07:25:17 2014 +0100
@@ -14,68 +14,71 @@
 
 val url = new URL("http://www.envir.gov.cn/eng/airep/index.asp")
 
-//connecting to url
+// connecting to url
 val conn = url.openConnection
-conn.setRequestProperty("User-Agent", "")
 conn.setDoOutput(true)
 conn.connect
 
-//sending data
+// sending data
 val wr = new OutputStreamWriter(conn.getOutputStream())
 
 //possible date ranges
 wr.write("Fdate=2012-8-24&Tdate=2012-09-25")
-//wr.write("Fdate=2001-9-18&Tdate=2012-09-24")
+//wr.write("Fdate=2001-9-18&Tdate=2012-09-25")
 wr.flush
 wr.close
 
-//receiving data
+// receiving data as page made of HTML
 val page = fromInputStream(conn.getInputStream).getLines.mkString("\n")
 
-//data encoded as an HTML-string, which you can see with
-//println(page)
+// received data can be seen with
+// println(page)
 
-// regular expression: excludes newlines, 
-// therefore we have to use [\S\s]
-val regex1 = """<tr align=\"center\">[\S\s]*?</tr>""".r
+// regular expression for extracting rows: 
+// - the usual . would exclude newlines, 
+// - therefore we have to use [\S\s], which really 
+//   matches everything
+// - *? is the "lazy" version of *
+val regex1 = """<tr align="center">[\S\s]*?</tr>""".r
 val rows = regex1.findAllIn(page).toList
 
-//print(rows)
-
-val regex2 = """<td align=\"center\">([\S\s]*?)</td>""".r
+// data in rows
+// println(rows)
 
-def aux(s: String) : Array[String] = {
-  for (m <- regex2.findAllIn(s).toArray) yield m match {
-    case regex2(value) => value.trim
-  }
+// extracting row entries (date, PM_10, SO_2, NO_2)
+// the use of (..) allows us to extract the matched text
+val regex2 = """<td align="center">([\S\s]*?)</td>""".r
+
+def extract(s: String) : List[String] = {
+  for (regex2(value) <- regex2.findAllIn(s).toList) yield value.trim
 }
 
 //data completely extracted
-val data = rows.map { aux }
+val data = rows.map(extract)
 
 //for comparing elements from a list
-def compare(i: Int)(e: Array[String], f: Array[String]) = e(i).toInt < f(i).toInt
+def compare(i: Int)(e: List[String], f: List[String]) = e(i).toInt > f(i).toInt
 
-println("The day with highest particle pollution (PM_10)")
-println(data.sortWith(compare(1)).last.mkString(","))
+println("The day with the highest particle pollution (PM_10)")
+println(data.sortWith(compare(1)).head.mkString(","))
 
-println("The day with highest sulfur dioxide (SO_2)")
-println(data.sortWith(compare(2)).last.mkString(","))
+println("The day with the highest sulfur dioxide (SO_2)")
+println(data.sortWith(compare(2)).head.mkString(","))
 
-println("The day with highest nitro dioxide (NO_2)")
-println(data.sortWith(compare(3)).last.mkString(","))
+println("The day with the highest nitro dioxide (NO_2)")
+println(data.sortWith(compare(3)).head.mkString(","))
 
-println("The day(s) with highest PM_10")
+println("The day(s) with the highest PM_10")
 val groups1 = data.groupBy(_(1).toInt)
 val max_key1 = groups1.keySet.max
 println(groups1(max_key1).map(_.mkString(",")).mkString("\n"))
 
-println("The day(s) with highest SO_2")
+println("The day(s) with the highest SO_2")
 val groups2 = data.groupBy(_(2).toInt)
 val max_key2 = groups2.keySet.max
 println(groups2(max_key2).map(_.mkString(",")).mkString("\n"))
 
-println("The day(s) with highest NO_2")
+println("The day(s) with the highest NO_2")
 val groups3 = data.groupBy(_(3).toInt)
 val max_key3 = groups3.keySet.max
 println(groups3(max_key3).map(_.mkString(",")).mkString("\n"))