diff -r 1933e88cb73e -r 95ee5cc5c05d progs/crawler3.scala --- a/progs/crawler3.scala Fri Sep 27 11:01:31 2013 +0100 +++ b/progs/crawler3.scala Fri Sep 27 11:49:44 2013 +0100 @@ -1,20 +1,16 @@ -// This version of the crawler also -// harvests emails from webpages +// This is a version of the crawler that also +// "harvests" email addresses from webpages import io.Source import scala.util.matching.Regex import scala.util._ -// gets the first ~10K of a web-page def get_page(url: String) : String = { Try(Source.fromURL(url).take(10000).mkString) getOrElse { println(s" Problem with: $url"); ""} } -// staring URL for the crawler -val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/""" - -// regex for URLs +// regexes for URLs, for "my" domain and for email addresses val http_pattern = """\"https?://[^\"]*\"""".r val my_urls = """urbanc""".r val email_pattern = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r @@ -28,8 +24,6 @@ http_pattern.findAllIn(page).map(unquote).toSet } -// naive version - seraches until a given depth -// visits pages potentially more than once def crawl(url: String, n: Int) : Unit = { if (n == 0) () //else if (my_urls.findFirstIn(url) == None) () @@ -41,4 +35,7 @@ } } +// starting URL for the crawler +val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/""" + crawl(startURL, 3)