diff -r 1933e88cb73e -r 95ee5cc5c05d progs/crawler2.scala --- a/progs/crawler2.scala Fri Sep 27 11:01:31 2013 +0100 +++ b/progs/crawler2.scala Fri Sep 27 11:49:44 2013 +0100 @@ -5,16 +5,13 @@ import scala.util.matching.Regex import scala.util._ -// gets the first ~10K of a web-page +// gets the first 10K of a web-page def get_page(url: String) : String = { Try(Source.fromURL(url).take(10000).mkString) getOrElse { println(s" Problem with: $url"); ""} } -// staring URL for the crawler -val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/""" - -// regex for URLs +// regexes for URLs and "my" domain val http_pattern = """\"https?://[^\"]*\"""".r val my_urls = """urbanc""".r @@ -24,8 +21,6 @@ http_pattern.findAllIn(page).map(unquote).toSet } -// naive version - seraches until a given depth -// visits pages potentially more than once def crawl(url: String, n: Int) : Unit = { if (n == 0) () else if (my_urls.findFirstIn(url) == None) () @@ -35,8 +30,10 @@ } } -// can now deal with depth 3 -// start on command line +// starting URL for the crawler +val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/""" + +// can now deal with depth 3 and beyond crawl(startURL, 4) -crawl("""http://www.inf.kcl.ac.uk/staff/urbanc/bsc-projects-13.html""", 2) +