diff -r 91145f6d9b0e -r cbc2270c2938 progs/crawler2.scala --- a/progs/crawler2.scala Wed Sep 25 20:35:54 2013 +0100 +++ b/progs/crawler2.scala Thu Sep 26 10:36:24 2013 +0100 @@ -1,11 +1,15 @@ +// This version of the crawler only +// checks links in the "domain" urbanc + import io.Source import scala.util.matching.Regex import scala.util._ // gets the first ~10K of a page -def get_page(url: String) : String = +def get_page(url: String) : String = { Try(Source.fromURL(url).take(10000).mkString) getOrElse { println(s" Problem with: $url"); ""} +} // staring URL for the crawler val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/""" @@ -24,7 +28,7 @@ // visits pages potentially more than once def crawl(url: String, n: Int) : Unit = { if (n == 0) () - else if (my_urls.findFirstIn(url) == None) () + //else if (my_urls.findFirstIn(url) == None) () else { println(s"Visiting: $n $url") for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1) @@ -35,3 +39,4 @@ // start on command line crawl(startURL, 4) +crawl("""http://www.inf.kcl.ac.uk/staff/urbanc/bsc-projects-13.html""", 2)