diff -r 10f02605a46a -r 35104ee14f87 progs/crawler1.scala --- a/progs/crawler1.scala Sun Sep 07 08:37:44 2014 +0100 +++ b/progs/crawler1.scala Sat Sep 13 04:30:25 2014 +0100 @@ -1,5 +1,5 @@ -// A crawler which checks whether there -// are problems with links in web-pages +// A crawler which checks whether there are +// dead links in web-pages import io.Source import scala.util.matching.Regex @@ -12,7 +12,7 @@ } // regex for URLs -val http_pattern = """\"https?://[^\"]*\"""".r +val http_pattern = """"https?://[^"]*"""".r // drops the first and last character from a string def unquote(s: String) = s.drop(1).dropRight(1) @@ -21,7 +21,7 @@ http_pattern.findAllIn(page).map(unquote).toSet } -// naive version - seraches until a given depth +// naive version of crawl - searches until a given depth, // visits pages potentially more than once def crawl(url: String, n: Int) : Unit = { if (n == 0) () @@ -31,9 +31,9 @@ } } -// staring URL for the crawler -val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/""" -//val startURL = """http://www.inf.kcl.ac.uk/staff/mml/""" +// some starting URLs for the crawler +val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc""" +//val startURL = """http://www.inf.kcl.ac.uk/staff/mcburney""" crawl(startURL, 2)