diff -r e3c64f22dd31 -r 14914b57e207 progs/crawler1.scala --- a/progs/crawler1.scala Thu Apr 16 19:15:46 2020 +0100 +++ b/progs/crawler1.scala Wed May 06 15:37:31 2020 +0100 @@ -10,18 +10,20 @@ Try(Source.fromURL(url)("ISO-8859-1").take(10000).mkString). getOrElse { println(s" Problem with: $url"); ""} } -get_page("https://nms.kcl.ac.uk/christiana.urban/") + +// e.g. get_page("https://nms.kcl.ac.uk/christiana.urban/") + // regex for URLs val http_pattern = """"https?://[^"]*"""".r /*@\label{httpline}@*/ -// drops the first and last character from a string +// drops the first and last characters from a string def unquote(s: String) = s.drop(1).dropRight(1) def get_all_URLs(page: String) : Set[String] = http_pattern.findAllIn(page).map(unquote).toSet /*@\label{findallline}@*/ -// naive version of crawl - searches until a given depth, -// visits pages potentially more than once +// a very naive version of crawl - searches until a given +// depth, visits pages potentially more than once def crawl(url: String, n: Int) : Unit = { if (n == 0) () else { @@ -31,6 +33,7 @@ } // some starting URLs for the crawler + val startURL = """https://nms.kcl.ac.uk/christian.urban/""" //val startURL = """https://nms.kcl.ac.uk/luc.moreau/"""