diff -r e3c64f22dd31 -r 14914b57e207 progs/crawler3.scala
--- a/progs/crawler3.scala	Thu Apr 16 19:15:46 2020 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,39 +0,0 @@
-// This version of the crawler also
-// "harvests" email addresses from webpages
-
-import io.Source
-import scala.util.matching.Regex
-import scala.util._
-
-def get_page(url: String) : String = {
-  Try(Source.fromURL(url)("ISO-8859-1").take(10000).mkString).
-    getOrElse { println(s"  Problem with: $url"); ""}
-}
-
-// regexes for URLs and for email addresses
-val http_pattern = """"https?://[^"]*"""".r
-val email_pattern = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r /*@\label{emailline}@*/
-
-def unquote(s: String) = s.drop(1).dropRight(1)
-
-def get_all_URLs(page: String) : Set[String] =
-  http_pattern.findAllIn(page).map(unquote).toSet
-
-def print_str(s: String) =
-  if (s == "") () else println(s)
-
-def crawl(url: String, n: Int) : Unit = {
-  if (n == 0) ()
-  else {
-    println(s"  Visiting: $n $url")
-    val page = get_page(url)
-    print_str(email_pattern.findAllIn(page).mkString("\n")) /*@\label{mainline}@*/
-    for (u <- get_all_URLs(page).par) crawl(u, n - 1)
-  }
-}
-
-// starting URL for the crawler
-val startURL = """https://nms.kcl.ac.uk/christian.urban/"""
-
-
-crawl(startURL, 3)