// A Web-Crawler
//================
//
// call the parallel version with
//
// scala -cp scala-parallel-collections_2.13-0.2.0.jar crawler.scala

import io.Source
import scala.util._
import scala.collection.parallel.CollectionConverters._

// the idea is to look for links using the
// regular expression "https?://[^"]*" and for
// email addresses using yet another regex

// gets the first 10K characters of a web-page
def get_page(url: String) : String = {
  Try(Source.fromURL(url)("ISO-8859-1").take(10000).mkString).
    getOrElse { println(s"  Problem with: $url"); "" }
}

// regexes for URLs and emails
val http_pattern = """"https?://[^"]*"""".r
val email_pattern = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r

// val s = "foo bla christian@kcl.ac.uk 1234567"
// email_pattern.findAllIn(s).toList

// drops the first and last character from a string
// (the URL regex above matches including the surrounding quotes)
def unquote(s: String) = s.drop(1).dropRight(1)

def get_all_URLs(page: String): Set[String] =
  http_pattern.findAllIn(page).map(unquote).toSet

// a naive version of crawl - searches until a given depth,
// visits pages potentially more than once
def crawl(url: String, n: Int) : Unit = {
  if (n == 0) ()
  else {
    println(s"  Visiting: $n $url")
    val page = get_page(url)
    for (u <- get_all_URLs(page).par) crawl(u, n - 1)
  }
}

// a starting URL for the crawler
val startURL = """https://nms.kcl.ac.uk/christian.urban/"""

//crawl(startURL, 2)

// a primitive email harvester
def emails(url: String, n: Int) : Set[String] = {
  if (n == 0) Set()
  else {
    println(s"  Visiting: $n $url")
    val page = get_page(url)
    val new_emails = email_pattern.findAllIn(page).toSet
    // convert the parallel result back to a sequential set before combining
    new_emails ++ (for (u <- get_all_URLs(page).par) yield emails(u, n - 1)).flatten.seq
  }
}

println(emails(startURL, 3))
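
// The naive crawl above may visit the same page several times. Below is a
// minimal sketch (not part of the original crawler) of a variant that
// remembers already-visited URLs; the name crawl_once and the mutable Set
// parameter are assumptions for illustration. The sketch stays sequential,
// because sharing a mutable set between parallel tasks would not be
// thread-safe.

import scala.collection.mutable

def crawl_once(url: String, n: Int,
               visited: mutable.Set[String] = mutable.Set()) : Unit = {
  if (n == 0 || visited.contains(url)) ()
  else {
    visited += url
    println(s"  Visiting: $n $url")
    for (u <- get_all_URLs(get_page(url))) crawl_once(u, n - 1, visited)
  }
}

// for example:
// crawl_once(startURL, 2)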