// A Web-Crawler
//================

// call the parallel version with
//
// scala -cp scala-parallel-collections_2.13-0.2.0.jar crawler.scala


import io.Source
import scala.util._
import scala.collection.parallel.CollectionConverters._

// the idea is to look for links using the
// regular expression "https?://[^"]*" and for
// email addresses using yet another regex.

// gets the first 10K characters of a web-page;
// returns the empty string if the page cannot be fetched
def get_page(url: String) : String = {
  Try(Source.fromURL(url)("ISO-8859-1").take(10000).mkString).
    getOrElse { println(s" Problem with: $url"); "" }
}
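
// an illustrative call (not part of the original file); it fetches the
// start page that is also used further below
//
// get_page("""https://nms.kcl.ac.uk/christian.urban/""")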

// regex for URLs and emails
val http_pattern = """"https?://[^"]*"""".r
val email_pattern = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r

// val s = "foo bla christian@kcl.ac.uk 1234567"
// email_pattern.findAllIn(s).toList
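
// matches of http_pattern include the surrounding quote characters;
// an illustrative example (the HTML snippet is not from the original):
//
// http_pattern.findAllIn("""<a href="https://nms.kcl.ac.uk">KCL</a>""").toList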

// drops the first and last character from a string
def unquote(s: String) = s.drop(1).dropRight(1)
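
// for example (illustration only):
// unquote(""""https://nms.kcl.ac.uk"""") == "https://nms.kcl.ac.uk"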

def get_all_URLs(page: String): Set[String] =
  http_pattern.findAllIn(page).map(unquote).toSet
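
// in contrast to the raw regex matches above, get_all_URLs strips the
// quotes, e.g. (illustration only)
//
// get_all_URLs("""<a href="https://nms.kcl.ac.uk">KCL</a>""")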

// a naive version of crawl - searches until a given depth,
// visits pages potentially more than once
def crawl(url: String, n: Int) : Unit = {
  if (n == 0) ()
  else {
    println(s" Visiting: $n $url")
    val page = get_page(url)
    for (u <- get_all_URLs(page).par) crawl(u, n - 1)
  }
}
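
// A possible improvement (a sketch only, not part of the original file):
// a variant that remembers which URLs it has already visited, so pages
// are not crawled twice. The name crawl_visited and the sequential
// foldLeft (instead of .par) are assumptions made here for simplicity.
def crawl_visited(url: String, n: Int, visited: Set[String] = Set()) : Set[String] = {
  if (n == 0 || visited.contains(url)) visited
  else {
    println(s" Visiting: $n $url")
    val page = get_page(url)
    // thread the growing set of visited URLs through the recursive calls
    get_all_URLs(page).foldLeft(visited + url)((vs, u) => crawl_visited(u, n - 1, vs))
  }
}

// crawl_visited(startURL, 2)   (with startURL as defined below)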

// a starting URL for the crawler
val startURL = """https://nms.kcl.ac.uk/christian.urban/"""

//crawl(startURL, 2)

// a primitive email harvester
def emails(url: String, n: Int) : Set[String] = {
  if (n == 0) Set()
  else {
    println(s" Visiting: $n $url")
    val page = get_page(url)
    val new_emails = email_pattern.findAllIn(page).toSet
    new_emails ++ (for (u <- get_all_URLs(page).par) yield emails(u, n - 1)).flatten
  }
}

println(emails(startURL, 3))
