# HG changeset patch
# User Christian Urban
# Date 1574145500 0
# Node ID 755d165633ec8e11402b0b447fbfd7dd87674764
# Parent 7b0055205ec900fcc842035974696874a0512141
updated

diff -r 7b0055205ec9 -r 755d165633ec progs/crawler.scala
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/progs/crawler.scala	Tue Nov 19 06:38:20 2019 +0000
@@ -0,0 +1,65 @@
+// A Web-Crawler
+//================
+
+// call the parallel version with
+//
+// scala -cp scala-parallel-collections_2.13-0.2.0.jar crawler.scala
+
+
+import io.Source
+import scala.util._
+import scala.collection.parallel.CollectionConverters._
+
+// the idea is to look for links using the
+// regular expression "https?://[^"]*" and for
+// email addresses using yet another regex.
+
+
+// gets the first 10K of a web-page
+def get_page(url: String) : String = {
+  Try(Source.fromURL(url)("ISO-8859-1").take(10000).mkString).
+    getOrElse { println(s"  Problem with: $url"); "" }
+}
+
+// regexes for URLs and email addresses
+val http_pattern = """"https?://[^"]*"""".r
+val email_pattern = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r
+
+// val s = "foo bla christian@kcl.ac.uk 1234567"
+// email_pattern.findAllIn(s).toList
+
+// drops the first and last character from a string
+def unquote(s: String) = s.drop(1).dropRight(1)
+
+// extracts all URLs from a page
+def get_all_URLs(page: String): Set[String] =
+  http_pattern.findAllIn(page).map(unquote).toSet
+
+// a naive version of crawl - searches until a given depth,
+// visits pages potentially more than once
+def crawl(url: String, n: Int) : Unit = {
+  if (n == 0) ()
+  else {
+    println(s"  Visiting: $n $url")
+    val page = get_page(url)
+    for (u <- get_all_URLs(page).par) crawl(u, n - 1)
+  }
+}
+
+// a starting URL for the crawler
+val startURL = """https://nms.kcl.ac.uk/christian.urban/"""
+
+//crawl(startURL, 2)
+
+// a primitive email harvester
+def emails(url: String, n: Int) : Set[String] = {
+  if (n == 0) Set()
+  else {
+    println(s"  Visiting: $n $url")
+    val page = get_page(url)
+    val new_emails = email_pattern.findAllIn(page).toSet
+    new_emails ++ (for (u <- get_all_URLs(page).par) yield emails(u, n - 1)).flatten
+  }
+}
+
+println(emails(startURL, 3))
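
Side note (not part of the changeset above): the comment on crawl in crawler.scala points out that the naive version may visit the same page more than once. Below is a minimal sketch of a de-duplicating variant under the assumption that get_page and get_all_URLs from the file are in scope; the name crawl_once and the visited accumulator are illustrative choices, not code from the repository.

  // sketch only: depth-bounded crawl that fetches each URL at most once
  // (crawl_once and 'visited' are hypothetical, not part of crawler.scala)
  def crawl_once(url: String, n: Int, visited: Set[String] = Set()) : Set[String] = {
    if (n == 0 || visited.contains(url)) visited
    else {
      println(s"  Visiting: $n $url")
      val page = get_page(url)
      // thread the set of already-seen URLs through the recursive calls
      get_all_URLs(page).foldLeft(visited + url) {
        (seen, u) => crawl_once(u, n - 1, seen)
      }
    }
  }

  // crawl_once(startURL, 2)

Keeping this variant sequential (no .par) lets the fold thread the visited set through the recursion; the parallel for-loop in the patch trades that bookkeeping for simplicity.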