// A Web-Crawler
//================

// call the parallel version with
//
// scala -cp scala-parallel-collections_2.13-0.2.0.jar crawler.scala
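//
// (a note not in the original: without that jar on the classpath,
// the CollectionConverters import below fails; for a purely
// sequential run one can drop that import and the .par calls)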


import scala.io.Source
import scala.util._
import scala.collection.parallel.CollectionConverters._

// the idea is to look for links using the
// regular expression "https?://[^"]*" and for
// email addresses using yet another regex.

// gets the first 10K characters of a web-page
def get_page(url: String) : String = {
  Try(Source.fromURL(url)("ISO-8859-1").take(10000).mkString).
    getOrElse { println(s" Problem with: $url"); "" }
}
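
// for example, a malformed URL does not raise an exception but
// just prints a message and yields "" (an illustrative call,
// not in the original):
//
//   get_page("garbage")   // prints " Problem with: garbage", returns ""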

// regexes for URLs and emails
val http_pattern = """"https?://[^"]*"""".r
val email_pattern = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r

// for example:
//   val s = "foo bla christian@kcl.ac.uk 1234567"
//   email_pattern.findAllIn(s).toList

// drops the first and last character from a string
// (used here to strip the quotes around matched URLs)
def unquote(s: String) = s.drop(1).dropRight(1)

// extracts all unique URLs from a page
def get_all_URLs(page: String) : Set[String] =
  http_pattern.findAllIn(page).map(unquote).toSet
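
// a quick check of the URL regex and unquote combined
// (an illustrative snippet, not in the original):
//
//   val snippet = """...<a href="https://kcl.ac.uk">KCL</a>..."""
//   get_all_URLs(snippet)   // => Set(https://kcl.ac.uk)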

// a naive version of crawl - searches until a given depth,
// visits pages potentially more than once
def crawl(url: String, n: Int) : Unit = {
  if (n == 0) ()
  else {
    println(s"  Visiting: $n $url")
    val page = get_page(url)
    for (u <- get_all_URLs(page).par) crawl(u, n - 1)
  }
}
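
// a sketch (not in the original) of a less naive crawl that avoids
// revisiting pages: it threads the set of already-visited URLs
// through the recursion and returns it; kept sequential, because a
// shared visited set under .par would need synchronisation
def crawl2(url: String, n: Int, seen: Set[String] = Set()) : Set[String] = {
  if (n == 0 || seen.contains(url)) seen
  else {
    println(s"  Visiting: $n $url")
    get_all_URLs(get_page(url)).foldLeft(seen + url)((s, u) => crawl2(u, n - 1, s))
  }
}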

// a starting URL for the crawler
val startURL = """https://nms.kcl.ac.uk/christian.urban/"""

//crawl(startURL, 2)

// a primitive email harvester
def emails(url: String, n: Int) : Set[String] = {
  if (n == 0) Set()
  else {
    println(s"  Visiting: $n $url")
    val page = get_page(url)
    val new_emails = email_pattern.findAllIn(page).toSet
    // recurse over all URLs on the page in parallel and
    // collect the emails found below into one set
    new_emails ++ (for (u <- get_all_URLs(page).par) yield emails(u, n - 1)).flatten
  }
}

println(emails(startURL, 3))