// A Web-Crawler
//================

// call parallel version with
//
// scala -cp scala-parallel-collections_2.13-0.2.0.jar crawler.scala


import io.Source
import scala.util._
import scala.collection.parallel.CollectionConverters._
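// (the CollectionConverters import adds the .par method, which turns a
//  plain collection into its parallel counterpart - this is what makes
//  the for-loops below run in parallel)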

// the idea is to look for links using the
// regular expression "https?://[^"]*" and for
// email addresses using yet another regex.

// gets the first 10K of a web-page
def get_page(url: String) : String = {
  Try(Source.fromURL(url)("ISO-8859-1").take(10000).mkString).
    getOrElse { println(s" Problem with: $url"); ""}
}
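// for example (requires network access; URL taken from startURL below):
//  get_page("https://nms.kcl.ac.uk/christian.urban/")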

// regex for URLs and emails
val http_pattern = """"https?://[^"]*"""".r
val email_pattern = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r

//  val s = "foo bla christian@kcl.ac.uk 1234567"
//  email_pattern.findAllIn(s).toList
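
//  similarly for links - note that http_pattern matches the
//  surrounding quotes as well (hypothetical snippet):
//  val h = """<a href="https://nms.kcl.ac.uk/christian.urban/">"""
//  http_pattern.findAllIn(h).toList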

// drops the first and last character from a string
def unquote(s: String) = s.drop(1).dropRight(1)
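//  e.g. unquote(""""https://foo.com"""") == "https://foo.com"   (hypothetical URL)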

def get_all_URLs(page: String): Set[String] =
  http_pattern.findAllIn(page).map(unquote).toSet
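
//  e.g. get_all_URLs(get_page("https://nms.kcl.ac.uk/christian.urban/"))
//  returns the set of links on that page, quotes already stripped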

// a naive version of crawl - searches until a given depth,
// visits pages potentially more than once
def crawl(url: String, n: Int) : Unit = {
  if (n == 0) ()
  else {
    println(s"  Visiting: $n $url")
    val page = get_page(url)
    // extract the links from the already fetched page and crawl them in parallel
    for (u <- get_all_URLs(page).par) crawl(u, n - 1)
  }
}
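
// a possible refinement (not in the original script): thread a set of
// already-visited URLs through the recursion, so that no page is fetched
// twice; a sequential sketch only, since threading the set through the
// parallel version would need synchronisation - the name crawl2 is ours
def crawl2(url: String, n: Int, visited: Set[String] = Set()) : Set[String] = {
  if (n == 0 || visited.contains(url)) visited
  else {
    println(s"  Visiting: $n $url")
    val page = get_page(url)
    get_all_URLs(page).foldLeft(visited + url)((seen, u) => crawl2(u, n - 1, seen))
  }
}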

// a starting URL for the crawler
val startURL = """https://nms.kcl.ac.uk/christian.urban/"""

//crawl(startURL, 2)

// a primitive email harvester
def emails(url: String, n: Int) : Set[String] = {
  if (n == 0) Set()
  else {
    println(s"  Visiting: $n $url")
    val page = get_page(url)
    val new_emails = email_pattern.findAllIn(page).toSet
    // recursively harvest the linked pages in parallel; collecting the
    // results in a set removes duplicate addresses
    new_emails ++ (for (u <- get_all_URLs(page).par) yield emails(u, n - 1)).flatten
  }
}

// harvest emails up to depth 3, starting from startURL
println(emails(startURL, 3))