progs/crawler.scala
// A Web-Crawler
//================

// call the parallel version with
//
// scala -cp scala-parallel-collections_2.13-0.2.0.jar crawler.scala


import io.Source
import scala.util._
import scala.collection.parallel.CollectionConverters._
       

// the idea is to look for links using the
// regular expression "https?://[^"]*" and for
// email addresses using yet another regex (see
// the examples after the pattern definitions below)


// gets the first 10K characters of a web-page
def get_page(url: String) : String = {
  Try(Source.fromURL(url)("ISO-8859-1").take(10000).mkString).
    getOrElse { println(s" Problem with: $url"); "" }
}
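
// for example (using the start URL defined further below):
// get_page("""https://nms.kcl.ac.uk/christian.urban/""")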
       

// regex for URLs and emails
val http_pattern = """"https?://[^"]*"""".r
val email_pattern = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r

// val s = "foo bla christian@kcl.ac.uk 1234567"
// email_pattern.findAllIn(s).toList
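
// similarly for the link regex - the HTML snippet below is a made-up
// example; note that the match keeps the surrounding quotes, which
// unquote below removes:
// val h = """<a href="https://kcl.ac.uk">KCL</a>"""
// http_pattern.findAllIn(h).toList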
       

// drops the first and last character from a string
def unquote(s: String) = s.drop(1).dropRight(1)
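// e.g. (an illustrative call, not in the original file)
// unquote(""""https://foo.com"""") == "https://foo.com"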
       

def get_all_URLs(page: String): Set[String] =
  http_pattern.findAllIn(page).map(unquote).toSet
       

// a naive version of crawl - searches until a given depth,
// visits pages potentially more than once
def crawl(url: String, n: Int) : Unit = {
  if (n == 0) ()
  else {
    println(s"  Visiting: $n $url")
    val page = get_page(url)
    // crawl all links found on the page, in parallel
    for (u <- get_all_URLs(page).par) crawl(u, n - 1)
  }
}
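
// One possible refinement - a sketch, not part of the original file -
// avoids re-visiting pages by threading a set of already-visited URLs
// through the recursion. It is kept sequential, since threading state
// through the .par version would need synchronisation. The name
// crawl_once is made up for this sketch.
def crawl_once(url: String, n: Int, visited: Set[String] = Set()) : Set[String] = {
  if (n == 0 || visited.contains(url)) visited
  else {
    println(s"  Visiting: $n $url")
    val page = get_page(url)
    // fold the recursive calls over the URLs, accumulating visited pages
    get_all_URLs(page).foldLeft(visited + url) {
      (seen, u) => crawl_once(u, n - 1, seen)
    }
  }
}

//crawl_once(startURL, 2)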
       

// a starting URL for the crawler
val startURL = """https://nms.kcl.ac.uk/christian.urban/"""

//crawl(startURL, 2)
       

// a primitive email harvester
def emails(url: String, n: Int) : Set[String] = {
  if (n == 0) Set()
  else {
    println(s"  Visiting: $n $url")
    val page = get_page(url)
    val new_emails = email_pattern.findAllIn(page).toSet
    new_emails ++ (for (u <- get_all_URLs(page).par) yield emails(u, n - 1)).flatten
  }
}
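
// the for-yield above gives a (parallel) set of result sets, one per
// sub-page, which .flatten merges into a single set of emails, e.g.
//
//   Set(Set("a@b.com"), Set("c@d.org")).flatten
//   // => Set("a@b.com", "c@d.org")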
       

println(emails(startURL, 3))