progs/crawler3.scala
changeset 242 35104ee14f87
parent 112 95ee5cc5c05d
child 254 dcd4688690ce
equal deleted inserted replaced
241:10f02605a46a 242:35104ee14f87
     9   Try(Source.fromURL(url).take(10000).mkString) getOrElse 
     9   Try(Source.fromURL(url).take(10000).mkString) getOrElse 
    10     { println(s"  Problem with: $url"); ""}
    10     { println(s"  Problem with: $url"); ""}
    11 }
    11 }
    12 
    12 
    13 // regexes for URLs, for "my" domain and for email addresses
    13 // regexes for URLs, for "my" domain and for email addresses
    14 val http_pattern = """\"https?://[^\"]*\"""".r
    14 val http_pattern = """"https?://[^"]*"""".r
    15 val my_urls = """urbanc""".r
    15 val my_urls = """urbanc""".r
    16 val email_pattern = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r
    16 val email_pattern = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r
    17 
       
    18 // The regular expression for emails comes from: 
       
    19 //    http://net.tutsplus.com/tutorials/other/8-regular-expressions-you-should-know/
       
    20 
    17 
    21 def unquote(s: String) = s.drop(1).dropRight(1)
    18 def unquote(s: String) = s.drop(1).dropRight(1)
    22 
    19 
    23 def get_all_URLs(page: String) : Set[String] = {
    20 def get_all_URLs(page: String) : Set[String] = {
    24   http_pattern.findAllIn(page).map(unquote).toSet
    21   http_pattern.findAllIn(page).map(unquote).toSet
    25 }
    22 }
    26 
    23 
       
    24 def print_str(s: String) = 
       
    25   if (s == "") () else println(s)
       
    26 
    27 def crawl(url: String, n: Int) : Unit = {
    27 def crawl(url: String, n: Int) : Unit = {
    28   if (n == 0) ()
    28   if (n == 0) ()
    29   //else if (my_urls.findFirstIn(url) == None) ()
       
    30   else {
    29   else {
    31     println(s"Visiting: $n $url")
    30     println(s"Visiting: $n $url")
    32     val page = get_page(url)
    31     val page = get_page(url)
    33     println(email_pattern.findAllIn(page).mkString("\n"))
    32     print_str(email_pattern.findAllIn(page).mkString("\n"))
    34     for (u <- get_all_URLs(page)) crawl(u, n - 1)
    33     for (u <- get_all_URLs(page).par) crawl(u, n - 1)
    35   }
    34   }
    36 }
    35 }
    37 
    36 
    38 // starting URL for the crawler
    37 // starting URL for the crawler
    39 val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/"""
    38 val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc"""
    40 
    39 
    41 crawl(startURL, 3)
    40 crawl(startURL, 3)