// This version of the crawler only
// checks links in the "domain" urbanc

import io.Source
import scala.util.matching.Regex
import scala.util._

// gets the first 10K of a web-page
def get_page(url: String) : String = {
  Try(Source.fromURL(url)("ISO-8859-1").take(10000).mkString).
    getOrElse { println(s"  Problem with: $url"); ""}
}

// regexes for URLs and "my" domain
val http_pattern = """"https?://[^"]*"""".r
val my_urls = """urban""".r       /*@\label{myurlline}@*/
//val my_urls = """kcl.ac.uk""".r

def unquote(s: String) = s.drop(1).dropRight(1)

def get_all_URLs(page: String) : Set[String] =
  http_pattern.findAllIn(page).map(unquote).toSet

def crawl(url: String, n: Int) : Unit = {
  if (n == 0) ()                                  /*@\label{changestartline}@*/
  else if (my_urls.findFirstIn(url) == None) {
    println(s"Visiting: $n $url")
    get_page(url); ()
  }                                               /*@\label{changeendline}@*/
  else {
    println(s"Visiting: $n $url")
    for (u <- get_all_URLs(get_page(url)).par) crawl(u, n - 1)
  }
}

// starting URL for the crawler
val startURL = """https://nms.kcl.ac.uk/christian.urban/"""
//val startURL = """https://nms.kcl.ac.uk/christian.urban/bsc-projects-17.html"""

// can now deal with depth 3 and beyond
crawl(startURL, 3)


// This version of the crawler also
// "harvests" email addresses from webpages

import io.Source
import scala.util.matching.Regex
import scala.util._

def get_page(url: String) : String = {
  Try(Source.fromURL(url)("ISO-8859-1").take(10000).mkString).
    getOrElse { println(s"  Problem with: $url"); ""}
}

// regexes for URLs, for "my" domain and for email addresses
val http_pattern = """"https?://[^"]*"""".r
val email_pattern = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r   /*@\label{emailline}@*/

def unquote(s: String) = s.drop(1).dropRight(1)

def get_all_URLs(page: String) : Set[String] =
  http_pattern.findAllIn(page).map(unquote).toSet

def print_str(s: String) =
  if (s == "") () else println(s)

def crawl(url: String, n: Int) : Unit = {
  if (n == 0) ()
  else {
    println(s"  Visiting: $n $url")
    val page = get_page(url)
    print_str(email_pattern.findAllIn(page).mkString("\n"))   /*@\label{mainline}@*/
    for (u <- get_all_URLs(page).par) crawl(u, n - 1)
  }
}

// starting URL for the crawler
val startURL = """https://nms.kcl.ac.uk/christian.urban/"""

crawl(startURL, 3)
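
The email regex above can also be tried out on its own. The following REPL-style check is not part of the original files and uses a made-up sample string:

// quick sanity check of email_pattern (hypothetical sample text, not from a real page)
val sample = "Contact me at jane.doe@example.ac.uk for details."
email_pattern.findAllIn(sample).toList
// => List(jane.doe@example.ac.uk)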