// This version of the crawler also
// "harvests" email addresses from webpages

import io.Source
import scala.util.matching.Regex
import scala.util._

// gets the first ~10K of a web-page
def get_page(url: String) : String = {
  Try(Source.fromURL(url).take(10000).mkString) getOrElse
    { println(s" Problem with: $url"); ""}
}
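
// For example (illustrative note, not in the original script):
//   get_page("http://www.inf.kcl.ac.uk/staff/urbanc/")
// yields at most the first 10000 characters of that page, or "" (after
// printing a warning) if the page cannot be fetched.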

// regexes for URLs, for "my" domain and for email addresses
val http_pattern = """\"https?://[^\"]*\"""".r
val my_urls = """urbanc""".r
val email_pattern = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r

// The regular expression for emails comes from:

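// Illustration (not part of the original script): http_pattern matches a URL
// together with the double quotes around it, as it appears inside an href
// attribute, e.g. "http://example.org/contact" (a made-up address), while
// email_pattern matches bare addresses such as jane.doe@example.org (also
// made up). Because the quotes are part of the match, get_all_URLs below
// strips them with an unquote helper whose definition falls outside this
// excerpt; a minimal version consistent with that use would be:
def unquote(s: String) = s.drop(1).dropRight(1)
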
def get_all_URLs(page: String) : Set[String] = {
  http_pattern.findAllIn(page).map(unquote).toSet
}

// naive version - searches until a given depth,
// visits pages potentially more than once
def crawl(url: String, n: Int) : Unit = {
  if (n == 0) ()
  //else if (my_urls.findFirstIn(url) == None) ()
  else {
    println(s"Visiting: $n $url")
    val page = get_page(url)
    println(email_pattern.findAllIn(page).mkString("\n"))
    for (u <- get_all_URLs(page)) crawl(u, n - 1)
  }
}
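
// Note on the depth argument (not in the original script): n counts down by
// one per link level, so a call with n = 3 fetches the start page, the pages
// it links to, and the pages those link to, then stops. Un-commenting the
// my_urls test above restricts the crawl to URLs containing "urbanc".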

// starting URL for the crawler
val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/"""

crawl(startURL, 3)
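
// To try it out (assuming a Scala 2 installation and network access), save
// the file as, say, crawler.scala and run it as a script:
//
//   scala crawler.scala
//
// URLs that cannot be fetched are reported by get_page and yield no emails.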