equal
deleted
inserted
replaced
|
1 // This version of the crawler also |
|
2 // harvests emails from webpages |
|
3 |
1 import io.Source |
4 import io.Source |
2 import scala.util.matching.Regex |
5 import scala.util.matching.Regex |
3 import scala.util._ |
6 import scala.util._ |
4 |
7 |
5 // gets the first ~10K of a page |
8 // gets the first ~10K of a page |
14 // regex for URLs |
17 // regex for URLs |
15 val http_pattern = """\"https?://[^\"]*\"""".r |
18 val http_pattern = """\"https?://[^\"]*\"""".r |
16 val my_urls = """urbanc""".r |
19 val my_urls = """urbanc""".r |
17 val email_pattern = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r |
20 val email_pattern = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r |
18 |
21 |
19 // http://net.tutsplus.com/tutorials/other/8-regular-expressions-you-should-know/ |
22 // The regular expression for emails comes from: |
|
23 // http://net.tutsplus.com/tutorials/other/8-regular-expressions-you-should-know/ |
20 |
24 |
21 def unquote(s: String) = s.drop(1).dropRight(1) |
25 def unquote(s: String) = s.drop(1).dropRight(1) |
22 |
26 |
23 def get_all_URLs(page: String) : Set[String] = { |
27 def get_all_URLs(page: String) : Set[String] = { |
24 http_pattern.findAllIn(page).map(unquote).toSet |
28 http_pattern.findAllIn(page).map(unquote).toSet |
35 println(email_pattern.findAllIn(page).mkString("\n")) |
39 println(email_pattern.findAllIn(page).mkString("\n")) |
36 for (u <- get_all_URLs(page)) crawl(u, n - 1) |
40 for (u <- get_all_URLs(page)) crawl(u, n - 1) |
37 } |
41 } |
38 } |
42 } |
39 |
43 |
40 // can now deal with depth 3 |
|
41 // start on command line |
|
42 crawl(startURL, 3) |
44 crawl(startURL, 3) |
43 |
|