|
1 // A Web-Crawler |
|
2 //================ |
|
3 |
|
4 // call parallel version with |
|
5 // |
|
6 // scala -cp scala-parallel-collections_2.13-0.2.0.jar crawler.scala |
|
7 |
|
8 |
|
9 import io.Source |
|
10 import scala.util._ |
|
11 import scala.collection.parallel.CollectionConverters._ |
|
12 |
|
13 // the idea is to look for links using the |
|
14 // regular expression "https?://[^"]*" and for |
|
15 // email addresses using yet another regex. |
|
16 |
|
17 |
|
18 // gets the first 10K of a web-page |
|
// Fetches the first 10K characters of a web-page as ISO-8859-1 text.
// On any failure (bad URL, unreachable host, read error) prints a
// warning and returns "".
// Fix: the original never closed the BufferedSource returned by
// Source.fromURL; scala.util.Using closes it even when reading fails.
def get_page(url: String) : String = {
  Using(Source.fromURL(url)("ISO-8859-1"))(_.take(10000).mkString).
    getOrElse { println(s" Problem with: $url"); "" }
}
|
23 |
|
// Regexes for harvesting: a quoted http(s) link (quotes included in
// the match — see unquote), and a simple e-mail address with the
// local part, domain and TLD in capture groups 1-3.
val http_pattern = "\"https?://[^\"]*\"".r
val email_pattern = "([a-z0-9_\\.-]+)@([\\da-z\\.-]+)\\.([a-z\\.]{2,6})".r
|
27 |
|
28 // val s = "foo bla christian@kcl.ac.uk 1234567" |
|
29 // email_pattern.findAllIn(s).toList |
|
30 |
|
// Strips the first and last character (the surrounding quotes of a
// matched link); safe on strings shorter than two characters.
def unquote(s: String) = s.slice(1, s.length - 1)
|
33 |
|
// Collects every quoted http(s) link on a page, with the
// surrounding quotes removed; duplicates collapse into the Set.
def get_all_URLs(page: String): Set[String] =
  (for (link <- http_pattern.findAllIn(page)) yield unquote(link)).toSet
|
36 |
|
// A naive crawler: follows links in parallel up to depth n and may
// visit the same page more than once (no visited-set).
// Fix: the original fetched each page TWICE — it bound
// `val page = get_page(url)` and then called get_page(url) again
// inside the loop; now the already-fetched page is reused.
def crawl(url: String, n: Int) : Unit = {
  if (n == 0) ()
  else {
    println(s" Visiting: $n $url")
    val page = get_page(url)
    for (u <- get_all_URLs(page).par) crawl(u, n - 1)
  }
}
|
47 |
|
// starting URL for the crawler
val startURL = "https://nms.kcl.ac.uk/christian.urban/"

//crawl(startURL, 2)
|
52 |
|
// A primitive e-mail harvester: gathers the addresses on the page
// at url, then recurses (in parallel) into every link found there,
// down to depth n. Pages may be visited more than once.
def emails(url: String, n: Int) : Set[String] = {
  if (n == 0) Set()
  else {
    println(s" Visiting: $n $url")
    val page = get_page(url)
    val here = email_pattern.findAllIn(page).toSet
    val below = get_all_URLs(page).par.map(link => emails(link, n - 1)).flatten
    here ++ below
  }
}
|
63 |
|
// entry point: harvest and print every e-mail address reachable
// from startURL within 3 link-hops (network I/O; may be slow)
println(emails(startURL, 3))
|
65 |