import scala.io.Source
import scala.util.matching.Regex

// gets the first ~10K of a page
def get_page(url: String) : String = {
  try {
    Source.fromURL(url).take(10000).mkString
  }
  catch {
    case e : Exception => {
      println(" Problem with: " + url)
      ""
    }
  }
}
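
// an alternative sketch of the same idea using scala.util.Try instead of
// try/catch; the name get_page_alt is only for illustration and is not
// used by the crawler functions below
import scala.util.Try

def get_page_alt(url: String) : String =
  Try(Source.fromURL(url).take(10000).mkString).getOrElse {
    println(" Problem with: " + url); ""
  }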

// non-existing page -> returns the empty string
get_page("""http://www.foobar.com""")


// starting URL for the crawler
val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/"""

// regular expression for URLs in double quotes:
// starts with an "
// then either http or https
// then ://
// then any character that is not "
// finally "
val http_pattern = """\"((?:http|https)://(?:[^\"])*)\"""".r

// a more compact, equivalent version of the same pattern
// val http_pattern = """\"(https?://[^\"]*)\"""".r

// removes the enclosing double quotes from a match
def unquote(s: String) = s.drop(1).dropRight(1)

// extracts all URLs (without the quotes) occurring in a page
def get_all_URLs(page: String) : Set[String] = {
  http_pattern.findAllIn(page).map(unquote).toSet
}
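
// a small offline check of the URL extraction; the HTML snippet below is
// made up and not taken from any real page
val test_page = """<a href="http://www.inf.kcl.ac.uk/staff/urbanc/">home</a> <a href="https://www.scala-lang.org/">Scala</a>"""
get_all_URLs(test_page)
// => a set containing the two URLs above, without the quotes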

// get all URLs in startURL
get_all_URLs(get_page(startURL))

// number of all URLs in startURL
get_all_URLs(get_page(startURL)).toList.length


// naive version - searches until a given depth,
// visits pages potentially more than once
def crawl(url: String, n: Int) : Unit = {
  if (n == 0) ()
  else {
    println("Visiting: " + n + " " + url)
    for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1)
  }
}

crawl(startURL, 2)


// breadth-first version without visiting
// pages twice
def bf_crawl(todo: Set[String], visited: Set[String], n: Int) : Unit = {
  if (n == 0) ()
  else {
    val new_todo = todo.flatMap {
      url => {
        if (visited.contains(url)) Set[String]()
        else {
          println("Visiting: " + n + " " + url)
          get_all_URLs(get_page(url))
        }
      }
    }
    bf_crawl(new_todo, visited union todo, n - 1)
  }
}

bf_crawl(Set(startURL), Set(), 2)


// breadth-first version without visiting
// pages twice and only in "my" domain
val my_pattern = """urbanc""".r
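
// the pattern above simply checks whether "urbanc" occurs anywhere in a URL;
// two illustrative checks using the URLs from above
my_pattern.findFirstIn("http://www.inf.kcl.ac.uk/staff/urbanc/")   // => Some(urbanc)
my_pattern.findFirstIn("http://www.foobar.com")                    // => None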

// breadth-first search avoiding double visits,
// restricted to URLs that match my_pattern
def bf_crawl2(todo: Set[String], visited: Set[String], n: Int) : Unit = {
  if (n == 0) ()
  else {
    val new_todo = todo.flatMap {
      url => {
        if (visited.contains(url)) Set[String]()
        else if (my_pattern.findFirstIn(url) == None) Set[String]()
        else {
          println("Visiting: " + n + " " + url)
          get_all_URLs(get_page(url))
        }
      }
    }
    bf_crawl2(new_todo, visited union todo, n - 1)
  }
}

bf_crawl2(Set(startURL), Set(), 5)

// email harvester
// the regular expression is from
// http://net.tutsplus.com/tutorials/other/8-regular-expressions-you-should-know/

val email_pattern = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r
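
// a quick check of the email pattern on a made-up address
email_pattern.findFirstIn("contact me at foo.bar@example.co.uk please")
// => Some(foo.bar@example.co.uk)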

// breadth-first crawler that also prints out all email
// addresses it finds on the visited pages
def bf_crawl3(todo: Set[String], visited: Set[String], n: Int) : Unit = {
  if (n == 0) ()
  else {
    val new_todo = todo.flatMap {
      url => {
        if (visited.contains(url)) Set[String]()
        else {
          println("Visiting: " + n + " " + url)
          val page = get_page(url)
          println(email_pattern.findAllIn(page).mkString("\n"))
          get_all_URLs(page)   // reuse the page fetched above instead of downloading it again
        }
      }
    }
    bf_crawl3(new_todo, visited union todo, n - 1)
  }
}

bf_crawl3(Set(startURL), Set(), 3)


// depth-first version with a mutable visited set; it does not work
// as intended, because it might visit a page already at depth 1
// while it would still want to visit it again at depth 2
var visited = Set[String]()

// note: this re-defines crawl from above (fine in the REPL/worksheet)
def crawl(url: String, n: Int) : Unit = {
  if (n == 0) ()
  else if (visited.contains(url)) () //println("Already visited: " + n + " " + url)
  else {
    println("Visiting: " + n + " " + url)
    visited += url
    for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1)
  }
}
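
// example run with the mutable visited set (same call as for the
// naive version above)
crawl(startURL, 2)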