import io.Source
import scala.util.matching.Regex

// gets the first ~10K of a page
def get_page(url: String) : String = {
  try {
    Source.fromURL(url).take(10000).mkString
  }
  catch {
    // catch any exception (e.g. unknown host) and return the empty string
    case e: Exception => {
      println(" Problem with: " + url)
      ""
    }
  }
}
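
// an alternative formulation (a sketch, not in the original code):
// scala.util.Try expresses the same "empty string on failure" idea;
// the name get_page_alt is made up for illustration
import scala.util.Try

def get_page_alt(url: String) : String =
  Try(Source.fromURL(url).take(10000).mkString).getOrElse {
    println(" Problem with: " + url)
    ""
  }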

// non-existing page -> returns the empty string
get_page("""http://www.foobar.com""")

// starting URL for the crawler
val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/"""

// starts with a "
// then either http or https
// then ://
// then any character that is not "
// finally "
val http_pattern = """\"((?:http|https)://(?:[^\"])*)\"""".r

// an equivalent, more compact form of the same pattern
// (commented out so that http_pattern is not defined twice):
// val http_pattern = """\"(https?://[^\"]*)\"""".r
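
// quick sanity check of the pattern on a made-up snippet
// (example.com is illustrative only)
http_pattern.findAllIn("""<a href="http://example.com/x">""").toList
// => a one-element list containing "http://example.com/x", quotes included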

// strips the enclosing quotation marks from a match
def unquote(s: String) = s.drop(1).dropRight(1)

def get_all_URLs(page: String) : Set[String] = {
  http_pattern.findAllIn(page).map(unquote).toSet
}
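
// a small offline check of get_all_URLs on a made-up HTML snippet
// (the example.com URLs are illustrative only)
get_all_URLs("""<a href="http://example.com/a"> <a href="https://example.com/b">""")
// => Set(http://example.com/a, https://example.com/b)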

// get all URLs in startURL
get_all_URLs(get_page(startURL))

// number of all URLs in startURL
get_all_URLs(get_page(startURL)).toList.length

// naive version - searches until a given depth,
// visits pages potentially more than once
def crawl(url: String, n: Int) : Unit = {
  if (n == 0) ()
  else {
    println("Visiting: " + n + " " + url)
    for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1)
  }
}

crawl(startURL, 2)

// breadth-first version without visiting
// pages twice
def bf_crawl(todo: Set[String], visited: Set[String], n: Int) : Unit = {
  if (n == 0) ()
  else {
    val new_todo = todo.flatMap { url =>
      if (visited.contains(url)) Set[String]()
      else {
        println("Visiting: " + n + " " + url)
        get_all_URLs(get_page(url))
      }
    }
    bf_crawl(new_todo, visited union todo, n - 1)
  }
}

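// note on the design (added remark): new_todo may still contain URLs
// from todo itself; they are filtered out in the next round because
// "visited union todo" is passed along as the new visited set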
bf_crawl(Set(startURL), Set(), 2)

// breadth-first version without visiting
// pages twice and only in "my" domain
val my_pattern = """urbanc""".r
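
// the domain filter in action (illustrative check)
my_pattern.findFirstIn(startURL)                // => Some(urbanc)
my_pattern.findFirstIn("http://www.foobar.com") // => None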

// breadth-first search avoiding double searches
def bf_crawl2(todo: Set[String], visited: Set[String], n: Int) : Unit = {
  if (n == 0) ()
  else {
    val new_todo = todo.flatMap { url =>
      if (visited.contains(url)) Set[String]()
      else if (my_pattern.findFirstIn(url) == None) Set[String]()
      else {
        println("Visiting: " + n + " " + url)
        get_all_URLs(get_page(url))
      }
    }
    bf_crawl2(new_todo, visited union todo, n - 1)
  }
}

bf_crawl2(Set(startURL), Set(), 5)

// email harvester
// the regex below is from
// http://net.tutsplus.com/tutorials/other/8-regular-expressions-you-should-know/

val email_pattern = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r
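
// sanity check on a made-up address (illustrative only)
email_pattern.findFirstIn("first.last@example.ac.uk")
// => Some(first.last@example.ac.uk)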

def bf_crawl3(todo: Set[String], visited: Set[String], n: Int) : Unit = {
  if (n == 0) ()
  else {
    val new_todo = todo.flatMap { url =>
      if (visited.contains(url)) Set[String]()
      else {
        println("Visiting: " + n + " " + url)
        val page = get_page(url)
        println(email_pattern.findAllIn(page).mkString("\n"))
        get_all_URLs(page)   // reuse the fetched page instead of downloading it twice
      }
    }
    bf_crawl3(new_todo, visited union todo, n - 1)
  }
}

bf_crawl3(Set(startURL), Set(), 3)

// depth-first version does not work,
// because it might visit pages at depth 1
// while it still wants to visit them at
// depth 2
var visited = Set[String]()

def crawl(url: String, n: Int) : Unit = {
  if (n == 0) ()
  else if (visited.contains(url)) () //println("Already visited: " + n + " " + url)
  else {
    println("Visiting: " + n + " " + url)
    visited += url
    for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1)
  }
}
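
// a possible fix (a sketch, not in the original code): remember the
// largest remaining depth with which each page has been visited and
// re-visit only when more depth is now available; the names crawl2
// and best_depth are made up for illustration
var best_depth = Map[String, Int]().withDefaultValue(0)

def crawl2(url: String, n: Int) : Unit = {
  if (n == 0 || best_depth(url) >= n) ()
  else {
    println("Visiting: " + n + " " + url)
    best_depth += (url -> n)
    for (u <- get_all_URLs(get_page(url))) crawl2(u, n - 1)
  }
}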