import scala.io.Source
import scala.util.matching.Regex

// gets the first ~10K of a page
def get_page(url: String) : String = {
  try {
    Source.fromURL(url).take(10000).mkString
  }
  catch {
    // catch any exception (e.g. unknown host, connection refused)
    case _ : Exception => {
      println(" Problem with: " + url)
      ""
    }
  }
}

// non-existing page -> returns the empty string
get_page("""http://www.foobar.com""")


// starting URL for the crawler
val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/"""

// starts with a "
// then either http or https
// then ://
// then any character that is not "
// finally a closing "
// (equivalently: """\"((?:http|https)://(?:[^\"])*)\"""".r)
val http_pattern = """\"(https?://[^\"]*)\"""".r
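
// a quick check of the pattern on a made-up snippet (the URLs are
// illustrative only); note that the matches still carry their quotes
http_pattern.findAllIn("""<a href="http://a.com"> <a href="https://b.org/x">""").toList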

def unquote(s: String) = s.drop(1).dropRight(1)
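
// for example (hypothetical input, not from a real page):
unquote("\"http://a.com\"")   // -> http://a.com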

def get_all_URLs(page: String) : Set[String] = {
  http_pattern.findAllIn(page).map(unquote).toSet
}

// get all URLs in startURL
get_all_URLs(get_page(startURL))

// number of all URLs in startURL
get_all_URLs(get_page(startURL)).toList.length
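
// equivalently, without building an intermediate list, one can
// take the size of the set directly
get_all_URLs(get_page(startURL)).size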


// naive version - searches until a given depth;
// visits pages potentially more than once
def crawl(url: String, n: Int) : Unit = {
  if (n == 0) ()
  else {
    println("Visiting: " + n + " " + url)
    for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1)
  }
}

crawl(startURL, 2)
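
// to make the re-visiting visible: a small sketch (the counter
// `calls` and crawl_counted are not part of the original code)
// that counts how often pages are fetched
var calls = 0

def crawl_counted(url: String, n: Int) : Unit = {
  if (n == 0) ()
  else {
    calls += 1
    for (u <- get_all_URLs(get_page(url))) crawl_counted(u, n - 1)
  }
}

// after crawl_counted(startURL, 2), calls can exceed the number
// of distinct URLs encountered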


// breadth-first version that does not visit
// pages twice
def bf_crawl(todo: Set[String], visited: Set[String], n: Int) : Unit = {
  if (n == 0) ()
  else {
    val new_todo = todo.flatMap {
      url => {
        if (visited.contains(url)) Set[String]()
        else {
          println("Visiting: " + n + " " + url)
          get_all_URLs(get_page(url))
        }
      }
    }
    bf_crawl(new_todo, visited union todo, n - 1)
  }
}

bf_crawl(Set(startURL), Set(), 2)


// breadth-first version that does not visit pages
// twice and stays within "my" domain
val my_pattern = """urbanc""".r
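
// e.g. (illustrative URLs): keeps links containing "urbanc",
// rejects everything else
my_pattern.findFirstIn("""http://www.inf.kcl.ac.uk/staff/urbanc/""")  // Some(urbanc)
my_pattern.findFirstIn("""http://www.google.com""")                   // None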

// breadth-first search avoiding double visits
def bf_crawl2(todo: Set[String], visited: Set[String], n: Int) : Unit = {
  if (n == 0) ()
  else {
    val new_todo = todo.flatMap {
      url => {
        if (visited.contains(url)) Set[String]()
        else if (my_pattern.findFirstIn(url).isEmpty) Set[String]()
        else {
          println("Visiting: " + n + " " + url)
          get_all_URLs(get_page(url))
        }
      }
    }
    bf_crawl2(new_todo, visited union todo, n - 1)
  }
}

bf_crawl2(Set(startURL), Set(), 5)

// email harvester
// from
// http://net.tutsplus.com/tutorials/other/8-regular-expressions-you-should-know/

val email_pattern = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r
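
// a quick test on a made-up address (nothing harvested here)
email_pattern.findFirstIn("contact: jane.doe@example.co.uk")
// -> Some(jane.doe@example.co.uk)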

def bf_crawl3(todo: Set[String], visited: Set[String], n: Int) : Unit = {
  if (n == 0) ()
  else {
    val new_todo = todo.flatMap {
      url => {
        if (visited.contains(url)) Set[String]()
        else {
          println("Visiting: " + n + " " + url)
          val page = get_page(url)
          println(email_pattern.findAllIn(page).mkString("\n"))
          // reuse the already fetched page instead of downloading it again
          get_all_URLs(page)
        }
      }
    }
    bf_crawl3(new_todo, visited union todo, n - 1)
  }
}

bf_crawl3(Set(startURL), Set(), 3)


// a depth-first version with a global visited set does not work:
// a page first reached with remaining depth 1 is marked visited,
// so it is skipped when reached again with remaining depth 2,
// even though its links could then still be followed
var visited = Set[String]()

def crawl(url: String, n: Int) : Unit = {
  if (n == 0) ()
  else if (visited.contains(url)) () //println("Already visited: " + n + " " + url)
  else {
    println("Visiting: " + n + " " + url)
    visited += url
    for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1)
  }
}
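
// one possible fix (a sketch; `best` and crawl_df are not in the
// original code): remember the largest remaining depth with which
// each page has been visited, and revisit only when the current
// depth budget is strictly larger
var best = Map[String, Int]()

def crawl_df(url: String, n: Int) : Unit = {
  if (n == 0 || best.getOrElse(url, 0) >= n) ()
  else {
    println("Visiting: " + n + " " + url)
    best += (url -> n)
    for (u <- get_all_URLs(get_page(url))) crawl_df(u, n - 1)
  }
}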