// crawler.scala
// (recovered from the changeset diff 3a5e09a2ae54 -> b606c9439fa6 that
// first added this file, dated Wed Sep 26 02:08:55 2012 +0100)
//
// A small web-crawler script: it downloads pages, extracts the quoted
// http(s) URLs they contain and follows them up to a given depth.
// The bare expressions between the definitions are demo calls; they run
// (and access the network) whenever the script is evaluated.

import io.Source
import scala.util.matching.Regex

// gets the first ~10K of a page
//
// url: address of the page to download
// returns at most the first 10000 characters of the page; if the page
// cannot be read, a message is printed and the empty string is returned
def get_page(url: String) : String = {
  try {
    val source = Source.fromURL(url)
    // always release the underlying stream, even if reading fails
    try { source.take(10000).mkString }
    finally { source.close() }
  }
  catch {
    // only ordinary exceptions are turned into an empty page; fatal
    // JVM errors (e.g. OutOfMemoryError) are allowed to propagate
    case _: Exception => {
      println(" Problem with: " + url)
      ""
    }
  }
}

// a page that cannot be fetched -> returns the empty string
get_page("""http://www.foobar.com""")


// starting URL for the crawler
val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/"""

// regular expression for the URLs embedded in a page:
//   starts with an "
//   then either http or https
//   then ://
//   then any character that is not "
//   finally "
// This value used to be defined twice; the longer, equivalent spelling was
//   \"((?:http|https)://(?:[^\"])*)\"
// Only the shorter one is kept, because a name can be defined only once
// when the file is compiled as a whole.
val http_pattern: Regex = """\"(https?://[^\"]*)\"""".r

// removes the first and the last character of s
// (used to strip the quotes around a matched URL)
def unquote(s: String) : String = s.drop(1).dropRight(1)

// returns the set of all http(s) URLs that occur between
// double quotes in page
def get_all_URLs(page: String) : Set[String] = {
  http_pattern.findAllIn(page).map(unquote(_)).toSet
}

// get all urls in startURL
get_all_URLs(get_page(startURL))

// number of all urls in startURL
get_all_URLs(get_page(startURL)).size


// naive version - searches until a given depth
// visits pages potentially more than once
//
// url: page to visit
// n:   remaining depth; nothing happens once it has reached 0
def crawl(url: String, n: Int) : Unit = {
  if (n == 0) ()
  else {
    println("Visiting: " + n + " " + url)
    for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1)
  }
}

crawl(startURL, 2)


// breadth-first version without visiting
// pages twice
//
// todo:    URLs of the current level
// visited: URLs of all earlier levels; they are skipped
// n:       number of levels that are still processed
def bf_crawl(todo: Set[String], visited: Set[String], n: Int) : Unit = {
  if (n == 0) ()
  else {
    // the URLs found on the pages of this level form the next level
    val new_todo = todo.flatMap {
      url => {
        if (visited.contains(url)) Set[String]()
        else {
          println("Visiting: " + n + " " + url)
          get_all_URLs(get_page(url))
        }
      }
    }
    bf_crawl(new_todo, visited union todo, n - 1)
  }
}

// (was called with the undefined name startURL1)
bf_crawl(Set(startURL), Set(), 2)


// breadth-first version without visiting
// pages twice and only in "my" domain
val my_pattern: Regex = """urbanc""".r

// breadth first search avoiding double searches;
// URLs that do not contain a match for my_pattern are
// neither downloaded nor followed
def bf_crawl2(todo: Set[String], visited: Set[String], n: Int) : Unit = {
  if (n == 0) ()
  else {
    val new_todo = todo.flatMap {
      url => {
        if (visited.contains(url)) Set[String]()
        else if (my_pattern.findFirstIn(url).isEmpty) Set[String]()
        else {
          println("Visiting: " + n + " " + url)
          get_all_URLs(get_page(url))
        }
      }
    }
    bf_crawl2(new_todo, visited union todo, n - 1)
  }
}

bf_crawl2(Set(startURL), Set(), 5)

// email harvester
// from
// http://net.tutsplus.com/tutorials/other/8-regular-expressions-you-should-know/

val email_pattern: Regex = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r

// breadth-first crawl that additionally prints, one per line, every
// match of email_pattern found on each visited page
def bf_crawl3(todo: Set[String], visited: Set[String], n: Int) : Unit = {
  if (n == 0) ()
  else {
    val new_todo = todo.flatMap {
      url => {
        if (visited.contains(url)) Set[String]()
        else {
          println("Visiting: " + n + " " + url)
          // download the page once and use the same text for both the
          // e-mail search and the link extraction
          val page = get_page(url)
          println(email_pattern.findAllIn(page).mkString("\n"))
          get_all_URLs(page)
        }
      }
    }
    bf_crawl3(new_todo, visited union todo, n - 1)
  }
}

bf_crawl3(Set(startURL), Set(), 3)


// depth-first version does not work,
// because it might visit pages at depth 1
// while it still wants to visit them at
// depth 2
// (a page that was reached first with only a little depth left is
// marked as visited and is then not expanded again when it is reached
// later with more depth left)
//
// This function used to be a second definition of crawl and called the
// non-existing helpers getAllURLs / getURLpage; it now has its own name
// and uses get_all_URLs / get_page, so that crawl keeps referring to the
// naive version above.
var visited = Set("")

def df_crawl(url: String, n: Int) : Unit = {
  if (n == 0) ()
  else if (visited.contains(url)) () //println("Already visited: " + n + " " + url)
  else {
    println("Visiting: " + n + " " + url)
    visited += url
    for (u <- get_all_URLs(get_page(url))) df_crawl(u, n - 1)
  }
}