# HG changeset patch
# User Christian Urban
# Date 1380188184 -3600
# Node ID cbc2270c2938232805f05aa77cd0bdccc8ce4eca
# Parent  91145f6d9b0ebae40f5d41c402f41cf94ecb92dc
updated progs

diff -r 91145f6d9b0e -r cbc2270c2938 progs/crawler2.scala
--- a/progs/crawler2.scala	Wed Sep 25 20:35:54 2013 +0100
+++ b/progs/crawler2.scala	Thu Sep 26 10:36:24 2013 +0100
@@ -1,11 +1,15 @@
+// This version of the crawler only
+// checks links in the "domain" urbanc
+
 import io.Source
 import scala.util.matching.Regex
 import scala.util._
 
 // gets the first ~10K of a page
-def get_page(url: String) : String = 
+def get_page(url: String) : String = {
   Try(Source.fromURL(url).take(10000).mkString) getOrElse 
     { println(s"  Problem with: $url"); ""}
+}
 
 // staring URL for the crawler
 val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/"""
@@ -24,7 +28,7 @@
 // visits pages potentially more than once
 def crawl(url: String, n: Int) : Unit = {
   if (n == 0) ()
-  else if (my_urls.findFirstIn(url) == None) ()
+  //else if (my_urls.findFirstIn(url) == None) ()
   else {
     println(s"Visiting: $n $url")
     for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1)
@@ -35,3 +39,4 @@
 
 // start on command line
 crawl(startURL, 4)
+crawl("""http://www.inf.kcl.ac.uk/staff/urbanc/bsc-projects-13.html""", 2)
diff -r 91145f6d9b0e -r cbc2270c2938 progs/crawler3.scala
--- a/progs/crawler3.scala	Wed Sep 25 20:35:54 2013 +0100
+++ b/progs/crawler3.scala	Thu Sep 26 10:36:24 2013 +0100
@@ -1,3 +1,6 @@
+// This version of the crawler also
+// harvests emails from webpages
+
 import io.Source
 import scala.util.matching.Regex
 import scala.util._
@@ -16,7 +19,8 @@
 val my_urls = """urbanc""".r
 val email_pattern = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r
 
-// http://net.tutsplus.com/tutorials/other/8-regular-expressions-you-should-know/
+// The regular expression for emails comes from:
+// http://net.tutsplus.com/tutorials/other/8-regular-expressions-you-should-know/
 
 def unquote(s: String) = s.drop(1).dropRight(1)
 
@@ -37,7 +41,4 @@
   }
 }
 
-// can now deal with depth 3
-// start on command line
 crawl(startURL, 3)
-
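
The crawler3.scala hunk above says this version "also harvests emails from webpages", but the changeset itself only shows the email regular expression. Below is a minimal sketch of how that regex could be applied to a fetched page; the helper name get_emails and the example URL are assumptions and do not appear in the patch.

// Sketch only, not part of the changeset: harvesting email addresses
// with the email_pattern defined in crawler3.scala.
import io.Source
import scala.util._

val email_pattern = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r

// gets the first ~10K of a page (same approach as in the patch)
def get_page(url: String) : String = {
  Try(Source.fromURL(url).take(10000).mkString) getOrElse 
    { println(s"  Problem with: $url"); "" }
}

// collects every email address matched in a page
// (get_emails is a hypothetical helper, not shown in the changeset)
def get_emails(page: String) : Set[String] = 
  email_pattern.findAllIn(page).toSet

// example use (assumed URL):
// get_emails(get_page("http://www.inf.kcl.ac.uk/staff/urbanc/")) foreach println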