diff -r 10f02605a46a -r 35104ee14f87 progs/crawler3.scala --- a/progs/crawler3.scala Sun Sep 07 08:37:44 2014 +0100 +++ b/progs/crawler3.scala Sat Sep 13 04:30:25 2014 +0100 @@ -11,31 +11,30 @@ } // regexes for URLs, for "my" domain and for email addresses -val http_pattern = """\"https?://[^\"]*\"""".r +val http_pattern = """"https?://[^"]*"""".r val my_urls = """urbanc""".r val email_pattern = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r -// The regular expression for emails comes from: -// http://net.tutsplus.com/tutorials/other/8-regular-expressions-you-should-know/ - def unquote(s: String) = s.drop(1).dropRight(1) def get_all_URLs(page: String) : Set[String] = { http_pattern.findAllIn(page).map(unquote).toSet } +def print_str(s: String) = + if (s == "") () else println(s) + def crawl(url: String, n: Int) : Unit = { if (n == 0) () - //else if (my_urls.findFirstIn(url) == None) () else { println(s"Visiting: $n $url") val page = get_page(url) - println(email_pattern.findAllIn(page).mkString("\n")) - for (u <- get_all_URLs(page)) crawl(u, n - 1) + print_str(email_pattern.findAllIn(page).mkString("\n")) + for (u <- get_all_URLs(page).par) crawl(u, n - 1) } } // staring URL for the crawler -val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/""" +val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc""" crawl(startURL, 3)