equal
deleted
inserted
replaced
1 import io.Source |
1 import io.Source |
2 import scala.util.matching.Regex |
2 import scala.util.matching.Regex |
|
3 import scala.util._ |
3 |
4 |
4 // gets the first ~10K of a page |
5 // gets the first ~10K of a page |
5 def get_page(url: String) : String = { |
6 def get_page(url: String) : String = |
6 try { |
7 Try(Source.fromURL(url).take(10000).mkString) getOrElse |
7 Source.fromURL(url).take(10000).mkString |
8 { println(s" Problem with: $url"); ""} |
8 } |
|
9 catch { |
|
10 case _ : Throwable => { |
|
11 println(s" Problem with: $url") |
|
12 "" |
|
13 } |
|
14 } |
|
15 } |
|
16 |
9 |
17 // starting URL for the crawler |
10 // starting URL for the crawler |
18 val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/""" |
11 val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/""" |
19 |
12 |
20 // regex for URLs |
13 // regex for URLs |
25 // http://net.tutsplus.com/tutorials/other/8-regular-expressions-you-should-know/ |
18 // http://net.tutsplus.com/tutorials/other/8-regular-expressions-you-should-know/ |
26 |
19 |
27 def unquote(s: String) = s.drop(1).dropRight(1) |
20 def unquote(s: String) = s.drop(1).dropRight(1) |
28 |
21 |
29 def get_all_URLs(page: String) : Set[String] = { |
22 def get_all_URLs(page: String) : Set[String] = { |
30 (http_pattern.findAllIn(page)).map { unquote(_) }.toSet |
23 http_pattern.findAllIn(page).map(unquote).toSet |
31 } |
24 } |
32 |
25 |
33 // naive version - searches until a given depth |
26 // naive version - searches until a given depth |
34 // visits pages potentially more than once |
27 // visits pages potentially more than once |
35 def crawl(url: String, n: Int) : Unit = { |
28 def crawl(url: String, n: Int) : Unit = { |