Diff between the old and the new version of the crawler script. Lines marked '-' were deleted, lines marked '+' were inserted, unmarked lines are equal in both versions; a replaced line shows up as a '-'/'+' pair.
 import io.Source
 import scala.util.matching.Regex
+import scala.util._

 // gets the first ~10K of a page
-def get_page(url: String) : String = {
-  try {
-    Source.fromURL(url).take(10000).mkString
-  }
-  catch {
-    case _ : Throwable => {
-      println(s" Problem with: $url")
-      ""
-    }
-  }
-}
+def get_page(url: String) : String =
+  Try(Source.fromURL(url).take(10000).mkString) getOrElse
+    { println(s" Problem with: $url"); "" }

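The rewrite above swaps the hand-written try/catch for scala.util.Try: Try captures any non-fatal exception thrown while evaluating its argument, and getOrElse supplies the fallback value that the deleted catch-branch used to produce. A minimal, self-contained illustration of that pattern (the toInt calls are placeholders chosen for the example, not part of the crawler):

import scala.util._

// Success: the wrapped computation runs normally and its result is returned
val ok  = Try("42".toInt) getOrElse 0    // evaluates to 42

// Failure: the NumberFormatException is caught by Try and the default is used
val bad = Try("foo".toInt) getOrElse 0   // evaluates to 0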
 // starting URL for the crawler
 val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/"""

 // regex for URLs
 val my_urls = """urbanc""".r
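One definition from the original listing is not visible in this excerpt: http_pattern, which get_all_URLs below relies on. The following is a reconstruction based on the surrounding code, not a line taken from the excerpt; since unquote strips one character from each end, the pattern presumably matches a URL together with its surrounding double quotes:

// ASSUMPTION: not part of the excerpt above, reconstructed for completeness.
// Matches a double-quoted http/https link, e.g. "http://www.inf.kcl.ac.uk/";
// unquote below strips the surrounding quotes again.
val http_pattern = """"https?://[^"]*"""".r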
 def unquote(s: String) = s.drop(1).dropRight(1)

 def get_all_URLs(page: String) : Set[String] = {
-  (http_pattern.findAllIn(page)).map { unquote(_) }.toSet
+  http_pattern.findAllIn(page).map(unquote).toSet
 }

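Together, get_page and get_all_URLs form one step of the crawler: fetch a page and collect the quoted links on it. A hypothetical one-line usage (the result depends on whatever the live page at startURL contains):

// download the start page and extract the set of quoted URLs found in it
val first_links : Set[String] = get_all_URLs(get_page(startURL))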
 // naive version - searches until a given depth
 // visits pages potentially more than once
 def crawl(url: String, n: Int) : Unit = {