equal
deleted
inserted
replaced
5 def get_page(url: String) : String = { |
5 def get_page(url: String) : String = { |
6 try { |
6 try { |
7 Source.fromURL(url).take(10000).mkString |
7 Source.fromURL(url).take(10000).mkString |
8 } |
8 } |
9 catch { |
9 catch { |
10 case e => { |
10 case _ : Throwable => { |
11 println(" Problem with: " + url) |
11 println(s" Problem with: $url") |
12 "" |
12 "" |
13 } |
13 } |
14 } |
14 } |
15 } |
15 } |
16 |
16 |
27 // naive version - seraches until a given depth |
27 // naive version - seraches until a given depth |
28 // visits pages potentially more than once |
28 // visits pages potentially more than once |
29 def crawl(url: String, n: Int) : Unit = { |
29 def crawl(url: String, n: Int) : Unit = { |
30 if (n == 0) () |
30 if (n == 0) () |
31 else { |
31 else { |
32 println("Visiting: " + n + " " + url) |
32 println(s"Visiting: $n $url") |
33 for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1) |
33 for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1) |
34 } |
34 } |
35 } |
35 } |
36 |
36 |
37 // staring URL for the crawler |
37 // staring URL for the crawler |
40 |
40 |
41 |
41 |
42 // call on the command line |
42 // call on the command line |
43 crawl(startURL, 2) |
43 crawl(startURL, 2) |
44 |
44 |
45 crawl("""http://www.dcs.kcl.ac.uk/staff/urbanc/msc-projects-12.html""", 2) |
45 crawl("""http://www.inf.kcl.ac.uk/staff/urbanc/bsc-projects-13.html""", 2) |