@@ -1,13 +1,17 @@
+// This version of the crawler only
+// checks links in the "domain" urbanc
+
 import io.Source
 import scala.util.matching.Regex
 import scala.util._
 
 // gets the first ~10K of a page
-def get_page(url: String) : String =
+def get_page(url: String) : String = {
   Try(Source.fromURL(url).take(10000).mkString) getOrElse
     { println(s" Problem with: $url"); ""}
+}
 
 // starting URL for the crawler
 val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/"""
 
 // regex for URLs
@@ -22,16 +26,17 @@
 
 // naive version - searches until a given depth
 // visits pages potentially more than once
 def crawl(url: String, n: Int) : Unit = {
   if (n == 0) ()
-  else if (my_urls.findFirstIn(url) == None) ()
+  //else if (my_urls.findFirstIn(url) == None) ()
   else {
     println(s"Visiting: $n $url")
     for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1)
   }
 }
 
 // can now deal with depth 3
 // start on command line
 crawl(startURL, 4)
 
+crawl("""http://www.inf.kcl.ac.uk/staff/urbanc/bsc-projects-13.html""", 2)
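The hunks above elide the "regex for URLs" section (old lines 14-21, new lines 18-25), which defines my_urls and get_all_URLs, both of which crawl depends on. A minimal sketch of what those definitions could look like, assuming a pattern that matches double-quoted http(s) links on a page; the names http_pattern and unquote are illustrative assumptions, not taken from the diff:

// assumed regex for URLs: matches a double-quoted http(s) URL, quotes included
val http_pattern = """\"https?://[^\"]*\"""".r

// assumed "domain" filter regex used by the (now commented-out) check in crawl
val my_urls = """urbanc""".r

// hypothetical helper: strips the surrounding double quotes from a match
def unquote(s: String) : String = s.drop(1).dropRight(1)

// collects all quoted URLs occurring in a page
def get_all_URLs(page: String) : Set[String] =
  http_pattern.findAllIn(page).map(unquote).toSet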
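The comments note that this naive crawl may visit the same page more than once. A hypothetical de-duplicating variant, not in the source, that threads a set of visited URLs through the recursion (assuming the same get_page and get_all_URLs helpers):

// sketch: remembers visited URLs so each page is fetched at most once;
// returns the set of pages visited
def crawl2(url: String, n: Int, visited: Set[String] = Set()) : Set[String] = {
  if (n == 0 || visited.contains(url)) visited
  else {
    println(s"Visiting: $n $url")
    get_all_URLs(get_page(url)).foldLeft(visited + url)(
      (vs, u) => crawl2(u, n - 1, vs))
  }
}

// same entry point as the original call above
crawl2(startURL, 4)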