equal
  deleted
  inserted
  replaced
  
    
    
|         |      1 // This version of the crawler only | 
|         |      2 // checks links in the "domain" urbanc | 
|         |      3  | 
|      1 import io.Source |      4 import io.Source | 
|      2 import scala.util.matching.Regex |      5 import scala.util.matching.Regex | 
|      3 import scala.util._ |      6 import scala.util._ | 
|      4  |      7  | 
|      5 // gets the first ~10K of a page |      8 // gets the first ~10K of a page | 
|      6 def get_page(url: String) : String =  |      9 def get_page(url: String) : String = { | 
|      7   Try(Source.fromURL(url).take(10000).mkString) getOrElse  |     10   Try(Source.fromURL(url).take(10000).mkString) getOrElse  | 
|      8     { println(s"  Problem with: $url"); ""} |     11     { println(s"  Problem with: $url"); ""} | 
|         |     12 } | 
|      9  |     13  | 
|     10 // starting URL for the crawler |     14 // starting URL for the crawler | 
|     11 val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/""" |     15 val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/""" | 
|     12  |     16  | 
|     13 // regex for URLs |     17 // regex for URLs | 
|     22  |     26  | 
|     23 // naive version - searches until a given depth |     27 // naive version - searches until a given depth | 
|     24 // visits pages potentially more than once |     28 // visits pages potentially more than once | 
|     25 def crawl(url: String, n: Int) : Unit = { |     29 def crawl(url: String, n: Int) : Unit = { | 
|     26   if (n == 0) () |     30   if (n == 0) () | 
|     27   else if (my_urls.findFirstIn(url) == None) () |     31   //else if (my_urls.findFirstIn(url) == None) () | 
|     28   else { |     32   else { | 
|     29     println(s"Visiting: $n $url") |     33     println(s"Visiting: $n $url") | 
|     30     for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1) |     34     for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1) | 
|     31   } |     35   } | 
|     32 } |     36 } | 
|     33  |     37  | 
|     34 // can now deal with depth 3 |     38 // can now deal with depth 3 | 
|     35 // start on command line |     39 // start on command line | 
|     36 crawl(startURL, 4) |     40 crawl(startURL, 4) | 
|     37  |     41  | 
|         |     42 crawl("""http://www.inf.kcl.ac.uk/staff/urbanc/bsc-projects-13.html""", 2) |