Line-by-line diff of the crawler script. Only the changed hunks and some surrounding context are shown; removed lines are marked with -, added lines with +, and unchanged lines are unmarked.

import io.Source
import scala.util.matching.Regex
import scala.util._

- // gets the first ~10K of a web-page
+ // gets the first 10K of a web-page
def get_page(url: String) : String = {
  Try(Source.fromURL(url).take(10000).mkString) getOrElse
    { println(s"  Problem with: $url"); ""}
}
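Because the fetch is wrapped in Try ... getOrElse, an unreachable or malformed URL does not raise an exception: get_page prints a warning and returns the empty string instead. An illustrative call, reusing the start URL defined further down:

  // first 10K characters of the page, or "" if the fetch fails
  val page = get_page("http://www.inf.kcl.ac.uk/staff/urbanc/")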

// drops the first and last character from a string
def unquote(s: String) = s.drop(1).dropRight(1)

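The definition of http_pattern, used by get_all_URLs below, is unchanged and therefore not shown in this diff. Since unquote strips one character from each end of a match, the pattern presumably matches URLs together with their surrounding double quotes; a sketch of such a definition (an assumption, not a line taken from the file):

  // assumption: matches a double-quoted http(s) URL as it appears in an HTML attribute
  val http_pattern : Regex = """"https?://[^"]*"""".r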
def get_all_URLs(page: String) : Set[String] = {
-   (http_pattern.findAllIn(page)).map { unquote(_) }.toSet
+   http_pattern.findAllIn(page).map(unquote).toSet
}
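The change to this line is purely stylistic: the redundant parentheses around findAllIn are dropped and unquote is passed directly instead of being wrapped in an anonymous function, so both versions map unquote over the same matches. A hypothetical check, valid only under the http_pattern sketched above:

  // extracts the quoted URL from the sample markup and strips the quotes
  val sample = """<a href="http://example.com/x">link</a>"""
  assert(get_all_URLs(sample) == Set("http://example.com/x"))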

// naive version - searches until a given depth
// visits pages potentially more than once
def crawl(url: String, n: Int) : Unit = {
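The body of crawl is unchanged and omitted from this diff. A minimal sketch that fits the comment above, recursing until the given depth while keeping no record of visited pages (crawl_sketch is a hypothetical reconstruction, not the code from the file):

  // assumption: stop at depth 0; otherwise fetch the page and recurse on every URL found,
  // so the same page may be visited more than once
  def crawl_sketch(url: String, n: Int) : Unit = {
    if (n == 0) ()
    else {
      println(s"Visiting: $n $url")
      for (u <- get_all_URLs(get_page(url))) crawl_sketch(u, n - 1)
    }
  }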

// starting URL for the crawler
val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/"""
//val startURL = """http://www.inf.kcl.ac.uk/staff/mml/"""

-
crawl(startURL, 2)