import scala.io.Source
import scala.util.Try

// gets the first 10K of a web-page
def get_page(url: String) : String = {
  Try(Source.fromURL(url)("ISO-8859-1").take(10000).mkString).
    getOrElse { println(s"  Problem with: $url"); "" }
}

// e.g. get_page("https://nms.kcl.ac.uk/christian.urban/")

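// note: the Try in get_page also covers failures - for an
// unreachable address Source.fromURL throws an exception, which
// getOrElse turns into the printed warning and the empty string
// (the host below is made up for illustration)
//
// e.g. get_page("https://no.such.host.invalid/")
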
// regex for URLs
val http_pattern = """"https?://[^"]*"""".r /*@\label{httpline}@*/

// drops the first and last characters from a string
def unquote(s: String) = s.drop(1).dropRight(1)

// collects all (distinct) URLs that appear on a page
def get_all_URLs(page: String) : Set[String] =
  http_pattern.findAllIn(page).map(unquote).toSet /*@\label{findallline}@*/
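
// a small, made-up snippet to show the two steps above: the regex
// matches a URL *including* the surrounding double quotes, which
// unquote then strips off again
val snippet = """<a href="https://example.com/page">link</a>"""
http_pattern.findAllIn(snippet).toList  // List("https://example.com/page") - quotes kept
get_all_URLs(snippet)                   // Set(https://example.com/page)    - quotes stripped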

// a very naive version of crawl - searches until a given
// depth, visits pages potentially more than once
def crawl(url: String, n: Int) : Unit = {
  if (n == 0) ()
  else {
    println(s"Visiting: $n $url")
    for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1)
  }
}
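
// a possible refinement (a sketch, not part of the code above):
// thread a set of already-visited URLs through the recursion, so
// that no page is fetched twice; crawl2 and its visited parameter
// are assumptions introduced here
def crawl2(url: String, n: Int, visited: Set[String] = Set()) : Set[String] = {
  if (n == 0 || visited.contains(url)) visited
  else {
    println(s"Visiting: $n $url")
    get_all_URLs(get_page(url)).foldLeft(visited + url) {
      (seen, u) => crawl2(u, n - 1, seen)
    }
  }
}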

// some starting URLs for the crawler

val startURL = """https://nms.kcl.ac.uk/christian.urban/"""
//val startURL = """https://nms.kcl.ac.uk/luc.moreau/"""

// with depth 2 this prints the start page ("Visiting: 2 ...") and
// then each URL found on it ("Visiting: 1 ..."); at n == 0 the
// recursion stops
crawl(startURL, 2)