| author | Christian Urban <christian.urban@kcl.ac.uk> | 
| Fri, 10 Oct 2025 10:18:05 +0100 | |
| changeset 1005 | 0ffb6e4de10a | 
| parent 722 | 14914b57e207 | 
| permissions | -rw-r--r-- | 
| 242 
35104ee14f87
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
112diff
changeset | 1 | // A crawler which checks whether there are | 
| 
35104ee14f87
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
112diff
changeset | 2 | // dead links in web-pages | 
| 101 
4758a6155878
links
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
99diff
changeset | 3 | |
| 1 | 4 | import io.Source | 
| 5 | import scala.util.matching.Regex | |
| 96 
9fcd3de53c06
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
95diff
changeset | 6 | import scala.util._ | 
| 1 | 7 | |
| 112 
95ee5cc5c05d
added
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
101diff
changeset | 8 | // gets the first 10K of a web-page | 
| 99 
91145f6d9b0e
added
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
96diff
// Fetches the beginning of a web-page as a string.
//
// Reads at most the first 10000 characters found at `url`, decoding
// the bytes as ISO-8859-1. If anything goes wrong (malformed URL,
// unreachable host, read error, ...) a short message is printed and
// the empty string is returned, so callers never see an exception.
def get_page(url: String) : String = {
  Try {
    val src = Source.fromURL(url)("ISO-8859-1")
    // close the underlying stream even if reading fails half-way;
    // otherwise every visited page would leak an open connection
    try src.take(10000).mkString
    finally src.close()
  }.getOrElse { println(s"  Problem with: $url"); "" }
}
| 722 | 13 | |
| 14 | // e.g. get_page("https://nms.kcl.ac.uk/christiana.urban/")
 | |
| 15 | ||
| 1 | 16 | // regex for URLs | 
| 427 
546f2090ce12
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
421diff
// matches a double-quoted http:// or https:// URL; the surrounding
// quotes are part of the match
val http_pattern = new Regex(""""https?://[^"]*"""") /*@\label{httpline}@*/
 | 
| 1 | 18 | |
| 722 | 19 | // drops the first and last characters from a string | 
// strips one character from each end of s (used to remove the
// enclosing quotes of a matched URL); strings shorter than two
// characters yield the empty string
def unquote(s: String) = s.slice(1, s.length - 1)
| 21 | ||
| 254 
dcd4688690ce
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
242diff
// collects every quoted http(s) URL occurring in page, with the
// quotes removed; duplicates collapse because the result is a set
def get_all_URLs(page: String) : Set[String] = {
  val urls = for (quoted <- http_pattern.findAllIn(page)) yield unquote(quoted) /*@\label{findallline}@*/
  urls.toSet
}
 | 
| 254 
dcd4688690ce
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
242diff
changeset | 24 | |
| 722 | 25 | // a very naive version of crawl - searches until a given | 
| 26 | // depth, visits pages potentially more than once | |
// Visits url and then, recursively, every URL found on that page,
// following links up to n levels deep. Each visit is announced on
// standard output. Nothing is remembered between visits, so a page
// reachable via several links is fetched several times.
def crawl(url: String, n: Int) : Unit = {
  // stop for any non-positive depth: testing only n == 0 would let a
  // negative n recurse without a depth bound
  if (n > 0) {
    println(s"Visiting: $n $url")
    for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1)
  }
}
| 34 | ||
| 242 
35104ee14f87
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
112diff
changeset | 35 | // some starting URLs for the crawler | 
| 722 | 36 | |
// page where the crawl begins (swap in the alternative to try another site)
val startURL = """https://nms.kcl.ac.uk/christian.urban/"""
//val startURL = """https://nms.kcl.ac.uk/luc.moreau/"""

// run the crawler: the start page plus the pages it links to
crawl(url = startURL, n = 2)
| 432 
55be90b2a642
added pictures
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
428diff
changeset | 41 |