| author | Christian Urban <urbanc@in.tum.de> | 
| Thu, 21 Nov 2019 14:41:53 +0000 | |
| changeset 697 | f5655aa04cac | 
| parent 561 | cf3e57e6fec7 | 
| child 722 | 7c09b7eadc6b | 
| permissions | -rw-r--r-- | 
| 242 
35104ee14f87
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
112diff
changeset | 1 | // A crawler which checks whether there are | 
| 
35104ee14f87
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
112diff
changeset | 2 | // dead links in web-pages | 
| 101 
4758a6155878
links
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
99diff
changeset | 3 | |
| 1 | 4 | import io.Source | 
| 5 | import scala.util.matching.Regex | |
| 96 
9fcd3de53c06
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
95diff
changeset | 6 | import scala.util._ | 
| 1 | 7 | |
| 112 
95ee5cc5c05d
added
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
101diff
changeset | 8 | // gets the first 10K of a web-page | 
| 99 
91145f6d9b0e
added
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
96diff
/** Downloads the beginning of a web page.
  *
  * Reads at most the first 10,000 characters of the resource at `url`,
  * decoding it as ISO-8859-1.
  *
  * @param url address of the page to fetch
  * @return the text that was read, or the empty string if the page could
  *         not be fetched (a line naming the URL is printed in that case)
  */
def get_page(url: String) : String = {
  Try {
    val src = Source.fromURL(url)("ISO-8859-1")
    // always release the underlying stream, even when reading fails
    // or stops before the end of the page
    try src.take(10000).mkString
    finally src.close()
  }.getOrElse { println(s"  Problem with: $url"); "" }
}
| 561 | 13 | get_page("https://nms.kcl.ac.uk/christiana.urban/")
 | 
| 1 | 14 | // regex for URLs | 
| 427 
546f2090ce12
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
421diff
changeset | 15 | val http_pattern = """"https?://[^"]*"""".r /*@\label{httpline}@*/ 
 | 
| 1 | 16 | |
| 101 
4758a6155878
links
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
99diff
/** Drops the first and the last character from a string.
  *
  * Strings with fewer than two characters yield the empty string.
  *
  * @param s the string to trim
  * @return `s` without its first and last character
  */
def unquote(s: String) : String = s.drop(1).dropRight(1)
| 19 | ||
| 254 
dcd4688690ce
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
242diff
// collects the distinct matches of http_pattern in a page,
// each with its first and last character removed by unquote
def get_all_URLs(page: String) : Set[String] = {
  val quoted_links = http_pattern.findAllIn(page)
  (for (link <- quoted_links) yield unquote(link)).toSet /*@\label{findallline}@*/
}
| 254 
dcd4688690ce
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
242diff
changeset | 22 | |
| 242 
35104ee14f87
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
112diff
changeset | 23 | // naive version of crawl - searches until a given depth, | 
| 1 | 24 | // visits pages potentially more than once | 
/** Naive depth-limited crawl.
  *
  * Prints the page being visited and then recursively crawls every URL
  * found on it with the depth reduced by one. Pages may be visited more
  * than once.
  *
  * @param url page to start from
  * @param n   remaining depth; nothing is visited once it is zero or negative
  */
def crawl(url: String, n: Int) : Unit = {
  // `<=` rather than `==`: a negative depth must stop the recursion too,
  // otherwise n - 1 never reaches the base case
  if (n <= 0) ()
  else {
    println(s"Visiting: $n $url")
    for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1)
  }
}
| 32 | ||
| 242 
35104ee14f87
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
112diff
changeset | 33 | // some starting URLs for the crawler | 
| 550 | 34 | val startURL = """https://nms.kcl.ac.uk/christian.urban/""" | 
| 35 | //val startURL = """https://nms.kcl.ac.uk/luc.moreau/""" | |
| 7 | 36 | |
| 561 | 37 | crawl(startURL, 2) | 
| 432 
55be90b2a642
added pictures
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
428diff
changeset | 38 |