| author | Christian Urban <christian.urban@kcl.ac.uk> | 
| Tue, 06 Oct 2020 00:39:34 +0100 | |
| changeset 775 | 5d3f3a5f2354 | 
| parent 722 | 7c09b7eadc6b | 
| permissions | -rw-r--r-- | 
| 
242
 
35104ee14f87
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
112 
diff
changeset
 | 
1  | 
// A crawler which checks whether there are  | 
| 
 
35104ee14f87
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
112 
diff
changeset
 | 
2  | 
// dead links in web-pages  | 
| 
101
 
4758a6155878
links
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
99 
diff
changeset
 | 
3  | 
|
| 1 | 4  | 
import io.Source  | 
5  | 
import scala.util.matching.Regex  | 
|
| 
96
 
9fcd3de53c06
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
95 
diff
changeset
 | 
6  | 
import scala.util._  | 
| 1 | 7  | 
|
| 
112
 
95ee5cc5c05d
added
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
101 
diff
changeset
 | 
8  | 
// gets the first 10,000 characters of a web-page  | 
| 
99
 
91145f6d9b0e
added
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
96 
diff
changeset
 | 
9  | 
/** Fetches at most the first 10000 characters of the page at `url`.
 *
 *  The page is decoded as ISO-8859-1. If anything non-fatal goes wrong
 *  (malformed URL, unreachable host, I/O error), a diagnostic line is
 *  printed and the empty string is returned instead of throwing.
 *
 *  @param url address of the page to download
 *  @return the first 10000 characters of the page, or "" on failure
 */
def get_page(url: String) : String = {
  Try {
    val src = Source.fromURL(url)("ISO-8859-1")
    // close the underlying stream even when reading fails half-way;
    // otherwise every visited page leaks an open connection
    try src.take(10000).mkString
    finally src.close()
  }.getOrElse { println(s"  Problem with: $url"); "" }
}
| 722 | 13  | 
|
14  | 
// e.g. get_page("https://nms.kcl.ac.uk/christiana.urban/")
 | 
|
15  | 
||
| 1 | 16  | 
// regex for URLs  | 
| 
427
 
546f2090ce12
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
421 
diff
changeset
 | 
17  | 
// matches a double-quoted http:// or https:// URL; the enclosing
// quotes are part of the match
val http_pattern = """"https?://[^"]*"""".r /*@\label{httpline}@*/ 
 | 
| 1 | 18  | 
|
| 722 | 19  | 
// drops the first and last characters from a string  | 
| 1 | 20  | 
// strips one character from each end of s (e.g. the enclosing quotes);
// strings shorter than two characters become empty
def unquote(s: String) : String = s.slice(1, s.length - 1)
21  | 
||
| 
254
 
dcd4688690ce
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
242 
diff
changeset
 | 
22  | 
// collects every quoted http(s) URL occurring in page, with the
// quotes removed and duplicates collapsed
def get_all_URLs(page: String) : Set[String] = {
  val quoted_links = http_pattern.findAllIn(page)
  (for (link <- quoted_links) yield unquote(link)).toSet /*@\label{findallline}@*/
}
 | 
| 
254
 
dcd4688690ce
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
242 
diff
changeset
 | 
24  | 
|
| 722 | 25  | 
// a very naive version of crawl - searches until a given  | 
26  | 
// depth, visits pages potentially more than once  | 
|
| 1 | 27  | 
/** Visits `url` and, recursively, every URL found on it, down to depth `n`.
 *
 *  Deliberately naive: nothing remembers which pages were already seen,
 *  so a page linked from several places is fetched several times.
 *
 *  @param url page to visit
 *  @param n   remaining depth; nothing happens once it is zero or negative
 */
def crawl(url: String, n: Int) : Unit = {
  // `n > 0` rather than `n != 0`: a negative depth would otherwise
  // count down forever and never stop the crawl
  if (n > 0) {
    println(s"Visiting: $n $url")
    get_all_URLs(get_page(url)).foreach(crawl(_, n - 1))
  }
}
|
34  | 
||
| 
242
 
35104ee14f87
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
112 
diff
changeset
 | 
35  | 
// some starting URLs for the crawler  | 
| 722 | 36  | 
|
| 550 | 37  | 
// page handed to crawl as the starting point
val startURL = """https://nms.kcl.ac.uk/christian.urban/"""  | 
38  | 
//val startURL = """https://nms.kcl.ac.uk/luc.moreau/"""  | 
|
| 7 | 39  | 
|
| 561 | 40  | 
// run the crawler from startURL with a depth of 2
crawl(startURL, 2)  | 
| 
432
 
55be90b2a642
added pictures
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
428 
diff
changeset
 | 
41  |