| author | Christian Urban <christian.urban@kcl.ac.uk> | 
| Fri, 26 Sep 2025 19:09:50 +0100 | |
| changeset 989 | 84401da2e277 | 
| parent 722 | 7c09b7eadc6b | 
| permissions | -rw-r--r-- | 
| 
100
 
cbc2270c2938
updated progs
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
96 
diff
changeset
 | 
1  | 
// This version of the crawler only  | 
| 
 
cbc2270c2938
updated progs
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
96 
diff
changeset
 | 
2  | 
// checks links in the "domain" urban  | 
| 
 
cbc2270c2938
updated progs
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
96 
diff
changeset
 | 
3  | 
|
| 1 | 4  | 
import io.Source  | 
5  | 
import scala.util.matching.Regex  | 
|
| 
96
 
9fcd3de53c06
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
95 
diff
changeset
 | 
6  | 
import scala.util._  | 
| 1 | 7  | 
|
| 
112
 
95ee5cc5c05d
added
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
101 
diff
changeset
 | 
8  | 
// gets the first 10K of a web-page  | 
| 
100
 
cbc2270c2938
updated progs
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
96 
diff
changeset
 | 
9  | 
// Downloads the page at the given URL, decoded as ISO-8859-1, and
// returns at most its first 10000 characters. If the URL cannot be
// opened or read (any non-fatal exception), a message is printed and
// the empty string is returned instead.
def get_page(url: String) : String = {
  Try {
    val page = Source.fromURL(url)("ISO-8859-1")
    // always release the underlying stream, even if reading fails;
    // otherwise every visited page leaks an open connection
    try page.take(10000).mkString finally page.close()
  }.getOrElse { println(s"  Problem with: $url"); "" }
}
| 1 | 13  | 
|
| 
112
 
95ee5cc5c05d
added
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
101 
diff
changeset
 | 
14  | 
// regexes for URLs and "my" domain  | 
| 
242
 
35104ee14f87
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
116 
diff
changeset
 | 
15  | 
// matches a double-quoted http- or https-URL (the quotes are part of the match)
val http_pattern = new Regex(""""https?://[^"]*"""")
// a URL counts as belonging to "my" domain if this pattern occurs in it
val my_urls = new Regex("""urban""")       /*@\label{myurlline}@*/
//val my_urls = """kcl.ac.uk""".r
|
| 1 | 18  | 
|
19  | 
// removes the first and the last character of a string; applied to the
// matches of http_pattern this strips the surrounding quotation marks
def unquote(s: String) = s.slice(1, s.length - 1)
|
20  | 
||
| 
254
 
dcd4688690ce
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
242 
diff
changeset
 | 
21  | 
// collects the distinct URLs matched by http_pattern in a page,
// with the surrounding quotation marks removed
def get_all_URLs(page: String) : Set[String] = {
  val quoted_urls = http_pattern.findAllIn(page)
  (for (q <- quoted_urls) yield unquote(q)).toSet
}
| 1 | 23  | 
|
24  | 
// Visits the page at url and follows its links up to depth n.
//  - if n is zero or negative nothing happens (the test is <= rather
//    than ==, so that a negative depth cannot recurse without bound)
//  - a URL not matching my_urls is only fetched, so that get_page can
//    report a problem with it; its links are not followed
//  - otherwise all URLs found on the page are crawled in parallel
//    with depth n - 1
def crawl(url: String, n: Int) : Unit = {
  if (n <= 0) ()                   /*@\label{changestartline}@*/
  else if (my_urls.findFirstIn(url).isEmpty) { 
    println(s"Visiting: $n $url")
    get_page(url); ()
  }                                /*@\label{changeendline}@*/
  else {
    println(s"Visiting: $n $url")
    for (u <- get_all_URLs(get_page(url)).par) crawl(u, n - 1)
  }
}
|
35  | 
||
| 
242
 
35104ee14f87
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
116 
diff
changeset
 | 
36  | 
// starting URL for the crawler  | 
| 550 | 37  | 
val startURL = "https://nms.kcl.ac.uk/christian.urban/"
//val startURL = """https://nms.kcl.ac.uk/christian.urban/bsc-projects-17.html"""


// can now deal with depth 3 and beyond
crawl(url = startURL, n = 3)
| 1 | 43  | 
|
| 
112
 
95ee5cc5c05d
added
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
101 
diff
changeset
 | 
44  |