| author | Christian Urban <christian.urban@kcl.ac.uk> | 
| Sun, 29 Sep 2024 18:46:02 +0100 | |
| changeset 964 | d3e22099963d | 
| parent 722 | 7c09b7eadc6b | 
| permissions | -rw-r--r-- | 
| 
112
 
95ee5cc5c05d
added
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
101 
diff
changeset
 | 
1  | 
// This version of the crawler also
| 
 
95ee5cc5c05d
added
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
101 
diff
changeset
 | 
2  | 
// "harvests" email addresses from webpages  | 
| 
100
 
cbc2270c2938
updated progs
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
99 
diff
changeset
 | 
3  | 
|
| 7 | 4  | 
import io.Source  | 
5  | 
import scala.util.matching.Regex  | 
|
| 
96
 
9fcd3de53c06
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
95 
diff
changeset
 | 
6  | 
import scala.util._  | 
| 7 | 7  | 
|
| 
99
 
91145f6d9b0e
added
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
96 
diff
changeset
 | 
8  | 
/** Downloads the page at `url` and returns at most its first 10000
  * characters, decoded as ISO-8859-1.
  *
  * Any non-fatal failure (malformed URL, unreachable host, read error, ...)
  * is reported on standard output and results in the empty string, so a
  * single bad link does not abort the caller.
  *
  * @param url address of the page to fetch
  * @return the (truncated) page content, or "" if the page could not be read
  */
def get_page(url: String) : String = {
  Try {
    val src = Source.fromURL(url)("ISO-8859-1")
    // close the underlying stream even when reading fails part-way;
    // without this every visited page leaks an open connection
    try src.take(10000).mkString
    finally src.close()
  }.getOrElse { println(s"  Problem with: $url"); "" }
}
| 7 | 12  | 
|
| 
112
 
95ee5cc5c05d
added
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
101 
diff
changeset
 | 
13  | 
// regexes for URLs and for email addresses
| 
242
 
35104ee14f87
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
112 
diff
changeset
 | 
14  | 
// matches an http or https URL together with the double quotes around it
val http_pattern = new Regex(""""https?://[^"]*"""")
| 
427
 
546f2090ce12
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
399 
diff
changeset
 | 
15  | 
// matches lower-case email addresses: local part, '@', domain, '.', and a
// final suffix of 2 to 6 letters or dots
val email_pattern = new Regex("""([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""") /*@\label{emailline}@*/
 | 
| 7 | 16  | 
|
17  | 
// strips the first and the last character of s (the surrounding quotes of a
// matched URL); strings shorter than two characters become ""
def unquote(s: String) = s.slice(1, s.length - 1)
|
18  | 
||
| 
254
 
dcd4688690ce
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
242 
diff
changeset
 | 
19  | 
// collects every quoted http(s) URL occurring in page, with the quotes
// removed; duplicates collapse because the result is a set
def get_all_URLs(page: String) : Set[String] = {
  val links = for (quoted <- http_pattern.findAllIn(page)) yield unquote(quoted)
  links.toSet
}
| 7 | 21  | 
|
| 
242
 
35104ee14f87
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
112 
diff
changeset
 | 
22  | 
// prints s on a line of its own, but stays silent for the empty string
def print_str(s: String) =
  if (s != "") println(s)
| 
 
35104ee14f87
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
112 
diff
changeset
 | 
24  | 
|
| 7 | 25  | 
/** Crawls the web starting at `url`, printing every email address found.
  *
  * For the given page it prints a " Visiting" line, fetches the page with
  * `get_page`, prints all matches of `email_pattern` (one per line, nothing
  * if there are none) and then recursively crawls every URL returned by
  * `get_all_URLs` in parallel, with the remaining depth reduced by one.
  *
  * @param url page to visit
  * @param n   remaining depth; nothing happens when it is zero or negative
  */
def crawl(url: String, n: Int) : Unit = {
  // test `n > 0` rather than `n != 0`: a negative depth would otherwise
  // never reach the base case and recurse without bound
  if (n > 0) {
    println(s" Visiting: $n $url")
    val page = get_page(url)
    print_str(email_pattern.findAllIn(page).mkString("\n")) /*@\label{mainline}@*/
    for (u <- get_all_URLs(page).par) crawl(u, n - 1)
  }
}
|
34  | 
||
| 
112
 
95ee5cc5c05d
added
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
101 
diff
changeset
 | 
35  | 
// starting URL for the crawler
| 550 | 36  | 
val startURL = """https://nms.kcl.ac.uk/christian.urban/"""

// run the crawler from the start page, following links up to depth 3
crawl(url = startURL, n = 3)