| author | Christian Urban <urbanc@in.tum.de> | 
| Wed, 06 Nov 2019 23:17:05 +0000 | |
| changeset 683 | 9acbe46df3fd | 
| parent 330 | 0806e45d873c | 
| permissions | -rw-r--r-- | 
| 
254
 
dcd4688690ce
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
97 
diff
changeset
 | 
1  | 
// Regex for a double-quoted http:// or https:// URL.
// The surrounding double quotes are part of the match, so callers that
// want the bare URL have to strip the first and last character.
val http_pattern: scala.util.matching.Regex =
  """"https?://[^"]*"""".r
| 7 | 2  | 
// Regex for (lower-case) e-mail addresses, with three capture groups:
//   1. local part  - lower-case letters, digits, '_', '.' and '-'
//   2. domain      - digits, lower-case letters, '.' and '-'
//   3. final part  - 2 to 6 lower-case letters or dots
val email_pattern: scala.util.matching.Regex =
  """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r
 | 
|
4  | 
||
| 
254
 
dcd4688690ce
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
97 
diff
changeset
 | 
5  | 
// Prints `s` followed by a line break, unless `s` is the empty string,
// in which case nothing at all is printed (not even a blank line).
def print_str(s: String): Unit =
  if (s != "") println(s)
| 
 
dcd4688690ce
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
97 
diff
changeset
 | 
7  | 
|
| 7 | 8  | 
// Crawls the page at `url` and, recursively, the pages it links to.
//
// For every visited page this prints a "Visiting: <n> <url>" line, then
// prints all matches of `email_pattern` found in the page (one per line;
// nothing is printed when there are none), and finally crawls every URL
// returned by `get_all_URLs(page)` with the remaining depth `n - 1`.
//
// url - address of the page to fetch; handed to `get_page` unchanged
// n   - remaining crawl depth; nothing is done once it is zero or negative
//
// NOTE(review): `get_page` and `get_all_URLs` are defined outside this
// block; presumably `get_page` returns the page content as a String and
// `get_all_URLs` a collection of URL strings - confirm against their
// definitions.
def crawl(url: String, n: Int) : Unit = {
  // `n > 0` rather than `n != 0`: a negative depth would otherwise never
  // hit the base case and the recursion would have no depth limit.
  if (n > 0) {
    println(s"Visiting: $n $url")
    val page = get_page(url)
    print_str(email_pattern.findAllIn(page).mkString("\n"))
    // `.par` turns the links into a parallel collection before iterating,
    // so the recursive calls are not guaranteed to run in link order.
    get_all_URLs(page).par.foreach(u => crawl(u, n - 1))
  }
}