| author | Christian Urban <urbanc@in.tum.de> | 
| Tue, 25 Apr 2017 12:33:16 +0100 | |
| changeset 486 | 3cc1799daf08 | 
| parent 432 | 55be90b2a642 | 
| child 550 | a62357075346 | 
| permissions | -rw-r--r-- | 
| 
100
 
cbc2270c2938
updated progs
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
96 
diff
changeset
 | 
1  | 
// This version of the crawler only  | 
| 
 
cbc2270c2938
updated progs
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
96 
diff
changeset
 | 
2  | 
// checks links in the "domain" urbanc  | 
| 
 
cbc2270c2938
updated progs
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
96 
diff
changeset
 | 
3  | 
|
| 1 | 4  | 
import io.Source  | 
5  | 
import scala.util.matching.Regex  | 
|
| 
96
 
9fcd3de53c06
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
95 
diff
changeset
 | 
6  | 
import scala.util._  | 
| 1 | 7  | 
|
| 
112
 
95ee5cc5c05d
added
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
101 
diff
changeset
 | 
8  | 
// gets the first 10K of a web-page  | 
| 
100
 
cbc2270c2938
updated progs
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
96 
diff
changeset
 | 
9  | 
// Fetches at most the first 10000 characters of the page at `url`,
// decoded as ISO-8859-1.
//
// Any non-fatal failure (malformed URL, unreachable host, read error)
// is reported on stdout and results in the empty string.
def get_page(url: String) : String = {
  Try {
    val src = Source.fromURL(url)("ISO-8859-1")
    // close the underlying stream even when reading fails, so that a
    // crawl over many pages does not leak open connections
    try src.take(10000).mkString finally src.close()
  }.getOrElse { println(s"  Problem with: $url"); "" }
}
| 1 | 13  | 
|
| 
112
 
95ee5cc5c05d
added
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
101 
diff
changeset
 | 
14  | 
// regexes for URLs and "my" domain  | 
| 
242
 
35104ee14f87
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
116 
diff
changeset
 | 
15  | 
// matches a double-quoted http or https URL; the quotes are part of the match
val http_pattern = new Regex("\"https?://[^\"]*\"")
| 
427
 
546f2090ce12
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
421 
diff
changeset
 | 
16  | 
val my_urls = new Regex("urbanc")  /*@\label{myurlline}@*/
 | 
| 1 | 17  | 
|
18  | 
// strips the first and the last character of s (used to remove the
// quotes around a URL matched by http_pattern); strings shorter than
// two characters yield the empty string
def unquote(s: String) = s.slice(1, s.length - 1)
|
19  | 
||
| 
254
 
dcd4688690ce
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
242 
diff
changeset
 | 
20  | 
// collects every quoted http(s) URL occurring in page, with the quotes
// removed; duplicates collapse because the result is a set
def get_all_URLs(page: String) : Set[String] = {
  val urls = for (m <- http_pattern.findAllIn(page)) yield unquote(m)
  urls.toSet
}
| 1 | 22  | 
|
23  | 
// Visits url and, while depth n allows it, the pages it links to.
//
//  - n <= 0: nothing is visited (a negative depth is treated like 0,
//    otherwise it would never reach the base case)
//  - url does not match my_urls: the page is fetched, so a broken link
//    is reported by get_page, but its links are not followed
//  - otherwise: every URL found on the page is crawled with depth n - 1
def crawl(url: String, n: Int) : Unit = {
  if (n <= 0) ()                   /*@\label{changestartline}@*/
  else if (my_urls.findFirstIn(url).isEmpty) {
    println(s"Visiting: $n $url")
    get_page(url); ()
  }                                /*@\label{changeendline}@*/
  else {
    println(s"Visiting: $n $url")
    for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1)
  }
}
|
34  | 
||
| 
242
 
35104ee14f87
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
116 
diff
changeset
 | 
35  | 
// starting URL for the crawler  | 
| 415 | 36  | 
// Only one startURL may be defined; the later of the two definitions is
// kept active, the other one is left here as an alternative.
// NOTE(review): confirm which start page is the intended default.
//val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc"""
val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/bsc-projects-16.html"""
| 
112
 
95ee5cc5c05d
added
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
101 
diff
changeset
 | 
38  | 
|
| 
 
95ee5cc5c05d
added
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
101 
diff
changeset
 | 
39  | 
// can now deal with depth 3 and beyond  | 
| 
303
 
4439c56d96cf
updated
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
254 
diff
changeset
 | 
40  | 
// run the crawler: with depth 2 the start page and the pages it links to
// are visited
crawl(url = startURL, n = 2)
| 1 | 41  | 
|
| 
112
 
95ee5cc5c05d
added
 
Christian Urban <christian dot urban at kcl dot ac dot uk> 
parents: 
101 
diff
changeset
 | 
42  |