| author | cu | 
| Tue, 03 Oct 2017 23:01:06 +0100 | |
| changeset 512 | 56550ad904d8 | 
| parent 432 | 55be90b2a642 | 
| child 550 | a62357075346 | 
| permissions | -rw-r--r-- | 
| 100 
cbc2270c2938
updated progs
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
96diff
changeset | 1 | // This version of the crawler only | 
| 
cbc2270c2938
updated progs
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
96diff
changeset | 2 | // checks links in the "domain" urbanc | 
| 
cbc2270c2938
updated progs
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
96diff
changeset | 3 | |
| 1 | 4 | import io.Source | 
| 5 | import scala.util.matching.Regex | |
| 96 
9fcd3de53c06
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
95diff
changeset | 6 | import scala.util._ | 
| 1 | 7 | |
| 112 
95ee5cc5c05d
added
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
101diff
changeset | 8 | // gets the first 10K of a web-page | 
| 100 
cbc2270c2938
updated progs
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
96diff
/** Fetches at most the first 10000 characters of the page at `url`.
  *
  * The page is decoded as ISO-8859-1. Any failure (malformed URL, network
  * error, ...) is reported on stdout and turned into the empty string, so
  * callers never see an exception.
  *
  * @param url address of the page to download
  * @return the first 10000 characters of the page, or "" on failure
  */
def get_page(url: String) : String = {
  Try {
    val src = Source.fromURL(url)("ISO-8859-1")
    // read eagerly, then always release the underlying stream/connection
    try src.take(10000).mkString finally src.close()
  }.getOrElse { println(s"  Problem with: $url"); "" }
}
| 1 | 13 | |
| 112 
95ee5cc5c05d
added
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
101diff
changeset | 14 | // regexes for URLs and "my" domain | 
| 242 
35104ee14f87
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
116diff
// matches a double-quoted http or https URL, including the surrounding quotes
val http_pattern = new Regex(""""https?://[^"]*"""")
// matches any string containing the "my domain" marker
val my_urls = new Regex("""urbanc""")       /*@\label{myurlline}@*/
 | 
| 1 | 17 | |
// strips the first and the last character (the enclosing quotes) from s
def unquote(s: String): String = s.slice(1, s.length - 1)
| 19 | ||
| 254 
dcd4688690ce
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
242diff
// collects every quoted http(s) URL occurring in page, without the quotes
def get_all_URLs(page: String) : Set[String] = {
  val quoted = http_pattern.findAllIn(page)
  (for (q <- quoted) yield unquote(q)).toSet
}
| 1 | 22 | |
/** Visits `url` and follows the links found on it, at most `n` levels deep.
  *
  * URLs that do not match `my_urls` are still fetched (so that get_page
  * reports a problem for them), but their links are not followed.
  *
  * @param url page to visit
  * @param n   remaining depth; nothing is visited once it is zero or negative
  */
def crawl(url: String, n: Int) : Unit = {
  // `<= 0` rather than `== 0`: a negative depth must not recurse forever
  if (n <= 0) ()                   /*@\label{changestartline}@*/
  else if (my_urls.findFirstIn(url).isEmpty) {
    println(s"Visiting: $n $url")
    get_page(url); ()
  }                                /*@\label{changeendline}@*/
  else {
    println(s"Visiting: $n $url")
    for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1)
  }
}
| 34 | ||
| 242 
35104ee14f87
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
116diff
changeset | 35 | // starting URL for the crawler | 
// Only one startURL may be defined; the later definition is kept active
// and the earlier alternative is left here commented out.
//val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc"""
val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/bsc-projects-16.html"""

// can now deal with depth 3 and beyond
crawl(startURL, 2)
| 1 | 41 | |
| 112 
95ee5cc5c05d
added
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
101diff
changeset | 42 |