| author | Christian Urban <christian.urban@kcl.ac.uk> | 
| Fri, 30 Sep 2022 06:03:46 +0100 | |
| changeset 879 | f712b16df8a2 | 
| parent 722 | 7c09b7eadc6b | 
| permissions | -rw-r--r-- | 
| 100 
cbc2270c2938
updated progs
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
96diff
changeset | 1 | // This version of the crawler only | 
| 
cbc2270c2938
updated progs
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
96diff
changeset | 2 | // checks links in the "domain" urban | 
| 
cbc2270c2938
updated progs
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
96diff
changeset | 3 | |
| 1 | 4 | import io.Source | 
| 5 | import scala.util.matching.Regex | |
| 96 
9fcd3de53c06
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
95diff
changeset | 6 | import scala.util._ | 
| 1 | 7 | |
| 112 
95ee5cc5c05d
added
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
101diff
changeset | 8 | // gets the first 10K of a web-page | 
| 100 
cbc2270c2938
updated progs
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
96diff
/** Fetches the beginning of a web page.
  *
  * Reads at most the first 10000 characters found at `url`, decoded as
  * ISO-8859-1. Any non-fatal failure (malformed URL, I/O error, ...) is
  * reported on standard output and results in the empty string.
  *
  * @param url address of the page to read
  * @return up to 10000 characters of the page, or "" if reading failed
  */
def get_page(url: String) : String = {
  Try {
    val src = Source.fromURL(url)("ISO-8859-1")
    // always release the underlying stream, even if reading fails part-way
    try src.take(10000).mkString finally src.close()
  }.getOrElse { println(s"  Problem with: $url"); "" }
}
| 1 | 13 | |
| 112 
95ee5cc5c05d
added
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
101diff
changeset | 14 | // regexes for URLs and "my" domain | 
| 242 
35104ee14f87
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
116diff
// matches a double-quoted http(s) URL; the match includes both quotes
val http_pattern = new Regex(""""https?://[^"]*"""")
// a URL in which this pattern occurs is treated as part of "my" domain
val my_urls = new Regex("""urban""")       /*@\label{myurlline}@*/
//val my_urls = """kcl.ac.uk""".r
| 1 | 18 | |
// removes the first and the last character of s (the surrounding quotes
// of a matched URL); strings with fewer than two characters become ""
def unquote(s: String) : String = s.drop(1).dropRight(1)
| 20 | ||
| 254 
dcd4688690ce
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
242diff
// collects every quoted http(s) URL occurring in page, with the
// surrounding quotes removed and duplicates collapsed
def get_all_URLs(page: String) : Set[String] = {
  val quoted = http_pattern.findAllIn(page)
  (for (q <- quoted) yield unquote(q)).toSet
}
| 1 | 23 | |
/** Visits `url` and, recursively, the pages it links to.
  *
  * A URL that does not match `my_urls` is only fetched (so that a broken
  * link gets reported by `get_page`), but its links are not followed.
  * A URL that matches `my_urls` is fetched and every URL found in it is
  * crawled with depth `n - 1`.
  *
  * @param url page to visit
  * @param n   remaining depth; nothing happens when it is zero or negative
  */
def crawl(url: String, n: Int) : Unit = {
  // `<=` rather than `==`: a negative depth must stop as well, otherwise
  // the recursion would only end when no more links are found
  if (n <= 0) ()                   /*@\label{changestartline}@*/
  else if (my_urls.findFirstIn(url).isEmpty) {
    println(s"Visiting: $n $url")
    get_page(url); ()
  }                                /*@\label{changeendline}@*/
  else {
    println(s"Visiting: $n $url")
    for (u <- get_all_URLs(get_page(url)).par) crawl(u, n - 1)
  }
}
| 35 | ||
| 242 
35104ee14f87
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
116diff
// starting URL for the crawler
val startURL = """https://nms.kcl.ac.uk/christian.urban/"""
//val startURL = """https://nms.kcl.ac.uk/christian.urban/bsc-projects-17.html"""


// can now deal with depth 3 and beyond
crawl(url = startURL, n = 3)
| 1 | 43 | |
| 112 
95ee5cc5c05d
added
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
101diff
changeset | 44 |