| author | Christian Urban <urbanc@in.tum.de> | 
| Sat, 21 Jan 2017 00:25:09 +0000 | |
| changeset 473 | 99dd9e0f5577 | 
| parent 428 | a47c4227a0c6 | 
| child 550 | a62357075346 | 
| permissions | -rw-r--r-- | 
| 112 
95ee5cc5c05d
added
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
101diff
changeset | 1 | // This version of the crawler also | 
| 
95ee5cc5c05d
added
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
101diff
changeset | 2 | // "harvests" email addresses from webpages | 
| 100 
cbc2270c2938
updated progs
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
99diff
changeset | 3 | |
| 7 | 4 | import io.Source | 
| 5 | import scala.util.matching.Regex | |
| 96 
9fcd3de53c06
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
95diff
changeset | 6 | import scala.util._ | 
| 7 | 7 | |
| 99 
91145f6d9b0e
added
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
96diff
// Fetches the page at `url` and returns at most its first 10000
// characters, decoded as ISO-8859-1. If anything goes wrong (malformed
// URL, unreachable host, read error, ...) a message is printed and the
// empty string is returned instead.
def get_page(url: String) : String = {
  Try {
    val src = Source.fromURL(url)("ISO-8859-1")
    // always release the underlying stream, even if reading fails
    try src.take(10000).mkString finally src.close()
  }.getOrElse { println(s"  Problem with: $url"); "" }
}
| 7 | 12 | |
| 112 
95ee5cc5c05d
added
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
101diff
// regexes for URLs and for email addresses

// matches a double-quoted http/https link, *including* the two quotes
val http_pattern = new Regex(""""https?://[^"]*"""")

// matches lower-case email addresses of the form local@domain.tld
val email_pattern = new Regex("""([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""") /*@\label{emailline}@*/
 | 
| 7 | 16 | |
// Removes the first and the last character of `s` (used to strip the
// double quotes around a matched link); strings with fewer than two
// characters yield the empty string.
def unquote(s: String) = s.slice(1, s.length - 1)
| 18 | ||
| 254 
dcd4688690ce
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
242diff
// Collects all links occurring in `page`. Every match of http_pattern
// still carries its surrounding double quotes, which unquote strips
// before the links are gathered (without duplicates) into a set.
def get_all_URLs(page: String) : Set[String] = {
  val links = for (quoted <- http_pattern.findAllIn(page)) yield unquote(quoted)
  links.toSet
}
| 7 | 21 | |
| 242 
35104ee14f87
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
112diff
// Prints `s` followed by a newline, except that the empty string
// produces no output at all (avoids blank lines for pages without hits).
def print_str(s: String) = s match {
  case ""    => ()
  case other => println(other)
}
| 
35104ee14f87
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
112diff
changeset | 24 | |
// Visits `url`, prints every email address found on that page and then
// recursively crawls all links on the page with the depth budget reduced
// by one. The links of a page are processed in parallel (`.par`).
//   url - address of the page to visit
//   n   - remaining depth; nothing is visited once it reaches zero
def crawl(url: String, n: Int) : Unit = {
  // `<=` rather than `==`: a negative depth must stop the recursion too,
  // otherwise crawl(u, -1) would keep descending without bound
  if (n <= 0) ()
  else {
    println(s"Visiting: $n $url")
    val page = get_page(url)
    // one address per line; print_str stays silent if there is none
    print_str(email_pattern.findAllIn(page).mkString("\n")) /*@\label{mainline}@*/
    for (u <- get_all_URLs(page).par) crawl(u, n - 1)
  }
}
| 34 | ||
| 112 
95ee5cc5c05d
added
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
101diff
changeset | 35 | // starting URL for the crawler | 
| 242 
35104ee14f87
updated
 Christian Urban <christian dot urban at kcl dot ac dot uk> parents: 
112diff
val startURL = "http://www.inf.kcl.ac.uk/staff/urbanc"

// kick off the crawl with a depth budget of 3
crawl(url = startURL, n = 3)