progs/crawler3.scala
changeset 722 14914b57e207
parent 721 e3c64f22dd31
child 723 16db16950593
--- a/progs/crawler3.scala	Thu Apr 16 19:15:46 2020 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,39 +0,0 @@
-// This version of the crawler that also
-// "harvests" email addresses from webpages
-
-import io.Source
-import scala.util.matching.Regex
-import scala.util._
-
-def get_page(url: String) : String = {
-  Try(Source.fromURL(url)("ISO-8859-1").take(10000).mkString).
-    getOrElse { println(s"  Problem with: $url"); ""}
-}
-
-// regexes for URLs, for "my" domain and for email addresses
-val http_pattern = """"https?://[^"]*"""".r
-val email_pattern = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r /*@\label{emailline}@*/
-
-def unquote(s: String) = s.drop(1).dropRight(1)
-
-def get_all_URLs(page: String) : Set[String] = 
-  http_pattern.findAllIn(page).map(unquote).toSet
-
-def print_str(s: String) = 
-  if (s == "") () else println(s)
-
-def crawl(url: String, n: Int) : Unit = {
-  if (n == 0) ()
-  else {
-    println(s"  Visiting: $n $url")
-    val page = get_page(url)
-    print_str(email_pattern.findAllIn(page).mkString("\n")) /*@\label{mainline}@*/
-    for (u <- get_all_URLs(page).par) crawl(u, n - 1)
-  }
-}
-
-// staring URL for the crawler
-val startURL = """https://nms.kcl.ac.uk/christian.urban/"""
-
-
-crawl(startURL, 3)