progs/crawler2.scala
changeset 96:9fcd3de53c06
parent    95:dbe49327b6c5
child     100:cbc2270c2938

 import io.Source
 import scala.util.matching.Regex
+import scala.util._
 
 // gets the first ~10K of a page
-def get_page(url: String) : String = {
-  try {
-    Source.fromURL(url).take(10000).mkString
-  }
-  catch {
-    case _ : Throwable => {
-      println(s"  Problem with: $url")
-      ""
-    }
-  }
-}
+def get_page(url: String) : String =
+  Try(Source.fromURL(url).take(10000).mkString) getOrElse
+    { println(s"  Problem with: $url"); "" }
 
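The rewrite swaps the explicit try/catch for scala.util.Try from the newly
imported scala.util._: Try wraps the download in a Success or Failure, and
getOrElse substitutes the fallback block whenever an exception was thrown.
A minimal REPL sketch of the same pattern, with division standing in for
the network call:

  import scala.util._

  Try(42 / 2) getOrElse { println("  fallback"); 0 }  // evaluates to 21
  Try(42 / 0) getOrElse { println("  fallback"); 0 }  // prints, then evaluates to 0
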
 // starting URL for the crawler
 val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/"""
 
 // regex for URLs
 val http_pattern = """\"https?://[^\"]*\"""".r
 val my_urls = """urbanc""".r
 
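my_urls is an ordinary regex built with .r; presumably it is used further
down in crawl to keep the crawler on pages whose address mentions urbanc.
A quick check with findFirstIn:

  my_urls.findFirstIn("http://www.inf.kcl.ac.uk/staff/urbanc/")  // Some(urbanc)
  my_urls.findFirstIn("http://www.example.com/")                 // None
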
 def unquote(s: String) = s.drop(1).dropRight(1)
 
 def get_all_URLs(page: String) : Set[String] = {
-  (http_pattern.findAllIn(page)).map { unquote(_) }.toSet
+  http_pattern.findAllIn(page).map(unquote).toSet
 }
 
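get_all_URLs chains the pieces above: http_pattern finds the quoted links
on a page, unquote strips the surrounding double quotes again, and toSet
removes duplicates. A sketch with a made-up page snippet, assuming the
quoted-URL regex shown above:

  val page = """<a href="http://www.inf.kcl.ac.uk/">KCL</a> <a href="http://www.inf.kcl.ac.uk/">again</a>"""
  get_all_URLs(page)  // Set(http://www.inf.kcl.ac.uk/)
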
 // naive version - searches until a given depth
 // visits pages potentially more than once
 def crawl(url: String, n: Int) : Unit = {