progs/crawler3.scala
changeset 96 9fcd3de53c06
parent 95 dbe49327b6c5
child 99 91145f6d9b0e
--- a/progs/crawler3.scala
+++ b/progs/crawler3.scala
@@ -1,20 +1,13 @@
 import io.Source
 import scala.util.matching.Regex
+import scala.util._
 
 // gets the first ~10K of a page
-def get_page(url: String) : String = {
-  try {
-    Source.fromURL(url).take(10000).mkString
-  }
-  catch {
-    case _ : Throwable => {
-      println(s"  Problem with: $url")
-      ""
-    }
-  }
-}
+def get_page(url: String) : String =
+  Try(Source.fromURL(url).take(10000).mkString) getOrElse
+  { println(s"  Problem with: $url"); ""}
 
 // starting URL for the crawler
 val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/"""
 
 // regex for URLs
@@ -25,11 +18,11 @@
 // http://net.tutsplus.com/tutorials/other/8-regular-expressions-you-should-know/
 
 def unquote(s: String) = s.drop(1).dropRight(1)
 
 def get_all_URLs(page: String) : Set[String] = {
-  (http_pattern.findAllIn(page)).map { unquote(_) }.toSet
+  http_pattern.findAllIn(page).map(unquote).toSet
 }
 
 // naive version - searches until a given depth
 // visits pages potentially more than once
 def crawl(url: String, n: Int) : Unit = {
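
The substance of this changeset: get_page's explicit try/catch is replaced by scala.util.Try combined with getOrElse (hence the added import scala.util._), and get_all_URLs drops the redundant parentheses and the { unquote(_) } wrapper in favour of passing unquote directly. A minimal sketch of the two equivalent error-handling styles, with a hypothetical fetch function standing in for Source.fromURL so it runs without network access:

import scala.util.Try

// hypothetical stand-in for Source.fromURL(url).take(10000).mkString
def fetch(url: String): String =
  if (url.startsWith("http")) "<html>...</html>"
  else throw new Exception(s"cannot fetch $url")

// old style, as deleted above: explicit try/catch
def get_page_old(url: String): String =
  try { fetch(url) }
  catch { case _: Throwable => { println(s"  Problem with: $url"); "" } }

// new style, as inserted above: Try wraps any exception,
// getOrElse supplies the fallback value
def get_page_new(url: String): String =
  Try(fetch(url)) getOrElse { println(s"  Problem with: $url"); "" }

// both print "  Problem with: not-a-url" and return ""
get_page_old("not-a-url")
get_page_new("not-a-url")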