progs/crawler1.scala
changeset 428 a47c4227a0c6
parent 427 546f2090ce12
child 432 55be90b2a642
equal deleted inserted replaced
427:546f2090ce12 428:a47c4227a0c6
     5 import scala.util.matching.Regex
     5 import scala.util.matching.Regex
     6 import scala.util._
     6 import scala.util._
     7 
     7 
     8 // gets the first 10K of a web-page
     8 // gets the first 10K of a web-page
     9 def get_page(url: String) : String = {
     9 def get_page(url: String) : String = {
    10   Try(Source.fromURL(url)("ISO-8859-1").take(10000).mkString) getOrElse 
    10   Try(Source.fromURL(url)("ISO-8859-1").take(10000).mkString).
    11     { println(s"  Problem with: $url"); ""}
    11     getOrElse { println(s"  Problem with: $url"); ""}
    12 }
    12 }
    13 
    13 
    14 // regex for URLs
    14 // regex for URLs
    15 val http_pattern = """"https?://[^"]*"""".r /*@\label{httpline}@*/ 
    15 val http_pattern = """"https?://[^"]*"""".r /*@\label{httpline}@*/ 
    16 
    16 
    30     for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1)
    30     for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1)
    31   }
    31   }
    32 }
    32 }
    33 
    33 
    34 // some starting URLs for the crawler
    34 // some starting URLs for the crawler
    35 //val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc"""
    35 val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc"""
    36 //val startURL = """http://www.inf.kcl.ac.uk/staff/mcburney"""
    36 //val startURL = """http://www.inf.kcl.ac.uk/staff/mcburney"""
    37 val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/bsc-projects-16.html"""
       
    38 
    37 
    39 crawl(startURL, 2)
    38 crawl(startURL, 2)
    40 
    39