progs/crawler1.scala
changeset 427 546f2090ce12
parent 421 7a04f2c532c1
child 428 a47c4227a0c6
 import scala.util.matching.Regex
 import scala.util._
 
 // gets the first 10K of a web-page
 def get_page(url: String) : String = {
-  Try(Source.fromURL(url).take(10000).mkString) getOrElse 
+  Try(Source.fromURL(url)("ISO-8859-1").take(10000).mkString) getOrElse 
     { println(s"  Problem with: $url"); ""}
 }
 
 // regex for URLs
-val http_pattern = """"https?://[^"]*"""".r (*@\label{httpline}@*)
+val http_pattern = """"https?://[^"]*"""".r /*@\label{httpline}@*/
 
 // drops the first and last character from a string
 def unquote(s: String) = s.drop(1).dropRight(1)
 
 def get_all_URLs(page: String) : Set[String] = 
-  http_pattern.findAllIn(page).map(unquote).toSet (*@\label{findallline}@*)
+  http_pattern.findAllIn(page).map(unquote).toSet /*@\label{findallline}@*/
 
 
 // naive version of crawl - searches until a given depth,
 // visits pages potentially more than once
 def crawl(url: String, n: Int) : Unit = {
   if (n == 0) ()
   else {
     println(s"Visiting: $n $url")
     for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1)
   }
 }
 
 // some starting URLs for the crawler
-val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc"""
+//val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc"""
 //val startURL = """http://www.inf.kcl.ac.uk/staff/mcburney"""
+val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/bsc-projects-16.html"""
 
 crawl(startURL, 2)
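
The functional change in this revision is that get_page now reads a page with the ISO-8859-1 codec, presumably so that pages that are not valid UTF-8 (or whatever the platform default encoding is) no longer make the read inside Try fail and fall back to the empty string. The URL extraction is untouched; the sketch below is not part of the changeset, it only copies http_pattern, unquote and get_all_URLs from the file above and runs them on a made-up HTML fragment (the value sample is invented for illustration). It can be pasted as-is into a Scala 2 REPL:

  // same URL pattern and helpers as in crawler1.scala above
  val http_pattern = """"https?://[^"]*"""".r
  def unquote(s: String) = s.drop(1).dropRight(1)
  def get_all_URLs(page: String) : Set[String] =
    http_pattern.findAllIn(page).map(unquote).toSet

  // a made-up HTML fragment, purely for illustration
  val sample = """<a href="http://www.inf.kcl.ac.uk/staff/urbanc">home</a>
                 |<a href="https://www.scala-lang.org">Scala</a>""".stripMargin

  // prints a Set containing both URLs, with the surrounding quotes stripped
  println(get_all_URLs(sample))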