progs/crawler1.scala
changeset 722 14914b57e207
parent 561 164bcaaedf88
721:e3c64f22dd31 722:14914b57e207
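// excerpt note (not in the original file): the definitions below use
// Source and Try, so the elided lines above are assumed to contain the
// usual imports, e.g. `import io.Source` and `import scala.util._`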
// gets the first 10K of a web-page
def get_page(url: String) : String = {
  Try(Source.fromURL(url)("ISO-8859-1").take(10000).mkString).
    getOrElse { println(s"  Problem with: $url"); ""}
}

// e.g. get_page("https://nms.kcl.ac.uk/christiana.urban/")

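// hypothetical check of the fallback (not in the original file): a
// malformed URL makes Source.fromURL throw inside Try, so get_page prints
// the warning and returns the empty string, e.g.
// get_page("not-a-url")   // prints "  Problem with: not-a-url", gives ""
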
// regex for URLs
val http_pattern = """"https?://[^"]*"""".r /*@\label{httpline}@*/

// drops the first and last characters from a string
def unquote(s: String) = s.drop(1).dropRight(1)

def get_all_URLs(page: String) : Set[String] =
  http_pattern.findAllIn(page).map(unquote).toSet /*@\label{findallline}@*/

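// hypothetical example (not in the original file): http_pattern matches
// double-quoted URLs in raw HTML and unquote strips the quotes, so
// get_all_URLs("""<a href="http://example.org/foo">foo</a>""")
// returns Set("http://example.org/foo")
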
// a very naive version of crawl - searches until a given
// depth, visits pages potentially more than once
def crawl(url: String, n: Int) : Unit = {
  if (n == 0) ()
  else {
    println(s"Visiting: $n $url")
    for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1)
  }
}

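// to illustrate (comment not in the original file): crawl(url, 2) prints
// the start page at n = 2, then every URL found on that page at n = 1,
// and the recursion stops once n reaches 0
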
// some starting URLs for the crawler

val startURL = """https://nms.kcl.ac.uk/christian.urban/"""
//val startURL = """https://nms.kcl.ac.uk/luc.moreau/"""

crawl(startURL, 2)
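// usage note (assumption, not in the original file): when this script is
// loaded into the Scala REPL, e.g. with :load progs/crawler1.scala, the
// final line starts crawling startURL to depth 2 straight away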