progs/crawler1.scala
changeset 399 5c1fbb39c93e
parent 257 70c307641d05
child 421 7a04f2c532c1
equal deleted inserted replaced
398:c8ce95067c1a 399:5c1fbb39c93e
    10   Try(Source.fromURL(url).take(10000).mkString) getOrElse 
    10   Try(Source.fromURL(url).take(10000).mkString) getOrElse 
    11     { println(s"  Problem with: $url"); ""}
    11     { println(s"  Problem with: $url"); ""}
    12 }
    12 }
    13 
    13 
    14 // regex for URLs
    14 // regex for URLs
    15 val http_pattern = """"https?://[^"]*"""".r
    15 val http_pattern = """"https?://[^"]*"""".r (*@\label{httpline}@*) 
    16 
    16 
    17 // drops the first and last character from a string
    17 // drops the first and last character from a string
    18 def unquote(s: String) = s.drop(1).dropRight(1)
    18 def unquote(s: String) = s.drop(1).dropRight(1)
    19 
    19 
    20 def get_all_URLs(page: String) : Set[String] = 
    20 def get_all_URLs(page: String) : Set[String] = 
    21   http_pattern.findAllIn(page).map(unquote).toSet
    21   http_pattern.findAllIn(page).map(unquote).toSet (*@\label{findallline}@*) 
    22 
    22 
    23 
    23 
    24 // naive version of crawl - searches until a given depth,
    24 // naive version of crawl - searches until a given depth,
    25 // visits pages potentially more than once
    25 // visits pages potentially more than once
    26 def crawl(url: String, n: Int) : Unit = {
    26 def crawl(url: String, n: Int) : Unit = {