progs/crawler2.scala
changeset 101 4758a6155878
parent 100 cbc2270c2938
child 112 95ee5cc5c05d
equal deleted inserted replaced
100:cbc2270c2938 101:4758a6155878
     3 
     3 
     4 import io.Source
     4 import io.Source
     5 import scala.util.matching.Regex
     5 import scala.util.matching.Regex
     6 import scala.util._
     6 import scala.util._
     7 
     7 
     8 // gets the first ~10K of a page
     8 // gets the first ~10K of a web-page
     9 def get_page(url: String) : String = {
     9 def get_page(url: String) : String = {
    10   Try(Source.fromURL(url).take(10000).mkString) getOrElse 
    10   Try(Source.fromURL(url).take(10000).mkString) getOrElse 
    11     { println(s"  Problem with: $url"); ""}
    11     { println(s"  Problem with: $url"); ""}
    12 }
    12 }
    13 
    13 
    26 
    26 
     27 // naive version - searches until a given depth
     27 // naive version - searches until a given depth
    28 // visits pages potentially more than once
    28 // visits pages potentially more than once
    29 def crawl(url: String, n: Int) : Unit = {
    29 def crawl(url: String, n: Int) : Unit = {
    30   if (n == 0) ()
    30   if (n == 0) ()
    31   //else if (my_urls.findFirstIn(url) == None) ()
    31   else if (my_urls.findFirstIn(url) == None) ()
    32   else {
    32   else {
    33     println(s"Visiting: $n $url")
    33     println(s"Visiting: $n $url")
    34     for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1)
    34     for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1)
    35   }
    35   }
    36 }
    36 }