progs/crawler2.scala
changeset 95 dbe49327b6c5
parent 93 4794759139ea
child 96 9fcd3de53c06
equal deleted inserted replaced
94:9ea667baf097 95:dbe49327b6c5
     5 def get_page(url: String) : String = { 
     5 def get_page(url: String) : String = { 
     6   try {
     6   try {
     7     Source.fromURL(url).take(10000).mkString  
     7     Source.fromURL(url).take(10000).mkString  
     8   }
     8   }
     9   catch {
     9   catch {
    10     case e => {
    10     case _ : Throwable => {
    11       println("  Problem with: " + url)
    11       println(s"  Problem with: $url")
    12       ""
    12       ""
    13     }
    13     }
    14   }
    14   }
    15 }
    15 }
    16 
    16 
    31 // visits pages potentially more than once
    31 // visits pages potentially more than once
    32 def crawl(url: String, n: Int) : Unit = {
    32 def crawl(url: String, n: Int) : Unit = {
    33   if (n == 0) ()
    33   if (n == 0) ()
    34   else if (my_urls.findFirstIn(url) == None) ()
    34   else if (my_urls.findFirstIn(url) == None) ()
    35   else {
    35   else {
    36     println("Visiting: " + n + " " + url)
    36     println(s"Visiting: $n $url")
    37     for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1)
    37     for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1)
    38   }
    38   }
    39 }
    39 }
    40 
    40 
    41 // can now deal with depth 3
    41 // can now deal with depth 3