progs/crawler3.scala
changeset 95 dbe49327b6c5
parent 93 4794759139ea
child 96 9fcd3de53c06
equal deleted inserted replaced
94:9ea667baf097 95:dbe49327b6c5
     5 def get_page(url: String) : String = { 
     5 def get_page(url: String) : String = { 
     6   try {
     6   try {
     7     Source.fromURL(url).take(10000).mkString  
     7     Source.fromURL(url).take(10000).mkString  
     8   }
     8   }
     9   catch {
     9   catch {
    10     case e => {
    10     case _ : Throwable => {
    11       println("  Problem with: " + url)
    11       println(s"  Problem with: $url")
    12       ""
    12       ""
    13     }
    13     }
    14   }
    14   }
    15 }
    15 }
    16 
    16 
    34 // visits pages potentially more than once
    34 // visits pages potentially more than once
    35 def crawl(url: String, n: Int) : Unit = {
    35 def crawl(url: String, n: Int) : Unit = {
    36   if (n == 0) ()
    36   if (n == 0) ()
    37   //else if (my_urls.findFirstIn(url) == None) ()
    37   //else if (my_urls.findFirstIn(url) == None) ()
    38   else {
    38   else {
    39     println("Visiting: " + n + " " + url)
    39     println(s"Visiting: $n $url")
    40     val page = get_page(url)
    40     val page = get_page(url)
    41     println(email_pattern.findAllIn(page).mkString("\n"))
    41     println(email_pattern.findAllIn(page).mkString("\n"))
    42     for (u <- get_all_URLs(page)) crawl(u, n - 1)
    42     for (u <- get_all_URLs(page)) crawl(u, n - 1)
    43   }
    43   }
    44 }
    44 }