progs/crawler1.scala
changeset 96 9fcd3de53c06
parent 95 dbe49327b6c5
child 99 91145f6d9b0e
equal deleted inserted replaced
95:dbe49327b6c5 96:9fcd3de53c06
     1 import io.Source
     1 import io.Source
     2 import scala.util.matching.Regex
     2 import scala.util.matching.Regex
       
     3 import scala.util._
     3 
     4 
     4 // gets the first ~10K of a page
     5 // gets the first ~10K of a page
     5 def get_page(url: String) : String = { 
     6 def get_page(url: String) : String = 
     6   try {
     7   Try(Source.fromURL(url).take(10000).mkString) getOrElse 
     7     Source.fromURL(url).take(10000).mkString  
     8     { println(s"  Problem with: $url"); ""}
     8   }
       
     9   catch {
       
    10     case _ : Throwable => {
       
    11       println(s"  Problem with: $url")
       
    12       ""
       
    13     }
       
    14   }
       
    15 }
       
    16 
     9 
    17 
    10 
    18 // regex for URLs
    11 // regex for URLs
    19 val http_pattern = """\"https?://[^\"]*\"""".r
    12 val http_pattern = """\"https?://[^\"]*\"""".r
    20 
    13