progs/crawler1.scala
changeset 99 91145f6d9b0e
parent 96 9fcd3de53c06
child 101 4758a6155878
equal deleted inserted replaced
98:1f3d89fe9820 99:91145f6d9b0e
     1 import io.Source
     1 import io.Source
     2 import scala.util.matching.Regex
     2 import scala.util.matching.Regex
     3 import scala.util._
     3 import scala.util._
     4 
     4 
     5 // gets the first ~10K of a page
     5 // gets the first ~10K of a page
     6 def get_page(url: String) : String = 
     6 def get_page(url: String) : String = {
     7   Try(Source.fromURL(url).take(10000).mkString) getOrElse 
     7   Try(Source.fromURL(url).take(10000).mkString) getOrElse 
     8     { println(s"  Problem with: $url"); ""}
     8     { println(s"  Problem with: $url"); ""}
     9 
     9 }
    10 
    10 
    11 // regex for URLs
    11 // regex for URLs
    12 val http_pattern = """\"https?://[^\"]*\"""".r
    12 val http_pattern = """\"https?://[^\"]*\"""".r
    13 
    13 
    14 def unquote(s: String) = s.drop(1).dropRight(1)
    14 def unquote(s: String) = s.drop(1).dropRight(1)