progs/crawler2.scala
changeset 428 a47c4227a0c6
parent 427 546f2090ce12
child 432 55be90b2a642
equal deleted inserted replaced
427:546f2090ce12 428:a47c4227a0c6
     5 import scala.util.matching.Regex
     5 import scala.util.matching.Regex
     6 import scala.util._
     6 import scala.util._
     7 
     7 
     8 // gets the first 10K of a web-page
     8 // gets the first 10K of a web-page
     9 def get_page(url: String) : String = {
     9 def get_page(url: String) : String = {
    10   Try(Source.fromURL(url)("ISO-8859-1").take(10000).mkString) getOrElse 
    10   Try(Source.fromURL(url)("ISO-8859-1").take(10000).mkString). 
    11     { println(s"  Problem with: $url"); ""}
    11     getOrElse { println(s"  Problem with: $url"); ""}
    12 }
    12 }
    13 
    13 
    14 // regexes for URLs and "my" domain
    14 // regexes for URLs and "my" domain
    15 val http_pattern = """"https?://[^"]*"""".r
    15 val http_pattern = """"https?://[^"]*"""".r
    16 val my_urls = """urbanc""".r       /*@\label{myurlline}@*/
    16 val my_urls = """urbanc""".r       /*@\label{myurlline}@*/