progs/crawler3.scala
changeset 428 a47c4227a0c6
parent 427 546f2090ce12
child 550 71fc4a7a7039
equal deleted inserted replaced
427:546f2090ce12 428:a47c4227a0c6
     4 import io.Source
     4 import io.Source
     5 import scala.util.matching.Regex
     5 import scala.util.matching.Regex
     6 import scala.util._
     6 import scala.util._
     7 
     7 
     // Fetches the page at `url` and returns at most its first 10000
     // characters, decoded as ISO-8859-1.
     //
     // Any non-fatal failure (malformed URL, connection or read error) is
     // reported on stdout and an empty string is returned instead, so a
     // caller never has to deal with an exception from a bad link.
     def get_page(url: String) : String = {
       Try {
         val src = Source.fromURL(url)("ISO-8859-1")
         // Source is lazy: read the prefix while the stream is still open and
         // always release the underlying stream afterwards.
         try src.take(10000).mkString
         finally src.close()
       } getOrElse { println(s"  Problem with: $url"); "" }
     }
    12 
    12 
    13 // regexes for URLs, for "my" domain and for email addresses
    13 // regexes for URLs, for "my" domain and for email addresses
    14 val http_pattern = """"https?://[^"]*"""".r
    14 val http_pattern = """"https?://[^"]*"""".r
    15 val email_pattern = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r /*@\label{emailline}@*/
    15 val email_pattern = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r /*@\label{emailline}@*/