progs/crawler3.scala
changeset 99 91145f6d9b0e
parent 96 9fcd3de53c06
child 100 cbc2270c2938
equal deleted inserted replaced
98:1f3d89fe9820 99:91145f6d9b0e
     1 import io.Source
     1 import io.Source
     2 import scala.util.matching.Regex
     2 import scala.util.matching.Regex
     3 import scala.util._
     3 import scala.util._
     4 
     4 
     5 // gets the first ~10K of a page
     5 // gets the first ~10K of a page
     6 def get_page(url: String) : String = 
     6 def get_page(url: String) : String = {
     7   Try(Source.fromURL(url).take(10000).mkString) getOrElse 
     7   Try(Source.fromURL(url).take(10000).mkString) getOrElse 
     8     { println(s"  Problem with: $url"); ""}
     8     { println(s"  Problem with: $url"); ""}
       
     9 }
     9 
    10 
     10 // starting URL for the crawler
     11 // starting URL for the crawler
    11 val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/"""
    12 val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/"""
    12 
    13 
    13 // regex for URLs
    14 // regex for URLs