progs/crawler1.scala
changeset 561 cf3e57e6fec7
parent 550 a62357075346
child 722 7c09b7eadc6b
equal deleted inserted replaced
560:0acbeb9a4b9f 561:cf3e57e6fec7
     8 // gets the first 10K of a web-page
     8 // gets the first 10K of a web-page
     9 def get_page(url: String) : String = {
     9 def get_page(url: String) : String = {
    10   Try(Source.fromURL(url)("ISO-8859-1").take(10000).mkString).
    10   Try(Source.fromURL(url)("ISO-8859-1").take(10000).mkString).
    11     getOrElse { println(s"  Problem with: $url"); ""}
    11     getOrElse { println(s"  Problem with: $url"); ""}
    12 }
    12 }
    13 
    13 get_page("https://nms.kcl.ac.uk/christiana.urban/")
    14 // regex for URLs
    14 // regex for URLs
    15 val http_pattern = """"https?://[^"]*"""".r /*@\label{httpline}@*/ 
    15 val http_pattern = """"https?://[^"]*"""".r /*@\label{httpline}@*/ 
    16 
    16 
    17 // drops the first and last character from a string
    17 // drops the first and last character from a string
    18 def unquote(s: String) = s.drop(1).dropRight(1)
    18 def unquote(s: String) = s.drop(1).dropRight(1)
    32 
    32 
    33 // some starting URLs for the crawler
    33 // some starting URLs for the crawler
    34 val startURL = """https://nms.kcl.ac.uk/christian.urban/"""
    34 val startURL = """https://nms.kcl.ac.uk/christian.urban/"""
    35 //val startURL = """https://nms.kcl.ac.uk/luc.moreau/"""
    35 //val startURL = """https://nms.kcl.ac.uk/luc.moreau/"""
    36 
    36 
    37 crawl(startURL, 3)
    37 crawl(startURL, 2)
    38 
    38