progs/crawler1.scala
changeset 561 164bcaaedf88
parent 550 71fc4a7a7039
child 722 14914b57e207
equal deleted inserted replaced
560:99d2bb1f145c 561:164bcaaedf88
     8 // gets the first 10K of a web-page
     8 // gets the first 10K of a web-page
     9 def get_page(url: String) : String = {
     9 def get_page(url: String) : String = {
    10   Try(Source.fromURL(url)("ISO-8859-1").take(10000).mkString).
    10   Try(Source.fromURL(url)("ISO-8859-1").take(10000).mkString).
    11     getOrElse { println(s"  Problem with: $url"); ""}
    11     getOrElse { println(s"  Problem with: $url"); ""}
    12 }
    12 }
    13 
    13 get_page("https://nms.kcl.ac.uk/christiana.urban/")
    14 // regex for URLs
    14 // regex for URLs
    15 val http_pattern = """"https?://[^"]*"""".r /*@\label{httpline}@*/ 
    15 val http_pattern = """"https?://[^"]*"""".r /*@\label{httpline}@*/ 
    16 
    16 
    17 // drops the first and last character from a string
    17 // drops the first and last character from a string
    18 def unquote(s: String) = s.drop(1).dropRight(1)
    18 def unquote(s: String) = s.drop(1).dropRight(1)
    32 
    32 
    33 // some starting URLs for the crawler
    33 // some starting URLs for the crawler
    34 val startURL = """https://nms.kcl.ac.uk/christian.urban/"""
    34 val startURL = """https://nms.kcl.ac.uk/christian.urban/"""
    35 //val startURL = """https://nms.kcl.ac.uk/luc.moreau/"""
    35 //val startURL = """https://nms.kcl.ac.uk/luc.moreau/"""
    36 
    36 
    37 crawl(startURL, 3)
    37 crawl(startURL, 2)
    38 
    38