progs/crawler2.scala
changeset 116 010ae7288327
parent 112 95ee5cc5c05d
child 242 35104ee14f87
equal deleted inserted replaced
115:86c1c049eb3e 116:010ae7288327
    21   http_pattern.findAllIn(page).map(unquote).toSet
    21   http_pattern.findAllIn(page).map(unquote).toSet
    22 }
    22 }
    23 
    23 
    24 def crawl(url: String, n: Int) : Unit = {
    24 def crawl(url: String, n: Int) : Unit = {
    25   if (n == 0) ()
    25   if (n == 0) ()
    26   else if (my_urls.findFirstIn(url) == None) ()
    26   else if (my_urls.findFirstIn(url) == None) { 
       
    27     println(s"Visiting: $n $url")
       
    28     get_page(url); () 
       
    29   }
    27   else {
    30   else {
    28     println(s"Visiting: $n $url")
    31     println(s"Visiting: $n $url")
    29     for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1)
    32     for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1)
    30   }
    33   }
    31 }
    34 }
    32 
    35 
    33 // starting URL for the crawler
    36 // starting URL for the crawler
    34 val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/"""
    37 val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/"""
    35 
    38 
    36 // can now deal with depth 3 and beyond
    39 // can now deal with depth 3 and beyond
    37 crawl(startURL, 4)
    40 crawl(startURL, 3)
    38 
    41 
    39 
    42