// progs/crawler.scala
       
import io.Source
import scala.util.matching.Regex

// gets the first ~10K characters of a page
def get_page(url: String) : String = {
  try {
    Source.fromURL(url).take(10000).mkString
  }
  catch {
    // a bare "case e =>" matches everything and is deprecated;
    // match on Throwable explicitly instead
    case _ : Throwable => {
      println("  Problem with: " + url)
      ""
    }
  }
}
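
// Note: get_page above never closes the connection opened by
// Source.fromURL. A more careful variant could close it explicitly;
// the following is only a sketch, and the name get_page2 is not
// part of the original file:
def get_page2(url: String) : String =
  try {
    val s = Source.fromURL(url)
    try s.take(10000).mkString finally s.close()
  }
  catch {
    case _ : Throwable => { println("  Problem with: " + url); "" }
  }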
       
// non-existing page -> returns the empty string
get_page("""http://www.foobar.com""")


// starting URL for the crawler
val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/"""
       
// the pattern for URLs:
// starts with a "
// then either http or https
// then ://
// then any character that is not "
// finally "
val http_pattern = """\"(https?://[^\"]*)\"""".r

// a more verbose, equivalent form of the same pattern:
// """\"((?:http|https)://(?:[^\"])*)\"""".r
       
// findAllIn returns matches including the enclosing "-characters;
// unquote strips them off
def unquote(s: String) = s.drop(1).dropRight(1)

def get_all_URLs(page: String) : Set[String] = {
  http_pattern.findAllIn(page).map(unquote).toSet
}
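
// quick check on a made-up HTML snippet (hypothetical example,
// not from the course page):
get_all_URLs("""<a href="http://example.com">link</a>""")
// => Set(http://example.com)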
       
// get all URLs in startURL
get_all_URLs(get_page(startURL))

// number of all URLs in startURL
get_all_URLs(get_page(startURL)).toList.length
       
// naive version - searches until a given depth,
// visits pages potentially more than once
def crawl(url: String, n: Int) : Unit = {
  if (n == 0) ()
  else {
    println("Visiting: " + n + " " + url)
    for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1)
  }
}

crawl(startURL, 2)
       
// breadth-first version without visiting
// pages twice
def bf_crawl(todo: Set[String], visited: Set[String], n: Int) : Unit = {
  if (n == 0) ()
  else {
    val new_todo = todo.flatMap {
      url => {
        if (visited.contains(url)) Set[String]()
        else {
          println("Visiting: " + n + " " + url)
          get_all_URLs(get_page(url))
        }
      }
    }
    bf_crawl(new_todo, visited union todo, n - 1)
  }
}
       
bf_crawl(Set(startURL), Set(), 2)
       
// breadth-first version without visiting
// pages twice and only in "my" domain
val my_pattern = """urbanc""".r
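
// how the filter behaves (made-up URLs for illustration):
my_pattern.findFirstIn("http://www.inf.kcl.ac.uk/staff/urbanc/")  // => Some(urbanc)
my_pattern.findFirstIn("http://example.com")                      // => None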
       
// breadth-first search avoiding double visits
// and staying inside the urbanc domain
def bf_crawl2(todo: Set[String], visited: Set[String], n: Int) : Unit = {
  if (n == 0) ()
  else {
    val new_todo = todo.flatMap {
      url => {
        if (visited.contains(url)) Set[String]()
        else if (my_pattern.findFirstIn(url) == None) Set[String]()
        else {
          println("Visiting: " + n + " " + url)
          get_all_URLs(get_page(url))
        }
      }
    }
    bf_crawl2(new_todo, visited union todo, n - 1)
  }
}
       
bf_crawl2(Set(startURL), Set(), 5)
       
// email harvester
// pattern from
// http://net.tutsplus.com/tutorials/other/8-regular-expressions-you-should-know/

val email_pattern = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r
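
// quick check on a made-up address (hypothetical example):
email_pattern.findFirstIn("contact me at jane.doe@example.org")
// => Some(jane.doe@example.org)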
       
def bf_crawl3(todo: Set[String], visited: Set[String], n: Int) : Unit = {
  if (n == 0) ()
  else {
    val new_todo = todo.flatMap {
      url => {
        if (visited.contains(url)) Set[String]()
        else {
          println("Visiting: " + n + " " + url)
          val page = get_page(url)
          println(email_pattern.findAllIn(page).mkString("\n"))
          get_all_URLs(page)  // re-use the page fetched above
        }
      }
    }
    bf_crawl3(new_todo, visited union todo, n - 1)
  }
}
       
bf_crawl3(Set(startURL), Set(), 3)
       
// a depth-first version with a global visited set does not work:
// a page explored when only little depth budget is left is marked
// as visited and is then skipped later, even when it is reached
// again with a larger budget (at depth 2, say, instead of depth 1)
var visited = Set[String]()

def crawl(url: String, n: Int) : Unit = {
  if (n == 0) ()
  else if (visited.contains(url)) () //println("Already visited: " + n + " " + url)
  else {
    println("Visiting: " + n + " " + url)
    visited += url
    for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1)
  }
}
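
// one possible fix (a sketch only; crawl2 and visited_at are
// hypothetical names, not from the original file): remember the
// largest remaining depth with which each page was visited and
// re-explore a page whenever we reach it with a bigger budget
var visited_at = Map[String, Int]()

def crawl2(url: String, n: Int) : Unit = {
  if (n == 0 || visited_at.getOrElse(url, 0) >= n) ()
  else {
    println("Visiting: " + n + " " + url)
    visited_at += (url -> n)
    for (u <- get_all_URLs(get_page(url))) crawl2(u, n - 1)
  }
}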