import scala.io.Source
import scala.util.matching.Regex

// gets the first ~10K of a page
def get_page(url: String) : String = {
  try {
    Source.fromURL(url).take(10000).mkString
  }
  catch {
    case _ : Exception => {
      println("  Problem with: " + url)
      ""
    }
  }
}

// a non-existing page -> returns the empty string
get_page("""http://www.foobar.com""")
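
// An alternative sketch of the same function using scala.util.Try
// instead of try/catch; get_page_try is a made-up name and this
// version is not used below.
import scala.util.Try

def get_page_try(url: String) : String =
  Try(Source.fromURL(url).take(10000).mkString).getOrElse {
    println("  Problem with: " + url)
    ""
  }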
 | 
// starting URL for the crawler
val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/"""

// a regular expression for URLs in quotation marks:
// starts with a "
// then either http or https
// then ://
// then any character that is not "
// finally a "
//val http_pattern = """\"((?:http|https)://(?:[^\"])*)\"""".r

// an equivalent, more compact version of the same pattern
val http_pattern = """\"(https?://[^\"]*)\"""".r
 | 
// drops the enclosing quotation marks from a match
def unquote(s: String) = s.drop(1).dropRight(1)

// extracts the set of all URLs occurring in a page
def get_all_URLs(page: String) : Set[String] = {
  (http_pattern.findAllIn(page)).map { unquote(_) }.toSet
}
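
// A quick offline check of get_all_URLs on a made-up HTML snippet
// (example.com / example.org are placeholder URLs): only the two
// quoted links should be extracted.
get_all_URLs("""<a href="http://example.com">x</a> "https://example.org/a" http://not.quoted""")
// => Set(http://example.com, https://example.org/a)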
 | 
// get all URLs in startURL
get_all_URLs(get_page(startURL))

// number of all URLs in startURL
get_all_URLs(get_page(startURL)).toList.length
 | 
// naive version - searches until a given depth,
// visits pages potentially more than once
def crawl(url: String, n: Int) : Unit = {
  if (n == 0) ()
  else {
    println("Visiting: " + n + " " + url)
    for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1)
  }
}

crawl(startURL, 2)
 | 
// breadth-first version that does not visit
// pages twice
def bf_crawl(todo: Set[String], visited: Set[String], n: Int) : Unit = {
  if (n == 0) ()
  else {
    val new_todo = todo.flatMap {
      url => {
        if (visited.contains(url)) Set[String]()
        else {
          println("Visiting: " + n + " " + url)
          get_all_URLs(get_page(url))
        }
      }
    }
    bf_crawl(new_todo, visited union todo, n - 1)
  }
}
 | 
bf_crawl(Set(startURL), Set(), 2)
 | 
// breadth-first version that does not visit pages
// twice and stays only in "my" domain
val my_pattern = """urbanc""".r
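
// A sanity check of the domain filter (offline): the start URL is
// inside "my" domain, the foobar URL from above is not.
my_pattern.findFirstIn(startURL)                 // => Some(urbanc)
my_pattern.findFirstIn("http://www.foobar.com")  // => None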
 | 
// breadth-first search avoiding double visits
def bf_crawl2(todo: Set[String], visited: Set[String], n: Int) : Unit = {
  if (n == 0) ()
  else {
    val new_todo = todo.flatMap {
      url => {
        if (visited.contains(url)) Set[String]()
        else if (my_pattern.findFirstIn(url).isEmpty) Set[String]()
        else {
          println("Visiting: " + n + " " + url)
          get_all_URLs(get_page(url))
        }
      }
    }
    bf_crawl2(new_todo, visited union todo, n - 1)
  }
}
 | 
bf_crawl2(Set(startURL), Set(), 5)
 | 
// email harvester
// the regular expression is taken from
// http://net.tutsplus.com/tutorials/other/8-regular-expressions-you-should-know/

val email_pattern = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r
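
// A quick offline check of the email pattern (the address below is
// made up for illustration):
email_pattern.findFirstIn("contact: jane.doe@example.com")
// => Some(jane.doe@example.com)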
 | 
def bf_crawl3(todo: Set[String], visited: Set[String], n: Int) : Unit = {
  if (n == 0) ()
  else {
    val new_todo = todo.flatMap {
      url => {
        if (visited.contains(url)) Set[String]()
        else {
          println("Visiting: " + n + " " + url)
          val page = get_page(url)
          println(email_pattern.findAllIn(page).mkString("\n"))
          get_all_URLs(page)   // reuse the page instead of fetching it twice
        }
      }
    }
    bf_crawl3(new_todo, visited union todo, n - 1)
  }
}
 | 
bf_crawl3(Set(startURL), Set(), 3)
 | 
// this depth-first version does not work: a page that was already
// visited with a small depth budget (say n == 1) is skipped later
// even when it is reached again with a larger budget (n == 2),
// so some pages are never fully explored
var visited = Set[String]()

def crawl(url: String, n: Int) : Unit = {
  if (n == 0) ()
  else if (visited.contains(url)) () //println("Already visited: " + n + " " + url)
  else {
    println("Visiting: " + n + " " + url)
    visited += url
    for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1)
  }
}
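
// One possible repair, as a sketch (crawl2 and best_depth are made-up
// names): remember the largest remaining depth at which each page was
// seen, and re-visit a page only when the current budget is larger.
var best_depth = Map[String, Int]()

def crawl2(url: String, n: Int) : Unit = {
  if (n == 0) ()
  else if (best_depth.getOrElse(url, 0) >= n) ()
  else {
    println("Visiting: " + n + " " + url)
    best_depth += (url -> n)
    for (u <- get_all_URLs(get_page(url))) crawl2(u, n - 1)
  }
}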
 |