// Try out requests library
//
// https://github.com/com-lihaoyi/requests-scala
//
import $ivy.`com.lihaoyi::requests:0.9.0`

requests.get("https://nms.kcl.ac.uk/christian.urban/")
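// A small sketch of inspecting the Response value returned by
// requests.get (statusCode and text() come from the requests-scala API)
val resp = requests.get("https://nms.kcl.ac.uk/christian.urban/")
println(resp.statusCode)        // HTTP status code, e.g. 200
println(resp.text().take(200))  // first 200 characters of the page body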
import io.Source
import scala.util.matching.Regex
// gets (at most) the first 10000 characters of a page
def get_page(url: String) : String = {
  try {
    Source.fromURL(url).take(10000).mkString
  }
  catch {
    // any problem (e.g. unknown host) results in the empty string
    case _ : Throwable => {
      println("  Problem with: " + url)
      ""
    }
  }
}

// non-existing page -> returns the empty string
get_page("""http://www.foobar.com""")

// starting URL for the crawler
val startURL = """http://www.inf.kcl.ac.uk/staff/urbanc/"""

// starts with a "
// then either http or https
// then ://
// then any character that is not "
// finally a "
val http_pattern = """\"((?:http|https)://(?:[^\"])*)\"""".r

// an equivalent, more concise version of the same pattern
// (commented out so that http_pattern is only defined once)
//val http_pattern = """\"(https?://[^\"]*)\"""".r

// strips the surrounding "-quotes from a matched URL
def unquote(s: String) = s.drop(1).dropRight(1)

def get_all_URLs(page: String) : Set[String] = {
  http_pattern.findAllIn(page).map(unquote).toSet
}
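// A quick sanity check of the pattern on a made-up HTML fragment
// (the URLs below are placeholders, not taken from any real page):
val sample_html = """<a href="http://example.com">x</a> <a href="https://example.org/p">y</a>"""
get_all_URLs(sample_html)   // => Set(http://example.com, https://example.org/p)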

// get all URLs in startURL
get_all_URLs(get_page(startURL))

// number of all URLs in startURL
get_all_URLs(get_page(startURL)).toList.length

// naive version - searches until a given depth,
// but visits pages potentially more than once
def crawl(url: String, n: Int) : Unit = {
  if (n == 0) ()
  else {
    println("Visiting: " + n + " " + url)
    for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1)
  }
}

crawl(startURL, 2)

// breadth-first version that does not visit
// pages twice
def bf_crawl(todo: Set[String], visited: Set[String], n: Int) : Unit = {
  if (n == 0) ()
  else {
    val new_todo = todo.flatMap {
      url => {
        if (visited.contains(url)) Set[String]()
        else {
          println("Visiting: " + n + " " + url)
          get_all_URLs(get_page(url))
        }
      }
    }
    bf_crawl(new_todo, visited union todo, n - 1)
  }
}

bf_crawl(Set(startURL), Set(), 2)

// breadth-first version that does not visit
// pages twice and stays in "my" domain
val my_pattern = """urbanc""".r
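// Small illustration of the domain filter on two example URLs
// (the second URL is just a placeholder):
my_pattern.findFirstIn("http://www.inf.kcl.ac.uk/staff/urbanc/")   // => Some(urbanc)
my_pattern.findFirstIn("http://www.example.com/")                  // => None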

// breadth-first search avoiding double visits
def bf_crawl2(todo: Set[String], visited: Set[String], n: Int) : Unit = {
  if (n == 0) ()
  else {
    val new_todo = todo.flatMap {
      url => {
        if (visited.contains(url)) Set[String]()
        else if (my_pattern.findFirstIn(url) == None) Set[String]()
        else {
          println("Visiting: " + n + " " + url)
          get_all_URLs(get_page(url))
        }
      }
    }
    bf_crawl2(new_todo, visited union todo, n - 1)
  }
}

bf_crawl2(Set(startURL), Set(), 5)

// email harvester
// from
// http://net.tutsplus.com/tutorials/other/8-regular-expressions-you-should-know/

val email_pattern = """([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})""".r
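// Sanity check on a made-up address (not harvested from anywhere):
email_pattern.findFirstIn("contact: jane_doe@example.co.uk, tel. 123")
// => Some(jane_doe@example.co.uk)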

def bf_crawl3(todo: Set[String], visited: Set[String], n: Int) : Unit = {
  if (n == 0) ()
  else {
    val new_todo = todo.flatMap {
      url => {
        if (visited.contains(url)) Set[String]()
        else {
          println("Visiting: " + n + " " + url)
          val page = get_page(url)
          println(email_pattern.findAllIn(page).mkString("\n"))
          get_all_URLs(page)   // reuse the already fetched page
        }
      }
    }
    bf_crawl3(new_todo, visited union todo, n - 1)
  }
}

bf_crawl3(Set(startURL), Set(), 3)
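// A variant sketch (hypothetical helper, not part of the original code)
// that returns the harvested addresses as a set instead of printing them;
// each page is fetched once and used both for emails and for new links.
def bf_crawl3b(todo: Set[String], visited: Set[String], n: Int) : Set[String] = {
  if (n == 0) Set()
  else {
    val fresh = todo.filterNot(visited.contains)
    val pages = fresh.map(url => (url, get_page(url))).toMap
    val emails = pages.values.flatMap(email_pattern.findAllIn(_)).toSet
    val new_todo = pages.values.flatMap(get_all_URLs).toSet
    emails union bf_crawl3b(new_todo, visited union todo, n - 1)
  }
}

// e.g. bf_crawl3b(Set(startURL), Set(), 3)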

// a depth-first version with a global "visited" set does not work
// properly: a page visited at depth 1 gets marked as visited, even
// though it might still need to be explored later with more depth
// remaining (i.e. at depth 2)
var visited = Set("")

def crawl(url: String, n: Int) : Unit = {
  if (n == 0) ()
  else if (visited.contains(url)) () //println("Already visited: " + n + " " + url)
  else {
    println("Visiting: " + n + " " + url)
    visited += url
    for (u <- get_all_URLs(get_page(url))) crawl(u, n - 1)
  }
}
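// One possible fix (a sketch, with a hypothetical name crawl2): remember
// the largest remaining depth at which each URL has been visited and
// re-visit it only when more depth is available than last time.
val best_depth = scala.collection.mutable.Map[String, Int]()

def crawl2(url: String, n: Int) : Unit = {
  if (n == 0) ()
  else if (best_depth.getOrElse(url, 0) >= n) ()
  else {
    println("Visiting: " + n + " " + url)
    best_depth(url) = n
    for (u <- get_all_URLs(get_page(url))) crawl2(u, n - 1)
  }
}

// e.g. crawl2(startURL, 2)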