|         |      1 // Core Part about Movie Recommendations  | 
|         |      2 // at Danube.co.uk | 
|         |      3 //======================================== | 
|         |      4  | 
|         |      5  | 
|         |      6 object CW7b { // for purposes of generating a jar | 
|         |      7  | 
|         |      8 import io.Source | 
|         |      9 import scala.util._ | 
|         |     10  | 
|         |     11  | 
|         |     12 // (1) Implement the function get_csv_url which takes an url-string | 
|         |     13 //     as argument and requests the corresponding file. The two urls | 
|         |     14 //     of interest are ratings_url and movies_url, which correspond  | 
|         |     15 //     to CSV-files. | 
|         |     16 //     The function should return the CSV file appropriately broken | 
|         |     17 //     up into lines, and the first line should be dropped (that is without | 
|         |     18 //     the header of the CSV file). The result is a list of strings (lines | 
|         |     19 //     in the file). | 
|         |     20  | 
// Requests the CSV file at `url` (decoded as ISO-8859-1) and returns its
// lines without the header row.
//
// The lines are produced by getLines(), which strips "\n", "\r\n" and "\r"
// terminators, so a file with Windows line endings does not leave a stray
// "\r" at the end of every line. The source is closed in all cases, also
// when reading fails. An empty file yields Nil. Exceptions raised while
// opening or reading the URL are propagated to the caller.
def get_csv_url(url: String) : List[String] = {
  val csv = Source.fromURL(url)("ISO-8859-1")
  try csv.getLines().toList.drop(1)
  finally csv.close()
}
|         |     25  | 
// Locations of the two CSV files that get_csv_url is meant to be used with.
val ratings_url: String = "https://nms.kcl.ac.uk/christian.urban/ratings.csv"
val movies_url: String = "https://nms.kcl.ac.uk/christian.urban/movies.csv"
|         |     28  | 
|         |     29 // test cases | 
|         |     30  | 
|         |     31 //val ratings = get_csv_url(ratings_url) | 
|         |     32 //val movies = get_csv_url(movies_url) | 
|         |     33  | 
|         |     34 //ratings.length  // 87313 | 
|         |     35 //movies.length   // 9742 | 
|         |     36  | 
|         |     37 // (2) Implement two functions that process the CSV files. The ratings | 
|         |     38 //     function filters out all ratings below 4 and returns a list of  | 
|         |     39 //     (userID, movieID) pairs. The movies function just returns a list  | 
|         |     40 //     of (movieId, title) pairs. | 
|         |     41  | 
|         |     42  | 
// Turns the lines of the ratings CSV into (userID, movieID) pairs, keeping
// only the rows whose third column is an integer rating of 4 or more.
//
// Each line is split at ","; the first three columns are read as user id,
// movie id and rating, any further columns are ignored. Rows with fewer
// than three columns (for example blank lines) or with a rating that is
// not an integer are skipped instead of aborting the whole run with an
// exception. The order of the input lines is preserved.
def process_ratings(lines: List[String]) : List[(String, String)] =
  lines.map(_.split(",")).collect {
    case Array(user, movie, rating, _*)
      if rating.trim.toIntOption.exists(_ >= 4) => (user, movie)
  }
|         |     47  | 
// Turns the lines of the movies CSV into (movieId, title) pairs.
//
// Each line is split at ","; the first column is the movie id and the
// second column is the title, any further columns are ignored. Because the
// split is on every comma, a title that itself contains a comma is cut at
// that comma (this is the same result the plain split always gave).
// Rows with fewer than two columns (for example blank lines) are skipped
// instead of raising an exception. The order of the input lines is preserved.
def process_movies(lines: List[String]) : List[(String, String)] =
  lines.map(_.split(",")).collect {
    case Array(id, title, _*) => (id, title)
  }
|         |     51  | 
|         |     52 // test cases | 
|         |     53  | 
|         |     54 //val good_ratings = process_ratings(ratings) | 
|         |     55 //val movie_names = process_movies(movies) | 
|         |     56  | 
|         |     57 //good_ratings.length   //48580 | 
|         |     58 //movie_names.length    // 9742 | 
|         |     59  | 
|         |     60 //============================================== | 
|         |     61 // Do not change anything below, unless you want  | 
|         |     62 // to submit the file for the advanced part 3! | 
|         |     63 //============================================== | 
|         |     64  | 
|         |     65  | 
|         |     66 // (3) Implement a grouping function that calculates a map | 
|         |     67 //     containing the userIds and all the corresponding recommendations  | 
|         |     68 //     (list of movieIds). This  should be implemented in a tail | 
|         |     69 //     recursive fashion, using a map m as accumulator. This map | 
|         |     70 //     is set to Map() at the beginning of the calculation. | 
|         |     71  | 
// Groups (userID, movieID) pairs by user, using the map m as accumulator.
//
// The pairs are consumed front to back; each movie id is prepended to the
// list already stored for its user, so a user's movies end up in reverse
// input order. The only recursive call is in tail position. Entries that
// are already present in m are kept and extended.
def groupById(ratings: List[(String, String)],
              m: Map[String, List[String]]) : Map[String, List[String]] =
  if (ratings.isEmpty) m
  else {
    val (user, movie) = ratings.head
    val so_far = m.getOrElse(user, Nil)
    groupById(ratings.tail, m.updated(user, movie :: so_far))
  }
|         |     81  | 
|         |     82 // test cases | 
|         |     83 //val ratings_map = groupById(good_ratings, Map()) | 
|         |     84 //val movies_map = movie_names.toMap | 
|         |     85  | 
|         |     86 //ratings_map.get("414").get.map(movies_map.get(_)) // most prolific recommender with 1227 positive ratings | 
|         |     87 //ratings_map.get("474").get.map(movies_map.get(_)) // second-most prolific recommender with 787 positive ratings | 
|         |     88 //ratings_map.get("214").get.map(movies_map.get(_)) // least prolific recommender with only 1 positive rating | 
|         |     89  | 
|         |     90  | 
|         |     91  | 
|         |     92 //(4) Implement a function that takes a ratings map and a movie_name as argument. | 
|         |     93 // The function calculates all suggestions containing | 
|         |     94 // the movie mov in its recommendations. It returns a list of all these | 
|         |     95 // recommendations (each of them is a list and needs to have mov deleted,  | 
|         |     96 // otherwise it might happen we recommend the same movie). | 
|         |     97  | 
// Collects the recommendation lists of all users whose list contains mov.
//
// The result has one list per such user, in the iteration order of m, and
// every occurrence of mov is removed from each returned list. Users that
// did not recommend mov contribute nothing; if nobody did, the result is Nil.
def favourites(m: Map[String, List[String]], mov: String) : List[List[String]] =
  m.toList.collect {
    case (_, liked) if liked.contains(mov) => liked.filterNot(_ == mov)
  }
|         |    101  | 
|         |    102  | 
|         |    103  | 
|         |    104 // test cases | 
|         |    105 // movie ID "912" -> Casablanca (1942) | 
|         |    106 //          "858" -> Godfather | 
|         |    107 //          "260" -> Star Wars: Episode IV - A New Hope (1977) | 
|         |    108  | 
|         |    109 //favourites(ratings_map, "912").length  // => 80 | 
|         |    110  | 
|         |    111 // That means there are 80 users that recommend the movie with ID 912. | 
|         |    112 // Of these 80  users, 55 gave a good rating to movie 858 and | 
|         |    113 // 52 a good rating to movies 260, 318, 593. | 
|         |    114  | 
|         |    115  | 
|         |    116 // (5) Implement a suggestions function which takes a rating | 
|         |    117 // map and a movie_name as arguments. It calculates all the recommended | 
|         |    118 // movies sorted according to the most frequently suggested movie(s) first. | 
// Lists every movie id that was recommended together with mov_name,
// most frequently co-recommended first.
//
// All lists returned by favourites are concatenated, the occurrences of
// each movie id are counted, and the ids are ordered by ascending count
// and then reversed. Ties therefore come out in the reverse of the order
// in which the counted pairs were listed. The result is Nil when nobody
// recommended mov_name.
def suggestions(recs: Map[String, List[String]],
                    mov_name: String) : List[String] =
  favourites(recs, mov_name).flatten
    .groupBy(identity).view.mapValues(_.size).toList
    .sortBy { case (_, count) => count }
    .reverse
    .map { case (movie, _) => movie }
|         |    126  | 
|         |    127 // test cases | 
|         |    128  | 
|         |    129 //suggestions(ratings_map, "912") | 
|         |    130 //suggestions(ratings_map, "912").length   | 
|         |    131 // => 4110 suggestions with List(858, 260, 318, 593, ...) | 
|         |    132 //    being the most frequently suggested movies | 
|         |    133  | 
|         |    134 // (6) Implement a recommendations function which generates at most | 
|         |    135 // *two* of the most frequently suggested movies. It returns the | 
|         |    136 // actual movie names, not the movieIDs. | 
|         |    137  | 
// Returns the titles of at most two of the most frequently suggested
// movies for mov_name.
//
// The movie ids come from suggestions (most frequent first) and are
// translated to titles through movs. An id without an entry in movs is
// skipped rather than raising NoSuchElementException, and the next
// suggestion is tried instead, so two titles are still returned whenever
// two suggested ids are known. When every suggested id has a title, this
// is exactly the first two suggestions. The lookup goes through an
// iterator so that only as many ids as needed are translated.
def recommendations(recs: Map[String, List[String]],
                   movs: Map[String, String],
                   mov_name: String) : List[String] =
  suggestions(recs, mov_name).iterator.flatMap(movs.get).take(2).toList
|         |    142  | 
|         |    143  | 
|         |    144 // testcases | 
|         |    145  | 
|         |    146 // recommendations(ratings_map, movies_map, "912") | 
|         |    147 //   => List(Godfather, Star Wars: Episode IV - A New Hope (1977)) | 
|         |    148  | 
|         |    149 //recommendations(ratings_map, movies_map, "260") | 
|         |    150 //   => List(Star Wars: Episode V - The Empire Strikes Back (1980),  | 
|         |    151 //           Star Wars: Episode VI - Return of the Jedi (1983)) | 
|         |    152  | 
|         |    153 // recommendations(ratings_map, movies_map, "2") | 
|         |    154 //   => List(Lion King, Jurassic Park (1993)) | 
|         |    155  | 
|         |    156 // recommendations(ratings_map, movies_map, "0") | 
|         |    157 //   => Nil | 
|         |    158  | 
|         |    159 // recommendations(ratings_map, movies_map, "1") | 
|         |    160 //   => List(Shawshank Redemption, Forrest Gump (1994)) | 
|         |    161  | 
|         |    162 // recommendations(ratings_map, movies_map, "4") | 
|         |    163 //   => Nil  (there are three ratings for this movie in ratings.csv but they are not positive)      | 
|         |    164  | 
|         |    165 // (7) Calculate the recommendations for all movies according to | 
|         |    166 // what the recommendations function in (6) produces (this | 
|         |    167 // can take a few seconds). Put all recommendations into a list  | 
|         |    168 // (of strings) and count how often the strings occur in | 
|         |    169 // this list. This produces a list of string-int pairs, | 
|         |    170 // where the first component is the movie name and the second | 
|         |    171 // is the number of how many times they were recommended.  | 
|         |    172 // Sort all the pairs according to the number | 
|         |    173 // of times they were recommended (most recommended movie name  | 
|         |    174 // first). | 
|         |    175  | 
// Counts how often each string occurs in xs.
//
// The result has one (string, count) pair per distinct string, in the
// order in which the strings first appear in xs. The counts are computed
// in a single pass and then looked up per distinct string, instead of
// rescanning the whole list once for every distinct string.
def occurrences(xs: List[String]): List[(String, Int)] = {
  val counts = xs.groupMapReduce(identity)(_ => 1)(_ + _)
  xs.distinct.map(x => (x, counts(x)))
}
|         |    178  | 
// Computes, over all movies in movs, how often each title is recommended.
//
// recommendations is called once for every movie id in movs, all returned
// titles are put into one list and counted with occurrences. The pairs
// are ordered by ascending count and then reversed, so the most
// recommended title comes first.
def most_recommended(recs: Map[String, List[String]],
                     movs: Map[String, String]) : List[(String, Int)] = {
  val recommended_titles =
    movs.keys.toList.flatMap(id => recommendations(recs, movs, id))
  occurrences(recommended_titles).sortBy { case (_, times) => times }.reverse
}
|         |    187  | 
|         |    188  | 
|         |    189 //most_recommended(ratings_map, movies_map).take(3) | 
|         |    190 // => | 
|         |    191 // List((Matrix,698),  | 
|         |    192 //      (Star Wars: Episode IV - A New Hope (1977),402),  | 
|         |    193 //      (Jerry Maguire (1996),382)) | 
|         |    194  | 
|         |    195  | 
|         |    196 } |