main_testing2/danube.scala
changeset 379 73ad2e018516
parent 347 0b727d1a8184
child 384 627a944c744b
equal deleted inserted replaced
378:efc6a287c369 379:73ad2e018516
     1 // Core Part about Movie Recommendations 
     1 // Core Part about Movie Recommendations 
     2 // at Danube.co.uk
     2 // at Danube.co.uk
     3 //===========================================
     3 //========================================
       
     4 
       
     5 
       
     6 object CW7b { // for purposes of generating a jar
     4 
     7 
     5 import io.Source
     8 import io.Source
     6 import scala.util._
     9 import scala.util._
     7 
    10 
     8 object CW7b { // for purposes of generating a jar
       
     9 
    11 
    10 // (1) Implement the function get_csv_url which takes an url-string
    12 // (1) Implement the function get_csv_url which takes an url-string
    11 //     as argument and requests the corresponding file. The two urls
    13 //     as argument and requests the corresponding file. The two urls
    12 //     of interest are ratings_url and movies_url, which correspond 
    14 //     of interest are ratings_url and movies_url, which correspond 
    13 //     to CSV-files.
    15 //     to CSV-files.
    14 //     The function should ReTurn the CSV file appropriately broken
    16 //     The function should return the CSV file appropriately broken
    15 //     up into lines, and the first line should be dropped (that is without
    17 //     up into lines, and the first line should be dropped (that is without
    16 //     the header of the CSV file). The result is a list of strings (lines
    18 //     the header of the CSV file). The result is a list of strings (lines
    17 //     in the file).
    19 //     in the file).
    18 
    20 
    19 def get_csv_url(url: String) : List[String] = {
    21 def get_csv_url(url: String) : List[String] = {
    31 
    33 
    32 //ratings.length  // 87313
    34 //ratings.length  // 87313
    33 //movies.length   // 9742
    35 //movies.length   // 9742
    34 
    36 
    35 // (2) Implement two functions that process the CSV files. The ratings
    37 // (2) Implement two functions that process the CSV files. The ratings
    36 //     function filters out all ratings below 4 and ReTurns a list of 
    38 //     function filters out all ratings below 4 and returns a list of 
    37 //     (userID, movieID) pairs. The movies function just ReTurns a list 
    39 //     (userID, movieID) pairs. The movies function just returns a list 
    38 //     of (movieId, title) pairs.
    40 //     of (movieId, title) pairs.
    39 
    41 
       
    42 
       
    43 //def process_ratings(lines: List[String]) : List[(String, String)] = {
       
    44 //  for (cols <- lines.map(_.split(",").toList); 
       
    45 //       if (cols(2).toFloat >= 4)) yield (cols(0), cols(1))  
       
    46 //}
    40 
    47 
    41 def process_ratings(lines: List[String]) : List[(String, String)] = {
    48 def process_ratings(lines: List[String]) : List[(String, String)] = {
    42   for (cols <- lines.map(_.split(",").toList); 
    49   for (cols <- lines.map(_.split(",").toList); 
    43        if (cols(2).toFloat >= 4)) yield (cols(0), cols(1))  
    50        if (cols(2).toInt >= 4)) yield (cols(0), cols(1))  
    44 }
    51 }
    45 
    52 
    46 def process_movies(lines: List[String]) : List[(String, String)] = {
    53 def process_movies(lines: List[String]) : List[(String, String)] = {
    47   for (cols <- lines.map(_.split(",").toList)) yield (cols(0), cols(1))  
    54   for (cols <- lines.map(_.split(",").toList)) yield (cols(0), cols(1))  
    48 }
    55 }
    75     val new_ratings = m + (id -> (mov :: old_ratings))
    82     val new_ratings = m + (id -> (mov :: old_ratings))
    76     groupById(rest, new_ratings)
    83     groupById(rest, new_ratings)
    77   }
    84   }
    78 }
    85 }
    79 
    86 
    80 //
       
    81 //val ls = List(("1", "a"), ("2", "a"), ("1", "c"), ("2", "a"), ("1", "c"))
       
    82 //
       
    83 //val m = groupById(ls, Map())
       
    84 //
       
    85 //m.getOrElse("1", Nil).count(_ == "c") // => 2
       
    86 //m.getOrElse("1", Nil).count(_ == "a") // => 1
       
    87 
       
    88 // test cases
    87 // test cases
    89 //val ratings_map = groupById(good_ratings, Map())
    88 //val ratings_map = groupById(good_ratings, Map())
    90 //groupById(good_ratings, Map()).get("214")
       
    91 //groupById(good_ratings, Map()).toList.minBy(_._2.length)
       
    92 //val movies_map = movie_names.toMap
    89 //val movies_map = movie_names.toMap
    93 
    90 
    94 //ratings_map.get("414").get.map(movies_map.get(_)) // most prolific recommender with 1227 positive ratings
    91 //ratings_map.get("414").get.map(movies_map.get(_)) // most prolific recommender with 1227 positive ratings
    95 //ratings_map.get("474").get.map(movies_map.get(_)) // second-most prolific recommender with 787 positive ratings
    92 //ratings_map.get("474").get.map(movies_map.get(_)) // second-most prolific recommender with 787 positive ratings
    96 //ratings_map.get("214").get.map(movies_map.get(_)) // least prolific recommender with only 1 positive rating
    93 //ratings_map.get("214").get.map(movies_map.get(_)) // least prolific recommender with only 1 positive rating
    97 
    94 
    98 
    95 
    99 
       
   100 //(4) Implement a function that takes a ratings map and a movie_name as arguments.
    96 //(4) Implement a function that takes a ratings map and a movie_name as arguments.
   101 // The function calculates all suggestions containing
    97 // The function calculates all suggestions containing
   102 // the movie mov in its recommendations. It ReTurns a list of all these
    98 // the movie mov in its recommendations. It returns a list of all these
   103 // recommendations (each of them is a list and needs to have mov deleted, 
    99 // recommendations (each of them is a list and needs to have mov deleted, 
   104 // otherwise it might happen we recommend the same movie).
   100 // otherwise it might happen we recommend the same movie).
   105 
   101 
   106 def favourites(m: Map[String, List[String]], mov: String) : List[List[String]] = 
   102 def favourites(m: Map[String, List[String]], mov: String) : List[List[String]] = 
   107   (for (id <- m.keys.toList;
   103   (for (id <- m.keys.toList;
   122 
   118 
   123 
   119 
   124 // (5) Implement a suggestions function which takes a rating
   120 // (5) Implement a suggestions function which takes a rating
   125 // map and a movie_name as arguments. It calculates all the recommended
   121 // map and a movie_name as arguments. It calculates all the recommended
   126 // movies sorted according to the most frequently suggested movie(s) first.
   122 // movies sorted according to the most frequently suggested movie(s) first.
   127 
       
   128 // needed in Scala 2.13.
       
   129  
       
   130 def mapValues[S, T, R](m: Map[S, T], f: T => R) =
       
   131   m.map { case (x, y) => (x, f(y)) }
       
   132 
       
   133 def suggestions(recs: Map[String, List[String]], 
   123 def suggestions(recs: Map[String, List[String]], 
   134                     mov_name: String) : List[String] = {
   124                     mov_name: String) : List[String] = {
   135   val favs = favourites(recs, mov_name).flatten
   125   val favs = favourites(recs, mov_name).flatten
   136   val favs_counted = mapValues(favs.groupBy(identity), (v:List[String]) => v.size).toList
   126   val favs_counted = favs.groupBy(identity).view.mapValues(_.size).toList
   137   val favs_sorted = favs_counted.sortBy(_._2).reverse
   127   val favs_sorted = favs_counted.sortBy(_._2).reverse
   138   favs_sorted.map(_._1)
   128   favs_sorted.map(_._1)
   139 }
   129 }
   140 
       
   141 // check
       
   142 // groupMap is equivalent to groupBy(key).mapValues(_.map(f))
       
   143 
   130 
   144 // test cases
   131 // test cases
   145 
   132 
   146 //suggestions(ratings_map, "912")
   133 //suggestions(ratings_map, "912")
   147 //suggestions(ratings_map, "912").length  
   134 //suggestions(ratings_map, "912").length  
   161 // testcases
   148 // testcases
   162 
   149 
   163 // recommendations(ratings_map, movies_map, "912")
   150 // recommendations(ratings_map, movies_map, "912")
   164 //   => List(Godfather, Star Wars: Episode IV - A New Hope (1977))
   151 //   => List(Godfather, Star Wars: Episode IV - A New Hope (1977))
   165 
   152 
   166 //recommendations(ratings_map, movies_map, "260")
   153 // recommendations(ratings_map, movies_map, "260")
   167 //   => List(Star Wars: Episode V - The Empire Strikes Back (1980), 
   154 //   => List(Star Wars: Episode V - The Empire Strikes Back (1980), 
   168 //           Star Wars: Episode VI - Return of the Jedi (1983))
   155 //           Star Wars: Episode VI - Return of the Jedi (1983))
   169 
   156 
   170 // recommendations(ratings_map, movies_map, "2")
   157 // recommendations(ratings_map, movies_map, "2")
   171 //   => List(Lion King, Jurassic Park (1993))
   158 //   => List(Lion King, Jurassic Park (1993))
   175 
   162 
   176 // recommendations(ratings_map, movies_map, "1")
   163 // recommendations(ratings_map, movies_map, "1")
   177 //   => List(Shawshank Redemption, Forrest Gump (1994))
   164 //   => List(Shawshank Redemption, Forrest Gump (1994))
   178 
   165 
   179 // recommendations(ratings_map, movies_map, "4")
   166 // recommendations(ratings_map, movies_map, "4")
   180 //   => Nil  (there are three ratings fro this movie in ratings.csv but they are not positive)     
   167 //   => Nil  (there are three ratings for this movie in ratings.csv but they are not positive)     
   181 
   168 
   182 
   169 // (7) Calculate the recommendations for all movies according to
   183 // If you want to calculate the recomendations for all movies.
   170 // what the recommendations function in (6) produces (this
   184 // Will take a few seconds calculation time.
   171 // can take a few seconds). Put all recommendations into a list 
   185 
   172 // (of strings) and count how often the strings occur in
   186 //val all = for (name <- movie_names.map(_._1)) yield {
   173 // this list. This produces a list of string-int pairs,
   187 //  recommendations(ratings_map, movies_map, name)
   174 // where the first component is the movie name and the second
   188 //}
   175 // is the number of times that movie was recommended. 
   189 
   176 // Sort all the pairs according to the number
   190 // helper functions
   177 // of times they were recommended (most recommended movie name 
   191 //List().take(2)
   178 // first).
   192 //List(1).take(2)
   179 
   193 //List(1,2).take(2)
   180 def occurrences(xs: List[String]): List[(String, Int)] =
   194 //List(1,2,3).take(2)
   181   for (x <- xs.distinct) yield (x, xs.count(_ == x))
   195 
   182 
   196 }
   183 def most_recommended(recs: Map[String, List[String]],
       
   184                      movs: Map[String, String]) : List[(String, Int)] = {
       
   185    val all =  (for (name <- movs.toList.map(_._1)) yield {
       
   186      recommendations(recs, movs, name)                     
       
   187    }).flatten
       
   188    val occs = occurrences(all)
       
   189    occs.sortBy(_._2).reverse
       
   190 }
       
   191 
       
   192 
       
   193 //most_recommended(ratings_map, movies_map).take(3)
       
   194 // =>
       
   195 // List((Matrix,698), 
       
   196 //      (Star Wars: Episode IV - A New Hope (1977),402), 
       
   197 //      (Jerry Maguire (1996),382))
       
   198 
       
   199 }
       
   200 
       
   201 //val ratings_url = """https://nms.kcl.ac.uk/christian.urban/ratings.csv"""
       
   202 //val movies_url = """https://nms.kcl.ac.uk/christian.urban/movies.csv"""
       
   203 
       
   204 /*
       
   205 val ratings = get_csv_url(ratings_url)
       
   206 val movies = get_csv_url(movies_url)
       
   207 
       
   208 val good_ratings = process_ratings(ratings)
       
   209 val movie_names = process_movies(movies)
       
   210 
       
   211 val ratings_map = groupById(good_ratings, Map())
       
   212 val movies_map = movie_names.toMap
       
   213 
       
   214 
       
   215 println(most_recommended(ratings_map, movies_map).take(3))
       
   216 */