main_testing2/danube.scala
changeset 384 6e1237691307
parent 379 5616b45d656f
child 403 ffce7b61b446
equal deleted inserted replaced
383:c02929f2647c 384:6e1237691307
     1 // Core Part about Movie Recommendations 
     1 // Core Part about Movie Recommendations
     2 // at Danube.co.uk
     2 // at Danube.co.uk
     3 //========================================
     3 //===========================================
     4 
     4 
     5 
     5 object CW7b {
     6 object CW7b { // for purposes of generating a jar
       
     7 
     6 
     8 import io.Source
     7 import io.Source
     9 import scala.util._
     8 import scala.util._
    10 
     9 
    11 
    10 
    12 // (1) Implement the function get_csv_url which takes an url-string
    11 // (1) Implement the function get_csv_url which takes an url-string
    13 //     as argument and requests the corresponding file. The two urls
    12 //     as argument and requests the corresponding file. The two urls
    14 //     of interest are ratings_url and movies_url, which correspond 
    13 //     of interest are ratings_url and movies_url, which correspond 
    15 //     to CSV-files.
    14 //     to CSV-files.
    16 //     The function should return the CSV file appropriately broken
    15 //
       
    16 //     The function should ReTurn the CSV-file appropriately broken
    17 //     up into lines, and the first line should be dropped (that is without
    17 //     up into lines, and the first line should be dropped (that is without
    18 //     the header of the CSV file). The result is a list of strings (lines
    18 //     the header of the CSV-file). The result is a list of strings (lines
    19 //     in the file).
    19 //     in the file).
    20 
    20 
    21 def get_csv_url(url: String) : List[String] = {
    21 def get_csv_url(url: String) : List[String] = {
    22   val csv = Source.fromURL(url)("ISO-8859-1")
    22   val site = Source.fromURL(url, "ISO-8859-1")
    23   csv.mkString.split("\n").toList.drop(1)
    23   val site_string = site.mkString
    24 }
    24   val output = (site_string.split("\n")).toList
    25 
    25   output.tail
    26 val ratings_url = """https://nms.kcl.ac.uk/christian.urban/ratings.csv"""
    26 }
    27 val movies_url = """https://nms.kcl.ac.uk/christian.urban/movies.csv"""
    27 
    28 
    28   // get_csv_url("https://nms.kcl.ac.uk/christian.urban/ratings.csv")
    29 // test cases
    29 
    30 
    30 //val ratings_url = """https://nms.kcl.ac.uk/christian.urban/ratings.csv"""
    31 //val ratings = get_csv_url(ratings_url)
    31 //val movies_url = """https://nms.kcl.ac.uk/christian.urban/movies.csv"""
       
    32 
       
    33 // testcases
       
    34 //-----------
       
    35 //:
    32 //val movies = get_csv_url(movies_url)
    36 //val movies = get_csv_url(movies_url)
       
    37   // val ratings = get_csv_url(ratings_url)
    33 
    38 
    34 //ratings.length  // 87313
    39 //ratings.length  // 87313
    35 //movies.length   // 9742
    40 //movies.length   // 9742
    36 
    41 
    37 // (2) Implement two functions that process the CSV files. The ratings
    42 
    38 //     function filters out all ratings below 4 and returns a list of 
    43 // (2) Implement two functions that process the CSV-files from (1). The ratings
    39 //     (userID, movieID) pairs. The movies function just returns a list 
    44 //     function filters out all ratings below 4 and ReTurns a list of 
    40 //     of (movieId, title) pairs.
    45 //     (userID, movieID) pairs. The movies function just ReTurns a list 
    41 
    46 //     of (movieID, title) pairs. Note the input to these functions, that is
    42 
    47 //     the argument lines, will be the output of the function get_csv_url.
    43 //def process_ratings(lines: List[String]) : List[(String, String)] = {
    48 
    44 //  for (cols <- lines.map(_.split(",").toList); 
       
    45 //       if (cols(2).toFloat >= 4)) yield (cols(0), cols(1))  
       
    46 //}
       
    47 
    49 
    48 def process_ratings(lines: List[String]) : List[(String, String)] = {
    50 def process_ratings(lines: List[String]) : List[(String, String)] = {
       
    51   val filter = lines.filter(_.last.asDigit >=4)
       
    52   val output = (for(i <- 0 until filter.length) yield ((filter(i).split(",").toList)(0), (filter(i).split(",").toList)(1))).toList
       
    53   output
       
    54 }
       
    55 
       
    56 def process_movies(lines: List[String]) : List[(String, String)] = {
       
    57   val output = (for(i <- 0 until lines.length) yield ((lines(i).split(",").toList)(0), (lines(i).split(",").toList)(1))).toList
       
    58   output
       
    59 }
       
    60 
       
    61 
       
    62 
       
    63 def process_ratings2(lines: List[String]) : List[(String, String)] = {
    49   for (cols <- lines.map(_.split(",").toList); 
    64   for (cols <- lines.map(_.split(",").toList); 
    50        if (cols(2).toInt >= 4)) yield (cols(0), cols(1))  
    65        if (cols(2).toFloat >= 4)) yield (cols(0), cols(1))  
    51 }
    66 }
    52 
    67 
    53 def process_movies(lines: List[String]) : List[(String, String)] = {
    68 def process_movies2(lines: List[String]) : List[(String, String)] = {
    54   for (cols <- lines.map(_.split(",").toList)) yield (cols(0), cols(1))  
    69   for (cols <- lines.map(_.split(",").toList)) yield (cols(0), cols(1))  
    55 }
    70 }
    56 
    71 
    57 // test cases
    72 // testcases
    58 
    73 //-----------
    59 //val good_ratings = process_ratings(ratings)
    74 //val good_ratings = process_ratings(ratings)
    60 //val movie_names = process_movies(movies)
    75 //val movie_names = process_movies(movies)
    61 
    76 
    62 //good_ratings.length   //48580
    77 //good_ratings.length   //48580
    63 //movie_names.length    // 9742
    78 //movie_names.length    // 9742
    64 
    79 
    65 //==============================================
    80 
    66 // Do not change anything below, unless you want 
    81 
    67 // to submit the file for the advanced part 3!
    82 
    68 //==============================================
    83 // (3) Implement a grouping function that calculates a Map
    69 
    84 //     containing the userIDs and all the corresponding recommendations 
    70 
    85 //     (list of movieIDs). This  should be implemented in a tail
     71 // (3) Implement a grouping function that calculates a map
    86 //     recursive fashion, using a Map m as accumulator. This Map m
    72 //     containing the userIds and all the corresponding recommendations 
    87 //     is set to Map() at the beginning of the calculation.
    73 //     (list of movieIds). This  should be implemented in a tail
    88 
    74 //     recursive fashion, using a map m as accumulator. This map
    89 val ratings_url = """https://nms.kcl.ac.uk/christian.urban/ratings.csv"""
     75 //     is set to Map() at the beginning of the calculation.
    90 val ratings = get_csv_url(ratings_url)
       
    91 val good_ratings = process_ratings(ratings)
       
    92 val v515 = good_ratings.filter(_._1 == "515")
       
    93 val v515_2 = v515.map(_._2)
    76 
    94 
    77 def groupById(ratings: List[(String, String)], 
    95 def groupById(ratings: List[(String, String)], 
       
    96               m: Map[String, List[String]]) : Map[String, List[String]] = {
       
    97 val users = (for((k,v) <- ratings) yield k).distinct
       
    98 val movie_ids = (for(i <- 1 to users.length) yield
       
    99   (for ((k,v) <- ratings if(i.toString == k)) yield v).toList).toList
       
   100   val out_map = (users zip movie_ids).toMap
       
   101 out_map
       
   102 }
       
   103 
       
   104 def groupById2(ratings: List[(String, String)], 
    78               m: Map[String, List[String]]) : Map[String, List[String]] = ratings match {
   105               m: Map[String, List[String]]) : Map[String, List[String]] = ratings match {
    79   case Nil => m
   106   case Nil => m
    80   case (id, mov) :: rest => {
   107   case (id, mov) :: rest => {
    81     val old_ratings = m.getOrElse (id, Nil)
   108     val old_ratings = m.getOrElse (id, Nil)
    82     val new_ratings = m + (id -> (mov :: old_ratings))
   109     val new_ratings = m + (id -> (mov :: old_ratings))
    83     groupById(rest, new_ratings)
   110     groupById2(rest, new_ratings)
    84   }
   111   }
    85 }
   112 }
    86 
   113 
    87 // test cases
   114 val ls0_urban = 
       
   115   List(("1", "a"), ("1", "c"), ("1", "c"))
       
   116 
       
   117 groupById(ls0_urban, Map())
       
   118 groupById2(ls0_urban, Map())
       
   119 
       
   120 val ls00_urban = 
       
   121   List(("3", "a"), ("3", "c"), ("3", "c"))
       
   122 
       
   123 groupById(ls00_urban, Map())
       
   124 groupById2(ls00_urban, Map())
       
   125 
       
   126 groupById(good_ratings, Map()).getOrElse("515", Nil)
       
   127 groupById2(good_ratings, Map()).getOrElse("515", Nil)
       
   128 
       
   129 val ls1_urban = 
       
   130   List(("1", "a"), ("2", "a"), 
       
   131        ("1", "c"), ("2", "a"), ("1", "c"))
       
   132 
       
   133 groupById(ls1_urban, Map())
       
   134 groupById2(ls1_urban, Map())
       
   135 
       
   136 val ls2_urban = 
       
   137   List(("1", "a"), ("1", "b"), ("2", "x"), 
       
   138        ("3", "a"), ("2", "y"), ("3", "c"))
       
   139 
       
   140 groupById(ls2_urban, Map())
       
   141 groupById2(ls2_urban, Map())
       
   142 
       
   143 val ls3_urban = (1 to 1000 by 10).map(_.toString).toList
       
   144 val ls4_urban = ls3_urban zip ls3_urban.tail
       
   145 val ls5_urban = ls4_urban ::: ls4_urban.reverse
       
   146 
       
   147 groupById(ls5_urban, Map()) == groupById2(ls5_urban, Map())
       
   148 
       
   149 groupById(ls5_urban, Map())
       
   150 groupById2(ls5_urban, Map())
       
   151 
       
   152 groupById(v515, Map())
       
   153 groupById2(v515, Map())
       
   154 
       
   155 groupById(v515.take(1), Map())
       
   156 groupById2(v515.take(2), Map())
       
   157 
       
   158 // testcases
       
   159 //-----------
    88 //val ratings_map = groupById(good_ratings, Map())
   160 //val ratings_map = groupById(good_ratings, Map())
    89 //val movies_map = movie_names.toMap
   161 //val movies_map = movie_names.toMap
    90 
   162 
    91 //ratings_map.get("414").get.map(movies_map.get(_)) // most prolific recommender with 1227 positive ratings
   163 //ratings_map.get("414").get.map(movies_map.get(_)).length
    92 //ratings_map.get("474").get.map(movies_map.get(_)) // second-most prolific recommender with 787 positive ratings
   164 //    => most prolific recommender with 1227 positive ratings
    93 //ratings_map.get("214").get.map(movies_map.get(_)) // least prolific recommender with only 1 positive rating
   165 
    94 
   166 //ratings_map.get("475").get.map(movies_map.get(_)).length
    95 
   167 //    => second-most prolific recommender with 787 positive ratings
    96 //(4) Implement a function that takes a ratings map and a movie_name as argument.
   168 
    97 // The function calculates all suggestions containing
   169 //ratings_map.get("214").get.map(movies_map.get(_)).length 
    98 // the movie mov in its recommendations. It returns a list of all these
   170 //    => least prolific recommender with only 1 positive rating
    99 // recommendations (each of them is a list and needs to have mov deleted, 
   171 
   100 // otherwise it might happen we recommend the same movie).
   172 
   101 
   173 // (4) Implement a function that takes a ratings map and a movie_name as argument.
   102 def favourites(m: Map[String, List[String]], mov: String) : List[List[String]] = 
   174 //     The function calculates all suggestions containing
       
   175 //     the movie in its recommendations. It ReTurns a list of all these
       
   176 //     recommendations (each of them is a list and needs to have the movie deleted, 
       
   177 //     otherwise it might happen we recommend the same movie).
       
   178 
       
   179 
       
   180 def favourites(m: Map[String, List[String]], mov: String) : List[List[String]] = {
       
   181  (for((k,v) <- m if (v.contains(mov))) yield v.filter(_!=mov).toList).toList
       
   182 }
       
   183 
       
   184 def favourites2(m: Map[String, List[String]], mov: String) : List[List[String]] = 
   103   (for (id <- m.keys.toList;
   185   (for (id <- m.keys.toList;
   104         if m(id).contains(mov)) yield m(id).filter(_ != mov))
   186         if m(id).contains(mov)) yield m(id).filter(_ != mov))
   105 
   187 
   106 
   188 
   107 
   189 // testcases
   108 // test cases
   190 //-----------
   109 // movie ID "912" -> Casablanca (1942)
   191 // movie ID "912" -> Casablanca (1942)
   110 //          "858" -> Godfather
   192 //          "858" -> Godfather
   111 //          "260" -> Star Wars: Episode IV - A New Hope (1977)
   193 //          "260" -> Star Wars: Episode IV - A New Hope (1977)
   112 
   194 
   113 //favourites(ratings_map, "912").length  // => 80
   195 //favourites(ratings_map, "912").length  // => 80
   115 // That means there are 80 users that recommend the movie with ID 912.
   197 // That means there are 80 users that recommend the movie with ID 912.
   116 // Of these 80  users, 55 gave a good rating to movie 858 and
   198 // Of these 80  users, 55 gave a good rating to movie 858 and
   117 // 52 a good rating to movies 260, 318, 593.
   199 // 52 a good rating to movies 260, 318, 593.
   118 
   200 
   119 
   201 
       
   202 
   120 // (5) Implement a suggestions function which takes a rating
   203 // (5) Implement a suggestions function which takes a rating
   121 // map and a movie_name as arguments. It calculates all the recommended
   204 //     map and a movie_name as arguments. It calculates all the recommended
   122 // movies sorted according to the most frequently suggested movie(s) first.
   205 //     movies sorted according to the most frequently suggested movie(s) first.
       
   206 
   123 def suggestions(recs: Map[String, List[String]], 
   207 def suggestions(recs: Map[String, List[String]], 
       
   208                 mov_name: String) : List[String] = {
       
   209   val flat = favourites(recs, mov_name).flatten.groupMapReduce(identity)(_ => 1)(_ + _)
       
   210   val sorted = flat.toList.sortWith(_._2 > _._2).map(_._1)
       
   211   sorted
       
   212 }
       
   213 
       
   214 
       
   215 def mapValues[S, T, R](m: Map[S, T], f: T => R) =
       
   216   m.map { case (x, y) => (x, f(y)) }
       
   217 
       
   218 def suggestions2(recs: Map[String, List[String]], 
   124                     mov_name: String) : List[String] = {
   219                     mov_name: String) : List[String] = {
   125   val favs = favourites(recs, mov_name).flatten
   220   val favs = favourites(recs, mov_name).flatten
   126   val favs_counted = favs.groupBy(identity).view.mapValues(_.size).toList
   221   val favs_counted = mapValues(favs.groupBy(identity), (v:List[String]) => v.size).toList
   127   val favs_sorted = favs_counted.sortBy(_._2).reverse
   222   val favs_sorted = favs_counted.sortBy(_._2).reverse
   128   favs_sorted.map(_._1)
   223   favs_sorted.map(_._1)
   129 }
   224 }
   130 
   225 
   131 // test cases
   226 // testcases
       
   227 //-----------
   132 
   228 
   133 //suggestions(ratings_map, "912")
   229 //suggestions(ratings_map, "912")
   134 //suggestions(ratings_map, "912").length  
   230 //suggestions(ratings_map, "912").length  
   135 // => 4110 suggestions with List(858, 260, 318, 593, ...)
   231 // => 4110 suggestions with List(858, 260, 318, 593, ...)
   136 //    being the most frequently suggested movies
   232 //    being the most frequently suggested movies
   137 
   233 
    138 // (6) Implement a recommendations function which generates at most
   234 
   139 // *two* of the most frequently suggested movies. It Returns the 
   235 
   140 // actual movie names, not the movieIDs.
   236 // (6) Implement a recommendations function which generates at most
       
   237 //     *two* of the most frequently suggested movies. It ReTurns the 
       
   238 //     actual movie names, not the movieIDs.
       
   239 
   141 
   240 
   142 def recommendations(recs: Map[String, List[String]],
   241 def recommendations(recs: Map[String, List[String]],
   143                    movs: Map[String, String],
   242                     movs: Map[String, String],
   144                    mov_name: String) : List[String] =
   243                     mov_name: String) : List[String] = {
   145   suggestions(recs, mov_name).take(2).map(movs.get(_).get)                 
   244   val sugg = suggestions(recs, mov_name)
   146 
   245   val movies = (for (i <- 0 until 2 if (i < sugg.length)) yield movs(sugg(i))).toList
   147 
   246   movies
   148 // testcases
   247 }
   149 
   248 
       
   249 
       
   250 
       
   251 // testcases
       
   252 //-----------
   150 // recommendations(ratings_map, movies_map, "912")
   253 // recommendations(ratings_map, movies_map, "912")
    151 //   => List(Godfather, Star Wars: Episode IV - A New Hope (1977))
    254 //   => List(Godfather, Star Wars: Episode IV - A New Hope (1977))
   152 
   255 
   153 // recommendations(ratings_map, movies_map, "260")
   256 //recommendations(ratings_map, movies_map, "260")
   154 //   => List(Star Wars: Episode V - The Empire Strikes Back (1980), 
   257 //   => List(Star Wars: Episode V - The Empire Strikes Back (1980), 
   155 //           Star Wars: Episode VI - Return of the Jedi (1983))
   258 //           Star Wars: Episode VI - Return of the Jedi (1983))
   156 
   259 
   157 // recommendations(ratings_map, movies_map, "2")
   260 // recommendations(ratings_map, movies_map, "2")
   158 //   => List(Lion King, Jurassic Park (1993))
   261 //   => List(Lion King, Jurassic Park (1993))
   163 // recommendations(ratings_map, movies_map, "1")
   266 // recommendations(ratings_map, movies_map, "1")
   164 //   => List(Shawshank Redemption, Forrest Gump (1994))
   267 //   => List(Shawshank Redemption, Forrest Gump (1994))
   165 
   268 
   166 // recommendations(ratings_map, movies_map, "4")
   269 // recommendations(ratings_map, movies_map, "4")
   167 //   => Nil  (there are three ratings for this movie in ratings.csv but they are not positive)     
   270 //   => Nil  (there are three ratings for this movie in ratings.csv but they are not positive)     
       
   271 
       
   272 
   168 
   273 
   169 // (7) Calculate the recommendations for all movies according to
   274 // (7) Calculate the recommendations for all movies according to
   170 // what the recommendations function in (6) produces (this
   275 // what the recommendations function in (6) produces (this
   171 // can take a few seconds). Put all recommendations into a list 
   276 // can take a few seconds). Put all recommendations into a list 
   172 // (of strings) and count how often the strings occur in
   277 // (of strings) and count how often the strings occur in
   173 // this list. This produces a list of string-int pairs,
   278 // this list. This produces a list of string-int pairs,
   174 // where the first component is the movie name and the second
   279 // where the first component is the movie name and the second
   175 // is the number of how many times they were recommended. 
   280 // is the number of how many times the movie was recommended. 
   176 // Sort all the pairs according to the number
   281 // Sort all the pairs according to the number
   177 // of times they were recommended (most recommended movie name 
   282 // of times they were recommended (most recommended movie name 
   178 // first).
   283 // first).
   179 
   284 
   180 def occurrences(xs: List[String]): List[(String, Int)] =
       
   181   for (x <- xs.distinct) yield (x, xs.count(_ == x))
       
   182 
       
   183 def most_recommended(recs: Map[String, List[String]],
   285 def most_recommended(recs: Map[String, List[String]],
   184                      movs: Map[String, String]) : List[(String, Int)] = {
   286                      movs: Map[String, String]) : List[(String, Int)] = {
   185    val all =  (for (name <- movs.toList.map(_._1)) yield {
   287   val movies = (((for((k,v) <- movs) yield recommendations(recs, movs, k)).toList).flatten).groupMapReduce(identity)(_ => 1)(_ + _)
   186      recommendations(recs, movs, name)                     
   288   val sorted = movies.toList.sortWith(_._2 > _._2)
   187    }).flatten
   289   sorted
   188    val occs = occurrences(all)
   290 }
   189    occs.sortBy(_._2).reverse
   291 
   190 }
   292 // testcase
   191 
   293 //
   192 
       
   193 //most_recommended(ratings_map, movies_map).take(3)
   294 //most_recommended(ratings_map, movies_map).take(3)
   194 // =>
   295 // =>
   195 // List((Matrix,698), 
   296 // List((Matrix,698), 
   196 //      (Star Wars: Episode IV - A New Hope (1977),402), 
   297 //      (Star Wars: Episode IV - A New Hope (1977),402), 
   197 //      (Jerry Maguire (1996),382))
   298 //      (Jerry Maguire (1996),382))
   198 
   299 
   199 }
   300 
   200 
   301 
   201 //val ratings_url = """https://nms.kcl.ac.uk/christian.urban/ratings.csv"""
   302 }
   202 //val movies_url = """https://nms.kcl.ac.uk/christian.urban/movies.csv"""
       
   203 
       
   204 /*
       
   205 val ratings = get_csv_url(ratings_url)
       
   206 val movies = get_csv_url(movies_url)
       
   207 
       
   208 val good_ratings = process_ratings(ratings)
       
   209 val movie_names = process_movies(movies)
       
   210 
       
   211 val ratings_map = groupById(good_ratings, Map())
       
   212 val movies_map = movie_names.toMap
       
   213 
       
   214 
       
   215 println(most_recommended(ratings_map, movies_map).take(3))
       
   216 */